Commit 8d5a878e6650c4130784b81026b903e2ffd965c8
1 parent
1c1bf677
- praca nad budowaniem automatu dla zlepiacza segmentów
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@86 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 14 changed files with 266 additions and 152 deletions
fsabuilder/buildfsa.spec
0 → 100644
1 | +# -*- mode: python -*- | ||
2 | +a = Analysis(['fsa/buildfsa.py'], | ||
3 | + pathex=['/home/lennyn/xxx/morfeusz/fsabuilder'], | ||
4 | + hiddenimports=[], | ||
5 | + hookspath=None, | ||
6 | + runtime_hooks=None) | ||
7 | +pyz = PYZ(a.pure) | ||
8 | +exe = EXE(pyz, | ||
9 | + a.scripts, | ||
10 | + exclude_binaries=True, | ||
11 | + name='buildfsa', | ||
12 | + debug=False, | ||
13 | + strip=None, | ||
14 | + upx=True, | ||
15 | + console=True ) | ||
16 | +coll = COLLECT(exe, | ||
17 | + a.binaries, | ||
18 | + a.zipfiles, | ||
19 | + a.datas, | ||
20 | + strip=None, | ||
21 | + upx=True, | ||
22 | + name='buildfsa') |
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -14,11 +14,12 @@ class FSA(object): | @@ -14,11 +14,12 @@ class FSA(object): | ||
14 | ''' | 14 | ''' |
15 | 15 | ||
16 | 16 | ||
17 | - def __init__(self, encoder, tagset=None, encodeData=True): | ||
18 | - self.encodeWord = encoder.encodeWord | 17 | + def __init__(self, encoder, tagset=None, encodeData=True, encodeWords=True): |
18 | + self.encodeWord = encoder.encodeWord if encodeWords else lambda x: x | ||
19 | self.encodeData = encoder.encodeData if encodeData else lambda x: x | 19 | self.encodeData = encoder.encodeData if encodeData else lambda x: x |
20 | self.decodeData = encoder.decodeData if encodeData else lambda x: x | 20 | self.decodeData = encoder.decodeData if encodeData else lambda x: x |
21 | self.encodedPrevWord = None | 21 | self.encodedPrevWord = None |
22 | + | ||
22 | self.tagset = tagset | 23 | self.tagset = tagset |
23 | self.initialState = state.State() | 24 | self.initialState = state.State() |
24 | self.register = register.Register() | 25 | self.register = register.Register() |
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -9,7 +9,7 @@ class State(object): | @@ -9,7 +9,7 @@ class State(object): | ||
9 | A state in an automaton | 9 | A state in an automaton |
10 | ''' | 10 | ''' |
11 | 11 | ||
12 | - def __init__(self): | 12 | + def __init__(self, additionalData=None): |
13 | self.transitionsMap = {} | 13 | self.transitionsMap = {} |
14 | self.freq = 0 | 14 | self.freq = 0 |
15 | self.encodedData = None | 15 | self.encodedData = None |
@@ -17,6 +17,7 @@ class State(object): | @@ -17,6 +17,7 @@ class State(object): | ||
17 | self.offset = None | 17 | self.offset = None |
18 | self.label2Freq = {} | 18 | self.label2Freq = {} |
19 | self.serializeAsArray = False | 19 | self.serializeAsArray = False |
20 | + self.additionalData = additionalData | ||
20 | 21 | ||
21 | @property | 22 | @property |
22 | def transitionsNum(self): | 23 | def transitionsNum(self): |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -34,11 +34,6 @@ class ArgDefine(object): | @@ -34,11 +34,6 @@ class ArgDefine(object): | ||
34 | def __str__(self): | 34 | def __str__(self): |
35 | return '%s(%s) %s' % (self.name, self.arg, self.val) | 35 | return '%s(%s) %s' % (self.name, self.arg, self.val) |
36 | 36 | ||
37 | -class PreprocessorException(Exception): | ||
38 | - | ||
39 | - def __init__(self, msg, line): | ||
40 | - pass | ||
41 | - | ||
42 | def _tryToSubstituteArgDefine(s, t, defines): | 37 | def _tryToSubstituteArgDefine(s, t, defines): |
43 | defineName = t[0] | 38 | defineName = t[0] |
44 | substituteValue = t[1] | 39 | substituteValue = t[1] |
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -4,6 +4,8 @@ Created on 24 sty 2014 | @@ -4,6 +4,8 @@ Created on 24 sty 2014 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | 6 | ||
7 | +from morfeuszbuilder.segrules.rulesNFA import RulesNFAState | ||
8 | + | ||
7 | class SegmentRule(object): | 9 | class SegmentRule(object): |
8 | ''' | 10 | ''' |
9 | classdocs | 11 | classdocs |
@@ -14,46 +16,91 @@ class SegmentRule(object): | @@ -14,46 +16,91 @@ class SegmentRule(object): | ||
14 | ''' | 16 | ''' |
15 | Constructor | 17 | Constructor |
16 | ''' | 18 | ''' |
19 | + | ||
20 | + def addToNFA(self, fsa): | ||
21 | + raise NotImplementedError() | ||
22 | + | ||
23 | + def _doAddToNFA(self, startStates, endState): | ||
24 | + raise NotImplementedError() | ||
17 | 25 | ||
18 | class TagRule(SegmentRule): | 26 | class TagRule(SegmentRule): |
19 | 27 | ||
20 | - def __init__(self, tagType, line): | ||
21 | - self.tagType = tagType | ||
22 | - self.line = line | 28 | + def __init__(self, segnum): |
29 | + self.segnum = segnum | ||
30 | + | ||
31 | + def addToNFA(self, fsa): | ||
32 | + endState = RulesNFAState(final=True) | ||
33 | + self._doAddToNFA(fsa.initialState, endState) | ||
34 | + | ||
35 | + def _doAddToNFA(self, startState, endState): | ||
36 | + startState.addTransition(self.segnum, endState) | ||
23 | 37 | ||
24 | class UnaryRule(SegmentRule): | 38 | class UnaryRule(SegmentRule): |
25 | 39 | ||
26 | - def __init__(self, child, line): | 40 | + def __init__(self, child): |
27 | self.child = child | 41 | self.child = child |
28 | - self.line = line | ||
29 | 42 | ||
30 | class ComplexRule(SegmentRule): | 43 | class ComplexRule(SegmentRule): |
31 | 44 | ||
32 | - def __init__(self, children, line): | 45 | + def __init__(self, children): |
33 | self.children = children | 46 | self.children = children |
34 | - self.line = line | 47 | + |
48 | + def addToNFA(self, fsa): | ||
49 | + endState = RulesNFAState(final=True) | ||
50 | + self._doAddToNFA(fsa.initialState, endState) | ||
35 | 51 | ||
36 | class ConcatRule(ComplexRule): | 52 | class ConcatRule(ComplexRule): |
37 | 53 | ||
38 | - def __init__(self, children, line): | ||
39 | - super(ConcatRule, self).__init__(children, line) | 54 | + def __init__(self, children): |
55 | + super(ConcatRule, self).__init__(children) | ||
56 | + | ||
57 | + def _doAddToNFA(self, startState, endState): | ||
58 | + currStartState = startState | ||
59 | + for child in self.children[:-1]: | ||
60 | + currEndState = RulesNFAState() | ||
61 | + child._doAddToNFA(currStartState, currEndState) | ||
62 | + nextStartState = RulesNFAState() | ||
63 | + currEndState.addTransition(None, nextStartState) | ||
64 | + currStartState = nextStartState | ||
65 | + lastChild = self.children[-1] | ||
66 | + lastChild._doAddToNFA(currStartState, endState) | ||
40 | 67 | ||
41 | class OrRule(ComplexRule): | 68 | class OrRule(ComplexRule): |
42 | 69 | ||
43 | - def __init__(self, children, line): | ||
44 | - super(OrRule, self).__init__(children, line) | 70 | + def __init__(self, children): |
71 | + super(OrRule, self).__init__(children) | ||
72 | + | ||
73 | + def _doAddToNFA(self, startState, endState): | ||
74 | + for child in self.children: | ||
75 | + intermStartState = RulesNFAState() | ||
76 | + intermEndState = RulesNFAState() | ||
77 | + startState.addTransition(None, intermStartState) | ||
78 | + child._doAddToNFA(intermStartState, intermEndState) | ||
79 | + intermEndState.addTransition(None, endState) | ||
45 | 80 | ||
46 | class ZeroOrMoreRule(UnaryRule): | 81 | class ZeroOrMoreRule(UnaryRule): |
47 | 82 | ||
48 | - def __init__(self, child, line): | ||
49 | - super(ZeroOrMoreRule, self).__init__(child, line) | ||
50 | - | ||
51 | -class OneOrMoreRule(UnaryRule): | 83 | + def __init__(self, child): |
84 | + super(ZeroOrMoreRule, self).__init__(child) | ||
85 | + | ||
86 | + def addToNFA(self, fsa): | ||
87 | + raise ValueError() | ||
52 | 88 | ||
53 | - def __init__(self, child, line): | ||
54 | - super(OneOrMoreRule, self).__init__(child, line) | 89 | + def _doAddToNFA(self, startState, endState): |
90 | + intermStartState = RulesNFAState() | ||
91 | + intermEndState = RulesNFAState() | ||
92 | + | ||
93 | + startState.addTransition(None, intermStartState) | ||
94 | + startState.addTransition(None, endState) | ||
95 | + self.child._doAddToNFA(intermStartState, intermEndState) | ||
96 | + intermEndState.addTransition(None, endState) | ||
97 | + endState.addTransition(None, intermStartState) | ||
55 | 98 | ||
56 | class IgnoreOrthRule(UnaryRule): | 99 | class IgnoreOrthRule(UnaryRule): |
57 | 100 | ||
58 | - def __init__(self, child, line): | ||
59 | - super(IgnoreOrthRule, self).__init__(child, line) | 101 | + def __init__(self, child): |
102 | + super(IgnoreOrthRule, self).__init__(child) | ||
103 | + | ||
104 | + def _doAddToNFA(self, startState, endState): | ||
105 | + startState.addTransition(self.child.segnum, endState, ignoreOrth=True) | ||
106 | + |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
0 → 100644
1 | +''' | ||
2 | +Created on 24 sty 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | + | ||
7 | +from morfeuszbuilder.fsa import fsa, state, encode | ||
8 | + | ||
9 | +class RulesNFAState(object): | ||
10 | + | ||
11 | + def __init__(self, initial=False, final=False): | ||
12 | + self.transitionsMap = {} | ||
13 | + self.initial = initial | ||
14 | + self.final = final | ||
15 | + | ||
16 | + def addTransition(self, label, targetState, ignoreOrth=False): | ||
17 | + assert not ignoreOrth or label is not None | ||
18 | + self.transitionsMap.setdefault((label, ignoreOrth), set()) | ||
19 | + self.transitionsMap[(label, ignoreOrth)].add(targetState) | ||
20 | + | ||
21 | +class RulesNFA(object): | ||
22 | + | ||
23 | + def __init__(self, key2Def={}): | ||
24 | + self.initialState = RulesNFAState(initial=True) | ||
25 | + | ||
26 | + def _doConvertState(self, dfaState, nfaStates): | ||
27 | + for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems(): | ||
28 | + nextDFAState = state.State(additionalData=nextIgnoreOrth) | ||
29 | + dfaState.setTransition(label, nextDFAState) | ||
30 | + dfaState.encodedData = bytearray() | ||
31 | + self._doConvertState(nextDFAState, nextNFAStates) | ||
32 | + | ||
33 | + def convertToDFA(self): | ||
34 | + dfa = fsa.FSA(encoder=None, encodeWords=False) | ||
35 | + startStates = self.initialState.getClosure() | ||
36 | + assert not any(filter(lambda s: s.final, startStates)) | ||
37 | + dfa.initialState = state.State(additionalData=False) | ||
38 | + self._doConvertState(dfa.initialState, startStates) | ||
39 | + | ||
40 | + | ||
0 | \ No newline at end of file | 41 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
1 | 1 | ||
2 | from pyparsing import * | 2 | from pyparsing import * |
3 | +ParserElement.enablePackrat() | ||
3 | from morfeuszbuilder.tagset import segtypes | 4 | from morfeuszbuilder.tagset import segtypes |
4 | -from morfeuszbuilder.utils import configFile | ||
5 | -from morfeuszbuilder.segrules import preprocessor | 5 | +from morfeuszbuilder.utils import configFile, exceptions |
6 | +from morfeuszbuilder.segrules import preprocessor, rules | ||
6 | import codecs | 7 | import codecs |
7 | import re | 8 | import re |
8 | 9 | ||
9 | import itertools | 10 | import itertools |
10 | import logging | 11 | import logging |
11 | -import segsfsa | ||
12 | - | ||
13 | -# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']') | ||
14 | -# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | ||
15 | -# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | ||
16 | -# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | ||
17 | - | ||
18 | -def doprint(toks): | ||
19 | - print toks | 12 | +from morfeuszbuilder.segrules import rulesNFA |
20 | 13 | ||
21 | class RulesParser(object): | 14 | class RulesParser(object): |
22 | 15 | ||
@@ -31,7 +24,7 @@ class RulesParser(object): | @@ -31,7 +24,7 @@ class RulesParser(object): | ||
31 | key, defs = lineToParse.parseString(line) | 24 | key, defs = lineToParse.parseString(line) |
32 | res[key] = tuple(defs) | 25 | res[key] = tuple(defs) |
33 | except Exception as ex: | 26 | except Exception as ex: |
34 | - raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | 27 | + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) |
35 | return res | 28 | return res |
36 | 29 | ||
37 | def parse(self, filename): | 30 | def parse(self, filename): |
@@ -48,12 +41,12 @@ class RulesParser(object): | @@ -48,12 +41,12 @@ class RulesParser(object): | ||
48 | 41 | ||
49 | for defs in itertools.product(*key2Defs.values()): | 42 | for defs in itertools.product(*key2Defs.values()): |
50 | key2Def = dict([(def2Key[define], define) for define in defs]) | 43 | key2Def = dict([(def2Key[define], define) for define in defs]) |
51 | - fsa = segsfsa.SegmentsFSA(key2Def) | 44 | + nfa = rulesNFA.RulesNFA(key2Def) |
52 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') | 45 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
53 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) | 46 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) |
54 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | 47 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
55 | - fsa.addSegmentRule(rule) | ||
56 | - res.append(fsa) | 48 | + rule.addToNFA(nfa) |
49 | + res.append(nfa) | ||
57 | return res | 50 | return res |
58 | 51 | ||
59 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): | 52 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
@@ -61,6 +54,12 @@ class RulesParser(object): | @@ -61,6 +54,12 @@ class RulesParser(object): | ||
61 | if not line.startswith('#'): | 54 | if not line.startswith('#'): |
62 | yield self._doParseOneLine(lineNum, line, segtypesHelper) | 55 | yield self._doParseOneLine(lineNum, line, segtypesHelper) |
63 | 56 | ||
57 | + def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): | ||
58 | + if not segtypesHelper.hasSegtype(segtype): | ||
59 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | ||
60 | + else: | ||
61 | + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) | ||
62 | + | ||
64 | def _doParseOneLine(self, lineNum, line, segtypesHelper): | 63 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
65 | rule = Forward() | 64 | rule = Forward() |
66 | tagRule = Word(alphanums+'_') | 65 | tagRule = Word(alphanums+'_') |
@@ -74,9 +73,21 @@ class RulesParser(object): | @@ -74,9 +73,21 @@ class RulesParser(object): | ||
74 | complexRule = unaryRule ^ oneOfRule | 73 | complexRule = unaryRule ^ oneOfRule |
75 | concatRule = OneOrMore(complexRule) | 74 | concatRule = OneOrMore(complexRule) |
76 | rule << concatRule | 75 | rule << concatRule |
76 | + | ||
77 | + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | ||
78 | + ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) | ||
79 | +# parenRule.setParseAction(lambda string, loc, toks: toks[0]) | ||
80 | + zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) | ||
81 | + oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | ||
82 | + oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) | ||
83 | + concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) | ||
84 | + | ||
85 | + | ||
77 | # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule | 86 | # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule |
78 | 87 | ||
79 | # tagRule.setParseAction(lambda s,l,toks: doprint(toks)) | 88 | # tagRule.setParseAction(lambda s,l,toks: doprint(toks)) |
80 | # print lineNum, line | 89 | # print lineNum, line |
81 | - parsedLine = rule.parseString(line, parseAll=True) | 90 | + parsedRule = rule.parseString(line, parseAll=True)[0] |
91 | + print parsedRule | ||
92 | + return parsedRule | ||
82 | # print parsedLine | 93 | # print parsedLine |
fsabuilder/morfeuszbuilder/segrules/segsfsa.py deleted
1 | -''' | ||
2 | -Created on 24 sty 2014 | ||
3 | - | ||
4 | -@author: mlenart | ||
5 | -''' | ||
6 | - | ||
7 | -class SegmentsFSAState(object): | ||
8 | - | ||
9 | - def __init__(self): | ||
10 | - self.transitionsMap = {} | ||
11 | - | ||
12 | - def addSegmentRule(self, segmentRule): | ||
13 | - pass | ||
14 | - | ||
15 | -class SegmentsFSA(object): | ||
16 | - | ||
17 | - def __init__(self, key2Def={}): | ||
18 | - self.initialState = SegmentsFSAState() | ||
19 | - | ||
20 | - def addSegmentRule(self, segmentRule): | ||
21 | - self.initialState.addSegmentRule(segmentRule) | ||
22 | - | ||
23 | - def serialize(self): | ||
24 | - res = bytearray() | ||
25 | - return res | ||
26 | - | ||
27 | - | ||
28 | \ No newline at end of file | 0 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser | @@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser | ||
9 | from morfeuszbuilder.tagset import tagset | 9 | from morfeuszbuilder.tagset import tagset |
10 | 10 | ||
11 | class Test(unittest.TestCase): | 11 | class Test(unittest.TestCase): |
12 | + print 'do test' | ||
12 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | 13 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) |
13 | parser = rulesParser.RulesParser(t) | 14 | parser = rulesParser.RulesParser(t) |
14 | parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | 15 | parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) |
16 | + print 'done' | ||
15 | 17 | ||
16 | if __name__ == "__main__": | 18 | if __name__ == "__main__": |
17 | unittest.main() | 19 | unittest.main() |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
@@ -3,7 +3,6 @@ aggl=permissive strict isolated | @@ -3,7 +3,6 @@ aggl=permissive strict isolated | ||
3 | praet=split composite | 3 | praet=split composite |
4 | 4 | ||
5 | [combinations] | 5 | [combinations] |
6 | -(dupa|dupa) | ||
7 | #define wsz_interp (interp|kropka|dywiz)* | 6 | #define wsz_interp (interp|kropka|dywiz)* |
8 | 7 | ||
9 | #define moze_interp(segmenty) wsz_interp segmenty wsz_interp | 8 | #define moze_interp(segmenty) wsz_interp segmenty wsz_interp |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -4,80 +4,85 @@ Created on 17 lut 2014 | @@ -4,80 +4,85 @@ Created on 17 lut 2014 | ||
4 | @author: mlenart | 4 | @author: mlenart |
5 | ''' | 5 | ''' |
6 | import re | 6 | import re |
7 | +from morfeuszbuilder.utils import exceptions | ||
7 | 8 | ||
8 | class Segtypes(object): | 9 | class Segtypes(object): |
9 | 10 | ||
10 | - def __init__(self, tagset, segrulesFile): | 11 | + def __init__(self, tagset, segrulesConfigFile): |
11 | 12 | ||
12 | self.tagset = tagset | 13 | self.tagset = tagset |
13 | 14 | ||
14 | - self.segrulesConfigFile = segrulesFile | 15 | + self.filename = segrulesConfigFile.filename |
15 | 16 | ||
16 | self.segtype2Segnum = {} | 17 | self.segtype2Segnum = {} |
17 | self.patternsList = [] | 18 | self.patternsList = [] |
19 | + self._readLexemes(segrulesConfigFile) | ||
20 | + self._readTags(segrulesConfigFile) | ||
21 | + | ||
22 | + def _validate(self, msg, lineNum, cond): | ||
23 | + if not cond: | ||
24 | + raise exceptions.ConfigFileException(self.filename, lineNum, msg) | ||
25 | + | ||
26 | + def _readTags(self, segrulesConfigFile): | ||
27 | + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): | ||
28 | + print lineNum, line | ||
29 | + splitLine = re.split(r'\s+', line.strip()) | ||
30 | + self._validate( | ||
31 | + u'Line in [tags] section must contain exactly two fields - segment type and tag pattern', | ||
32 | + lineNum, | ||
33 | + len(splitLine) == 2) | ||
34 | + segtype, pattern = splitLine | ||
35 | + self._validate( | ||
36 | + u'Segment type must be a lowercase alphanumeric with optional underscores', | ||
37 | + lineNum, | ||
38 | + re.match(r'[a-z_]+', segtype)) | ||
39 | + self._validate( | ||
40 | + u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | ||
41 | + lineNum, | ||
42 | + re.match(r'[a-z_\.\:\%]+', pattern)) | ||
43 | + | ||
44 | + if segtype in self.segtype2Segnum: | ||
45 | + segnum = self.segtype2Segnum[segtype] | ||
46 | + else: | ||
47 | + segnum = len(self.segtype2Segnum) | ||
48 | + self.segtype2Segnum[segtype] = segnum | ||
49 | + | ||
50 | + self.patternsList.append(SegtypePattern(None, pattern, segnum)) | ||
51 | + | ||
52 | + def _readLexemes(self, segrulesConfigFile): | ||
53 | + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'): | ||
54 | + segtype, pattern = line.strip().split('\t') | ||
55 | + self._validate( | ||
56 | + u'Segment type must be a lowercase alphanumeric with optional underscores', | ||
57 | + lineNum, | ||
58 | + re.match(r'[a-z_]+', segtype)) | ||
59 | + self._validate( | ||
60 | + u'Pattern must contain lemma and POS', | ||
61 | + lineNum, | ||
62 | + re.match(r'.+\:[a-z_]+', pattern, re.U)) | ||
63 | + | ||
64 | + if segtype in self.segtype2Segnum: | ||
65 | + segnum = self.segtype2Segnum[segtype] | ||
66 | + else: | ||
67 | + segnum = len(self.segtype2Segnum) | ||
68 | + self.segtype2Segnum[segtype] = segnum | ||
69 | + | ||
70 | + lemma, pos = pattern.split(':') | ||
71 | + | ||
72 | + self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) | ||
18 | 73 | ||
19 | - def readTags(self, lines): | ||
20 | - inTags = False | ||
21 | - for lineNum, line in enumerate(lines, start=1): | ||
22 | - header = self._getHeaderValue(line, lineNum) | ||
23 | - if header == 'tags': | ||
24 | - inTags = True | ||
25 | - elif header: | ||
26 | - inTags = False | ||
27 | - elif inTags: | ||
28 | - segtype, pattern = line.strip().split('\t') | ||
29 | - self._validate( | ||
30 | - u'Segment type must be a lowercase alphanumeric with optional underscores', | ||
31 | - lineNum, | ||
32 | - re.match(r'[a-z_]+', segtype)) | ||
33 | - self._validate( | ||
34 | - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | ||
35 | - lineNum, | ||
36 | - re.match(r'[a-z_\.\:\%]+', pattern)) | ||
37 | - | ||
38 | - if segtype in self.segtype2Segnum: | ||
39 | - segnum = self.segtype2Segnum[segtype] | ||
40 | - else: | ||
41 | - segnum = len(self.segtype2Segnum) | ||
42 | - self.segtype2Segnum[segtype] = segnum | ||
43 | - | ||
44 | - self.patternsList.append(SegtypePattern(None, pattern, segnum)) | 74 | + def hasSegtype(self, segTypeString): |
75 | + return segTypeString in self.segtype2Segnum | ||
45 | 76 | ||
46 | - def readLexemes(self, lines): | ||
47 | - inLexemes = False | ||
48 | - for lineNum, line in enumerate(lines, start=1): | ||
49 | - header = self._getHeaderValue(line, lineNum) | ||
50 | - if header == 'lexemes': | ||
51 | - inLexemes = True | ||
52 | - elif header: | ||
53 | - inLexemes = False | ||
54 | - elif inLexemes: | ||
55 | - segtype, pattern = line.strip().split('\t') | ||
56 | - self._validate( | ||
57 | - u'Segment type must be a lowercase alphanumeric with optional underscores', | ||
58 | - lineNum, | ||
59 | - re.match(r'[a-z_]+', segtype)) | ||
60 | - self._validate( | ||
61 | - u'Pattern must contain lemma and POS', | ||
62 | - lineNum, | ||
63 | - re.match(r'\w+\:[a-z_]+', pattern, re.U)) | ||
64 | - | ||
65 | - if segtype in self.segtype2Segnum: | ||
66 | - segnum = self.segtype2Segnum[segtype] | ||
67 | - else: | ||
68 | - segnum = len(self.segtype2Segnum) | ||
69 | - self.segtype2Segnum[segtype] = segnum | ||
70 | - | ||
71 | - lemma, pos = pattern.split(':') | ||
72 | - | ||
73 | - self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) | 77 | + def getSegnum4Segtype(self, segTypeString): |
78 | + return self.segtype2Segnum[segTypeString] | ||
74 | 79 | ||
75 | def lexeme2Segnum(self, lemma, tag): | 80 | def lexeme2Segnum(self, lemma, tag): |
76 | for p in self.patternsList: | 81 | for p in self.patternsList: |
77 | res = p.tryToMatch(lemma, tag) | 82 | res = p.tryToMatch(lemma, tag) |
78 | if res >= 0: | 83 | if res >= 0: |
79 | return res | 84 | return res |
80 | - raise SegtypesException('Cannot find segment type for given tag: %s' % tag) | 85 | + return None |
81 | 86 | ||
82 | class SegtypePattern(object): | 87 | class SegtypePattern(object): |
83 | 88 | ||
@@ -92,11 +97,3 @@ class SegtypePattern(object): | @@ -92,11 +97,3 @@ class SegtypePattern(object): | ||
92 | return self.segnum | 97 | return self.segnum |
93 | else: | 98 | else: |
94 | return -1 | 99 | return -1 |
95 | - | ||
96 | -class SegtypesException(Exception): | ||
97 | - | ||
98 | - def __init__(self, msg): | ||
99 | - self.msg = msg | ||
100 | - | ||
101 | - def __str__(self): | ||
102 | - return u'Error in segment rules: %s' % self.msg |
fsabuilder/morfeuszbuilder/tagset/tagset.py
@@ -12,10 +12,11 @@ class Tagset(object): | @@ -12,10 +12,11 @@ class Tagset(object): | ||
12 | NAMES = 2 | 12 | NAMES = 2 |
13 | SEP = '\t' | 13 | SEP = '\t' |
14 | 14 | ||
15 | - def __init__(self, filename, encoding='utf8'): | 15 | + def __init__(self, filename=None, encoding='utf8'): |
16 | self.tag2tagnum = {} | 16 | self.tag2tagnum = {} |
17 | self.name2namenum = {} | 17 | self.name2namenum = {} |
18 | - self._doInit(filename, encoding) | 18 | + if filename: |
19 | + self._doInit(filename, encoding) | ||
19 | self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) | 20 | self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) |
20 | 21 | ||
21 | def _doInit(self, filename, encoding): | 22 | def _doInit(self, filename, encoding): |
@@ -37,4 +38,4 @@ class Tagset(object): | @@ -37,4 +38,4 @@ class Tagset(object): | ||
37 | res[tag] = int(tagNum) | 38 | res[tag] = int(tagNum) |
38 | 39 | ||
39 | def getTag4Tagnum(self, tagnum): | 40 | def getTag4Tagnum(self, tagnum): |
40 | - return self.tagnum2tag[tagnum] | ||
41 | \ No newline at end of file | 41 | \ No newline at end of file |
42 | + return self.tagnum2tag[tagnum] |
fsabuilder/morfeuszbuilder/utils/configFile.py
@@ -6,6 +6,7 @@ Created on 18 lut 2014 | @@ -6,6 +6,7 @@ Created on 18 lut 2014 | ||
6 | 6 | ||
7 | import re | 7 | import re |
8 | import codecs | 8 | import codecs |
9 | +import exceptions | ||
9 | 10 | ||
10 | def getHeaderValue(line, lineNum): | 11 | def getHeaderValue(line, lineNum): |
11 | m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) | 12 | m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) |
@@ -25,9 +26,9 @@ class ConfigFile(object): | @@ -25,9 +26,9 @@ class ConfigFile(object): | ||
25 | 26 | ||
26 | def _addSectionStart(self, sectionName, lineNum): | 27 | def _addSectionStart(self, sectionName, lineNum): |
27 | if not sectionName in self.sectionNames: | 28 | if not sectionName in self.sectionNames: |
28 | - raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) | 29 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) |
29 | if sectionName in self.section2Lines: | 30 | if sectionName in self.section2Lines: |
30 | - raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) | 31 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) |
31 | self.section2Lines[sectionName] = [] | 32 | self.section2Lines[sectionName] = [] |
32 | self.currSection = sectionName | 33 | self.currSection = sectionName |
33 | 34 | ||
@@ -35,7 +36,7 @@ class ConfigFile(object): | @@ -35,7 +36,7 @@ class ConfigFile(object): | ||
35 | line = line.strip() | 36 | line = line.strip() |
36 | if line: | 37 | if line: |
37 | if self.currSection is None and not line.startswith('#'): | 38 | if self.currSection is None and not line.startswith('#'): |
38 | - raise ConfigFileException(self.filename, lineNum, 'Text outside of any section') | 39 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Text outside of any section') |
39 | self.section2Lines[self.currSection].append((lineNum, line)) | 40 | self.section2Lines[self.currSection].append((lineNum, line)) |
40 | 41 | ||
41 | def _getHeaderValue(self, line, lineNum): | 42 | def _getHeaderValue(self, line, lineNum): |
@@ -56,13 +57,3 @@ class ConfigFile(object): | @@ -56,13 +57,3 @@ class ConfigFile(object): | ||
56 | self._addSectionStart(header, lineNum) | 57 | self._addSectionStart(header, lineNum) |
57 | else: | 58 | else: |
58 | self._addLine(line, lineNum) | 59 | self._addLine(line, lineNum) |
59 | - | ||
60 | -class ConfigFileException(Exception): | ||
61 | - | ||
62 | - def __init__(self, filename, lineNum, msg): | ||
63 | - self.filename = filename | ||
64 | - self.lineNum = lineNum | ||
65 | - self.msg = msg | ||
66 | - | ||
67 | - def __str__(self): | ||
68 | - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) |
fsabuilder/morfeuszbuilder/utils/exceptions.py
0 → 100644
1 | +''' | ||
2 | +Created on Feb 19, 2014 | ||
3 | + | ||
4 | +@author: lennyn | ||
5 | +''' | ||
6 | + | ||
7 | +class FSABuilderException(Exception): | ||
8 | + ''' | ||
9 | + Exception in configFile module | ||
10 | + ''' | ||
11 | + | ||
12 | + def __init__(self, msg): | ||
13 | + self.msg = msg | ||
14 | + | ||
15 | + def __str__(self): | ||
16 | + return 'Failed to create FSA files: ' + self.msg | ||
17 | + | ||
18 | +class SegtypesException(FSABuilderException): | ||
19 | + | ||
20 | + def __init__(self, msg): | ||
21 | + self.msg = msg | ||
22 | + | ||
23 | + def __str__(self): | ||
24 | + return u'Error in segment rules: %s' % self.msg | ||
25 | + | ||
26 | +class ConfigFileException(FSABuilderException): | ||
27 | + | ||
28 | + def __init__(self, filename, lineNum, msg): | ||
29 | + self.filename = filename | ||
30 | + self.lineNum = lineNum | ||
31 | + self.msg = msg | ||
32 | + | ||
33 | + def __str__(self): | ||
34 | + return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) |