Commit 8d5a878e6650c4130784b81026b903e2ffd965c8
1 parent: 1c1bf677
- praca nad budowaniem automatu dla zlepiacza segmentów
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@86 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 14 changed files with 266 additions and 152 deletions
fsabuilder/buildfsa.spec
0 → 100644
1 | +# -*- mode: python -*- | |
2 | +a = Analysis(['fsa/buildfsa.py'], | |
3 | + pathex=['/home/lennyn/xxx/morfeusz/fsabuilder'], | |
4 | + hiddenimports=[], | |
5 | + hookspath=None, | |
6 | + runtime_hooks=None) | |
7 | +pyz = PYZ(a.pure) | |
8 | +exe = EXE(pyz, | |
9 | + a.scripts, | |
10 | + exclude_binaries=True, | |
11 | + name='buildfsa', | |
12 | + debug=False, | |
13 | + strip=None, | |
14 | + upx=True, | |
15 | + console=True ) | |
16 | +coll = COLLECT(exe, | |
17 | + a.binaries, | |
18 | + a.zipfiles, | |
19 | + a.datas, | |
20 | + strip=None, | |
21 | + upx=True, | |
22 | + name='buildfsa') | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -14,11 +14,12 @@ class FSA(object): |
14 | 14 | ''' |
15 | 15 | |
16 | 16 | |
17 | - def __init__(self, encoder, tagset=None, encodeData=True): | |
18 | - self.encodeWord = encoder.encodeWord | |
17 | + def __init__(self, encoder, tagset=None, encodeData=True, encodeWords=True): | |
18 | + self.encodeWord = encoder.encodeWord if encodeWords else lambda x: x | |
19 | 19 | self.encodeData = encoder.encodeData if encodeData else lambda x: x |
20 | 20 | self.decodeData = encoder.decodeData if encodeData else lambda x: x |
21 | 21 | self.encodedPrevWord = None |
22 | + | |
22 | 23 | self.tagset = tagset |
23 | 24 | self.initialState = state.State() |
24 | 25 | self.register = register.Register() |
... | ... |
fsabuilder/morfeuszbuilder/fsa/state.py
... | ... | @@ -9,7 +9,7 @@ class State(object): |
9 | 9 | A state in an automaton |
10 | 10 | ''' |
11 | 11 | |
12 | - def __init__(self): | |
12 | + def __init__(self, additionalData=None): | |
13 | 13 | self.transitionsMap = {} |
14 | 14 | self.freq = 0 |
15 | 15 | self.encodedData = None |
... | ... | @@ -17,6 +17,7 @@ class State(object): |
17 | 17 | self.offset = None |
18 | 18 | self.label2Freq = {} |
19 | 19 | self.serializeAsArray = False |
20 | + self.additionalData = additionalData | |
20 | 21 | |
21 | 22 | @property |
22 | 23 | def transitionsNum(self): |
... | ... |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... | ... | @@ -34,11 +34,6 @@ class ArgDefine(object): |
34 | 34 | def __str__(self): |
35 | 35 | return '%s(%s) %s' % (self.name, self.arg, self.val) |
36 | 36 | |
37 | -class PreprocessorException(Exception): | |
38 | - | |
39 | - def __init__(self, msg, line): | |
40 | - pass | |
41 | - | |
42 | 37 | def _tryToSubstituteArgDefine(s, t, defines): |
43 | 38 | defineName = t[0] |
44 | 39 | substituteValue = t[1] |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -4,6 +4,8 @@ Created on 24 sty 2014 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | |
7 | +from morfeuszbuilder.segrules.rulesNFA import RulesNFAState | |
8 | + | |
7 | 9 | class SegmentRule(object): |
8 | 10 | ''' |
9 | 11 | classdocs |
... | ... | @@ -14,46 +16,91 @@ class SegmentRule(object): |
14 | 16 | ''' |
15 | 17 | Constructor |
16 | 18 | ''' |
19 | + | |
20 | + def addToNFA(self, fsa): | |
21 | + raise NotImplementedError() | |
22 | + | |
23 | + def _doAddToNFA(self, startStates, endState): | |
24 | + raise NotImplementedError() | |
17 | 25 | |
18 | 26 | class TagRule(SegmentRule): |
19 | 27 | |
20 | - def __init__(self, tagType, line): | |
21 | - self.tagType = tagType | |
22 | - self.line = line | |
28 | + def __init__(self, segnum): | |
29 | + self.segnum = segnum | |
30 | + | |
31 | + def addToNFA(self, fsa): | |
32 | + endState = RulesNFAState(final=True) | |
33 | + self._doAddToNFA(fsa.initialState, endState) | |
34 | + | |
35 | + def _doAddToNFA(self, startState, endState): | |
36 | + startState.addTransition(self.segnum, endState) | |
23 | 37 | |
24 | 38 | class UnaryRule(SegmentRule): |
25 | 39 | |
26 | - def __init__(self, child, line): | |
40 | + def __init__(self, child): | |
27 | 41 | self.child = child |
28 | - self.line = line | |
29 | 42 | |
30 | 43 | class ComplexRule(SegmentRule): |
31 | 44 | |
32 | - def __init__(self, children, line): | |
45 | + def __init__(self, children): | |
33 | 46 | self.children = children |
34 | - self.line = line | |
47 | + | |
48 | + def addToNFA(self, fsa): | |
49 | + endState = RulesNFAState(final=True) | |
50 | + self._doAddToNFA(fsa.initialState, endState) | |
35 | 51 | |
36 | 52 | class ConcatRule(ComplexRule): |
37 | 53 | |
38 | - def __init__(self, children, line): | |
39 | - super(ConcatRule, self).__init__(children, line) | |
54 | + def __init__(self, children): | |
55 | + super(ConcatRule, self).__init__(children) | |
56 | + | |
57 | + def _doAddToNFA(self, startState, endState): | |
58 | + currStartState = startState | |
59 | + for child in self.children[:-1]: | |
60 | + currEndState = RulesNFAState() | |
61 | + child._doAddToNFA(currStartState, currEndState) | |
62 | + nextStartState = RulesNFAState() | |
63 | + currEndState.addTransition(None, nextStartState) | |
64 | + currStartState = nextStartState | |
65 | + lastChild = self.children[-1] | |
66 | + lastChild._doAddToNFA(currStartState, endState) | |
40 | 67 | |
41 | 68 | class OrRule(ComplexRule): |
42 | 69 | |
43 | - def __init__(self, children, line): | |
44 | - super(OrRule, self).__init__(children, line) | |
70 | + def __init__(self, children): | |
71 | + super(OrRule, self).__init__(children) | |
72 | + | |
73 | + def _doAddToNFA(self, startState, endState): | |
74 | + for child in self.children: | |
75 | + intermStartState = RulesNFAState() | |
76 | + intermEndState = RulesNFAState() | |
77 | + startState.addTransition(None, intermStartState) | |
78 | + child._doAddToNFA(intermStartState, intermEndState) | |
79 | + intermEndState.addTransition(None, endState) | |
45 | 80 | |
46 | 81 | class ZeroOrMoreRule(UnaryRule): |
47 | 82 | |
48 | - def __init__(self, child, line): | |
49 | - super(ZeroOrMoreRule, self).__init__(child, line) | |
50 | - | |
51 | -class OneOrMoreRule(UnaryRule): | |
83 | + def __init__(self, child): | |
84 | + super(ZeroOrMoreRule, self).__init__(child) | |
85 | + | |
86 | + def addToNFA(self, fsa): | |
87 | + raise ValueError() | |
52 | 88 | |
53 | - def __init__(self, child, line): | |
54 | - super(OneOrMoreRule, self).__init__(child, line) | |
89 | + def _doAddToNFA(self, startState, endState): | |
90 | + intermStartState = RulesNFAState() | |
91 | + intermEndState = RulesNFAState() | |
92 | + | |
93 | + startState.addTransition(None, intermStartState) | |
94 | + startState.addTransition(None, endState) | |
95 | + self.child._doAddToNFA(intermStartState, intermEndState) | |
96 | + intermEndState.addTransition(None, endState) | |
97 | + endState.addTransition(None, intermStartState) | |
55 | 98 | |
56 | 99 | class IgnoreOrthRule(UnaryRule): |
57 | 100 | |
58 | - def __init__(self, child, line): | |
59 | - super(IgnoreOrthRule, self).__init__(child, line) | |
101 | + def __init__(self, child): | |
102 | + super(IgnoreOrthRule, self).__init__(child) | |
103 | + | |
104 | + def _doAddToNFA(self, startState, endState): | |
105 | + startState.addTransition(self.child.segnum, endState, ignoreOrth=True) | |
106 | + | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
0 → 100644
1 | +''' | |
2 | +Created on 24 sty 2014 | |
3 | + | |
4 | +@author: mlenart | |
5 | +''' | |
6 | + | |
7 | +from morfeuszbuilder.fsa import fsa, state, encode | |
8 | + | |
9 | +class RulesNFAState(object): | |
10 | + | |
11 | + def __init__(self, initial=False, final=False): | |
12 | + self.transitionsMap = {} | |
13 | + self.initial = initial | |
14 | + self.final = final | |
15 | + | |
16 | + def addTransition(self, label, targetState, ignoreOrth=False): | |
17 | + assert not ignoreOrth or label is not None | |
18 | + self.transitionsMap.setdefault((label, ignoreOrth), set()) | |
19 | + self.transitionsMap[(label, ignoreOrth)].add(targetState) | |
20 | + | |
21 | +class RulesNFA(object): | |
22 | + | |
23 | + def __init__(self, key2Def={}): | |
24 | + self.initialState = RulesNFAState(initial=True) | |
25 | + | |
26 | + def _doConvertState(self, dfaState, nfaStates): | |
27 | + for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems(): | |
28 | + nextDFAState = state.State(additionalData=nextIgnoreOrth) | |
29 | + dfaState.setTransition(label, nextDFAState) | |
30 | + dfaState.encodedData = bytearray() | |
31 | + self._doConvertState(nextDFAState, nextNFAStates) | |
32 | + | |
33 | + def convertToDFA(self): | |
34 | + dfa = fsa.FSA(encoder=None, encodeWords=False) | |
35 | + startStates = self.initialState.getClosure() | |
36 | + assert not any(filter(lambda s: s.final, startStates)) | |
37 | + dfa.initialState = state.State(additionalData=False) | |
38 | + self._doConvertState(dfa.initialState, startStates) | |
39 | + | |
40 | + | |
0 | 41 | \ No newline at end of file |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
1 | 1 | |
2 | 2 | from pyparsing import * |
3 | +ParserElement.enablePackrat() | |
3 | 4 | from morfeuszbuilder.tagset import segtypes |
4 | -from morfeuszbuilder.utils import configFile | |
5 | -from morfeuszbuilder.segrules import preprocessor | |
5 | +from morfeuszbuilder.utils import configFile, exceptions | |
6 | +from morfeuszbuilder.segrules import preprocessor, rules | |
6 | 7 | import codecs |
7 | 8 | import re |
8 | 9 | |
9 | 10 | import itertools |
10 | 11 | import logging |
11 | -import segsfsa | |
12 | - | |
13 | -# header = Suppress('[') + Word(alphas, bodyChars=alphanums+'_') + Suppress(']') | |
14 | -# define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | |
15 | -# ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | |
16 | -# endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | |
17 | - | |
18 | -def doprint(toks): | |
19 | - print toks | |
12 | +from morfeuszbuilder.segrules import rulesNFA | |
20 | 13 | |
21 | 14 | class RulesParser(object): |
22 | 15 | |
... | ... | @@ -31,7 +24,7 @@ class RulesParser(object): |
31 | 24 | key, defs = lineToParse.parseString(line) |
32 | 25 | res[key] = tuple(defs) |
33 | 26 | except Exception as ex: |
34 | - raise configFile.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | |
27 | + raise exceptions.ConfigFileException(segtypesConfigFile.filename, lineNum, u'Error in [options] section: %s' % str(ex)) | |
35 | 28 | return res |
36 | 29 | |
37 | 30 | def parse(self, filename): |
... | ... | @@ -48,12 +41,12 @@ class RulesParser(object): |
48 | 41 | |
49 | 42 | for defs in itertools.product(*key2Defs.values()): |
50 | 43 | key2Def = dict([(def2Key[define], define) for define in defs]) |
51 | - fsa = segsfsa.SegmentsFSA(key2Def) | |
44 | + nfa = rulesNFA.RulesNFA(key2Def) | |
52 | 45 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
53 | 46 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) |
54 | 47 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
55 | - fsa.addSegmentRule(rule) | |
56 | - res.append(fsa) | |
48 | + rule.addToNFA(nfa) | |
49 | + res.append(nfa) | |
57 | 50 | return res |
58 | 51 | |
59 | 52 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
... | ... | @@ -61,6 +54,12 @@ class RulesParser(object): |
61 | 54 | if not line.startswith('#'): |
62 | 55 | yield self._doParseOneLine(lineNum, line, segtypesHelper) |
63 | 56 | |
57 | + def _createNewTagRule(self, segtype, lineNum, line, segtypesHelper): | |
58 | + if not segtypesHelper.hasSegtype(segtype): | |
59 | + raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | |
60 | + else: | |
61 | + return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) | |
62 | + | |
64 | 63 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
65 | 64 | rule = Forward() |
66 | 65 | tagRule = Word(alphanums+'_') |
... | ... | @@ -74,9 +73,21 @@ class RulesParser(object): |
74 | 73 | complexRule = unaryRule ^ oneOfRule |
75 | 74 | concatRule = OneOrMore(complexRule) |
76 | 75 | rule << concatRule |
76 | + | |
77 | + tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | |
78 | + ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) | |
79 | +# parenRule.setParseAction(lambda string, loc, toks: toks[0]) | |
80 | + zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) | |
81 | + oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | |
82 | + oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) | |
83 | + concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) | |
84 | + | |
85 | + | |
77 | 86 | # rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule |
78 | 87 | |
79 | 88 | # tagRule.setParseAction(lambda s,l,toks: doprint(toks)) |
80 | 89 | # print lineNum, line |
81 | - parsedLine = rule.parseString(line, parseAll=True) | |
90 | + parsedRule = rule.parseString(line, parseAll=True)[0] | |
91 | + print parsedRule | |
92 | + return parsedRule | |
82 | 93 | # print parsedLine |
... | ... |
fsabuilder/morfeuszbuilder/segrules/segsfsa.py deleted
1 | -''' | |
2 | -Created on 24 sty 2014 | |
3 | - | |
4 | -@author: mlenart | |
5 | -''' | |
6 | - | |
7 | -class SegmentsFSAState(object): | |
8 | - | |
9 | - def __init__(self): | |
10 | - self.transitionsMap = {} | |
11 | - | |
12 | - def addSegmentRule(self, segmentRule): | |
13 | - pass | |
14 | - | |
15 | -class SegmentsFSA(object): | |
16 | - | |
17 | - def __init__(self, key2Def={}): | |
18 | - self.initialState = SegmentsFSAState() | |
19 | - | |
20 | - def addSegmentRule(self, segmentRule): | |
21 | - self.initialState.addSegmentRule(segmentRule) | |
22 | - | |
23 | - def serialize(self): | |
24 | - res = bytearray() | |
25 | - return res | |
26 | - | |
27 | - | |
28 | 0 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... | ... | @@ -9,9 +9,11 @@ from morfeuszbuilder.segrules import rulesParser |
9 | 9 | from morfeuszbuilder.tagset import tagset |
10 | 10 | |
11 | 11 | class Test(unittest.TestCase): |
12 | + print 'do test' | |
12 | 13 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) |
13 | 14 | parser = rulesParser.RulesParser(t) |
14 | 15 | parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) |
16 | + print 'done' | |
15 | 17 | |
16 | 18 | if __name__ == "__main__": |
17 | 19 | unittest.main() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -4,80 +4,85 @@ Created on 17 lut 2014 |
4 | 4 | @author: mlenart |
5 | 5 | ''' |
6 | 6 | import re |
7 | +from morfeuszbuilder.utils import exceptions | |
7 | 8 | |
8 | 9 | class Segtypes(object): |
9 | 10 | |
10 | - def __init__(self, tagset, segrulesFile): | |
11 | + def __init__(self, tagset, segrulesConfigFile): | |
11 | 12 | |
12 | 13 | self.tagset = tagset |
13 | 14 | |
14 | - self.segrulesConfigFile = segrulesFile | |
15 | + self.filename = segrulesConfigFile.filename | |
15 | 16 | |
16 | 17 | self.segtype2Segnum = {} |
17 | 18 | self.patternsList = [] |
19 | + self._readLexemes(segrulesConfigFile) | |
20 | + self._readTags(segrulesConfigFile) | |
21 | + | |
22 | + def _validate(self, msg, lineNum, cond): | |
23 | + if not cond: | |
24 | + raise exceptions.ConfigFileException(self.filename, lineNum, msg) | |
25 | + | |
26 | + def _readTags(self, segrulesConfigFile): | |
27 | + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('tags'): | |
28 | + print lineNum, line | |
29 | + splitLine = re.split(r'\s+', line.strip()) | |
30 | + self._validate( | |
31 | + u'Line in [tags] section must contain exactly two fields - segment type and tag pattern', | |
32 | + lineNum, | |
33 | + len(splitLine) == 2) | |
34 | + segtype, pattern = splitLine | |
35 | + self._validate( | |
36 | + u'Segment type must be a lowercase alphanumeric with optional underscores', | |
37 | + lineNum, | |
38 | + re.match(r'[a-z_]+', segtype)) | |
39 | + self._validate( | |
40 | + u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | |
41 | + lineNum, | |
42 | + re.match(r'[a-z_\.\:\%]+', pattern)) | |
43 | + | |
44 | + if segtype in self.segtype2Segnum: | |
45 | + segnum = self.segtype2Segnum[segtype] | |
46 | + else: | |
47 | + segnum = len(self.segtype2Segnum) | |
48 | + self.segtype2Segnum[segtype] = segnum | |
49 | + | |
50 | + self.patternsList.append(SegtypePattern(None, pattern, segnum)) | |
51 | + | |
52 | + def _readLexemes(self, segrulesConfigFile): | |
53 | + for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'): | |
54 | + segtype, pattern = line.strip().split('\t') | |
55 | + self._validate( | |
56 | + u'Segment type must be a lowercase alphanumeric with optional underscores', | |
57 | + lineNum, | |
58 | + re.match(r'[a-z_]+', segtype)) | |
59 | + self._validate( | |
60 | + u'Pattern must contain lemma and POS', | |
61 | + lineNum, | |
62 | + re.match(r'.+\:[a-z_]+', pattern, re.U)) | |
63 | + | |
64 | + if segtype in self.segtype2Segnum: | |
65 | + segnum = self.segtype2Segnum[segtype] | |
66 | + else: | |
67 | + segnum = len(self.segtype2Segnum) | |
68 | + self.segtype2Segnum[segtype] = segnum | |
69 | + | |
70 | + lemma, pos = pattern.split(':') | |
71 | + | |
72 | + self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) | |
18 | 73 | |
19 | - def readTags(self, lines): | |
20 | - inTags = False | |
21 | - for lineNum, line in enumerate(lines, start=1): | |
22 | - header = self._getHeaderValue(line, lineNum) | |
23 | - if header == 'tags': | |
24 | - inTags = True | |
25 | - elif header: | |
26 | - inTags = False | |
27 | - elif inTags: | |
28 | - segtype, pattern = line.strip().split('\t') | |
29 | - self._validate( | |
30 | - u'Segment type must be a lowercase alphanumeric with optional underscores', | |
31 | - lineNum, | |
32 | - re.match(r'[a-z_]+', segtype)) | |
33 | - self._validate( | |
34 | - u'Pattern must contain only ":", "%", "." and lowercase alphanumeric letters', | |
35 | - lineNum, | |
36 | - re.match(r'[a-z_\.\:\%]+', pattern)) | |
37 | - | |
38 | - if segtype in self.segtype2Segnum: | |
39 | - segnum = self.segtype2Segnum[segtype] | |
40 | - else: | |
41 | - segnum = len(self.segtype2Segnum) | |
42 | - self.segtype2Segnum[segtype] = segnum | |
43 | - | |
44 | - self.patternsList.append(SegtypePattern(None, pattern, segnum)) | |
74 | + def hasSegtype(self, segTypeString): | |
75 | + return segTypeString in self.segtype2Segnum | |
45 | 76 | |
46 | - def readLexemes(self, lines): | |
47 | - inLexemes = False | |
48 | - for lineNum, line in enumerate(lines, start=1): | |
49 | - header = self._getHeaderValue(line, lineNum) | |
50 | - if header == 'lexemes': | |
51 | - inLexemes = True | |
52 | - elif header: | |
53 | - inLexemes = False | |
54 | - elif inLexemes: | |
55 | - segtype, pattern = line.strip().split('\t') | |
56 | - self._validate( | |
57 | - u'Segment type must be a lowercase alphanumeric with optional underscores', | |
58 | - lineNum, | |
59 | - re.match(r'[a-z_]+', segtype)) | |
60 | - self._validate( | |
61 | - u'Pattern must contain lemma and POS', | |
62 | - lineNum, | |
63 | - re.match(r'\w+\:[a-z_]+', pattern, re.U)) | |
64 | - | |
65 | - if segtype in self.segtype2Segnum: | |
66 | - segnum = self.segtype2Segnum[segtype] | |
67 | - else: | |
68 | - segnum = len(self.segtype2Segnum) | |
69 | - self.segtype2Segnum[segtype] = segnum | |
70 | - | |
71 | - lemma, pos = pattern.split(':') | |
72 | - | |
73 | - self.patternsList.append(SegtypePattern(lemma, pos + ':%', segnum)) | |
77 | + def getSegnum4Segtype(self, segTypeString): | |
78 | + return self.segtype2Segnum[segTypeString] | |
74 | 79 | |
75 | 80 | def lexeme2Segnum(self, lemma, tag): |
76 | 81 | for p in self.patternsList: |
77 | 82 | res = p.tryToMatch(lemma, tag) |
78 | 83 | if res >= 0: |
79 | 84 | return res |
80 | - raise SegtypesException('Cannot find segment type for given tag: %s' % tag) | |
85 | + return None | |
81 | 86 | |
82 | 87 | class SegtypePattern(object): |
83 | 88 | |
... | ... | @@ -92,11 +97,3 @@ class SegtypePattern(object): |
92 | 97 | return self.segnum |
93 | 98 | else: |
94 | 99 | return -1 |
95 | - | |
96 | -class SegtypesException(Exception): | |
97 | - | |
98 | - def __init__(self, msg): | |
99 | - self.msg = msg | |
100 | - | |
101 | - def __str__(self): | |
102 | - return u'Error in segment rules: %s' % self.msg | |
... | ... |
fsabuilder/morfeuszbuilder/tagset/tagset.py
... | ... | @@ -12,10 +12,11 @@ class Tagset(object): |
12 | 12 | NAMES = 2 |
13 | 13 | SEP = '\t' |
14 | 14 | |
15 | - def __init__(self, filename, encoding='utf8'): | |
15 | + def __init__(self, filename=None, encoding='utf8'): | |
16 | 16 | self.tag2tagnum = {} |
17 | 17 | self.name2namenum = {} |
18 | - self._doInit(filename, encoding) | |
18 | + if filename: | |
19 | + self._doInit(filename, encoding) | |
19 | 20 | self.tagnum2tag = dict(map(lambda (k, v): (v, k), self.tag2tagnum.iteritems())) |
20 | 21 | |
21 | 22 | def _doInit(self, filename, encoding): |
... | ... | @@ -37,4 +38,4 @@ class Tagset(object): |
37 | 38 | res[tag] = int(tagNum) |
38 | 39 | |
39 | 40 | def getTag4Tagnum(self, tagnum): |
40 | - return self.tagnum2tag[tagnum] | |
41 | 41 | \ No newline at end of file |
42 | + return self.tagnum2tag[tagnum] | |
... | ... |
fsabuilder/morfeuszbuilder/utils/configFile.py
... | ... | @@ -6,6 +6,7 @@ Created on 18 lut 2014 |
6 | 6 | |
7 | 7 | import re |
8 | 8 | import codecs |
9 | +import exceptions | |
9 | 10 | |
10 | 11 | def getHeaderValue(line, lineNum): |
11 | 12 | m = re.match(ur'\s*\[(.*?)\]\s*(\#.*)?', line) |
... | ... | @@ -25,9 +26,9 @@ class ConfigFile(object): |
25 | 26 | |
26 | 27 | def _addSectionStart(self, sectionName, lineNum): |
27 | 28 | if not sectionName in self.sectionNames: |
28 | - raise ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) | |
29 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Invalid section: %s' % sectionName) | |
29 | 30 | if sectionName in self.section2Lines: |
30 | - raise ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) | |
31 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Duplicate section: %s' % sectionName) | |
31 | 32 | self.section2Lines[sectionName] = [] |
32 | 33 | self.currSection = sectionName |
33 | 34 | |
... | ... | @@ -35,7 +36,7 @@ class ConfigFile(object): |
35 | 36 | line = line.strip() |
36 | 37 | if line: |
37 | 38 | if self.currSection is None and not line.startswith('#'): |
38 | - raise ConfigFileException(self.filename, lineNum, 'Text outside of any section') | |
39 | + raise exceptions.ConfigFileException(self.filename, lineNum, 'Text outside of any section') | |
39 | 40 | self.section2Lines[self.currSection].append((lineNum, line)) |
40 | 41 | |
41 | 42 | def _getHeaderValue(self, line, lineNum): |
... | ... | @@ -56,13 +57,3 @@ class ConfigFile(object): |
56 | 57 | self._addSectionStart(header, lineNum) |
57 | 58 | else: |
58 | 59 | self._addLine(line, lineNum) |
59 | - | |
60 | -class ConfigFileException(Exception): | |
61 | - | |
62 | - def __init__(self, filename, lineNum, msg): | |
63 | - self.filename = filename | |
64 | - self.lineNum = lineNum | |
65 | - self.msg = msg | |
66 | - | |
67 | - def __str__(self): | |
68 | - return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) | |
... | ... |
fsabuilder/morfeuszbuilder/utils/exceptions.py
0 → 100644
1 | +''' | |
2 | +Created on Feb 19, 2014 | |
3 | + | |
4 | +@author: lennyn | |
5 | +''' | |
6 | + | |
7 | +class FSABuilderException(Exception): | |
8 | + ''' | |
9 | + Exception in configFile module | |
10 | + ''' | |
11 | + | |
12 | + def __init__(self, msg): | |
13 | + self.msg = msg | |
14 | + | |
15 | + def __str__(self): | |
16 | + return 'Failed to create FSA files: ' + self.msg | |
17 | + | |
18 | +class SegtypesException(FSABuilderException): | |
19 | + | |
20 | + def __init__(self, msg): | |
21 | + self.msg = msg | |
22 | + | |
23 | + def __str__(self): | |
24 | + return u'Error in segment rules: %s' % self.msg | |
25 | + | |
26 | +class ConfigFileException(FSABuilderException): | |
27 | + | |
28 | + def __init__(self, filename, lineNum, msg): | |
29 | + self.filename = filename | |
30 | + self.lineNum = lineNum | |
31 | + self.msg = msg | |
32 | + | |
33 | + def __str__(self): | |
34 | + return u'%s:%d - %s' % (self.filename, self.lineNum, self.msg) | |
... | ... |