Commit 6bb3241ce6ce37c55b691478adad820d84c36abd
1 parent 4ea040d0
- ctest should more or less work again
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@88 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 11 changed files with 59 additions and 29 deletions
fsabuilder/buildfsa.py
... | ... | @@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset): |
164 | 164 | _printStats(fsa) |
165 | 165 | return fsa |
166 | 166 | |
167 | -def buildGeneratorFromPoliMorf(inputFile, tagsetFile): | |
167 | +def buildGeneratorFromPoliMorf(inputFile, tagset): | |
168 | 168 | encoder = encode.Encoder4Generator() |
169 | - tagset = Tagset(tagsetFile) | |
170 | 169 | fsa = FSA(encoder, tagset) |
171 | 170 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) |
172 | 171 | for word, data in inputData: |
... | ... |
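The generator builder now takes an already-constructed Tagset object instead of a file path, mirroring buildAnalyzerFromPoliMorf, so the caller builds the Tagset once and shares it between both builders. A rough calling sketch, assuming both functions are available from buildfsa.py and that the input argument is a path to a PoliMorf dump (both file names are placeholders, not taken from this commit):

    from morfeuszbuilder.tagset import tagset

    t = tagset.Tagset('polimorf.tagset')                          # hypothetical tagset file
    analyzerFSA = buildAnalyzerFromPoliMorf('PoliMorf.tab', t)
    generatorFSA = buildGeneratorFromPoliMorf('PoliMorf.tab', t)  # the same Tagset is reused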
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
... | ... | @@ -13,6 +13,7 @@ class State(object): |
13 | 13 | |
14 | 14 | def __init__(self, additionalData=None): |
15 | 15 | self.transitionsMap = {} |
16 | + self.transitionsDataMap = {} | |
16 | 17 | self.freq = 0 |
17 | 18 | self.encodedData = None |
18 | 19 | self.reverseOffset = None |
... | ... | @@ -31,6 +32,9 @@ class State(object): |
31 | 32 | def setTransition(self, byte, nextState): |
32 | 33 | self.transitionsMap[byte] = nextState |
33 | 34 | |
35 | + def setTransitionData(self, byte, data): | |
36 | + self.transitionsDataMap[byte] = data | |
37 | + | |
34 | 38 | def hasNext(self, byte): |
35 | 39 | return byte in self.transitionsMap |
36 | 40 | |
... | ... |
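State now keeps a transitionsDataMap parallel to transitionsMap, keyed by the same byte, so setTransitionData is only meaningful for a byte that already has a transition. A minimal sketch with illustrative byte and data values:

    from morfeuszbuilder.fsa.state import State

    s0, s1 = State(), State()
    s0.setTransition(7, s1)        # ordinary transition on byte 7
    s0.setTransitionData(7, 1)     # attach per-transition data (here: the shift-orth flag)
    assert s0.transitionsDataMap[7] == 1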
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule): |
98 | 98 | self.child._doAddToNFA(intermStartState, intermEndState) |
99 | 99 | intermEndState.addTransition(None, endState) |
100 | 100 | endState.addTransition(None, intermStartState) |
101 | + | |
102 | +class ShiftOrthRule(UnaryRule): | |
103 | + | |
104 | + def __init__(self, child): | |
105 | + super(ShiftOrthRule, self).__init__(child) | |
106 | + | |
107 | + def addToNFA(self, fsa): | |
108 | + raise ValueError() | |
109 | + | |
110 | + def _doAddToNFA(self, startState, endState): | |
111 | + self.child._doAddToNFA(startState, endState) | |
112 | + startState.setTransitionData(self.child.segnum, 1) | |
... | ... |
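ShiftOrthRule deliberately raises in addToNFA: it is only meant to be nested inside a larger rule, where _doAddToNFA delegates to the child and then marks the child's transition with data 1 via setTransitionData. A hypothetical composition sketch, assuming digRule is a tag rule object with a segnum attribute (as produced by the parser below) and nfa is an existing RulesNFA:

    # roughly what the parser builds for "dig>* dig" in segmenty.dat
    shifted = ShiftOrthRule(digRule)
    wholeRule = ConcatRule([ZeroOrMoreRule(shifted), digRule])
    wholeRule.addToNFA(nfa)        # only the top-level rule calls addToNFA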
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -12,7 +12,10 @@ class RulesManager(object): |
12 | 12 | def _options2Key(self, optionsMap): |
13 | 13 | return frozenset(optionsMap.items()) |
14 | 14 | |
15 | - def addDFA4Options(self, optionsMap, dfa): | |
15 | + def getDFA(self, optionsMap): | |
16 | + return self.options2DFA[self._options2Key(optionsMap)] | |
17 | + | |
18 | + def addDFA(self, optionsMap, dfa): | |
16 | 19 | self.options2DFA[self._options2Key(optionsMap)] = dfa |
17 | 20 | |
18 | 21 | def serialize(self): |
... | ... |
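RulesManager now exposes a symmetric addDFA / getDFA pair; both key the internal map by frozenset(optionsMap.items()), so lookups do not depend on key order. A minimal sketch, with option names taken from the test below and dfa standing for a previously built automaton:

    manager.addDFA({'aggl': 'permissive', 'praet': 'split'}, dfa)
    # later: the same options, given in any order, retrieve the same DFA
    assert manager.getDFA({'praet': 'split', 'aggl': 'permissive'}) is dfa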
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... | ... | @@ -12,6 +12,7 @@ class RulesNFAState(object): |
12 | 12 | |
13 | 13 | def __init__(self, initial=False, final=False, weak=False): |
14 | 14 | self.transitionsMap = {} |
15 | + self.transitionsDataMap = {} | |
15 | 16 | self.initial = initial |
16 | 17 | self.final = final |
17 | 18 | self.weak = weak |
... | ... | @@ -21,6 +22,11 @@ class RulesNFAState(object): |
21 | 22 | def addTransition(self, label, targetState): |
22 | 23 | self.transitionsMap.setdefault(label, set()) |
23 | 24 | self.transitionsMap[label].add(targetState) |
25 | + self.transitionsDataMap[label] = 0 | |
26 | + | |
27 | + def setTransitionData(self, label, byte): | |
28 | + assert len(self.transitionsMap[label]) == 1 | |
29 | + self.transitionsDataMap[label] = byte | |
24 | 30 | |
25 | 31 | def getClosure(self, visited): |
26 | 32 | if self in visited: |
... | ... | @@ -61,9 +67,10 @@ class RulesNFA(object): |
61 | 67 | for nfaState in nfaStates: |
62 | 68 | for label, nextStates in nfaState.transitionsMap.iteritems(): |
63 | 69 | if label is not None: |
64 | - res.setdefault(label, set()) | |
70 | + transitionData = nfaState.transitionsDataMap[label] | |
71 | + res.setdefault((label, transitionData), set()) | |
65 | 72 | for nextNFAState in nextStates: |
66 | - res[label] |= nextNFAState.getClosure(set()) | |
73 | + res[(label, transitionData)] |= nextNFAState.getClosure(set()) | |
67 | 74 | # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] |
68 | 75 | return res |
69 | 76 | |
... | ... | @@ -77,7 +84,7 @@ class RulesNFA(object): |
77 | 84 | # dfaState should be final |
78 | 85 | # and contain info about weakness |
79 | 86 | dfaState.encodedData = bytearray([1 if weak else 0]) |
80 | - for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | |
87 | + for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | |
81 | 88 | # print '============' |
82 | 89 | # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] |
83 | 90 | # print 'label:', label |
... | ... | @@ -90,6 +97,7 @@ class RulesNFA(object): |
90 | 97 | nfaSubset2DFAState[key] = nextDFAState |
91 | 98 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) |
92 | 99 | dfaState.setTransition(label, nextDFAState) |
100 | + dfaState.setTransitionData(label, transitionData) | |
93 | 101 | |
94 | 102 | def convertToDFA(self): |
95 | 103 | dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) |
... | ... |
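Each NFA transition now carries a data byte (0 by default); setTransitionData asserts that the label leads to exactly one target state, which is why ShiftOrthRule only flags its single child transition. During determinization transitions are grouped by (label, transitionData) and the data is copied onto the resulting DFA transition. A rough sketch with an illustrative label:

    from morfeuszbuilder.segrules.rulesNFA import RulesNFAState

    start = RulesNFAState(initial=True)
    end = RulesNFAState(final=True)
    start.addTransition(5, end)        # transition data defaults to 0
    start.setTransitionData(5, 1)      # mark this transition as orth-shifting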
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -50,7 +50,7 @@ class RulesParser(object): |
50 | 50 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
51 | 51 | rule.addToNFA(nfa) |
52 | 52 | dfa = nfa.convertToDFA() |
53 | - res.addDFA4Options(key2Def, dfa) | |
53 | + res.addDFA(key2Def, dfa) | |
54 | 54 | return res |
55 | 55 | |
56 | 56 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
... | ... | @@ -67,9 +67,10 @@ class RulesParser(object): |
67 | 67 | |
68 | 68 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
69 | 69 | rule = Forward() |
70 | - tagRule = Word(alphanums+'_>') | |
70 | + tagRule = Word(alphanums+'_') | |
71 | + shiftOrthRule = tagRule + '>' | |
71 | 72 | parenRule = Suppress('(') + rule + Suppress(')') |
72 | - atomicRule = tagRule ^ parenRule | |
73 | + atomicRule = tagRule ^ shiftOrthRule ^ parenRule | |
73 | 74 | zeroOrMoreRule = atomicRule + Suppress('*') |
74 | 75 | oneOrMoreRule = atomicRule + Suppress('+') |
75 | 76 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
... | ... | @@ -79,6 +80,7 @@ class RulesParser(object): |
79 | 80 | rule << concatRule |
80 | 81 | |
81 | 82 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) |
83 | + shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) | |
82 | 84 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
83 | 85 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
84 | 86 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
... | ... |
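Since '>' is no longer allowed inside tagRule, a trailing '>' now parses as a separate shiftOrthRule wrapping the preceding tag, so "dig>* dig" becomes a ZeroOrMoreRule over a ShiftOrthRule plus a plain dig rule. A rough illustration of the grammar fragment in isolation (parse actions omitted; this is an assumption about how pyparsing tokenizes it, not code from this commit):

    from pyparsing import Word, alphanums

    tagRule = Word(alphanums + '_')
    shiftOrthRule = tagRule + '>'
    print(shiftOrthRule.parseString('dig>'))   # -> ['dig', '>']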
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... | ... | @@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset |
10 | 10 | from morfeuszbuilder.fsa import visualizer, serializer |
11 | 11 | |
12 | 12 | class Test(unittest.TestCase): |
13 | - print 'do test' | |
14 | - t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | |
15 | - parser = rulesParser.RulesParser(t) | |
16 | - fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | |
17 | - fsa = fsas[0] | |
18 | - for s in fsa.dfs(): | |
19 | - s.debug() | |
20 | - print 'states:', len(list(fsa.dfs())) | |
21 | - print 'transitions:', fsa.getTransitionsNum() | |
22 | - visualizer.Visualizer().visualize(fsa, charLabels=False) | |
23 | - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | |
24 | - print 'done' | |
13 | + | |
14 | + def testParser(self): | |
15 | + print 'do test' | |
16 | + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | |
17 | + parser = rulesParser.RulesParser(t) | |
18 | + rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | |
19 | + fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) | |
20 | + for s in fsa.dfs(): | |
21 | + s.debug() | |
22 | + print 'states:', len(list(fsa.dfs())) | |
23 | + print 'transitions:', fsa.getTransitionsNum() | |
24 | + visualizer.Visualizer().visualize(fsa, charLabels=False) | |
25 | + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | |
26 | + print 'done' | |
25 | 27 | |
26 | 28 | if __name__ == "__main__": |
27 | 29 | unittest.main() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
... | ... | @@ -16,7 +16,7 @@ class Test(unittest.TestCase): |
16 | 16 | |
17 | 17 | def testPreprocess(self): |
18 | 18 | filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat') |
19 | - parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) | |
19 | + parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) | |
20 | 20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') |
21 | 21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): |
22 | 22 | print (lineNum, line) |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
... | ... | @@ -111,7 +111,7 @@ moze_interp(z_on_agl) |
111 | 111 | moze_interp(z_on_agl on_agl) |
112 | 112 | |
113 | 113 | # A number written as a sequence of digits: |
114 | -#moze_interp( dig>* dig ) | |
114 | +moze_interp( dig>* dig ) | |
115 | 115 | |
116 | 116 | # Prefixal formations |
117 | 117 | #### the appropriate samodz segments need to be split out! |
... | ... | @@ -133,11 +133,11 @@ adj dywiz samodz |
133 | 133 | samodz dywiz adj |
134 | 134 | |
135 | 135 | [segment types] |
136 | -naj> | |
137 | -nie> | |
136 | +naj | |
137 | +nie | |
138 | 138 | prefs |
139 | 139 | prefv |
140 | -dig> | |
140 | +dig | |
141 | 141 | adja |
142 | 142 | adj |
143 | 143 | adj_sup |
... | ... | @@ -156,11 +156,11 @@ praet_pl |
156 | 156 | samodz |
157 | 157 | |
158 | 158 | [tags] |
159 | -naj> naj | |
160 | -nie> nie | |
159 | +naj naj | |
160 | +nie nie | |
161 | 161 | prefs prefs |
162 | 162 | prefv prefv |
163 | -dig> dig | |
163 | +dig dig | |
164 | 164 | adja adja |
165 | 165 | adj adj:%:pos |
166 | 166 | adj_sup adj:%:sup |
... | ... |
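After this change '>' is no longer part of any name in [segment types] or [tags]; it is written only inside the combination rules, directly after the segment it applies to (presumably shifting that segment's orth onto the following one). A hypothetical rule illustrating the syntax (not taken from this commit):

    # the orth of nie is attached to the following superlative adjective
    moze_interp( nie> adj_sup )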