Commit 6bb3241ce6ce37c55b691478adad820d84c36abd
1 parent: 4ea040d0
- ctest powinno znowu w miarę działać
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@88 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 11 changed files with 59 additions and 29 deletions
fsabuilder/buildfsa.py
@@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset): | @@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset): | ||
164 | _printStats(fsa) | 164 | _printStats(fsa) |
165 | return fsa | 165 | return fsa |
166 | 166 | ||
167 | -def buildGeneratorFromPoliMorf(inputFile, tagsetFile): | 167 | +def buildGeneratorFromPoliMorf(inputFile, tagset): |
168 | encoder = encode.Encoder4Generator() | 168 | encoder = encode.Encoder4Generator() |
169 | - tagset = Tagset(tagsetFile) | ||
170 | fsa = FSA(encoder, tagset) | 169 | fsa = FSA(encoder, tagset) |
171 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) | 170 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) |
172 | for word, data in inputData: | 171 | for word, data in inputData: |
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -13,6 +13,7 @@ class State(object): | @@ -13,6 +13,7 @@ class State(object): | ||
13 | 13 | ||
14 | def __init__(self, additionalData=None): | 14 | def __init__(self, additionalData=None): |
15 | self.transitionsMap = {} | 15 | self.transitionsMap = {} |
16 | + self.transitionsDataMap = {} | ||
16 | self.freq = 0 | 17 | self.freq = 0 |
17 | self.encodedData = None | 18 | self.encodedData = None |
18 | self.reverseOffset = None | 19 | self.reverseOffset = None |
@@ -31,6 +32,9 @@ class State(object): | @@ -31,6 +32,9 @@ class State(object): | ||
31 | def setTransition(self, byte, nextState): | 32 | def setTransition(self, byte, nextState): |
32 | self.transitionsMap[byte] = nextState | 33 | self.transitionsMap[byte] = nextState |
33 | 34 | ||
35 | + def setTransitionData(self, byte, data): | ||
36 | + self.transitionsDataMap[byte] = data | ||
37 | + | ||
34 | def hasNext(self, byte): | 38 | def hasNext(self, byte): |
35 | return byte in self.transitionsMap | 39 | return byte in self.transitionsMap |
36 | 40 |
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule): | @@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule): | ||
98 | self.child._doAddToNFA(intermStartState, intermEndState) | 98 | self.child._doAddToNFA(intermStartState, intermEndState) |
99 | intermEndState.addTransition(None, endState) | 99 | intermEndState.addTransition(None, endState) |
100 | endState.addTransition(None, intermStartState) | 100 | endState.addTransition(None, intermStartState) |
101 | + | ||
102 | +class ShiftOrthRule(UnaryRule): | ||
103 | + | ||
104 | + def __init__(self, child): | ||
105 | + super(ShiftOrthRule, self).__init__(child) | ||
106 | + | ||
107 | + def addToNFA(self, fsa): | ||
108 | + raise ValueError() | ||
109 | + | ||
110 | + def _doAddToNFA(self, startState, endState): | ||
111 | + self.child._doAddToNFA(startState, endState) | ||
112 | + startState.setTransitionData(self.child.segnum, 1) |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -12,7 +12,10 @@ class RulesManager(object): | @@ -12,7 +12,10 @@ class RulesManager(object): | ||
12 | def _options2Key(self, optionsMap): | 12 | def _options2Key(self, optionsMap): |
13 | return frozenset(optionsMap.items()) | 13 | return frozenset(optionsMap.items()) |
14 | 14 | ||
15 | - def addDFA4Options(self, optionsMap, dfa): | 15 | + def getDFA(self, optionsMap): |
16 | + return self.options2DFA[self._options2Key(optionsMap)] | ||
17 | + | ||
18 | + def addDFA(self, optionsMap, dfa): | ||
16 | self.options2DFA[self._options2Key(optionsMap)] = dfa | 19 | self.options2DFA[self._options2Key(optionsMap)] = dfa |
17 | 20 | ||
18 | def serialize(self): | 21 | def serialize(self): |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -12,6 +12,7 @@ class RulesNFAState(object): | @@ -12,6 +12,7 @@ class RulesNFAState(object): | ||
12 | 12 | ||
13 | def __init__(self, initial=False, final=False, weak=False): | 13 | def __init__(self, initial=False, final=False, weak=False): |
14 | self.transitionsMap = {} | 14 | self.transitionsMap = {} |
15 | + self.transitionsDataMap = {} | ||
15 | self.initial = initial | 16 | self.initial = initial |
16 | self.final = final | 17 | self.final = final |
17 | self.weak = weak | 18 | self.weak = weak |
@@ -21,6 +22,11 @@ class RulesNFAState(object): | @@ -21,6 +22,11 @@ class RulesNFAState(object): | ||
21 | def addTransition(self, label, targetState): | 22 | def addTransition(self, label, targetState): |
22 | self.transitionsMap.setdefault(label, set()) | 23 | self.transitionsMap.setdefault(label, set()) |
23 | self.transitionsMap[label].add(targetState) | 24 | self.transitionsMap[label].add(targetState) |
25 | + self.transitionsDataMap[label] = 0 | ||
26 | + | ||
27 | + def setTransitionData(self, label, byte): | ||
28 | + assert len(self.transitionsMap[label]) == 1 | ||
29 | + self.transitionsDataMap[label] = byte | ||
24 | 30 | ||
25 | def getClosure(self, visited): | 31 | def getClosure(self, visited): |
26 | if self in visited: | 32 | if self in visited: |
@@ -61,9 +67,10 @@ class RulesNFA(object): | @@ -61,9 +67,10 @@ class RulesNFA(object): | ||
61 | for nfaState in nfaStates: | 67 | for nfaState in nfaStates: |
62 | for label, nextStates in nfaState.transitionsMap.iteritems(): | 68 | for label, nextStates in nfaState.transitionsMap.iteritems(): |
63 | if label is not None: | 69 | if label is not None: |
64 | - res.setdefault(label, set()) | 70 | + transitionData = nfaState.transitionsDataMap[label] |
71 | + res.setdefault((label, transitionData), set()) | ||
65 | for nextNFAState in nextStates: | 72 | for nextNFAState in nextStates: |
66 | - res[label] |= nextNFAState.getClosure(set()) | 73 | + res[(label, transitionData)] |= nextNFAState.getClosure(set()) |
67 | # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] | 74 | # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] |
68 | return res | 75 | return res |
69 | 76 | ||
@@ -77,7 +84,7 @@ class RulesNFA(object): | @@ -77,7 +84,7 @@ class RulesNFA(object): | ||
77 | # dfaState should be final | 84 | # dfaState should be final |
78 | # and contain info about weakness | 85 | # and contain info about weakness |
79 | dfaState.encodedData = bytearray([1 if weak else 0]) | 86 | dfaState.encodedData = bytearray([1 if weak else 0]) |
80 | - for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | 87 | + for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): |
81 | # print '============' | 88 | # print '============' |
82 | # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] | 89 | # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] |
83 | # print 'label:', label | 90 | # print 'label:', label |
@@ -90,6 +97,7 @@ class RulesNFA(object): | @@ -90,6 +97,7 @@ class RulesNFA(object): | ||
90 | nfaSubset2DFAState[key] = nextDFAState | 97 | nfaSubset2DFAState[key] = nextDFAState |
91 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) | 98 | self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) |
92 | dfaState.setTransition(label, nextDFAState) | 99 | dfaState.setTransition(label, nextDFAState) |
100 | + dfaState.setTransitionData(label, transitionData) | ||
93 | 101 | ||
94 | def convertToDFA(self): | 102 | def convertToDFA(self): |
95 | dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) | 103 | dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -50,7 +50,7 @@ class RulesParser(object): | @@ -50,7 +50,7 @@ class RulesParser(object): | ||
50 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | 50 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
51 | rule.addToNFA(nfa) | 51 | rule.addToNFA(nfa) |
52 | dfa = nfa.convertToDFA() | 52 | dfa = nfa.convertToDFA() |
53 | - res.addDFA4Options(key2Def, dfa) | 53 | + res.addDFA(key2Def, dfa) |
54 | return res | 54 | return res |
55 | 55 | ||
56 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): | 56 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
@@ -67,9 +67,10 @@ class RulesParser(object): | @@ -67,9 +67,10 @@ class RulesParser(object): | ||
67 | 67 | ||
68 | def _doParseOneLine(self, lineNum, line, segtypesHelper): | 68 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
69 | rule = Forward() | 69 | rule = Forward() |
70 | - tagRule = Word(alphanums+'_>') | 70 | + tagRule = Word(alphanums+'_') |
71 | + shiftOrthRule = tagRule + '>' | ||
71 | parenRule = Suppress('(') + rule + Suppress(')') | 72 | parenRule = Suppress('(') + rule + Suppress(')') |
72 | - atomicRule = tagRule ^ parenRule | 73 | + atomicRule = tagRule ^ shiftOrthRule ^ parenRule |
73 | zeroOrMoreRule = atomicRule + Suppress('*') | 74 | zeroOrMoreRule = atomicRule + Suppress('*') |
74 | oneOrMoreRule = atomicRule + Suppress('+') | 75 | oneOrMoreRule = atomicRule + Suppress('+') |
75 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule | 76 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
@@ -79,6 +80,7 @@ class RulesParser(object): | @@ -79,6 +80,7 @@ class RulesParser(object): | ||
79 | rule << concatRule | 80 | rule << concatRule |
80 | 81 | ||
81 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | 82 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) |
83 | + shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) | ||
82 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) | 84 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
83 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) | 85 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
84 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | 86 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset | @@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset | ||
10 | from morfeuszbuilder.fsa import visualizer, serializer | 10 | from morfeuszbuilder.fsa import visualizer, serializer |
11 | 11 | ||
12 | class Test(unittest.TestCase): | 12 | class Test(unittest.TestCase): |
13 | - print 'do test' | ||
14 | - t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | ||
15 | - parser = rulesParser.RulesParser(t) | ||
16 | - fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | ||
17 | - fsa = fsas[0] | ||
18 | - for s in fsa.dfs(): | ||
19 | - s.debug() | ||
20 | - print 'states:', len(list(fsa.dfs())) | ||
21 | - print 'transitions:', fsa.getTransitionsNum() | ||
22 | - visualizer.Visualizer().visualize(fsa, charLabels=False) | ||
23 | - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | ||
24 | - print 'done' | 13 | + |
14 | + def testParser(self): | ||
15 | + print 'do test' | ||
16 | + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | ||
17 | + parser = rulesParser.RulesParser(t) | ||
18 | + rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | ||
19 | + fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) | ||
20 | + for s in fsa.dfs(): | ||
21 | + s.debug() | ||
22 | + print 'states:', len(list(fsa.dfs())) | ||
23 | + print 'transitions:', fsa.getTransitionsNum() | ||
24 | + visualizer.Visualizer().visualize(fsa, charLabels=False) | ||
25 | + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | ||
26 | + print 'done' | ||
25 | 27 | ||
26 | if __name__ == "__main__": | 28 | if __name__ == "__main__": |
27 | unittest.main() | 29 | unittest.main() |
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
@@ -16,7 +16,7 @@ class Test(unittest.TestCase): | @@ -16,7 +16,7 @@ class Test(unittest.TestCase): | ||
16 | 16 | ||
17 | def testPreprocess(self): | 17 | def testPreprocess(self): |
18 | filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat') | 18 | filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat') |
19 | - parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) | 19 | + parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) |
20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') | 20 | linesEnum = parsedFile.enumerateLinesInSection('combinations') |
21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): | 21 | for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): |
22 | print (lineNum, line) | 22 | print (lineNum, line) |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
@@ -111,7 +111,7 @@ moze_interp(z_on_agl) | @@ -111,7 +111,7 @@ moze_interp(z_on_agl) | ||
111 | moze_interp(z_on_agl on_agl) | 111 | moze_interp(z_on_agl on_agl) |
112 | 112 | ||
113 | # Liczba zapisana jako ciąg cyfr: | 113 | # Liczba zapisana jako ciąg cyfr: |
114 | -#moze_interp( dig>* dig ) | 114 | +moze_interp( dig>* dig ) |
115 | 115 | ||
116 | # Formacje prefiksalne | 116 | # Formacje prefiksalne |
117 | #### trzeba wydzielić odpowiednie samodze! | 117 | #### trzeba wydzielić odpowiednie samodze! |
@@ -133,11 +133,11 @@ adj dywiz samodz | @@ -133,11 +133,11 @@ adj dywiz samodz | ||
133 | samodz dywiz adj | 133 | samodz dywiz adj |
134 | 134 | ||
135 | [segment types] | 135 | [segment types] |
136 | -naj> | ||
137 | -nie> | 136 | +naj |
137 | +nie | ||
138 | prefs | 138 | prefs |
139 | prefv | 139 | prefv |
140 | -dig> | 140 | +dig |
141 | adja | 141 | adja |
142 | adj | 142 | adj |
143 | adj_sup | 143 | adj_sup |
@@ -156,11 +156,11 @@ praet_pl | @@ -156,11 +156,11 @@ praet_pl | ||
156 | samodz | 156 | samodz |
157 | 157 | ||
158 | [tags] | 158 | [tags] |
159 | -naj> naj | ||
160 | -nie> nie | 159 | +naj naj |
160 | +nie nie | ||
161 | prefs prefs | 161 | prefs prefs |
162 | prefv prefv | 162 | prefv prefv |
163 | -dig> dig | 163 | +dig dig |
164 | adja adja | 164 | adja adja |
165 | adj adj:%:pos | 165 | adj adj:%:pos |
166 | adj_sup adj:%:sup | 166 | adj_sup adj:%:sup |