Commit 6bb3241ce6ce37c55b691478adad820d84c36abd

Authored by Michał Lenart
1 parent 4ea040d0

- ctest powinno znowu w miarę działać

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@88 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildfsa.py
@@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset): @@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset):
164 _printStats(fsa) 164 _printStats(fsa)
165 return fsa 165 return fsa
166 166
167 -def buildGeneratorFromPoliMorf(inputFile, tagsetFile): 167 +def buildGeneratorFromPoliMorf(inputFile, tagset):
168 encoder = encode.Encoder4Generator() 168 encoder = encode.Encoder4Generator()
169 - tagset = Tagset(tagsetFile)  
170 fsa = FSA(encoder, tagset) 169 fsa = FSA(encoder, tagset)
171 inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) 170 inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder)
172 for word, data in inputData: 171 for word, data in inputData:
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -13,6 +13,7 @@ class State(object): @@ -13,6 +13,7 @@ class State(object):
13 13
14 def __init__(self, additionalData=None): 14 def __init__(self, additionalData=None):
15 self.transitionsMap = {} 15 self.transitionsMap = {}
  16 + self.transitionsDataMap = {}
16 self.freq = 0 17 self.freq = 0
17 self.encodedData = None 18 self.encodedData = None
18 self.reverseOffset = None 19 self.reverseOffset = None
@@ -31,6 +32,9 @@ class State(object): @@ -31,6 +32,9 @@ class State(object):
31 def setTransition(self, byte, nextState): 32 def setTransition(self, byte, nextState):
32 self.transitionsMap[byte] = nextState 33 self.transitionsMap[byte] = nextState
33 34
  35 + def setTransitionData(self, byte, data):
  36 + self.transitionsDataMap[byte] = data
  37 +
34 def hasNext(self, byte): 38 def hasNext(self, byte):
35 return byte in self.transitionsMap 39 return byte in self.transitionsMap
36 40
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule): @@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule):
98 self.child._doAddToNFA(intermStartState, intermEndState) 98 self.child._doAddToNFA(intermStartState, intermEndState)
99 intermEndState.addTransition(None, endState) 99 intermEndState.addTransition(None, endState)
100 endState.addTransition(None, intermStartState) 100 endState.addTransition(None, intermStartState)
  101 +
  102 +class ShiftOrthRule(UnaryRule):
  103 +
  104 + def __init__(self, child):
  105 + super(ShiftOrthRule, self).__init__(child)
  106 +
  107 + def addToNFA(self, fsa):
  108 + raise ValueError()
  109 +
  110 + def _doAddToNFA(self, startState, endState):
  111 + self.child._doAddToNFA(startState, endState)
  112 + startState.setTransitionData(self.child.segnum, 1)
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -12,7 +12,10 @@ class RulesManager(object): @@ -12,7 +12,10 @@ class RulesManager(object):
12 def _options2Key(self, optionsMap): 12 def _options2Key(self, optionsMap):
13 return frozenset(optionsMap.items()) 13 return frozenset(optionsMap.items())
14 14
15 - def addDFA4Options(self, optionsMap, dfa): 15 + def getDFA(self, optionsMap):
  16 + return self.options2DFA[self._options2Key(optionsMap)]
  17 +
  18 + def addDFA(self, optionsMap, dfa):
16 self.options2DFA[self._options2Key(optionsMap)] = dfa 19 self.options2DFA[self._options2Key(optionsMap)] = dfa
17 20
18 def serialize(self): 21 def serialize(self):
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -12,6 +12,7 @@ class RulesNFAState(object): @@ -12,6 +12,7 @@ class RulesNFAState(object):
12 12
13 def __init__(self, initial=False, final=False, weak=False): 13 def __init__(self, initial=False, final=False, weak=False):
14 self.transitionsMap = {} 14 self.transitionsMap = {}
  15 + self.transitionsDataMap = {}
15 self.initial = initial 16 self.initial = initial
16 self.final = final 17 self.final = final
17 self.weak = weak 18 self.weak = weak
@@ -21,6 +22,11 @@ class RulesNFAState(object): @@ -21,6 +22,11 @@ class RulesNFAState(object):
21 def addTransition(self, label, targetState): 22 def addTransition(self, label, targetState):
22 self.transitionsMap.setdefault(label, set()) 23 self.transitionsMap.setdefault(label, set())
23 self.transitionsMap[label].add(targetState) 24 self.transitionsMap[label].add(targetState)
  25 + self.transitionsDataMap[label] = 0
  26 +
  27 + def setTransitionData(self, label, byte):
  28 + assert len(self.transitionsMap[label]) == 1
  29 + self.transitionsDataMap[label] = byte
24 30
25 def getClosure(self, visited): 31 def getClosure(self, visited):
26 if self in visited: 32 if self in visited:
@@ -61,9 +67,10 @@ class RulesNFA(object): @@ -61,9 +67,10 @@ class RulesNFA(object):
61 for nfaState in nfaStates: 67 for nfaState in nfaStates:
62 for label, nextStates in nfaState.transitionsMap.iteritems(): 68 for label, nextStates in nfaState.transitionsMap.iteritems():
63 if label is not None: 69 if label is not None:
64 - res.setdefault(label, set()) 70 + transitionData = nfaState.transitionsDataMap[label]
  71 + res.setdefault((label, transitionData), set())
65 for nextNFAState in nextStates: 72 for nextNFAState in nextStates:
66 - res[label] |= nextNFAState.getClosure(set()) 73 + res[(label, transitionData)] |= nextNFAState.getClosure(set())
67 # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] 74 # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)]
68 return res 75 return res
69 76
@@ -77,7 +84,7 @@ class RulesNFA(object): @@ -77,7 +84,7 @@ class RulesNFA(object):
77 # dfaState should be final 84 # dfaState should be final
78 # and contain info about weakness 85 # and contain info about weakness
79 dfaState.encodedData = bytearray([1 if weak else 0]) 86 dfaState.encodedData = bytearray([1 if weak else 0])
80 - for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): 87 + for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
81 # print '============' 88 # print '============'
82 # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] 89 # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)]
83 # print 'label:', label 90 # print 'label:', label
@@ -90,6 +97,7 @@ class RulesNFA(object): @@ -90,6 +97,7 @@ class RulesNFA(object):
90 nfaSubset2DFAState[key] = nextDFAState 97 nfaSubset2DFAState[key] = nextDFAState
91 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) 98 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState)
92 dfaState.setTransition(label, nextDFAState) 99 dfaState.setTransition(label, nextDFAState)
  100 + dfaState.setTransitionData(label, transitionData)
93 101
94 def convertToDFA(self): 102 def convertToDFA(self):
95 dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) 103 dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False)
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -50,7 +50,7 @@ class RulesParser(object): @@ -50,7 +50,7 @@ class RulesParser(object):
50 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): 50 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
51 rule.addToNFA(nfa) 51 rule.addToNFA(nfa)
52 dfa = nfa.convertToDFA() 52 dfa = nfa.convertToDFA()
53 - res.addDFA4Options(key2Def, dfa) 53 + res.addDFA(key2Def, dfa)
54 return res 54 return res
55 55
56 def _doParse(self, combinationEnumeratedLines, segtypesHelper): 56 def _doParse(self, combinationEnumeratedLines, segtypesHelper):
@@ -67,9 +67,10 @@ class RulesParser(object): @@ -67,9 +67,10 @@ class RulesParser(object):
67 67
68 def _doParseOneLine(self, lineNum, line, segtypesHelper): 68 def _doParseOneLine(self, lineNum, line, segtypesHelper):
69 rule = Forward() 69 rule = Forward()
70 - tagRule = Word(alphanums+'_>') 70 + tagRule = Word(alphanums+'_')
  71 + shiftOrthRule = tagRule + '>'
71 parenRule = Suppress('(') + rule + Suppress(')') 72 parenRule = Suppress('(') + rule + Suppress(')')
72 - atomicRule = tagRule ^ parenRule 73 + atomicRule = tagRule ^ shiftOrthRule ^ parenRule
73 zeroOrMoreRule = atomicRule + Suppress('*') 74 zeroOrMoreRule = atomicRule + Suppress('*')
74 oneOrMoreRule = atomicRule + Suppress('+') 75 oneOrMoreRule = atomicRule + Suppress('+')
75 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule 76 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
@@ -79,6 +80,7 @@ class RulesParser(object): @@ -79,6 +80,7 @@ class RulesParser(object):
79 rule << concatRule 80 rule << concatRule
80 81
81 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) 82 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))
  83 + shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0]))
82 # parenRule.setParseAction(lambda string, loc, toks: toks[0]) 84 # parenRule.setParseAction(lambda string, loc, toks: toks[0])
83 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) 85 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
84 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) 86 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset @@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset
10 from morfeuszbuilder.fsa import visualizer, serializer 10 from morfeuszbuilder.fsa import visualizer, serializer
11 11
12 class Test(unittest.TestCase): 12 class Test(unittest.TestCase):
13 - print 'do test'  
14 - t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))  
15 - parser = rulesParser.RulesParser(t)  
16 - fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))  
17 - fsa = fsas[0]  
18 - for s in fsa.dfs():  
19 - s.debug()  
20 - print 'states:', len(list(fsa.dfs()))  
21 - print 'transitions:', fsa.getTransitionsNum()  
22 - visualizer.Visualizer().visualize(fsa, charLabels=False)  
23 - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))  
24 - print 'done' 13 +
  14 + def testParser(self):
  15 + print 'do test'
  16 + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
  17 + parser = rulesParser.RulesParser(t)
  18 + rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
  19 + fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'})
  20 + for s in fsa.dfs():
  21 + s.debug()
  22 + print 'states:', len(list(fsa.dfs()))
  23 + print 'transitions:', fsa.getTransitionsNum()
  24 + visualizer.Visualizer().visualize(fsa, charLabels=False)
  25 + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
  26 + print 'done'
25 27
26 if __name__ == "__main__": 28 if __name__ == "__main__":
27 unittest.main() 29 unittest.main()
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
@@ -16,7 +16,7 @@ class Test(unittest.TestCase): @@ -16,7 +16,7 @@ class Test(unittest.TestCase):
16 16
17 def testPreprocess(self): 17 def testPreprocess(self):
18 filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat') 18 filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat')
19 - parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) 19 + parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
20 linesEnum = parsedFile.enumerateLinesInSection('combinations') 20 linesEnum = parsedFile.enumerateLinesInSection('combinations')
21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): 21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
22 print (lineNum, line) 22 print (lineNum, line)
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
@@ -111,7 +111,7 @@ moze_interp(z_on_agl) @@ -111,7 +111,7 @@ moze_interp(z_on_agl)
111 moze_interp(z_on_agl on_agl) 111 moze_interp(z_on_agl on_agl)
112 112
113 # Liczba zapisana jako ciąg cyfr: 113 # Liczba zapisana jako ciąg cyfr:
114 -#moze_interp( dig>* dig ) 114 +moze_interp( dig>* dig )
115 115
116 # Formacje prefiksalne 116 # Formacje prefiksalne
117 #### trzeba wydzielić odpowiednie samodze! 117 #### trzeba wydzielić odpowiednie samodze!
@@ -133,11 +133,11 @@ adj dywiz samodz @@ -133,11 +133,11 @@ adj dywiz samodz
133 samodz dywiz adj 133 samodz dywiz adj
134 134
135 [segment types] 135 [segment types]
136 -naj>  
137 -nie> 136 +naj
  137 +nie
138 prefs 138 prefs
139 prefv 139 prefv
140 -dig> 140 +dig
141 adja 141 adja
142 adj 142 adj
143 adj_sup 143 adj_sup
@@ -156,11 +156,11 @@ praet_pl @@ -156,11 +156,11 @@ praet_pl
156 samodz 156 samodz
157 157
158 [tags] 158 [tags]
159 -naj> naj  
160 -nie> nie 159 +naj naj
  160 +nie nie
161 prefs prefs 161 prefs prefs
162 prefv prefv 162 prefv prefv
163 -dig> dig 163 +dig dig
164 adja adja 164 adja adja
165 adj adj:%:pos 165 adj adj:%:pos
166 adj_sup adj:%:sup 166 adj_sup adj:%:sup