Commit 6bb3241ce6ce37c55b691478adad820d84c36abd

Authored by Michał Lenart
1 parent 4ea040d0

- ctest powinno znowu w miarę działać

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@88 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildfsa.py
... ... @@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset):
164 164 _printStats(fsa)
165 165 return fsa
166 166  
167   -def buildGeneratorFromPoliMorf(inputFile, tagsetFile):
  167 +def buildGeneratorFromPoliMorf(inputFile, tagset):
168 168 encoder = encode.Encoder4Generator()
169   - tagset = Tagset(tagsetFile)
170 169 fsa = FSA(encoder, tagset)
171 170 inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder)
172 171 for word, data in inputData:
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
... ... @@ -13,6 +13,7 @@ class State(object):
13 13  
14 14 def __init__(self, additionalData=None):
15 15 self.transitionsMap = {}
  16 + self.transitionsDataMap = {}
16 17 self.freq = 0
17 18 self.encodedData = None
18 19 self.reverseOffset = None
... ... @@ -31,6 +32,9 @@ class State(object):
31 32 def setTransition(self, byte, nextState):
32 33 self.transitionsMap[byte] = nextState
33 34  
  35 + def setTransitionData(self, byte, data):
  36 + self.transitionsDataMap[byte] = data
  37 +
34 38 def hasNext(self, byte):
35 39 return byte in self.transitionsMap
36 40  
... ...
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule):
98 98 self.child._doAddToNFA(intermStartState, intermEndState)
99 99 intermEndState.addTransition(None, endState)
100 100 endState.addTransition(None, intermStartState)
  101 +
  102 +class ShiftOrthRule(UnaryRule):
  103 +
  104 + def __init__(self, child):
  105 + super(ShiftOrthRule, self).__init__(child)
  106 +
  107 + def addToNFA(self, fsa):
  108 + raise ValueError()
  109 +
  110 + def _doAddToNFA(self, startState, endState):
  111 + self.child._doAddToNFA(startState, endState)
  112 + startState.setTransitionData(self.child.segnum, 1)
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -12,7 +12,10 @@ class RulesManager(object):
12 12 def _options2Key(self, optionsMap):
13 13 return frozenset(optionsMap.items())
14 14  
15   - def addDFA4Options(self, optionsMap, dfa):
  15 + def getDFA(self, optionsMap):
  16 + return self.options2DFA[self._options2Key(optionsMap)]
  17 +
  18 + def addDFA(self, optionsMap, dfa):
16 19 self.options2DFA[self._options2Key(optionsMap)] = dfa
17 20  
18 21 def serialize(self):
... ...
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... ... @@ -12,6 +12,7 @@ class RulesNFAState(object):
12 12  
13 13 def __init__(self, initial=False, final=False, weak=False):
14 14 self.transitionsMap = {}
  15 + self.transitionsDataMap = {}
15 16 self.initial = initial
16 17 self.final = final
17 18 self.weak = weak
... ... @@ -21,6 +22,11 @@ class RulesNFAState(object):
21 22 def addTransition(self, label, targetState):
22 23 self.transitionsMap.setdefault(label, set())
23 24 self.transitionsMap[label].add(targetState)
  25 + self.transitionsDataMap[label] = 0
  26 +
  27 + def setTransitionData(self, label, byte):
  28 + assert len(self.transitionsMap[label]) == 1
  29 + self.transitionsDataMap[label] = byte
24 30  
25 31 def getClosure(self, visited):
26 32 if self in visited:
... ... @@ -61,9 +67,10 @@ class RulesNFA(object):
61 67 for nfaState in nfaStates:
62 68 for label, nextStates in nfaState.transitionsMap.iteritems():
63 69 if label is not None:
64   - res.setdefault(label, set())
  70 + transitionData = nfaState.transitionsDataMap[label]
  71 + res.setdefault((label, transitionData), set())
65 72 for nextNFAState in nextStates:
66   - res[label] |= nextNFAState.getClosure(set())
  73 + res[(label, transitionData)] |= nextNFAState.getClosure(set())
67 74 # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)]
68 75 return res
69 76  
... ... @@ -77,7 +84,7 @@ class RulesNFA(object):
77 84 # dfaState should be final
78 85 # and contain info about weakness
79 86 dfaState.encodedData = bytearray([1 if weak else 0])
80   - for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
  87 + for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
81 88 # print '============'
82 89 # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)]
83 90 # print 'label:', label
... ... @@ -90,6 +97,7 @@ class RulesNFA(object):
90 97 nfaSubset2DFAState[key] = nextDFAState
91 98 self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState)
92 99 dfaState.setTransition(label, nextDFAState)
  100 + dfaState.setTransitionData(label, transitionData)
93 101  
94 102 def convertToDFA(self):
95 103 dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False)
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -50,7 +50,7 @@ class RulesParser(object):
50 50 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
51 51 rule.addToNFA(nfa)
52 52 dfa = nfa.convertToDFA()
53   - res.addDFA4Options(key2Def, dfa)
  53 + res.addDFA(key2Def, dfa)
54 54 return res
55 55  
56 56 def _doParse(self, combinationEnumeratedLines, segtypesHelper):
... ... @@ -67,9 +67,10 @@ class RulesParser(object):
67 67  
68 68 def _doParseOneLine(self, lineNum, line, segtypesHelper):
69 69 rule = Forward()
70   - tagRule = Word(alphanums+'_>')
  70 + tagRule = Word(alphanums+'_')
  71 + shiftOrthRule = tagRule + '>'
71 72 parenRule = Suppress('(') + rule + Suppress(')')
72   - atomicRule = tagRule ^ parenRule
  73 + atomicRule = tagRule ^ shiftOrthRule ^ parenRule
73 74 zeroOrMoreRule = atomicRule + Suppress('*')
74 75 oneOrMoreRule = atomicRule + Suppress('+')
75 76 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
... ... @@ -79,6 +80,7 @@ class RulesParser(object):
79 80 rule << concatRule
80 81  
81 82 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))
  83 + shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0]))
82 84 # parenRule.setParseAction(lambda string, loc, toks: toks[0])
83 85 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
84 86 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
... ...
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... ... @@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset
10 10 from morfeuszbuilder.fsa import visualizer, serializer
11 11  
12 12 class Test(unittest.TestCase):
13   - print 'do test'
14   - t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
15   - parser = rulesParser.RulesParser(t)
16   - fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
17   - fsa = fsas[0]
18   - for s in fsa.dfs():
19   - s.debug()
20   - print 'states:', len(list(fsa.dfs()))
21   - print 'transitions:', fsa.getTransitionsNum()
22   - visualizer.Visualizer().visualize(fsa, charLabels=False)
23   - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
24   - print 'done'
  13 +
  14 + def testParser(self):
  15 + print 'do test'
  16 + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
  17 + parser = rulesParser.RulesParser(t)
  18 + rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
  19 + fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'})
  20 + for s in fsa.dfs():
  21 + s.debug()
  22 + print 'states:', len(list(fsa.dfs()))
  23 + print 'transitions:', fsa.getTransitionsNum()
  24 + visualizer.Visualizer().visualize(fsa, charLabels=False)
  25 + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
  26 + print 'done'
25 27  
26 28 if __name__ == "__main__":
27 29 unittest.main()
... ...
fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py
... ... @@ -16,7 +16,7 @@ class Test(unittest.TestCase):
16 16  
17 17 def testPreprocess(self):
18 18 filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat')
19   - parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes'])
  19 + parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
20 20 linesEnum = parsedFile.enumerateLinesInSection('combinations')
21 21 for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']):
22 22 print (lineNum, line)
... ...
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
... ... @@ -111,7 +111,7 @@ moze_interp(z_on_agl)
111 111 moze_interp(z_on_agl on_agl)
112 112  
113 113 # Liczba zapisana jako ciąg cyfr:
114   -#moze_interp( dig>* dig )
  114 +moze_interp( dig>* dig )
115 115  
116 116 # Formacje prefiksalne
117 117 #### trzeba wydzielić odpowiednie samodze!
... ... @@ -133,11 +133,11 @@ adj dywiz samodz
133 133 samodz dywiz adj
134 134  
135 135 [segment types]
136   -naj>
137   -nie>
  136 +naj
  137 +nie
138 138 prefs
139 139 prefv
140   -dig>
  140 +dig
141 141 adja
142 142 adj
143 143 adj_sup
... ... @@ -156,11 +156,11 @@ praet_pl
156 156 samodz
157 157  
158 158 [tags]
159   -naj> naj
160   -nie> nie
  159 +naj naj
  160 +nie nie
161 161 prefs prefs
162 162 prefv prefv
163   -dig> dig
  163 +dig dig
164 164 adja adja
165 165 adj adj:%:pos
166 166 adj_sup adj:%:sup
... ...