From 6bb3241ce6ce37c55b691478adad820d84c36abd Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Mon, 24 Feb 2014 17:30:28 +0000 Subject: [PATCH] - ctest powinno znowu w miarę działać --- fsabuilder/buildfsa.py | 3 +-- fsabuilder/morfeuszbuilder/fsa/serializer.pyc | Bin 18382 -> 0 bytes fsabuilder/morfeuszbuilder/fsa/state.py | 4 ++++ fsabuilder/morfeuszbuilder/fsa/state.pyc | Bin 3563 -> 0 bytes fsabuilder/morfeuszbuilder/segrules/rules.py | 12 ++++++++++++ fsabuilder/morfeuszbuilder/segrules/rulesManager.py | 5 ++++- fsabuilder/morfeuszbuilder/segrules/rulesNFA.py | 14 +++++++++++--- fsabuilder/morfeuszbuilder/segrules/rulesParser.py | 8 +++++--- fsabuilder/morfeuszbuilder/segrules/test/parserTest.py | 26 ++++++++++++++------------ fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py | 2 +- fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat | 14 +++++++------- 11 files changed, 59 insertions(+), 29 deletions(-) diff --git a/fsabuilder/buildfsa.py b/fsabuilder/buildfsa.py index 78915da..adce6e7 100644 --- a/fsabuilder/buildfsa.py +++ b/fsabuilder/buildfsa.py @@ -164,9 +164,8 @@ def buildAnalyzerFromPoliMorf(inputFile, tagset): _printStats(fsa) return fsa -def buildGeneratorFromPoliMorf(inputFile, tagsetFile): +def buildGeneratorFromPoliMorf(inputFile, tagset): encoder = encode.Encoder4Generator() - tagset = Tagset(tagsetFile) fsa = FSA(encoder, tagset) inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) for word, data in inputData: diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.pyc b/fsabuilder/morfeuszbuilder/fsa/serializer.pyc index 7fa639b..8fbb9b4 100644 Binary files a/fsabuilder/morfeuszbuilder/fsa/serializer.pyc and b/fsabuilder/morfeuszbuilder/fsa/serializer.pyc differ diff --git a/fsabuilder/morfeuszbuilder/fsa/state.py b/fsabuilder/morfeuszbuilder/fsa/state.py index e60a7cd..5292b87 100644 --- a/fsabuilder/morfeuszbuilder/fsa/state.py +++ b/fsabuilder/morfeuszbuilder/fsa/state.py @@ -13,6 +13,7 @@ class State(object): def __init__(self, additionalData=None): self.transitionsMap = {} + self.transitionsDataMap = {} self.freq = 0 self.encodedData = None self.reverseOffset = None @@ -31,6 +32,9 @@ class State(object): def setTransition(self, byte, nextState): self.transitionsMap[byte] = nextState + def setTransitionData(self, byte, data): + self.transitionsDataMap[byte] = data + def hasNext(self, byte): return byte in self.transitionsMap diff --git a/fsabuilder/morfeuszbuilder/fsa/state.pyc b/fsabuilder/morfeuszbuilder/fsa/state.pyc index 7199fc0..194b0fd 100644 Binary files a/fsabuilder/morfeuszbuilder/fsa/state.pyc and b/fsabuilder/morfeuszbuilder/fsa/state.pyc differ diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py index 88fcccc..aff5d1a 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rules.py +++ b/fsabuilder/morfeuszbuilder/segrules/rules.py @@ -98,3 +98,15 @@ class ZeroOrMoreRule(UnaryRule): self.child._doAddToNFA(intermStartState, intermEndState) intermEndState.addTransition(None, endState) endState.addTransition(None, intermStartState) + +class ShiftOrthRule(UnaryRule): + + def __init__(self, child): + super(ShiftOrthRule, self).__init__(child) + + def addToNFA(self, fsa): + raise ValueError() + + def _doAddToNFA(self, startState, endState): + self.child._doAddToNFA(startState, endState) + startState.setTransitionData(self.child.segnum, 1) diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py index 18abbec..e70f3c4 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py @@ -12,7 +12,10 @@ class RulesManager(object): def _options2Key(self, optionsMap): return frozenset(optionsMap.items()) - def addDFA4Options(self, optionsMap, dfa): + def getDFA(self, optionsMap): + return self.options2DFA[self._options2Key(optionsMap)] + + def addDFA(self, optionsMap, dfa): self.options2DFA[self._options2Key(optionsMap)] = dfa def serialize(self): diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py index 4fedcc9..1c34957 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py @@ -12,6 +12,7 @@ class RulesNFAState(object): def __init__(self, initial=False, final=False, weak=False): self.transitionsMap = {} + self.transitionsDataMap = {} self.initial = initial self.final = final self.weak = weak @@ -21,6 +22,11 @@ class RulesNFAState(object): def addTransition(self, label, targetState): self.transitionsMap.setdefault(label, set()) self.transitionsMap[label].add(targetState) + self.transitionsDataMap[label] = 0 + + def setTransitionData(self, label, byte): + assert len(self.transitionsMap[label]) == 1 + self.transitionsDataMap[label] = byte def getClosure(self, visited): if self in visited: @@ -61,9 +67,10 @@ class RulesNFA(object): for nfaState in nfaStates: for label, nextStates in nfaState.transitionsMap.iteritems(): if label is not None: - res.setdefault(label, set()) + transitionData = nfaState.transitionsDataMap[label] + res.setdefault((label, transitionData), set()) for nextNFAState in nextStates: - res[label] |= nextNFAState.getClosure(set()) + res[(label, transitionData)] |= nextNFAState.getClosure(set()) # print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] return res @@ -77,7 +84,7 @@ class RulesNFA(object): # dfaState should be final # and contain info about weakness dfaState.encodedData = bytearray([1 if weak else 0]) - for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): + for (label, transitionData), nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): # print '============' # print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] # print 'label:', label @@ -90,6 +97,7 @@ class RulesNFA(object): nfaSubset2DFAState[key] = nextDFAState self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) dfaState.setTransition(label, nextDFAState) + dfaState.setTransitionData(label, transitionData) def convertToDFA(self): dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 90f9aa0..96cc8d5 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -50,7 +50,7 @@ class RulesParser(object): for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): rule.addToNFA(nfa) dfa = nfa.convertToDFA() - res.addDFA4Options(key2Def, dfa) + res.addDFA(key2Def, dfa) return res def _doParse(self, combinationEnumeratedLines, segtypesHelper): @@ -67,9 +67,10 @@ class RulesParser(object): def _doParseOneLine(self, lineNum, line, segtypesHelper): rule = Forward() - tagRule = Word(alphanums+'_>') + tagRule = Word(alphanums+'_') + shiftOrthRule = tagRule + '>' parenRule = Suppress('(') + rule + Suppress(')') - atomicRule = tagRule ^ parenRule + atomicRule = tagRule ^ shiftOrthRule ^ parenRule zeroOrMoreRule = atomicRule + Suppress('*') oneOrMoreRule = atomicRule + Suppress('+') unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule @@ -79,6 +80,7 @@ class RulesParser(object): rule << concatRule tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) + shiftOrthRule.setParseAction(lambda string, loc, toks: rules.ShiftOrthRule(toks[0])) # parenRule.setParseAction(lambda string, loc, toks: toks[0]) zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) diff --git a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py index a9d320c..39cbde5 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py +++ b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py @@ -10,18 +10,20 @@ from morfeuszbuilder.tagset import tagset from morfeuszbuilder.fsa import visualizer, serializer class Test(unittest.TestCase): - print 'do test' - t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) - parser = rulesParser.RulesParser(t) - fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) - fsa = fsas[0] - for s in fsa.dfs(): - s.debug() - print 'states:', len(list(fsa.dfs())) - print 'transitions:', fsa.getTransitionsNum() - visualizer.Visualizer().visualize(fsa, charLabels=False) - print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) - print 'done' + + def testParser(self): + print 'do test' + t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) + parser = rulesParser.RulesParser(t) + rulesManager = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) + fsa = rulesManager.getDFA({'aggl': 'permissive', 'praet': 'split'}) + for s in fsa.dfs(): + s.debug() + print 'states:', len(list(fsa.dfs())) + print 'transitions:', fsa.getTransitionsNum() + visualizer.Visualizer().visualize(fsa, charLabels=False) + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) + print 'done' if __name__ == "__main__": unittest.main() diff --git a/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py b/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py index 8846344..e1ebc63 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py +++ b/fsabuilder/morfeuszbuilder/segrules/test/preprocessorTest.py @@ -16,7 +16,7 @@ class Test(unittest.TestCase): def testPreprocess(self): filename = os.path.join(os.path.dirname(__file__), 'segmenty.dat') - parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) + parsedFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) linesEnum = parsedFile.enumerateLinesInSection('combinations') for lineNum, line in preprocessor.preprocess(linesEnum, ['extra', 'superextra']): print (lineNum, line) diff --git a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat index fbd9af2..80e96bc 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat +++ b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat @@ -111,7 +111,7 @@ moze_interp(z_on_agl) moze_interp(z_on_agl on_agl) # Liczba zapisana jako ciąg cyfr: -#moze_interp( dig>* dig ) +moze_interp( dig>* dig ) # Formacje prefiksalne #### trzeba wydzielić odpowiednie samodze! @@ -133,11 +133,11 @@ adj dywiz samodz samodz dywiz adj [segment types] -naj> -nie> +naj +nie prefs prefv -dig> +dig adja adj adj_sup @@ -156,11 +156,11 @@ praet_pl samodz [tags] -naj> naj -nie> nie +naj naj +nie nie prefs prefs prefv prefv -dig> dig +dig dig adja adja adj adj:%:pos adj_sup adj:%:sup -- libgit2 0.22.2