diff --git a/fsabuilder/PoliMorfTest.cek b/fsabuilder/PoliMorfTest.cek new file mode 100644 index 0000000..a6c0df5 --- /dev/null +++ b/fsabuilder/PoliMorfTest.cek @@ -0,0 +1,128 @@ +bij ABć+impt:sg:sec:imperf+pospolita +bija AAć+fin:sg:ter:imperf+pospolita +bijacie ADć+fin:pl:sec:imperf+pospolita +bijaj ABć+impt:sg:sec:imperf+pospolita +bijajcie AEć+impt:pl:sec:imperf+pospolita +bijajmy ADć+impt:pl:pri:imperf+pospolita +bijają ACć+fin:pl:ter:imperf+pospolita +bijając ADć+pcon:imperf+pospolita +bijająca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita +bijające AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita +bijającego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijającej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita +bijającemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijający AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita +bijających AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijającym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijającymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijającą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita +bijak AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita +bijaka AB+subst:sg:gen:m3+pospolita +bijakach AD+subst:pl:loc:m3+pospolita +bijakami AD+subst:pl:inst:m3+pospolita +bijaki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita +bijakiem AD+subst:sg:inst:m3+pospolita +bijakom AC+subst:pl:dat:m3+pospolita +bijakowa ABy+adj:sg:nom.voc:f:pos+pospolita +bijakowe ABy+adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:sg:acc:n1.n2:pos+pospolita|ABy+adj:sg:nom.voc:n1.n2:pos+pospolita +bijakowego ADy+adj:sg:acc:m1.m2:pos+pospolita|ADy+adj:sg:gen:m1.m2.m3.n1.n2:pos+pospolita +bijakowej ACy+adj:sg:dat:f:pos+pospolita|ACy+adj:sg:gen:f:pos+pospolita|ACy+adj:sg:loc:f:pos+pospolita +bijakowemu ADy+adj:sg:dat:m1.m2.m3.n1.n2:pos+pospolita +bijakowi ABy+adj:pl:nom.voc:m1.p1:pos+pospolita|AD+subst:sg:dat:m3+pospolita +bijakowo ABy+adja+pospolita +bijakowości ACć+subst:pl:acc:f+pospolita|ACć+subst:pl:gen:f+pospolita|ACć+subst:pl:nom:f+pospolita|ACć+subst:pl:voc:f+pospolita|ACć+subst:sg:dat:f+pospolita|ACć+subst:sg:gen:f+pospolita|ACć+subst:sg:loc:f+pospolita|ACć+subst:sg:voc:f+pospolita +bijakowościach AFć+subst:pl:loc:f+pospolita +bijakowościami AFć+subst:pl:inst:f+pospolita +bijakowościom AEć+subst:pl:dat:f+pospolita +bijakowością ADć+subst:sg:inst:f+pospolita +bijakowość AA+subst:sg:acc:f+pospolita|AA+subst:sg:nom:f+pospolita +bijakowy AA+adj:sg:acc:m3:pos+pospolita|AA+adj:sg:nom.voc:m1.m2.m3:pos+pospolita +bijakowych AC+adj:pl:acc:m1.p1:pos+pospolita|AC+adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AC+adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita +bijakowym AB+adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AB+adj:sg:inst:m1.m2.m3.n1.n2:pos+pospolita|AB+adj:sg:loc:m1.m2.m3.n1.n2:pos+pospolita +bijakowymi AC+adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita +bijakową ABy+adj:sg:acc:f:pos+pospolita|ABy+adj:sg:inst:f:pos+pospolita +bijaku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita +bijaków AC+subst:pl:gen:m3+pospolita +bijali ACć+praet:pl:m1.p1:imperf+pospolita +bijam ABć+fin:sg:pri:imperf+pospolita +bijamy ACć+fin:pl:pri:imperf+pospolita +bijana ACć+ppas:sg:nom.voc:f:imperf:aff+pospolita +bijance ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita +bijane ACć+ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|ACć+ppas:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita +bijanego AEć+ppas:sg:acc:m1.m2:imperf:aff+pospolita|AEć+ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijanej ADć+ppas:sg:gen.dat.loc:f:imperf:aff+pospolita +bijanek ACka+subst:pl:gen:f+pospolita +bijanemu AEć+ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijani ACć+ppas:pl:nom.voc:m1.p1:imperf:aff+pospolita +bijania ADć+ger:sg:gen:n2:imperf:aff+pospolita +bijanie ADć+ger:sg:nom.acc:n2:imperf:aff+pospolita +bijaniem AEć+ger:sg:inst:n2:imperf:aff+pospolita +bijaniu ADć+ger:sg:dat.loc:n2:imperf:aff+pospolita +bijanka AA+subst:sg:nom:f+pospolita +bijankach AC+subst:pl:loc:f+pospolita +bijankami AC+subst:pl:inst:f+pospolita +bijanki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita +bijanko ABa+subst:sg:voc:f+pospolita +bijankom ACa+subst:pl:dat:f+pospolita +bijanką ABa+subst:sg:inst:f+pospolita +bijankę ABa+subst:sg:acc:f+pospolita +bijano ACć+imps:imperf+pospolita +bijany ACć+ppas:sg:acc:m3:imperf:aff+pospolita|ACć+ppas:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita +bijanych AEć+ppas:pl:acc:m1.p1:imperf:aff+pospolita|AEć+ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijanym ADć+ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|ADć+ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijanymi AEć+ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijaną ACć+ppas:sg:acc.inst:f:imperf:aff+pospolita +bijasz ACć+fin:sg:sec:imperf+pospolita +bijatyce ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita +bijatyk AAa+subst:pl:gen:f+pospolita +bijatyka AA+subst:sg:nom:f+pospolita +bijatykach AC+subst:pl:loc:f+pospolita +bijatykami AC+subst:pl:inst:f+pospolita +bijatyki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita +bijatyko ABa+subst:sg:voc:f+pospolita +bijatykom ACa+subst:pl:dat:f+pospolita +bijatyką ABa+subst:sg:inst:f+pospolita +bijatykę ABa+subst:sg:acc:f+pospolita +bijać AA+inf:imperf+pospolita +bijał ABć+praet:sg:m1.m2.m3:imperf+pospolita +bijała ACć+praet:sg:f:imperf+pospolita +bijało ACć+praet:sg:n1.n2:imperf+pospolita +bijały ACć+praet:pl:m2.m3.f.n1.n2.p2.p3:imperf+pospolita +bijcie AEć+impt:pl:sec:imperf+pospolita +bije ACć+fin:sg:ter:imperf+pospolita +bijecie AFć+fin:pl:sec:imperf+pospolita +bijekcja AA+subst:sg:nom:f+pospolita +bijekcjach AC+subst:pl:loc:f+pospolita +bijekcjami AC+subst:pl:inst:f+pospolita +bijekcje ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita +bijekcji ABa+subst:pl:gen:f+pospolita|ABa+subst:sg:dat:f+pospolita|ABa+subst:sg:gen:f+pospolita|ABa+subst:sg:loc:f+pospolita +bijekcjo ABa+subst:sg:voc:f+pospolita +bijekcjom ACa+subst:pl:dat:f+pospolita +bijekcją ABa+subst:sg:inst:f+pospolita +bijekcję ABa+subst:sg:acc:f+pospolita +bijekcyj ACja+subst:pl:gen:f+pospolita +bijemy AEć+fin:pl:pri:imperf+pospolita +bijesz AEć+fin:sg:sec:imperf+pospolita +bijmy ADć+impt:pl:pri:imperf+pospolita +bijnik AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita +bijnika AB+subst:sg:gen:m3+pospolita +bijnikach AD+subst:pl:loc:m3+pospolita +bijnikami AD+subst:pl:inst:m3+pospolita +bijniki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita +bijnikiem AD+subst:sg:inst:m3+pospolita +bijnikom AC+subst:pl:dat:m3+pospolita +bijnikowi AD+subst:sg:dat:m3+pospolita +bijniku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita +bijników AC+subst:pl:gen:m3+pospolita +biją ACć+fin:pl:ter:imperf+pospolita +bijąc ADć+pcon:imperf+pospolita +bijąca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita +bijące AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita +bijącego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijącej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita +bijącemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijący AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita +bijących AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijącym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita +bijącymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita +bijącą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita diff --git a/fsabuilder/buildfsa.py b/fsabuilder/buildfsa.py index 58be349..78915da 100644 --- a/fsabuilder/buildfsa.py +++ b/fsabuilder/buildfsa.py @@ -10,9 +10,10 @@ import logging import codecs from morfeuszbuilder.fsa import encode from morfeuszbuilder.fsa import convertinput -from morfeuszbuilder.fsa import common from morfeuszbuilder.fsa.fsa import FSA from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer +from morfeuszbuilder.tagset.tagset import Tagset +from morfeuszbuilder.segrules import rulesParser from optparse import OptionParser # class InputFormat(): @@ -50,6 +51,10 @@ def _parseOptions(): dest='tagsetFile', metavar='FILE', help='path to the file with tagset') + parser.add_option('--segments-file', + dest='segmentsFile', + metavar='FILE', + help='path to the file with segment rules') parser.add_option('-o', '--output-file', dest='outputFile', metavar='FILE', @@ -107,6 +112,8 @@ def _parseOptions(): _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") _checkExactlyOneOptionSet([opts.analyzer, opts.generator], parser, 'Must set exactly one FSA type: --analyzer or --generator') + if opts.analyzer: + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')' @@ -147,9 +154,8 @@ def _printStats(fsa): logging.info('sink states num: '+str(sinkNum)) logging.info('array states num: '+str(arrayNum)) -def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): +def buildAnalyzerFromPoliMorf(inputFile, tagset): encoder = encode.MorphEncoder() - tagset = common.Tagset(tagsetFile) fsa = FSA(encoder, tagset) inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) for word, data in inputData: @@ -160,7 +166,7 @@ def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): def buildGeneratorFromPoliMorf(inputFile, tagsetFile): encoder = encode.Encoder4Generator() - tagset = common.Tagset(tagsetFile) + tagset = Tagset(tagsetFile) fsa = FSA(encoder, tagset) inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) for word, data in inputData: @@ -175,10 +181,15 @@ def main(opts): else: logging.basicConfig(level=logging.INFO) + tagset = Tagset(opts.tagsetFile) + if opts.analyzer: - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, opts.tagsetFile) + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset) + segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile) + additionalData = segmentRulesManager.serialize() else: - fsa = buildGeneratorFromPoliMorf(opts.inputFile, opts.tagsetFile) + fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset) + additionalData = bytearray() if opts.trainFile: logging.info('training with '+opts.trainFile+' ...') diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py index 7f94fc0..0d0a6e6 100644 --- a/fsabuilder/morfeuszbuilder/fsa/fsa.py +++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py @@ -119,4 +119,3 @@ class FSA(object): state.reverseOffset = currReverseOffset for state in self.initialState.dfs(set()): state.offset = currReverseOffset - state.reverseOffset - \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.pyc b/fsabuilder/morfeuszbuilder/fsa/fsa.pyc index 86dd415..2e602c4 100644 --- a/fsabuilder/morfeuszbuilder/fsa/fsa.pyc +++ b/fsabuilder/morfeuszbuilder/fsa/fsa.pyc diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.py b/fsabuilder/morfeuszbuilder/fsa/serializer.py index 99dab22..852d05d 100644 --- a/fsabuilder/morfeuszbuilder/fsa/serializer.py +++ b/fsabuilder/morfeuszbuilder/fsa/serializer.py @@ -45,16 +45,15 @@ class Serializer(object): def serialize2BinaryFile(self, fname): with open(fname, 'wb') as f: - f.write(self.fsa2bytearray()) + f.write(self.fsa2bytearray(self.serializeTagset(self.fsa.tagset))) def getStateSize(self, state): raise NotImplementedError('Not implemented') - def fsa2bytearray(self): + def fsa2bytearray(self, additionalData=bytearray()): res = bytearray() - res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) + res.extend(self.serializePrologue(additionalData)) self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) - logging.debug('SERIALIZE') for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): res.extend(self.state2bytearray(state)) return res diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.pyc b/fsabuilder/morfeuszbuilder/fsa/serializer.pyc index fee65ce..7fa639b 100644 --- a/fsabuilder/morfeuszbuilder/fsa/serializer.pyc +++ b/fsabuilder/morfeuszbuilder/fsa/serializer.pyc diff --git a/fsabuilder/morfeuszbuilder/fsa/state.py b/fsabuilder/morfeuszbuilder/fsa/state.py index 1ae33ea..e60a7cd 100644 --- a/fsabuilder/morfeuszbuilder/fsa/state.py +++ b/fsabuilder/morfeuszbuilder/fsa/state.py @@ -8,6 +8,8 @@ class State(object): ''' A state in an automaton ''' + + statesCounter = 0 def __init__(self, additionalData=None): self.transitionsMap = {} @@ -18,6 +20,9 @@ class State(object): self.label2Freq = {} self.serializeAsArray = False self.additionalData = additionalData + + self.idx = State.statesCounter + State.statesCounter += 1 @property def transitionsNum(self): @@ -51,10 +56,16 @@ class State(object): else: return self.encodedData - def dfs(self, alreadyVisited=set(), sortKey=lambda (_, state): -state.freq): + def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq): if not self in alreadyVisited: + alreadyVisited.add(self) for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey): for state1 in state.dfs(alreadyVisited): yield state1 - alreadyVisited.add(self) yield self + + def debug(self): + print '----------------' + print 'STATE:', self.idx + for label, s in self.transitionsMap.iteritems(): + print label, '-->', s.idx diff --git a/fsabuilder/morfeuszbuilder/fsa/state.pyc b/fsabuilder/morfeuszbuilder/fsa/state.pyc index e0dc825..7199fc0 100644 --- a/fsabuilder/morfeuszbuilder/fsa/state.pyc +++ b/fsabuilder/morfeuszbuilder/fsa/state.pyc diff --git a/fsabuilder/morfeuszbuilder/fsa/visualizer.py b/fsabuilder/morfeuszbuilder/fsa/visualizer.py index 78c4410..ecc0d70 100644 --- a/fsabuilder/morfeuszbuilder/fsa/visualizer.py +++ b/fsabuilder/morfeuszbuilder/fsa/visualizer.py @@ -12,7 +12,7 @@ class Visualizer(object): def __init__(self): pass - def visualize(self, fsa): + def visualize(self, fsa, charLabels=True): G = nx.DiGraph() allStates = list(reversed(list(fsa.initialState.dfs(set())))) edgeLabelsMap = {} @@ -21,10 +21,12 @@ class Visualizer(object): G.add_node(idx, offset=state.offset) for c, targetState in state.transitionsMap.iteritems(): G.add_edge(idx, allStates.index(targetState)) - label = chr(c) if c <= 127 else '%' + label = (chr(c) if c <= 127 else '%') if charLabels \ + else c edgeLabelsMap[(idx, allStates.index(targetState))] = label nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')' pos=nx.shell_layout(G) +# pos=nx.random_layout(G) nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]), diff --git a/fsabuilder/morfeuszbuilder/fsa/visualizer.pyc b/fsabuilder/morfeuszbuilder/fsa/visualizer.pyc index b43e41c..8e5eeda 100644 --- a/fsabuilder/morfeuszbuilder/fsa/visualizer.pyc +++ b/fsabuilder/morfeuszbuilder/fsa/visualizer.pyc diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py index 1e3250b..2313709 100644 --- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py +++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py @@ -6,8 +6,7 @@ Created on 23 sty 2014 import re from pyparsing import * -identifier = Word(alphas, bodyChars=alphanums+'_') -token = Word(alphas, bodyChars=alphanums+'_+>') +identifier = Word(alphas, bodyChars=alphanums+'_>*+') define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py index 1376a9c..88fcccc 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rules.py +++ b/fsabuilder/morfeuszbuilder/segrules/rules.py @@ -34,6 +34,9 @@ class TagRule(SegmentRule): def _doAddToNFA(self, startState, endState): startState.addTransition(self.segnum, endState) + + def __str__(self): + return u''+self.segnum class UnaryRule(SegmentRule): @@ -95,12 +98,3 @@ class ZeroOrMoreRule(UnaryRule): self.child._doAddToNFA(intermStartState, intermEndState) intermEndState.addTransition(None, endState) endState.addTransition(None, intermStartState) - -class IgnoreOrthRule(UnaryRule): - - def __init__(self, child): - super(IgnoreOrthRule, self).__init__(child) - - def _doAddToNFA(self, startState, endState): - startState.addTransition(self.child.segnum, endState, ignoreOrth=True) - diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py new file mode 100644 index 0000000..18abbec --- /dev/null +++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py @@ -0,0 +1,19 @@ +''' +Created on 20 lut 2014 + +@author: mlenart +''' + +class RulesManager(object): + + def __init__(self): + self.options2DFA = {} + + def _options2Key(self, optionsMap): + return frozenset(optionsMap.items()) + + def addDFA4Options(self, optionsMap, dfa): + self.options2DFA[self._options2Key(optionsMap)] = dfa + + def serialize(self): + pass \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py index 56c59ce..4fedcc9 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py @@ -8,33 +8,98 @@ from morfeuszbuilder.fsa import fsa, state, encode class RulesNFAState(object): - def __init__(self, initial=False, final=False): + statesCounter = 0 + + def __init__(self, initial=False, final=False, weak=False): self.transitionsMap = {} self.initial = initial self.final = final + self.weak = weak + self.idx = RulesNFAState.statesCounter + RulesNFAState.statesCounter += 1 + + def addTransition(self, label, targetState): + self.transitionsMap.setdefault(label, set()) + self.transitionsMap[label].add(targetState) + + def getClosure(self, visited): + if self in visited: + return set() + else: + visited.add(self) + res = set() + res.add(self) + for nextState in self.transitionsMap.get(None, []): + if self.idx in [6,8,4]: + print nextState.idx + print self.transitionsMap + res |= nextState.getClosure(visited) + return res - def addTransition(self, label, targetState, ignoreOrth=False): - assert not ignoreOrth or label is not None - self.transitionsMap.setdefault((label, ignoreOrth), set()) - self.transitionsMap[(label, ignoreOrth)].add(targetState) + def dfs(self, visitedStates=set()): + if not self in visitedStates: + visitedStates.add(self) + yield self + for _, nextStates in self.transitionsMap.iteritems(): + for state in nextStates: + for state1 in state.dfs(): + yield state1 + + def debug(self): + print '----------------' + print 'STATE:', self.idx + for label, nextStates in self.transitionsMap.iteritems(): + print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)] class RulesNFA(object): - def __init__(self, key2Def={}): + def __init__(self): self.initialState = RulesNFAState(initial=True) - def _doConvertState(self, dfaState, nfaStates): - for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems(): - nextDFAState = state.State(additionalData=nextIgnoreOrth) + def _groupOutputByLabels(self, nfaStates): + res = {} + for nfaState in nfaStates: + for label, nextStates in nfaState.transitionsMap.iteritems(): + if label is not None: + res.setdefault(label, set()) + for nextNFAState in nextStates: + res[label] |= nextNFAState.getClosure(set()) +# print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] + return res + + def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): + assert all(map(lambda state: state.weak, nfaStates)) \ + or not any(map(lambda state: state.weak, nfaStates)) + weak = all(map(lambda state: state.weak, nfaStates)) + final = any(map(lambda state: state.final, nfaStates)) + assert not weak or not final + if final: + # dfaState should be final + # and contain info about weakness + dfaState.encodedData = bytearray([1 if weak else 0]) + for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): +# print '============' +# print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] +# print 'label:', label +# print 'nextStates:', [s.idx for s in sorted(nextNFAStates, key=lambda s: s.idx)] + key = frozenset(nextNFAStates) + if key in nfaSubset2DFAState: + nextDFAState = nfaSubset2DFAState[key] + else: + nextDFAState = state.State() + nfaSubset2DFAState[key] = nextDFAState + self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) dfaState.setTransition(label, nextDFAState) - dfaState.encodedData = bytearray() - self._doConvertState(nextDFAState, nextNFAStates) def convertToDFA(self): - dfa = fsa.FSA(encoder=None, encodeWords=False) - startStates = self.initialState.getClosure() + dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) + startStates = self.initialState.getClosure(set()) assert not any(filter(lambda s: s.final, startStates)) dfa.initialState = state.State(additionalData=False) - self._doConvertState(dfa.initialState, startStates) - + self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) + return dfa + + def debug(self): + for state in self.initialState.dfs(): + state.debug() \ No newline at end of file diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 398e6a6..90f9aa0 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -3,7 +3,7 @@ from pyparsing import * ParserElement.enablePackrat() from morfeuszbuilder.tagset import segtypes from morfeuszbuilder.utils import configFile, exceptions -from morfeuszbuilder.segrules import preprocessor, rules +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager import codecs import re @@ -28,9 +28,9 @@ class RulesParser(object): return res def parse(self, filename): - res = [] + res = rulesManager.RulesManager() - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) key2Defs = self._getKey2Defs(segtypesConfigFile) segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) @@ -39,14 +39,18 @@ class RulesParser(object): for define in defs: def2Key[define] = key + firstNFA = None for defs in itertools.product(*key2Defs.values()): key2Def = dict([(def2Key[define], define) for define in defs]) - nfa = rulesNFA.RulesNFA(key2Def) + nfa = rulesNFA.RulesNFA() + if not firstNFA: + firstNFA = nfa combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): rule.addToNFA(nfa) - res.append(nfa) + dfa = nfa.convertToDFA() + res.addDFA4Options(key2Def, dfa) return res def _doParse(self, combinationEnumeratedLines, segtypesHelper): @@ -58,14 +62,14 @@ class RulesParser(object): if not segtypesHelper.hasSegtype(segtype): raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) else: +# return rules.TagRule(segtype) return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) def _doParseOneLine(self, lineNum, line, segtypesHelper): rule = Forward() - tagRule = Word(alphanums+'_') - ignoreOrthRule = tagRule + Suppress('>') + tagRule = Word(alphanums+'_>') parenRule = Suppress('(') + rule + Suppress(')') - atomicRule = tagRule ^ ignoreOrthRule ^ parenRule + atomicRule = tagRule ^ parenRule zeroOrMoreRule = atomicRule + Suppress('*') oneOrMoreRule = atomicRule + Suppress('+') unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule @@ -75,19 +79,10 @@ class RulesParser(object): rule << concatRule tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) - ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) # parenRule.setParseAction(lambda string, loc, toks: toks[0]) zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) - - -# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule - -# tagRule.setParseAction(lambda s,l,toks: doprint(toks)) -# print lineNum, line parsedRule = rule.parseString(line, parseAll=True)[0] - print parsedRule return parsedRule -# print parsedLine diff --git a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py index f74556d..a9d320c 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py +++ b/fsabuilder/morfeuszbuilder/segrules/test/parserTest.py @@ -7,12 +7,20 @@ import unittest import os from morfeuszbuilder.segrules import rulesParser from morfeuszbuilder.tagset import tagset +from morfeuszbuilder.fsa import visualizer, serializer class Test(unittest.TestCase): print 'do test' t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) parser = rulesParser.RulesParser(t) - parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) + fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) + fsa = fsas[0] + for s in fsa.dfs(): + s.debug() + print 'states:', len(list(fsa.dfs())) + print 'transitions:', fsa.getTransitionsNum() + visualizer.Visualizer().visualize(fsa, charLabels=False) + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) print 'done' if __name__ == "__main__": diff --git a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat index b55cbef..fbd9af2 100644 --- a/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat +++ b/fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat @@ -103,7 +103,7 @@ moze_interp( naj> adj_sup ) # Formy „zanegowane” gerundiów i imiesłowów: # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: -moze_interp( nie > negat ) +moze_interp( nie> negat ) # Przyimki akceptujące krótką formę „-ń” moze_interp(z_on_agl) @@ -111,7 +111,7 @@ moze_interp(z_on_agl) moze_interp(z_on_agl on_agl) # Liczba zapisana jako ciąg cyfr: -moze_interp( dig>* dig ) +#moze_interp( dig>* dig ) # Formacje prefiksalne #### trzeba wydzielić odpowiednie samodze! @@ -132,13 +132,35 @@ adj dywiz samodz # ? samodz dywiz adj +[segment types] +naj> +nie> +prefs +prefv +dig> +adja +adj +adj_sup +negat +on_agl +z_on_agl +samotny +interp +aglsg +aglpl +praetcond +praet_sg_agl +praet_sg_na +praet_sg +praet_pl +samodz [tags] -naj naj -nie nie +naj> naj +nie> nie prefs prefs prefv prefv -dig dig +dig> dig adja adja adj adj:%:pos adj_sup adj:%:sup diff --git a/fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat b/fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat new file mode 100644 index 0000000..fad910b --- /dev/null +++ b/fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat @@ -0,0 +1,93 @@ +[options] +aggl=permissive strict isolated +praet=split composite + +[combinations] +#define wsz_interp (interp|kropka|dywiz)* + +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp + +moze_interp(samodz) +samotny + + +[segment types] +naj> +nie> +prefs +prefv +dig +adja +adj +adj_sup +negat +on_agl +z_on_agl +samotny +interp +aglsg +aglpl +praetcond +praet_sg_agl +praet_sg_na +praet_sg +praet_pl +samodz + +[tags] +naj naj +nie nie +prefs prefs +prefv prefv +dig dig +adja adja +adj adj:%:pos +adj_sup adj:%:sup +adj_sup adv:sup +negat ger:%:neg +negat pact:%:neg +negat ppas:%:neg +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep +z_on_agl prep:% +samotny brev:pun +samotny brev:npun +samotny intrj +interp interp +aglsg aglt:sg:% +aglpl aglt:pl:% +praetcond cond:% +praetcond praet:%:pri:% +praetcond praet:%:sec:% +praetcond praet:%:ter:% +praet_sg_agl praet:sg:%:agl +praet_sg_na praet:sg:%:nagl +praet_sg praet:sg:% +praet_pl praet:pl:% +praet_sg winien:sg:% +praet_pl winien:pl:% +samodz % + +[lexemes] +z_aglt aby:comp +z_aglt bowiem:comp +by by:qub +z_aglt by:comp +z_aglt cóż:subst +z_aglt czemu:adv +z_aglt czyżby:qub +z_aglt choćby:comp +z_aglt chociażby:comp +z_aglt dlaczego:adv +z_aglt dopóki:comp +z_aglt dopóty:conj +z_aglt gdyby:comp +z_aglt gdzie:qub +z_aglt gdzie:adv +z_aglt jakby:comp +z_aglt jakoby:comp +z_aglt kiedy:adv +z_aglt kiedy:comp +z_aglt tylko:qub +z_aglt żeby:comp +dywiz -:interp +kropka .:interp