Commit 4ea040d0c359bd5f64b432695ae6799011e0fb72

Authored by Michał Lenart
1 parent 8d5a878e

- zrobiona konwersja NFA -> DFA dla automatów do zlepiania segmentów

- usunięcie "ignoreOrth"

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@87 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/PoliMorfTest.cek 0 → 100644
  1 +bij ABć+impt:sg:sec:imperf+pospolita
  2 +bija AAć+fin:sg:ter:imperf+pospolita
  3 +bijacie ADć+fin:pl:sec:imperf+pospolita
  4 +bijaj ABć+impt:sg:sec:imperf+pospolita
  5 +bijajcie AEć+impt:pl:sec:imperf+pospolita
  6 +bijajmy ADć+impt:pl:pri:imperf+pospolita
  7 +bijają ACć+fin:pl:ter:imperf+pospolita
  8 +bijając ADć+pcon:imperf+pospolita
  9 +bijająca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita
  10 +bijające AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita
  11 +bijającego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita
  12 +bijającej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita
  13 +bijającemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita
  14 +bijający AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita
  15 +bijających AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  16 +bijającym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita
  17 +bijającymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  18 +bijającą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita
  19 +bijak AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita
  20 +bijaka AB+subst:sg:gen:m3+pospolita
  21 +bijakach AD+subst:pl:loc:m3+pospolita
  22 +bijakami AD+subst:pl:inst:m3+pospolita
  23 +bijaki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita
  24 +bijakiem AD+subst:sg:inst:m3+pospolita
  25 +bijakom AC+subst:pl:dat:m3+pospolita
  26 +bijakowa ABy+adj:sg:nom.voc:f:pos+pospolita
  27 +bijakowe ABy+adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:sg:acc:n1.n2:pos+pospolita|ABy+adj:sg:nom.voc:n1.n2:pos+pospolita
  28 +bijakowego ADy+adj:sg:acc:m1.m2:pos+pospolita|ADy+adj:sg:gen:m1.m2.m3.n1.n2:pos+pospolita
  29 +bijakowej ACy+adj:sg:dat:f:pos+pospolita|ACy+adj:sg:gen:f:pos+pospolita|ACy+adj:sg:loc:f:pos+pospolita
  30 +bijakowemu ADy+adj:sg:dat:m1.m2.m3.n1.n2:pos+pospolita
  31 +bijakowi ABy+adj:pl:nom.voc:m1.p1:pos+pospolita|AD+subst:sg:dat:m3+pospolita
  32 +bijakowo ABy+adja+pospolita
  33 +bijakowości ACć+subst:pl:acc:f+pospolita|ACć+subst:pl:gen:f+pospolita|ACć+subst:pl:nom:f+pospolita|ACć+subst:pl:voc:f+pospolita|ACć+subst:sg:dat:f+pospolita|ACć+subst:sg:gen:f+pospolita|ACć+subst:sg:loc:f+pospolita|ACć+subst:sg:voc:f+pospolita
  34 +bijakowościach AFć+subst:pl:loc:f+pospolita
  35 +bijakowościami AFć+subst:pl:inst:f+pospolita
  36 +bijakowościom AEć+subst:pl:dat:f+pospolita
  37 +bijakowością ADć+subst:sg:inst:f+pospolita
  38 +bijakowość AA+subst:sg:acc:f+pospolita|AA+subst:sg:nom:f+pospolita
  39 +bijakowy AA+adj:sg:acc:m3:pos+pospolita|AA+adj:sg:nom.voc:m1.m2.m3:pos+pospolita
  40 +bijakowych AC+adj:pl:acc:m1.p1:pos+pospolita|AC+adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AC+adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita
  41 +bijakowym AB+adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AB+adj:sg:inst:m1.m2.m3.n1.n2:pos+pospolita|AB+adj:sg:loc:m1.m2.m3.n1.n2:pos+pospolita
  42 +bijakowymi AC+adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita
  43 +bijakową ABy+adj:sg:acc:f:pos+pospolita|ABy+adj:sg:inst:f:pos+pospolita
  44 +bijaku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita
  45 +bijaków AC+subst:pl:gen:m3+pospolita
  46 +bijali ACć+praet:pl:m1.p1:imperf+pospolita
  47 +bijam ABć+fin:sg:pri:imperf+pospolita
  48 +bijamy ACć+fin:pl:pri:imperf+pospolita
  49 +bijana ACć+ppas:sg:nom.voc:f:imperf:aff+pospolita
  50 +bijance ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita
  51 +bijane ACć+ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|ACć+ppas:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita
  52 +bijanego AEć+ppas:sg:acc:m1.m2:imperf:aff+pospolita|AEć+ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita
  53 +bijanej ADć+ppas:sg:gen.dat.loc:f:imperf:aff+pospolita
  54 +bijanek ACka+subst:pl:gen:f+pospolita
  55 +bijanemu AEć+ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita
  56 +bijani ACć+ppas:pl:nom.voc:m1.p1:imperf:aff+pospolita
  57 +bijania ADć+ger:sg:gen:n2:imperf:aff+pospolita
  58 +bijanie ADć+ger:sg:nom.acc:n2:imperf:aff+pospolita
  59 +bijaniem AEć+ger:sg:inst:n2:imperf:aff+pospolita
  60 +bijaniu ADć+ger:sg:dat.loc:n2:imperf:aff+pospolita
  61 +bijanka AA+subst:sg:nom:f+pospolita
  62 +bijankach AC+subst:pl:loc:f+pospolita
  63 +bijankami AC+subst:pl:inst:f+pospolita
  64 +bijanki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita
  65 +bijanko ABa+subst:sg:voc:f+pospolita
  66 +bijankom ACa+subst:pl:dat:f+pospolita
  67 +bijanką ABa+subst:sg:inst:f+pospolita
  68 +bijankę ABa+subst:sg:acc:f+pospolita
  69 +bijano ACć+imps:imperf+pospolita
  70 +bijany ACć+ppas:sg:acc:m3:imperf:aff+pospolita|ACć+ppas:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita
  71 +bijanych AEć+ppas:pl:acc:m1.p1:imperf:aff+pospolita|AEć+ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  72 +bijanym ADć+ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|ADć+ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita
  73 +bijanymi AEć+ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  74 +bijaną ACć+ppas:sg:acc.inst:f:imperf:aff+pospolita
  75 +bijasz ACć+fin:sg:sec:imperf+pospolita
  76 +bijatyce ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita
  77 +bijatyk AAa+subst:pl:gen:f+pospolita
  78 +bijatyka AA+subst:sg:nom:f+pospolita
  79 +bijatykach AC+subst:pl:loc:f+pospolita
  80 +bijatykami AC+subst:pl:inst:f+pospolita
  81 +bijatyki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita
  82 +bijatyko ABa+subst:sg:voc:f+pospolita
  83 +bijatykom ACa+subst:pl:dat:f+pospolita
  84 +bijatyką ABa+subst:sg:inst:f+pospolita
  85 +bijatykę ABa+subst:sg:acc:f+pospolita
  86 +bijać AA+inf:imperf+pospolita
  87 +bijał ABć+praet:sg:m1.m2.m3:imperf+pospolita
  88 +bijała ACć+praet:sg:f:imperf+pospolita
  89 +bijało ACć+praet:sg:n1.n2:imperf+pospolita
  90 +bijały ACć+praet:pl:m2.m3.f.n1.n2.p2.p3:imperf+pospolita
  91 +bijcie AEć+impt:pl:sec:imperf+pospolita
  92 +bije ACć+fin:sg:ter:imperf+pospolita
  93 +bijecie AFć+fin:pl:sec:imperf+pospolita
  94 +bijekcja AA+subst:sg:nom:f+pospolita
  95 +bijekcjach AC+subst:pl:loc:f+pospolita
  96 +bijekcjami AC+subst:pl:inst:f+pospolita
  97 +bijekcje ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita
  98 +bijekcji ABa+subst:pl:gen:f+pospolita|ABa+subst:sg:dat:f+pospolita|ABa+subst:sg:gen:f+pospolita|ABa+subst:sg:loc:f+pospolita
  99 +bijekcjo ABa+subst:sg:voc:f+pospolita
  100 +bijekcjom ACa+subst:pl:dat:f+pospolita
  101 +bijekcją ABa+subst:sg:inst:f+pospolita
  102 +bijekcję ABa+subst:sg:acc:f+pospolita
  103 +bijekcyj ACja+subst:pl:gen:f+pospolita
  104 +bijemy AEć+fin:pl:pri:imperf+pospolita
  105 +bijesz AEć+fin:sg:sec:imperf+pospolita
  106 +bijmy ADć+impt:pl:pri:imperf+pospolita
  107 +bijnik AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita
  108 +bijnika AB+subst:sg:gen:m3+pospolita
  109 +bijnikach AD+subst:pl:loc:m3+pospolita
  110 +bijnikami AD+subst:pl:inst:m3+pospolita
  111 +bijniki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita
  112 +bijnikiem AD+subst:sg:inst:m3+pospolita
  113 +bijnikom AC+subst:pl:dat:m3+pospolita
  114 +bijnikowi AD+subst:sg:dat:m3+pospolita
  115 +bijniku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita
  116 +bijników AC+subst:pl:gen:m3+pospolita
  117 +biją ACć+fin:pl:ter:imperf+pospolita
  118 +bijąc ADć+pcon:imperf+pospolita
  119 +bijąca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita
  120 +bijące AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita
  121 +bijącego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita
  122 +bijącej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita
  123 +bijącemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita
  124 +bijący AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita
  125 +bijących AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  126 +bijącym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita
  127 +bijącymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita
  128 +bijącą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita
... ...
fsabuilder/buildfsa.py
... ... @@ -10,9 +10,10 @@ import logging
10 10 import codecs
11 11 from morfeuszbuilder.fsa import encode
12 12 from morfeuszbuilder.fsa import convertinput
13   -from morfeuszbuilder.fsa import common
14 13 from morfeuszbuilder.fsa.fsa import FSA
15 14 from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer
  15 +from morfeuszbuilder.tagset.tagset import Tagset
  16 +from morfeuszbuilder.segrules import rulesParser
16 17 from optparse import OptionParser
17 18  
18 19 # class InputFormat():
... ... @@ -50,6 +51,10 @@ def _parseOptions():
50 51 dest='tagsetFile',
51 52 metavar='FILE',
52 53 help='path to the file with tagset')
  54 + parser.add_option('--segments-file',
  55 + dest='segmentsFile',
  56 + metavar='FILE',
  57 + help='path to the file with segment rules')
53 58 parser.add_option('-o', '--output-file',
54 59 dest='outputFile',
55 60 metavar='FILE',
... ... @@ -107,6 +112,8 @@ def _parseOptions():
107 112 _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
108 113 _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
109 114 parser, 'Must set exactly one FSA type: --analyzer or --generator')
  115 + if opts.analyzer:
  116 + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
110 117  
111 118 if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]:
112 119 print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')'
... ... @@ -147,9 +154,8 @@ def _printStats(fsa):
147 154 logging.info('sink states num: '+str(sinkNum))
148 155 logging.info('array states num: '+str(arrayNum))
149 156  
150   -def buildAnalyzerFromPoliMorf(inputFile, tagsetFile):
  157 +def buildAnalyzerFromPoliMorf(inputFile, tagset):
151 158 encoder = encode.MorphEncoder()
152   - tagset = common.Tagset(tagsetFile)
153 159 fsa = FSA(encoder, tagset)
154 160 inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder)
155 161 for word, data in inputData:
... ... @@ -160,7 +166,7 @@ def buildAnalyzerFromPoliMorf(inputFile, tagsetFile):
160 166  
161 167 def buildGeneratorFromPoliMorf(inputFile, tagsetFile):
162 168 encoder = encode.Encoder4Generator()
163   - tagset = common.Tagset(tagsetFile)
  169 + tagset = Tagset(tagsetFile)
164 170 fsa = FSA(encoder, tagset)
165 171 inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder)
166 172 for word, data in inputData:
... ... @@ -175,10 +181,15 @@ def main(opts):
175 181 else:
176 182 logging.basicConfig(level=logging.INFO)
177 183  
  184 + tagset = Tagset(opts.tagsetFile)
  185 +
178 186 if opts.analyzer:
179   - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, opts.tagsetFile)
  187 + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset)
  188 + segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile)
  189 + additionalData = segmentRulesManager.serialize()
180 190 else:
181   - fsa = buildGeneratorFromPoliMorf(opts.inputFile, opts.tagsetFile)
  191 + fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset)
  192 + additionalData = bytearray()
182 193  
183 194 if opts.trainFile:
184 195 logging.info('training with '+opts.trainFile+' ...')
... ...
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -119,4 +119,3 @@ class FSA(object):
119 119 state.reverseOffset = currReverseOffset
120 120 for state in self.initialState.dfs(set()):
121 121 state.offset = currReverseOffset - state.reverseOffset
122   -
123 122 \ No newline at end of file
... ...
fsabuilder/morfeuszbuilder/fsa/fsa.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -45,16 +45,15 @@ class Serializer(object):
45 45  
46 46 def serialize2BinaryFile(self, fname):
47 47 with open(fname, 'wb') as f:
48   - f.write(self.fsa2bytearray())
  48 + f.write(self.fsa2bytearray(self.serializeTagset(self.fsa.tagset)))
49 49  
50 50 def getStateSize(self, state):
51 51 raise NotImplementedError('Not implemented')
52 52  
53   - def fsa2bytearray(self):
  53 + def fsa2bytearray(self, additionalData=bytearray()):
54 54 res = bytearray()
55   - res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset)))
  55 + res.extend(self.serializePrologue(additionalData))
56 56 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
57   - logging.debug('SERIALIZE')
58 57 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
59 58 res.extend(self.state2bytearray(state))
60 59 return res
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
... ... @@ -8,6 +8,8 @@ class State(object):
8 8 '''
9 9 A state in an automaton
10 10 '''
  11 +
  12 + statesCounter = 0
11 13  
12 14 def __init__(self, additionalData=None):
13 15 self.transitionsMap = {}
... ... @@ -18,6 +20,9 @@ class State(object):
18 20 self.label2Freq = {}
19 21 self.serializeAsArray = False
20 22 self.additionalData = additionalData
  23 +
  24 + self.idx = State.statesCounter
  25 + State.statesCounter += 1
21 26  
22 27 @property
23 28 def transitionsNum(self):
... ... @@ -51,10 +56,16 @@ class State(object):
51 56 else:
52 57 return self.encodedData
53 58  
54   - def dfs(self, alreadyVisited=set(), sortKey=lambda (_, state): -state.freq):
  59 + def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq):
55 60 if not self in alreadyVisited:
  61 + alreadyVisited.add(self)
56 62 for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey):
57 63 for state1 in state.dfs(alreadyVisited):
58 64 yield state1
59   - alreadyVisited.add(self)
60 65 yield self
  66 +
  67 + def debug(self):
  68 + print '----------------'
  69 + print 'STATE:', self.idx
  70 + for label, s in self.transitionsMap.iteritems():
  71 + print label, '-->', s.idx
... ...
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/visualizer.py
... ... @@ -12,7 +12,7 @@ class Visualizer(object):
12 12 def __init__(self):
13 13 pass
14 14  
15   - def visualize(self, fsa):
  15 + def visualize(self, fsa, charLabels=True):
16 16 G = nx.DiGraph()
17 17 allStates = list(reversed(list(fsa.initialState.dfs(set()))))
18 18 edgeLabelsMap = {}
... ... @@ -21,10 +21,12 @@ class Visualizer(object):
21 21 G.add_node(idx, offset=state.offset)
22 22 for c, targetState in state.transitionsMap.iteritems():
23 23 G.add_edge(idx, allStates.index(targetState))
24   - label = chr(c) if c <= 127 else '%'
  24 + label = (chr(c) if c <= 127 else '%') if charLabels \
  25 + else c
25 26 edgeLabelsMap[(idx, allStates.index(targetState))] = label
26 27 nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')'
27 28 pos=nx.shell_layout(G)
  29 +# pos=nx.random_layout(G)
28 30 nx.draw_networkx_nodes(G,
29 31 pos,
30 32 nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]),
... ...
fsabuilder/morfeuszbuilder/fsa/visualizer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... ... @@ -6,8 +6,7 @@ Created on 23 sty 2014
6 6 import re
7 7 from pyparsing import *
8 8  
9   -identifier = Word(alphas, bodyChars=alphanums+'_')
10   -token = Word(alphas, bodyChars=alphanums+'_+>')
  9 +identifier = Word(alphas, bodyChars=alphanums+'_>*+')
11 10 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
12 11 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
13 12 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
... ...
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -34,6 +34,9 @@ class TagRule(SegmentRule):
34 34  
35 35 def _doAddToNFA(self, startState, endState):
36 36 startState.addTransition(self.segnum, endState)
  37 +
  38 + def __str__(self):
  39 + return u''+self.segnum
37 40  
38 41 class UnaryRule(SegmentRule):
39 42  
... ... @@ -95,12 +98,3 @@ class ZeroOrMoreRule(UnaryRule):
95 98 self.child._doAddToNFA(intermStartState, intermEndState)
96 99 intermEndState.addTransition(None, endState)
97 100 endState.addTransition(None, intermStartState)
98   -
99   -class IgnoreOrthRule(UnaryRule):
100   -
101   - def __init__(self, child):
102   - super(IgnoreOrthRule, self).__init__(child)
103   -
104   - def _doAddToNFA(self, startState, endState):
105   - startState.addTransition(self.child.segnum, endState, ignoreOrth=True)
106   -
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py 0 → 100644
  1 +'''
  2 +Created on 20 lut 2014
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +class RulesManager(object):
  8 +
  9 + def __init__(self):
  10 + self.options2DFA = {}
  11 +
  12 + def _options2Key(self, optionsMap):
  13 + return frozenset(optionsMap.items())
  14 +
  15 + def addDFA4Options(self, optionsMap, dfa):
  16 + self.options2DFA[self._options2Key(optionsMap)] = dfa
  17 +
  18 + def serialize(self):
  19 + pass
0 20 \ No newline at end of file
... ...
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... ... @@ -8,33 +8,98 @@ from morfeuszbuilder.fsa import fsa, state, encode
8 8  
9 9 class RulesNFAState(object):
10 10  
11   - def __init__(self, initial=False, final=False):
  11 + statesCounter = 0
  12 +
  13 + def __init__(self, initial=False, final=False, weak=False):
12 14 self.transitionsMap = {}
13 15 self.initial = initial
14 16 self.final = final
  17 + self.weak = weak
  18 + self.idx = RulesNFAState.statesCounter
  19 + RulesNFAState.statesCounter += 1
  20 +
  21 + def addTransition(self, label, targetState):
  22 + self.transitionsMap.setdefault(label, set())
  23 + self.transitionsMap[label].add(targetState)
  24 +
  25 + def getClosure(self, visited):
  26 + if self in visited:
  27 + return set()
  28 + else:
  29 + visited.add(self)
  30 + res = set()
  31 + res.add(self)
  32 + for nextState in self.transitionsMap.get(None, []):
  33 + if self.idx in [6,8,4]:
  34 + print nextState.idx
  35 + print self.transitionsMap
  36 + res |= nextState.getClosure(visited)
  37 + return res
15 38  
16   - def addTransition(self, label, targetState, ignoreOrth=False):
17   - assert not ignoreOrth or label is not None
18   - self.transitionsMap.setdefault((label, ignoreOrth), set())
19   - self.transitionsMap[(label, ignoreOrth)].add(targetState)
  39 + def dfs(self, visitedStates=set()):
  40 + if not self in visitedStates:
  41 + visitedStates.add(self)
  42 + yield self
  43 + for _, nextStates in self.transitionsMap.iteritems():
  44 + for state in nextStates:
  45 + for state1 in state.dfs():
  46 + yield state1
  47 +
  48 + def debug(self):
  49 + print '----------------'
  50 + print 'STATE:', self.idx
  51 + for label, nextStates in self.transitionsMap.iteritems():
  52 + print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]
20 53  
21 54 class RulesNFA(object):
22 55  
23   - def __init__(self, key2Def={}):
  56 + def __init__(self):
24 57 self.initialState = RulesNFAState(initial=True)
25 58  
26   - def _doConvertState(self, dfaState, nfaStates):
27   - for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems():
28   - nextDFAState = state.State(additionalData=nextIgnoreOrth)
  59 + def _groupOutputByLabels(self, nfaStates):
  60 + res = {}
  61 + for nfaState in nfaStates:
  62 + for label, nextStates in nfaState.transitionsMap.iteritems():
  63 + if label is not None:
  64 + res.setdefault(label, set())
  65 + for nextNFAState in nextStates:
  66 + res[label] |= nextNFAState.getClosure(set())
  67 +# print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)]
  68 + return res
  69 +
  70 + def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
  71 + assert all(map(lambda state: state.weak, nfaStates)) \
  72 + or not any(map(lambda state: state.weak, nfaStates))
  73 + weak = all(map(lambda state: state.weak, nfaStates))
  74 + final = any(map(lambda state: state.final, nfaStates))
  75 + assert not weak or not final
  76 + if final:
  77 + # dfaState should be final
  78 + # and contain info about weakness
  79 + dfaState.encodedData = bytearray([1 if weak else 0])
  80 + for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems():
  81 +# print '============'
  82 +# print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)]
  83 +# print 'label:', label
  84 +# print 'nextStates:', [s.idx for s in sorted(nextNFAStates, key=lambda s: s.idx)]
  85 + key = frozenset(nextNFAStates)
  86 + if key in nfaSubset2DFAState:
  87 + nextDFAState = nfaSubset2DFAState[key]
  88 + else:
  89 + nextDFAState = state.State()
  90 + nfaSubset2DFAState[key] = nextDFAState
  91 + self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState)
29 92 dfaState.setTransition(label, nextDFAState)
30   - dfaState.encodedData = bytearray()
31   - self._doConvertState(nextDFAState, nextNFAStates)
32 93  
33 94 def convertToDFA(self):
34   - dfa = fsa.FSA(encoder=None, encodeWords=False)
35   - startStates = self.initialState.getClosure()
  95 + dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False)
  96 + startStates = self.initialState.getClosure(set())
36 97 assert not any(filter(lambda s: s.final, startStates))
37 98 dfa.initialState = state.State(additionalData=False)
38   - self._doConvertState(dfa.initialState, startStates)
39   -
  99 + self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState})
  100 + return dfa
  101 +
  102 + def debug(self):
  103 + for state in self.initialState.dfs():
  104 + state.debug()
40 105  
41 106 \ No newline at end of file
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -3,7 +3,7 @@ from pyparsing import *
3 3 ParserElement.enablePackrat()
4 4 from morfeuszbuilder.tagset import segtypes
5 5 from morfeuszbuilder.utils import configFile, exceptions
6   -from morfeuszbuilder.segrules import preprocessor, rules
  6 +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager
7 7 import codecs
8 8 import re
9 9  
... ... @@ -28,9 +28,9 @@ class RulesParser(object):
28 28 return res
29 29  
30 30 def parse(self, filename):
31   - res = []
  31 + res = rulesManager.RulesManager()
32 32  
33   - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes'])
  33 + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
34 34 key2Defs = self._getKey2Defs(segtypesConfigFile)
35 35 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
36 36  
... ... @@ -39,14 +39,18 @@ class RulesParser(object):
39 39 for define in defs:
40 40 def2Key[define] = key
41 41  
  42 + firstNFA = None
42 43 for defs in itertools.product(*key2Defs.values()):
43 44 key2Def = dict([(def2Key[define], define) for define in defs])
44   - nfa = rulesNFA.RulesNFA(key2Def)
  45 + nfa = rulesNFA.RulesNFA()
  46 + if not firstNFA:
  47 + firstNFA = nfa
45 48 combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations')
46 49 combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs))
47 50 for rule in self._doParse(combinationEnumeratedLines, segtypesHelper):
48 51 rule.addToNFA(nfa)
49   - res.append(nfa)
  52 + dfa = nfa.convertToDFA()
  53 + res.addDFA4Options(key2Def, dfa)
50 54 return res
51 55  
52 56 def _doParse(self, combinationEnumeratedLines, segtypesHelper):
... ... @@ -58,14 +62,14 @@ class RulesParser(object):
58 62 if not segtypesHelper.hasSegtype(segtype):
59 63 raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype))
60 64 else:
  65 +# return rules.TagRule(segtype)
61 66 return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype))
62 67  
63 68 def _doParseOneLine(self, lineNum, line, segtypesHelper):
64 69 rule = Forward()
65   - tagRule = Word(alphanums+'_')
66   - ignoreOrthRule = tagRule + Suppress('>')
  70 + tagRule = Word(alphanums+'_>')
67 71 parenRule = Suppress('(') + rule + Suppress(')')
68   - atomicRule = tagRule ^ ignoreOrthRule ^ parenRule
  72 + atomicRule = tagRule ^ parenRule
69 73 zeroOrMoreRule = atomicRule + Suppress('*')
70 74 oneOrMoreRule = atomicRule + Suppress('+')
71 75 unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule
... ... @@ -75,19 +79,10 @@ class RulesParser(object):
75 79 rule << concatRule
76 80  
77 81 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper))
78   - ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0]))
79 82 # parenRule.setParseAction(lambda string, loc, toks: toks[0])
80 83 zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0]))
81 84 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
82 85 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
83 86 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
84   -
85   -
86   -# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule
87   -
88   -# tagRule.setParseAction(lambda s,l,toks: doprint(toks))
89   -# print lineNum, line
90 87 parsedRule = rule.parseString(line, parseAll=True)[0]
91   - print parsedRule
92 88 return parsedRule
93   -# print parsedLine
... ...
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... ... @@ -7,12 +7,20 @@ import unittest
7 7 import os
8 8 from morfeuszbuilder.segrules import rulesParser
9 9 from morfeuszbuilder.tagset import tagset
  10 +from morfeuszbuilder.fsa import visualizer, serializer
10 11  
11 12 class Test(unittest.TestCase):
12 13 print 'do test'
13 14 t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
14 15 parser = rulesParser.RulesParser(t)
15   - parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
  16 + fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
  17 + fsa = fsas[0]
  18 + for s in fsa.dfs():
  19 + s.debug()
  20 + print 'states:', len(list(fsa.dfs()))
  21 + print 'transitions:', fsa.getTransitionsNum()
  22 + visualizer.Visualizer().visualize(fsa, charLabels=False)
  23 + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray()))
16 24 print 'done'
17 25  
18 26 if __name__ == "__main__":
... ...
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
... ... @@ -103,7 +103,7 @@ moze_interp( naj> adj_sup )
103 103  
104 104 # Formy „zanegowane” gerundiów i imiesłowów:
105 105 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
106   -moze_interp( nie > negat )
  106 +moze_interp( nie> negat )
107 107  
108 108 # Przyimki akceptujące krótką formę „-ń”
109 109 moze_interp(z_on_agl)
... ... @@ -111,7 +111,7 @@ moze_interp(z_on_agl)
111 111 moze_interp(z_on_agl on_agl)
112 112  
113 113 # Liczba zapisana jako ciąg cyfr:
114   -moze_interp( dig>* dig )
  114 +#moze_interp( dig>* dig )
115 115  
116 116 # Formacje prefiksalne
117 117 #### trzeba wydzielić odpowiednie samodze!
... ... @@ -132,13 +132,35 @@ adj dywiz samodz
132 132 # ?
133 133 samodz dywiz adj
134 134  
  135 +[segment types]
  136 +naj>
  137 +nie>
  138 +prefs
  139 +prefv
  140 +dig>
  141 +adja
  142 +adj
  143 +adj_sup
  144 +negat
  145 +on_agl
  146 +z_on_agl
  147 +samotny
  148 +interp
  149 +aglsg
  150 +aglpl
  151 +praetcond
  152 +praet_sg_agl
  153 +praet_sg_na
  154 +praet_sg
  155 +praet_pl
  156 +samodz
135 157  
136 158 [tags]
137   -naj naj
138   -nie nie
  159 +naj> naj
  160 +nie> nie
139 161 prefs prefs
140 162 prefv prefv
141   -dig dig
  163 +dig> dig
142 164 adja adja
143 165 adj adj:%:pos
144 166 adj_sup adj:%:sup
... ...
fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat 0 → 100644
  1 +[options]
  2 +aggl=permissive strict isolated
  3 +praet=split composite
  4 +
  5 +[combinations]
  6 +#define wsz_interp (interp|kropka|dywiz)*
  7 +
  8 +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp
  9 +
  10 +moze_interp(samodz)
  11 +samotny
  12 +
  13 +
  14 +[segment types]
  15 +naj>
  16 +nie>
  17 +prefs
  18 +prefv
  19 +dig
  20 +adja
  21 +adj
  22 +adj_sup
  23 +negat
  24 +on_agl
  25 +z_on_agl
  26 +samotny
  27 +interp
  28 +aglsg
  29 +aglpl
  30 +praetcond
  31 +praet_sg_agl
  32 +praet_sg_na
  33 +praet_sg
  34 +praet_pl
  35 +samodz
  36 +
  37 +[tags]
  38 +naj naj
  39 +nie nie
  40 +prefs prefs
  41 +prefv prefv
  42 +dig dig
  43 +adja adja
  44 +adj adj:%:pos
  45 +adj_sup adj:%:sup
  46 +adj_sup adv:sup
  47 +negat ger:%:neg
  48 +negat pact:%:neg
  49 +negat ppas:%:neg
  50 +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
  51 +z_on_agl prep:%
  52 +samotny brev:pun
  53 +samotny brev:npun
  54 +samotny intrj
  55 +interp interp
  56 +aglsg aglt:sg:%
  57 +aglpl aglt:pl:%
  58 +praetcond cond:%
  59 +praetcond praet:%:pri:%
  60 +praetcond praet:%:sec:%
  61 +praetcond praet:%:ter:%
  62 +praet_sg_agl praet:sg:%:agl
  63 +praet_sg_na praet:sg:%:nagl
  64 +praet_sg praet:sg:%
  65 +praet_pl praet:pl:%
  66 +praet_sg winien:sg:%
  67 +praet_pl winien:pl:%
  68 +samodz %
  69 +
  70 +[lexemes]
  71 +z_aglt aby:comp
  72 +z_aglt bowiem:comp
  73 +by by:qub
  74 +z_aglt by:comp
  75 +z_aglt cóż:subst
  76 +z_aglt czemu:adv
  77 +z_aglt czyżby:qub
  78 +z_aglt choćby:comp
  79 +z_aglt chociażby:comp
  80 +z_aglt dlaczego:adv
  81 +z_aglt dopóki:comp
  82 +z_aglt dopóty:conj
  83 +z_aglt gdyby:comp
  84 +z_aglt gdzie:qub
  85 +z_aglt gdzie:adv
  86 +z_aglt jakby:comp
  87 +z_aglt jakoby:comp
  88 +z_aglt kiedy:adv
  89 +z_aglt kiedy:comp
  90 +z_aglt tylko:qub
  91 +z_aglt żeby:comp
  92 +dywiz -:interp
  93 +kropka .:interp
... ...