Commit 4ea040d0c359bd5f64b432695ae6799011e0fb72
1 parent
8d5a878e
- zrobiona konwersja NFA -> DFA dla automatów do zlepiania segmentów
- usunięcie "ignoreOrth"

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@87 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
18 changed files
with
409 additions
and
64 deletions
fsabuilder/PoliMorfTest.cek
0 → 100644
1 | +bij ABć+impt:sg:sec:imperf+pospolita | |
2 | +bija AAć+fin:sg:ter:imperf+pospolita | |
3 | +bijacie ADć+fin:pl:sec:imperf+pospolita | |
4 | +bijaj ABć+impt:sg:sec:imperf+pospolita | |
5 | +bijajcie AEć+impt:pl:sec:imperf+pospolita | |
6 | +bijajmy ADć+impt:pl:pri:imperf+pospolita | |
7 | +bijają ACć+fin:pl:ter:imperf+pospolita | |
8 | +bijając ADć+pcon:imperf+pospolita | |
9 | +bijająca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita | |
10 | +bijające AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | |
11 | +bijającego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
12 | +bijającej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita | |
13 | +bijającemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
14 | +bijający AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | |
15 | +bijających AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
16 | +bijającym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
17 | +bijającymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
18 | +bijającą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita | |
19 | +bijak AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita | |
20 | +bijaka AB+subst:sg:gen:m3+pospolita | |
21 | +bijakach AD+subst:pl:loc:m3+pospolita | |
22 | +bijakami AD+subst:pl:inst:m3+pospolita | |
23 | +bijaki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita | |
24 | +bijakiem AD+subst:sg:inst:m3+pospolita | |
25 | +bijakom AC+subst:pl:dat:m3+pospolita | |
26 | +bijakowa ABy+adj:sg:nom.voc:f:pos+pospolita | |
27 | +bijakowe ABy+adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:sg:acc:n1.n2:pos+pospolita|ABy+adj:sg:nom.voc:n1.n2:pos+pospolita | |
28 | +bijakowego ADy+adj:sg:acc:m1.m2:pos+pospolita|ADy+adj:sg:gen:m1.m2.m3.n1.n2:pos+pospolita | |
29 | +bijakowej ACy+adj:sg:dat:f:pos+pospolita|ACy+adj:sg:gen:f:pos+pospolita|ACy+adj:sg:loc:f:pos+pospolita | |
30 | +bijakowemu ADy+adj:sg:dat:m1.m2.m3.n1.n2:pos+pospolita | |
31 | +bijakowi ABy+adj:pl:nom.voc:m1.p1:pos+pospolita|AD+subst:sg:dat:m3+pospolita | |
32 | +bijakowo ABy+adja+pospolita | |
33 | +bijakowości ACć+subst:pl:acc:f+pospolita|ACć+subst:pl:gen:f+pospolita|ACć+subst:pl:nom:f+pospolita|ACć+subst:pl:voc:f+pospolita|ACć+subst:sg:dat:f+pospolita|ACć+subst:sg:gen:f+pospolita|ACć+subst:sg:loc:f+pospolita|ACć+subst:sg:voc:f+pospolita | |
34 | +bijakowościach AFć+subst:pl:loc:f+pospolita | |
35 | +bijakowościami AFć+subst:pl:inst:f+pospolita | |
36 | +bijakowościom AEć+subst:pl:dat:f+pospolita | |
37 | +bijakowością ADć+subst:sg:inst:f+pospolita | |
38 | +bijakowość AA+subst:sg:acc:f+pospolita|AA+subst:sg:nom:f+pospolita | |
39 | +bijakowy AA+adj:sg:acc:m3:pos+pospolita|AA+adj:sg:nom.voc:m1.m2.m3:pos+pospolita | |
40 | +bijakowych AC+adj:pl:acc:m1.p1:pos+pospolita|AC+adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AC+adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita | |
41 | +bijakowym AB+adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AB+adj:sg:inst:m1.m2.m3.n1.n2:pos+pospolita|AB+adj:sg:loc:m1.m2.m3.n1.n2:pos+pospolita | |
42 | +bijakowymi AC+adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita | |
43 | +bijakową ABy+adj:sg:acc:f:pos+pospolita|ABy+adj:sg:inst:f:pos+pospolita | |
44 | +bijaku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita | |
45 | +bijaków AC+subst:pl:gen:m3+pospolita | |
46 | +bijali ACć+praet:pl:m1.p1:imperf+pospolita | |
47 | +bijam ABć+fin:sg:pri:imperf+pospolita | |
48 | +bijamy ACć+fin:pl:pri:imperf+pospolita | |
49 | +bijana ACć+ppas:sg:nom.voc:f:imperf:aff+pospolita | |
50 | +bijance ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita | |
51 | +bijane ACć+ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|ACć+ppas:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | |
52 | +bijanego AEć+ppas:sg:acc:m1.m2:imperf:aff+pospolita|AEć+ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
53 | +bijanej ADć+ppas:sg:gen.dat.loc:f:imperf:aff+pospolita | |
54 | +bijanek ACka+subst:pl:gen:f+pospolita | |
55 | +bijanemu AEć+ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
56 | +bijani ACć+ppas:pl:nom.voc:m1.p1:imperf:aff+pospolita | |
57 | +bijania ADć+ger:sg:gen:n2:imperf:aff+pospolita | |
58 | +bijanie ADć+ger:sg:nom.acc:n2:imperf:aff+pospolita | |
59 | +bijaniem AEć+ger:sg:inst:n2:imperf:aff+pospolita | |
60 | +bijaniu ADć+ger:sg:dat.loc:n2:imperf:aff+pospolita | |
61 | +bijanka AA+subst:sg:nom:f+pospolita | |
62 | +bijankach AC+subst:pl:loc:f+pospolita | |
63 | +bijankami AC+subst:pl:inst:f+pospolita | |
64 | +bijanki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita | |
65 | +bijanko ABa+subst:sg:voc:f+pospolita | |
66 | +bijankom ACa+subst:pl:dat:f+pospolita | |
67 | +bijanką ABa+subst:sg:inst:f+pospolita | |
68 | +bijankę ABa+subst:sg:acc:f+pospolita | |
69 | +bijano ACć+imps:imperf+pospolita | |
70 | +bijany ACć+ppas:sg:acc:m3:imperf:aff+pospolita|ACć+ppas:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | |
71 | +bijanych AEć+ppas:pl:acc:m1.p1:imperf:aff+pospolita|AEć+ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
72 | +bijanym ADć+ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|ADć+ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
73 | +bijanymi AEć+ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
74 | +bijaną ACć+ppas:sg:acc.inst:f:imperf:aff+pospolita | |
75 | +bijasz ACć+fin:sg:sec:imperf+pospolita | |
76 | +bijatyce ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita | |
77 | +bijatyk AAa+subst:pl:gen:f+pospolita | |
78 | +bijatyka AA+subst:sg:nom:f+pospolita | |
79 | +bijatykach AC+subst:pl:loc:f+pospolita | |
80 | +bijatykami AC+subst:pl:inst:f+pospolita | |
81 | +bijatyki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita | |
82 | +bijatyko ABa+subst:sg:voc:f+pospolita | |
83 | +bijatykom ACa+subst:pl:dat:f+pospolita | |
84 | +bijatyką ABa+subst:sg:inst:f+pospolita | |
85 | +bijatykę ABa+subst:sg:acc:f+pospolita | |
86 | +bijać AA+inf:imperf+pospolita | |
87 | +bijał ABć+praet:sg:m1.m2.m3:imperf+pospolita | |
88 | +bijała ACć+praet:sg:f:imperf+pospolita | |
89 | +bijało ACć+praet:sg:n1.n2:imperf+pospolita | |
90 | +bijały ACć+praet:pl:m2.m3.f.n1.n2.p2.p3:imperf+pospolita | |
91 | +bijcie AEć+impt:pl:sec:imperf+pospolita | |
92 | +bije ACć+fin:sg:ter:imperf+pospolita | |
93 | +bijecie AFć+fin:pl:sec:imperf+pospolita | |
94 | +bijekcja AA+subst:sg:nom:f+pospolita | |
95 | +bijekcjach AC+subst:pl:loc:f+pospolita | |
96 | +bijekcjami AC+subst:pl:inst:f+pospolita | |
97 | +bijekcje ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita | |
98 | +bijekcji ABa+subst:pl:gen:f+pospolita|ABa+subst:sg:dat:f+pospolita|ABa+subst:sg:gen:f+pospolita|ABa+subst:sg:loc:f+pospolita | |
99 | +bijekcjo ABa+subst:sg:voc:f+pospolita | |
100 | +bijekcjom ACa+subst:pl:dat:f+pospolita | |
101 | +bijekcją ABa+subst:sg:inst:f+pospolita | |
102 | +bijekcję ABa+subst:sg:acc:f+pospolita | |
103 | +bijekcyj ACja+subst:pl:gen:f+pospolita | |
104 | +bijemy AEć+fin:pl:pri:imperf+pospolita | |
105 | +bijesz AEć+fin:sg:sec:imperf+pospolita | |
106 | +bijmy ADć+impt:pl:pri:imperf+pospolita | |
107 | +bijnik AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita | |
108 | +bijnika AB+subst:sg:gen:m3+pospolita | |
109 | +bijnikach AD+subst:pl:loc:m3+pospolita | |
110 | +bijnikami AD+subst:pl:inst:m3+pospolita | |
111 | +bijniki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita | |
112 | +bijnikiem AD+subst:sg:inst:m3+pospolita | |
113 | +bijnikom AC+subst:pl:dat:m3+pospolita | |
114 | +bijnikowi AD+subst:sg:dat:m3+pospolita | |
115 | +bijniku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita | |
116 | +bijników AC+subst:pl:gen:m3+pospolita | |
117 | +biją ACć+fin:pl:ter:imperf+pospolita | |
118 | +bijąc ADć+pcon:imperf+pospolita | |
119 | +bijąca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita | |
120 | +bijące AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | |
121 | +bijącego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
122 | +bijącej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita | |
123 | +bijącemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
124 | +bijący AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | |
125 | +bijących AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
126 | +bijącym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | |
127 | +bijącymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | |
128 | +bijącą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita | |
... | ... |
fsabuilder/buildfsa.py
... | ... | @@ -10,9 +10,10 @@ import logging |
10 | 10 | import codecs |
11 | 11 | from morfeuszbuilder.fsa import encode |
12 | 12 | from morfeuszbuilder.fsa import convertinput |
13 | -from morfeuszbuilder.fsa import common | |
14 | 13 | from morfeuszbuilder.fsa.fsa import FSA |
15 | 14 | from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer |
15 | +from morfeuszbuilder.tagset.tagset import Tagset | |
16 | +from morfeuszbuilder.segrules import rulesParser | |
16 | 17 | from optparse import OptionParser |
17 | 18 | |
18 | 19 | # class InputFormat(): |
... | ... | @@ -50,6 +51,10 @@ def _parseOptions(): |
50 | 51 | dest='tagsetFile', |
51 | 52 | metavar='FILE', |
52 | 53 | help='path to the file with tagset') |
54 | + parser.add_option('--segments-file', | |
55 | + dest='segmentsFile', | |
56 | + metavar='FILE', | |
57 | + help='path to the file with segment rules') | |
53 | 58 | parser.add_option('-o', '--output-file', |
54 | 59 | dest='outputFile', |
55 | 60 | metavar='FILE', |
... | ... | @@ -107,6 +112,8 @@ def _parseOptions(): |
107 | 112 | _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") |
108 | 113 | _checkExactlyOneOptionSet([opts.analyzer, opts.generator], |
109 | 114 | parser, 'Must set exactly one FSA type: --analyzer or --generator') |
115 | + if opts.analyzer: | |
116 | + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") | |
110 | 117 | |
111 | 118 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: |
112 | 119 | print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')' |
... | ... | @@ -147,9 +154,8 @@ def _printStats(fsa): |
147 | 154 | logging.info('sink states num: '+str(sinkNum)) |
148 | 155 | logging.info('array states num: '+str(arrayNum)) |
149 | 156 | |
150 | -def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): | |
157 | +def buildAnalyzerFromPoliMorf(inputFile, tagset): | |
151 | 158 | encoder = encode.MorphEncoder() |
152 | - tagset = common.Tagset(tagsetFile) | |
153 | 159 | fsa = FSA(encoder, tagset) |
154 | 160 | inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) |
155 | 161 | for word, data in inputData: |
... | ... | @@ -160,7 +166,7 @@ def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): |
160 | 166 | |
161 | 167 | def buildGeneratorFromPoliMorf(inputFile, tagsetFile): |
162 | 168 | encoder = encode.Encoder4Generator() |
163 | - tagset = common.Tagset(tagsetFile) | |
169 | + tagset = Tagset(tagsetFile) | |
164 | 170 | fsa = FSA(encoder, tagset) |
165 | 171 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) |
166 | 172 | for word, data in inputData: |
... | ... | @@ -175,10 +181,15 @@ def main(opts): |
175 | 181 | else: |
176 | 182 | logging.basicConfig(level=logging.INFO) |
177 | 183 | |
184 | + tagset = Tagset(opts.tagsetFile) | |
185 | + | |
178 | 186 | if opts.analyzer: |
179 | - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, opts.tagsetFile) | |
187 | + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset) | |
188 | + segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile) | |
189 | + additionalData = segmentRulesManager.serialize() | |
180 | 190 | else: |
181 | - fsa = buildGeneratorFromPoliMorf(opts.inputFile, opts.tagsetFile) | |
191 | + fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset) | |
192 | + additionalData = bytearray() | |
182 | 193 | |
183 | 194 | if opts.trainFile: |
184 | 195 | logging.info('training with '+opts.trainFile+' ...') |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.py
fsabuilder/morfeuszbuilder/fsa/fsa.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -45,16 +45,15 @@ class Serializer(object): |
45 | 45 | |
46 | 46 | def serialize2BinaryFile(self, fname): |
47 | 47 | with open(fname, 'wb') as f: |
48 | - f.write(self.fsa2bytearray()) | |
48 | + f.write(self.fsa2bytearray(self.serializeTagset(self.fsa.tagset))) | |
49 | 49 | |
50 | 50 | def getStateSize(self, state): |
51 | 51 | raise NotImplementedError('Not implemented') |
52 | 52 | |
def fsa2bytearray(self, additionalData=None):
    '''
    Serialize the whole automaton into a single bytearray.

    The result is the prologue (built by serializePrologue from
    additionalData) followed by every state, ordered by its offset.

    additionalData -- bytes-like payload handed to serializePrologue;
                      defaults to an empty bytearray.
    '''
    # A fresh bytearray per call: a mutable object used as the default
    # value would be shared between all calls of this method.
    if additionalData is None:
        additionalData = bytearray()
    res = bytearray()
    res.extend(self.serializePrologue(additionalData))
    # Offsets have to be known before the states are written out,
    # because the states are emitted in offset order.
    self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
    for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
        res.extend(self.state2bytearray(state))
    return res
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
... | ... | @@ -8,6 +8,8 @@ class State(object): |
8 | 8 | ''' |
9 | 9 | A state in an automaton |
10 | 10 | ''' |
11 | + | |
12 | + statesCounter = 0 | |
11 | 13 | |
12 | 14 | def __init__(self, additionalData=None): |
13 | 15 | self.transitionsMap = {} |
... | ... | @@ -18,6 +20,9 @@ class State(object): |
18 | 20 | self.label2Freq = {} |
19 | 21 | self.serializeAsArray = False |
20 | 22 | self.additionalData = additionalData |
23 | + | |
24 | + self.idx = State.statesCounter | |
25 | + State.statesCounter += 1 | |
21 | 26 | |
22 | 27 | @property |
23 | 28 | def transitionsNum(self): |
... | ... | @@ -51,10 +56,16 @@ class State(object): |
51 | 56 | else: |
52 | 57 | return self.encodedData |
53 | 58 | |
def dfs(self, alreadyVisited, sortKey=lambda item: -item[1].freq):
    '''
    Generate the states reachable from this one in depth-first post-order
    (every reachable, not yet visited target is yielded before the state
    itself).

    alreadyVisited -- set of states to skip; it is updated in place, so
                      the caller decides whether traversals share it.
    sortKey        -- key function applied to (label, targetState) pairs
                      to order the outgoing transitions; by default the
                      most frequent targets come first.
    '''
    if self not in alreadyVisited:
        # Mark before descending so that cycles (including self-loops)
        # do not cause infinite recursion.
        alreadyVisited.add(self)
        for _, state in sorted(self.transitionsMap.items(), key=sortKey):
            # Pass sortKey down: otherwise a custom ordering would only
            # be honoured for the state the traversal started from.
            for state1 in state.dfs(alreadyVisited, sortKey):
                yield state1
        yield self
66 | + | |
67 | + def debug(self): | |
68 | + print '----------------' | |
69 | + print 'STATE:', self.idx | |
70 | + for label, s in self.transitionsMap.iteritems(): | |
71 | + print label, '-->', s.idx | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/visualizer.py
... | ... | @@ -12,7 +12,7 @@ class Visualizer(object): |
12 | 12 | def __init__(self): |
13 | 13 | pass |
14 | 14 | |
15 | - def visualize(self, fsa): | |
15 | + def visualize(self, fsa, charLabels=True): | |
16 | 16 | G = nx.DiGraph() |
17 | 17 | allStates = list(reversed(list(fsa.initialState.dfs(set())))) |
18 | 18 | edgeLabelsMap = {} |
... | ... | @@ -21,10 +21,12 @@ class Visualizer(object): |
21 | 21 | G.add_node(idx, offset=state.offset) |
22 | 22 | for c, targetState in state.transitionsMap.iteritems(): |
23 | 23 | G.add_edge(idx, allStates.index(targetState)) |
24 | - label = chr(c) if c <= 127 else '%' | |
24 | + label = (chr(c) if c <= 127 else '%') if charLabels \ | |
25 | + else c | |
25 | 26 | edgeLabelsMap[(idx, allStates.index(targetState))] = label |
26 | 27 | nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')' |
27 | 28 | pos=nx.shell_layout(G) |
29 | +# pos=nx.random_layout(G) | |
28 | 30 | nx.draw_networkx_nodes(G, |
29 | 31 | pos, |
30 | 32 | nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]), |
... | ... |
fsabuilder/morfeuszbuilder/fsa/visualizer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... | ... | @@ -6,8 +6,7 @@ Created on 23 sty 2014 |
6 | 6 | import re |
7 | 7 | from pyparsing import * |
8 | 8 | |
9 | -identifier = Word(alphas, bodyChars=alphanums+'_') | |
10 | -token = Word(alphas, bodyChars=alphanums+'_+>') | |
9 | +identifier = Word(alphas, bodyChars=alphanums+'_>*+') | |
11 | 10 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
12 | 11 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
13 | 12 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -34,6 +34,9 @@ class TagRule(SegmentRule): |
34 | 34 | |
35 | 35 | def _doAddToNFA(self, startState, endState): |
36 | 36 | startState.addTransition(self.segnum, endState) |
37 | + | |
38 | + def __str__(self): | |
39 | + return u''+self.segnum | |
37 | 40 | |
38 | 41 | class UnaryRule(SegmentRule): |
39 | 42 | |
... | ... | @@ -95,12 +98,3 @@ class ZeroOrMoreRule(UnaryRule): |
95 | 98 | self.child._doAddToNFA(intermStartState, intermEndState) |
96 | 99 | intermEndState.addTransition(None, endState) |
97 | 100 | endState.addTransition(None, intermStartState) |
98 | - | |
99 | -class IgnoreOrthRule(UnaryRule): | |
100 | - | |
101 | - def __init__(self, child): | |
102 | - super(IgnoreOrthRule, self).__init__(child) | |
103 | - | |
104 | - def _doAddToNFA(self, startState, endState): | |
105 | - startState.addTransition(self.child.segnum, endState, ignoreOrth=True) | |
106 | - | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
0 → 100644
1 | +''' | |
2 | +Created on 20 lut 2014 | |
3 | + | |
4 | +@author: mlenart | |
5 | +''' | |
6 | + | |
class RulesManager(object):
    '''
    Keeps one segmentation-rules DFA per combination of option values.
    '''

    def __init__(self):
        # frozenset of (option, value) pairs -> DFA built for that setting
        self.options2DFA = {}

    def _options2Key(self, optionsMap):
        '''
        Turn an options dict into a hashable, order-independent key.
        '''
        return frozenset(optionsMap.items())

    def addDFA4Options(self, optionsMap, dfa):
        '''
        Register dfa as the automaton for the given option values,
        replacing any DFA stored earlier for the same values.
        '''
        self.options2DFA[self._options2Key(optionsMap)] = dfa

    def serialize(self):
        '''
        Return the binary form of the stored automata.

        TODO: the binary layout is not implemented yet, so nothing is
        emitted. An empty bytearray is returned rather than None so that
        the result has the same type as the "no additional data" value.
        '''
        return bytearray()
0 | 20 | \ No newline at end of file |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... | ... | @@ -8,33 +8,98 @@ from morfeuszbuilder.fsa import fsa, state, encode |
8 | 8 | |
class RulesNFAState(object):
    '''
    A state of the nondeterministic automaton built from segment rules.

    Transitions are kept in transitionsMap as label -> set of target
    states. The label None is the one getClosure follows, i.e. it plays
    the role of an epsilon transition.
    '''

    # Class-wide counter used to give every state a unique idx.
    statesCounter = 0

    def __init__(self, initial=False, final=False, weak=False):
        self.transitionsMap = {}
        self.initial = initial
        self.final = final
        self.weak = weak
        self.idx = RulesNFAState.statesCounter
        RulesNFAState.statesCounter += 1

    def addTransition(self, label, targetState):
        '''
        Add a transition to targetState under label (None for epsilon).
        '''
        self.transitionsMap.setdefault(label, set()).add(targetState)

    def getClosure(self, visited):
        '''
        Return the set made of this state and everything reachable from it
        through None-labelled transitions only.

        visited -- set of states already expanded; updated in place.
                   A state found in it contributes an empty set, which
                   terminates the recursion on epsilon cycles.
        '''
        if self in visited:
            return set()
        visited.add(self)
        res = set([self])
        for nextState in self.transitionsMap.get(None, []):
            res |= nextState.getClosure(visited)
        return res

    def dfs(self, visitedStates=None):
        '''
        Generate the states reachable from this one (over any label) in
        depth-first pre-order, each of them exactly once.

        visitedStates -- optional set of states to skip; updated in place.
                         A new set is created when it is omitted, so
                         independent traversals do not affect each other.
        '''
        if visitedStates is None:
            visitedStates = set()
        if self not in visitedStates:
            visitedStates.add(self)
            yield self
            for nextStates in self.transitionsMap.values():
                for nextState in nextStates:
                    # The visited set must travel with the recursion,
                    # otherwise states would be yielded more than once.
                    for state1 in nextState.dfs(visitedStates):
                        yield state1

    def debug(self):
        '''
        Print this state's idx and its outgoing transitions to stdout.
        '''
        print('----------------')
        print('STATE: %s' % self.idx)
        for label, nextStates in self.transitionsMap.items():
            targetIdxs = [s.idx for s in sorted(nextStates, key=lambda s: s.idx)]
            print('%s --> %s' % (label, targetIdxs))
20 | 53 | |
class RulesNFA(object):
    '''
    Nondeterministic automaton over segment numbers, convertible to a DFA
    by mapping sets of NFA states to single DFA states.
    '''

    def __init__(self):
        self.initialState = RulesNFAState(initial=True)

    def _groupOutputByLabels(self, nfaStates):
        '''
        For a collection of NFA states return a dict: label -> set of NFA
        states reachable by one transition with that label followed by any
        number of None-labelled transitions. None labels themselves are
        not included in the result.
        '''
        grouped = {}
        for nfaState in nfaStates:
            for label, targets in nfaState.transitionsMap.items():
                if label is None:
                    continue
                reachable = grouped.setdefault(label, set())
                for target in targets:
                    reachable.update(target.getClosure(set()))
        return grouped

    def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
        '''
        Fill dfaState so that it represents the given set of NFA states,
        creating (recursively) the DFA states it leads to.

        nfaSubset2DFAState -- dict: frozenset of NFA states -> DFA state
                              already created for it; updated in place.
        '''
        weakFlags = [nfaState.weak for nfaState in nfaStates]
        # a DFA state must not mix weak and non-weak NFA states
        assert all(weakFlags) or not any(weakFlags)
        weak = all(weakFlags)
        final = any([nfaState.final for nfaState in nfaStates])
        assert not weak or not final
        if final:
            # dfaState should be final
            # and contain info about weakness
            dfaState.encodedData = bytearray([1 if weak else 0])
        for label, nextNFAStates in self._groupOutputByLabels(nfaStates).items():
            key = frozenset(nextNFAStates)
            if key not in nfaSubset2DFAState:
                # register the new state before recursing, so that cycles
                # find it in the map instead of recursing again
                newDFAState = state.State()
                nfaSubset2DFAState[key] = newDFAState
                self._doConvertState(newDFAState, nextNFAStates, nfaSubset2DFAState)
            dfaState.setTransition(label, nfaSubset2DFAState[key])

    def convertToDFA(self):
        '''
        Build and return the deterministic automaton equivalent to this NFA.
        '''
        dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False)
        startStates = self.initialState.getClosure(set())
        finalStartStates = [s for s in startStates if s.final]
        assert not any(finalStartStates)
        dfa.initialState = state.State(additionalData=False)
        subset2DFAState = {frozenset(startStates): dfa.initialState}
        self._doConvertState(dfa.initialState, startStates, subset2DFAState)
        return dfa

    def debug(self):
        '''
        Call debug() on every state reachable from the initial state.
        '''
        for nfaState in self.initialState.dfs():
            nfaState.debug()
40 | 105 | |
41 | 106 | \ No newline at end of file |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -3,7 +3,7 @@ from pyparsing import * |
3 | 3 | ParserElement.enablePackrat() |
4 | 4 | from morfeuszbuilder.tagset import segtypes |
5 | 5 | from morfeuszbuilder.utils import configFile, exceptions |
6 | -from morfeuszbuilder.segrules import preprocessor, rules | |
6 | +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager | |
7 | 7 | import codecs |
8 | 8 | import re |
9 | 9 | |
... | ... | @@ -28,9 +28,9 @@ class RulesParser(object): |
28 | 28 | return res |
29 | 29 | |
30 | 30 | def parse(self, filename): |
31 | - res = [] | |
31 | + res = rulesManager.RulesManager() | |
32 | 32 | |
33 | - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) | |
33 | + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) | |
34 | 34 | key2Defs = self._getKey2Defs(segtypesConfigFile) |
35 | 35 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) |
36 | 36 | |
... | ... | @@ -39,14 +39,18 @@ class RulesParser(object): |
39 | 39 | for define in defs: |
40 | 40 | def2Key[define] = key |
41 | 41 | |
42 | + firstNFA = None | |
42 | 43 | for defs in itertools.product(*key2Defs.values()): |
43 | 44 | key2Def = dict([(def2Key[define], define) for define in defs]) |
44 | - nfa = rulesNFA.RulesNFA(key2Def) | |
45 | + nfa = rulesNFA.RulesNFA() | |
46 | + if not firstNFA: | |
47 | + firstNFA = nfa | |
45 | 48 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
46 | 49 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) |
47 | 50 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
48 | 51 | rule.addToNFA(nfa) |
49 | - res.append(nfa) | |
52 | + dfa = nfa.convertToDFA() | |
53 | + res.addDFA4Options(key2Def, dfa) | |
50 | 54 | return res |
51 | 55 | |
52 | 56 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
... | ... | @@ -58,14 +62,14 @@ class RulesParser(object): |
58 | 62 | if not segtypesHelper.hasSegtype(segtype): |
59 | 63 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) |
60 | 64 | else: |
65 | +# return rules.TagRule(segtype) | |
61 | 66 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) |
62 | 67 | |
63 | 68 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
64 | 69 | rule = Forward() |
65 | - tagRule = Word(alphanums+'_') | |
66 | - ignoreOrthRule = tagRule + Suppress('>') | |
70 | + tagRule = Word(alphanums+'_>') | |
67 | 71 | parenRule = Suppress('(') + rule + Suppress(')') |
68 | - atomicRule = tagRule ^ ignoreOrthRule ^ parenRule | |
72 | + atomicRule = tagRule ^ parenRule | |
69 | 73 | zeroOrMoreRule = atomicRule + Suppress('*') |
70 | 74 | oneOrMoreRule = atomicRule + Suppress('+') |
71 | 75 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
... | ... | @@ -75,19 +79,10 @@ class RulesParser(object): |
75 | 79 | rule << concatRule |
76 | 80 | |
77 | 81 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) |
78 | - ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) | |
79 | 82 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
80 | 83 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
81 | 84 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
82 | 85 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
83 | 86 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
84 | - | |
85 | - | |
86 | -# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule | |
87 | - | |
88 | -# tagRule.setParseAction(lambda s,l,toks: doprint(toks)) | |
89 | -# print lineNum, line | |
90 | 87 | parsedRule = rule.parseString(line, parseAll=True)[0] |
91 | - print parsedRule | |
92 | 88 | return parsedRule |
93 | -# print parsedLine | |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
... | ... | @@ -7,12 +7,20 @@ import unittest |
7 | 7 | import os |
8 | 8 | from morfeuszbuilder.segrules import rulesParser |
9 | 9 | from morfeuszbuilder.tagset import tagset |
10 | +from morfeuszbuilder.fsa import visualizer, serializer | |
10 | 11 | |
class Test(unittest.TestCase):
    # NOTE: everything below runs while the class body is executed (there
    # are no test methods); it is a manual smoke test which parses the
    # rules file, dumps one of the resulting automata and draws it.
    print('do test')
    t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset'))
    parser = rulesParser.RulesParser(t)
    fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat'))
    # parse() returns a RulesManager, which cannot be indexed like a list.
    # Take the DFA of the smallest options key so that the choice does not
    # depend on dictionary ordering.
    fsa = min(fsas.options2DFA.items(), key=lambda item: sorted(item[0]))[1]
    for s in fsa.dfs():
        s.debug()
    print('states: %s' % len(list(fsa.dfs())))
    print('transitions: %s' % fsa.getTransitionsNum())
    # labels are segment numbers here, not characters
    visualizer.Visualizer().visualize(fsa, charLabels=False)
    print('size: %s' % len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())))
    print('done')
17 | 25 | |
18 | 26 | if __name__ == "__main__": |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
... | ... | @@ -103,7 +103,7 @@ moze_interp( naj> adj_sup ) |
103 | 103 | |
104 | 104 | # Formy „zanegowane” gerundiów i imiesłowów: |
105 | 105 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
106 | -moze_interp( nie > negat ) | |
106 | +moze_interp( nie> negat ) | |
107 | 107 | |
108 | 108 | # Przyimki akceptujące krótką formę „-ń” |
109 | 109 | moze_interp(z_on_agl) |
... | ... | @@ -111,7 +111,7 @@ moze_interp(z_on_agl) |
111 | 111 | moze_interp(z_on_agl on_agl) |
112 | 112 | |
113 | 113 | # Liczba zapisana jako ciąg cyfr: |
114 | -moze_interp( dig>* dig ) | |
114 | +#moze_interp( dig>* dig ) | |
115 | 115 | |
116 | 116 | # Formacje prefiksalne |
117 | 117 | #### trzeba wydzielić odpowiednie samodze! |
... | ... | @@ -132,13 +132,35 @@ adj dywiz samodz |
132 | 132 | # ? |
133 | 133 | samodz dywiz adj |
134 | 134 | |
135 | +[segment types] | |
136 | +naj> | |
137 | +nie> | |
138 | +prefs | |
139 | +prefv | |
140 | +dig> | |
141 | +adja | |
142 | +adj | |
143 | +adj_sup | |
144 | +negat | |
145 | +on_agl | |
146 | +z_on_agl | |
147 | +samotny | |
148 | +interp | |
149 | +aglsg | |
150 | +aglpl | |
151 | +praetcond | |
152 | +praet_sg_agl | |
153 | +praet_sg_na | |
154 | +praet_sg | |
155 | +praet_pl | |
156 | +samodz | |
135 | 157 | |
136 | 158 | [tags] |
137 | -naj naj | |
138 | -nie nie | |
159 | +naj> naj | |
160 | +nie> nie | |
139 | 161 | prefs prefs |
140 | 162 | prefv prefv |
141 | -dig dig | |
163 | +dig> dig | |
142 | 164 | adja adja |
143 | 165 | adj adj:%:pos |
144 | 166 | adj_sup adj:%:sup |
... | ... |
fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat
0 → 100644
1 | +[options] | |
2 | +aggl=permissive strict isolated | |
3 | +praet=split composite | |
4 | + | |
5 | +[combinations] | |
6 | +#define wsz_interp (interp|kropka|dywiz)* | |
7 | + | |
8 | +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp | |
9 | + | |
10 | +moze_interp(samodz) | |
11 | +samotny | |
12 | + | |
13 | + | |
14 | +[segment types] | |
15 | +naj> | |
16 | +nie> | |
17 | +prefs | |
18 | +prefv | |
19 | +dig | |
20 | +adja | |
21 | +adj | |
22 | +adj_sup | |
23 | +negat | |
24 | +on_agl | |
25 | +z_on_agl | |
26 | +samotny | |
27 | +interp | |
28 | +aglsg | |
29 | +aglpl | |
30 | +praetcond | |
31 | +praet_sg_agl | |
32 | +praet_sg_na | |
33 | +praet_sg | |
34 | +praet_pl | |
35 | +samodz | |
36 | + | |
37 | +[tags] | |
38 | +naj> naj | |
39 | +nie> nie | |
40 | +prefs prefs | |
41 | +prefv prefv | |
42 | +dig dig | |
43 | +adja adja | |
44 | +adj adj:%:pos | |
45 | +adj_sup adj:%:sup | |
46 | +adj_sup adv:sup | |
47 | +negat ger:%:neg | |
48 | +negat pact:%:neg | |
49 | +negat ppas:%:neg | |
50 | +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | |
51 | +z_on_agl prep:% | |
52 | +samotny brev:pun | |
53 | +samotny brev:npun | |
54 | +samotny intrj | |
55 | +interp interp | |
56 | +aglsg aglt:sg:% | |
57 | +aglpl aglt:pl:% | |
58 | +praetcond cond:% | |
59 | +praetcond praet:%:pri:% | |
60 | +praetcond praet:%:sec:% | |
61 | +praetcond praet:%:ter:% | |
62 | +praet_sg_agl praet:sg:%:agl | |
63 | +praet_sg_na praet:sg:%:nagl | |
64 | +praet_sg praet:sg:% | |
65 | +praet_pl praet:pl:% | |
66 | +praet_sg winien:sg:% | |
67 | +praet_pl winien:pl:% | |
68 | +samodz % | |
69 | + | |
70 | +[lexemes] | |
71 | +z_aglt aby:comp | |
72 | +z_aglt bowiem:comp | |
73 | +by by:qub | |
74 | +z_aglt by:comp | |
75 | +z_aglt cóż:subst | |
76 | +z_aglt czemu:adv | |
77 | +z_aglt czyżby:qub | |
78 | +z_aglt choćby:comp | |
79 | +z_aglt chociażby:comp | |
80 | +z_aglt dlaczego:adv | |
81 | +z_aglt dopóki:comp | |
82 | +z_aglt dopóty:conj | |
83 | +z_aglt gdyby:comp | |
84 | +z_aglt gdzie:qub | |
85 | +z_aglt gdzie:adv | |
86 | +z_aglt jakby:comp | |
87 | +z_aglt jakoby:comp | |
88 | +z_aglt kiedy:adv | |
89 | +z_aglt kiedy:comp | |
90 | +z_aglt tylko:qub | |
91 | +z_aglt żeby:comp | |
92 | +dywiz -:interp | |
93 | +kropka .:interp | |
... | ... |