Commit 4ea040d0c359bd5f64b432695ae6799011e0fb72
1 parent 8d5a878e
- implemented NFA -> DFA conversion for the automata that glue segments together
- removed "ignoreOrth"

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@87 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 18 changed files with 409 additions and 64 deletions
fsabuilder/PoliMorfTest.cek
0 → 100644
1 | +bij ABć+impt:sg:sec:imperf+pospolita | ||
2 | +bija AAć+fin:sg:ter:imperf+pospolita | ||
3 | +bijacie ADć+fin:pl:sec:imperf+pospolita | ||
4 | +bijaj ABć+impt:sg:sec:imperf+pospolita | ||
5 | +bijajcie AEć+impt:pl:sec:imperf+pospolita | ||
6 | +bijajmy ADć+impt:pl:pri:imperf+pospolita | ||
7 | +bijają ACć+fin:pl:ter:imperf+pospolita | ||
8 | +bijając ADć+pcon:imperf+pospolita | ||
9 | +bijająca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita | ||
10 | +bijające AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | ||
11 | +bijającego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
12 | +bijającej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita | ||
13 | +bijającemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
14 | +bijający AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | ||
15 | +bijających AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
16 | +bijającym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
17 | +bijającymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
18 | +bijającą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita | ||
19 | +bijak AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita | ||
20 | +bijaka AB+subst:sg:gen:m3+pospolita | ||
21 | +bijakach AD+subst:pl:loc:m3+pospolita | ||
22 | +bijakami AD+subst:pl:inst:m3+pospolita | ||
23 | +bijaki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita | ||
24 | +bijakiem AD+subst:sg:inst:m3+pospolita | ||
25 | +bijakom AC+subst:pl:dat:m3+pospolita | ||
26 | +bijakowa ABy+adj:sg:nom.voc:f:pos+pospolita | ||
27 | +bijakowe ABy+adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos+pospolita|ABy+adj:sg:acc:n1.n2:pos+pospolita|ABy+adj:sg:nom.voc:n1.n2:pos+pospolita | ||
28 | +bijakowego ADy+adj:sg:acc:m1.m2:pos+pospolita|ADy+adj:sg:gen:m1.m2.m3.n1.n2:pos+pospolita | ||
29 | +bijakowej ACy+adj:sg:dat:f:pos+pospolita|ACy+adj:sg:gen:f:pos+pospolita|ACy+adj:sg:loc:f:pos+pospolita | ||
30 | +bijakowemu ADy+adj:sg:dat:m1.m2.m3.n1.n2:pos+pospolita | ||
31 | +bijakowi ABy+adj:pl:nom.voc:m1.p1:pos+pospolita|AD+subst:sg:dat:m3+pospolita | ||
32 | +bijakowo ABy+adja+pospolita | ||
33 | +bijakowości ACć+subst:pl:acc:f+pospolita|ACć+subst:pl:gen:f+pospolita|ACć+subst:pl:nom:f+pospolita|ACć+subst:pl:voc:f+pospolita|ACć+subst:sg:dat:f+pospolita|ACć+subst:sg:gen:f+pospolita|ACć+subst:sg:loc:f+pospolita|ACć+subst:sg:voc:f+pospolita | ||
34 | +bijakowościach AFć+subst:pl:loc:f+pospolita | ||
35 | +bijakowościami AFć+subst:pl:inst:f+pospolita | ||
36 | +bijakowościom AEć+subst:pl:dat:f+pospolita | ||
37 | +bijakowością ADć+subst:sg:inst:f+pospolita | ||
38 | +bijakowość AA+subst:sg:acc:f+pospolita|AA+subst:sg:nom:f+pospolita | ||
39 | +bijakowy AA+adj:sg:acc:m3:pos+pospolita|AA+adj:sg:nom.voc:m1.m2.m3:pos+pospolita | ||
40 | +bijakowych AC+adj:pl:acc:m1.p1:pos+pospolita|AC+adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AC+adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita | ||
41 | +bijakowym AB+adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita|AB+adj:sg:inst:m1.m2.m3.n1.n2:pos+pospolita|AB+adj:sg:loc:m1.m2.m3.n1.n2:pos+pospolita | ||
42 | +bijakowymi AC+adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos+pospolita | ||
43 | +bijakową ABy+adj:sg:acc:f:pos+pospolita|ABy+adj:sg:inst:f:pos+pospolita | ||
44 | +bijaku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita | ||
45 | +bijaków AC+subst:pl:gen:m3+pospolita | ||
46 | +bijali ACć+praet:pl:m1.p1:imperf+pospolita | ||
47 | +bijam ABć+fin:sg:pri:imperf+pospolita | ||
48 | +bijamy ACć+fin:pl:pri:imperf+pospolita | ||
49 | +bijana ACć+ppas:sg:nom.voc:f:imperf:aff+pospolita | ||
50 | +bijance ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita | ||
51 | +bijane ACć+ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|ACć+ppas:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | ||
52 | +bijanego AEć+ppas:sg:acc:m1.m2:imperf:aff+pospolita|AEć+ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
53 | +bijanej ADć+ppas:sg:gen.dat.loc:f:imperf:aff+pospolita | ||
54 | +bijanek ACka+subst:pl:gen:f+pospolita | ||
55 | +bijanemu AEć+ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
56 | +bijani ACć+ppas:pl:nom.voc:m1.p1:imperf:aff+pospolita | ||
57 | +bijania ADć+ger:sg:gen:n2:imperf:aff+pospolita | ||
58 | +bijanie ADć+ger:sg:nom.acc:n2:imperf:aff+pospolita | ||
59 | +bijaniem AEć+ger:sg:inst:n2:imperf:aff+pospolita | ||
60 | +bijaniu ADć+ger:sg:dat.loc:n2:imperf:aff+pospolita | ||
61 | +bijanka AA+subst:sg:nom:f+pospolita | ||
62 | +bijankach AC+subst:pl:loc:f+pospolita | ||
63 | +bijankami AC+subst:pl:inst:f+pospolita | ||
64 | +bijanki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita | ||
65 | +bijanko ABa+subst:sg:voc:f+pospolita | ||
66 | +bijankom ACa+subst:pl:dat:f+pospolita | ||
67 | +bijanką ABa+subst:sg:inst:f+pospolita | ||
68 | +bijankę ABa+subst:sg:acc:f+pospolita | ||
69 | +bijano ACć+imps:imperf+pospolita | ||
70 | +bijany ACć+ppas:sg:acc:m3:imperf:aff+pospolita|ACć+ppas:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | ||
71 | +bijanych AEć+ppas:pl:acc:m1.p1:imperf:aff+pospolita|AEć+ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
72 | +bijanym ADć+ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|ADć+ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
73 | +bijanymi AEć+ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
74 | +bijaną ACć+ppas:sg:acc.inst:f:imperf:aff+pospolita | ||
75 | +bijasz ACć+fin:sg:sec:imperf+pospolita | ||
76 | +bijatyce ACka+subst:sg:dat:f+pospolita|ACka+subst:sg:loc:f+pospolita | ||
77 | +bijatyk AAa+subst:pl:gen:f+pospolita | ||
78 | +bijatyka AA+subst:sg:nom:f+pospolita | ||
79 | +bijatykach AC+subst:pl:loc:f+pospolita | ||
80 | +bijatykami AC+subst:pl:inst:f+pospolita | ||
81 | +bijatyki ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita|ABa+subst:sg:gen:f+pospolita | ||
82 | +bijatyko ABa+subst:sg:voc:f+pospolita | ||
83 | +bijatykom ACa+subst:pl:dat:f+pospolita | ||
84 | +bijatyką ABa+subst:sg:inst:f+pospolita | ||
85 | +bijatykę ABa+subst:sg:acc:f+pospolita | ||
86 | +bijać AA+inf:imperf+pospolita | ||
87 | +bijał ABć+praet:sg:m1.m2.m3:imperf+pospolita | ||
88 | +bijała ACć+praet:sg:f:imperf+pospolita | ||
89 | +bijało ACć+praet:sg:n1.n2:imperf+pospolita | ||
90 | +bijały ACć+praet:pl:m2.m3.f.n1.n2.p2.p3:imperf+pospolita | ||
91 | +bijcie AEć+impt:pl:sec:imperf+pospolita | ||
92 | +bije ACć+fin:sg:ter:imperf+pospolita | ||
93 | +bijecie AFć+fin:pl:sec:imperf+pospolita | ||
94 | +bijekcja AA+subst:sg:nom:f+pospolita | ||
95 | +bijekcjach AC+subst:pl:loc:f+pospolita | ||
96 | +bijekcjami AC+subst:pl:inst:f+pospolita | ||
97 | +bijekcje ABa+subst:pl:acc:f+pospolita|ABa+subst:pl:nom:f+pospolita|ABa+subst:pl:voc:f+pospolita | ||
98 | +bijekcji ABa+subst:pl:gen:f+pospolita|ABa+subst:sg:dat:f+pospolita|ABa+subst:sg:gen:f+pospolita|ABa+subst:sg:loc:f+pospolita | ||
99 | +bijekcjo ABa+subst:sg:voc:f+pospolita | ||
100 | +bijekcjom ACa+subst:pl:dat:f+pospolita | ||
101 | +bijekcją ABa+subst:sg:inst:f+pospolita | ||
102 | +bijekcję ABa+subst:sg:acc:f+pospolita | ||
103 | +bijekcyj ACja+subst:pl:gen:f+pospolita | ||
104 | +bijemy AEć+fin:pl:pri:imperf+pospolita | ||
105 | +bijesz AEć+fin:sg:sec:imperf+pospolita | ||
106 | +bijmy ADć+impt:pl:pri:imperf+pospolita | ||
107 | +bijnik AA+subst:sg:acc:m3+pospolita|AA+subst:sg:nom:m3+pospolita | ||
108 | +bijnika AB+subst:sg:gen:m3+pospolita | ||
109 | +bijnikach AD+subst:pl:loc:m3+pospolita | ||
110 | +bijnikami AD+subst:pl:inst:m3+pospolita | ||
111 | +bijniki AB+subst:pl:acc:m3+pospolita|AB+subst:pl:nom:m3+pospolita|AB+subst:pl:voc:m3+pospolita | ||
112 | +bijnikiem AD+subst:sg:inst:m3+pospolita | ||
113 | +bijnikom AC+subst:pl:dat:m3+pospolita | ||
114 | +bijnikowi AD+subst:sg:dat:m3+pospolita | ||
115 | +bijniku AB+subst:sg:loc:m3+pospolita|AB+subst:sg:voc:m3+pospolita | ||
116 | +bijników AC+subst:pl:gen:m3+pospolita | ||
117 | +biją ACć+fin:pl:ter:imperf+pospolita | ||
118 | +bijąc ADć+pcon:imperf+pospolita | ||
119 | +bijąca AEć+pact:sg:nom.voc:f:imperf:aff+pospolita | ||
120 | +bijące AEć+pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff+pospolita|AEć+pact:sg:nom.acc.voc:n1.n2:imperf:aff+pospolita | ||
121 | +bijącego AGć+pact:sg:acc:m1.m2:imperf:aff+pospolita|AGć+pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
122 | +bijącej AFć+pact:sg:gen.dat.loc:f:imperf:aff+pospolita | ||
123 | +bijącemu AGć+pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
124 | +bijący AEć+pact:pl:nom.voc:m1.p1:imperf:aff+pospolita|AEć+pact:sg:acc:m3:imperf:aff+pospolita|AEć+pact:sg:nom.voc:m1.m2.m3:imperf:aff+pospolita | ||
125 | +bijących AGć+pact:pl:acc:m1.p1:imperf:aff+pospolita|AGć+pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
126 | +bijącym AFć+pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita|AFć+pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff+pospolita | ||
127 | +bijącymi AGć+pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff+pospolita | ||
128 | +bijącą AEć+pact:sg:acc.inst:f:imperf:aff+pospolita |
fsabuilder/buildfsa.py
@@ -10,9 +10,10 @@ import logging | @@ -10,9 +10,10 @@ import logging | ||
10 | import codecs | 10 | import codecs |
11 | from morfeuszbuilder.fsa import encode | 11 | from morfeuszbuilder.fsa import encode |
12 | from morfeuszbuilder.fsa import convertinput | 12 | from morfeuszbuilder.fsa import convertinput |
13 | -from morfeuszbuilder.fsa import common | ||
14 | from morfeuszbuilder.fsa.fsa import FSA | 13 | from morfeuszbuilder.fsa.fsa import FSA |
15 | from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer | 14 | from morfeuszbuilder.fsa.serializer import VLengthSerializer1, VLengthSerializer2, SimpleSerializer |
15 | +from morfeuszbuilder.tagset.tagset import Tagset | ||
16 | +from morfeuszbuilder.segrules import rulesParser | ||
16 | from optparse import OptionParser | 17 | from optparse import OptionParser |
17 | 18 | ||
18 | # class InputFormat(): | 19 | # class InputFormat(): |
@@ -50,6 +51,10 @@ def _parseOptions(): | @@ -50,6 +51,10 @@ def _parseOptions(): | ||
50 | dest='tagsetFile', | 51 | dest='tagsetFile', |
51 | metavar='FILE', | 52 | metavar='FILE', |
52 | help='path to the file with tagset') | 53 | help='path to the file with tagset') |
54 | + parser.add_option('--segments-file', | ||
55 | + dest='segmentsFile', | ||
56 | + metavar='FILE', | ||
57 | + help='path to the file with segment rules') | ||
53 | parser.add_option('-o', '--output-file', | 58 | parser.add_option('-o', '--output-file', |
54 | dest='outputFile', | 59 | dest='outputFile', |
55 | metavar='FILE', | 60 | metavar='FILE', |
@@ -107,6 +112,8 @@ def _parseOptions(): | @@ -107,6 +112,8 @@ def _parseOptions(): | ||
107 | _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") | 112 | _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") |
108 | _checkExactlyOneOptionSet([opts.analyzer, opts.generator], | 113 | _checkExactlyOneOptionSet([opts.analyzer, opts.generator], |
109 | parser, 'Must set exactly one FSA type: --analyzer or --generator') | 114 | parser, 'Must set exactly one FSA type: --analyzer or --generator') |
115 | + if opts.analyzer: | ||
116 | + _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") | ||
110 | 117 | ||
111 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: | 118 | if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: |
112 | print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')' | 119 | print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')' |
@@ -147,9 +154,8 @@ def _printStats(fsa): | @@ -147,9 +154,8 @@ def _printStats(fsa): | ||
147 | logging.info('sink states num: '+str(sinkNum)) | 154 | logging.info('sink states num: '+str(sinkNum)) |
148 | logging.info('array states num: '+str(arrayNum)) | 155 | logging.info('array states num: '+str(arrayNum)) |
149 | 156 | ||
150 | -def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): | 157 | +def buildAnalyzerFromPoliMorf(inputFile, tagset): |
151 | encoder = encode.MorphEncoder() | 158 | encoder = encode.MorphEncoder() |
152 | - tagset = common.Tagset(tagsetFile) | ||
153 | fsa = FSA(encoder, tagset) | 159 | fsa = FSA(encoder, tagset) |
154 | inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) | 160 | inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) |
155 | for word, data in inputData: | 161 | for word, data in inputData: |
@@ -160,7 +166,7 @@ def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): | @@ -160,7 +166,7 @@ def buildAnalyzerFromPoliMorf(inputFile, tagsetFile): | ||
160 | 166 | ||
161 | def buildGeneratorFromPoliMorf(inputFile, tagsetFile): | 167 | def buildGeneratorFromPoliMorf(inputFile, tagsetFile): |
162 | encoder = encode.Encoder4Generator() | 168 | encoder = encode.Encoder4Generator() |
163 | - tagset = common.Tagset(tagsetFile) | 169 | + tagset = Tagset(tagsetFile) |
164 | fsa = FSA(encoder, tagset) | 170 | fsa = FSA(encoder, tagset) |
165 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) | 171 | inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) |
166 | for word, data in inputData: | 172 | for word, data in inputData: |
@@ -175,10 +181,15 @@ def main(opts): | @@ -175,10 +181,15 @@ def main(opts): | ||
175 | else: | 181 | else: |
176 | logging.basicConfig(level=logging.INFO) | 182 | logging.basicConfig(level=logging.INFO) |
177 | 183 | ||
184 | + tagset = Tagset(opts.tagsetFile) | ||
185 | + | ||
178 | if opts.analyzer: | 186 | if opts.analyzer: |
179 | - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, opts.tagsetFile) | 187 | + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset) |
188 | + segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile) | ||
189 | + additionalData = segmentRulesManager.serialize() | ||
180 | else: | 190 | else: |
181 | - fsa = buildGeneratorFromPoliMorf(opts.inputFile, opts.tagsetFile) | 191 | + fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset) |
192 | + additionalData = bytearray() | ||
182 | 193 | ||
183 | if opts.trainFile: | 194 | if opts.trainFile: |
184 | logging.info('training with '+opts.trainFile+' ...') | 195 | logging.info('training with '+opts.trainFile+' ...') |
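The buildfsa.py changes wire the new --segments-file option into the analyzer build: the tagset is now parsed once and shared, the segment rules are compiled into a RulesManager of DFAs, and the result is kept as additionalData (presumably destined for the serialized output). A minimal sketch of that path, assuming the file names below are placeholder paths and keeping in mind that RulesManager.serialize() is still an empty stub in this commit:

    from morfeuszbuilder.tagset.tagset import Tagset
    from morfeuszbuilder.segrules import rulesParser

    # Parse the tagset once and reuse it for both the dictionary FSA and the segment rules.
    tagset = Tagset('polimorf.tagset')                                            # placeholder path

    # Compile the [combinations] section into one DFA per options combination.
    segmentRulesManager = rulesParser.RulesParser(tagset).parse('segmenty.dat')   # placeholder path

    # Meant to become the extra payload written next to the FSA; returns None for now.
    additionalData = segmentRulesManager.serialize()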
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -119,4 +119,3 @@ class FSA(object): | @@ -119,4 +119,3 @@ class FSA(object): | ||
119 | state.reverseOffset = currReverseOffset | 119 | state.reverseOffset = currReverseOffset |
120 | for state in self.initialState.dfs(set()): | 120 | for state in self.initialState.dfs(set()): |
121 | state.offset = currReverseOffset - state.reverseOffset | 121 | state.offset = currReverseOffset - state.reverseOffset |
122 | - | ||
123 | \ No newline at end of file | 122 | \ No newline at end of file |
fsabuilder/morfeuszbuilder/fsa/fsa.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -45,16 +45,15 @@ class Serializer(object): | @@ -45,16 +45,15 @@ class Serializer(object): | ||
45 | 45 | ||
46 | def serialize2BinaryFile(self, fname): | 46 | def serialize2BinaryFile(self, fname): |
47 | with open(fname, 'wb') as f: | 47 | with open(fname, 'wb') as f: |
48 | - f.write(self.fsa2bytearray()) | 48 | + f.write(self.fsa2bytearray(self.serializeTagset(self.fsa.tagset))) |
49 | 49 | ||
50 | def getStateSize(self, state): | 50 | def getStateSize(self, state): |
51 | raise NotImplementedError('Not implemented') | 51 | raise NotImplementedError('Not implemented') |
52 | 52 | ||
53 | - def fsa2bytearray(self): | 53 | + def fsa2bytearray(self, additionalData=bytearray()): |
54 | res = bytearray() | 54 | res = bytearray() |
55 | - res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) | 55 | + res.extend(self.serializePrologue(additionalData)) |
56 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) | 56 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
57 | - logging.debug('SERIALIZE') | ||
58 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): | 57 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
59 | res.extend(self.state2bytearray(state)) | 58 | res.extend(self.state2bytearray(state)) |
60 | return res | 59 | return res |
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/state.py
@@ -8,6 +8,8 @@ class State(object): | @@ -8,6 +8,8 @@ class State(object): | ||
8 | ''' | 8 | ''' |
9 | A state in an automaton | 9 | A state in an automaton |
10 | ''' | 10 | ''' |
11 | + | ||
12 | + statesCounter = 0 | ||
11 | 13 | ||
12 | def __init__(self, additionalData=None): | 14 | def __init__(self, additionalData=None): |
13 | self.transitionsMap = {} | 15 | self.transitionsMap = {} |
@@ -18,6 +20,9 @@ class State(object): | @@ -18,6 +20,9 @@ class State(object): | ||
18 | self.label2Freq = {} | 20 | self.label2Freq = {} |
19 | self.serializeAsArray = False | 21 | self.serializeAsArray = False |
20 | self.additionalData = additionalData | 22 | self.additionalData = additionalData |
23 | + | ||
24 | + self.idx = State.statesCounter | ||
25 | + State.statesCounter += 1 | ||
21 | 26 | ||
22 | @property | 27 | @property |
23 | def transitionsNum(self): | 28 | def transitionsNum(self): |
@@ -51,10 +56,16 @@ class State(object): | @@ -51,10 +56,16 @@ class State(object): | ||
51 | else: | 56 | else: |
52 | return self.encodedData | 57 | return self.encodedData |
53 | 58 | ||
54 | - def dfs(self, alreadyVisited=set(), sortKey=lambda (_, state): -state.freq): | 59 | + def dfs(self, alreadyVisited, sortKey=lambda (_, state): -state.freq): |
55 | if not self in alreadyVisited: | 60 | if not self in alreadyVisited: |
61 | + alreadyVisited.add(self) | ||
56 | for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey): | 62 | for _, state in sorted(self.transitionsMap.iteritems(), key=sortKey): |
57 | for state1 in state.dfs(alreadyVisited): | 63 | for state1 in state.dfs(alreadyVisited): |
58 | yield state1 | 64 | yield state1 |
59 | - alreadyVisited.add(self) | ||
60 | yield self | 65 | yield self |
66 | + | ||
67 | + def debug(self): | ||
68 | + print '----------------' | ||
69 | + print 'STATE:', self.idx | ||
70 | + for label, s in self.transitionsMap.iteritems(): | ||
71 | + print label, '-->', s.idx |
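The new dfs() signature also removes a classic Python pitfall: a default argument is evaluated once, at function definition time, so a mutable default such as alreadyVisited=set() is shared between separate traversals and a second walk over the same automaton silently yields nothing. A tiny illustration of that behaviour (not repository code), in the module's Python 2 style:

    def visit(node, visited=set()):    # the default set is created once, at def time
        if node in visited:
            return []
        visited.add(node)
        return [node]

    print visit('a')   # ['a']
    print visit('a')   # [] -- 'a' is still in the default set left over from the first call

Passing the visited set explicitly, as State.dfs() now requires, sidesteps the problem.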
fsabuilder/morfeuszbuilder/fsa/state.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/visualizer.py
@@ -12,7 +12,7 @@ class Visualizer(object): | @@ -12,7 +12,7 @@ class Visualizer(object): | ||
12 | def __init__(self): | 12 | def __init__(self): |
13 | pass | 13 | pass |
14 | 14 | ||
15 | - def visualize(self, fsa): | 15 | + def visualize(self, fsa, charLabels=True): |
16 | G = nx.DiGraph() | 16 | G = nx.DiGraph() |
17 | allStates = list(reversed(list(fsa.initialState.dfs(set())))) | 17 | allStates = list(reversed(list(fsa.initialState.dfs(set())))) |
18 | edgeLabelsMap = {} | 18 | edgeLabelsMap = {} |
@@ -21,10 +21,12 @@ class Visualizer(object): | @@ -21,10 +21,12 @@ class Visualizer(object): | ||
21 | G.add_node(idx, offset=state.offset) | 21 | G.add_node(idx, offset=state.offset) |
22 | for c, targetState in state.transitionsMap.iteritems(): | 22 | for c, targetState in state.transitionsMap.iteritems(): |
23 | G.add_edge(idx, allStates.index(targetState)) | 23 | G.add_edge(idx, allStates.index(targetState)) |
24 | - label = chr(c) if c <= 127 else '%' | 24 | + label = (chr(c) if c <= 127 else '%') if charLabels \ |
25 | + else c | ||
25 | edgeLabelsMap[(idx, allStates.index(targetState))] = label | 26 | edgeLabelsMap[(idx, allStates.index(targetState))] = label |
26 | nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')' | 27 | nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')' |
27 | pos=nx.shell_layout(G) | 28 | pos=nx.shell_layout(G) |
29 | +# pos=nx.random_layout(G) | ||
28 | nx.draw_networkx_nodes(G, | 30 | nx.draw_networkx_nodes(G, |
29 | pos, | 31 | pos, |
30 | nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]), | 32 | nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]), |
fsabuilder/morfeuszbuilder/fsa/visualizer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -6,8 +6,7 @@ Created on 23 sty 2014 | @@ -6,8 +6,7 @@ Created on 23 sty 2014 | ||
6 | import re | 6 | import re |
7 | from pyparsing import * | 7 | from pyparsing import * |
8 | 8 | ||
9 | -identifier = Word(alphas, bodyChars=alphanums+'_') | ||
10 | -token = Word(alphas, bodyChars=alphanums+'_+>') | 9 | +identifier = Word(alphas, bodyChars=alphanums+'_>*+') |
11 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | 10 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
12 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | 11 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
13 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | 12 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
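The preprocessor now uses a single identifier token that also admits the '>', '*' and '+' characters carried by segmentation markers, so a name such as dig>* is kept as one token. A quick standalone check of that pyparsing expression (mirroring the definition above, outside the repository):

    from pyparsing import Word, alphas, alphanums

    identifier = Word(alphas, bodyChars=alphanums + '_>*+')
    print identifier.parseString('dig>*')[0]       # 'dig>*' -- one token
    print identifier.parseString('wsz_interp')[0]  # plain names still match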
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -34,6 +34,9 @@ class TagRule(SegmentRule): | @@ -34,6 +34,9 @@ class TagRule(SegmentRule): | ||
34 | 34 | ||
35 | def _doAddToNFA(self, startState, endState): | 35 | def _doAddToNFA(self, startState, endState): |
36 | startState.addTransition(self.segnum, endState) | 36 | startState.addTransition(self.segnum, endState) |
37 | + | ||
38 | + def __str__(self): | ||
39 | + return u''+self.segnum | ||
37 | 40 | ||
38 | class UnaryRule(SegmentRule): | 41 | class UnaryRule(SegmentRule): |
39 | 42 | ||
@@ -95,12 +98,3 @@ class ZeroOrMoreRule(UnaryRule): | @@ -95,12 +98,3 @@ class ZeroOrMoreRule(UnaryRule): | ||
95 | self.child._doAddToNFA(intermStartState, intermEndState) | 98 | self.child._doAddToNFA(intermStartState, intermEndState) |
96 | intermEndState.addTransition(None, endState) | 99 | intermEndState.addTransition(None, endState) |
97 | endState.addTransition(None, intermStartState) | 100 | endState.addTransition(None, intermStartState) |
98 | - | ||
99 | -class IgnoreOrthRule(UnaryRule): | ||
100 | - | ||
101 | - def __init__(self, child): | ||
102 | - super(IgnoreOrthRule, self).__init__(child) | ||
103 | - | ||
104 | - def _doAddToNFA(self, startState, endState): | ||
105 | - startState.addTransition(self.child.segnum, endState, ignoreOrth=True) | ||
106 | - |
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
0 → 100644
1 | +''' | ||
2 | +Created on 20 lut 2014 | ||
3 | + | ||
4 | +@author: mlenart | ||
5 | +''' | ||
6 | + | ||
7 | +class RulesManager(object): | ||
8 | + | ||
9 | + def __init__(self): | ||
10 | + self.options2DFA = {} | ||
11 | + | ||
12 | + def _options2Key(self, optionsMap): | ||
13 | + return frozenset(optionsMap.items()) | ||
14 | + | ||
15 | + def addDFA4Options(self, optionsMap, dfa): | ||
16 | + self.options2DFA[self._options2Key(optionsMap)] = dfa | ||
17 | + | ||
18 | + def serialize(self): | ||
19 | + pass | ||
0 | \ No newline at end of file | 20 | \ No newline at end of file |
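RulesManager keys each segmentation DFA by the option combination it was built for (the [options] section, e.g. aggl and praet), using a frozenset of the option items so the dictionary key is hashable and independent of ordering. A hypothetical usage sketch; the placeholder object stands in for a DFA produced by RulesNFA.convertToDFA(), and serialize() is still an empty stub in this commit:

    from morfeuszbuilder.segrules.rulesManager import RulesManager

    dfa = object()   # stand-in for a DFA built by RulesNFA.convertToDFA()

    manager = RulesManager()
    manager.addDFA4Options({'aggl': 'strict', 'praet': 'split'}, dfa)

    # Lookups go through the same frozenset key the manager builds internally.
    key = frozenset({'aggl': 'strict', 'praet': 'split'}.items())
    assert manager.options2DFA[key] is dfa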
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -8,33 +8,98 @@ from morfeuszbuilder.fsa import fsa, state, encode | @@ -8,33 +8,98 @@ from morfeuszbuilder.fsa import fsa, state, encode | ||
8 | 8 | ||
9 | class RulesNFAState(object): | 9 | class RulesNFAState(object): |
10 | 10 | ||
11 | - def __init__(self, initial=False, final=False): | 11 | + statesCounter = 0 |
12 | + | ||
13 | + def __init__(self, initial=False, final=False, weak=False): | ||
12 | self.transitionsMap = {} | 14 | self.transitionsMap = {} |
13 | self.initial = initial | 15 | self.initial = initial |
14 | self.final = final | 16 | self.final = final |
17 | + self.weak = weak | ||
18 | + self.idx = RulesNFAState.statesCounter | ||
19 | + RulesNFAState.statesCounter += 1 | ||
20 | + | ||
21 | + def addTransition(self, label, targetState): | ||
22 | + self.transitionsMap.setdefault(label, set()) | ||
23 | + self.transitionsMap[label].add(targetState) | ||
24 | + | ||
25 | + def getClosure(self, visited): | ||
26 | + if self in visited: | ||
27 | + return set() | ||
28 | + else: | ||
29 | + visited.add(self) | ||
30 | + res = set() | ||
31 | + res.add(self) | ||
32 | + for nextState in self.transitionsMap.get(None, []): | ||
33 | + if self.idx in [6,8,4]: | ||
34 | + print nextState.idx | ||
35 | + print self.transitionsMap | ||
36 | + res |= nextState.getClosure(visited) | ||
37 | + return res | ||
15 | 38 | ||
16 | - def addTransition(self, label, targetState, ignoreOrth=False): | ||
17 | - assert not ignoreOrth or label is not None | ||
18 | - self.transitionsMap.setdefault((label, ignoreOrth), set()) | ||
19 | - self.transitionsMap[(label, ignoreOrth)].add(targetState) | 39 | + def dfs(self, visitedStates=set()): |
40 | + if not self in visitedStates: | ||
41 | + visitedStates.add(self) | ||
42 | + yield self | ||
43 | + for _, nextStates in self.transitionsMap.iteritems(): | ||
44 | + for state in nextStates: | ||
45 | + for state1 in state.dfs(): | ||
46 | + yield state1 | ||
47 | + | ||
48 | + def debug(self): | ||
49 | + print '----------------' | ||
50 | + print 'STATE:', self.idx | ||
51 | + for label, nextStates in self.transitionsMap.iteritems(): | ||
52 | + print label, '-->', [s.idx for s in sorted(nextStates, key=lambda s: s.idx)] | ||
20 | 53 | ||
21 | class RulesNFA(object): | 54 | class RulesNFA(object): |
22 | 55 | ||
23 | - def __init__(self, key2Def={}): | 56 | + def __init__(self): |
24 | self.initialState = RulesNFAState(initial=True) | 57 | self.initialState = RulesNFAState(initial=True) |
25 | 58 | ||
26 | - def _doConvertState(self, dfaState, nfaStates): | ||
27 | - for label, (nextIgnoreOrth, nextNFAStates) in self._groupOutputByLabels(nfaStates).iteritems(): | ||
28 | - nextDFAState = state.State(additionalData=nextIgnoreOrth) | 59 | + def _groupOutputByLabels(self, nfaStates): |
60 | + res = {} | ||
61 | + for nfaState in nfaStates: | ||
62 | + for label, nextStates in nfaState.transitionsMap.iteritems(): | ||
63 | + if label is not None: | ||
64 | + res.setdefault(label, set()) | ||
65 | + for nextNFAState in nextStates: | ||
66 | + res[label] |= nextNFAState.getClosure(set()) | ||
67 | +# print 'closure of', nextNFAState.idx, 'is', [s.idx for s in sorted(nextNFAState.getClosure(), key=lambda s: s.idx)] | ||
68 | + return res | ||
69 | + | ||
70 | + def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): | ||
71 | + assert all(map(lambda state: state.weak, nfaStates)) \ | ||
72 | + or not any(map(lambda state: state.weak, nfaStates)) | ||
73 | + weak = all(map(lambda state: state.weak, nfaStates)) | ||
74 | + final = any(map(lambda state: state.final, nfaStates)) | ||
75 | + assert not weak or not final | ||
76 | + if final: | ||
77 | + # dfaState should be final | ||
78 | + # and contain info about weakness | ||
79 | + dfaState.encodedData = bytearray([1 if weak else 0]) | ||
80 | + for label, nextNFAStates in self._groupOutputByLabels(nfaStates).iteritems(): | ||
81 | +# print '============' | ||
82 | +# print 'states:', [s.idx for s in sorted(nfaStates, key=lambda s: s.idx)] | ||
83 | +# print 'label:', label | ||
84 | +# print 'nextStates:', [s.idx for s in sorted(nextNFAStates, key=lambda s: s.idx)] | ||
85 | + key = frozenset(nextNFAStates) | ||
86 | + if key in nfaSubset2DFAState: | ||
87 | + nextDFAState = nfaSubset2DFAState[key] | ||
88 | + else: | ||
89 | + nextDFAState = state.State() | ||
90 | + nfaSubset2DFAState[key] = nextDFAState | ||
91 | + self._doConvertState(nextDFAState, nextNFAStates, nfaSubset2DFAState) | ||
29 | dfaState.setTransition(label, nextDFAState) | 92 | dfaState.setTransition(label, nextDFAState) |
30 | - dfaState.encodedData = bytearray() | ||
31 | - self._doConvertState(nextDFAState, nextNFAStates) | ||
32 | 93 | ||
33 | def convertToDFA(self): | 94 | def convertToDFA(self): |
34 | - dfa = fsa.FSA(encoder=None, encodeWords=False) | ||
35 | - startStates = self.initialState.getClosure() | 95 | + dfa = fsa.FSA(encoder=None, encodeData=False, encodeWords=False) |
96 | + startStates = self.initialState.getClosure(set()) | ||
36 | assert not any(filter(lambda s: s.final, startStates)) | 97 | assert not any(filter(lambda s: s.final, startStates)) |
37 | dfa.initialState = state.State(additionalData=False) | 98 | dfa.initialState = state.State(additionalData=False) |
38 | - self._doConvertState(dfa.initialState, startStates) | ||
39 | - | 99 | + self._doConvertState(dfa.initialState, startStates, {frozenset(startStates): dfa.initialState}) |
100 | + return dfa | ||
101 | + | ||
102 | + def debug(self): | ||
103 | + for state in self.initialState.dfs(): | ||
104 | + state.debug() | ||
40 | 105 | ||
41 | \ No newline at end of file | 106 | \ No newline at end of file |
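The heart of this commit is the classic subset construction: convertToDFA() starts from the epsilon-closure of the initial state (epsilon transitions are the ones stored under the None label), groups the outgoing transitions of each reachable set of NFA states by label, and memoizes every subset in nfaSubset2DFAState so that equal subsets share one DFA state. A generic, self-contained sketch of the same algorithm, independent of the repository classes:

    def closure(states, transitions):
        # Epsilon-closure; transitions maps (state, label) -> set of states, None = epsilon.
        result, stack = set(states), list(states)
        while stack:
            s = stack.pop()
            for t in transitions.get((s, None), ()):
                if t not in result:
                    result.add(t)
                    stack.append(t)
        return frozenset(result)

    def nfa_to_dfa(start, transitions, labels):
        # Subset construction: returns the start subset and the DFA transition table.
        startSet = closure([start], transitions)
        dfa, pending = {}, [startSet]
        while pending:
            current = pending.pop()
            if current in dfa:
                continue                  # this subset already has a DFA state
            dfa[current] = {}
            for label in labels:
                moved = set()
                for s in current:
                    moved |= transitions.get((s, label), set())
                if moved:
                    target = closure(moved, transitions)
                    dfa[current][label] = target
                    pending.append(target)
        return startSet, dfa

    # Tiny example: 0 --a--> 1, 1 --eps--> 2, 2 --b--> 0
    trans = {(0, 'a'): set([1]), (1, None): set([2]), (2, 'b'): set([0])}
    startSet, table = nfa_to_dfa(0, trans, set(['a', 'b']))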
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -3,7 +3,7 @@ from pyparsing import * | @@ -3,7 +3,7 @@ from pyparsing import * | ||
3 | ParserElement.enablePackrat() | 3 | ParserElement.enablePackrat() |
4 | from morfeuszbuilder.tagset import segtypes | 4 | from morfeuszbuilder.tagset import segtypes |
5 | from morfeuszbuilder.utils import configFile, exceptions | 5 | from morfeuszbuilder.utils import configFile, exceptions |
6 | -from morfeuszbuilder.segrules import preprocessor, rules | 6 | +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager |
7 | import codecs | 7 | import codecs |
8 | import re | 8 | import re |
9 | 9 | ||
@@ -28,9 +28,9 @@ class RulesParser(object): | @@ -28,9 +28,9 @@ class RulesParser(object): | ||
28 | return res | 28 | return res |
29 | 29 | ||
30 | def parse(self, filename): | 30 | def parse(self, filename): |
31 | - res = [] | 31 | + res = rulesManager.RulesManager() |
32 | 32 | ||
33 | - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes']) | 33 | + segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) |
34 | key2Defs = self._getKey2Defs(segtypesConfigFile) | 34 | key2Defs = self._getKey2Defs(segtypesConfigFile) |
35 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) | 35 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) |
36 | 36 | ||
@@ -39,14 +39,18 @@ class RulesParser(object): | @@ -39,14 +39,18 @@ class RulesParser(object): | ||
39 | for define in defs: | 39 | for define in defs: |
40 | def2Key[define] = key | 40 | def2Key[define] = key |
41 | 41 | ||
42 | + firstNFA = None | ||
42 | for defs in itertools.product(*key2Defs.values()): | 43 | for defs in itertools.product(*key2Defs.values()): |
43 | key2Def = dict([(def2Key[define], define) for define in defs]) | 44 | key2Def = dict([(def2Key[define], define) for define in defs]) |
44 | - nfa = rulesNFA.RulesNFA(key2Def) | 45 | + nfa = rulesNFA.RulesNFA() |
46 | + if not firstNFA: | ||
47 | + firstNFA = nfa | ||
45 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') | 48 | combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations') |
46 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) | 49 | combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs)) |
47 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): | 50 | for rule in self._doParse(combinationEnumeratedLines, segtypesHelper): |
48 | rule.addToNFA(nfa) | 51 | rule.addToNFA(nfa) |
49 | - res.append(nfa) | 52 | + dfa = nfa.convertToDFA() |
53 | + res.addDFA4Options(key2Def, dfa) | ||
50 | return res | 54 | return res |
51 | 55 | ||
52 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): | 56 | def _doParse(self, combinationEnumeratedLines, segtypesHelper): |
@@ -58,14 +62,14 @@ class RulesParser(object): | @@ -58,14 +62,14 @@ class RulesParser(object): | ||
58 | if not segtypesHelper.hasSegtype(segtype): | 62 | if not segtypesHelper.hasSegtype(segtype): |
59 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) | 63 | raise exceptions.ConfigFileException(segtypesHelper.filename, lineNum, u'%s - invalid segment type: %s' % (line, segtype)) |
60 | else: | 64 | else: |
65 | +# return rules.TagRule(segtype) | ||
61 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) | 66 | return rules.TagRule(segtypesHelper.getSegnum4Segtype(segtype)) |
62 | 67 | ||
63 | def _doParseOneLine(self, lineNum, line, segtypesHelper): | 68 | def _doParseOneLine(self, lineNum, line, segtypesHelper): |
64 | rule = Forward() | 69 | rule = Forward() |
65 | - tagRule = Word(alphanums+'_') | ||
66 | - ignoreOrthRule = tagRule + Suppress('>') | 70 | + tagRule = Word(alphanums+'_>') |
67 | parenRule = Suppress('(') + rule + Suppress(')') | 71 | parenRule = Suppress('(') + rule + Suppress(')') |
68 | - atomicRule = tagRule ^ ignoreOrthRule ^ parenRule | 72 | + atomicRule = tagRule ^ parenRule |
69 | zeroOrMoreRule = atomicRule + Suppress('*') | 73 | zeroOrMoreRule = atomicRule + Suppress('*') |
70 | oneOrMoreRule = atomicRule + Suppress('+') | 74 | oneOrMoreRule = atomicRule + Suppress('+') |
71 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule | 75 | unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule |
@@ -75,19 +79,10 @@ class RulesParser(object): | @@ -75,19 +79,10 @@ class RulesParser(object): | ||
75 | rule << concatRule | 79 | rule << concatRule |
76 | 80 | ||
77 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) | 81 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], lineNum, line, segtypesHelper)) |
78 | - ignoreOrthRule.setParseAction(lambda string, loc, toks: rules.IgnoreOrthRule(toks[0])) | ||
79 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) | 82 | # parenRule.setParseAction(lambda string, loc, toks: toks[0]) |
80 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) | 83 | zeroOrMoreRule.setParseAction(lambda string, loc, toks: rules.ZeroOrMoreRule(toks[0])) |
81 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | 84 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
82 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) | 85 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
83 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) | 86 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
84 | - | ||
85 | - | ||
86 | -# rule << tagRule ^ ignoreOrthRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ orRule ^ concatRule ^ parenRule | ||
87 | - | ||
88 | -# tagRule.setParseAction(lambda s,l,toks: doprint(toks)) | ||
89 | -# print lineNum, line | ||
90 | parsedRule = rule.parseString(line, parseAll=True)[0] | 87 | parsedRule = rule.parseString(line, parseAll=True)[0] |
91 | - print parsedRule | ||
92 | return parsedRule | 88 | return parsedRule |
93 | -# print parsedLine |
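With IgnoreOrthRule gone, the '>' marker is no longer a separate grammar operator: it is simply part of the segment-type token (Word(alphanums+'_>')), so names like nie> and dig> are matched as ordinary tags and resolved through the [tags] and [segment types] sections. A small pyparsing check of that token rule, with the repository's parse actions left out:

    from pyparsing import Word, alphanums

    tagRule = Word(alphanums + '_>')

    print tagRule.parseString('nie>')[0]           # 'nie>' -- one tag, no ignore-orth operator
    print tagRule.parseString('praet_sg_agl')[0]   # plain segment types still match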
fsabuilder/morfeuszbuilder/segrules/test/parserTest.py
@@ -7,12 +7,20 @@ import unittest | @@ -7,12 +7,20 @@ import unittest | ||
7 | import os | 7 | import os |
8 | from morfeuszbuilder.segrules import rulesParser | 8 | from morfeuszbuilder.segrules import rulesParser |
9 | from morfeuszbuilder.tagset import tagset | 9 | from morfeuszbuilder.tagset import tagset |
10 | +from morfeuszbuilder.fsa import visualizer, serializer | ||
10 | 11 | ||
11 | class Test(unittest.TestCase): | 12 | class Test(unittest.TestCase): |
12 | print 'do test' | 13 | print 'do test' |
13 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) | 14 | t = tagset.Tagset(os.path.join(os.path.dirname(__file__), 'polimorf.tagset')) |
14 | parser = rulesParser.RulesParser(t) | 15 | parser = rulesParser.RulesParser(t) |
15 | - parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) | 16 | + fsas = parser.parse(os.path.join(os.path.dirname(__file__), 'segmenty.dat')) |
17 | + fsa = fsas[0] | ||
18 | + for s in fsa.dfs(): | ||
19 | + s.debug() | ||
20 | + print 'states:', len(list(fsa.dfs())) | ||
21 | + print 'transitions:', fsa.getTransitionsNum() | ||
22 | + visualizer.Visualizer().visualize(fsa, charLabels=False) | ||
23 | + print 'size:', len(serializer.SimpleSerializer(fsa).fsa2bytearray(bytearray())) | ||
16 | print 'done' | 24 | print 'done' |
17 | 25 | ||
18 | if __name__ == "__main__": | 26 | if __name__ == "__main__": |
fsabuilder/morfeuszbuilder/segrules/test/segmenty.dat
@@ -103,7 +103,7 @@ moze_interp( naj> adj_sup ) | @@ -103,7 +103,7 @@ moze_interp( naj> adj_sup ) | ||
103 | 103 | ||
104 | # „Negated” forms of gerunds and participles: | 104 | # „Negated” forms of gerunds and participles: |
105 | # e.g. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: | 105 | # e.g. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
106 | -moze_interp( nie > negat ) | 106 | +moze_interp( nie> negat ) |
107 | 107 | ||
108 | # Prepositions that accept the short form „-ń” | 108 | # Prepositions that accept the short form „-ń” |
109 | moze_interp(z_on_agl) | 109 | moze_interp(z_on_agl) |
@@ -111,7 +111,7 @@ moze_interp(z_on_agl) | @@ -111,7 +111,7 @@ moze_interp(z_on_agl) | ||
111 | moze_interp(z_on_agl on_agl) | 111 | moze_interp(z_on_agl on_agl) |
112 | 112 | ||
113 | # A number written as a sequence of digits: | 113 | # A number written as a sequence of digits: |
114 | -moze_interp( dig>* dig ) | 114 | +#moze_interp( dig>* dig ) |
115 | 115 | ||
116 | # Prefixal formations | 116 | # Prefixal formations |
117 | #### the appropriate standalone segment types still need to be split out! | 117 | #### the appropriate standalone segment types still need to be split out! |
@@ -132,13 +132,35 @@ adj dywiz samodz | @@ -132,13 +132,35 @@ adj dywiz samodz | ||
132 | # ? | 132 | # ? |
133 | samodz dywiz adj | 133 | samodz dywiz adj |
134 | 134 | ||
135 | +[segment types] | ||
136 | +naj> | ||
137 | +nie> | ||
138 | +prefs | ||
139 | +prefv | ||
140 | +dig> | ||
141 | +adja | ||
142 | +adj | ||
143 | +adj_sup | ||
144 | +negat | ||
145 | +on_agl | ||
146 | +z_on_agl | ||
147 | +samotny | ||
148 | +interp | ||
149 | +aglsg | ||
150 | +aglpl | ||
151 | +praetcond | ||
152 | +praet_sg_agl | ||
153 | +praet_sg_na | ||
154 | +praet_sg | ||
155 | +praet_pl | ||
156 | +samodz | ||
135 | 157 | ||
136 | [tags] | 158 | [tags] |
137 | -naj naj | ||
138 | -nie nie | 159 | +naj> naj |
160 | +nie> nie | ||
139 | prefs prefs | 161 | prefs prefs |
140 | prefv prefv | 162 | prefv prefv |
141 | -dig dig | 163 | +dig> dig |
142 | adja adja | 164 | adja adja |
143 | adj adj:%:pos | 165 | adj adj:%:pos |
144 | adj_sup adj:%:sup | 166 | adj_sup adj:%:sup |
fsabuilder/morfeuszbuilder/segrules/test/segmenty1.dat
0 → 100644
1 | +[options] | ||
2 | +aggl=permissive strict isolated | ||
3 | +praet=split composite | ||
4 | + | ||
5 | +[combinations] | ||
6 | +#define wsz_interp (interp|kropka|dywiz)* | ||
7 | + | ||
8 | +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp | ||
9 | + | ||
10 | +moze_interp(samodz) | ||
11 | +samotny | ||
12 | + | ||
13 | + | ||
14 | +[segment types] | ||
15 | +naj> | ||
16 | +nie> | ||
17 | +prefs | ||
18 | +prefv | ||
19 | +dig | ||
20 | +adja | ||
21 | +adj | ||
22 | +adj_sup | ||
23 | +negat | ||
24 | +on_agl | ||
25 | +z_on_agl | ||
26 | +samotny | ||
27 | +interp | ||
28 | +aglsg | ||
29 | +aglpl | ||
30 | +praetcond | ||
31 | +praet_sg_agl | ||
32 | +praet_sg_na | ||
33 | +praet_sg | ||
34 | +praet_pl | ||
35 | +samodz | ||
36 | + | ||
37 | +[tags] | ||
38 | +naj naj | ||
39 | +nie nie | ||
40 | +prefs prefs | ||
41 | +prefv prefv | ||
42 | +dig dig | ||
43 | +adja adja | ||
44 | +adj adj:%:pos | ||
45 | +adj_sup adj:%:sup | ||
46 | +adj_sup adv:sup | ||
47 | +negat ger:%:neg | ||
48 | +negat pact:%:neg | ||
49 | +negat ppas:%:neg | ||
50 | +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | ||
51 | +z_on_agl prep:% | ||
52 | +samotny brev:pun | ||
53 | +samotny brev:npun | ||
54 | +samotny intrj | ||
55 | +interp interp | ||
56 | +aglsg aglt:sg:% | ||
57 | +aglpl aglt:pl:% | ||
58 | +praetcond cond:% | ||
59 | +praetcond praet:%:pri:% | ||
60 | +praetcond praet:%:sec:% | ||
61 | +praetcond praet:%:ter:% | ||
62 | +praet_sg_agl praet:sg:%:agl | ||
63 | +praet_sg_na praet:sg:%:nagl | ||
64 | +praet_sg praet:sg:% | ||
65 | +praet_pl praet:pl:% | ||
66 | +praet_sg winien:sg:% | ||
67 | +praet_pl winien:pl:% | ||
68 | +samodz % | ||
69 | + | ||
70 | +[lexemes] | ||
71 | +z_aglt aby:comp | ||
72 | +z_aglt bowiem:comp | ||
73 | +by by:qub | ||
74 | +z_aglt by:comp | ||
75 | +z_aglt cóż:subst | ||
76 | +z_aglt czemu:adv | ||
77 | +z_aglt czyżby:qub | ||
78 | +z_aglt choćby:comp | ||
79 | +z_aglt chociażby:comp | ||
80 | +z_aglt dlaczego:adv | ||
81 | +z_aglt dopóki:comp | ||
82 | +z_aglt dopóty:conj | ||
83 | +z_aglt gdyby:comp | ||
84 | +z_aglt gdzie:qub | ||
85 | +z_aglt gdzie:adv | ||
86 | +z_aglt jakby:comp | ||
87 | +z_aglt jakoby:comp | ||
88 | +z_aglt kiedy:adv | ||
89 | +z_aglt kiedy:comp | ||
90 | +z_aglt tylko:qub | ||
91 | +z_aglt żeby:comp | ||
92 | +dywiz -:interp | ||
93 | +kropka .:interp |