Commit 46ef679b5d9a7b626b501dcf93a3d84e4bd94695
1 parent: 9c68c820
- praca nad budowaniem automatu do łączenia segmentów
- (w końcu) prawidłowa obsługa typów segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@90 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 7 changed files with 50 additions and 412 deletions
fsabuilder/buildfsa.py
... | ... | @@ -122,9 +122,9 @@ def _parseOptions(): |
122 | 122 | |
123 | 123 | return opts |
124 | 124 | |
125 | -def _readPolimorfInput4Analyzer(inputFile, tagset, encoder): | |
125 | +def _readPolimorfInput4Analyzer(inputFile, tagset, encoder, segmentRulesManager): | |
126 | 126 | with open(inputFile, 'r') as f: |
127 | - for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8').convert(f): | |
127 | + for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8', segmentRulesManager).convert(f): | |
128 | 128 | yield entry |
129 | 129 | |
130 | 130 | def _readPolimorfInput4Generator(inputFile, tagset, encoder): |
... | ... | @@ -154,10 +154,10 @@ def _printStats(fsa): |
154 | 154 | logging.info('sink states num: '+str(sinkNum)) |
155 | 155 | logging.info('array states num: '+str(arrayNum)) |
156 | 156 | |
157 | -def buildAnalyzerFromPoliMorf(inputFile, tagset): | |
157 | +def buildAnalyzerFromPoliMorf(inputFile, tagset, segmentRulesManager): | |
158 | 158 | encoder = encode.MorphEncoder() |
159 | 159 | fsa = FSA(encoder, tagset) |
160 | - inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) | |
160 | + inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder, segmentRulesManager) | |
161 | 161 | for word, data in inputData: |
162 | 162 | fsa.addEntry(word, data) |
163 | 163 | fsa.close() |
... | ... | @@ -183,9 +183,9 @@ def main(opts): |
183 | 183 | tagset = Tagset(opts.tagsetFile) |
184 | 184 | |
185 | 185 | if opts.analyzer: |
186 | - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset) | |
187 | 186 | segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile) |
188 | 187 | additionalData = segmentRulesManager.serialize() |
188 | + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset, segmentRulesManager) | |
189 | 189 | else: |
190 | 190 | fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset) |
191 | 191 | additionalData = bytearray() |
... | ... |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -7,398 +7,6 @@ import logging |
7 | 7 | from common import Interpretation |
8 | 8 | from morfeuszbuilder.fsa.common import Interpretation4Generator |
9 | 9 | |
10 | -tag2typenum = { | |
11 | - 'aglt:sg:pri:imperf:nwok': 12, | |
12 | - 'aglt:sg:pri:imperf:wok': 12, | |
13 | - 'aglt:sg:sec:imperf:nwok': 12, | |
14 | - 'aglt:sg:sec:imperf:wok': 12, | |
15 | - 'aglt:pl:pri:imperf:nwok': 13, | |
16 | - 'aglt:pl:pri:imperf:wok': 13, | |
17 | - 'aglt:pl:sec:imperf:nwok': 13, | |
18 | - 'aglt:pl:sec:imperf:wok': 13, | |
19 | - 'praet:sg:m1.m2.m3:imperf:agl': 7, | |
20 | - 'praet:sg:m1.m2.m3:imperf.perf:agl': 7, | |
21 | - 'praet:sg:m1.m2.m3:perf:agl': 7, | |
22 | - 'praet:sg:m1.m2.m3:imperf:nagl': 16, | |
23 | - 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16, | |
24 | - 'praet:sg:m1.m2.m3:perf:nagl': 16, | |
25 | - 'praet:sg:f:imperf': 20, | |
26 | - 'praet:sg:f:imperf.perf': 20, | |
27 | - 'praet:sg:f:perf': 20, | |
28 | - 'praet:sg:m1.m2.m3:imperf': 20, | |
29 | - 'praet:sg:m1.m2.m3:imperf.perf': 20, | |
30 | - 'praet:sg:m1.m2.m3:perf': 20, | |
31 | - 'praet:sg:n1.n2:imperf': 20, | |
32 | - 'praet:sg:n1.n2:imperf.perf': 20, | |
33 | - 'praet:sg:n1.n2:perf': 20, | |
34 | - 'praet:pl:m1.p1:imperf': 21, | |
35 | - 'praet:pl:m1.p1:imperf.perf': 21, | |
36 | - 'praet:pl:m1.p1:perf': 21, | |
37 | - 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21, | |
38 | - 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21, | |
39 | - 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21, | |
40 | - 'naj': 10, | |
41 | - 'nie': 5, | |
42 | - 'adj:pl:acc:m1.p1:pos': 1, | |
43 | - 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
44 | - 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
45 | - 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
46 | - 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
47 | - 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
48 | - 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
49 | - 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
50 | - 'adj:pl:nom.voc:m1.p1:pos': 1, | |
51 | - 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
52 | - 'adj:sg:acc:m1.m2:pos': 1, | |
53 | - 'adj:sg:acc:n1.n2:pos': 1, | |
54 | - 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, | |
55 | - 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, | |
56 | - 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1, | |
57 | - 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1, | |
58 | - 'adj:sg:nom.voc:m1.m2.m3:pos': 1, | |
59 | - 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1, | |
60 | - 'adj:sg:nom.voc:n1.n2:pos': 1, | |
61 | - 'adj:sg:acc:f:pos': 1, | |
62 | - 'adj:sg:acc.inst:f:pos': 1, | |
63 | - 'adj:sg:acc:m1.m2:pos': 1, | |
64 | - 'adj:sg:acc:m3:pos': 1, | |
65 | - 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, | |
66 | - 'adj:sg:gen.dat.loc:f:pos': 1, | |
67 | - 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, | |
68 | - 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1, | |
69 | - 'adj:sg:nom.voc.acc:n1.n2:pos': 1, | |
70 | - 'adj:sg:nom.voc:f:pos': 1, | |
71 | - 'adj:sg:nom.voc:m1.m2.m3:pos': 1, | |
72 | - 'adj:pl:acc:f:pos': 1, | |
73 | - 'adj:pl:acc:m1:pos': 1, | |
74 | - 'adj:pl:acc:m2:pos': 1, | |
75 | - 'adj:pl:acc:m3:pos': 1, | |
76 | - 'adj:pl:acc:n1:pos': 1, | |
77 | - 'adj:pl:acc:n2:pos': 1, | |
78 | - 'adj:pl:acc:p1:pos': 1, | |
79 | - 'adj:pl:acc:p2:pos': 1, | |
80 | - 'adj:pl:acc:p3:pos': 1, | |
81 | - 'adj:pl:dat:f:pos': 1, | |
82 | - 'adj:pl:dat:m1:pos': 1, | |
83 | - 'adj:pl:dat:m2:pos': 1, | |
84 | - 'adj:pl:dat:m3:pos': 1, | |
85 | - 'adj:pl:dat:n1:pos': 1, | |
86 | - 'adj:pl:dat:n2:pos': 1, | |
87 | - 'adj:pl:dat:p1:pos': 1, | |
88 | - 'adj:pl:dat:p2:pos': 1, | |
89 | - 'adj:pl:dat:p3:pos': 1, | |
90 | - 'adj:pl:gen:f:pos': 1, | |
91 | - 'adj:pl:gen:m1:pos': 1, | |
92 | - 'adj:pl:gen:m2:pos': 1, | |
93 | - 'adj:pl:gen:m3:pos': 1, | |
94 | - 'adj:pl:gen:n1:pos': 1, | |
95 | - 'adj:pl:gen:n2:pos': 1, | |
96 | - 'adj:pl:gen:p1:pos': 1, | |
97 | - 'adj:pl:gen:p2:pos': 1, | |
98 | - 'adj:pl:gen:p3:pos': 1, | |
99 | - 'adj:pl:inst:f:pos': 1, | |
100 | - 'adj:pl:inst:m1:pos': 1, | |
101 | - 'adj:pl:inst:m2:pos': 1, | |
102 | - 'adj:pl:inst:m3:pos': 1, | |
103 | - 'adj:pl:inst:n1:pos': 1, | |
104 | - 'adj:pl:inst:n2:pos': 1, | |
105 | - 'adj:pl:inst:p1:pos': 1, | |
106 | - 'adj:pl:inst:p2:pos': 1, | |
107 | - 'adj:pl:inst:p3:pos': 1, | |
108 | - 'adj:pl:loc:f:pos': 1, | |
109 | - 'adj:pl:loc:m1:pos': 1, | |
110 | - 'adj:pl:loc:m2:pos': 1, | |
111 | - 'adj:pl:loc:m3:pos': 1, | |
112 | - 'adj:pl:loc:n1:pos': 1, | |
113 | - 'adj:pl:loc:n2:pos': 1, | |
114 | - 'adj:pl:loc:p1:pos': 1, | |
115 | - 'adj:pl:loc:p2:pos': 1, | |
116 | - 'adj:pl:loc:p3:pos': 1, | |
117 | - 'adj:pl:nom:f:pos': 1, | |
118 | - 'adj:pl:nom:m1:pos': 1, | |
119 | - 'adj:pl:nom:m2:pos': 1, | |
120 | - 'adj:pl:nom:m3:pos': 1, | |
121 | - 'adj:pl:nom:n1:pos': 1, | |
122 | - 'adj:pl:nom:n2:pos': 1, | |
123 | - 'adj:pl:nom:p1:pos': 1, | |
124 | - 'adj:pl:nom:p2:pos': 1, | |
125 | - 'adj:pl:nom:p3:pos': 1, | |
126 | - 'adj:sg:acc:f:pos': 1, | |
127 | - 'adj:sg:acc:m1:pos': 1, | |
128 | - 'adj:sg:acc:m2:pos': 1, | |
129 | - 'adj:sg:acc:m3:pos': 1, | |
130 | - 'adj:sg:acc:n1:pos': 1, | |
131 | - 'adj:sg:acc:n2:pos': 1, | |
132 | - 'adj:sg:dat:f:pos': 1, | |
133 | - 'adj:sg:dat:m1:pos': 1, | |
134 | - 'adj:sg:dat:m2:pos': 1, | |
135 | - 'adj:sg:dat:m3:pos': 1, | |
136 | - 'adj:sg:dat:n1:pos': 1, | |
137 | - 'adj:sg:dat:n2:pos': 1, | |
138 | - 'adj:sg:gen:f:pos': 1, | |
139 | - 'adj:sg:gen:m1:pos': 1, | |
140 | - 'adj:sg:gen:m2:pos': 1, | |
141 | - 'adj:sg:gen:m3:pos': 1, | |
142 | - 'adj:sg:gen:n1:pos': 1, | |
143 | - 'adj:sg:gen:n2:pos': 1, | |
144 | - 'adj:sg:inst:f:pos': 1, | |
145 | - 'adj:sg:inst:m1:pos': 1, | |
146 | - 'adj:sg:inst:m2:pos': 1, | |
147 | - 'adj:sg:inst:m3:pos': 1, | |
148 | - 'adj:sg:inst:n1:pos': 1, | |
149 | - 'adj:sg:inst:n2:pos': 1, | |
150 | - 'adj:sg:loc:f:pos': 1, | |
151 | - 'adj:sg:loc:m1:pos': 1, | |
152 | - 'adj:sg:loc:m2:pos': 1, | |
153 | - 'adj:sg:loc:m3:pos': 1, | |
154 | - 'adj:sg:loc:n1:pos': 1, | |
155 | - 'adj:sg:loc:n2:pos': 1, | |
156 | - 'adj:sg:nom:f:pos': 1, | |
157 | - 'adj:sg:nom:m1:pos': 1, | |
158 | - 'adj:sg:nom:m2:pos': 1, | |
159 | - 'adj:sg:nom:m3:pos': 1, | |
160 | - 'adj:sg:nom:n1:pos': 1, | |
161 | - 'adj:sg:nom:n2:pos': 1, | |
162 | - 'adj:pl:acc:m1.p1:sup': 19, | |
163 | - 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
164 | - 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
165 | - 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
166 | - 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
167 | - 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
168 | - 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
169 | - 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
170 | - 'adj:pl:nom.voc:m1.p1:sup': 19, | |
171 | - 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
172 | - 'adj:sg:acc:f:sup': 19, | |
173 | - 'adj:sg:acc.inst:f:sup': 19, | |
174 | - 'adj:sg:acc:m1.m2:sup': 19, | |
175 | - 'adj:sg:acc:m3:sup': 19, | |
176 | - 'adj:sg:acc:n1.n2:sup': 19, | |
177 | - 'adj:sg:dat:f:sup': 19, | |
178 | - 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19, | |
179 | - 'adj:sg:gen:f:sup': 19, | |
180 | - 'adj:sg:gen.dat.loc:f:sup': 19, | |
181 | - 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19, | |
182 | - 'adj:sg:inst:f:sup': 19, | |
183 | - 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19, | |
184 | - 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19, | |
185 | - 'adj:sg:loc:f:sup': 19, | |
186 | - 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19, | |
187 | - 'adj:sg:nom.acc:n1.n2:sup': 19, | |
188 | - 'adj:sg:nom.voc:f:sup': 19, | |
189 | - 'adj:sg:nom.voc:m1.m2.m3:sup': 19, | |
190 | - 'adj:sg:nom.voc:n1.n2:sup': 19, | |
191 | - 'adj:pl:acc:f:sup': 19, | |
192 | - 'adj:pl:acc:m1:sup': 19, | |
193 | - 'adj:pl:acc:m2:sup': 19, | |
194 | - 'adj:pl:acc:m3:sup': 19, | |
195 | - 'adj:pl:acc:n1:sup': 19, | |
196 | - 'adj:pl:acc:n2:sup': 19, | |
197 | - 'adj:pl:acc:p1:sup': 19, | |
198 | - 'adj:pl:acc:p2:sup': 19, | |
199 | - 'adj:pl:acc:p3:sup': 19, | |
200 | - 'adj:pl:dat:f:sup': 19, | |
201 | - 'adj:pl:dat:m1:sup': 19, | |
202 | - 'adj:pl:dat:m2:sup': 19, | |
203 | - 'adj:pl:dat:m3:sup': 19, | |
204 | - 'adj:pl:dat:n1:sup': 19, | |
205 | - 'adj:pl:dat:n2:sup': 19, | |
206 | - 'adj:pl:dat:p1:sup': 19, | |
207 | - 'adj:pl:dat:p2:sup': 19, | |
208 | - 'adj:pl:dat:p3:sup': 19, | |
209 | - 'adj:pl:gen:f:sup': 19, | |
210 | - 'adj:pl:gen:m1:sup': 19, | |
211 | - 'adj:pl:gen:m2:sup': 19, | |
212 | - 'adj:pl:gen:m3:sup': 19, | |
213 | - 'adj:pl:gen:n1:sup': 19, | |
214 | - 'adj:pl:gen:n2:sup': 19, | |
215 | - 'adj:pl:gen:p1:sup': 19, | |
216 | - 'adj:pl:gen:p2:sup': 19, | |
217 | - 'adj:pl:gen:p3:sup': 19, | |
218 | - 'adj:pl:inst:f:sup': 19, | |
219 | - 'adj:pl:inst:m1:sup': 19, | |
220 | - 'adj:pl:inst:m2:sup': 19, | |
221 | - 'adj:pl:inst:m3:sup': 19, | |
222 | - 'adj:pl:inst:n1:sup': 19, | |
223 | - 'adj:pl:inst:n2:sup': 19, | |
224 | - 'adj:pl:inst:p1:sup': 19, | |
225 | - 'adj:pl:inst:p2:sup': 19, | |
226 | - 'adj:pl:inst:p3:sup': 19, | |
227 | - 'adj:pl:loc:f:sup': 19, | |
228 | - 'adj:pl:loc:m1:sup': 19, | |
229 | - 'adj:pl:loc:m2:sup': 19, | |
230 | - 'adj:pl:loc:m3:sup': 19, | |
231 | - 'adj:pl:loc:n1:sup': 19, | |
232 | - 'adj:pl:loc:n2:sup': 19, | |
233 | - 'adj:pl:loc:p1:sup': 19, | |
234 | - 'adj:pl:loc:p2:sup': 19, | |
235 | - 'adj:pl:loc:p3:sup': 19, | |
236 | - 'adj:pl:nom:f:sup': 19, | |
237 | - 'adj:pl:nom:m1:sup': 19, | |
238 | - 'adj:pl:nom:m2:sup': 19, | |
239 | - 'adj:pl:nom:m3:sup': 19, | |
240 | - 'adj:pl:nom:n1:sup': 19, | |
241 | - 'adj:pl:nom:n2:sup': 19, | |
242 | - 'adj:pl:nom:p1:sup': 19, | |
243 | - 'adj:pl:nom:p2:sup': 19, | |
244 | - 'adj:pl:nom:p3:sup': 19, | |
245 | - 'adj:sg:acc:f:sup': 19, | |
246 | - 'adj:sg:acc:m1:sup': 19, | |
247 | - 'adj:sg:acc:m2:sup': 19, | |
248 | - 'adj:sg:acc:m3:sup': 19, | |
249 | - 'adj:sg:acc:n1:sup': 19, | |
250 | - 'adj:sg:acc:n2:sup': 19, | |
251 | - 'adj:sg:dat:f:sup': 19, | |
252 | - 'adj:sg:dat:m1:sup': 19, | |
253 | - 'adj:sg:dat:m2:sup': 19, | |
254 | - 'adj:sg:dat:m3:sup': 19, | |
255 | - 'adj:sg:dat:n1:sup': 19, | |
256 | - 'adj:sg:dat:n2:sup': 19, | |
257 | - 'adj:sg:gen:f:sup': 19, | |
258 | - 'adj:sg:gen:m1:sup': 19, | |
259 | - 'adj:sg:gen:m2:sup': 19, | |
260 | - 'adj:sg:gen:m3:sup': 19, | |
261 | - 'adj:sg:gen:n1:sup': 19, | |
262 | - 'adj:sg:gen:n2:sup': 19, | |
263 | - 'adj:sg:inst:f:sup': 19, | |
264 | - 'adj:sg:inst:m1:sup': 19, | |
265 | - 'adj:sg:inst:m2:sup': 19, | |
266 | - 'adj:sg:inst:m3:sup': 19, | |
267 | - 'adj:sg:inst:n1:sup': 19, | |
268 | - 'adj:sg:inst:n2:sup': 19, | |
269 | - 'adj:sg:loc:f:sup': 19, | |
270 | - 'adj:sg:loc:m1:sup': 19, | |
271 | - 'adj:sg:loc:m2:sup': 19, | |
272 | - 'adj:sg:loc:m3:sup': 19, | |
273 | - 'adj:sg:loc:n1:sup': 19, | |
274 | - 'adj:sg:loc:n2:sup': 19, | |
275 | - 'adj:sg:nom:f:sup': 19, | |
276 | - 'adj:sg:nom:m1:sup': 19, | |
277 | - 'adj:sg:nom:m2:sup': 19, | |
278 | - 'adj:sg:nom:m3:sup': 19, | |
279 | - 'adj:sg:nom:n1:sup': 19, | |
280 | - 'adj:sg:nom:n2:sup': 19, | |
281 | - 'adv:sup': 19, | |
282 | - 'winien:sg:m1.m2.m3:imperf': 3, | |
283 | - 'winien:sg:f:imperf': 3, | |
284 | - 'winien:sg:n1.n2:imperf': 3, | |
285 | - 'winien:pl:m1.p1:imperf': 3, | |
286 | - 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3, | |
287 | - 'adja': 15, | |
288 | - 'ger:sg:dat.loc:n2:imperf:neg': 18, | |
289 | - 'ger:sg:dat.loc:n2:imperf.perf:neg': 18, | |
290 | - 'ger:sg:dat.loc:n2:perf:neg': 18, | |
291 | - 'ger:sg:gen:n2:imperf:neg': 18, | |
292 | - 'ger:sg:gen:n2:imperf.perf:neg': 18, | |
293 | - 'ger:sg:gen:n2:perf:neg': 18, | |
294 | - 'ger:sg:inst:n2:imperf:neg': 18, | |
295 | - 'ger:sg:inst:n2:imperf.perf:neg': 18, | |
296 | - 'ger:sg:inst:n2:perf:neg': 18, | |
297 | - 'ger:sg:nom.acc:n2:imperf:neg': 18, | |
298 | - 'ger:sg:nom.acc:n2:imperf.perf:neg': 18, | |
299 | - 'ger:sg:nom.acc:n2:perf:neg': 18, | |
300 | - 'pact:pl:acc:m1.p1:imperf:neg': 18, | |
301 | - 'pact:pl:acc:m1.p1:imperf.perf:neg': 18, | |
302 | - 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
303 | - 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
304 | - 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
305 | - 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
306 | - 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
307 | - 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
308 | - 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, | |
309 | - 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, | |
310 | - 'pact:pl:nom:m1.p1:imperf:neg': 18, | |
311 | - 'pact:pl:nom:m1.p1:imperf.perf:neg': 18, | |
312 | - 'pact:sg:acc.inst:f:imperf:neg': 18, | |
313 | - 'pact:sg:acc.inst:f:imperf.perf:neg': 18, | |
314 | - 'pact:sg:acc:m1.m2:imperf:neg': 18, | |
315 | - 'pact:sg:acc:m1.m2:imperf.perf:neg': 18, | |
316 | - 'pact:sg:acc:m3:imperf:neg': 18, | |
317 | - 'pact:sg:acc:m3:imperf.perf:neg': 18, | |
318 | - 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, | |
319 | - 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
320 | - 'pact:sg:gen.dat.loc:f:imperf:neg': 18, | |
321 | - 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18, | |
322 | - 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, | |
323 | - 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
324 | - 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, | |
325 | - 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
326 | - 'pact:sg:nom.acc:n1.n2:imperf:neg': 18, | |
327 | - 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18, | |
328 | - 'pact:sg:nom:f:imperf:neg': 18, | |
329 | - 'pact:sg:nom:f:imperf.perf:neg': 18, | |
330 | - 'pact:sg:nom:m1.m2.m3:imperf:neg': 18, | |
331 | - 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18, | |
332 | - 'ppas:pl:acc:m1.p1:imperf:neg': 18, | |
333 | - 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18, | |
334 | - 'ppas:pl:acc:m1.p1:perf:neg': 18, | |
335 | - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
336 | - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
337 | - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
338 | - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
339 | - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
340 | - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
341 | - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
342 | - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
343 | - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
344 | - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, | |
345 | - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, | |
346 | - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18, | |
347 | - 'ppas:pl:nom:m1.p1:imperf:neg': 18, | |
348 | - 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18, | |
349 | - 'ppas:pl:nom:m1.p1:perf:neg': 18, | |
350 | - 'ppas:sg:acc.inst:f:imperf:neg': 18, | |
351 | - 'ppas:sg:acc.inst:f:imperf.perf:neg': 18, | |
352 | - 'ppas:sg:acc.inst:f:perf:neg': 18, | |
353 | - 'ppas:sg:acc:m1.m2:imperf:neg': 18, | |
354 | - 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18, | |
355 | - 'ppas:sg:acc:m1.m2:perf:neg': 18, | |
356 | - 'ppas:sg:acc:m3:imperf:neg': 18, | |
357 | - 'ppas:sg:acc:m3:imperf.perf:neg': 18, | |
358 | - 'ppas:sg:acc:m3:perf:neg': 18, | |
359 | - 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, | |
360 | - 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
361 | - 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18, | |
362 | - 'ppas:sg:gen.dat.loc:f:imperf:neg': 18, | |
363 | - 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18, | |
364 | - 'ppas:sg:gen.dat.loc:f:perf:neg': 18, | |
365 | - 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, | |
366 | - 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
367 | - 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18, | |
368 | - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, | |
369 | - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
370 | - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18, | |
371 | - 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18, | |
372 | - 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18, | |
373 | - 'ppas:sg:nom.acc:n1.n2:perf:neg': 18, | |
374 | - 'ppas:sg:nom:f:imperf:neg': 18, | |
375 | - 'ppas:sg:nom:f:imperf.perf:neg': 18, | |
376 | - 'ppas:sg:nom:f:perf:neg': 18, | |
377 | - 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18, | |
378 | - 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18, | |
379 | - 'ppas:sg:nom:m1.m2.m3:perf:neg': 18, | |
380 | - 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8, | |
381 | - 'prep:acc': 6, | |
382 | - 'prep:acc:wok': 6, | |
383 | - 'prep:acc.inst': 6, | |
384 | - 'prep:acc.inst:wok': 6, | |
385 | - 'prep:inst.acc': 6, | |
386 | - 'prep:inst.acc:wok': 6, | |
387 | - 'prep:inst.gen.acc:wok': 6, | |
388 | - 'prep:acc.loc': 6, | |
389 | - 'prep:acc.loc:wok': 6, | |
390 | - 'prep:loc.acc': 6, | |
391 | - 'prep:loc.acc:wok': 6, | |
392 | - 'prep:gen': 6, | |
393 | - 'prep:gen.dat': 6, | |
394 | - 'prep:gen:wok': 6, | |
395 | - 'prep:gen.inst:wok': 6, | |
396 | - 'brev:pun': 9, | |
397 | - 'brev:npun': 9, | |
398 | - 'intrj': 9, | |
399 | - 'burk': 9, | |
400 | -} | |
401 | - | |
402 | 10 | def _mergeEntries(inputLines): |
403 | 11 | prevKey = None |
404 | 12 | prevInterps = None |
... | ... | @@ -416,10 +24,11 @@ def _mergeEntries(inputLines): |
416 | 24 | |
417 | 25 | class PolimorfConverter4Analyzer(object): |
418 | 26 | |
419 | - def __init__(self, tagset, encoder, inputEncoding='utf8'): | |
27 | + def __init__(self, tagset, encoder, inputEncoding, segmentRulesManager): | |
420 | 28 | self.tagset = tagset |
421 | 29 | self.encoder = encoder |
422 | 30 | self.inputEncoding = inputEncoding |
31 | + self.segmentRulesManager = segmentRulesManager | |
423 | 32 | |
424 | 33 | # we do it the ugly way (parse to plain text) because it is way more memory-efficient |
425 | 34 | def _partiallyParseLines(self, inputLines): |
... | ... | @@ -428,7 +37,8 @@ class PolimorfConverter4Analyzer(object): |
428 | 37 | orth, base, tag, name = line.split(u'\t') |
429 | 38 | tagnum = self.tagset.tag2tagnum[tag] |
430 | 39 | namenum = self.tagset.name2namenum[name] |
431 | - typenum = tag2typenum.get(tag, 0) | |
40 | +# typenum = tag2typenum.get(tag, 0) | |
41 | + typenum = self.segmentRulesManager.lexeme2SegmentTypeNum(base, tag) | |
432 | 42 | yield '%s %s %d %d %d' % ( |
433 | 43 | orth.encode(self.inputEncoding), |
434 | 44 | base.encode(self.inputEncoding), |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -20,7 +20,7 @@ class Serializer(object): |
20 | 20 | return self._fsa |
21 | 21 | |
22 | 22 | def getVersion(self): |
23 | - return 9 | |
23 | + return 10 | |
24 | 24 | |
25 | 25 | def serialize2CppFile(self, fname, generator, additionalData): |
26 | 26 | res = [] |
... | ... | @@ -56,10 +56,15 @@ class Serializer(object): |
56 | 56 | |
57 | 57 | def fsa2bytearray(self, additionalData=bytearray(), moreAdditionalData=bytearray()): |
58 | 58 | res = bytearray() |
59 | - res.extend(self.serializePrologue(additionalData, moreAdditionalData)) | |
59 | + res.extend(self.serializePrologue()) | |
60 | + fsaData = bytearray() | |
61 | + fsaData.extend(self.serializeFSAPrologue()) | |
60 | 62 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
61 | 63 | for state in sorted(self.fsa.dfs(), key=lambda s: s.offset): |
62 | - res.extend(self.state2bytearray(state)) | |
64 | + fsaData.extend(self.state2bytearray(state)) | |
65 | + res.extend(self.htonl(len(fsaData))) | |
66 | + res.extend(fsaData) | |
67 | + res.extend(self.serializeEpilogue(additionalData, moreAdditionalData)) | |
63 | 68 | return res |
64 | 69 | |
65 | 70 | def serializeTags(self, tagsMap): |
... | ... | @@ -96,7 +101,7 @@ class Serializer(object): |
96 | 101 | res.append(n & 0x000000FF) |
97 | 102 | return res |
98 | 103 | |
99 | - def serializePrologue(self, additionalData=None, moreAdditionalData=None): | |
104 | + def serializePrologue(self): | |
100 | 105 | res = bytearray() |
101 | 106 | |
102 | 107 | # serialize magic number in big-endian order |
... | ... | @@ -111,10 +116,16 @@ class Serializer(object): |
111 | 116 | # serialize implementation code |
112 | 117 | res.append(self.getImplementationCode()) |
113 | 118 | |
114 | - # serialize additional data size in 2-byte big-endian | |
119 | + return res | |
120 | + | |
121 | +# def serializeFSAPrologue(self): | |
122 | +# raise NotImplementedError('Not implemented') | |
123 | + | |
124 | + def serializeEpilogue(self, additionalData, moreAdditionalData): | |
125 | + res = bytearray() | |
115 | 126 | additionalDataSize = len(additionalData) if additionalData else 0 |
116 | 127 | moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0 |
117 | - res.extend(self.htonl(additionalDataSize + moreAdditionalDataSize)) | |
128 | + res.extend(self.htonl(additionalDataSize)) | |
118 | 129 | |
119 | 130 | # add additional data itself |
120 | 131 | if additionalDataSize: |
... | ... | @@ -124,7 +135,6 @@ class Serializer(object): |
124 | 135 | if moreAdditionalDataSize: |
125 | 136 | assert type(moreAdditionalData) == bytearray |
126 | 137 | res.extend(moreAdditionalData) |
127 | - | |
128 | 138 | return res |
129 | 139 | |
130 | 140 | def state2bytearray(self, state): |
... | ... | @@ -156,7 +166,10 @@ class SimpleSerializer(Serializer): |
156 | 166 | self.serializeTransitionsData = serializeTransitionsData |
157 | 167 | |
158 | 168 | def getImplementationCode(self): |
159 | - return 0 | |
169 | + return 0 if not self.serializeTransitionsData else 128 | |
170 | + | |
171 | + def serializeFSAPrologue(self): | |
172 | + return bytearray() | |
160 | 173 | |
161 | 174 | def getStateSize(self, state): |
162 | 175 | if self.serializeTransitionsData: |
... | ... | @@ -204,6 +217,7 @@ class VLengthSerializer1(Serializer): |
204 | 217 | self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)]) |
205 | 218 | self._chooseArrayStates() |
206 | 219 | self.useArrays = useArrays |
220 | + self.label2ShortLabel = None | |
207 | 221 | |
208 | 222 | self.ACCEPTING_FLAG = 0b10000000 |
209 | 223 | self.ARRAY_FLAG = 0b01000000 |
... | ... | @@ -211,8 +225,8 @@ class VLengthSerializer1(Serializer): |
211 | 225 | def getImplementationCode(self): |
212 | 226 | return 1 |
213 | 227 | |
214 | - def serializePrologue(self, additionalData, moreAdditionalData): | |
215 | - res = super(VLengthSerializer1, self).serializePrologue(additionalData, moreAdditionalData) | |
228 | + def serializeFSAPrologue(self): | |
229 | + res = bytearray() | |
216 | 230 | |
217 | 231 | # labels sorted by popularity |
218 | 232 | sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))] |
... | ... | @@ -366,6 +380,9 @@ class VLengthSerializer2(Serializer): |
366 | 380 | self.ACCEPTING_FLAG = 64 |
367 | 381 | self.LAST_FLAG = 32 |
368 | 382 | |
383 | + def serializeFSAPrologue(self): | |
384 | + return bytearray() | |
385 | + | |
369 | 386 | def getImplementationCode(self): |
370 | 387 | return 2 |
371 | 388 | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -8,8 +8,9 @@ from morfeuszbuilder.fsa.serializer import SimpleSerializer |
8 | 8 | |
9 | 9 | class RulesManager(object): |
10 | 10 | |
11 | - def __init__(self): | |
11 | + def __init__(self, segtypes): | |
12 | 12 | self.options2DFA = {} |
13 | + self.segtypes = segtypes | |
13 | 14 | |
14 | 15 | def _options2Key(self, optionsMap): |
15 | 16 | return frozenset(optionsMap.items()) |
... | ... | @@ -23,6 +24,13 @@ class RulesManager(object): |
23 | 24 | def addDFA(self, optionsMap, dfa): |
24 | 25 | self.options2DFA[self._options2Key(optionsMap)] = dfa |
25 | 26 | |
27 | + def lexeme2SegmentTypeNum(self, lemma, tag): | |
28 | + res = self.segtypes.lexeme2Segnum(lemma, tag) | |
29 | + if res is None: | |
30 | + raise ValueError() | |
31 | + else: | |
32 | + return res | |
33 | + | |
26 | 34 | def serialize(self): |
27 | 35 | res = bytearray() |
28 | 36 | dfasNum = len(self.options2DFA) |
... | ... | @@ -38,7 +46,6 @@ class RulesManager(object): |
38 | 46 | def _serializeOptionsMap(self, optionsMap): |
39 | 47 | assert len(optionsMap) < 256 |
40 | 48 | res = bytearray() |
41 | - res.append(len(optionsMap)) | |
42 | 49 | self._serializeString(optionsMap['aggl']) |
43 | 50 | self._serializeString(optionsMap['praet']) |
44 | 51 | return res |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -28,12 +28,13 @@ class RulesParser(object): |
28 | 28 | return res |
29 | 29 | |
30 | 30 | def parse(self, filename): |
31 | - res = rulesManager.RulesManager() | |
32 | 31 | |
33 | 32 | segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types']) |
34 | 33 | key2Defs = self._getKey2Defs(segtypesConfigFile) |
35 | 34 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) |
36 | 35 | |
36 | + res = rulesManager.RulesManager(segtypesHelper) | |
37 | + | |
37 | 38 | def2Key = {} |
38 | 39 | for key, defs in key2Defs.iteritems(): |
39 | 40 | for define in defs: |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -15,6 +15,7 @@ class Segtypes(object): |
15 | 15 | self.filename = segrulesConfigFile.filename |
16 | 16 | |
17 | 17 | self.segtype2Segnum = {} |
18 | + self.segnum2Segtype = {} | |
18 | 19 | self.patternsList = [] |
19 | 20 | self._readLexemes(segrulesConfigFile) |
20 | 21 | self._readTags(segrulesConfigFile) |
... | ... | @@ -48,6 +49,8 @@ class Segtypes(object): |
48 | 49 | self.segtype2Segnum[segtype] = segnum |
49 | 50 | |
50 | 51 | self.patternsList.append(SegtypePattern(None, pattern, segnum)) |
52 | + | |
53 | + self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()]) | |
51 | 54 | |
52 | 55 | def _readLexemes(self, segrulesConfigFile): |
53 | 56 | for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'): |
... | ... |