Commit 46ef679b5d9a7b626b501dcf93a3d84e4bd94695

Authored by Michał Lenart
1 parent 9c68c820

- praca nad budowaniem automatu do łączenia segmentów

- (w końcu) prawidłowa obsługa typów segmentów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@90 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildfsa.py
... ... @@ -122,9 +122,9 @@ def _parseOptions():
122 122  
123 123 return opts
124 124  
125   -def _readPolimorfInput4Analyzer(inputFile, tagset, encoder):
  125 +def _readPolimorfInput4Analyzer(inputFile, tagset, encoder, segmentRulesManager):
126 126 with open(inputFile, 'r') as f:
127   - for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8').convert(f):
  127 + for entry in convertinput.PolimorfConverter4Analyzer(tagset, encoder, 'utf8', segmentRulesManager).convert(f):
128 128 yield entry
129 129  
130 130 def _readPolimorfInput4Generator(inputFile, tagset, encoder):
... ... @@ -154,10 +154,10 @@ def _printStats(fsa):
154 154 logging.info('sink states num: '+str(sinkNum))
155 155 logging.info('array states num: '+str(arrayNum))
156 156  
157   -def buildAnalyzerFromPoliMorf(inputFile, tagset):
  157 +def buildAnalyzerFromPoliMorf(inputFile, tagset, segmentRulesManager):
158 158 encoder = encode.MorphEncoder()
159 159 fsa = FSA(encoder, tagset)
160   - inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder)
  160 + inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder, segmentRulesManager)
161 161 for word, data in inputData:
162 162 fsa.addEntry(word, data)
163 163 fsa.close()
... ... @@ -183,9 +183,9 @@ def main(opts):
183 183 tagset = Tagset(opts.tagsetFile)
184 184  
185 185 if opts.analyzer:
186   - fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset)
187 186 segmentRulesManager = rulesParser.RulesParser(tagset).parse(opts.segmentsFile)
188 187 additionalData = segmentRulesManager.serialize()
  188 + fsa = buildAnalyzerFromPoliMorf(opts.inputFile, tagset, segmentRulesManager)
189 189 else:
190 190 fsa = buildGeneratorFromPoliMorf(opts.inputFile, tagset)
191 191 additionalData = bytearray()
... ...
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... ... @@ -7,398 +7,6 @@ import logging
7 7 from common import Interpretation
8 8 from morfeuszbuilder.fsa.common import Interpretation4Generator
9 9  
10   -tag2typenum = {
11   - 'aglt:sg:pri:imperf:nwok': 12,
12   - 'aglt:sg:pri:imperf:wok': 12,
13   - 'aglt:sg:sec:imperf:nwok': 12,
14   - 'aglt:sg:sec:imperf:wok': 12,
15   - 'aglt:pl:pri:imperf:nwok': 13,
16   - 'aglt:pl:pri:imperf:wok': 13,
17   - 'aglt:pl:sec:imperf:nwok': 13,
18   - 'aglt:pl:sec:imperf:wok': 13,
19   - 'praet:sg:m1.m2.m3:imperf:agl': 7,
20   - 'praet:sg:m1.m2.m3:imperf.perf:agl': 7,
21   - 'praet:sg:m1.m2.m3:perf:agl': 7,
22   - 'praet:sg:m1.m2.m3:imperf:nagl': 16,
23   - 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16,
24   - 'praet:sg:m1.m2.m3:perf:nagl': 16,
25   - 'praet:sg:f:imperf': 20,
26   - 'praet:sg:f:imperf.perf': 20,
27   - 'praet:sg:f:perf': 20,
28   - 'praet:sg:m1.m2.m3:imperf': 20,
29   - 'praet:sg:m1.m2.m3:imperf.perf': 20,
30   - 'praet:sg:m1.m2.m3:perf': 20,
31   - 'praet:sg:n1.n2:imperf': 20,
32   - 'praet:sg:n1.n2:imperf.perf': 20,
33   - 'praet:sg:n1.n2:perf': 20,
34   - 'praet:pl:m1.p1:imperf': 21,
35   - 'praet:pl:m1.p1:imperf.perf': 21,
36   - 'praet:pl:m1.p1:perf': 21,
37   - 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21,
38   - 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21,
39   - 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21,
40   - 'naj': 10,
41   - 'nie': 5,
42   - 'adj:pl:acc:m1.p1:pos': 1,
43   - 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
44   - 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
45   - 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
46   - 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
47   - 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
48   - 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
49   - 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
50   - 'adj:pl:nom.voc:m1.p1:pos': 1,
51   - 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1,
52   - 'adj:sg:acc:m1.m2:pos': 1,
53   - 'adj:sg:acc:n1.n2:pos': 1,
54   - 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
55   - 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
56   - 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1,
57   - 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1,
58   - 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
59   - 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1,
60   - 'adj:sg:nom.voc:n1.n2:pos': 1,
61   - 'adj:sg:acc:f:pos': 1,
62   - 'adj:sg:acc.inst:f:pos': 1,
63   - 'adj:sg:acc:m1.m2:pos': 1,
64   - 'adj:sg:acc:m3:pos': 1,
65   - 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
66   - 'adj:sg:gen.dat.loc:f:pos': 1,
67   - 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
68   - 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1,
69   - 'adj:sg:nom.voc.acc:n1.n2:pos': 1,
70   - 'adj:sg:nom.voc:f:pos': 1,
71   - 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
72   - 'adj:pl:acc:f:pos': 1,
73   - 'adj:pl:acc:m1:pos': 1,
74   - 'adj:pl:acc:m2:pos': 1,
75   - 'adj:pl:acc:m3:pos': 1,
76   - 'adj:pl:acc:n1:pos': 1,
77   - 'adj:pl:acc:n2:pos': 1,
78   - 'adj:pl:acc:p1:pos': 1,
79   - 'adj:pl:acc:p2:pos': 1,
80   - 'adj:pl:acc:p3:pos': 1,
81   - 'adj:pl:dat:f:pos': 1,
82   - 'adj:pl:dat:m1:pos': 1,
83   - 'adj:pl:dat:m2:pos': 1,
84   - 'adj:pl:dat:m3:pos': 1,
85   - 'adj:pl:dat:n1:pos': 1,
86   - 'adj:pl:dat:n2:pos': 1,
87   - 'adj:pl:dat:p1:pos': 1,
88   - 'adj:pl:dat:p2:pos': 1,
89   - 'adj:pl:dat:p3:pos': 1,
90   - 'adj:pl:gen:f:pos': 1,
91   - 'adj:pl:gen:m1:pos': 1,
92   - 'adj:pl:gen:m2:pos': 1,
93   - 'adj:pl:gen:m3:pos': 1,
94   - 'adj:pl:gen:n1:pos': 1,
95   - 'adj:pl:gen:n2:pos': 1,
96   - 'adj:pl:gen:p1:pos': 1,
97   - 'adj:pl:gen:p2:pos': 1,
98   - 'adj:pl:gen:p3:pos': 1,
99   - 'adj:pl:inst:f:pos': 1,
100   - 'adj:pl:inst:m1:pos': 1,
101   - 'adj:pl:inst:m2:pos': 1,
102   - 'adj:pl:inst:m3:pos': 1,
103   - 'adj:pl:inst:n1:pos': 1,
104   - 'adj:pl:inst:n2:pos': 1,
105   - 'adj:pl:inst:p1:pos': 1,
106   - 'adj:pl:inst:p2:pos': 1,
107   - 'adj:pl:inst:p3:pos': 1,
108   - 'adj:pl:loc:f:pos': 1,
109   - 'adj:pl:loc:m1:pos': 1,
110   - 'adj:pl:loc:m2:pos': 1,
111   - 'adj:pl:loc:m3:pos': 1,
112   - 'adj:pl:loc:n1:pos': 1,
113   - 'adj:pl:loc:n2:pos': 1,
114   - 'adj:pl:loc:p1:pos': 1,
115   - 'adj:pl:loc:p2:pos': 1,
116   - 'adj:pl:loc:p3:pos': 1,
117   - 'adj:pl:nom:f:pos': 1,
118   - 'adj:pl:nom:m1:pos': 1,
119   - 'adj:pl:nom:m2:pos': 1,
120   - 'adj:pl:nom:m3:pos': 1,
121   - 'adj:pl:nom:n1:pos': 1,
122   - 'adj:pl:nom:n2:pos': 1,
123   - 'adj:pl:nom:p1:pos': 1,
124   - 'adj:pl:nom:p2:pos': 1,
125   - 'adj:pl:nom:p3:pos': 1,
126   - 'adj:sg:acc:f:pos': 1,
127   - 'adj:sg:acc:m1:pos': 1,
128   - 'adj:sg:acc:m2:pos': 1,
129   - 'adj:sg:acc:m3:pos': 1,
130   - 'adj:sg:acc:n1:pos': 1,
131   - 'adj:sg:acc:n2:pos': 1,
132   - 'adj:sg:dat:f:pos': 1,
133   - 'adj:sg:dat:m1:pos': 1,
134   - 'adj:sg:dat:m2:pos': 1,
135   - 'adj:sg:dat:m3:pos': 1,
136   - 'adj:sg:dat:n1:pos': 1,
137   - 'adj:sg:dat:n2:pos': 1,
138   - 'adj:sg:gen:f:pos': 1,
139   - 'adj:sg:gen:m1:pos': 1,
140   - 'adj:sg:gen:m2:pos': 1,
141   - 'adj:sg:gen:m3:pos': 1,
142   - 'adj:sg:gen:n1:pos': 1,
143   - 'adj:sg:gen:n2:pos': 1,
144   - 'adj:sg:inst:f:pos': 1,
145   - 'adj:sg:inst:m1:pos': 1,
146   - 'adj:sg:inst:m2:pos': 1,
147   - 'adj:sg:inst:m3:pos': 1,
148   - 'adj:sg:inst:n1:pos': 1,
149   - 'adj:sg:inst:n2:pos': 1,
150   - 'adj:sg:loc:f:pos': 1,
151   - 'adj:sg:loc:m1:pos': 1,
152   - 'adj:sg:loc:m2:pos': 1,
153   - 'adj:sg:loc:m3:pos': 1,
154   - 'adj:sg:loc:n1:pos': 1,
155   - 'adj:sg:loc:n2:pos': 1,
156   - 'adj:sg:nom:f:pos': 1,
157   - 'adj:sg:nom:m1:pos': 1,
158   - 'adj:sg:nom:m2:pos': 1,
159   - 'adj:sg:nom:m3:pos': 1,
160   - 'adj:sg:nom:n1:pos': 1,
161   - 'adj:sg:nom:n2:pos': 1,
162   - 'adj:pl:acc:m1.p1:sup': 19,
163   - 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
164   - 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
165   - 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
166   - 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
167   - 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
168   - 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
169   - 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
170   - 'adj:pl:nom.voc:m1.p1:sup': 19,
171   - 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19,
172   - 'adj:sg:acc:f:sup': 19,
173   - 'adj:sg:acc.inst:f:sup': 19,
174   - 'adj:sg:acc:m1.m2:sup': 19,
175   - 'adj:sg:acc:m3:sup': 19,
176   - 'adj:sg:acc:n1.n2:sup': 19,
177   - 'adj:sg:dat:f:sup': 19,
178   - 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19,
179   - 'adj:sg:gen:f:sup': 19,
180   - 'adj:sg:gen.dat.loc:f:sup': 19,
181   - 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19,
182   - 'adj:sg:inst:f:sup': 19,
183   - 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19,
184   - 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19,
185   - 'adj:sg:loc:f:sup': 19,
186   - 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19,
187   - 'adj:sg:nom.acc:n1.n2:sup': 19,
188   - 'adj:sg:nom.voc:f:sup': 19,
189   - 'adj:sg:nom.voc:m1.m2.m3:sup': 19,
190   - 'adj:sg:nom.voc:n1.n2:sup': 19,
191   - 'adj:pl:acc:f:sup': 19,
192   - 'adj:pl:acc:m1:sup': 19,
193   - 'adj:pl:acc:m2:sup': 19,
194   - 'adj:pl:acc:m3:sup': 19,
195   - 'adj:pl:acc:n1:sup': 19,
196   - 'adj:pl:acc:n2:sup': 19,
197   - 'adj:pl:acc:p1:sup': 19,
198   - 'adj:pl:acc:p2:sup': 19,
199   - 'adj:pl:acc:p3:sup': 19,
200   - 'adj:pl:dat:f:sup': 19,
201   - 'adj:pl:dat:m1:sup': 19,
202   - 'adj:pl:dat:m2:sup': 19,
203   - 'adj:pl:dat:m3:sup': 19,
204   - 'adj:pl:dat:n1:sup': 19,
205   - 'adj:pl:dat:n2:sup': 19,
206   - 'adj:pl:dat:p1:sup': 19,
207   - 'adj:pl:dat:p2:sup': 19,
208   - 'adj:pl:dat:p3:sup': 19,
209   - 'adj:pl:gen:f:sup': 19,
210   - 'adj:pl:gen:m1:sup': 19,
211   - 'adj:pl:gen:m2:sup': 19,
212   - 'adj:pl:gen:m3:sup': 19,
213   - 'adj:pl:gen:n1:sup': 19,
214   - 'adj:pl:gen:n2:sup': 19,
215   - 'adj:pl:gen:p1:sup': 19,
216   - 'adj:pl:gen:p2:sup': 19,
217   - 'adj:pl:gen:p3:sup': 19,
218   - 'adj:pl:inst:f:sup': 19,
219   - 'adj:pl:inst:m1:sup': 19,
220   - 'adj:pl:inst:m2:sup': 19,
221   - 'adj:pl:inst:m3:sup': 19,
222   - 'adj:pl:inst:n1:sup': 19,
223   - 'adj:pl:inst:n2:sup': 19,
224   - 'adj:pl:inst:p1:sup': 19,
225   - 'adj:pl:inst:p2:sup': 19,
226   - 'adj:pl:inst:p3:sup': 19,
227   - 'adj:pl:loc:f:sup': 19,
228   - 'adj:pl:loc:m1:sup': 19,
229   - 'adj:pl:loc:m2:sup': 19,
230   - 'adj:pl:loc:m3:sup': 19,
231   - 'adj:pl:loc:n1:sup': 19,
232   - 'adj:pl:loc:n2:sup': 19,
233   - 'adj:pl:loc:p1:sup': 19,
234   - 'adj:pl:loc:p2:sup': 19,
235   - 'adj:pl:loc:p3:sup': 19,
236   - 'adj:pl:nom:f:sup': 19,
237   - 'adj:pl:nom:m1:sup': 19,
238   - 'adj:pl:nom:m2:sup': 19,
239   - 'adj:pl:nom:m3:sup': 19,
240   - 'adj:pl:nom:n1:sup': 19,
241   - 'adj:pl:nom:n2:sup': 19,
242   - 'adj:pl:nom:p1:sup': 19,
243   - 'adj:pl:nom:p2:sup': 19,
244   - 'adj:pl:nom:p3:sup': 19,
245   - 'adj:sg:acc:f:sup': 19,
246   - 'adj:sg:acc:m1:sup': 19,
247   - 'adj:sg:acc:m2:sup': 19,
248   - 'adj:sg:acc:m3:sup': 19,
249   - 'adj:sg:acc:n1:sup': 19,
250   - 'adj:sg:acc:n2:sup': 19,
251   - 'adj:sg:dat:f:sup': 19,
252   - 'adj:sg:dat:m1:sup': 19,
253   - 'adj:sg:dat:m2:sup': 19,
254   - 'adj:sg:dat:m3:sup': 19,
255   - 'adj:sg:dat:n1:sup': 19,
256   - 'adj:sg:dat:n2:sup': 19,
257   - 'adj:sg:gen:f:sup': 19,
258   - 'adj:sg:gen:m1:sup': 19,
259   - 'adj:sg:gen:m2:sup': 19,
260   - 'adj:sg:gen:m3:sup': 19,
261   - 'adj:sg:gen:n1:sup': 19,
262   - 'adj:sg:gen:n2:sup': 19,
263   - 'adj:sg:inst:f:sup': 19,
264   - 'adj:sg:inst:m1:sup': 19,
265   - 'adj:sg:inst:m2:sup': 19,
266   - 'adj:sg:inst:m3:sup': 19,
267   - 'adj:sg:inst:n1:sup': 19,
268   - 'adj:sg:inst:n2:sup': 19,
269   - 'adj:sg:loc:f:sup': 19,
270   - 'adj:sg:loc:m1:sup': 19,
271   - 'adj:sg:loc:m2:sup': 19,
272   - 'adj:sg:loc:m3:sup': 19,
273   - 'adj:sg:loc:n1:sup': 19,
274   - 'adj:sg:loc:n2:sup': 19,
275   - 'adj:sg:nom:f:sup': 19,
276   - 'adj:sg:nom:m1:sup': 19,
277   - 'adj:sg:nom:m2:sup': 19,
278   - 'adj:sg:nom:m3:sup': 19,
279   - 'adj:sg:nom:n1:sup': 19,
280   - 'adj:sg:nom:n2:sup': 19,
281   - 'adv:sup': 19,
282   - 'winien:sg:m1.m2.m3:imperf': 3,
283   - 'winien:sg:f:imperf': 3,
284   - 'winien:sg:n1.n2:imperf': 3,
285   - 'winien:pl:m1.p1:imperf': 3,
286   - 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3,
287   - 'adja': 15,
288   - 'ger:sg:dat.loc:n2:imperf:neg': 18,
289   - 'ger:sg:dat.loc:n2:imperf.perf:neg': 18,
290   - 'ger:sg:dat.loc:n2:perf:neg': 18,
291   - 'ger:sg:gen:n2:imperf:neg': 18,
292   - 'ger:sg:gen:n2:imperf.perf:neg': 18,
293   - 'ger:sg:gen:n2:perf:neg': 18,
294   - 'ger:sg:inst:n2:imperf:neg': 18,
295   - 'ger:sg:inst:n2:imperf.perf:neg': 18,
296   - 'ger:sg:inst:n2:perf:neg': 18,
297   - 'ger:sg:nom.acc:n2:imperf:neg': 18,
298   - 'ger:sg:nom.acc:n2:imperf.perf:neg': 18,
299   - 'ger:sg:nom.acc:n2:perf:neg': 18,
300   - 'pact:pl:acc:m1.p1:imperf:neg': 18,
301   - 'pact:pl:acc:m1.p1:imperf.perf:neg': 18,
302   - 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
303   - 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
304   - 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
305   - 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
306   - 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
307   - 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
308   - 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
309   - 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
310   - 'pact:pl:nom:m1.p1:imperf:neg': 18,
311   - 'pact:pl:nom:m1.p1:imperf.perf:neg': 18,
312   - 'pact:sg:acc.inst:f:imperf:neg': 18,
313   - 'pact:sg:acc.inst:f:imperf.perf:neg': 18,
314   - 'pact:sg:acc:m1.m2:imperf:neg': 18,
315   - 'pact:sg:acc:m1.m2:imperf.perf:neg': 18,
316   - 'pact:sg:acc:m3:imperf:neg': 18,
317   - 'pact:sg:acc:m3:imperf.perf:neg': 18,
318   - 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
319   - 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
320   - 'pact:sg:gen.dat.loc:f:imperf:neg': 18,
321   - 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18,
322   - 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
323   - 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
324   - 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
325   - 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
326   - 'pact:sg:nom.acc:n1.n2:imperf:neg': 18,
327   - 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
328   - 'pact:sg:nom:f:imperf:neg': 18,
329   - 'pact:sg:nom:f:imperf.perf:neg': 18,
330   - 'pact:sg:nom:m1.m2.m3:imperf:neg': 18,
331   - 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
332   - 'ppas:pl:acc:m1.p1:imperf:neg': 18,
333   - 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18,
334   - 'ppas:pl:acc:m1.p1:perf:neg': 18,
335   - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
336   - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
337   - 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
338   - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
339   - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
340   - 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
341   - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
342   - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
343   - 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
344   - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
345   - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
346   - 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18,
347   - 'ppas:pl:nom:m1.p1:imperf:neg': 18,
348   - 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18,
349   - 'ppas:pl:nom:m1.p1:perf:neg': 18,
350   - 'ppas:sg:acc.inst:f:imperf:neg': 18,
351   - 'ppas:sg:acc.inst:f:imperf.perf:neg': 18,
352   - 'ppas:sg:acc.inst:f:perf:neg': 18,
353   - 'ppas:sg:acc:m1.m2:imperf:neg': 18,
354   - 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18,
355   - 'ppas:sg:acc:m1.m2:perf:neg': 18,
356   - 'ppas:sg:acc:m3:imperf:neg': 18,
357   - 'ppas:sg:acc:m3:imperf.perf:neg': 18,
358   - 'ppas:sg:acc:m3:perf:neg': 18,
359   - 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
360   - 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
361   - 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18,
362   - 'ppas:sg:gen.dat.loc:f:imperf:neg': 18,
363   - 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18,
364   - 'ppas:sg:gen.dat.loc:f:perf:neg': 18,
365   - 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
366   - 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
367   - 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18,
368   - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
369   - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
370   - 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18,
371   - 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18,
372   - 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
373   - 'ppas:sg:nom.acc:n1.n2:perf:neg': 18,
374   - 'ppas:sg:nom:f:imperf:neg': 18,
375   - 'ppas:sg:nom:f:imperf.perf:neg': 18,
376   - 'ppas:sg:nom:f:perf:neg': 18,
377   - 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18,
378   - 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
379   - 'ppas:sg:nom:m1.m2.m3:perf:neg': 18,
380   - 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8,
381   - 'prep:acc': 6,
382   - 'prep:acc:wok': 6,
383   - 'prep:acc.inst': 6,
384   - 'prep:acc.inst:wok': 6,
385   - 'prep:inst.acc': 6,
386   - 'prep:inst.acc:wok': 6,
387   - 'prep:inst.gen.acc:wok': 6,
388   - 'prep:acc.loc': 6,
389   - 'prep:acc.loc:wok': 6,
390   - 'prep:loc.acc': 6,
391   - 'prep:loc.acc:wok': 6,
392   - 'prep:gen': 6,
393   - 'prep:gen.dat': 6,
394   - 'prep:gen:wok': 6,
395   - 'prep:gen.inst:wok': 6,
396   - 'brev:pun': 9,
397   - 'brev:npun': 9,
398   - 'intrj': 9,
399   - 'burk': 9,
400   -}
401   -
402 10 def _mergeEntries(inputLines):
403 11 prevKey = None
404 12 prevInterps = None
... ... @@ -416,10 +24,11 @@ def _mergeEntries(inputLines):
416 24  
417 25 class PolimorfConverter4Analyzer(object):
418 26  
419   - def __init__(self, tagset, encoder, inputEncoding='utf8'):
  27 + def __init__(self, tagset, encoder, inputEncoding, segmentRulesManager):
420 28 self.tagset = tagset
421 29 self.encoder = encoder
422 30 self.inputEncoding = inputEncoding
  31 + self.segmentRulesManager = segmentRulesManager
423 32  
424 33 # we do it the ugly way (parse to plain text) because it is way more memory-efficient
425 34 def _partiallyParseLines(self, inputLines):
... ... @@ -428,7 +37,8 @@ class PolimorfConverter4Analyzer(object):
428 37 orth, base, tag, name = line.split(u'\t')
429 38 tagnum = self.tagset.tag2tagnum[tag]
430 39 namenum = self.tagset.name2namenum[name]
431   - typenum = tag2typenum.get(tag, 0)
  40 +# typenum = tag2typenum.get(tag, 0)
  41 + typenum = self.segmentRulesManager.lexeme2SegmentTypeNum(base, tag)
432 42 yield '%s %s %d %d %d' % (
433 43 orth.encode(self.inputEncoding),
434 44 base.encode(self.inputEncoding),
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -20,7 +20,7 @@ class Serializer(object):
20 20 return self._fsa
21 21  
22 22 def getVersion(self):
23   - return 9
  23 + return 10
24 24  
25 25 def serialize2CppFile(self, fname, generator, additionalData):
26 26 res = []
... ... @@ -56,10 +56,15 @@ class Serializer(object):
56 56  
57 57 def fsa2bytearray(self, additionalData=bytearray(), moreAdditionalData=bytearray()):
58 58 res = bytearray()
59   - res.extend(self.serializePrologue(additionalData, moreAdditionalData))
  59 + res.extend(self.serializePrologue())
  60 + fsaData = bytearray()
  61 + fsaData.extend(self.serializeFSAPrologue())
60 62 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
61 63 for state in sorted(self.fsa.dfs(), key=lambda s: s.offset):
62   - res.extend(self.state2bytearray(state))
  64 + fsaData.extend(self.state2bytearray(state))
  65 + res.extend(self.htonl(len(fsaData)))
  66 + res.extend(fsaData)
  67 + res.extend(self.serializeEpilogue(additionalData, moreAdditionalData))
63 68 return res
64 69  
65 70 def serializeTags(self, tagsMap):
... ... @@ -96,7 +101,7 @@ class Serializer(object):
96 101 res.append(n & 0x000000FF)
97 102 return res
98 103  
99   - def serializePrologue(self, additionalData=None, moreAdditionalData=None):
  104 + def serializePrologue(self):
100 105 res = bytearray()
101 106  
102 107 # serialize magic number in big-endian order
... ... @@ -111,10 +116,16 @@ class Serializer(object):
111 116 # serialize implementation code
112 117 res.append(self.getImplementationCode())
113 118  
114   - # serialize additional data size in 2-byte big-endian
  119 + return res
  120 +
  121 +# def serializeFSAPrologue(self):
  122 +# raise NotImplementedError('Not implemented')
  123 +
  124 + def serializeEpilogue(self, additionalData, moreAdditionalData):
  125 + res = bytearray()
115 126 additionalDataSize = len(additionalData) if additionalData else 0
116 127 moreAdditionalDataSize = len(moreAdditionalData) if moreAdditionalData else 0
117   - res.extend(self.htonl(additionalDataSize + moreAdditionalDataSize))
  128 + res.extend(self.htonl(additionalDataSize))
118 129  
119 130 # add additional data itself
120 131 if additionalDataSize:
... ... @@ -124,7 +135,6 @@ class Serializer(object):
124 135 if moreAdditionalDataSize:
125 136 assert type(moreAdditionalData) == bytearray
126 137 res.extend(moreAdditionalData)
127   -
128 138 return res
129 139  
130 140 def state2bytearray(self, state):
... ... @@ -156,7 +166,10 @@ class SimpleSerializer(Serializer):
156 166 self.serializeTransitionsData = serializeTransitionsData
157 167  
158 168 def getImplementationCode(self):
159   - return 0
  169 + return 0 if not self.serializeTransitionsData else 128
  170 +
  171 + def serializeFSAPrologue(self):
  172 + return bytearray()
160 173  
161 174 def getStateSize(self, state):
162 175 if self.serializeTransitionsData:
... ... @@ -204,6 +217,7 @@ class VLengthSerializer1(Serializer):
204 217 self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)])
205 218 self._chooseArrayStates()
206 219 self.useArrays = useArrays
  220 + self.label2ShortLabel = None
207 221  
208 222 self.ACCEPTING_FLAG = 0b10000000
209 223 self.ARRAY_FLAG = 0b01000000
... ... @@ -211,8 +225,8 @@ class VLengthSerializer1(Serializer):
211 225 def getImplementationCode(self):
212 226 return 1
213 227  
214   - def serializePrologue(self, additionalData, moreAdditionalData):
215   - res = super(VLengthSerializer1, self).serializePrologue(additionalData, moreAdditionalData)
  228 + def serializeFSAPrologue(self):
  229 + res = bytearray()
216 230  
217 231 # labels sorted by popularity
218 232 sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))]
... ... @@ -366,6 +380,9 @@ class VLengthSerializer2(Serializer):
366 380 self.ACCEPTING_FLAG = 64
367 381 self.LAST_FLAG = 32
368 382  
  383 + def serializeFSAPrologue(self):
  384 + return bytearray()
  385 +
369 386 def getImplementationCode(self):
370 387 return 2
371 388  
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -8,8 +8,9 @@ from morfeuszbuilder.fsa.serializer import SimpleSerializer
8 8  
9 9 class RulesManager(object):
10 10  
11   - def __init__(self):
  11 + def __init__(self, segtypes):
12 12 self.options2DFA = {}
  13 + self.segtypes = segtypes
13 14  
14 15 def _options2Key(self, optionsMap):
15 16 return frozenset(optionsMap.items())
... ... @@ -23,6 +24,13 @@ class RulesManager(object):
23 24 def addDFA(self, optionsMap, dfa):
24 25 self.options2DFA[self._options2Key(optionsMap)] = dfa
25 26  
  27 + def lexeme2SegmentTypeNum(self, lemma, tag):
  28 + res = self.segtypes.lexeme2Segnum(lemma, tag)
  29 + if res is None:
  30 + raise ValueError()
  31 + else:
  32 + return res
  33 +
26 34 def serialize(self):
27 35 res = bytearray()
28 36 dfasNum = len(self.options2DFA)
... ... @@ -38,7 +46,6 @@ class RulesManager(object):
38 46 def _serializeOptionsMap(self, optionsMap):
39 47 assert len(optionsMap) < 256
40 48 res = bytearray()
41   - res.append(len(optionsMap))
42 49 self._serializeString(optionsMap['aggl'])
43 50 self._serializeString(optionsMap['praet'])
44 51 return res
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -28,12 +28,13 @@ class RulesParser(object):
28 28 return res
29 29  
30 30 def parse(self, filename):
31   - res = rulesManager.RulesManager()
32 31  
33 32 segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'tags', 'lexemes', 'segment types'])
34 33 key2Defs = self._getKey2Defs(segtypesConfigFile)
35 34 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
36 35  
  36 + res = rulesManager.RulesManager(segtypesHelper)
  37 +
37 38 def2Key = {}
38 39 for key, defs in key2Defs.iteritems():
39 40 for define in defs:
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -15,6 +15,7 @@ class Segtypes(object):
15 15 self.filename = segrulesConfigFile.filename
16 16  
17 17 self.segtype2Segnum = {}
  18 + self.segnum2Segtype = {}
18 19 self.patternsList = []
19 20 self._readLexemes(segrulesConfigFile)
20 21 self._readTags(segrulesConfigFile)
... ... @@ -48,6 +49,8 @@ class Segtypes(object):
48 49 self.segtype2Segnum[segtype] = segnum
49 50  
50 51 self.patternsList.append(SegtypePattern(None, pattern, segnum))
  52 +
  53 + self.segnum2Segtype = dict([(v, k) for (k, v) in self.segtype2Segnum.iteritems()])
51 54  
52 55 def _readLexemes(self, segrulesConfigFile):
53 56 for lineNum, line in segrulesConfigFile.enumerateLinesInSection('lexemes'):
... ...