Commit a9d3e65c15f2e43bc637fbda8342a6242dc1174f
1 parent
f23aead2
- refaktoryzacja, odkomentowanie na-razie-niedziałających kawałków kodu
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@20 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
22 changed files
with
718 additions
and
116 deletions
fsabuilder/fsa/common.py
... | ... | @@ -14,7 +14,7 @@ class Lemma(object): |
14 | 14 | |
15 | 15 | class Interpretation(object): |
16 | 16 | |
17 | - def __init__(self, orth, base, tagnum, namenum, encoder): | |
17 | + def __init__(self, orth, base, tagnum, namenum, typenum, encoder): | |
18 | 18 | assert type(orth) == unicode |
19 | 19 | assert type(base) == unicode |
20 | 20 | root = u'' |
... | ... | @@ -29,6 +29,7 @@ class Interpretation(object): |
29 | 29 | suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) |
30 | 30 | self.tagnum = tagnum |
31 | 31 | self.namenum = namenum |
32 | + self.typenum = typenum | |
32 | 33 | |
33 | 34 | def getSortKey(self): |
34 | 35 | return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) |
... | ... |
fsabuilder/fsa/convertinput.py
... | ... | @@ -6,6 +6,398 @@ Created on Oct 23, 2013 |
6 | 6 | import logging |
7 | 7 | from common import Interpretation |
8 | 8 | |
9 | +tag2typenum = { | |
10 | + 'aglt:sg:pri:imperf:nwok': 12, | |
11 | + 'aglt:sg:pri:imperf:wok': 12, | |
12 | + 'aglt:sg:sec:imperf:nwok': 12, | |
13 | + 'aglt:sg:sec:imperf:wok': 12, | |
14 | + 'aglt:pl:pri:imperf:nwok': 13, | |
15 | + 'aglt:pl:pri:imperf:wok': 13, | |
16 | + 'aglt:pl:sec:imperf:nwok': 13, | |
17 | + 'aglt:pl:sec:imperf:wok': 13, | |
18 | + 'praet:sg:m1.m2.m3:imperf:agl': 7, | |
19 | + 'praet:sg:m1.m2.m3:imperf.perf:agl': 7, | |
20 | + 'praet:sg:m1.m2.m3:perf:agl': 7, | |
21 | + 'praet:sg:m1.m2.m3:imperf:nagl': 16, | |
22 | + 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16, | |
23 | + 'praet:sg:m1.m2.m3:perf:nagl': 16, | |
24 | + 'praet:sg:f:imperf': 20, | |
25 | + 'praet:sg:f:imperf.perf': 20, | |
26 | + 'praet:sg:f:perf': 20, | |
27 | + 'praet:sg:m1.m2.m3:imperf': 20, | |
28 | + 'praet:sg:m1.m2.m3:imperf.perf': 20, | |
29 | + 'praet:sg:m1.m2.m3:perf': 20, | |
30 | + 'praet:sg:n1.n2:imperf': 20, | |
31 | + 'praet:sg:n1.n2:imperf.perf': 20, | |
32 | + 'praet:sg:n1.n2:perf': 20, | |
33 | + 'praet:pl:m1.p1:imperf': 21, | |
34 | + 'praet:pl:m1.p1:imperf.perf': 21, | |
35 | + 'praet:pl:m1.p1:perf': 21, | |
36 | + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21, | |
37 | + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21, | |
38 | + 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21, | |
39 | + 'naj': 10, | |
40 | + 'nie': 5, | |
41 | + 'adj:pl:acc:m1.p1:pos': 1, | |
42 | + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
43 | + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
44 | + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
45 | + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
46 | + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
47 | + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, | |
48 | + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
49 | + 'adj:pl:nom.voc:m1.p1:pos': 1, | |
50 | + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1, | |
51 | + 'adj:sg:acc:m1.m2:pos': 1, | |
52 | + 'adj:sg:acc:n1.n2:pos': 1, | |
53 | + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, | |
54 | + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, | |
55 | + 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1, | |
56 | + 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1, | |
57 | + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, | |
58 | + 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1, | |
59 | + 'adj:sg:nom.voc:n1.n2:pos': 1, | |
60 | + 'adj:sg:acc:f:pos': 1, | |
61 | + 'adj:sg:acc.inst:f:pos': 1, | |
62 | + 'adj:sg:acc:m1.m2:pos': 1, | |
63 | + 'adj:sg:acc:m3:pos': 1, | |
64 | + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, | |
65 | + 'adj:sg:gen.dat.loc:f:pos': 1, | |
66 | + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, | |
67 | + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1, | |
68 | + 'adj:sg:nom.voc.acc:n1.n2:pos': 1, | |
69 | + 'adj:sg:nom.voc:f:pos': 1, | |
70 | + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, | |
71 | + 'adj:pl:acc:f:pos': 1, | |
72 | + 'adj:pl:acc:m1:pos': 1, | |
73 | + 'adj:pl:acc:m2:pos': 1, | |
74 | + 'adj:pl:acc:m3:pos': 1, | |
75 | + 'adj:pl:acc:n1:pos': 1, | |
76 | + 'adj:pl:acc:n2:pos': 1, | |
77 | + 'adj:pl:acc:p1:pos': 1, | |
78 | + 'adj:pl:acc:p2:pos': 1, | |
79 | + 'adj:pl:acc:p3:pos': 1, | |
80 | + 'adj:pl:dat:f:pos': 1, | |
81 | + 'adj:pl:dat:m1:pos': 1, | |
82 | + 'adj:pl:dat:m2:pos': 1, | |
83 | + 'adj:pl:dat:m3:pos': 1, | |
84 | + 'adj:pl:dat:n1:pos': 1, | |
85 | + 'adj:pl:dat:n2:pos': 1, | |
86 | + 'adj:pl:dat:p1:pos': 1, | |
87 | + 'adj:pl:dat:p2:pos': 1, | |
88 | + 'adj:pl:dat:p3:pos': 1, | |
89 | + 'adj:pl:gen:f:pos': 1, | |
90 | + 'adj:pl:gen:m1:pos': 1, | |
91 | + 'adj:pl:gen:m2:pos': 1, | |
92 | + 'adj:pl:gen:m3:pos': 1, | |
93 | + 'adj:pl:gen:n1:pos': 1, | |
94 | + 'adj:pl:gen:n2:pos': 1, | |
95 | + 'adj:pl:gen:p1:pos': 1, | |
96 | + 'adj:pl:gen:p2:pos': 1, | |
97 | + 'adj:pl:gen:p3:pos': 1, | |
98 | + 'adj:pl:inst:f:pos': 1, | |
99 | + 'adj:pl:inst:m1:pos': 1, | |
100 | + 'adj:pl:inst:m2:pos': 1, | |
101 | + 'adj:pl:inst:m3:pos': 1, | |
102 | + 'adj:pl:inst:n1:pos': 1, | |
103 | + 'adj:pl:inst:n2:pos': 1, | |
104 | + 'adj:pl:inst:p1:pos': 1, | |
105 | + 'adj:pl:inst:p2:pos': 1, | |
106 | + 'adj:pl:inst:p3:pos': 1, | |
107 | + 'adj:pl:loc:f:pos': 1, | |
108 | + 'adj:pl:loc:m1:pos': 1, | |
109 | + 'adj:pl:loc:m2:pos': 1, | |
110 | + 'adj:pl:loc:m3:pos': 1, | |
111 | + 'adj:pl:loc:n1:pos': 1, | |
112 | + 'adj:pl:loc:n2:pos': 1, | |
113 | + 'adj:pl:loc:p1:pos': 1, | |
114 | + 'adj:pl:loc:p2:pos': 1, | |
115 | + 'adj:pl:loc:p3:pos': 1, | |
116 | + 'adj:pl:nom:f:pos': 1, | |
117 | + 'adj:pl:nom:m1:pos': 1, | |
118 | + 'adj:pl:nom:m2:pos': 1, | |
119 | + 'adj:pl:nom:m3:pos': 1, | |
120 | + 'adj:pl:nom:n1:pos': 1, | |
121 | + 'adj:pl:nom:n2:pos': 1, | |
122 | + 'adj:pl:nom:p1:pos': 1, | |
123 | + 'adj:pl:nom:p2:pos': 1, | |
124 | + 'adj:pl:nom:p3:pos': 1, | |
125 | + 'adj:sg:acc:f:pos': 1, | |
126 | + 'adj:sg:acc:m1:pos': 1, | |
127 | + 'adj:sg:acc:m2:pos': 1, | |
128 | + 'adj:sg:acc:m3:pos': 1, | |
129 | + 'adj:sg:acc:n1:pos': 1, | |
130 | + 'adj:sg:acc:n2:pos': 1, | |
131 | + 'adj:sg:dat:f:pos': 1, | |
132 | + 'adj:sg:dat:m1:pos': 1, | |
133 | + 'adj:sg:dat:m2:pos': 1, | |
134 | + 'adj:sg:dat:m3:pos': 1, | |
135 | + 'adj:sg:dat:n1:pos': 1, | |
136 | + 'adj:sg:dat:n2:pos': 1, | |
137 | + 'adj:sg:gen:f:pos': 1, | |
138 | + 'adj:sg:gen:m1:pos': 1, | |
139 | + 'adj:sg:gen:m2:pos': 1, | |
140 | + 'adj:sg:gen:m3:pos': 1, | |
141 | + 'adj:sg:gen:n1:pos': 1, | |
142 | + 'adj:sg:gen:n2:pos': 1, | |
143 | + 'adj:sg:inst:f:pos': 1, | |
144 | + 'adj:sg:inst:m1:pos': 1, | |
145 | + 'adj:sg:inst:m2:pos': 1, | |
146 | + 'adj:sg:inst:m3:pos': 1, | |
147 | + 'adj:sg:inst:n1:pos': 1, | |
148 | + 'adj:sg:inst:n2:pos': 1, | |
149 | + 'adj:sg:loc:f:pos': 1, | |
150 | + 'adj:sg:loc:m1:pos': 1, | |
151 | + 'adj:sg:loc:m2:pos': 1, | |
152 | + 'adj:sg:loc:m3:pos': 1, | |
153 | + 'adj:sg:loc:n1:pos': 1, | |
154 | + 'adj:sg:loc:n2:pos': 1, | |
155 | + 'adj:sg:nom:f:pos': 1, | |
156 | + 'adj:sg:nom:m1:pos': 1, | |
157 | + 'adj:sg:nom:m2:pos': 1, | |
158 | + 'adj:sg:nom:m3:pos': 1, | |
159 | + 'adj:sg:nom:n1:pos': 1, | |
160 | + 'adj:sg:nom:n2:pos': 1, | |
161 | + 'adj:pl:acc:m1.p1:sup': 19, | |
162 | + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
163 | + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
164 | + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
165 | + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
166 | + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
167 | + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, | |
168 | + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
169 | + 'adj:pl:nom.voc:m1.p1:sup': 19, | |
170 | + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19, | |
171 | + 'adj:sg:acc:f:sup': 19, | |
172 | + 'adj:sg:acc.inst:f:sup': 19, | |
173 | + 'adj:sg:acc:m1.m2:sup': 19, | |
174 | + 'adj:sg:acc:m3:sup': 19, | |
175 | + 'adj:sg:acc:n1.n2:sup': 19, | |
176 | + 'adj:sg:dat:f:sup': 19, | |
177 | + 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19, | |
178 | + 'adj:sg:gen:f:sup': 19, | |
179 | + 'adj:sg:gen.dat.loc:f:sup': 19, | |
180 | + 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19, | |
181 | + 'adj:sg:inst:f:sup': 19, | |
182 | + 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19, | |
183 | + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19, | |
184 | + 'adj:sg:loc:f:sup': 19, | |
185 | + 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19, | |
186 | + 'adj:sg:nom.acc:n1.n2:sup': 19, | |
187 | + 'adj:sg:nom.voc:f:sup': 19, | |
188 | + 'adj:sg:nom.voc:m1.m2.m3:sup': 19, | |
189 | + 'adj:sg:nom.voc:n1.n2:sup': 19, | |
190 | + 'adj:pl:acc:f:sup': 19, | |
191 | + 'adj:pl:acc:m1:sup': 19, | |
192 | + 'adj:pl:acc:m2:sup': 19, | |
193 | + 'adj:pl:acc:m3:sup': 19, | |
194 | + 'adj:pl:acc:n1:sup': 19, | |
195 | + 'adj:pl:acc:n2:sup': 19, | |
196 | + 'adj:pl:acc:p1:sup': 19, | |
197 | + 'adj:pl:acc:p2:sup': 19, | |
198 | + 'adj:pl:acc:p3:sup': 19, | |
199 | + 'adj:pl:dat:f:sup': 19, | |
200 | + 'adj:pl:dat:m1:sup': 19, | |
201 | + 'adj:pl:dat:m2:sup': 19, | |
202 | + 'adj:pl:dat:m3:sup': 19, | |
203 | + 'adj:pl:dat:n1:sup': 19, | |
204 | + 'adj:pl:dat:n2:sup': 19, | |
205 | + 'adj:pl:dat:p1:sup': 19, | |
206 | + 'adj:pl:dat:p2:sup': 19, | |
207 | + 'adj:pl:dat:p3:sup': 19, | |
208 | + 'adj:pl:gen:f:sup': 19, | |
209 | + 'adj:pl:gen:m1:sup': 19, | |
210 | + 'adj:pl:gen:m2:sup': 19, | |
211 | + 'adj:pl:gen:m3:sup': 19, | |
212 | + 'adj:pl:gen:n1:sup': 19, | |
213 | + 'adj:pl:gen:n2:sup': 19, | |
214 | + 'adj:pl:gen:p1:sup': 19, | |
215 | + 'adj:pl:gen:p2:sup': 19, | |
216 | + 'adj:pl:gen:p3:sup': 19, | |
217 | + 'adj:pl:inst:f:sup': 19, | |
218 | + 'adj:pl:inst:m1:sup': 19, | |
219 | + 'adj:pl:inst:m2:sup': 19, | |
220 | + 'adj:pl:inst:m3:sup': 19, | |
221 | + 'adj:pl:inst:n1:sup': 19, | |
222 | + 'adj:pl:inst:n2:sup': 19, | |
223 | + 'adj:pl:inst:p1:sup': 19, | |
224 | + 'adj:pl:inst:p2:sup': 19, | |
225 | + 'adj:pl:inst:p3:sup': 19, | |
226 | + 'adj:pl:loc:f:sup': 19, | |
227 | + 'adj:pl:loc:m1:sup': 19, | |
228 | + 'adj:pl:loc:m2:sup': 19, | |
229 | + 'adj:pl:loc:m3:sup': 19, | |
230 | + 'adj:pl:loc:n1:sup': 19, | |
231 | + 'adj:pl:loc:n2:sup': 19, | |
232 | + 'adj:pl:loc:p1:sup': 19, | |
233 | + 'adj:pl:loc:p2:sup': 19, | |
234 | + 'adj:pl:loc:p3:sup': 19, | |
235 | + 'adj:pl:nom:f:sup': 19, | |
236 | + 'adj:pl:nom:m1:sup': 19, | |
237 | + 'adj:pl:nom:m2:sup': 19, | |
238 | + 'adj:pl:nom:m3:sup': 19, | |
239 | + 'adj:pl:nom:n1:sup': 19, | |
240 | + 'adj:pl:nom:n2:sup': 19, | |
241 | + 'adj:pl:nom:p1:sup': 19, | |
242 | + 'adj:pl:nom:p2:sup': 19, | |
243 | + 'adj:pl:nom:p3:sup': 19, | |
244 | + 'adj:sg:acc:f:sup': 19, | |
245 | + 'adj:sg:acc:m1:sup': 19, | |
246 | + 'adj:sg:acc:m2:sup': 19, | |
247 | + 'adj:sg:acc:m3:sup': 19, | |
248 | + 'adj:sg:acc:n1:sup': 19, | |
249 | + 'adj:sg:acc:n2:sup': 19, | |
250 | + 'adj:sg:dat:f:sup': 19, | |
251 | + 'adj:sg:dat:m1:sup': 19, | |
252 | + 'adj:sg:dat:m2:sup': 19, | |
253 | + 'adj:sg:dat:m3:sup': 19, | |
254 | + 'adj:sg:dat:n1:sup': 19, | |
255 | + 'adj:sg:dat:n2:sup': 19, | |
256 | + 'adj:sg:gen:f:sup': 19, | |
257 | + 'adj:sg:gen:m1:sup': 19, | |
258 | + 'adj:sg:gen:m2:sup': 19, | |
259 | + 'adj:sg:gen:m3:sup': 19, | |
260 | + 'adj:sg:gen:n1:sup': 19, | |
261 | + 'adj:sg:gen:n2:sup': 19, | |
262 | + 'adj:sg:inst:f:sup': 19, | |
263 | + 'adj:sg:inst:m1:sup': 19, | |
264 | + 'adj:sg:inst:m2:sup': 19, | |
265 | + 'adj:sg:inst:m3:sup': 19, | |
266 | + 'adj:sg:inst:n1:sup': 19, | |
267 | + 'adj:sg:inst:n2:sup': 19, | |
268 | + 'adj:sg:loc:f:sup': 19, | |
269 | + 'adj:sg:loc:m1:sup': 19, | |
270 | + 'adj:sg:loc:m2:sup': 19, | |
271 | + 'adj:sg:loc:m3:sup': 19, | |
272 | + 'adj:sg:loc:n1:sup': 19, | |
273 | + 'adj:sg:loc:n2:sup': 19, | |
274 | + 'adj:sg:nom:f:sup': 19, | |
275 | + 'adj:sg:nom:m1:sup': 19, | |
276 | + 'adj:sg:nom:m2:sup': 19, | |
277 | + 'adj:sg:nom:m3:sup': 19, | |
278 | + 'adj:sg:nom:n1:sup': 19, | |
279 | + 'adj:sg:nom:n2:sup': 19, | |
280 | + 'adv:sup': 19, | |
281 | + 'winien:sg:m1.m2.m3:imperf': 3, | |
282 | + 'winien:sg:f:imperf': 3, | |
283 | + 'winien:sg:n1.n2:imperf': 3, | |
284 | + 'winien:pl:m1.p1:imperf': 3, | |
285 | + 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3, | |
286 | + 'adja': 15, | |
287 | + 'ger:sg:dat.loc:n2:imperf:neg': 18, | |
288 | + 'ger:sg:dat.loc:n2:imperf.perf:neg': 18, | |
289 | + 'ger:sg:dat.loc:n2:perf:neg': 18, | |
290 | + 'ger:sg:gen:n2:imperf:neg': 18, | |
291 | + 'ger:sg:gen:n2:imperf.perf:neg': 18, | |
292 | + 'ger:sg:gen:n2:perf:neg': 18, | |
293 | + 'ger:sg:inst:n2:imperf:neg': 18, | |
294 | + 'ger:sg:inst:n2:imperf.perf:neg': 18, | |
295 | + 'ger:sg:inst:n2:perf:neg': 18, | |
296 | + 'ger:sg:nom.acc:n2:imperf:neg': 18, | |
297 | + 'ger:sg:nom.acc:n2:imperf.perf:neg': 18, | |
298 | + 'ger:sg:nom.acc:n2:perf:neg': 18, | |
299 | + 'pact:pl:acc:m1.p1:imperf:neg': 18, | |
300 | + 'pact:pl:acc:m1.p1:imperf.perf:neg': 18, | |
301 | + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
302 | + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
303 | + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
304 | + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
305 | + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
306 | + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
307 | + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, | |
308 | + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, | |
309 | + 'pact:pl:nom:m1.p1:imperf:neg': 18, | |
310 | + 'pact:pl:nom:m1.p1:imperf.perf:neg': 18, | |
311 | + 'pact:sg:acc.inst:f:imperf:neg': 18, | |
312 | + 'pact:sg:acc.inst:f:imperf.perf:neg': 18, | |
313 | + 'pact:sg:acc:m1.m2:imperf:neg': 18, | |
314 | + 'pact:sg:acc:m1.m2:imperf.perf:neg': 18, | |
315 | + 'pact:sg:acc:m3:imperf:neg': 18, | |
316 | + 'pact:sg:acc:m3:imperf.perf:neg': 18, | |
317 | + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, | |
318 | + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
319 | + 'pact:sg:gen.dat.loc:f:imperf:neg': 18, | |
320 | + 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18, | |
321 | + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, | |
322 | + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
323 | + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, | |
324 | + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
325 | + 'pact:sg:nom.acc:n1.n2:imperf:neg': 18, | |
326 | + 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18, | |
327 | + 'pact:sg:nom:f:imperf:neg': 18, | |
328 | + 'pact:sg:nom:f:imperf.perf:neg': 18, | |
329 | + 'pact:sg:nom:m1.m2.m3:imperf:neg': 18, | |
330 | + 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18, | |
331 | + 'ppas:pl:acc:m1.p1:imperf:neg': 18, | |
332 | + 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18, | |
333 | + 'ppas:pl:acc:m1.p1:perf:neg': 18, | |
334 | + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
335 | + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
336 | + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
337 | + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
338 | + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
339 | + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
340 | + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, | |
341 | + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, | |
342 | + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, | |
343 | + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, | |
344 | + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, | |
345 | + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18, | |
346 | + 'ppas:pl:nom:m1.p1:imperf:neg': 18, | |
347 | + 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18, | |
348 | + 'ppas:pl:nom:m1.p1:perf:neg': 18, | |
349 | + 'ppas:sg:acc.inst:f:imperf:neg': 18, | |
350 | + 'ppas:sg:acc.inst:f:imperf.perf:neg': 18, | |
351 | + 'ppas:sg:acc.inst:f:perf:neg': 18, | |
352 | + 'ppas:sg:acc:m1.m2:imperf:neg': 18, | |
353 | + 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18, | |
354 | + 'ppas:sg:acc:m1.m2:perf:neg': 18, | |
355 | + 'ppas:sg:acc:m3:imperf:neg': 18, | |
356 | + 'ppas:sg:acc:m3:imperf.perf:neg': 18, | |
357 | + 'ppas:sg:acc:m3:perf:neg': 18, | |
358 | + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, | |
359 | + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
360 | + 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18, | |
361 | + 'ppas:sg:gen.dat.loc:f:imperf:neg': 18, | |
362 | + 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18, | |
363 | + 'ppas:sg:gen.dat.loc:f:perf:neg': 18, | |
364 | + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, | |
365 | + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
366 | + 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18, | |
367 | + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, | |
368 | + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, | |
369 | + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18, | |
370 | + 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18, | |
371 | + 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18, | |
372 | + 'ppas:sg:nom.acc:n1.n2:perf:neg': 18, | |
373 | + 'ppas:sg:nom:f:imperf:neg': 18, | |
374 | + 'ppas:sg:nom:f:imperf.perf:neg': 18, | |
375 | + 'ppas:sg:nom:f:perf:neg': 18, | |
376 | + 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18, | |
377 | + 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18, | |
378 | + 'ppas:sg:nom:m1.m2.m3:perf:neg': 18, | |
379 | + 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8, | |
380 | + 'prep:acc': 6, | |
381 | + 'prep:acc:wok': 6, | |
382 | + 'prep:acc.inst': 6, | |
383 | + 'prep:acc.inst:wok': 6, | |
384 | + 'prep:inst.acc': 6, | |
385 | + 'prep:inst.acc:wok': 6, | |
386 | + 'prep:inst.gen.acc:wok': 6, | |
387 | + 'prep:acc.loc': 6, | |
388 | + 'prep:acc.loc:wok': 6, | |
389 | + 'prep:loc.acc': 6, | |
390 | + 'prep:loc.acc:wok': 6, | |
391 | + 'prep:gen': 6, | |
392 | + 'prep:gen.dat': 6, | |
393 | + 'prep:gen:wok': 6, | |
394 | + 'prep:gen.inst:wok': 6, | |
395 | + 'brev:pun': 9, | |
396 | + 'brev:npun': 9, | |
397 | + 'intrj': 9, | |
398 | + 'burk': 9, | |
399 | +} | |
400 | + | |
9 | 401 | def _sortLines(inputLines, encoder): |
10 | 402 | logging.info('sorting input...') |
11 | 403 | lines = list(inputLines) |
... | ... | @@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder): |
22 | 414 | orth, base, tag, name = line.split(u'\t') |
23 | 415 | tagnum = tagset.tag2tagnum[tag] |
24 | 416 | namenum = tagset.name2namenum[name] |
25 | - yield (orth, Interpretation(orth, base, tagnum, namenum, encoder)) | |
417 | + typenum = tag2typenum.get(tag, 0) | |
418 | + yield (orth, Interpretation(orth, base, tagnum, namenum, typenum, encoder)) | |
26 | 419 | |
27 | 420 | def _mergeEntries(inputLines): |
28 | 421 | prevOrth = None |
... | ... |
fsabuilder/fsa/encode.py
... | ... | @@ -61,11 +61,16 @@ class MorphEncoder(Encoder): |
61 | 61 | res.append(firstByte) |
62 | 62 | assert type(interpsList) == frozenset |
63 | 63 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
64 | + res.extend(self._encodeTypeNum(interp.typenum)) | |
64 | 65 | res.extend(self._encodeLemma(interp.lemma)) |
65 | 66 | res.extend(self._encodeTagNum(interp.tagnum)) |
66 | 67 | res.extend(self._encodeNameNum(interp.namenum)) |
67 | 68 | return res |
68 | 69 | |
70 | + def _encodeTypeNum(self, typenum): | |
71 | + assert typenum >= 0 and typenum < 256 | |
72 | + return bytearray([typenum]) | |
73 | + | |
69 | 74 | def _encodeLemma(self, lemma): |
70 | 75 | res = bytearray() |
71 | 76 | assert lemma.cutLength < 256 and lemma.cutLength >= 0 |
... | ... |
fsabuilder/fsa/serializer.py
... | ... | @@ -40,7 +40,6 @@ class Serializer(object): |
40 | 40 | raise NotImplementedError('Not implemented') |
41 | 41 | |
42 | 42 | def fsa2bytearray(self): |
43 | - | |
44 | 43 | res = bytearray() |
45 | 44 | res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) |
46 | 45 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) |
7 | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
8 | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) | |
10 | +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp) | |
11 | 11 | |
12 | 12 | # Link the executable to the morfeusz2 library. |
13 | 13 | target_link_libraries (morfeusz2_analyze morfeusz2) |
... | ... |
morfeusz/EncodedInterpretation.hpp
morfeusz/InterpsGroup.hpp
0 → 100644
1 | +/* | |
2 | + * File: InterpsGroup.hpp | |
3 | + * Author: lennyn | |
4 | + * | |
5 | + * Created on November 16, 2013, 7:58 PM | |
6 | + */ | |
7 | + | |
8 | +#ifndef GROUPEDINTERPRETATIONS_HPP | |
9 | +#define GROUPEDINTERPRETATIONS_HPP | |
10 | + | |
11 | +#include <vector> | |
12 | +#include <string> | |
13 | +#include "EncodedInterpretation.hpp" | |
14 | +#include "MorphInterpretation.hpp" | |
15 | +#include "Tagset.hpp" | |
16 | + | |
17 | +class InterpsGroup { | |
18 | +public: | |
19 | + | |
20 | + InterpsGroup() { | |
21 | + | |
22 | + } | |
23 | + | |
24 | + explicit InterpsGroup(const int type) | |
25 | + : type(type) { | |
26 | + | |
27 | + } | |
28 | + | |
29 | + std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) { | |
30 | + std::vector<MorphInterpretation> res; | |
31 | + for (EncodedInterpretation& ei: interps) { | |
32 | + res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset)); | |
33 | + } | |
34 | + return res; | |
35 | + } | |
36 | + | |
37 | + void addInterpretation(const EncodedInterpretation& interp) { | |
38 | + interps.push_back(interp); | |
39 | + } | |
40 | + | |
41 | + int type; | |
42 | + int startNode; | |
43 | + int endNode; | |
44 | +private: | |
45 | + std::vector<EncodedInterpretation> interps; | |
46 | +}; | |
47 | + | |
48 | +#endif /* GROUPEDINTERPRETATIONS_HPP */ | |
49 | + | |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -6,17 +6,18 @@ |
6 | 6 | */ |
7 | 7 | |
8 | 8 | #include <string> |
9 | +#include "fsa.hpp" | |
9 | 10 | #include "utils.hpp" |
10 | 11 | #include "Morfeusz.hpp" |
11 | 12 | #include "MorphDeserializer.hpp" |
12 | -#include "encoding/CharsetConverter.hpp" | |
13 | +#include "charset/CharsetConverter.hpp" | |
13 | 14 | |
14 | 15 | using namespace std; |
15 | 16 | |
16 | -static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) { | |
17 | - static Deserializer<vector<EncodedInterpretation>>* deserializer | |
18 | - = new MorphDeserializer(); | |
19 | - return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); | |
17 | +static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { | |
18 | + static Deserializer < vector < InterpsGroup >> *deserializer | |
19 | + = new MorphDeserializer(); | |
20 | + return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); | |
20 | 21 | } |
21 | 22 | |
22 | 23 | static CharsetConverter* initializeCharsetConverter() { |
... | ... | @@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() { |
26 | 27 | |
27 | 28 | Morfeusz::Morfeusz(const string& filename) |
28 | 29 | : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { |
29 | - | |
30 | + | |
30 | 31 | } |
31 | 32 | |
32 | 33 | //Morfeusz::Morfeusz(const Morfeusz& orig) { |
... | ... | @@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() { |
36 | 37 | delete &this->fsa; |
37 | 38 | } |
38 | 39 | |
39 | -AnalyzeResult Morfeusz::analyze(const std::string& text) { | |
40 | - const char* textStart = text.c_str(); | |
41 | - const char* textEnd = text.c_str() + text.length(); | |
42 | - AnalyzeResult res = { | |
43 | - ResultsIterator(textStart, textEnd, *this), | |
44 | - ResultsIterator(textEnd, textEnd, *this)}; | |
45 | - return res; | |
40 | +ResultsIterator Morfeusz::analyze(const std::string& text) { | |
41 | +// const char* textStart = text.c_str(); | |
42 | +// const char* textEnd = text.c_str() + text.length(); | |
43 | + return ResultsIterator(text, *this); | |
44 | +} | |
45 | + | |
46 | +ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) | |
47 | +: rawInput(text.c_str()), | |
48 | +morfeusz(morfeusz) { | |
49 | +} | |
50 | + | |
51 | +MorphInterpretation ResultsIterator::getNext() { | |
52 | +// if (resultsBuffer.empty()) { | |
53 | +// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer)); | |
54 | +// } | |
55 | +// startNode = resultsBuffer.back().getEndNode(); | |
56 | +// MorphInterpretation res = resultsBuffer.front(); | |
57 | +// resultsBuffer.pop_front(); | |
58 | +// return res; | |
59 | +} | |
60 | + | |
61 | +bool ResultsIterator::hasNext() { | |
62 | + return rawInput[0] != '\0' && resultsBuffer.empty(); | |
46 | 63 | } |
47 | 64 | |
65 | +//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const { | |
66 | +// assert(inputPtr[0] != '\0'); | |
67 | +// const char* start = inputPtr; | |
68 | +// StateType state = fsa->getInitialState(); | |
69 | +// int currNodeNum = startNodeNum; | |
70 | +// int codepoint = this->charsetConverter->next(inputPtr, inputEnd); | |
71 | +// assert(!isEndOfWord(codepoint)); | |
72 | +// while(!isEndOfWord(codepoint)) { | |
73 | +// feedState(state, codepoint); | |
74 | +// if (state.isAccepting()) { | |
75 | +// const char* currInputPtr = inputPtr; | |
76 | +// vector<EncodedInterpretation> startInterps = state.getValue(); | |
77 | +// filterOutNonGluableInterps(startInterps); | |
78 | +// if (!startInterps.empty()) { | |
79 | +// | |
80 | +// } | |
81 | +// vector<EncodedInterpretation> additionalInterps; | |
82 | +// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps); | |
83 | +// if (!additionalInterps.empty()) { | |
84 | +// for (EncodedInterpretation& interp: state.getValue()) { | |
85 | +// interp.startNode = currNodeNum; | |
86 | +// interp.endNode = currNodeNum + 1; | |
87 | +// interps.push_back(interp); | |
88 | +// } | |
89 | +// | |
90 | +// } | |
91 | +// } | |
92 | +// } | |
93 | +//} | |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -9,53 +9,78 @@ |
9 | 9 | #define MORFEUSZ_HPP |
10 | 10 | |
11 | 11 | #include <string> |
12 | +#include <list> | |
12 | 13 | #include <vector> |
13 | 14 | #include "EncodedInterpretation.hpp" |
14 | 15 | #include "fsa.hpp" |
15 | 16 | #include "MorphInterpretation.hpp" |
16 | -#include "encoding/CharsetConverter.hpp" | |
17 | +#include "InterpsGroup.hpp" | |
18 | +#include "charset/CharsetConverter.hpp" | |
17 | 19 | |
18 | 20 | class Morfeusz; |
19 | -class AnalyzeResult; | |
21 | +//class AnalyzeResult; | |
20 | 22 | class ResultsIterator; |
21 | 23 | |
24 | +typedef FSA<std::vector<InterpsGroup>> FSAType; | |
25 | +typedef State<std::vector<InterpsGroup>> StateType; | |
26 | + | |
22 | 27 | class Morfeusz { |
23 | 28 | public: |
24 | 29 | explicit Morfeusz(const std::string& filename); |
25 | 30 | virtual ~Morfeusz(); |
26 | 31 | // Morfeusz(const Morfeusz& orig); |
27 | - AnalyzeResult analyze(const std::string& text); | |
32 | + ResultsIterator analyze(const std::string& text); | |
28 | 33 | |
29 | 34 | // Morfeusz(); |
35 | + friend class ResultsIterator; | |
30 | 36 | private: |
31 | - void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps); | |
32 | - const FSA<std::vector<EncodedInterpretation>>* fsa; | |
37 | + template <class OutputIterator> | |
38 | +// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const; | |
39 | + | |
40 | + int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const; | |
41 | + | |
42 | + const FSAType* fsa; | |
33 | 43 | CharsetConverter* charsetConverter; |
34 | 44 | }; |
35 | 45 | |
46 | +#include "Morfeusz_impl.hpp" | |
47 | + | |
36 | 48 | class ResultsIterator { |
37 | 49 | public: |
38 | - ResultsIterator( | |
39 | - const char* startOfInput, | |
40 | - const char* endOfInput, | |
41 | - const Morfeusz& morfeusz); | |
42 | - virtual ~ResultsIterator(); | |
43 | -// ResultsIterator(int* x); | |
44 | - ResultsIterator(const ResultsIterator& mit); | |
45 | - ResultsIterator& operator++(); | |
46 | - ResultsIterator operator++(int); | |
47 | - bool operator==(const ResultsIterator& rhs); | |
48 | - bool operator!=(const ResultsIterator& rhs); | |
49 | - MorphInterpretation& operator*(); | |
50 | + ResultsIterator(const std::string& text, const Morfeusz& morfeusz); | |
51 | + MorphInterpretation getNext(); | |
52 | + bool hasNext(); | |
50 | 53 | private: |
51 | 54 | const char* rawInput; |
52 | - const char* endOfInput; | |
55 | + const Morfeusz& morfeusz; | |
56 | + std::list<MorphInterpretation> resultsBuffer; | |
57 | + int startNode; | |
53 | 58 | }; |
54 | 59 | |
55 | -struct AnalyzeResult { | |
56 | - ResultsIterator iterator; | |
57 | - const ResultsIterator end; | |
58 | -}; | |
60 | +//class ResultsIterator { | |
61 | +//public: | |
62 | +// ResultsIterator( | |
63 | +// const char* startOfInput, | |
64 | +// const char* endOfInput, | |
65 | +// const Morfeusz& morfeusz); | |
66 | +// virtual ~ResultsIterator(); | |
67 | +// ResultsIterator(const ResultsIterator& mit); | |
68 | +// ResultsIterator& operator++(); | |
69 | +// ResultsIterator operator++(int); | |
70 | +// bool operator==(const ResultsIterator& rhs); | |
71 | +// bool operator!=(const ResultsIterator& rhs); | |
72 | +// MorphInterpretation& operator*(); | |
73 | +//private: | |
74 | +// const char* rawInput; | |
75 | +// const char* endOfInput; | |
76 | +// const Morfeusz& morfeusz; | |
77 | +// vector<MorphInterpretation> resultsBuffer; | |
78 | +//}; | |
79 | + | |
80 | +//struct AnalyzeResult { | |
81 | +// ResultsIterator iterator; | |
82 | +// const ResultsIterator end; | |
83 | +//}; | |
59 | 84 | |
60 | 85 | #endif /* MORFEUSZ_HPP */ |
61 | 86 | |
... | ... |
morfeusz/Morfeusz_impl.hpp
0 → 100644
1 | +/* | |
2 | + * File: Morfeusz_impl.hpp | |
3 | + * Author: lennyn | |
4 | + * | |
5 | + * Created on November 15, 2013, 1:43 PM | |
6 | + */ | |
7 | + | |
8 | +#ifndef MORFEUSZ_IMPL_HPP | |
9 | +#define MORFEUSZ_IMPL_HPP | |
10 | + | |
11 | +#include <cassert> | |
12 | +#include "Morfeusz.hpp" | |
13 | + | |
14 | +//template <class OutputIterator> | |
15 | +//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const { | |
16 | +// if (inputData == inputEnd) { | |
17 | +// return; | |
18 | +// } | |
19 | +// const char* start = inputData; | |
20 | +// StateType state = fsa->getInitialState(); | |
21 | +// int currNodeNum = startNodeNum; | |
22 | +// do { | |
23 | +// int codepoint = this->charsetConverter->next(inputData, inputEnd); | |
24 | +// if (!isSpace(codepoint) && codepoint != 0) { | |
25 | +// feedAutomaton(state, codepoint); | |
26 | +// if (state.isAccepting()) { | |
27 | +// int currInput = inputData; | |
28 | +// vector<MorphInterpretation> additionalInterps; | |
29 | +// processOneWord( | |
30 | +// currInput, inputEnd, | |
31 | +// currNodeNum + 1, | |
32 | +// back_inserter(additionalInterps), false); | |
33 | +// if (!additionalInterps.empty()) { | |
34 | +// currNodeNum = additionalInterps.back().getEndNode(); | |
35 | +// } | |
36 | +// } | |
37 | +// } | |
38 | +// } | |
39 | +//} | |
40 | + | |
41 | +#endif /* MORFEUSZ_IMPL_HPP */ | |
42 | + | |
... | ... |
morfeusz/MorphDeserializer.cpp
... | ... | @@ -5,7 +5,10 @@ |
5 | 5 | * Created on 12 listopad 2013, 15:31 |
6 | 6 | */ |
7 | 7 | |
8 | +#include <map> | |
8 | 9 | #include "MorphDeserializer.hpp" |
10 | +#include "EncodedInterpretation.hpp" | |
11 | +#include "InterpsGroup.hpp" | |
9 | 12 | |
10 | 13 | MorphDeserializer::MorphDeserializer() { |
11 | 14 | } |
... | ... | @@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { |
25 | 28 | } |
26 | 29 | |
27 | 30 | static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { |
31 | + interp.type = *ptr; | |
32 | + ptr++; | |
28 | 33 | deserializeLemma(ptr, interp.lemma); |
29 | 34 | interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); |
30 | 35 | ptr += 2; |
... | ... | @@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& |
32 | 37 | ptr++; |
33 | 38 | } |
34 | 39 | |
35 | -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { | |
40 | +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | |
36 | 41 | const unsigned char* currPtr = ptr; |
37 | 42 | uint8_t interpsNum = *ptr; |
38 | 43 | interps.clear(); |
39 | 44 | interps.reserve(interpsNum); |
40 | 45 | currPtr++; |
46 | + // FIXME - to jest do poprawy | |
47 | + map<int, InterpsGroup> results; | |
41 | 48 | for (unsigned int i = 0; i < interpsNum; ++i) { |
42 | 49 | EncodedInterpretation interp; |
43 | 50 | deserializeInterp(currPtr, interp); |
44 | - interps.push_back(interp); | |
51 | + if (results.count(interp.type) == 0) { | |
52 | + results[interp.type] = InterpsGroup(interp.type); | |
53 | + } | |
54 | + results[interp.type].addInterpretation(interp); | |
55 | +// interps.push_back(interp); | |
56 | + } | |
57 | + for (auto& kv: results) { | |
58 | + interps.push_back(kv.second); | |
45 | 59 | } |
46 | 60 | return currPtr - ptr; |
47 | 61 | } |
48 | 62 | |
63 | +//static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { | |
64 | +// // XXX uważać na poprawność danych | |
65 | +// lemma.suffixToCut = *ptr; | |
66 | +// ptr++; | |
67 | +// lemma.suffixToAdd = (const char*) ptr; | |
68 | +// ptr += strlen((const char*) ptr) + 1; | |
69 | +//} | |
70 | +// | |
71 | +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { | |
72 | +// interp.type = *ptr; | |
73 | +// ptr++; | |
74 | +// deserializeLemma(ptr, interp.lemma); | |
75 | +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
76 | +// ptr += 2; | |
77 | +// interp.nameClassifier = *ptr; | |
78 | +// ptr++; | |
79 | +//} | |
80 | +// | |
81 | +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { | |
82 | +// const unsigned char* currPtr = ptr; | |
83 | +// uint8_t interpsNum = *ptr; | |
84 | +// interps.clear(); | |
85 | +// interps.reserve(interpsNum); | |
86 | +// currPtr++; | |
87 | +// for (unsigned int i = 0; i < interpsNum; ++i) { | |
88 | +// EncodedInterpretation interp; | |
89 | +// deserializeInterp(currPtr, interp); | |
90 | +// interps.push_back(interp); | |
91 | +// } | |
92 | +// return currPtr - ptr; | |
93 | +//} | |
94 | + | |
... | ... |
morfeusz/MorphDeserializer.hpp
... | ... | @@ -10,19 +10,31 @@ |
10 | 10 | |
11 | 11 | #include <vector> |
12 | 12 | #include "fsa.hpp" |
13 | -#include "EncodedInterpretation.hpp" | |
13 | +#include "InterpsGroup.hpp" | |
14 | 14 | |
15 | -class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { | |
15 | +class MorphDeserializer: public Deserializer<std::vector<InterpsGroup>> { | |
16 | 16 | public: |
17 | 17 | MorphDeserializer(); |
18 | 18 | MorphDeserializer(const MorphDeserializer& orig); |
19 | 19 | virtual ~MorphDeserializer(); |
20 | 20 | long deserialize( |
21 | 21 | const unsigned char* ptr, |
22 | - std::vector<EncodedInterpretation>& interps) const; | |
22 | + std::vector<InterpsGroup>& interps) const; | |
23 | 23 | private: |
24 | 24 | |
25 | 25 | }; |
26 | 26 | |
27 | +//class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { | |
28 | +//public: | |
29 | +// MorphDeserializer(); | |
30 | +// MorphDeserializer(const MorphDeserializer& orig); | |
31 | +// virtual ~MorphDeserializer(); | |
32 | +// long deserialize( | |
33 | +// const unsigned char* ptr, | |
34 | +// std::vector<EncodedInterpretation>& interps) const; | |
35 | +//private: | |
36 | +// | |
37 | +//}; | |
38 | + | |
27 | 39 | #endif /* MORPHDESERIALIZER_HPP */ |
28 | 40 | |
... | ... |
morfeusz/MorphInterpretation.hpp
morfeusz/encoding/CharsetConverter.cpp renamed to morfeusz/charset/CharsetConverter.cpp
... | ... | @@ -11,6 +11,6 @@ |
11 | 11 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
12 | 12 | return utf8::next(it, end); |
13 | 13 | } |
14 | -const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { | |
14 | +char* UTF8CharsetConverter::append(uint32_t cp, char* result) const { | |
15 | 15 | return utf8::append(cp, result); |
16 | 16 | } |
... | ... |
morfeusz/encoding/CharsetConverter.hpp renamed to morfeusz/charset/CharsetConverter.hpp
... | ... | @@ -11,35 +11,35 @@ |
11 | 11 | class CharsetConverter { |
12 | 12 | public: |
13 | 13 | virtual uint32_t next(const char*& it, const char* end) const = 0; |
14 | - virtual const char* append(uint32_t cp, const char* result) const = 0; | |
14 | + virtual char* append(uint32_t cp, char* result) const = 0; | |
15 | 15 | private: |
16 | 16 | }; |
17 | 17 | |
18 | 18 | class UTF8CharsetConverter: public CharsetConverter { |
19 | 19 | public: |
20 | 20 | uint32_t next(const char*& it, const char* end) const; |
21 | - const char* append(uint32_t cp, const char* result) const; | |
21 | + char* append(uint32_t cp, char* result) const; | |
22 | 22 | private: |
23 | 23 | }; |
24 | 24 | |
25 | 25 | class UTF16CharsetConverter: public CharsetConverter { |
26 | 26 | public: |
27 | 27 | uint32_t next(const char*& it, const char* end) const; |
28 | - const char* append(uint32_t cp, const char* result) const; | |
28 | + char* append(uint32_t cp, char* result) const; | |
29 | 29 | private: |
30 | 30 | }; |
31 | 31 | |
32 | 32 | class UTF32CharsetConverter: public CharsetConverter { |
33 | 33 | public: |
34 | 34 | uint32_t next(const char*& it, const char* end) const; |
35 | - const char* append(uint32_t cp, const char* result) const; | |
35 | + char* append(uint32_t cp, char* result) const; | |
36 | 36 | private: |
37 | 37 | }; |
38 | 38 | |
39 | 39 | class ISO8859_2_CharsetConverter: public CharsetConverter { |
40 | 40 | public: |
41 | 41 | uint32_t next(const char*& it, const char* end) const; |
42 | - const char* append(uint32_t cp, const char* result) const; | |
42 | + char* append(uint32_t cp, char* result) const; | |
43 | 43 | private: |
44 | 44 | }; |
45 | 45 | |
... | ... |
morfeusz/charset/charset_utils.hpp
0 → 100644
morfeusz/encoding/utf8.h renamed to morfeusz/charset/utf8.h
morfeusz/encoding/utf8/checked.h renamed to morfeusz/charset/utf8/checked.h
morfeusz/encoding/utf8/core.h renamed to morfeusz/charset/utf8/core.h
morfeusz/encoding/utf8/unchecked.h renamed to morfeusz/charset/utf8/unchecked.h
morfeusz/test_morph.cpp
... | ... | @@ -18,7 +18,7 @@ |
18 | 18 | using namespace std; |
19 | 19 | |
20 | 20 | void doTest( |
21 | - const FSA<vector<EncodedInterpretation>>& fsa, | |
21 | + const FSA<vector<InterpsGroup>>& fsa, | |
22 | 22 | const Tagset& tagset, |
23 | 23 | // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, |
24 | 24 | const char* fname) { |
... | ... | @@ -32,14 +32,15 @@ void doTest( |
32 | 32 | string lemma = splitVector[1]; |
33 | 33 | string tag = splitVector[2]; |
34 | 34 | string name = splitVector[3]; |
35 | - vector<EncodedInterpretation> value2; | |
35 | + vector<InterpsGroup> value2; | |
36 | 36 | fsa.tryToRecognize(orth.c_str(), value2); |
37 | 37 | DEBUG("recognized "+to_string(value2.size())); |
38 | 38 | // vector<TaggedInterpretation> parsedValues; |
39 | 39 | bool found = false; |
40 | - for (EncodedInterpretation encodedInterp: value2) { | |
40 | + for (InterpsGroup gi: value2) | |
41 | + for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) { | |
41 | 42 | // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); |
42 | - MorphInterpretation interp(0, 0, orth, encodedInterp, tagset); | |
43 | +// (0, 0, orth, encodedInterp, tagset); | |
43 | 44 | // parsedValues.push_back(parsedValue); |
44 | 45 | // debug(orth, parsedValue); |
45 | 46 | if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { |
... | ... | @@ -62,10 +63,7 @@ int main(int argc, char** argv) { |
62 | 63 | validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); |
63 | 64 | const unsigned char* fsaData = readFile(argv[1]); |
64 | 65 | MorphDeserializer deserializer; |
65 | - DEBUG("will read FSA"); | |
66 | - FSA<vector<EncodedInterpretation>>* fsa = FSA<vector<EncodedInterpretation>>::getFSA(fsaData, deserializer); | |
67 | - DEBUG("DONE read FSA"); | |
68 | - DEBUG("will read tagset"); | |
66 | + FSA<vector<InterpsGroup>>* fsa = FSA<vector<InterpsGroup>>::getFSA(fsaData, deserializer); | |
69 | 67 | Tagset tagset(fsaData); |
70 | 68 | // TaggedInterpretationsDecoder interpsDecoder(tagset); |
71 | 69 | DEBUG("DONE read tagset"); |
... | ... |
nbproject/configurations.xml
... | ... | @@ -8,11 +8,13 @@ |
8 | 8 | <in>test_speed.cpp</in> |
9 | 9 | </df> |
10 | 10 | <df root="morfeusz" name="1"> |
11 | - <df name="encoding"> | |
11 | + <df name="charset"> | |
12 | 12 | <in>CharsetConverter.cpp</in> |
13 | - <in>CharsetConverter.hpp</in> | |
13 | + <in>charset_utils.hpp</in> | |
14 | 14 | </df> |
15 | + <in>InterpsGroup.hpp</in> | |
15 | 16 | <in>Morfeusz.cpp</in> |
17 | + <in>Morfeusz_impl.hpp</in> | |
16 | 18 | <in>MorphDeserializer.cpp</in> |
17 | 19 | <in>MorphInterpretation.cpp</in> |
18 | 20 | <in>Tagset.cpp</in> |
... | ... | @@ -51,11 +53,19 @@ |
51 | 53 | <executablePath>build/fsa/test_dict</executablePath> |
52 | 54 | </makeTool> |
53 | 55 | </makefileType> |
54 | - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> | |
56 | + <folder path="1"> | |
55 | 57 | <ccTool> |
56 | 58 | <incDir> |
57 | 59 | <pElem>fsa</pElem> |
58 | - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
60 | + <pElem>build/morfeusz</pElem> | |
61 | + </incDir> | |
62 | + </ccTool> | |
63 | + </folder> | |
64 | + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> | |
65 | + <ccTool> | |
66 | + <incDir> | |
67 | + <pElem>fsa</pElem> | |
68 | + <pElem>build/morfeusz</pElem> | |
59 | 69 | </incDir> |
60 | 70 | </ccTool> |
61 | 71 | </item> |
... | ... | @@ -80,86 +90,45 @@ |
80 | 90 | </incDir> |
81 | 91 | </ccTool> |
82 | 92 | </item> |
93 | + <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0"> | |
94 | + </item> | |
83 | 95 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> |
84 | 96 | <ccTool> |
85 | - <incDir> | |
86 | - <pElem>fsa</pElem> | |
87 | - <pElem>build/morfeusz</pElem> | |
88 | - </incDir> | |
89 | 97 | </ccTool> |
90 | 98 | </item> |
99 | + <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0"> | |
100 | + </item> | |
91 | 101 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> |
92 | 102 | <ccTool> |
93 | - <incDir> | |
94 | - <pElem>fsa</pElem> | |
95 | - <pElem>build/morfeusz</pElem> | |
96 | - </incDir> | |
97 | 103 | </ccTool> |
98 | 104 | </item> |
99 | - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> | |
105 | + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> | |
100 | 106 | <ccTool> |
101 | - <incDir> | |
102 | - <pElem>morfeusz</pElem> | |
103 | - <pElem>/usr/include/c++/4.8/bits</pElem> | |
104 | - <pElem>/usr/include/c++/4.8/ext</pElem> | |
105 | - <pElem>/usr/include/c++/4.8</pElem> | |
106 | - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | |
107 | - <pElem>/usr/include/c++/4.8/debug</pElem> | |
108 | - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
109 | - <pElem>/usr/include/c++/4.8/backward</pElem> | |
110 | - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | |
111 | - <pElem>build/morfeusz</pElem> | |
112 | - </incDir> | |
113 | 107 | </ccTool> |
114 | 108 | </item> |
115 | - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> | |
109 | + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | |
116 | 110 | <ccTool> |
117 | - <incDir> | |
118 | - <pElem>morfeusz</pElem> | |
119 | - <pElem>/usr/include/c++/4.8/bits</pElem> | |
120 | - <pElem>/usr/include/c++/4.8/ext</pElem> | |
121 | - <pElem>/usr/include/c++/4.8</pElem> | |
122 | - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | |
123 | - <pElem>/usr/include/c++/4.8/debug</pElem> | |
124 | - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
125 | - <pElem>fsa</pElem> | |
126 | - <pElem>/usr/include/c++/4.8/backward</pElem> | |
127 | - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | |
128 | - <pElem>build/morfeusz</pElem> | |
129 | - </incDir> | |
130 | 111 | </ccTool> |
131 | 112 | </item> |
132 | - <item path="morfeusz/encoding/CharsetConverter.cpp" | |
113 | + <item path="morfeusz/charset/CharsetConverter.cpp" | |
133 | 114 | ex="false" |
134 | 115 | tool="1" |
135 | - flavor2="0"> | |
116 | + flavor2="8"> | |
117 | + <ccTool> | |
118 | + </ccTool> | |
136 | 119 | </item> |
137 | - <item path="morfeusz/encoding/CharsetConverter.hpp" | |
138 | - ex="false" | |
139 | - tool="3" | |
140 | - flavor2="0"> | |
120 | + <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0"> | |
141 | 121 | </item> |
142 | 122 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
143 | 123 | <ccTool> |
144 | - <incDir> | |
145 | - <pElem>fsa</pElem> | |
146 | - <pElem>build/morfeusz</pElem> | |
147 | - </incDir> | |
148 | 124 | </ccTool> |
149 | 125 | </item> |
150 | 126 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
151 | 127 | <ccTool> |
152 | - <incDir> | |
153 | - <pElem>morfeusz</pElem> | |
154 | - </incDir> | |
155 | 128 | </ccTool> |
156 | 129 | </item> |
157 | 130 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> |
158 | 131 | <ccTool> |
159 | - <incDir> | |
160 | - <pElem>fsa</pElem> | |
161 | - <pElem>build/morfeusz</pElem> | |
162 | - </incDir> | |
163 | 132 | </ccTool> |
164 | 133 | </item> |
165 | 134 | </conf> |
... | ... |