Commit a9d3e65c15f2e43bc637fbda8342a6242dc1174f

Authored by Michał Lenart
1 parent f23aead2

- refaktoryzacja, odkomentowanie na-razie-niedziałających kawałków kodu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@20 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/common.py
@@ -14,7 +14,7 @@ class Lemma(object): @@ -14,7 +14,7 @@ class Lemma(object):
14 14
15 class Interpretation(object): 15 class Interpretation(object):
16 16
17 - def __init__(self, orth, base, tagnum, namenum, encoder): 17 + def __init__(self, orth, base, tagnum, namenum, typenum, encoder):
18 assert type(orth) == unicode 18 assert type(orth) == unicode
19 assert type(base) == unicode 19 assert type(base) == unicode
20 root = u'' 20 root = u''
@@ -29,6 +29,7 @@ class Interpretation(object): @@ -29,6 +29,7 @@ class Interpretation(object):
29 suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) 29 suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False))
30 self.tagnum = tagnum 30 self.tagnum = tagnum
31 self.namenum = namenum 31 self.namenum = namenum
  32 + self.typenum = typenum
32 33
33 def getSortKey(self): 34 def getSortKey(self):
34 return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) 35 return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum)
fsabuilder/fsa/convertinput.py
@@ -6,6 +6,398 @@ Created on Oct 23, 2013 @@ -6,6 +6,398 @@ Created on Oct 23, 2013
6 import logging 6 import logging
7 from common import Interpretation 7 from common import Interpretation
8 8
  9 +tag2typenum = {
  10 + 'aglt:sg:pri:imperf:nwok': 12,
  11 + 'aglt:sg:pri:imperf:wok': 12,
  12 + 'aglt:sg:sec:imperf:nwok': 12,
  13 + 'aglt:sg:sec:imperf:wok': 12,
  14 + 'aglt:pl:pri:imperf:nwok': 13,
  15 + 'aglt:pl:pri:imperf:wok': 13,
  16 + 'aglt:pl:sec:imperf:nwok': 13,
  17 + 'aglt:pl:sec:imperf:wok': 13,
  18 + 'praet:sg:m1.m2.m3:imperf:agl': 7,
  19 + 'praet:sg:m1.m2.m3:imperf.perf:agl': 7,
  20 + 'praet:sg:m1.m2.m3:perf:agl': 7,
  21 + 'praet:sg:m1.m2.m3:imperf:nagl': 16,
  22 + 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16,
  23 + 'praet:sg:m1.m2.m3:perf:nagl': 16,
  24 + 'praet:sg:f:imperf': 20,
  25 + 'praet:sg:f:imperf.perf': 20,
  26 + 'praet:sg:f:perf': 20,
  27 + 'praet:sg:m1.m2.m3:imperf': 20,
  28 + 'praet:sg:m1.m2.m3:imperf.perf': 20,
  29 + 'praet:sg:m1.m2.m3:perf': 20,
  30 + 'praet:sg:n1.n2:imperf': 20,
  31 + 'praet:sg:n1.n2:imperf.perf': 20,
  32 + 'praet:sg:n1.n2:perf': 20,
  33 + 'praet:pl:m1.p1:imperf': 21,
  34 + 'praet:pl:m1.p1:imperf.perf': 21,
  35 + 'praet:pl:m1.p1:perf': 21,
  36 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21,
  37 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21,
  38 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21,
  39 + 'naj': 10,
  40 + 'nie': 5,
  41 + 'adj:pl:acc:m1.p1:pos': 1,
  42 + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  43 + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  44 + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  45 + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  46 + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  47 + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  48 + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  49 + 'adj:pl:nom.voc:m1.p1:pos': 1,
  50 + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  51 + 'adj:sg:acc:m1.m2:pos': 1,
  52 + 'adj:sg:acc:n1.n2:pos': 1,
  53 + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
  54 + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
  55 + 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1,
  56 + 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1,
  57 + 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
  58 + 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1,
  59 + 'adj:sg:nom.voc:n1.n2:pos': 1,
  60 + 'adj:sg:acc:f:pos': 1,
  61 + 'adj:sg:acc.inst:f:pos': 1,
  62 + 'adj:sg:acc:m1.m2:pos': 1,
  63 + 'adj:sg:acc:m3:pos': 1,
  64 + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
  65 + 'adj:sg:gen.dat.loc:f:pos': 1,
  66 + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
  67 + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1,
  68 + 'adj:sg:nom.voc.acc:n1.n2:pos': 1,
  69 + 'adj:sg:nom.voc:f:pos': 1,
  70 + 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
  71 + 'adj:pl:acc:f:pos': 1,
  72 + 'adj:pl:acc:m1:pos': 1,
  73 + 'adj:pl:acc:m2:pos': 1,
  74 + 'adj:pl:acc:m3:pos': 1,
  75 + 'adj:pl:acc:n1:pos': 1,
  76 + 'adj:pl:acc:n2:pos': 1,
  77 + 'adj:pl:acc:p1:pos': 1,
  78 + 'adj:pl:acc:p2:pos': 1,
  79 + 'adj:pl:acc:p3:pos': 1,
  80 + 'adj:pl:dat:f:pos': 1,
  81 + 'adj:pl:dat:m1:pos': 1,
  82 + 'adj:pl:dat:m2:pos': 1,
  83 + 'adj:pl:dat:m3:pos': 1,
  84 + 'adj:pl:dat:n1:pos': 1,
  85 + 'adj:pl:dat:n2:pos': 1,
  86 + 'adj:pl:dat:p1:pos': 1,
  87 + 'adj:pl:dat:p2:pos': 1,
  88 + 'adj:pl:dat:p3:pos': 1,
  89 + 'adj:pl:gen:f:pos': 1,
  90 + 'adj:pl:gen:m1:pos': 1,
  91 + 'adj:pl:gen:m2:pos': 1,
  92 + 'adj:pl:gen:m3:pos': 1,
  93 + 'adj:pl:gen:n1:pos': 1,
  94 + 'adj:pl:gen:n2:pos': 1,
  95 + 'adj:pl:gen:p1:pos': 1,
  96 + 'adj:pl:gen:p2:pos': 1,
  97 + 'adj:pl:gen:p3:pos': 1,
  98 + 'adj:pl:inst:f:pos': 1,
  99 + 'adj:pl:inst:m1:pos': 1,
  100 + 'adj:pl:inst:m2:pos': 1,
  101 + 'adj:pl:inst:m3:pos': 1,
  102 + 'adj:pl:inst:n1:pos': 1,
  103 + 'adj:pl:inst:n2:pos': 1,
  104 + 'adj:pl:inst:p1:pos': 1,
  105 + 'adj:pl:inst:p2:pos': 1,
  106 + 'adj:pl:inst:p3:pos': 1,
  107 + 'adj:pl:loc:f:pos': 1,
  108 + 'adj:pl:loc:m1:pos': 1,
  109 + 'adj:pl:loc:m2:pos': 1,
  110 + 'adj:pl:loc:m3:pos': 1,
  111 + 'adj:pl:loc:n1:pos': 1,
  112 + 'adj:pl:loc:n2:pos': 1,
  113 + 'adj:pl:loc:p1:pos': 1,
  114 + 'adj:pl:loc:p2:pos': 1,
  115 + 'adj:pl:loc:p3:pos': 1,
  116 + 'adj:pl:nom:f:pos': 1,
  117 + 'adj:pl:nom:m1:pos': 1,
  118 + 'adj:pl:nom:m2:pos': 1,
  119 + 'adj:pl:nom:m3:pos': 1,
  120 + 'adj:pl:nom:n1:pos': 1,
  121 + 'adj:pl:nom:n2:pos': 1,
  122 + 'adj:pl:nom:p1:pos': 1,
  123 + 'adj:pl:nom:p2:pos': 1,
  124 + 'adj:pl:nom:p3:pos': 1,
  125 + 'adj:sg:acc:f:pos': 1,
  126 + 'adj:sg:acc:m1:pos': 1,
  127 + 'adj:sg:acc:m2:pos': 1,
  128 + 'adj:sg:acc:m3:pos': 1,
  129 + 'adj:sg:acc:n1:pos': 1,
  130 + 'adj:sg:acc:n2:pos': 1,
  131 + 'adj:sg:dat:f:pos': 1,
  132 + 'adj:sg:dat:m1:pos': 1,
  133 + 'adj:sg:dat:m2:pos': 1,
  134 + 'adj:sg:dat:m3:pos': 1,
  135 + 'adj:sg:dat:n1:pos': 1,
  136 + 'adj:sg:dat:n2:pos': 1,
  137 + 'adj:sg:gen:f:pos': 1,
  138 + 'adj:sg:gen:m1:pos': 1,
  139 + 'adj:sg:gen:m2:pos': 1,
  140 + 'adj:sg:gen:m3:pos': 1,
  141 + 'adj:sg:gen:n1:pos': 1,
  142 + 'adj:sg:gen:n2:pos': 1,
  143 + 'adj:sg:inst:f:pos': 1,
  144 + 'adj:sg:inst:m1:pos': 1,
  145 + 'adj:sg:inst:m2:pos': 1,
  146 + 'adj:sg:inst:m3:pos': 1,
  147 + 'adj:sg:inst:n1:pos': 1,
  148 + 'adj:sg:inst:n2:pos': 1,
  149 + 'adj:sg:loc:f:pos': 1,
  150 + 'adj:sg:loc:m1:pos': 1,
  151 + 'adj:sg:loc:m2:pos': 1,
  152 + 'adj:sg:loc:m3:pos': 1,
  153 + 'adj:sg:loc:n1:pos': 1,
  154 + 'adj:sg:loc:n2:pos': 1,
  155 + 'adj:sg:nom:f:pos': 1,
  156 + 'adj:sg:nom:m1:pos': 1,
  157 + 'adj:sg:nom:m2:pos': 1,
  158 + 'adj:sg:nom:m3:pos': 1,
  159 + 'adj:sg:nom:n1:pos': 1,
  160 + 'adj:sg:nom:n2:pos': 1,
  161 + 'adj:pl:acc:m1.p1:sup': 19,
  162 + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  163 + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  164 + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  165 + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  166 + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  167 + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  168 + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  169 + 'adj:pl:nom.voc:m1.p1:sup': 19,
  170 + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  171 + 'adj:sg:acc:f:sup': 19,
  172 + 'adj:sg:acc.inst:f:sup': 19,
  173 + 'adj:sg:acc:m1.m2:sup': 19,
  174 + 'adj:sg:acc:m3:sup': 19,
  175 + 'adj:sg:acc:n1.n2:sup': 19,
  176 + 'adj:sg:dat:f:sup': 19,
  177 + 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19,
  178 + 'adj:sg:gen:f:sup': 19,
  179 + 'adj:sg:gen.dat.loc:f:sup': 19,
  180 + 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19,
  181 + 'adj:sg:inst:f:sup': 19,
  182 + 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19,
  183 + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19,
  184 + 'adj:sg:loc:f:sup': 19,
  185 + 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19,
  186 + 'adj:sg:nom.acc:n1.n2:sup': 19,
  187 + 'adj:sg:nom.voc:f:sup': 19,
  188 + 'adj:sg:nom.voc:m1.m2.m3:sup': 19,
  189 + 'adj:sg:nom.voc:n1.n2:sup': 19,
  190 + 'adj:pl:acc:f:sup': 19,
  191 + 'adj:pl:acc:m1:sup': 19,
  192 + 'adj:pl:acc:m2:sup': 19,
  193 + 'adj:pl:acc:m3:sup': 19,
  194 + 'adj:pl:acc:n1:sup': 19,
  195 + 'adj:pl:acc:n2:sup': 19,
  196 + 'adj:pl:acc:p1:sup': 19,
  197 + 'adj:pl:acc:p2:sup': 19,
  198 + 'adj:pl:acc:p3:sup': 19,
  199 + 'adj:pl:dat:f:sup': 19,
  200 + 'adj:pl:dat:m1:sup': 19,
  201 + 'adj:pl:dat:m2:sup': 19,
  202 + 'adj:pl:dat:m3:sup': 19,
  203 + 'adj:pl:dat:n1:sup': 19,
  204 + 'adj:pl:dat:n2:sup': 19,
  205 + 'adj:pl:dat:p1:sup': 19,
  206 + 'adj:pl:dat:p2:sup': 19,
  207 + 'adj:pl:dat:p3:sup': 19,
  208 + 'adj:pl:gen:f:sup': 19,
  209 + 'adj:pl:gen:m1:sup': 19,
  210 + 'adj:pl:gen:m2:sup': 19,
  211 + 'adj:pl:gen:m3:sup': 19,
  212 + 'adj:pl:gen:n1:sup': 19,
  213 + 'adj:pl:gen:n2:sup': 19,
  214 + 'adj:pl:gen:p1:sup': 19,
  215 + 'adj:pl:gen:p2:sup': 19,
  216 + 'adj:pl:gen:p3:sup': 19,
  217 + 'adj:pl:inst:f:sup': 19,
  218 + 'adj:pl:inst:m1:sup': 19,
  219 + 'adj:pl:inst:m2:sup': 19,
  220 + 'adj:pl:inst:m3:sup': 19,
  221 + 'adj:pl:inst:n1:sup': 19,
  222 + 'adj:pl:inst:n2:sup': 19,
  223 + 'adj:pl:inst:p1:sup': 19,
  224 + 'adj:pl:inst:p2:sup': 19,
  225 + 'adj:pl:inst:p3:sup': 19,
  226 + 'adj:pl:loc:f:sup': 19,
  227 + 'adj:pl:loc:m1:sup': 19,
  228 + 'adj:pl:loc:m2:sup': 19,
  229 + 'adj:pl:loc:m3:sup': 19,
  230 + 'adj:pl:loc:n1:sup': 19,
  231 + 'adj:pl:loc:n2:sup': 19,
  232 + 'adj:pl:loc:p1:sup': 19,
  233 + 'adj:pl:loc:p2:sup': 19,
  234 + 'adj:pl:loc:p3:sup': 19,
  235 + 'adj:pl:nom:f:sup': 19,
  236 + 'adj:pl:nom:m1:sup': 19,
  237 + 'adj:pl:nom:m2:sup': 19,
  238 + 'adj:pl:nom:m3:sup': 19,
  239 + 'adj:pl:nom:n1:sup': 19,
  240 + 'adj:pl:nom:n2:sup': 19,
  241 + 'adj:pl:nom:p1:sup': 19,
  242 + 'adj:pl:nom:p2:sup': 19,
  243 + 'adj:pl:nom:p3:sup': 19,
  244 + 'adj:sg:acc:f:sup': 19,
  245 + 'adj:sg:acc:m1:sup': 19,
  246 + 'adj:sg:acc:m2:sup': 19,
  247 + 'adj:sg:acc:m3:sup': 19,
  248 + 'adj:sg:acc:n1:sup': 19,
  249 + 'adj:sg:acc:n2:sup': 19,
  250 + 'adj:sg:dat:f:sup': 19,
  251 + 'adj:sg:dat:m1:sup': 19,
  252 + 'adj:sg:dat:m2:sup': 19,
  253 + 'adj:sg:dat:m3:sup': 19,
  254 + 'adj:sg:dat:n1:sup': 19,
  255 + 'adj:sg:dat:n2:sup': 19,
  256 + 'adj:sg:gen:f:sup': 19,
  257 + 'adj:sg:gen:m1:sup': 19,
  258 + 'adj:sg:gen:m2:sup': 19,
  259 + 'adj:sg:gen:m3:sup': 19,
  260 + 'adj:sg:gen:n1:sup': 19,
  261 + 'adj:sg:gen:n2:sup': 19,
  262 + 'adj:sg:inst:f:sup': 19,
  263 + 'adj:sg:inst:m1:sup': 19,
  264 + 'adj:sg:inst:m2:sup': 19,
  265 + 'adj:sg:inst:m3:sup': 19,
  266 + 'adj:sg:inst:n1:sup': 19,
  267 + 'adj:sg:inst:n2:sup': 19,
  268 + 'adj:sg:loc:f:sup': 19,
  269 + 'adj:sg:loc:m1:sup': 19,
  270 + 'adj:sg:loc:m2:sup': 19,
  271 + 'adj:sg:loc:m3:sup': 19,
  272 + 'adj:sg:loc:n1:sup': 19,
  273 + 'adj:sg:loc:n2:sup': 19,
  274 + 'adj:sg:nom:f:sup': 19,
  275 + 'adj:sg:nom:m1:sup': 19,
  276 + 'adj:sg:nom:m2:sup': 19,
  277 + 'adj:sg:nom:m3:sup': 19,
  278 + 'adj:sg:nom:n1:sup': 19,
  279 + 'adj:sg:nom:n2:sup': 19,
  280 + 'adv:sup': 19,
  281 + 'winien:sg:m1.m2.m3:imperf': 3,
  282 + 'winien:sg:f:imperf': 3,
  283 + 'winien:sg:n1.n2:imperf': 3,
  284 + 'winien:pl:m1.p1:imperf': 3,
  285 + 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3,
  286 + 'adja': 15,
  287 + 'ger:sg:dat.loc:n2:imperf:neg': 18,
  288 + 'ger:sg:dat.loc:n2:imperf.perf:neg': 18,
  289 + 'ger:sg:dat.loc:n2:perf:neg': 18,
  290 + 'ger:sg:gen:n2:imperf:neg': 18,
  291 + 'ger:sg:gen:n2:imperf.perf:neg': 18,
  292 + 'ger:sg:gen:n2:perf:neg': 18,
  293 + 'ger:sg:inst:n2:imperf:neg': 18,
  294 + 'ger:sg:inst:n2:imperf.perf:neg': 18,
  295 + 'ger:sg:inst:n2:perf:neg': 18,
  296 + 'ger:sg:nom.acc:n2:imperf:neg': 18,
  297 + 'ger:sg:nom.acc:n2:imperf.perf:neg': 18,
  298 + 'ger:sg:nom.acc:n2:perf:neg': 18,
  299 + 'pact:pl:acc:m1.p1:imperf:neg': 18,
  300 + 'pact:pl:acc:m1.p1:imperf.perf:neg': 18,
  301 + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  302 + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  303 + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  304 + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  305 + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  306 + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  307 + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
  308 + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
  309 + 'pact:pl:nom:m1.p1:imperf:neg': 18,
  310 + 'pact:pl:nom:m1.p1:imperf.perf:neg': 18,
  311 + 'pact:sg:acc.inst:f:imperf:neg': 18,
  312 + 'pact:sg:acc.inst:f:imperf.perf:neg': 18,
  313 + 'pact:sg:acc:m1.m2:imperf:neg': 18,
  314 + 'pact:sg:acc:m1.m2:imperf.perf:neg': 18,
  315 + 'pact:sg:acc:m3:imperf:neg': 18,
  316 + 'pact:sg:acc:m3:imperf.perf:neg': 18,
  317 + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
  318 + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  319 + 'pact:sg:gen.dat.loc:f:imperf:neg': 18,
  320 + 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18,
  321 + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
  322 + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  323 + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
  324 + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  325 + 'pact:sg:nom.acc:n1.n2:imperf:neg': 18,
  326 + 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
  327 + 'pact:sg:nom:f:imperf:neg': 18,
  328 + 'pact:sg:nom:f:imperf.perf:neg': 18,
  329 + 'pact:sg:nom:m1.m2.m3:imperf:neg': 18,
  330 + 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
  331 + 'ppas:pl:acc:m1.p1:imperf:neg': 18,
  332 + 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18,
  333 + 'ppas:pl:acc:m1.p1:perf:neg': 18,
  334 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  335 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  336 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  337 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  338 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  339 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  340 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  341 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  342 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  343 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
  344 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
  345 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18,
  346 + 'ppas:pl:nom:m1.p1:imperf:neg': 18,
  347 + 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18,
  348 + 'ppas:pl:nom:m1.p1:perf:neg': 18,
  349 + 'ppas:sg:acc.inst:f:imperf:neg': 18,
  350 + 'ppas:sg:acc.inst:f:imperf.perf:neg': 18,
  351 + 'ppas:sg:acc.inst:f:perf:neg': 18,
  352 + 'ppas:sg:acc:m1.m2:imperf:neg': 18,
  353 + 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18,
  354 + 'ppas:sg:acc:m1.m2:perf:neg': 18,
  355 + 'ppas:sg:acc:m3:imperf:neg': 18,
  356 + 'ppas:sg:acc:m3:imperf.perf:neg': 18,
  357 + 'ppas:sg:acc:m3:perf:neg': 18,
  358 + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
  359 + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  360 + 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18,
  361 + 'ppas:sg:gen.dat.loc:f:imperf:neg': 18,
  362 + 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18,
  363 + 'ppas:sg:gen.dat.loc:f:perf:neg': 18,
  364 + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
  365 + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  366 + 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18,
  367 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
  368 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  369 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18,
  370 + 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18,
  371 + 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
  372 + 'ppas:sg:nom.acc:n1.n2:perf:neg': 18,
  373 + 'ppas:sg:nom:f:imperf:neg': 18,
  374 + 'ppas:sg:nom:f:imperf.perf:neg': 18,
  375 + 'ppas:sg:nom:f:perf:neg': 18,
  376 + 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18,
  377 + 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
  378 + 'ppas:sg:nom:m1.m2.m3:perf:neg': 18,
  379 + 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8,
  380 + 'prep:acc': 6,
  381 + 'prep:acc:wok': 6,
  382 + 'prep:acc.inst': 6,
  383 + 'prep:acc.inst:wok': 6,
  384 + 'prep:inst.acc': 6,
  385 + 'prep:inst.acc:wok': 6,
  386 + 'prep:inst.gen.acc:wok': 6,
  387 + 'prep:acc.loc': 6,
  388 + 'prep:acc.loc:wok': 6,
  389 + 'prep:loc.acc': 6,
  390 + 'prep:loc.acc:wok': 6,
  391 + 'prep:gen': 6,
  392 + 'prep:gen.dat': 6,
  393 + 'prep:gen:wok': 6,
  394 + 'prep:gen.inst:wok': 6,
  395 + 'brev:pun': 9,
  396 + 'brev:npun': 9,
  397 + 'intrj': 9,
  398 + 'burk': 9,
  399 +}
  400 +
9 def _sortLines(inputLines, encoder): 401 def _sortLines(inputLines, encoder):
10 logging.info('sorting input...') 402 logging.info('sorting input...')
11 lines = list(inputLines) 403 lines = list(inputLines)
@@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder): @@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder):
22 orth, base, tag, name = line.split(u'\t') 414 orth, base, tag, name = line.split(u'\t')
23 tagnum = tagset.tag2tagnum[tag] 415 tagnum = tagset.tag2tagnum[tag]
24 namenum = tagset.name2namenum[name] 416 namenum = tagset.name2namenum[name]
25 - yield (orth, Interpretation(orth, base, tagnum, namenum, encoder)) 417 + typenum = tag2typenum.get(tag, 0)
  418 + yield (orth, Interpretation(orth, base, tagnum, namenum, typenum, encoder))
26 419
27 def _mergeEntries(inputLines): 420 def _mergeEntries(inputLines):
28 prevOrth = None 421 prevOrth = None
fsabuilder/fsa/encode.py
@@ -61,11 +61,16 @@ class MorphEncoder(Encoder): @@ -61,11 +61,16 @@ class MorphEncoder(Encoder):
61 res.append(firstByte) 61 res.append(firstByte)
62 assert type(interpsList) == frozenset 62 assert type(interpsList) == frozenset
63 for interp in sorted(interpsList, key=lambda i: i.getSortKey()): 63 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  64 + res.extend(self._encodeTypeNum(interp.typenum))
64 res.extend(self._encodeLemma(interp.lemma)) 65 res.extend(self._encodeLemma(interp.lemma))
65 res.extend(self._encodeTagNum(interp.tagnum)) 66 res.extend(self._encodeTagNum(interp.tagnum))
66 res.extend(self._encodeNameNum(interp.namenum)) 67 res.extend(self._encodeNameNum(interp.namenum))
67 return res 68 return res
68 69
  70 + def _encodeTypeNum(self, typenum):
  71 + assert typenum >= 0 and typenum < 256
  72 + return bytearray([typenum])
  73 +
69 def _encodeLemma(self, lemma): 74 def _encodeLemma(self, lemma):
70 res = bytearray() 75 res = bytearray()
71 assert lemma.cutLength < 256 and lemma.cutLength >= 0 76 assert lemma.cutLength < 256 and lemma.cutLength >= 0
fsabuilder/fsa/serializer.py
@@ -40,7 +40,6 @@ class Serializer(object): @@ -40,7 +40,6 @@ class Serializer(object):
40 raise NotImplementedError('Not implemented') 40 raise NotImplementedError('Not implemented')
41 41
42 def fsa2bytearray(self): 42 def fsa2bytearray(self):
43 -  
44 res = bytearray() 43 res = bytearray()
45 res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) 44 res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset)))
46 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) 45 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
morfeusz/CMakeLists.txt
@@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa)
7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) 7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 add_executable (morfeusz2_analyze main.cpp) 8 add_executable (morfeusz2_analyze main.cpp)
9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) 9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp)
10 -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) 10 +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp)
11 11
12 # Link the executable to the Hello library. 12 # Link the executable to the Hello library.
13 target_link_libraries (morfeusz2_analyze morfeusz2) 13 target_link_libraries (morfeusz2_analyze morfeusz2)
morfeusz/EncodedInterpretation.hpp
@@ -28,8 +28,11 @@ struct EncodedLemma { @@ -28,8 +28,11 @@ struct EncodedLemma {
28 */ 28 */
29 struct EncodedInterpretation { 29 struct EncodedInterpretation {
30 EncodedLemma lemma; 30 EncodedLemma lemma;
  31 + int type;
31 int tag; 32 int tag;
32 int nameClassifier; 33 int nameClassifier;
  34 + int startNode;
  35 + int endNode;
33 }; 36 };
34 37
35 #endif /* INTERPRETATION_HPP */ 38 #endif /* INTERPRETATION_HPP */
morfeusz/InterpsGroup.hpp 0 → 100644
  1 +/*
  2 + * File: GroupedInterpretations.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 16, 2013, 7:58 PM
  6 + */
  7 +
  8 +#ifndef GROUPEDINTERPRETATIONS_HPP
  9 +#define GROUPEDINTERPRETATIONS_HPP
  10 +
  11 +#include <vector>
  12 +#include <string>
  13 +#include "EncodedInterpretation.hpp"
  14 +#include "MorphInterpretation.hpp"
  15 +#include "Tagset.hpp"
  16 +
  17 +class InterpsGroup {
  18 +public:
  19 +
  20 + InterpsGroup() {
  21 +
  22 + }
  23 +
  24 + explicit InterpsGroup(const int type)
  25 + : type(type) {
  26 +
  27 + }
  28 +
  29 + std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) {
  30 + std::vector<MorphInterpretation> res;
  31 + for (EncodedInterpretation& ei: interps) {
  32 + res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset));
  33 + }
  34 + return res;
  35 + }
  36 +
  37 + void addInterpretation(const EncodedInterpretation& interp) {
  38 + interps.push_back(interp);
  39 + }
  40 +
  41 + int type;
  42 + int startNode;
  43 + int endNode;
  44 +private:
  45 + std::vector<EncodedInterpretation> interps;
  46 +};
  47 +
  48 +#endif /* GROUPEDINTERPRETATIONS_HPP */
  49 +
morfeusz/Morfeusz.cpp
@@ -6,17 +6,18 @@ @@ -6,17 +6,18 @@
6 */ 6 */
7 7
8 #include <string> 8 #include <string>
  9 +#include "fsa.hpp"
9 #include "utils.hpp" 10 #include "utils.hpp"
10 #include "Morfeusz.hpp" 11 #include "Morfeusz.hpp"
11 #include "MorphDeserializer.hpp" 12 #include "MorphDeserializer.hpp"
12 -#include "encoding/CharsetConverter.hpp" 13 +#include "charset/CharsetConverter.hpp"
13 14
14 using namespace std; 15 using namespace std;
15 16
16 -static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) {  
17 - static Deserializer<vector<EncodedInterpretation>>* deserializer  
18 - = new MorphDeserializer();  
19 - return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); 17 +static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) {
  18 + static Deserializer < vector < InterpsGroup >> *deserializer
  19 + = new MorphDeserializer();
  20 + return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer);
20 } 21 }
21 22
22 static CharsetConverter* initializeCharsetConverter() { 23 static CharsetConverter* initializeCharsetConverter() {
@@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() { @@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() {
26 27
27 Morfeusz::Morfeusz(const string& filename) 28 Morfeusz::Morfeusz(const string& filename)
28 : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { 29 : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) {
29 - 30 +
30 } 31 }
31 32
32 //Morfeusz::Morfeusz(const Morfeusz& orig) { 33 //Morfeusz::Morfeusz(const Morfeusz& orig) {
@@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() { @@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() {
36 delete &this->fsa; 37 delete &this->fsa;
37 } 38 }
38 39
39 -AnalyzeResult Morfeusz::analyze(const std::string& text) {  
40 - const char* textStart = text.c_str();  
41 - const char* textEnd = text.c_str() + text.length();  
42 - AnalyzeResult res = {  
43 - ResultsIterator(textStart, textEnd, *this),  
44 - ResultsIterator(textEnd, textEnd, *this)};  
45 - return res; 40 +ResultsIterator Morfeusz::analyze(const std::string& text) {
  41 +// const char* textStart = text.c_str();
  42 +// const char* textEnd = text.c_str() + text.length();
  43 + return ResultsIterator(text, *this);
  44 +}
  45 +
  46 +ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
  47 +: rawInput(text.c_str()),
  48 +morfeusz(morfeusz) {
  49 +}
  50 +
  51 +MorphInterpretation ResultsIterator::getNext() {
  52 +// if (resultsBuffer.empty()) {
  53 +// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
  54 +// }
  55 +// startNode = resultsBuffer.back().getEndNode();
  56 +// MorphInterpretation res = resultsBuffer.front();
  57 +// resultsBuffer.pop_front();
  58 +// return res;
  59 +}
  60 +
  61 +bool ResultsIterator::hasNext() {
  62 + return rawInput[0] != '\0' && resultsBuffer.empty();
46 } 63 }
47 64
  65 +//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const {
  66 +// assert(inputPtr[0] != '\0');
  67 +// const char* start = inputPtr;
  68 +// StateType state = fsa->getInitialState();
  69 +// int currNodeNum = startNodeNum;
  70 +// int codepoint = this->charsetConverter->next(inputPtr, inputEnd);
  71 +// assert(!isEndOfWord(codepoint));
  72 +// while(!isEndOfWord(codepoint)) {
  73 +// feedState(state, codepoint);
  74 +// if (state.isAccepting()) {
  75 +// const char* currInputPtr = inputPtr;
  76 +// vector<EncodedInterpretation> startInterps = state.getValue();
  77 +// filterOutNonGluableInterps(startInterps);
  78 +// if (!startInterps.empty()) {
  79 +//
  80 +// }
  81 +// vector<EncodedInterpretation> additionalInterps;
  82 +// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps);
  83 +// if (!additionalInterps.empty()) {
  84 +// for (EncodedInterpretation& interp: state.getValue()) {
  85 +// interp.startNode = currNodeNum;
  86 +// interp.endNode = currNodeNum + 1;
  87 +// interps.push_back(interp);
  88 +// }
  89 +//
  90 +// }
  91 +// }
  92 +// }
  93 +//}
morfeusz/Morfeusz.hpp
@@ -9,53 +9,78 @@ @@ -9,53 +9,78 @@
9 #define MORFEUSZ_HPP 9 #define MORFEUSZ_HPP
10 10
11 #include <string> 11 #include <string>
  12 +#include <list>
12 #include <vector> 13 #include <vector>
13 #include "EncodedInterpretation.hpp" 14 #include "EncodedInterpretation.hpp"
14 #include "fsa.hpp" 15 #include "fsa.hpp"
15 #include "MorphInterpretation.hpp" 16 #include "MorphInterpretation.hpp"
16 -#include "encoding/CharsetConverter.hpp" 17 +#include "InterpsGroup.hpp"
  18 +#include "charset/CharsetConverter.hpp"
17 19
18 class Morfeusz; 20 class Morfeusz;
19 -class AnalyzeResult; 21 +//class AnalyzeResult;
20 class ResultsIterator; 22 class ResultsIterator;
21 23
  24 +typedef FSA<std::vector<InterpsGroup>> FSAType;
  25 +typedef State<std::vector<InterpsGroup>> StateType;
  26 +
22 class Morfeusz { 27 class Morfeusz {
23 public: 28 public:
24 explicit Morfeusz(const std::string& filename); 29 explicit Morfeusz(const std::string& filename);
25 virtual ~Morfeusz(); 30 virtual ~Morfeusz();
26 // Morfeusz(const Morfeusz& orig); 31 // Morfeusz(const Morfeusz& orig);
27 - AnalyzeResult analyze(const std::string& text); 32 + ResultsIterator analyze(const std::string& text);
28 33
29 // Morfeusz(); 34 // Morfeusz();
  35 + friend class ResultsIterator;
30 private: 36 private:
31 - void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps);  
32 - const FSA<std::vector<EncodedInterpretation>>* fsa; 37 + template <class OutputIterator>
  38 +// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const;
  39 +
  40 + int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const;
  41 +
  42 + const FSAType* fsa;
33 CharsetConverter* charsetConverter; 43 CharsetConverter* charsetConverter;
34 }; 44 };
35 45
  46 +#include "Morfeusz_impl.hpp"
  47 +
36 class ResultsIterator { 48 class ResultsIterator {
37 public: 49 public:
38 - ResultsIterator(  
39 - const char* startOfInput,  
40 - const char* endOfInput,  
41 - const Morfeusz& morfeusz);  
42 - virtual ~ResultsIterator();  
43 -// ResultsIterator(int* x);  
44 - ResultsIterator(const ResultsIterator& mit);  
45 - ResultsIterator& operator++();  
46 - ResultsIterator operator++(int);  
47 - bool operator==(const ResultsIterator& rhs);  
48 - bool operator!=(const ResultsIterator& rhs);  
49 - MorphInterpretation& operator*(); 50 + ResultsIterator(const std::string& text, const Morfeusz& morfeusz);
  51 + MorphInterpretation getNext();
  52 + bool hasNext();
50 private: 53 private:
51 const char* rawInput; 54 const char* rawInput;
52 - const char* endOfInput; 55 + const Morfeusz& morfeusz;
  56 + std::list<MorphInterpretation> resultsBuffer;
  57 + int startNode;
53 }; 58 };
54 59
55 -struct AnalyzeResult {  
56 - ResultsIterator iterator;  
57 - const ResultsIterator end;  
58 -}; 60 +//class ResultsIterator {
  61 +//public:
  62 +// ResultsIterator(
  63 +// const char* startOfInput,
  64 +// const char* endOfInput,
  65 +// const Morfeusz& morfeusz);
  66 +// virtual ~ResultsIterator();
  67 +// ResultsIterator(const ResultsIterator& mit);
  68 +// ResultsIterator& operator++();
  69 +// ResultsIterator operator++(int);
  70 +// bool operator==(const ResultsIterator& rhs);
  71 +// bool operator!=(const ResultsIterator& rhs);
  72 +// MorphInterpretation& operator*();
  73 +//private:
  74 +// const char* rawInput;
  75 +// const char* endOfInput;
  76 +// const Morfeusz& morfeusz;
  77 +// vector<MorphInterpretation> resultsBuffer;
  78 +//};
  79 +
  80 +//struct AnalyzeResult {
  81 +// ResultsIterator iterator;
  82 +// const ResultsIterator end;
  83 +//};
59 84
60 #endif /* MORFEUSZ_HPP */ 85 #endif /* MORFEUSZ_HPP */
61 86
morfeusz/Morfeusz_impl.hpp 0 → 100644
  1 +/*
  2 + * File: Morfeusz_impl.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 15, 2013, 1:43 PM
  6 + */
  7 +
  8 +#ifndef MORFEUSZ_IMPL_HPP
  9 +#define MORFEUSZ_IMPL_HPP
  10 +
  11 +#include <cassert>
  12 +#include "Morfeusz.hpp"
  13 +
  14 +//template <class OutputIterator>
  15 +//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const {
  16 +// if (inputData == inputEnd) {
  17 +// return;
  18 +// }
  19 +// const char* start = inputData;
  20 +// StateType state = fsa->getInitialState();
  21 +// int currNodeNum = startNodeNum;
  22 +// do {
  23 +// int codepoint = this->charsetConverter->next(inputData, inputEnd);
  24 +// if (!isSpace(codepoint) && codepoint != 0) {
  25 +// feedAutomaton(state, codepoint);
  26 +// if (state.isAccepting()) {
  27 +// int currInput = inputData;
  28 +// vector<MorphInterpretation> additionalInterps;
  29 +// processOneWord(
  30 +// currInput, inputEnd,
  31 +// currNodeNum + 1,
  32 +// back_inserter(additionalInterps), false);
  33 +// if (!additionalInterps.empty()) {
  34 +// currNodeNum = additionalInterps.back().getEndNode();
  35 +// }
  36 +// }
  37 +// }
  38 +// }
  39 +//}
  40 +
  41 +#endif /* MORFEUSZ_IMPL_HPP */
  42 +
morfeusz/MorphDeserializer.cpp
@@ -5,7 +5,10 @@ @@ -5,7 +5,10 @@
5 * Created on 12 listopad 2013, 15:31 5 * Created on 12 listopad 2013, 15:31
6 */ 6 */
7 7
  8 +#include <map>
8 #include "MorphDeserializer.hpp" 9 #include "MorphDeserializer.hpp"
  10 +#include "EncodedInterpretation.hpp"
  11 +#include "InterpsGroup.hpp"
9 12
10 MorphDeserializer::MorphDeserializer() { 13 MorphDeserializer::MorphDeserializer() {
11 } 14 }
@@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { @@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
25 } 28 }
26 29
27 static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { 30 static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  31 + interp.type = *ptr;
  32 + ptr++;
28 deserializeLemma(ptr, interp.lemma); 33 deserializeLemma(ptr, interp.lemma);
29 interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); 34 interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
30 ptr += 2; 35 ptr += 2;
@@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& @@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation&
32 ptr++; 37 ptr++;
33 } 38 }
34 39
35 -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { 40 +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
36 const unsigned char* currPtr = ptr; 41 const unsigned char* currPtr = ptr;
37 uint8_t interpsNum = *ptr; 42 uint8_t interpsNum = *ptr;
38 interps.clear(); 43 interps.clear();
39 interps.reserve(interpsNum); 44 interps.reserve(interpsNum);
40 currPtr++; 45 currPtr++;
  46 + // FIXME - to jest do poprawy
  47 + map<int, InterpsGroup> results;
41 for (unsigned int i = 0; i < interpsNum; ++i) { 48 for (unsigned int i = 0; i < interpsNum; ++i) {
42 EncodedInterpretation interp; 49 EncodedInterpretation interp;
43 deserializeInterp(currPtr, interp); 50 deserializeInterp(currPtr, interp);
44 - interps.push_back(interp); 51 + if (results.count(interp.type) == 0) {
  52 + results[interp.type] = InterpsGroup(interp.type);
  53 + }
  54 + results[interp.type].addInterpretation(interp);
  55 +// interps.push_back(interp);
  56 + }
  57 + for (auto& kv: results) {
  58 + interps.push_back(kv.second);
45 } 59 }
46 return currPtr - ptr; 60 return currPtr - ptr;
47 } 61 }
48 62
  63 +//static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
  64 +// // XXX uważać na poprawność danych
  65 +// lemma.suffixToCut = *ptr;
  66 +// ptr++;
  67 +// lemma.suffixToAdd = (const char*) ptr;
  68 +// ptr += strlen((const char*) ptr) + 1;
  69 +//}
  70 +//
  71 +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  72 +// interp.type = *ptr;
  73 +// ptr++;
  74 +// deserializeLemma(ptr, interp.lemma);
  75 +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  76 +// ptr += 2;
  77 +// interp.nameClassifier = *ptr;
  78 +// ptr++;
  79 +//}
  80 +//
  81 +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const {
  82 +// const unsigned char* currPtr = ptr;
  83 +// uint8_t interpsNum = *ptr;
  84 +// interps.clear();
  85 +// interps.reserve(interpsNum);
  86 +// currPtr++;
  87 +// for (unsigned int i = 0; i < interpsNum; ++i) {
  88 +// EncodedInterpretation interp;
  89 +// deserializeInterp(currPtr, interp);
  90 +// interps.push_back(interp);
  91 +// }
  92 +// return currPtr - ptr;
  93 +//}
  94 +
morfeusz/MorphDeserializer.hpp
@@ -10,19 +10,31 @@ @@ -10,19 +10,31 @@
10 10
11 #include <vector> 11 #include <vector>
12 #include "fsa.hpp" 12 #include "fsa.hpp"
13 -#include "EncodedInterpretation.hpp" 13 +#include "InterpsGroup.hpp"
14 14
15 -class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { 15 +class MorphDeserializer: public Deserializer<std::vector<InterpsGroup>> {
16 public: 16 public:
17 MorphDeserializer(); 17 MorphDeserializer();
18 MorphDeserializer(const MorphDeserializer& orig); 18 MorphDeserializer(const MorphDeserializer& orig);
19 virtual ~MorphDeserializer(); 19 virtual ~MorphDeserializer();
20 long deserialize( 20 long deserialize(
21 const unsigned char* ptr, 21 const unsigned char* ptr,
22 - std::vector<EncodedInterpretation>& interps) const; 22 + std::vector<InterpsGroup>& interps) const;
23 private: 23 private:
24 24
25 }; 25 };
26 26
  27 +//class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> {
  28 +//public:
  29 +// MorphDeserializer();
  30 +// MorphDeserializer(const MorphDeserializer& orig);
  31 +// virtual ~MorphDeserializer();
  32 +// long deserialize(
  33 +// const unsigned char* ptr,
  34 +// std::vector<EncodedInterpretation>& interps) const;
  35 +//private:
  36 +//
  37 +//};
  38 +
27 #endif /* MORPHDESERIALIZER_HPP */ 39 #endif /* MORPHDESERIALIZER_HPP */
28 40
morfeusz/MorphInterpretation.hpp
@@ -36,8 +36,8 @@ private: @@ -36,8 +36,8 @@ private:
36 std::string lemma; 36 std::string lemma;
37 int tagnum; 37 int tagnum;
38 int namenum; 38 int namenum;
39 - const std::string& tag;  
40 - const std::string& name; 39 + std::string tag;
  40 + std::string name;
41 }; 41 };
42 42
43 #endif /* MORPHINTERPRETATION_HPP */ 43 #endif /* MORPHINTERPRETATION_HPP */
morfeusz/encoding/CharsetConverter.cpp renamed to morfeusz/charset/CharsetConverter.cpp
@@ -11,6 +11,6 @@ @@ -11,6 +11,6 @@
11 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { 11 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
12 return utf8::next(it, end); 12 return utf8::next(it, end);
13 } 13 }
14 -const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { 14 +char* UTF8CharsetConverter::append(uint32_t cp, char* result) const {
15 return utf8::append(cp, result); 15 return utf8::append(cp, result);
16 } 16 }
morfeusz/encoding/CharsetConverter.hpp renamed to morfeusz/charset/CharsetConverter.hpp
@@ -11,35 +11,35 @@ @@ -11,35 +11,35 @@
11 class CharsetConverter { 11 class CharsetConverter {
12 public: 12 public:
13 virtual uint32_t next(const char*& it, const char* end) const = 0; 13 virtual uint32_t next(const char*& it, const char* end) const = 0;
14 - virtual const char* append(uint32_t cp, const char* result) const = 0; 14 + virtual char* append(uint32_t cp, char* result) const = 0;
15 private: 15 private:
16 }; 16 };
17 17
18 class UTF8CharsetConverter: public CharsetConverter { 18 class UTF8CharsetConverter: public CharsetConverter {
19 public: 19 public:
20 uint32_t next(const char*& it, const char* end) const; 20 uint32_t next(const char*& it, const char* end) const;
21 - const char* append(uint32_t cp, const char* result) const; 21 + char* append(uint32_t cp, char* result) const;
22 private: 22 private:
23 }; 23 };
24 24
25 class UTF16CharsetConverter: public CharsetConverter { 25 class UTF16CharsetConverter: public CharsetConverter {
26 public: 26 public:
27 uint32_t next(const char*& it, const char* end) const; 27 uint32_t next(const char*& it, const char* end) const;
28 - const char* append(uint32_t cp, const char* result) const; 28 + char* append(uint32_t cp, char* result) const;
29 private: 29 private:
30 }; 30 };
31 31
32 class UTF32CharsetConverter: public CharsetConverter { 32 class UTF32CharsetConverter: public CharsetConverter {
33 public: 33 public:
34 uint32_t next(const char*& it, const char* end) const; 34 uint32_t next(const char*& it, const char* end) const;
35 - const char* append(uint32_t cp, const char* result) const; 35 + char* append(uint32_t cp, char* result) const;
36 private: 36 private:
37 }; 37 };
38 38
39 class ISO8859_2_CharsetConverter: public CharsetConverter { 39 class ISO8859_2_CharsetConverter: public CharsetConverter {
40 public: 40 public:
41 uint32_t next(const char*& it, const char* end) const; 41 uint32_t next(const char*& it, const char* end) const;
42 - const char* append(uint32_t cp, const char* result) const; 42 + char* append(uint32_t cp, char* result) const;
43 private: 43 private:
44 }; 44 };
45 45
morfeusz/charset/charset_utils.hpp 0 → 100644
  1 +/*
  2 + * File: charset_utils.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 15, 2013, 1:57 PM
  6 + */
  7 +
  8 +#ifndef CHARSET_UTILS_HPP
  9 +#define CHARSET_UTILS_HPP
  10 +
  11 +
  12 +
  13 +#endif /* CHARSET_UTILS_HPP */
  14 +
morfeusz/encoding/utf8.h renamed to morfeusz/charset/utf8.h
morfeusz/encoding/utf8/checked.h renamed to morfeusz/charset/utf8/checked.h
morfeusz/encoding/utf8/core.h renamed to morfeusz/charset/utf8/core.h
morfeusz/encoding/utf8/unchecked.h renamed to morfeusz/charset/utf8/unchecked.h
morfeusz/test_morph.cpp
@@ -18,7 +18,7 @@ @@ -18,7 +18,7 @@
18 using namespace std; 18 using namespace std;
19 19
20 void doTest( 20 void doTest(
21 - const FSA<vector<EncodedInterpretation>>& fsa, 21 + const FSA<vector<InterpsGroup>>& fsa,
22 const Tagset& tagset, 22 const Tagset& tagset,
23 // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, 23 // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter,
24 const char* fname) { 24 const char* fname) {
@@ -32,14 +32,15 @@ void doTest( @@ -32,14 +32,15 @@ void doTest(
32 string lemma = splitVector[1]; 32 string lemma = splitVector[1];
33 string tag = splitVector[2]; 33 string tag = splitVector[2];
34 string name = splitVector[3]; 34 string name = splitVector[3];
35 - vector<EncodedInterpretation> value2; 35 + vector<InterpsGroup> value2;
36 fsa.tryToRecognize(orth.c_str(), value2); 36 fsa.tryToRecognize(orth.c_str(), value2);
37 DEBUG("recognized "+to_string(value2.size())); 37 DEBUG("recognized "+to_string(value2.size()));
38 // vector<TaggedInterpretation> parsedValues; 38 // vector<TaggedInterpretation> parsedValues;
39 bool found = false; 39 bool found = false;
40 - for (EncodedInterpretation encodedInterp: value2) { 40 + for (InterpsGroup gi: value2)
  41 + for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) {
41 // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); 42 // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp);
42 - MorphInterpretation interp(0, 0, orth, encodedInterp, tagset); 43 +// (0, 0, orth, encodedInterp, tagset);
43 // parsedValues.push_back(parsedValue); 44 // parsedValues.push_back(parsedValue);
44 // debug(orth, parsedValue); 45 // debug(orth, parsedValue);
45 if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { 46 if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) {
@@ -62,10 +63,7 @@ int main(int argc, char** argv) { @@ -62,10 +63,7 @@ int main(int argc, char** argv) {
62 validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); 63 validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename.");
63 const unsigned char* fsaData = readFile(argv[1]); 64 const unsigned char* fsaData = readFile(argv[1]);
64 MorphDeserializer deserializer; 65 MorphDeserializer deserializer;
65 - DEBUG("will read FSA");  
66 - FSA<vector<EncodedInterpretation>>* fsa = FSA<vector<EncodedInterpretation>>::getFSA(fsaData, deserializer);  
67 - DEBUG("DONE read FSA");  
68 - DEBUG("will read tagset"); 66 + FSA<vector<InterpsGroup>>* fsa = FSA<vector<InterpsGroup>>::getFSA(fsaData, deserializer);
69 Tagset tagset(fsaData); 67 Tagset tagset(fsaData);
70 // TaggedInterpretationsDecoder interpsDecoder(tagset); 68 // TaggedInterpretationsDecoder interpsDecoder(tagset);
71 DEBUG("DONE read tagset"); 69 DEBUG("DONE read tagset");
nbproject/configurations.xml
@@ -8,11 +8,13 @@ @@ -8,11 +8,13 @@
8 <in>test_speed.cpp</in> 8 <in>test_speed.cpp</in>
9 </df> 9 </df>
10 <df root="morfeusz" name="1"> 10 <df root="morfeusz" name="1">
11 - <df name="encoding"> 11 + <df name="charset">
12 <in>CharsetConverter.cpp</in> 12 <in>CharsetConverter.cpp</in>
13 - <in>CharsetConverter.hpp</in> 13 + <in>charset_utils.hpp</in>
14 </df> 14 </df>
  15 + <in>InterpsGroup.hpp</in>
15 <in>Morfeusz.cpp</in> 16 <in>Morfeusz.cpp</in>
  17 + <in>Morfeusz_impl.hpp</in>
16 <in>MorphDeserializer.cpp</in> 18 <in>MorphDeserializer.cpp</in>
17 <in>MorphInterpretation.cpp</in> 19 <in>MorphInterpretation.cpp</in>
18 <in>Tagset.cpp</in> 20 <in>Tagset.cpp</in>
@@ -51,11 +53,19 @@ @@ -51,11 +53,19 @@
51 <executablePath>build/fsa/test_dict</executablePath> 53 <executablePath>build/fsa/test_dict</executablePath>
52 </makeTool> 54 </makeTool>
53 </makefileType> 55 </makefileType>
54 - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> 56 + <folder path="1">
55 <ccTool> 57 <ccTool>
56 <incDir> 58 <incDir>
57 <pElem>fsa</pElem> 59 <pElem>fsa</pElem>
58 - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> 60 + <pElem>build/morfeusz</pElem>
  61 + </incDir>
  62 + </ccTool>
  63 + </folder>
  64 + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8">
  65 + <ccTool>
  66 + <incDir>
  67 + <pElem>fsa</pElem>
  68 + <pElem>build/morfeusz</pElem>
59 </incDir> 69 </incDir>
60 </ccTool> 70 </ccTool>
61 </item> 71 </item>
@@ -80,86 +90,45 @@ @@ -80,86 +90,45 @@
80 </incDir> 90 </incDir>
81 </ccTool> 91 </ccTool>
82 </item> 92 </item>
  93 + <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0">
  94 + </item>
83 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> 95 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8">
84 <ccTool> 96 <ccTool>
85 - <incDir>  
86 - <pElem>fsa</pElem>  
87 - <pElem>build/morfeusz</pElem>  
88 - </incDir>  
89 </ccTool> 97 </ccTool>
90 </item> 98 </item>
  99 + <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0">
  100 + </item>
91 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> 101 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
92 <ccTool> 102 <ccTool>
93 - <incDir>  
94 - <pElem>fsa</pElem>  
95 - <pElem>build/morfeusz</pElem>  
96 - </incDir>  
97 </ccTool> 103 </ccTool>
98 </item> 104 </item>
99 - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> 105 + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8">
100 <ccTool> 106 <ccTool>
101 - <incDir>  
102 - <pElem>morfeusz</pElem>  
103 - <pElem>/usr/include/c++/4.8/bits</pElem>  
104 - <pElem>/usr/include/c++/4.8/ext</pElem>  
105 - <pElem>/usr/include/c++/4.8</pElem>  
106 - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>  
107 - <pElem>/usr/include/c++/4.8/debug</pElem>  
108 - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>  
109 - <pElem>/usr/include/c++/4.8/backward</pElem>  
110 - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>  
111 - <pElem>build/morfeusz</pElem>  
112 - </incDir>  
113 </ccTool> 107 </ccTool>
114 </item> 108 </item>
115 - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> 109 + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
116 <ccTool> 110 <ccTool>
117 - <incDir>  
118 - <pElem>morfeusz</pElem>  
119 - <pElem>/usr/include/c++/4.8/bits</pElem>  
120 - <pElem>/usr/include/c++/4.8/ext</pElem>  
121 - <pElem>/usr/include/c++/4.8</pElem>  
122 - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>  
123 - <pElem>/usr/include/c++/4.8/debug</pElem>  
124 - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>  
125 - <pElem>fsa</pElem>  
126 - <pElem>/usr/include/c++/4.8/backward</pElem>  
127 - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>  
128 - <pElem>build/morfeusz</pElem>  
129 - </incDir>  
130 </ccTool> 111 </ccTool>
131 </item> 112 </item>
132 - <item path="morfeusz/encoding/CharsetConverter.cpp" 113 + <item path="morfeusz/charset/CharsetConverter.cpp"
133 ex="false" 114 ex="false"
134 tool="1" 115 tool="1"
135 - flavor2="0"> 116 + flavor2="8">
  117 + <ccTool>
  118 + </ccTool>
136 </item> 119 </item>
137 - <item path="morfeusz/encoding/CharsetConverter.hpp"  
138 - ex="false"  
139 - tool="3"  
140 - flavor2="0"> 120 + <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0">
141 </item> 121 </item>
142 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> 122 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
143 <ccTool> 123 <ccTool>
144 - <incDir>  
145 - <pElem>fsa</pElem>  
146 - <pElem>build/morfeusz</pElem>  
147 - </incDir>  
148 </ccTool> 124 </ccTool>
149 </item> 125 </item>
150 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> 126 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
151 <ccTool> 127 <ccTool>
152 - <incDir>  
153 - <pElem>morfeusz</pElem>  
154 - </incDir>  
155 </ccTool> 128 </ccTool>
156 </item> 129 </item>
157 <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> 130 <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
158 <ccTool> 131 <ccTool>
159 - <incDir>  
160 - <pElem>fsa</pElem>  
161 - <pElem>build/morfeusz</pElem>  
162 - </incDir>  
163 </ccTool> 132 </ccTool>
164 </item> 133 </item>
165 </conf> 134 </conf>