Commit a9d3e65c15f2e43bc637fbda8342a6242dc1174f

Authored by Michał Lenart
1 parent f23aead2

- refaktoryzacja, odkomentowanie na-razie-niedziałających kawałków kodu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@20 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/fsa/common.py
... ... @@ -14,7 +14,7 @@ class Lemma(object):
14 14  
15 15 class Interpretation(object):
16 16  
17   - def __init__(self, orth, base, tagnum, namenum, encoder):
  17 + def __init__(self, orth, base, tagnum, namenum, typenum, encoder):
18 18 assert type(orth) == unicode
19 19 assert type(base) == unicode
20 20 root = u''
... ... @@ -29,6 +29,7 @@ class Interpretation(object):
29 29 suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False))
30 30 self.tagnum = tagnum
31 31 self.namenum = namenum
  32 + self.typenum = typenum
32 33  
33 34 def getSortKey(self):
34 35 return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum)
... ...
fsabuilder/fsa/convertinput.py
... ... @@ -6,6 +6,398 @@ Created on Oct 23, 2013
6 6 import logging
7 7 from common import Interpretation
8 8  
  9 +tag2typenum = {
  10 + 'aglt:sg:pri:imperf:nwok': 12,
  11 + 'aglt:sg:pri:imperf:wok': 12,
  12 + 'aglt:sg:sec:imperf:nwok': 12,
  13 + 'aglt:sg:sec:imperf:wok': 12,
  14 + 'aglt:pl:pri:imperf:nwok': 13,
  15 + 'aglt:pl:pri:imperf:wok': 13,
  16 + 'aglt:pl:sec:imperf:nwok': 13,
  17 + 'aglt:pl:sec:imperf:wok': 13,
  18 + 'praet:sg:m1.m2.m3:imperf:agl': 7,
  19 + 'praet:sg:m1.m2.m3:imperf.perf:agl': 7,
  20 + 'praet:sg:m1.m2.m3:perf:agl': 7,
  21 + 'praet:sg:m1.m2.m3:imperf:nagl': 16,
  22 + 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16,
  23 + 'praet:sg:m1.m2.m3:perf:nagl': 16,
  24 + 'praet:sg:f:imperf': 20,
  25 + 'praet:sg:f:imperf.perf': 20,
  26 + 'praet:sg:f:perf': 20,
  27 + 'praet:sg:m1.m2.m3:imperf': 20,
  28 + 'praet:sg:m1.m2.m3:imperf.perf': 20,
  29 + 'praet:sg:m1.m2.m3:perf': 20,
  30 + 'praet:sg:n1.n2:imperf': 20,
  31 + 'praet:sg:n1.n2:imperf.perf': 20,
  32 + 'praet:sg:n1.n2:perf': 20,
  33 + 'praet:pl:m1.p1:imperf': 21,
  34 + 'praet:pl:m1.p1:imperf.perf': 21,
  35 + 'praet:pl:m1.p1:perf': 21,
  36 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21,
  37 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21,
  38 + 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21,
  39 + 'naj': 10,
  40 + 'nie': 5,
  41 + 'adj:pl:acc:m1.p1:pos': 1,
  42 + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  43 + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  44 + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  45 + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  46 + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  47 + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1,
  48 + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  49 + 'adj:pl:nom.voc:m1.p1:pos': 1,
  50 + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1,
  51 + 'adj:sg:acc:m1.m2:pos': 1,
  52 + 'adj:sg:acc:n1.n2:pos': 1,
  53 + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
  54 + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
  55 + 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1,
  56 + 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1,
  57 + 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
  58 + 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1,
  59 + 'adj:sg:nom.voc:n1.n2:pos': 1,
  60 + 'adj:sg:acc:f:pos': 1,
  61 + 'adj:sg:acc.inst:f:pos': 1,
  62 + 'adj:sg:acc:m1.m2:pos': 1,
  63 + 'adj:sg:acc:m3:pos': 1,
  64 + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1,
  65 + 'adj:sg:gen.dat.loc:f:pos': 1,
  66 + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1,
  67 + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1,
  68 + 'adj:sg:nom.voc.acc:n1.n2:pos': 1,
  69 + 'adj:sg:nom.voc:f:pos': 1,
  70 + 'adj:sg:nom.voc:m1.m2.m3:pos': 1,
  71 + 'adj:pl:acc:f:pos': 1,
  72 + 'adj:pl:acc:m1:pos': 1,
  73 + 'adj:pl:acc:m2:pos': 1,
  74 + 'adj:pl:acc:m3:pos': 1,
  75 + 'adj:pl:acc:n1:pos': 1,
  76 + 'adj:pl:acc:n2:pos': 1,
  77 + 'adj:pl:acc:p1:pos': 1,
  78 + 'adj:pl:acc:p2:pos': 1,
  79 + 'adj:pl:acc:p3:pos': 1,
  80 + 'adj:pl:dat:f:pos': 1,
  81 + 'adj:pl:dat:m1:pos': 1,
  82 + 'adj:pl:dat:m2:pos': 1,
  83 + 'adj:pl:dat:m3:pos': 1,
  84 + 'adj:pl:dat:n1:pos': 1,
  85 + 'adj:pl:dat:n2:pos': 1,
  86 + 'adj:pl:dat:p1:pos': 1,
  87 + 'adj:pl:dat:p2:pos': 1,
  88 + 'adj:pl:dat:p3:pos': 1,
  89 + 'adj:pl:gen:f:pos': 1,
  90 + 'adj:pl:gen:m1:pos': 1,
  91 + 'adj:pl:gen:m2:pos': 1,
  92 + 'adj:pl:gen:m3:pos': 1,
  93 + 'adj:pl:gen:n1:pos': 1,
  94 + 'adj:pl:gen:n2:pos': 1,
  95 + 'adj:pl:gen:p1:pos': 1,
  96 + 'adj:pl:gen:p2:pos': 1,
  97 + 'adj:pl:gen:p3:pos': 1,
  98 + 'adj:pl:inst:f:pos': 1,
  99 + 'adj:pl:inst:m1:pos': 1,
  100 + 'adj:pl:inst:m2:pos': 1,
  101 + 'adj:pl:inst:m3:pos': 1,
  102 + 'adj:pl:inst:n1:pos': 1,
  103 + 'adj:pl:inst:n2:pos': 1,
  104 + 'adj:pl:inst:p1:pos': 1,
  105 + 'adj:pl:inst:p2:pos': 1,
  106 + 'adj:pl:inst:p3:pos': 1,
  107 + 'adj:pl:loc:f:pos': 1,
  108 + 'adj:pl:loc:m1:pos': 1,
  109 + 'adj:pl:loc:m2:pos': 1,
  110 + 'adj:pl:loc:m3:pos': 1,
  111 + 'adj:pl:loc:n1:pos': 1,
  112 + 'adj:pl:loc:n2:pos': 1,
  113 + 'adj:pl:loc:p1:pos': 1,
  114 + 'adj:pl:loc:p2:pos': 1,
  115 + 'adj:pl:loc:p3:pos': 1,
  116 + 'adj:pl:nom:f:pos': 1,
  117 + 'adj:pl:nom:m1:pos': 1,
  118 + 'adj:pl:nom:m2:pos': 1,
  119 + 'adj:pl:nom:m3:pos': 1,
  120 + 'adj:pl:nom:n1:pos': 1,
  121 + 'adj:pl:nom:n2:pos': 1,
  122 + 'adj:pl:nom:p1:pos': 1,
  123 + 'adj:pl:nom:p2:pos': 1,
  124 + 'adj:pl:nom:p3:pos': 1,
  125 + 'adj:sg:acc:f:pos': 1,
  126 + 'adj:sg:acc:m1:pos': 1,
  127 + 'adj:sg:acc:m2:pos': 1,
  128 + 'adj:sg:acc:m3:pos': 1,
  129 + 'adj:sg:acc:n1:pos': 1,
  130 + 'adj:sg:acc:n2:pos': 1,
  131 + 'adj:sg:dat:f:pos': 1,
  132 + 'adj:sg:dat:m1:pos': 1,
  133 + 'adj:sg:dat:m2:pos': 1,
  134 + 'adj:sg:dat:m3:pos': 1,
  135 + 'adj:sg:dat:n1:pos': 1,
  136 + 'adj:sg:dat:n2:pos': 1,
  137 + 'adj:sg:gen:f:pos': 1,
  138 + 'adj:sg:gen:m1:pos': 1,
  139 + 'adj:sg:gen:m2:pos': 1,
  140 + 'adj:sg:gen:m3:pos': 1,
  141 + 'adj:sg:gen:n1:pos': 1,
  142 + 'adj:sg:gen:n2:pos': 1,
  143 + 'adj:sg:inst:f:pos': 1,
  144 + 'adj:sg:inst:m1:pos': 1,
  145 + 'adj:sg:inst:m2:pos': 1,
  146 + 'adj:sg:inst:m3:pos': 1,
  147 + 'adj:sg:inst:n1:pos': 1,
  148 + 'adj:sg:inst:n2:pos': 1,
  149 + 'adj:sg:loc:f:pos': 1,
  150 + 'adj:sg:loc:m1:pos': 1,
  151 + 'adj:sg:loc:m2:pos': 1,
  152 + 'adj:sg:loc:m3:pos': 1,
  153 + 'adj:sg:loc:n1:pos': 1,
  154 + 'adj:sg:loc:n2:pos': 1,
  155 + 'adj:sg:nom:f:pos': 1,
  156 + 'adj:sg:nom:m1:pos': 1,
  157 + 'adj:sg:nom:m2:pos': 1,
  158 + 'adj:sg:nom:m3:pos': 1,
  159 + 'adj:sg:nom:n1:pos': 1,
  160 + 'adj:sg:nom:n2:pos': 1,
  161 + 'adj:pl:acc:m1.p1:sup': 19,
  162 + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  163 + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  164 + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  165 + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  166 + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  167 + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19,
  168 + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  169 + 'adj:pl:nom.voc:m1.p1:sup': 19,
  170 + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19,
  171 + 'adj:sg:acc:f:sup': 19,
  172 + 'adj:sg:acc.inst:f:sup': 19,
  173 + 'adj:sg:acc:m1.m2:sup': 19,
  174 + 'adj:sg:acc:m3:sup': 19,
  175 + 'adj:sg:acc:n1.n2:sup': 19,
  176 + 'adj:sg:dat:f:sup': 19,
  177 + 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19,
  178 + 'adj:sg:gen:f:sup': 19,
  179 + 'adj:sg:gen.dat.loc:f:sup': 19,
  180 + 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19,
  181 + 'adj:sg:inst:f:sup': 19,
  182 + 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19,
  183 + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19,
  184 + 'adj:sg:loc:f:sup': 19,
  185 + 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19,
  186 + 'adj:sg:nom.acc:n1.n2:sup': 19,
  187 + 'adj:sg:nom.voc:f:sup': 19,
  188 + 'adj:sg:nom.voc:m1.m2.m3:sup': 19,
  189 + 'adj:sg:nom.voc:n1.n2:sup': 19,
  190 + 'adj:pl:acc:f:sup': 19,
  191 + 'adj:pl:acc:m1:sup': 19,
  192 + 'adj:pl:acc:m2:sup': 19,
  193 + 'adj:pl:acc:m3:sup': 19,
  194 + 'adj:pl:acc:n1:sup': 19,
  195 + 'adj:pl:acc:n2:sup': 19,
  196 + 'adj:pl:acc:p1:sup': 19,
  197 + 'adj:pl:acc:p2:sup': 19,
  198 + 'adj:pl:acc:p3:sup': 19,
  199 + 'adj:pl:dat:f:sup': 19,
  200 + 'adj:pl:dat:m1:sup': 19,
  201 + 'adj:pl:dat:m2:sup': 19,
  202 + 'adj:pl:dat:m3:sup': 19,
  203 + 'adj:pl:dat:n1:sup': 19,
  204 + 'adj:pl:dat:n2:sup': 19,
  205 + 'adj:pl:dat:p1:sup': 19,
  206 + 'adj:pl:dat:p2:sup': 19,
  207 + 'adj:pl:dat:p3:sup': 19,
  208 + 'adj:pl:gen:f:sup': 19,
  209 + 'adj:pl:gen:m1:sup': 19,
  210 + 'adj:pl:gen:m2:sup': 19,
  211 + 'adj:pl:gen:m3:sup': 19,
  212 + 'adj:pl:gen:n1:sup': 19,
  213 + 'adj:pl:gen:n2:sup': 19,
  214 + 'adj:pl:gen:p1:sup': 19,
  215 + 'adj:pl:gen:p2:sup': 19,
  216 + 'adj:pl:gen:p3:sup': 19,
  217 + 'adj:pl:inst:f:sup': 19,
  218 + 'adj:pl:inst:m1:sup': 19,
  219 + 'adj:pl:inst:m2:sup': 19,
  220 + 'adj:pl:inst:m3:sup': 19,
  221 + 'adj:pl:inst:n1:sup': 19,
  222 + 'adj:pl:inst:n2:sup': 19,
  223 + 'adj:pl:inst:p1:sup': 19,
  224 + 'adj:pl:inst:p2:sup': 19,
  225 + 'adj:pl:inst:p3:sup': 19,
  226 + 'adj:pl:loc:f:sup': 19,
  227 + 'adj:pl:loc:m1:sup': 19,
  228 + 'adj:pl:loc:m2:sup': 19,
  229 + 'adj:pl:loc:m3:sup': 19,
  230 + 'adj:pl:loc:n1:sup': 19,
  231 + 'adj:pl:loc:n2:sup': 19,
  232 + 'adj:pl:loc:p1:sup': 19,
  233 + 'adj:pl:loc:p2:sup': 19,
  234 + 'adj:pl:loc:p3:sup': 19,
  235 + 'adj:pl:nom:f:sup': 19,
  236 + 'adj:pl:nom:m1:sup': 19,
  237 + 'adj:pl:nom:m2:sup': 19,
  238 + 'adj:pl:nom:m3:sup': 19,
  239 + 'adj:pl:nom:n1:sup': 19,
  240 + 'adj:pl:nom:n2:sup': 19,
  241 + 'adj:pl:nom:p1:sup': 19,
  242 + 'adj:pl:nom:p2:sup': 19,
  243 + 'adj:pl:nom:p3:sup': 19,
  244 + 'adj:sg:acc:f:sup': 19,
  245 + 'adj:sg:acc:m1:sup': 19,
  246 + 'adj:sg:acc:m2:sup': 19,
  247 + 'adj:sg:acc:m3:sup': 19,
  248 + 'adj:sg:acc:n1:sup': 19,
  249 + 'adj:sg:acc:n2:sup': 19,
  250 + 'adj:sg:dat:f:sup': 19,
  251 + 'adj:sg:dat:m1:sup': 19,
  252 + 'adj:sg:dat:m2:sup': 19,
  253 + 'adj:sg:dat:m3:sup': 19,
  254 + 'adj:sg:dat:n1:sup': 19,
  255 + 'adj:sg:dat:n2:sup': 19,
  256 + 'adj:sg:gen:f:sup': 19,
  257 + 'adj:sg:gen:m1:sup': 19,
  258 + 'adj:sg:gen:m2:sup': 19,
  259 + 'adj:sg:gen:m3:sup': 19,
  260 + 'adj:sg:gen:n1:sup': 19,
  261 + 'adj:sg:gen:n2:sup': 19,
  262 + 'adj:sg:inst:f:sup': 19,
  263 + 'adj:sg:inst:m1:sup': 19,
  264 + 'adj:sg:inst:m2:sup': 19,
  265 + 'adj:sg:inst:m3:sup': 19,
  266 + 'adj:sg:inst:n1:sup': 19,
  267 + 'adj:sg:inst:n2:sup': 19,
  268 + 'adj:sg:loc:f:sup': 19,
  269 + 'adj:sg:loc:m1:sup': 19,
  270 + 'adj:sg:loc:m2:sup': 19,
  271 + 'adj:sg:loc:m3:sup': 19,
  272 + 'adj:sg:loc:n1:sup': 19,
  273 + 'adj:sg:loc:n2:sup': 19,
  274 + 'adj:sg:nom:f:sup': 19,
  275 + 'adj:sg:nom:m1:sup': 19,
  276 + 'adj:sg:nom:m2:sup': 19,
  277 + 'adj:sg:nom:m3:sup': 19,
  278 + 'adj:sg:nom:n1:sup': 19,
  279 + 'adj:sg:nom:n2:sup': 19,
  280 + 'adv:sup': 19,
  281 + 'winien:sg:m1.m2.m3:imperf': 3,
  282 + 'winien:sg:f:imperf': 3,
  283 + 'winien:sg:n1.n2:imperf': 3,
  284 + 'winien:pl:m1.p1:imperf': 3,
  285 + 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3,
  286 + 'adja': 15,
  287 + 'ger:sg:dat.loc:n2:imperf:neg': 18,
  288 + 'ger:sg:dat.loc:n2:imperf.perf:neg': 18,
  289 + 'ger:sg:dat.loc:n2:perf:neg': 18,
  290 + 'ger:sg:gen:n2:imperf:neg': 18,
  291 + 'ger:sg:gen:n2:imperf.perf:neg': 18,
  292 + 'ger:sg:gen:n2:perf:neg': 18,
  293 + 'ger:sg:inst:n2:imperf:neg': 18,
  294 + 'ger:sg:inst:n2:imperf.perf:neg': 18,
  295 + 'ger:sg:inst:n2:perf:neg': 18,
  296 + 'ger:sg:nom.acc:n2:imperf:neg': 18,
  297 + 'ger:sg:nom.acc:n2:imperf.perf:neg': 18,
  298 + 'ger:sg:nom.acc:n2:perf:neg': 18,
  299 + 'pact:pl:acc:m1.p1:imperf:neg': 18,
  300 + 'pact:pl:acc:m1.p1:imperf.perf:neg': 18,
  301 + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  302 + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  303 + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  304 + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  305 + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  306 + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  307 + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
  308 + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
  309 + 'pact:pl:nom:m1.p1:imperf:neg': 18,
  310 + 'pact:pl:nom:m1.p1:imperf.perf:neg': 18,
  311 + 'pact:sg:acc.inst:f:imperf:neg': 18,
  312 + 'pact:sg:acc.inst:f:imperf.perf:neg': 18,
  313 + 'pact:sg:acc:m1.m2:imperf:neg': 18,
  314 + 'pact:sg:acc:m1.m2:imperf.perf:neg': 18,
  315 + 'pact:sg:acc:m3:imperf:neg': 18,
  316 + 'pact:sg:acc:m3:imperf.perf:neg': 18,
  317 + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
  318 + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  319 + 'pact:sg:gen.dat.loc:f:imperf:neg': 18,
  320 + 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18,
  321 + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
  322 + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  323 + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
  324 + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  325 + 'pact:sg:nom.acc:n1.n2:imperf:neg': 18,
  326 + 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
  327 + 'pact:sg:nom:f:imperf:neg': 18,
  328 + 'pact:sg:nom:f:imperf.perf:neg': 18,
  329 + 'pact:sg:nom:m1.m2.m3:imperf:neg': 18,
  330 + 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
  331 + 'ppas:pl:acc:m1.p1:imperf:neg': 18,
  332 + 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18,
  333 + 'ppas:pl:acc:m1.p1:perf:neg': 18,
  334 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  335 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  336 + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  337 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  338 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  339 + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  340 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18,
  341 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18,
  342 + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18,
  343 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18,
  344 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18,
  345 + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18,
  346 + 'ppas:pl:nom:m1.p1:imperf:neg': 18,
  347 + 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18,
  348 + 'ppas:pl:nom:m1.p1:perf:neg': 18,
  349 + 'ppas:sg:acc.inst:f:imperf:neg': 18,
  350 + 'ppas:sg:acc.inst:f:imperf.perf:neg': 18,
  351 + 'ppas:sg:acc.inst:f:perf:neg': 18,
  352 + 'ppas:sg:acc:m1.m2:imperf:neg': 18,
  353 + 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18,
  354 + 'ppas:sg:acc:m1.m2:perf:neg': 18,
  355 + 'ppas:sg:acc:m3:imperf:neg': 18,
  356 + 'ppas:sg:acc:m3:imperf.perf:neg': 18,
  357 + 'ppas:sg:acc:m3:perf:neg': 18,
  358 + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18,
  359 + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  360 + 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18,
  361 + 'ppas:sg:gen.dat.loc:f:imperf:neg': 18,
  362 + 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18,
  363 + 'ppas:sg:gen.dat.loc:f:perf:neg': 18,
  364 + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18,
  365 + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  366 + 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18,
  367 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18,
  368 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18,
  369 + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18,
  370 + 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18,
  371 + 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18,
  372 + 'ppas:sg:nom.acc:n1.n2:perf:neg': 18,
  373 + 'ppas:sg:nom:f:imperf:neg': 18,
  374 + 'ppas:sg:nom:f:imperf.perf:neg': 18,
  375 + 'ppas:sg:nom:f:perf:neg': 18,
  376 + 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18,
  377 + 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18,
  378 + 'ppas:sg:nom:m1.m2.m3:perf:neg': 18,
  379 + 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8,
  380 + 'prep:acc': 6,
  381 + 'prep:acc:wok': 6,
  382 + 'prep:acc.inst': 6,
  383 + 'prep:acc.inst:wok': 6,
  384 + 'prep:inst.acc': 6,
  385 + 'prep:inst.acc:wok': 6,
  386 + 'prep:inst.gen.acc:wok': 6,
  387 + 'prep:acc.loc': 6,
  388 + 'prep:acc.loc:wok': 6,
  389 + 'prep:loc.acc': 6,
  390 + 'prep:loc.acc:wok': 6,
  391 + 'prep:gen': 6,
  392 + 'prep:gen.dat': 6,
  393 + 'prep:gen:wok': 6,
  394 + 'prep:gen.inst:wok': 6,
  395 + 'brev:pun': 9,
  396 + 'brev:npun': 9,
  397 + 'intrj': 9,
  398 + 'burk': 9,
  399 +}
  400 +
9 401 def _sortLines(inputLines, encoder):
10 402 logging.info('sorting input...')
11 403 lines = list(inputLines)
... ... @@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder):
22 414 orth, base, tag, name = line.split(u'\t')
23 415 tagnum = tagset.tag2tagnum[tag]
24 416 namenum = tagset.name2namenum[name]
25   - yield (orth, Interpretation(orth, base, tagnum, namenum, encoder))
  417 + typenum = tag2typenum.get(tag, 0)
  418 + yield (orth, Interpretation(orth, base, tagnum, namenum, typenum, encoder))
26 419  
27 420 def _mergeEntries(inputLines):
28 421 prevOrth = None
... ...
fsabuilder/fsa/encode.py
... ... @@ -61,11 +61,16 @@ class MorphEncoder(Encoder):
61 61 res.append(firstByte)
62 62 assert type(interpsList) == frozenset
63 63 for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  64 + res.extend(self._encodeTypeNum(interp.typenum))
64 65 res.extend(self._encodeLemma(interp.lemma))
65 66 res.extend(self._encodeTagNum(interp.tagnum))
66 67 res.extend(self._encodeNameNum(interp.namenum))
67 68 return res
68 69  
  70 + def _encodeTypeNum(self, typenum):
  71 + assert typenum >= 0 and typenum < 256
  72 + return bytearray([typenum])
  73 +
69 74 def _encodeLemma(self, lemma):
70 75 res = bytearray()
71 76 assert lemma.cutLength < 256 and lemma.cutLength >= 0
... ...
fsabuilder/fsa/serializer.py
... ... @@ -40,7 +40,6 @@ class Serializer(object):
40 40 raise NotImplementedError('Not implemented')
41 41  
42 42 def fsa2bytearray(self):
43   -
44 43 res = bytearray()
45 44 res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset)))
46 45 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
... ...
morfeusz/CMakeLists.txt
... ... @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa)
7 7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 8 add_executable (morfeusz2_analyze main.cpp)
9 9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp)
10   -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp)
  10 +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp)
11 11  
12 12 # Link the executable to the Hello library.
13 13 target_link_libraries (morfeusz2_analyze morfeusz2)
... ...
morfeusz/EncodedInterpretation.hpp
... ... @@ -28,8 +28,11 @@ struct EncodedLemma {
28 28 */
29 29 struct EncodedInterpretation {
30 30 EncodedLemma lemma;
  31 + int type;
31 32 int tag;
32 33 int nameClassifier;
  34 + int startNode;
  35 + int endNode;
33 36 };
34 37  
35 38 #endif /* INTERPRETATION_HPP */
... ...
morfeusz/InterpsGroup.hpp 0 → 100644
  1 +/*
  2 + * File: GroupedInterpretations.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 16, 2013, 7:58 PM
  6 + */
  7 +
  8 +#ifndef GROUPEDINTERPRETATIONS_HPP
  9 +#define GROUPEDINTERPRETATIONS_HPP
  10 +
  11 +#include <vector>
  12 +#include <string>
  13 +#include "EncodedInterpretation.hpp"
  14 +#include "MorphInterpretation.hpp"
  15 +#include "Tagset.hpp"
  16 +
  17 +class InterpsGroup {
  18 +public:
  19 +
  20 + InterpsGroup() {
  21 +
  22 + }
  23 +
  24 + explicit InterpsGroup(const int type)
  25 + : type(type) {
  26 +
  27 + }
  28 +
  29 + std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) {
  30 + std::vector<MorphInterpretation> res;
  31 + for (EncodedInterpretation& ei: interps) {
  32 + res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset));
  33 + }
  34 + return res;
  35 + }
  36 +
  37 + void addInterpretation(const EncodedInterpretation& interp) {
  38 + interps.push_back(interp);
  39 + }
  40 +
  41 + int type;
  42 + int startNode;
  43 + int endNode;
  44 +private:
  45 + std::vector<EncodedInterpretation> interps;
  46 +};
  47 +
  48 +#endif /* GROUPEDINTERPRETATIONS_HPP */
  49 +
... ...
morfeusz/Morfeusz.cpp
... ... @@ -6,17 +6,18 @@
6 6 */
7 7  
8 8 #include <string>
  9 +#include "fsa.hpp"
9 10 #include "utils.hpp"
10 11 #include "Morfeusz.hpp"
11 12 #include "MorphDeserializer.hpp"
12   -#include "encoding/CharsetConverter.hpp"
  13 +#include "charset/CharsetConverter.hpp"
13 14  
14 15 using namespace std;
15 16  
16   -static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) {
17   - static Deserializer<vector<EncodedInterpretation>>* deserializer
18   - = new MorphDeserializer();
19   - return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer);
  17 +static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) {
  18 + static Deserializer < vector < InterpsGroup >> *deserializer
  19 + = new MorphDeserializer();
  20 + return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer);
20 21 }
21 22  
22 23 static CharsetConverter* initializeCharsetConverter() {
... ... @@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() {
26 27  
27 28 Morfeusz::Morfeusz(const string& filename)
28 29 : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) {
29   -
  30 +
30 31 }
31 32  
32 33 //Morfeusz::Morfeusz(const Morfeusz& orig) {
... ... @@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() {
36 37 delete &this->fsa;
37 38 }
38 39  
39   -AnalyzeResult Morfeusz::analyze(const std::string& text) {
40   - const char* textStart = text.c_str();
41   - const char* textEnd = text.c_str() + text.length();
42   - AnalyzeResult res = {
43   - ResultsIterator(textStart, textEnd, *this),
44   - ResultsIterator(textEnd, textEnd, *this)};
45   - return res;
  40 +ResultsIterator Morfeusz::analyze(const std::string& text) {
  41 +// const char* textStart = text.c_str();
  42 +// const char* textEnd = text.c_str() + text.length();
  43 + return ResultsIterator(text, *this);
  44 +}
  45 +
  46 +ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
  47 +: rawInput(text.c_str()),
  48 +morfeusz(morfeusz) {
  49 +}
  50 +
  51 +MorphInterpretation ResultsIterator::getNext() {
  52 +// if (resultsBuffer.empty()) {
  53 +// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
  54 +// }
  55 +// startNode = resultsBuffer.back().getEndNode();
  56 +// MorphInterpretation res = resultsBuffer.front();
  57 +// resultsBuffer.pop_front();
  58 +// return res;
  59 +}
  60 +
  61 +bool ResultsIterator::hasNext() {
  62 + return rawInput[0] != '\0' && resultsBuffer.empty();
46 63 }
47 64  
  65 +//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const {
  66 +// assert(inputPtr[0] != '\0');
  67 +// const char* start = inputPtr;
  68 +// StateType state = fsa->getInitialState();
  69 +// int currNodeNum = startNodeNum;
  70 +// int codepoint = this->charsetConverter->next(inputPtr, inputEnd);
  71 +// assert(!isEndOfWord(codepoint));
  72 +// while(!isEndOfWord(codepoint)) {
  73 +// feedState(state, codepoint);
  74 +// if (state.isAccepting()) {
  75 +// const char* currInputPtr = inputPtr;
  76 +// vector<EncodedInterpretation> startInterps = state.getValue();
  77 +// filterOutNonGluableInterps(startInterps);
  78 +// if (!startInterps.empty()) {
  79 +//
  80 +// }
  81 +// vector<EncodedInterpretation> additionalInterps;
  82 +// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps);
  83 +// if (!additionalInterps.empty()) {
  84 +// for (EncodedInterpretation& interp: state.getValue()) {
  85 +// interp.startNode = currNodeNum;
  86 +// interp.endNode = currNodeNum + 1;
  87 +// interps.push_back(interp);
  88 +// }
  89 +//
  90 +// }
  91 +// }
  92 +// }
  93 +//}
... ...
morfeusz/Morfeusz.hpp
... ... @@ -9,53 +9,78 @@
9 9 #define MORFEUSZ_HPP
10 10  
11 11 #include <string>
  12 +#include <list>
12 13 #include <vector>
13 14 #include "EncodedInterpretation.hpp"
14 15 #include "fsa.hpp"
15 16 #include "MorphInterpretation.hpp"
16   -#include "encoding/CharsetConverter.hpp"
  17 +#include "InterpsGroup.hpp"
  18 +#include "charset/CharsetConverter.hpp"
17 19  
18 20 class Morfeusz;
19   -class AnalyzeResult;
  21 +//class AnalyzeResult;
20 22 class ResultsIterator;
21 23  
  24 +typedef FSA<std::vector<InterpsGroup>> FSAType;
  25 +typedef State<std::vector<InterpsGroup>> StateType;
  26 +
22 27 class Morfeusz {
23 28 public:
24 29 explicit Morfeusz(const std::string& filename);
25 30 virtual ~Morfeusz();
26 31 // Morfeusz(const Morfeusz& orig);
27   - AnalyzeResult analyze(const std::string& text);
  32 + ResultsIterator analyze(const std::string& text);
28 33  
29 34 // Morfeusz();
  35 + friend class ResultsIterator;
30 36 private:
31   - void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps);
32   - const FSA<std::vector<EncodedInterpretation>>* fsa;
  37 + template <class OutputIterator>
  38 +// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const;
  39 +
  40 + int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const;
  41 +
  42 + const FSAType* fsa;
33 43 CharsetConverter* charsetConverter;
34 44 };
35 45  
  46 +#include "Morfeusz_impl.hpp"
  47 +
36 48 class ResultsIterator {
37 49 public:
38   - ResultsIterator(
39   - const char* startOfInput,
40   - const char* endOfInput,
41   - const Morfeusz& morfeusz);
42   - virtual ~ResultsIterator();
43   -// ResultsIterator(int* x);
44   - ResultsIterator(const ResultsIterator& mit);
45   - ResultsIterator& operator++();
46   - ResultsIterator operator++(int);
47   - bool operator==(const ResultsIterator& rhs);
48   - bool operator!=(const ResultsIterator& rhs);
49   - MorphInterpretation& operator*();
  50 + ResultsIterator(const std::string& text, const Morfeusz& morfeusz);
  51 + MorphInterpretation getNext();
  52 + bool hasNext();
50 53 private:
51 54 const char* rawInput;
52   - const char* endOfInput;
  55 + const Morfeusz& morfeusz;
  56 + std::list<MorphInterpretation> resultsBuffer;
  57 + int startNode;
53 58 };
54 59  
55   -struct AnalyzeResult {
56   - ResultsIterator iterator;
57   - const ResultsIterator end;
58   -};
  60 +//class ResultsIterator {
  61 +//public:
  62 +// ResultsIterator(
  63 +// const char* startOfInput,
  64 +// const char* endOfInput,
  65 +// const Morfeusz& morfeusz);
  66 +// virtual ~ResultsIterator();
  67 +// ResultsIterator(const ResultsIterator& mit);
  68 +// ResultsIterator& operator++();
  69 +// ResultsIterator operator++(int);
  70 +// bool operator==(const ResultsIterator& rhs);
  71 +// bool operator!=(const ResultsIterator& rhs);
  72 +// MorphInterpretation& operator*();
  73 +//private:
  74 +// const char* rawInput;
  75 +// const char* endOfInput;
  76 +// const Morfeusz& morfeusz;
  77 +// vector<MorphInterpretation> resultsBuffer;
  78 +//};
  79 +
  80 +//struct AnalyzeResult {
  81 +// ResultsIterator iterator;
  82 +// const ResultsIterator end;
  83 +//};
59 84  
60 85 #endif /* MORFEUSZ_HPP */
61 86  
... ...
morfeusz/Morfeusz_impl.hpp 0 → 100644
  1 +/*
  2 + * File: Morfeusz_impl.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 15, 2013, 1:43 PM
  6 + */
  7 +
  8 +#ifndef MORFEUSZ_IMPL_HPP
  9 +#define MORFEUSZ_IMPL_HPP
  10 +
  11 +#include <cassert>
  12 +#include "Morfeusz.hpp"
  13 +
  14 +//template <class OutputIterator>
  15 +//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const {
  16 +// if (inputData == inputEnd) {
  17 +// return;
  18 +// }
  19 +// const char* start = inputData;
  20 +// StateType state = fsa->getInitialState();
  21 +// int currNodeNum = startNodeNum;
  22 +// do {
  23 +// int codepoint = this->charsetConverter->next(inputData, inputEnd);
  24 +// if (!isSpace(codepoint) && codepoint != 0) {
  25 +// feedAutomaton(state, codepoint);
  26 +// if (state.isAccepting()) {
  27 +// int currInput = inputData;
  28 +// vector<MorphInterpretation> additionalInterps;
  29 +// processOneWord(
  30 +// currInput, inputEnd,
  31 +// currNodeNum + 1,
  32 +// back_inserter(additionalInterps), false);
  33 +// if (!additionalInterps.empty()) {
  34 +// currNodeNum = additionalInterps.back().getEndNode();
  35 +// }
  36 +// }
  37 +// }
  38 +// }
  39 +//}
  40 +
  41 +#endif /* MORFEUSZ_IMPL_HPP */
  42 +
... ...
morfeusz/MorphDeserializer.cpp
... ... @@ -5,7 +5,10 @@
5 5 * Created on 12 listopad 2013, 15:31
6 6 */
7 7  
  8 +#include <map>
8 9 #include "MorphDeserializer.hpp"
  10 +#include "EncodedInterpretation.hpp"
  11 +#include "InterpsGroup.hpp"
9 12  
10 13 MorphDeserializer::MorphDeserializer() {
11 14 }
... ... @@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
25 28 }
26 29  
27 30 static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  31 + interp.type = *ptr;
  32 + ptr++;
28 33 deserializeLemma(ptr, interp.lemma);
29 34 interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
30 35 ptr += 2;
... ... @@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation&
32 37 ptr++;
33 38 }
34 39  
35   -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const {
  40 +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
36 41 const unsigned char* currPtr = ptr;
37 42 uint8_t interpsNum = *ptr;
38 43 interps.clear();
39 44 interps.reserve(interpsNum);
40 45 currPtr++;
  46 + // FIXME - to jest do poprawy
  47 + map<int, InterpsGroup> results;
41 48 for (unsigned int i = 0; i < interpsNum; ++i) {
42 49 EncodedInterpretation interp;
43 50 deserializeInterp(currPtr, interp);
44   - interps.push_back(interp);
  51 + if (results.count(interp.type) == 0) {
  52 + results[interp.type] = InterpsGroup(interp.type);
  53 + }
  54 + results[interp.type].addInterpretation(interp);
  55 +// interps.push_back(interp);
  56 + }
  57 + for (auto& kv: results) {
  58 + interps.push_back(kv.second);
45 59 }
46 60 return currPtr - ptr;
47 61 }
48 62  
  63 +//static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) {
  64 +// // XXX uważać na poprawność danych
  65 +// lemma.suffixToCut = *ptr;
  66 +// ptr++;
  67 +// lemma.suffixToAdd = (const char*) ptr;
  68 +// ptr += strlen((const char*) ptr) + 1;
  69 +//}
  70 +//
  71 +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  72 +// interp.type = *ptr;
  73 +// ptr++;
  74 +// deserializeLemma(ptr, interp.lemma);
  75 +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  76 +// ptr += 2;
  77 +// interp.nameClassifier = *ptr;
  78 +// ptr++;
  79 +//}
  80 +//
  81 +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const {
  82 +// const unsigned char* currPtr = ptr;
  83 +// uint8_t interpsNum = *ptr;
  84 +// interps.clear();
  85 +// interps.reserve(interpsNum);
  86 +// currPtr++;
  87 +// for (unsigned int i = 0; i < interpsNum; ++i) {
  88 +// EncodedInterpretation interp;
  89 +// deserializeInterp(currPtr, interp);
  90 +// interps.push_back(interp);
  91 +// }
  92 +// return currPtr - ptr;
  93 +//}
  94 +
... ...
morfeusz/MorphDeserializer.hpp
... ... @@ -10,19 +10,31 @@
10 10  
11 11 #include <vector>
12 12 #include "fsa.hpp"
13   -#include "EncodedInterpretation.hpp"
  13 +#include "InterpsGroup.hpp"
14 14  
15   -class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> {
  15 +class MorphDeserializer: public Deserializer<std::vector<InterpsGroup>> {
16 16 public:
17 17 MorphDeserializer();
18 18 MorphDeserializer(const MorphDeserializer& orig);
19 19 virtual ~MorphDeserializer();
20 20 long deserialize(
21 21 const unsigned char* ptr,
22   - std::vector<EncodedInterpretation>& interps) const;
  22 + std::vector<InterpsGroup>& interps) const;
23 23 private:
24 24  
25 25 };
26 26  
  27 +//class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> {
  28 +//public:
  29 +// MorphDeserializer();
  30 +// MorphDeserializer(const MorphDeserializer& orig);
  31 +// virtual ~MorphDeserializer();
  32 +// long deserialize(
  33 +// const unsigned char* ptr,
  34 +// std::vector<EncodedInterpretation>& interps) const;
  35 +//private:
  36 +//
  37 +//};
  38 +
27 39 #endif /* MORPHDESERIALIZER_HPP */
28 40  
... ...
morfeusz/MorphInterpretation.hpp
... ... @@ -36,8 +36,8 @@ private:
36 36 std::string lemma;
37 37 int tagnum;
38 38 int namenum;
39   - const std::string& tag;
40   - const std::string& name;
  39 + std::string tag;
  40 + std::string name;
41 41 };
42 42  
43 43 #endif /* MORPHINTERPRETATION_HPP */
... ...
morfeusz/encoding/CharsetConverter.cpp renamed to morfeusz/charset/CharsetConverter.cpp
... ... @@ -11,6 +11,6 @@
11 11 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
12 12 return utf8::next(it, end);
13 13 }
14   -const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const {
  14 +char* UTF8CharsetConverter::append(uint32_t cp, char* result) const {
15 15 return utf8::append(cp, result);
16 16 }
... ...
morfeusz/encoding/CharsetConverter.hpp renamed to morfeusz/charset/CharsetConverter.hpp
... ... @@ -11,35 +11,35 @@
11 11 class CharsetConverter {
12 12 public:
13 13 virtual uint32_t next(const char*& it, const char* end) const = 0;
14   - virtual const char* append(uint32_t cp, const char* result) const = 0;
  14 + virtual char* append(uint32_t cp, char* result) const = 0;
15 15 private:
16 16 };
17 17  
18 18 class UTF8CharsetConverter: public CharsetConverter {
19 19 public:
20 20 uint32_t next(const char*& it, const char* end) const;
21   - const char* append(uint32_t cp, const char* result) const;
  21 + char* append(uint32_t cp, char* result) const;
22 22 private:
23 23 };
24 24  
25 25 class UTF16CharsetConverter: public CharsetConverter {
26 26 public:
27 27 uint32_t next(const char*& it, const char* end) const;
28   - const char* append(uint32_t cp, const char* result) const;
  28 + char* append(uint32_t cp, char* result) const;
29 29 private:
30 30 };
31 31  
32 32 class UTF32CharsetConverter: public CharsetConverter {
33 33 public:
34 34 uint32_t next(const char*& it, const char* end) const;
35   - const char* append(uint32_t cp, const char* result) const;
  35 + char* append(uint32_t cp, char* result) const;
36 36 private:
37 37 };
38 38  
39 39 class ISO8859_2_CharsetConverter: public CharsetConverter {
40 40 public:
41 41 uint32_t next(const char*& it, const char* end) const;
42   - const char* append(uint32_t cp, const char* result) const;
  42 + char* append(uint32_t cp, char* result) const;
43 43 private:
44 44 };
45 45  
... ...
morfeusz/charset/charset_utils.hpp 0 → 100644
  1 +/*
  2 + * File: charset_utils.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 15, 2013, 1:57 PM
  6 + */
  7 +
  8 +#ifndef CHARSET_UTILS_HPP
  9 +#define CHARSET_UTILS_HPP
  10 +
  11 +
  12 +
  13 +#endif /* CHARSET_UTILS_HPP */
  14 +
... ...
morfeusz/encoding/utf8.h renamed to morfeusz/charset/utf8.h
morfeusz/encoding/utf8/checked.h renamed to morfeusz/charset/utf8/checked.h
morfeusz/encoding/utf8/core.h renamed to morfeusz/charset/utf8/core.h
morfeusz/encoding/utf8/unchecked.h renamed to morfeusz/charset/utf8/unchecked.h
morfeusz/test_morph.cpp
... ... @@ -18,7 +18,7 @@
18 18 using namespace std;
19 19  
20 20 void doTest(
21   - const FSA<vector<EncodedInterpretation>>& fsa,
  21 + const FSA<vector<InterpsGroup>>& fsa,
22 22 const Tagset& tagset,
23 23 // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter,
24 24 const char* fname) {
... ... @@ -32,14 +32,15 @@ void doTest(
32 32 string lemma = splitVector[1];
33 33 string tag = splitVector[2];
34 34 string name = splitVector[3];
35   - vector<EncodedInterpretation> value2;
  35 + vector<InterpsGroup> value2;
36 36 fsa.tryToRecognize(orth.c_str(), value2);
37 37 DEBUG("recognized "+to_string(value2.size()));
38 38 // vector<TaggedInterpretation> parsedValues;
39 39 bool found = false;
40   - for (EncodedInterpretation encodedInterp: value2) {
  40 + for (InterpsGroup gi: value2)
  41 + for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) {
41 42 // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp);
42   - MorphInterpretation interp(0, 0, orth, encodedInterp, tagset);
  43 +// (0, 0, orth, encodedInterp, tagset);
43 44 // parsedValues.push_back(parsedValue);
44 45 // debug(orth, parsedValue);
45 46 if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) {
... ... @@ -62,10 +63,7 @@ int main(int argc, char** argv) {
62 63 validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename.");
63 64 const unsigned char* fsaData = readFile(argv[1]);
64 65 MorphDeserializer deserializer;
65   - DEBUG("will read FSA");
66   - FSA<vector<EncodedInterpretation>>* fsa = FSA<vector<EncodedInterpretation>>::getFSA(fsaData, deserializer);
67   - DEBUG("DONE read FSA");
68   - DEBUG("will read tagset");
  66 + FSA<vector<InterpsGroup>>* fsa = FSA<vector<InterpsGroup>>::getFSA(fsaData, deserializer);
69 67 Tagset tagset(fsaData);
70 68 // TaggedInterpretationsDecoder interpsDecoder(tagset);
71 69 DEBUG("DONE read tagset");
... ...
nbproject/configurations.xml
... ... @@ -8,11 +8,13 @@
8 8 <in>test_speed.cpp</in>
9 9 </df>
10 10 <df root="morfeusz" name="1">
11   - <df name="encoding">
  11 + <df name="charset">
12 12 <in>CharsetConverter.cpp</in>
13   - <in>CharsetConverter.hpp</in>
  13 + <in>charset_utils.hpp</in>
14 14 </df>
  15 + <in>InterpsGroup.hpp</in>
15 16 <in>Morfeusz.cpp</in>
  17 + <in>Morfeusz_impl.hpp</in>
16 18 <in>MorphDeserializer.cpp</in>
17 19 <in>MorphInterpretation.cpp</in>
18 20 <in>Tagset.cpp</in>
... ... @@ -51,11 +53,19 @@
51 53 <executablePath>build/fsa/test_dict</executablePath>
52 54 </makeTool>
53 55 </makefileType>
54   - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4">
  56 + <folder path="1">
55 57 <ccTool>
56 58 <incDir>
57 59 <pElem>fsa</pElem>
58   - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
  60 + <pElem>build/morfeusz</pElem>
  61 + </incDir>
  62 + </ccTool>
  63 + </folder>
  64 + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8">
  65 + <ccTool>
  66 + <incDir>
  67 + <pElem>fsa</pElem>
  68 + <pElem>build/morfeusz</pElem>
59 69 </incDir>
60 70 </ccTool>
61 71 </item>
... ... @@ -80,86 +90,45 @@
80 90 </incDir>
81 91 </ccTool>
82 92 </item>
  93 + <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0">
  94 + </item>
83 95 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8">
84 96 <ccTool>
85   - <incDir>
86   - <pElem>fsa</pElem>
87   - <pElem>build/morfeusz</pElem>
88   - </incDir>
89 97 </ccTool>
90 98 </item>
  99 + <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0">
  100 + </item>
91 101 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
92 102 <ccTool>
93   - <incDir>
94   - <pElem>fsa</pElem>
95   - <pElem>build/morfeusz</pElem>
96   - </incDir>
97 103 </ccTool>
98 104 </item>
99   - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4">
  105 + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8">
100 106 <ccTool>
101   - <incDir>
102   - <pElem>morfeusz</pElem>
103   - <pElem>/usr/include/c++/4.8/bits</pElem>
104   - <pElem>/usr/include/c++/4.8/ext</pElem>
105   - <pElem>/usr/include/c++/4.8</pElem>
106   - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>
107   - <pElem>/usr/include/c++/4.8/debug</pElem>
108   - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
109   - <pElem>/usr/include/c++/4.8/backward</pElem>
110   - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>
111   - <pElem>build/morfeusz</pElem>
112   - </incDir>
113 107 </ccTool>
114 108 </item>
115   - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
  109 + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
116 110 <ccTool>
117   - <incDir>
118   - <pElem>morfeusz</pElem>
119   - <pElem>/usr/include/c++/4.8/bits</pElem>
120   - <pElem>/usr/include/c++/4.8/ext</pElem>
121   - <pElem>/usr/include/c++/4.8</pElem>
122   - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>
123   - <pElem>/usr/include/c++/4.8/debug</pElem>
124   - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
125   - <pElem>fsa</pElem>
126   - <pElem>/usr/include/c++/4.8/backward</pElem>
127   - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>
128   - <pElem>build/morfeusz</pElem>
129   - </incDir>
130 111 </ccTool>
131 112 </item>
132   - <item path="morfeusz/encoding/CharsetConverter.cpp"
  113 + <item path="morfeusz/charset/CharsetConverter.cpp"
133 114 ex="false"
134 115 tool="1"
135   - flavor2="0">
  116 + flavor2="8">
  117 + <ccTool>
  118 + </ccTool>
136 119 </item>
137   - <item path="morfeusz/encoding/CharsetConverter.hpp"
138   - ex="false"
139   - tool="3"
140   - flavor2="0">
  120 + <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0">
141 121 </item>
142 122 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
143 123 <ccTool>
144   - <incDir>
145   - <pElem>fsa</pElem>
146   - <pElem>build/morfeusz</pElem>
147   - </incDir>
148 124 </ccTool>
149 125 </item>
150 126 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
151 127 <ccTool>
152   - <incDir>
153   - <pElem>morfeusz</pElem>
154   - </incDir>
155 128 </ccTool>
156 129 </item>
157 130 <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
158 131 <ccTool>
159   - <incDir>
160   - <pElem>fsa</pElem>
161   - <pElem>build/morfeusz</pElem>
162   - </incDir>
163 132 </ccTool>
164 133 </item>
165 134 </conf>
... ...