diff --git a/fsabuilder/fsa/common.py b/fsabuilder/fsa/common.py index 50fe997..6f83cfc 100644 --- a/fsabuilder/fsa/common.py +++ b/fsabuilder/fsa/common.py @@ -14,7 +14,7 @@ class Lemma(object): class Interpretation(object): - def __init__(self, orth, base, tagnum, namenum, encoder): + def __init__(self, orth, base, tagnum, namenum, typenum, encoder): assert type(orth) == unicode assert type(base) == unicode root = u'' @@ -29,6 +29,7 @@ class Interpretation(object): suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) self.tagnum = tagnum self.namenum = namenum + self.typenum = typenum def getSortKey(self): return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) diff --git a/fsabuilder/fsa/convertinput.py b/fsabuilder/fsa/convertinput.py index 9597d2b..1b0a01d 100644 --- a/fsabuilder/fsa/convertinput.py +++ b/fsabuilder/fsa/convertinput.py @@ -6,6 +6,398 @@ Created on Oct 23, 2013 import logging from common import Interpretation +tag2typenum = { + 'aglt:sg:pri:imperf:nwok': 12, + 'aglt:sg:pri:imperf:wok': 12, + 'aglt:sg:sec:imperf:nwok': 12, + 'aglt:sg:sec:imperf:wok': 12, + 'aglt:pl:pri:imperf:nwok': 13, + 'aglt:pl:pri:imperf:wok': 13, + 'aglt:pl:sec:imperf:nwok': 13, + 'aglt:pl:sec:imperf:wok': 13, + 'praet:sg:m1.m2.m3:imperf:agl': 7, + 'praet:sg:m1.m2.m3:imperf.perf:agl': 7, + 'praet:sg:m1.m2.m3:perf:agl': 7, + 'praet:sg:m1.m2.m3:imperf:nagl': 16, + 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16, + 'praet:sg:m1.m2.m3:perf:nagl': 16, + 'praet:sg:f:imperf': 20, + 'praet:sg:f:imperf.perf': 20, + 'praet:sg:f:perf': 20, + 'praet:sg:m1.m2.m3:imperf': 20, + 'praet:sg:m1.m2.m3:imperf.perf': 20, + 'praet:sg:m1.m2.m3:perf': 20, + 'praet:sg:n1.n2:imperf': 20, + 'praet:sg:n1.n2:imperf.perf': 20, + 'praet:sg:n1.n2:perf': 20, + 'praet:pl:m1.p1:imperf': 21, + 'praet:pl:m1.p1:imperf.perf': 21, + 'praet:pl:m1.p1:perf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21, + 'naj': 10, + 'nie': 5, + 'adj:pl:acc:m1.p1:pos': 1, + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:pl:nom.voc:m1.p1:pos': 1, + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:sg:acc:m1.m2:pos': 1, + 'adj:sg:acc:n1.n2:pos': 1, + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1, + 'adj:sg:nom.voc:n1.n2:pos': 1, + 'adj:sg:acc:f:pos': 1, + 'adj:sg:acc.inst:f:pos': 1, + 'adj:sg:acc:m1.m2:pos': 1, + 'adj:sg:acc:m3:pos': 1, + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:gen.dat.loc:f:pos': 1, + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:nom.voc.acc:n1.n2:pos': 1, + 'adj:sg:nom.voc:f:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, + 'adj:pl:acc:f:pos': 1, + 'adj:pl:acc:m1:pos': 1, + 'adj:pl:acc:m2:pos': 1, + 'adj:pl:acc:m3:pos': 1, + 'adj:pl:acc:n1:pos': 1, + 'adj:pl:acc:n2:pos': 1, + 'adj:pl:acc:p1:pos': 1, + 'adj:pl:acc:p2:pos': 1, + 'adj:pl:acc:p3:pos': 1, + 'adj:pl:dat:f:pos': 1, + 'adj:pl:dat:m1:pos': 1, + 'adj:pl:dat:m2:pos': 1, + 'adj:pl:dat:m3:pos': 1, + 'adj:pl:dat:n1:pos': 1, + 'adj:pl:dat:n2:pos': 1, + 'adj:pl:dat:p1:pos': 1, + 'adj:pl:dat:p2:pos': 1, + 'adj:pl:dat:p3:pos': 1, + 'adj:pl:gen:f:pos': 1, + 'adj:pl:gen:m1:pos': 1, + 'adj:pl:gen:m2:pos': 1, + 'adj:pl:gen:m3:pos': 1, + 'adj:pl:gen:n1:pos': 1, + 'adj:pl:gen:n2:pos': 1, + 'adj:pl:gen:p1:pos': 1, + 'adj:pl:gen:p2:pos': 1, + 'adj:pl:gen:p3:pos': 1, + 'adj:pl:inst:f:pos': 1, + 'adj:pl:inst:m1:pos': 1, + 'adj:pl:inst:m2:pos': 1, + 'adj:pl:inst:m3:pos': 1, + 'adj:pl:inst:n1:pos': 1, + 'adj:pl:inst:n2:pos': 1, + 'adj:pl:inst:p1:pos': 1, + 'adj:pl:inst:p2:pos': 1, + 'adj:pl:inst:p3:pos': 1, + 'adj:pl:loc:f:pos': 1, + 'adj:pl:loc:m1:pos': 1, + 'adj:pl:loc:m2:pos': 1, + 'adj:pl:loc:m3:pos': 1, + 'adj:pl:loc:n1:pos': 1, + 'adj:pl:loc:n2:pos': 1, + 'adj:pl:loc:p1:pos': 1, + 'adj:pl:loc:p2:pos': 1, + 'adj:pl:loc:p3:pos': 1, + 'adj:pl:nom:f:pos': 1, + 'adj:pl:nom:m1:pos': 1, + 'adj:pl:nom:m2:pos': 1, + 'adj:pl:nom:m3:pos': 1, + 'adj:pl:nom:n1:pos': 1, + 'adj:pl:nom:n2:pos': 1, + 'adj:pl:nom:p1:pos': 1, + 'adj:pl:nom:p2:pos': 1, + 'adj:pl:nom:p3:pos': 1, + 'adj:sg:acc:f:pos': 1, + 'adj:sg:acc:m1:pos': 1, + 'adj:sg:acc:m2:pos': 1, + 'adj:sg:acc:m3:pos': 1, + 'adj:sg:acc:n1:pos': 1, + 'adj:sg:acc:n2:pos': 1, + 'adj:sg:dat:f:pos': 1, + 'adj:sg:dat:m1:pos': 1, + 'adj:sg:dat:m2:pos': 1, + 'adj:sg:dat:m3:pos': 1, + 'adj:sg:dat:n1:pos': 1, + 'adj:sg:dat:n2:pos': 1, + 'adj:sg:gen:f:pos': 1, + 'adj:sg:gen:m1:pos': 1, + 'adj:sg:gen:m2:pos': 1, + 'adj:sg:gen:m3:pos': 1, + 'adj:sg:gen:n1:pos': 1, + 'adj:sg:gen:n2:pos': 1, + 'adj:sg:inst:f:pos': 1, + 'adj:sg:inst:m1:pos': 1, + 'adj:sg:inst:m2:pos': 1, + 'adj:sg:inst:m3:pos': 1, + 'adj:sg:inst:n1:pos': 1, + 'adj:sg:inst:n2:pos': 1, + 'adj:sg:loc:f:pos': 1, + 'adj:sg:loc:m1:pos': 1, + 'adj:sg:loc:m2:pos': 1, + 'adj:sg:loc:m3:pos': 1, + 'adj:sg:loc:n1:pos': 1, + 'adj:sg:loc:n2:pos': 1, + 'adj:sg:nom:f:pos': 1, + 'adj:sg:nom:m1:pos': 1, + 'adj:sg:nom:m2:pos': 1, + 'adj:sg:nom:m3:pos': 1, + 'adj:sg:nom:n1:pos': 1, + 'adj:sg:nom:n2:pos': 1, + 'adj:pl:acc:m1.p1:sup': 19, + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:pl:nom.voc:m1.p1:sup': 19, + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:sg:acc:f:sup': 19, + 'adj:sg:acc.inst:f:sup': 19, + 'adj:sg:acc:m1.m2:sup': 19, + 'adj:sg:acc:m3:sup': 19, + 'adj:sg:acc:n1.n2:sup': 19, + 'adj:sg:dat:f:sup': 19, + 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:gen:f:sup': 19, + 'adj:sg:gen.dat.loc:f:sup': 19, + 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:inst:f:sup': 19, + 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:loc:f:sup': 19, + 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:nom.acc:n1.n2:sup': 19, + 'adj:sg:nom.voc:f:sup': 19, + 'adj:sg:nom.voc:m1.m2.m3:sup': 19, + 'adj:sg:nom.voc:n1.n2:sup': 19, + 'adj:pl:acc:f:sup': 19, + 'adj:pl:acc:m1:sup': 19, + 'adj:pl:acc:m2:sup': 19, + 'adj:pl:acc:m3:sup': 19, + 'adj:pl:acc:n1:sup': 19, + 'adj:pl:acc:n2:sup': 19, + 'adj:pl:acc:p1:sup': 19, + 'adj:pl:acc:p2:sup': 19, + 'adj:pl:acc:p3:sup': 19, + 'adj:pl:dat:f:sup': 19, + 'adj:pl:dat:m1:sup': 19, + 'adj:pl:dat:m2:sup': 19, + 'adj:pl:dat:m3:sup': 19, + 'adj:pl:dat:n1:sup': 19, + 'adj:pl:dat:n2:sup': 19, + 'adj:pl:dat:p1:sup': 19, + 'adj:pl:dat:p2:sup': 19, + 'adj:pl:dat:p3:sup': 19, + 'adj:pl:gen:f:sup': 19, + 'adj:pl:gen:m1:sup': 19, + 'adj:pl:gen:m2:sup': 19, + 'adj:pl:gen:m3:sup': 19, + 'adj:pl:gen:n1:sup': 19, + 'adj:pl:gen:n2:sup': 19, + 'adj:pl:gen:p1:sup': 19, + 'adj:pl:gen:p2:sup': 19, + 'adj:pl:gen:p3:sup': 19, + 'adj:pl:inst:f:sup': 19, + 'adj:pl:inst:m1:sup': 19, + 'adj:pl:inst:m2:sup': 19, + 'adj:pl:inst:m3:sup': 19, + 'adj:pl:inst:n1:sup': 19, + 'adj:pl:inst:n2:sup': 19, + 'adj:pl:inst:p1:sup': 19, + 'adj:pl:inst:p2:sup': 19, + 'adj:pl:inst:p3:sup': 19, + 'adj:pl:loc:f:sup': 19, + 'adj:pl:loc:m1:sup': 19, + 'adj:pl:loc:m2:sup': 19, + 'adj:pl:loc:m3:sup': 19, + 'adj:pl:loc:n1:sup': 19, + 'adj:pl:loc:n2:sup': 19, + 'adj:pl:loc:p1:sup': 19, + 'adj:pl:loc:p2:sup': 19, + 'adj:pl:loc:p3:sup': 19, + 'adj:pl:nom:f:sup': 19, + 'adj:pl:nom:m1:sup': 19, + 'adj:pl:nom:m2:sup': 19, + 'adj:pl:nom:m3:sup': 19, + 'adj:pl:nom:n1:sup': 19, + 'adj:pl:nom:n2:sup': 19, + 'adj:pl:nom:p1:sup': 19, + 'adj:pl:nom:p2:sup': 19, + 'adj:pl:nom:p3:sup': 19, + 'adj:sg:acc:f:sup': 19, + 'adj:sg:acc:m1:sup': 19, + 'adj:sg:acc:m2:sup': 19, + 'adj:sg:acc:m3:sup': 19, + 'adj:sg:acc:n1:sup': 19, + 'adj:sg:acc:n2:sup': 19, + 'adj:sg:dat:f:sup': 19, + 'adj:sg:dat:m1:sup': 19, + 'adj:sg:dat:m2:sup': 19, + 'adj:sg:dat:m3:sup': 19, + 'adj:sg:dat:n1:sup': 19, + 'adj:sg:dat:n2:sup': 19, + 'adj:sg:gen:f:sup': 19, + 'adj:sg:gen:m1:sup': 19, + 'adj:sg:gen:m2:sup': 19, + 'adj:sg:gen:m3:sup': 19, + 'adj:sg:gen:n1:sup': 19, + 'adj:sg:gen:n2:sup': 19, + 'adj:sg:inst:f:sup': 19, + 'adj:sg:inst:m1:sup': 19, + 'adj:sg:inst:m2:sup': 19, + 'adj:sg:inst:m3:sup': 19, + 'adj:sg:inst:n1:sup': 19, + 'adj:sg:inst:n2:sup': 19, + 'adj:sg:loc:f:sup': 19, + 'adj:sg:loc:m1:sup': 19, + 'adj:sg:loc:m2:sup': 19, + 'adj:sg:loc:m3:sup': 19, + 'adj:sg:loc:n1:sup': 19, + 'adj:sg:loc:n2:sup': 19, + 'adj:sg:nom:f:sup': 19, + 'adj:sg:nom:m1:sup': 19, + 'adj:sg:nom:m2:sup': 19, + 'adj:sg:nom:m3:sup': 19, + 'adj:sg:nom:n1:sup': 19, + 'adj:sg:nom:n2:sup': 19, + 'adv:sup': 19, + 'winien:sg:m1.m2.m3:imperf': 3, + 'winien:sg:f:imperf': 3, + 'winien:sg:n1.n2:imperf': 3, + 'winien:pl:m1.p1:imperf': 3, + 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3, + 'adja': 15, + 'ger:sg:dat.loc:n2:imperf:neg': 18, + 'ger:sg:dat.loc:n2:imperf.perf:neg': 18, + 'ger:sg:dat.loc:n2:perf:neg': 18, + 'ger:sg:gen:n2:imperf:neg': 18, + 'ger:sg:gen:n2:imperf.perf:neg': 18, + 'ger:sg:gen:n2:perf:neg': 18, + 'ger:sg:inst:n2:imperf:neg': 18, + 'ger:sg:inst:n2:imperf.perf:neg': 18, + 'ger:sg:inst:n2:perf:neg': 18, + 'ger:sg:nom.acc:n2:imperf:neg': 18, + 'ger:sg:nom.acc:n2:imperf.perf:neg': 18, + 'ger:sg:nom.acc:n2:perf:neg': 18, + 'pact:pl:acc:m1.p1:imperf:neg': 18, + 'pact:pl:acc:m1.p1:imperf.perf:neg': 18, + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, + 'pact:pl:nom:m1.p1:imperf:neg': 18, + 'pact:pl:nom:m1.p1:imperf.perf:neg': 18, + 'pact:sg:acc.inst:f:imperf:neg': 18, + 'pact:sg:acc.inst:f:imperf.perf:neg': 18, + 'pact:sg:acc:m1.m2:imperf:neg': 18, + 'pact:sg:acc:m1.m2:imperf.perf:neg': 18, + 'pact:sg:acc:m3:imperf:neg': 18, + 'pact:sg:acc:m3:imperf.perf:neg': 18, + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:gen.dat.loc:f:imperf:neg': 18, + 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18, + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:nom.acc:n1.n2:imperf:neg': 18, + 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18, + 'pact:sg:nom:f:imperf:neg': 18, + 'pact:sg:nom:f:imperf.perf:neg': 18, + 'pact:sg:nom:m1.m2.m3:imperf:neg': 18, + 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18, + 'ppas:pl:acc:m1.p1:imperf:neg': 18, + 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18, + 'ppas:pl:acc:m1.p1:perf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18, + 'ppas:pl:nom:m1.p1:imperf:neg': 18, + 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18, + 'ppas:pl:nom:m1.p1:perf:neg': 18, + 'ppas:sg:acc.inst:f:imperf:neg': 18, + 'ppas:sg:acc.inst:f:imperf.perf:neg': 18, + 'ppas:sg:acc.inst:f:perf:neg': 18, + 'ppas:sg:acc:m1.m2:imperf:neg': 18, + 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18, + 'ppas:sg:acc:m1.m2:perf:neg': 18, + 'ppas:sg:acc:m3:imperf:neg': 18, + 'ppas:sg:acc:m3:imperf.perf:neg': 18, + 'ppas:sg:acc:m3:perf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:gen.dat.loc:f:imperf:neg': 18, + 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18, + 'ppas:sg:gen.dat.loc:f:perf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:perf:neg': 18, + 'ppas:sg:nom:f:imperf:neg': 18, + 'ppas:sg:nom:f:imperf.perf:neg': 18, + 'ppas:sg:nom:f:perf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:perf:neg': 18, + 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8, + 'prep:acc': 6, + 'prep:acc:wok': 6, + 'prep:acc.inst': 6, + 'prep:acc.inst:wok': 6, + 'prep:inst.acc': 6, + 'prep:inst.acc:wok': 6, + 'prep:inst.gen.acc:wok': 6, + 'prep:acc.loc': 6, + 'prep:acc.loc:wok': 6, + 'prep:loc.acc': 6, + 'prep:loc.acc:wok': 6, + 'prep:gen': 6, + 'prep:gen.dat': 6, + 'prep:gen:wok': 6, + 'prep:gen.inst:wok': 6, + 'brev:pun': 9, + 'brev:npun': 9, + 'intrj': 9, + 'burk': 9, +} + def _sortLines(inputLines, encoder): logging.info('sorting input...') lines = list(inputLines) @@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder): orth, base, tag, name = line.split(u'\t') tagnum = tagset.tag2tagnum[tag] namenum = tagset.name2namenum[name] - yield (orth, Interpretation(orth, base, tagnum, namenum, encoder)) + typenum = tag2typenum.get(tag, 0) + yield (orth, Interpretation(orth, base, tagnum, namenum, typenum, encoder)) def _mergeEntries(inputLines): prevOrth = None diff --git a/fsabuilder/fsa/encode.py b/fsabuilder/fsa/encode.py index 059ee4d..26c2a2e 100644 --- a/fsabuilder/fsa/encode.py +++ b/fsabuilder/fsa/encode.py @@ -61,11 +61,16 @@ class MorphEncoder(Encoder): res.append(firstByte) assert type(interpsList) == frozenset for interp in sorted(interpsList, key=lambda i: i.getSortKey()): + res.extend(self._encodeTypeNum(interp.typenum)) res.extend(self._encodeLemma(interp.lemma)) res.extend(self._encodeTagNum(interp.tagnum)) res.extend(self._encodeNameNum(interp.namenum)) return res + def _encodeTypeNum(self, typenum): + assert typenum >= 0 and typenum < 256 + return bytearray([typenum]) + def _encodeLemma(self, lemma): res = bytearray() assert lemma.cutLength < 256 and lemma.cutLength >= 0 diff --git a/fsabuilder/fsa/serializer.py b/fsabuilder/fsa/serializer.py index 0cf0b3d..1a285a0 100644 --- a/fsabuilder/fsa/serializer.py +++ b/fsabuilder/fsa/serializer.py @@ -40,7 +40,6 @@ class Serializer(object): raise NotImplementedError('Not implemented') def fsa2bytearray(self): - res = bytearray() res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 60f6ac9..b8dc4f7 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) add_executable (morfeusz2_analyze main.cpp) add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp) # Link the executable to the Hello library. target_link_libraries (morfeusz2_analyze morfeusz2) diff --git a/morfeusz/EncodedInterpretation.hpp b/morfeusz/EncodedInterpretation.hpp index 4f2d18f..f969cf6 100644 --- a/morfeusz/EncodedInterpretation.hpp +++ b/morfeusz/EncodedInterpretation.hpp @@ -28,8 +28,11 @@ struct EncodedLemma { */ struct EncodedInterpretation { EncodedLemma lemma; + int type; int tag; int nameClassifier; + int startNode; + int endNode; }; #endif /* INTERPRETATION_HPP */ diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp new file mode 100644 index 0000000..c355f2d --- /dev/null +++ b/morfeusz/InterpsGroup.hpp @@ -0,0 +1,49 @@ +/* + * File: GroupedInterpretations.hpp + * Author: lennyn + * + * Created on November 16, 2013, 7:58 PM + */ + +#ifndef GROUPEDINTERPRETATIONS_HPP +#define GROUPEDINTERPRETATIONS_HPP + +#include <vector> +#include <string> +#include "EncodedInterpretation.hpp" +#include "MorphInterpretation.hpp" +#include "Tagset.hpp" + +class InterpsGroup { +public: + + InterpsGroup() { + + } + + explicit InterpsGroup(const int type) + : type(type) { + + } + + std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) { + std::vector<MorphInterpretation> res; + for (EncodedInterpretation& ei: interps) { + res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset)); + } + return res; + } + + void addInterpretation(const EncodedInterpretation& interp) { + interps.push_back(interp); + } + + int type; + int startNode; + int endNode; +private: + std::vector<EncodedInterpretation> interps; +}; + +#endif /* GROUPEDINTERPRETATIONS_HPP */ + diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index bedb63c..1439f44 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -6,17 +6,18 @@ */ #include <string> +#include "fsa.hpp" #include "utils.hpp" #include "Morfeusz.hpp" #include "MorphDeserializer.hpp" -#include "encoding/CharsetConverter.hpp" +#include "charset/CharsetConverter.hpp" using namespace std; -static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) { - static Deserializer<vector<EncodedInterpretation>>* deserializer - = new MorphDeserializer(); - return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); +static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { + static Deserializer < vector < InterpsGroup >> *deserializer + = new MorphDeserializer(); + return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); } static CharsetConverter* initializeCharsetConverter() { @@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() { Morfeusz::Morfeusz(const string& filename) : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { - + } //Morfeusz::Morfeusz(const Morfeusz& orig) { @@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() { delete &this->fsa; } -AnalyzeResult Morfeusz::analyze(const std::string& text) { - const char* textStart = text.c_str(); - const char* textEnd = text.c_str() + text.length(); - AnalyzeResult res = { - ResultsIterator(textStart, textEnd, *this), - ResultsIterator(textEnd, textEnd, *this)}; - return res; +ResultsIterator Morfeusz::analyze(const std::string& text) { +// const char* textStart = text.c_str(); +// const char* textEnd = text.c_str() + text.length(); + return ResultsIterator(text, *this); +} + +ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) +: rawInput(text.c_str()), +morfeusz(morfeusz) { +} + +MorphInterpretation ResultsIterator::getNext() { +// if (resultsBuffer.empty()) { +// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer)); +// } +// startNode = resultsBuffer.back().getEndNode(); +// MorphInterpretation res = resultsBuffer.front(); +// resultsBuffer.pop_front(); +// return res; +} + +bool ResultsIterator::hasNext() { + return rawInput[0] != '\0' && resultsBuffer.empty(); } +//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const { +// assert(inputPtr[0] != '\0'); +// const char* start = inputPtr; +// StateType state = fsa->getInitialState(); +// int currNodeNum = startNodeNum; +// int codepoint = this->charsetConverter->next(inputPtr, inputEnd); +// assert(!isEndOfWord(codepoint)); +// while(!isEndOfWord(codepoint)) { +// feedState(state, codepoint); +// if (state.isAccepting()) { +// const char* currInputPtr = inputPtr; +// vector<EncodedInterpretation> startInterps = state.getValue(); +// filterOutNonGluableInterps(startInterps); +// if (!startInterps.empty()) { +// +// } +// vector<EncodedInterpretation> additionalInterps; +// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps); +// if (!additionalInterps.empty()) { +// for (EncodedInterpretation& interp: state.getValue()) { +// interp.startNode = currNodeNum; +// interp.endNode = currNodeNum + 1; +// interps.push_back(interp); +// } +// +// } +// } +// } +//} diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 9b5768a..e4671cb 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -9,53 +9,78 @@ #define MORFEUSZ_HPP #include <string> +#include <list> #include <vector> #include "EncodedInterpretation.hpp" #include "fsa.hpp" #include "MorphInterpretation.hpp" -#include "encoding/CharsetConverter.hpp" +#include "InterpsGroup.hpp" +#include "charset/CharsetConverter.hpp" class Morfeusz; -class AnalyzeResult; +//class AnalyzeResult; class ResultsIterator; +typedef FSA<std::vector<InterpsGroup>> FSAType; +typedef State<std::vector<InterpsGroup>> StateType; + class Morfeusz { public: explicit Morfeusz(const std::string& filename); virtual ~Morfeusz(); // Morfeusz(const Morfeusz& orig); - AnalyzeResult analyze(const std::string& text); + ResultsIterator analyze(const std::string& text); // Morfeusz(); + friend class ResultsIterator; private: - void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps); - const FSA<std::vector<EncodedInterpretation>>* fsa; + template <class OutputIterator> +// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const; + + int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const; + + const FSAType* fsa; CharsetConverter* charsetConverter; }; +#include "Morfeusz_impl.hpp" + class ResultsIterator { public: - ResultsIterator( - const char* startOfInput, - const char* endOfInput, - const Morfeusz& morfeusz); - virtual ~ResultsIterator(); -// ResultsIterator(int* x); - ResultsIterator(const ResultsIterator& mit); - ResultsIterator& operator++(); - ResultsIterator operator++(int); - bool operator==(const ResultsIterator& rhs); - bool operator!=(const ResultsIterator& rhs); - MorphInterpretation& operator*(); + ResultsIterator(const std::string& text, const Morfeusz& morfeusz); + MorphInterpretation getNext(); + bool hasNext(); private: const char* rawInput; - const char* endOfInput; + const Morfeusz& morfeusz; + std::list<MorphInterpretation> resultsBuffer; + int startNode; }; -struct AnalyzeResult { - ResultsIterator iterator; - const ResultsIterator end; -}; +//class ResultsIterator { +//public: +// ResultsIterator( +// const char* startOfInput, +// const char* endOfInput, +// const Morfeusz& morfeusz); +// virtual ~ResultsIterator(); +// ResultsIterator(const ResultsIterator& mit); +// ResultsIterator& operator++(); +// ResultsIterator operator++(int); +// bool operator==(const ResultsIterator& rhs); +// bool operator!=(const ResultsIterator& rhs); +// MorphInterpretation& operator*(); +//private: +// const char* rawInput; +// const char* endOfInput; +// const Morfeusz& morfeusz; +// vector<MorphInterpretation> resultsBuffer; +//}; + +//struct AnalyzeResult { +// ResultsIterator iterator; +// const ResultsIterator end; +//}; #endif /* MORFEUSZ_HPP */ diff --git a/morfeusz/Morfeusz_impl.hpp b/morfeusz/Morfeusz_impl.hpp new file mode 100644 index 0000000..aebf57f --- /dev/null +++ b/morfeusz/Morfeusz_impl.hpp @@ -0,0 +1,42 @@ +/* + * File: Morfeusz_impl.hpp + * Author: lennyn + * + * Created on November 15, 2013, 1:43 PM + */ + +#ifndef MORFEUSZ_IMPL_HPP +#define MORFEUSZ_IMPL_HPP + +#include <cassert> +#include "Morfeusz.hpp" + +//template <class OutputIterator> +//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const { +// if (inputData == inputEnd) { +// return; +// } +// const char* start = inputData; +// StateType state = fsa->getInitialState(); +// int currNodeNum = startNodeNum; +// do { +// int codepoint = this->charsetConverter->next(inputData, inputEnd); +// if (!isSpace(codepoint) && codepoint != 0) { +// feedAutomaton(state, codepoint); +// if (state.isAccepting()) { +// int currInput = inputData; +// vector<MorphInterpretation> additionalInterps; +// processOneWord( +// currInput, inputEnd, +// currNodeNum + 1, +// back_inserter(additionalInterps), false); +// if (!additionalInterps.empty()) { +// currNodeNum = additionalInterps.back().getEndNode(); +// } +// } +// } +// } +//} + +#endif /* MORFEUSZ_IMPL_HPP */ + diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp index 8660443..0c9a601 100644 --- a/morfeusz/MorphDeserializer.cpp +++ b/morfeusz/MorphDeserializer.cpp @@ -5,7 +5,10 @@ * Created on 12 listopad 2013, 15:31 */ +#include <map> #include "MorphDeserializer.hpp" +#include "EncodedInterpretation.hpp" +#include "InterpsGroup.hpp" MorphDeserializer::MorphDeserializer() { } @@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { } static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { + interp.type = *ptr; + ptr++; deserializeLemma(ptr, interp.lemma); interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); ptr += 2; @@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& ptr++; } -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { const unsigned char* currPtr = ptr; uint8_t interpsNum = *ptr; interps.clear(); interps.reserve(interpsNum); currPtr++; + // FIXME - to jest do poprawy + map<int, InterpsGroup> results; for (unsigned int i = 0; i < interpsNum; ++i) { EncodedInterpretation interp; deserializeInterp(currPtr, interp); - interps.push_back(interp); + if (results.count(interp.type) == 0) { + results[interp.type] = InterpsGroup(interp.type); + } + results[interp.type].addInterpretation(interp); +// interps.push_back(interp); + } + for (auto& kv: results) { + interps.push_back(kv.second); } return currPtr - ptr; } +//static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { +// // XXX uważać na poprawność danych +// lemma.suffixToCut = *ptr; +// ptr++; +// lemma.suffixToAdd = (const char*) ptr; +// ptr += strlen((const char*) ptr) + 1; +//} +// +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { +// interp.type = *ptr; +// ptr++; +// deserializeLemma(ptr, interp.lemma); +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); +// ptr += 2; +// interp.nameClassifier = *ptr; +// ptr++; +//} +// +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { +// const unsigned char* currPtr = ptr; +// uint8_t interpsNum = *ptr; +// interps.clear(); +// interps.reserve(interpsNum); +// currPtr++; +// for (unsigned int i = 0; i < interpsNum; ++i) { +// EncodedInterpretation interp; +// deserializeInterp(currPtr, interp); +// interps.push_back(interp); +// } +// return currPtr - ptr; +//} + diff --git a/morfeusz/MorphDeserializer.hpp b/morfeusz/MorphDeserializer.hpp index 5572193..cc1c646 100644 --- a/morfeusz/MorphDeserializer.hpp +++ b/morfeusz/MorphDeserializer.hpp @@ -10,19 +10,31 @@ #include <vector> #include "fsa.hpp" -#include "EncodedInterpretation.hpp" +#include "InterpsGroup.hpp" -class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { +class MorphDeserializer: public Deserializer<std::vector<InterpsGroup>> { public: MorphDeserializer(); MorphDeserializer(const MorphDeserializer& orig); virtual ~MorphDeserializer(); long deserialize( const unsigned char* ptr, - std::vector<EncodedInterpretation>& interps) const; + std::vector<InterpsGroup>& interps) const; private: }; +//class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { +//public: +// MorphDeserializer(); +// MorphDeserializer(const MorphDeserializer& orig); +// virtual ~MorphDeserializer(); +// long deserialize( +// const unsigned char* ptr, +// std::vector<EncodedInterpretation>& interps) const; +//private: +// +//}; + #endif /* MORPHDESERIALIZER_HPP */ diff --git a/morfeusz/MorphInterpretation.hpp b/morfeusz/MorphInterpretation.hpp index bbebd28..eab9c1a 100644 --- a/morfeusz/MorphInterpretation.hpp +++ b/morfeusz/MorphInterpretation.hpp @@ -36,8 +36,8 @@ private: std::string lemma; int tagnum; int namenum; - const std::string& tag; - const std::string& name; + std::string tag; + std::string name; }; #endif /* MORPHINTERPRETATION_HPP */ diff --git a/morfeusz/encoding/CharsetConverter.cpp b/morfeusz/charset/CharsetConverter.cpp index 358fb31..226353a 100644 --- a/morfeusz/encoding/CharsetConverter.cpp +++ b/morfeusz/charset/CharsetConverter.cpp @@ -11,6 +11,6 @@ uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { return utf8::next(it, end); } -const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { +char* UTF8CharsetConverter::append(uint32_t cp, char* result) const { return utf8::append(cp, result); } diff --git a/morfeusz/encoding/CharsetConverter.hpp b/morfeusz/charset/CharsetConverter.hpp index fa25f78..a58b4c1 100644 --- a/morfeusz/encoding/CharsetConverter.hpp +++ b/morfeusz/charset/CharsetConverter.hpp @@ -11,35 +11,35 @@ class CharsetConverter { public: virtual uint32_t next(const char*& it, const char* end) const = 0; - virtual const char* append(uint32_t cp, const char* result) const = 0; + virtual char* append(uint32_t cp, char* result) const = 0; private: }; class UTF8CharsetConverter: public CharsetConverter { public: uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; + char* append(uint32_t cp, char* result) const; private: }; class UTF16CharsetConverter: public CharsetConverter { public: uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; + char* append(uint32_t cp, char* result) const; private: }; class UTF32CharsetConverter: public CharsetConverter { public: uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; + char* append(uint32_t cp, char* result) const; private: }; class ISO8859_2_CharsetConverter: public CharsetConverter { public: uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; + char* append(uint32_t cp, char* result) const; private: }; diff --git a/morfeusz/charset/charset_utils.hpp b/morfeusz/charset/charset_utils.hpp new file mode 100644 index 0000000..c845441 --- /dev/null +++ b/morfeusz/charset/charset_utils.hpp @@ -0,0 +1,14 @@ +/* + * File: charset_utils.hpp + * Author: lennyn + * + * Created on November 15, 2013, 1:57 PM + */ + +#ifndef CHARSET_UTILS_HPP +#define CHARSET_UTILS_HPP + + + +#endif /* CHARSET_UTILS_HPP */ + diff --git a/morfeusz/encoding/utf8.h b/morfeusz/charset/utf8.h index 4e44514..4e44514 100644 --- a/morfeusz/encoding/utf8.h +++ b/morfeusz/charset/utf8.h diff --git a/morfeusz/encoding/utf8/checked.h b/morfeusz/charset/utf8/checked.h index 1331155..1331155 100644 --- a/morfeusz/encoding/utf8/checked.h +++ b/morfeusz/charset/utf8/checked.h diff --git a/morfeusz/encoding/utf8/core.h b/morfeusz/charset/utf8/core.h index d237583..d237583 100644 --- a/morfeusz/encoding/utf8/core.h +++ b/morfeusz/charset/utf8/core.h diff --git a/morfeusz/encoding/utf8/unchecked.h b/morfeusz/charset/utf8/unchecked.h index cb24271..cb24271 100644 --- a/morfeusz/encoding/utf8/unchecked.h +++ b/morfeusz/charset/utf8/unchecked.h diff --git a/morfeusz/test_morph.cpp b/morfeusz/test_morph.cpp index 7f66f39..1de8633 100644 --- a/morfeusz/test_morph.cpp +++ b/morfeusz/test_morph.cpp @@ -18,7 +18,7 @@ using namespace std; void doTest( - const FSA<vector<EncodedInterpretation>>& fsa, + const FSA<vector<InterpsGroup>>& fsa, const Tagset& tagset, // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, const char* fname) { @@ -32,14 +32,15 @@ void doTest( string lemma = splitVector[1]; string tag = splitVector[2]; string name = splitVector[3]; - vector<EncodedInterpretation> value2; + vector<InterpsGroup> value2; fsa.tryToRecognize(orth.c_str(), value2); DEBUG("recognized "+to_string(value2.size())); // vector<TaggedInterpretation> parsedValues; bool found = false; - for (EncodedInterpretation encodedInterp: value2) { + for (InterpsGroup gi: value2) + for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) { // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); - MorphInterpretation interp(0, 0, orth, encodedInterp, tagset); +// (0, 0, orth, encodedInterp, tagset); // parsedValues.push_back(parsedValue); // debug(orth, parsedValue); if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { @@ -62,10 +63,7 @@ int main(int argc, char** argv) { validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); const unsigned char* fsaData = readFile(argv[1]); MorphDeserializer deserializer; - DEBUG("will read FSA"); - FSA<vector<EncodedInterpretation>>* fsa = FSA<vector<EncodedInterpretation>>::getFSA(fsaData, deserializer); - DEBUG("DONE read FSA"); - DEBUG("will read tagset"); + FSA<vector<InterpsGroup>>* fsa = FSA<vector<InterpsGroup>>::getFSA(fsaData, deserializer); Tagset tagset(fsaData); // TaggedInterpretationsDecoder interpsDecoder(tagset); DEBUG("DONE read tagset"); diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index f4e3032..5db4a12 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -8,11 +8,13 @@ <in>test_speed.cpp</in> </df> <df root="morfeusz" name="1"> - <df name="encoding"> + <df name="charset"> <in>CharsetConverter.cpp</in> - <in>CharsetConverter.hpp</in> + <in>charset_utils.hpp</in> </df> + <in>InterpsGroup.hpp</in> <in>Morfeusz.cpp</in> + <in>Morfeusz_impl.hpp</in> <in>MorphDeserializer.cpp</in> <in>MorphInterpretation.cpp</in> <in>Tagset.cpp</in> @@ -51,11 +53,19 @@ <executablePath>build/fsa/test_dict</executablePath> </makeTool> </makefileType> - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> + <folder path="1"> <ccTool> <incDir> <pElem>fsa</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </folder> + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> </incDir> </ccTool> </item> @@ -80,86 +90,45 @@ </incDir> </ccTool> </item> + <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0"> + </item> <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> + <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0"> + </item> <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - <pElem>/usr/include/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/ext</pElem> - <pElem>/usr/include/c++/4.8</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/debug</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> - <pElem>/usr/include/c++/4.8/backward</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - <pElem>/usr/include/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/ext</pElem> - <pElem>/usr/include/c++/4.8</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/debug</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> - <pElem>fsa</pElem> - <pElem>/usr/include/c++/4.8/backward</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/encoding/CharsetConverter.cpp" + <item path="morfeusz/charset/CharsetConverter.cpp" ex="false" tool="1" - flavor2="0"> + flavor2="8"> + <ccTool> + </ccTool> </item> - <item path="morfeusz/encoding/CharsetConverter.hpp" - ex="false" - tool="3" - flavor2="0"> + <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0"> </item> <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - </incDir> </ccTool> </item> <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> </conf>