From a9d3e65c15f2e43bc637fbda8342a6242dc1174f Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Sun, 17 Nov 2013 21:57:52 +0000 Subject: [PATCH] - refaktoryzacja, odkomentowanie na-razie-niedziałających kawałków kodu --- fsabuilder/fsa/common.py | 3 ++- fsabuilder/fsa/convertinput.py | 395 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- fsabuilder/fsa/encode.py | 5 +++++ fsabuilder/fsa/serializer.py | 1 - morfeusz/CMakeLists.txt | 2 +- morfeusz/EncodedInterpretation.hpp | 3 +++ morfeusz/InterpsGroup.hpp | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/Morfeusz.cpp | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------- morfeusz/Morfeusz.hpp | 69 +++++++++++++++++++++++++++++++++++++++++++++++---------------------- morfeusz/Morfeusz_impl.hpp | 42 ++++++++++++++++++++++++++++++++++++++++++ morfeusz/MorphDeserializer.cpp | 50 ++++++++++++++++++++++++++++++++++++++++++++++++-- morfeusz/MorphDeserializer.hpp | 18 +++++++++++++++--- morfeusz/MorphInterpretation.hpp | 4 ++-- morfeusz/charset/CharsetConverter.cpp | 16 ++++++++++++++++ morfeusz/charset/CharsetConverter.hpp | 47 +++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/charset/charset_utils.hpp | 14 ++++++++++++++ morfeusz/charset/utf8.h | 34 ++++++++++++++++++++++++++++++++++ morfeusz/charset/utf8/checked.h | 327 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/charset/utf8/core.h | 330 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/charset/utf8/unchecked.h | 228 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/encoding/CharsetConverter.cpp | 16 ---------------- morfeusz/encoding/CharsetConverter.hpp | 47 ----------------------------------------------- morfeusz/encoding/utf8.h | 34 ---------------------------------- morfeusz/encoding/utf8/checked.h | 327 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- morfeusz/encoding/utf8/core.h | 330 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ morfeusz/encoding/utf8/unchecked.h | 228 ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ morfeusz/test_morph.cpp | 14 ++++++-------- nbproject/configurations.xml | 81 +++++++++++++++++++++++++-------------------------------------------------------- 28 files changed, 1694 insertions(+), 1092 deletions(-) create mode 100644 morfeusz/InterpsGroup.hpp create mode 100644 morfeusz/Morfeusz_impl.hpp create mode 100644 morfeusz/charset/CharsetConverter.cpp create mode 100644 morfeusz/charset/CharsetConverter.hpp create mode 100644 morfeusz/charset/charset_utils.hpp create mode 100644 morfeusz/charset/utf8.h create mode 100644 morfeusz/charset/utf8/checked.h create mode 100644 morfeusz/charset/utf8/core.h create mode 100644 morfeusz/charset/utf8/unchecked.h delete mode 100644 morfeusz/encoding/CharsetConverter.cpp delete mode 100644 morfeusz/encoding/CharsetConverter.hpp delete mode 100644 morfeusz/encoding/utf8.h delete mode 100644 morfeusz/encoding/utf8/checked.h delete mode 100644 morfeusz/encoding/utf8/core.h delete mode 100644 morfeusz/encoding/utf8/unchecked.h diff --git a/fsabuilder/fsa/common.py b/fsabuilder/fsa/common.py index 50fe997..6f83cfc 100644 --- a/fsabuilder/fsa/common.py +++ b/fsabuilder/fsa/common.py @@ -14,7 +14,7 @@ class Lemma(object): class Interpretation(object): - def __init__(self, orth, base, tagnum, namenum, encoder): + def __init__(self, orth, base, tagnum, namenum, typenum, encoder): assert type(orth) == unicode assert type(base) == unicode root = u'' @@ -29,6 +29,7 @@ class Interpretation(object): suffixToAdd=encoder.encodeWord(base[len(root):], lowercase=False)) self.tagnum = tagnum self.namenum = namenum + self.typenum = typenum def getSortKey(self): return (self.lemma.cutLength, tuple(self.lemma.suffixToAdd), self.tagnum, self.namenum) diff --git a/fsabuilder/fsa/convertinput.py b/fsabuilder/fsa/convertinput.py index 9597d2b..1b0a01d 100644 --- a/fsabuilder/fsa/convertinput.py +++ b/fsabuilder/fsa/convertinput.py @@ -6,6 +6,398 @@ Created on Oct 23, 2013 import logging from common import Interpretation +tag2typenum = { + 'aglt:sg:pri:imperf:nwok': 12, + 'aglt:sg:pri:imperf:wok': 12, + 'aglt:sg:sec:imperf:nwok': 12, + 'aglt:sg:sec:imperf:wok': 12, + 'aglt:pl:pri:imperf:nwok': 13, + 'aglt:pl:pri:imperf:wok': 13, + 'aglt:pl:sec:imperf:nwok': 13, + 'aglt:pl:sec:imperf:wok': 13, + 'praet:sg:m1.m2.m3:imperf:agl': 7, + 'praet:sg:m1.m2.m3:imperf.perf:agl': 7, + 'praet:sg:m1.m2.m3:perf:agl': 7, + 'praet:sg:m1.m2.m3:imperf:nagl': 16, + 'praet:sg:m1.m2.m3:imperf.perf:nagl': 16, + 'praet:sg:m1.m2.m3:perf:nagl': 16, + 'praet:sg:f:imperf': 20, + 'praet:sg:f:imperf.perf': 20, + 'praet:sg:f:perf': 20, + 'praet:sg:m1.m2.m3:imperf': 20, + 'praet:sg:m1.m2.m3:imperf.perf': 20, + 'praet:sg:m1.m2.m3:perf': 20, + 'praet:sg:n1.n2:imperf': 20, + 'praet:sg:n1.n2:imperf.perf': 20, + 'praet:sg:n1.n2:perf': 20, + 'praet:pl:m1.p1:imperf': 21, + 'praet:pl:m1.p1:imperf.perf': 21, + 'praet:pl:m1.p1:perf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf': 21, + 'praet:pl:m2.m3.f.n1.n2.p2.p3:perf': 21, + 'naj': 10, + 'nie': 5, + 'adj:pl:acc:m1.p1:pos': 1, + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos': 1, + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:pl:nom.voc:m1.p1:pos': 1, + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos': 1, + 'adj:sg:acc:m1.m2:pos': 1, + 'adj:sg:acc:n1.n2:pos': 1, + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:inst:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:loc:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos|adj:sg:acc:m3:pos': 1, + 'adj:sg:nom.voc:n1.n2:pos': 1, + 'adj:sg:acc:f:pos': 1, + 'adj:sg:acc.inst:f:pos': 1, + 'adj:sg:acc:m1.m2:pos': 1, + 'adj:sg:acc:m3:pos': 1, + 'adj:sg:dat:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:gen.dat.loc:f:pos': 1, + 'adj:sg:gen:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:pos': 1, + 'adj:sg:nom.voc.acc:n1.n2:pos': 1, + 'adj:sg:nom.voc:f:pos': 1, + 'adj:sg:nom.voc:m1.m2.m3:pos': 1, + 'adj:pl:acc:f:pos': 1, + 'adj:pl:acc:m1:pos': 1, + 'adj:pl:acc:m2:pos': 1, + 'adj:pl:acc:m3:pos': 1, + 'adj:pl:acc:n1:pos': 1, + 'adj:pl:acc:n2:pos': 1, + 'adj:pl:acc:p1:pos': 1, + 'adj:pl:acc:p2:pos': 1, + 'adj:pl:acc:p3:pos': 1, + 'adj:pl:dat:f:pos': 1, + 'adj:pl:dat:m1:pos': 1, + 'adj:pl:dat:m2:pos': 1, + 'adj:pl:dat:m3:pos': 1, + 'adj:pl:dat:n1:pos': 1, + 'adj:pl:dat:n2:pos': 1, + 'adj:pl:dat:p1:pos': 1, + 'adj:pl:dat:p2:pos': 1, + 'adj:pl:dat:p3:pos': 1, + 'adj:pl:gen:f:pos': 1, + 'adj:pl:gen:m1:pos': 1, + 'adj:pl:gen:m2:pos': 1, + 'adj:pl:gen:m3:pos': 1, + 'adj:pl:gen:n1:pos': 1, + 'adj:pl:gen:n2:pos': 1, + 'adj:pl:gen:p1:pos': 1, + 'adj:pl:gen:p2:pos': 1, + 'adj:pl:gen:p3:pos': 1, + 'adj:pl:inst:f:pos': 1, + 'adj:pl:inst:m1:pos': 1, + 'adj:pl:inst:m2:pos': 1, + 'adj:pl:inst:m3:pos': 1, + 'adj:pl:inst:n1:pos': 1, + 'adj:pl:inst:n2:pos': 1, + 'adj:pl:inst:p1:pos': 1, + 'adj:pl:inst:p2:pos': 1, + 'adj:pl:inst:p3:pos': 1, + 'adj:pl:loc:f:pos': 1, + 'adj:pl:loc:m1:pos': 1, + 'adj:pl:loc:m2:pos': 1, + 'adj:pl:loc:m3:pos': 1, + 'adj:pl:loc:n1:pos': 1, + 'adj:pl:loc:n2:pos': 1, + 'adj:pl:loc:p1:pos': 1, + 'adj:pl:loc:p2:pos': 1, + 'adj:pl:loc:p3:pos': 1, + 'adj:pl:nom:f:pos': 1, + 'adj:pl:nom:m1:pos': 1, + 'adj:pl:nom:m2:pos': 1, + 'adj:pl:nom:m3:pos': 1, + 'adj:pl:nom:n1:pos': 1, + 'adj:pl:nom:n2:pos': 1, + 'adj:pl:nom:p1:pos': 1, + 'adj:pl:nom:p2:pos': 1, + 'adj:pl:nom:p3:pos': 1, + 'adj:sg:acc:f:pos': 1, + 'adj:sg:acc:m1:pos': 1, + 'adj:sg:acc:m2:pos': 1, + 'adj:sg:acc:m3:pos': 1, + 'adj:sg:acc:n1:pos': 1, + 'adj:sg:acc:n2:pos': 1, + 'adj:sg:dat:f:pos': 1, + 'adj:sg:dat:m1:pos': 1, + 'adj:sg:dat:m2:pos': 1, + 'adj:sg:dat:m3:pos': 1, + 'adj:sg:dat:n1:pos': 1, + 'adj:sg:dat:n2:pos': 1, + 'adj:sg:gen:f:pos': 1, + 'adj:sg:gen:m1:pos': 1, + 'adj:sg:gen:m2:pos': 1, + 'adj:sg:gen:m3:pos': 1, + 'adj:sg:gen:n1:pos': 1, + 'adj:sg:gen:n2:pos': 1, + 'adj:sg:inst:f:pos': 1, + 'adj:sg:inst:m1:pos': 1, + 'adj:sg:inst:m2:pos': 1, + 'adj:sg:inst:m3:pos': 1, + 'adj:sg:inst:n1:pos': 1, + 'adj:sg:inst:n2:pos': 1, + 'adj:sg:loc:f:pos': 1, + 'adj:sg:loc:m1:pos': 1, + 'adj:sg:loc:m2:pos': 1, + 'adj:sg:loc:m3:pos': 1, + 'adj:sg:loc:n1:pos': 1, + 'adj:sg:loc:n2:pos': 1, + 'adj:sg:nom:f:pos': 1, + 'adj:sg:nom:m1:pos': 1, + 'adj:sg:nom:m2:pos': 1, + 'adj:sg:nom:m3:pos': 1, + 'adj:sg:nom:n1:pos': 1, + 'adj:sg:nom:n2:pos': 1, + 'adj:pl:acc:m1.p1:sup': 19, + 'adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup': 19, + 'adj:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:pl:nom.voc:m1.p1:sup': 19, + 'adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup': 19, + 'adj:sg:acc:f:sup': 19, + 'adj:sg:acc.inst:f:sup': 19, + 'adj:sg:acc:m1.m2:sup': 19, + 'adj:sg:acc:m3:sup': 19, + 'adj:sg:acc:n1.n2:sup': 19, + 'adj:sg:dat:f:sup': 19, + 'adj:sg:dat:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:gen:f:sup': 19, + 'adj:sg:gen.dat.loc:f:sup': 19, + 'adj:sg:gen:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:inst:f:sup': 19, + 'adj:sg:inst:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:inst.loc:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:loc:f:sup': 19, + 'adj:sg:loc:m1.m2.m3.n1.n2:sup': 19, + 'adj:sg:nom.acc:n1.n2:sup': 19, + 'adj:sg:nom.voc:f:sup': 19, + 'adj:sg:nom.voc:m1.m2.m3:sup': 19, + 'adj:sg:nom.voc:n1.n2:sup': 19, + 'adj:pl:acc:f:sup': 19, + 'adj:pl:acc:m1:sup': 19, + 'adj:pl:acc:m2:sup': 19, + 'adj:pl:acc:m3:sup': 19, + 'adj:pl:acc:n1:sup': 19, + 'adj:pl:acc:n2:sup': 19, + 'adj:pl:acc:p1:sup': 19, + 'adj:pl:acc:p2:sup': 19, + 'adj:pl:acc:p3:sup': 19, + 'adj:pl:dat:f:sup': 19, + 'adj:pl:dat:m1:sup': 19, + 'adj:pl:dat:m2:sup': 19, + 'adj:pl:dat:m3:sup': 19, + 'adj:pl:dat:n1:sup': 19, + 'adj:pl:dat:n2:sup': 19, + 'adj:pl:dat:p1:sup': 19, + 'adj:pl:dat:p2:sup': 19, + 'adj:pl:dat:p3:sup': 19, + 'adj:pl:gen:f:sup': 19, + 'adj:pl:gen:m1:sup': 19, + 'adj:pl:gen:m2:sup': 19, + 'adj:pl:gen:m3:sup': 19, + 'adj:pl:gen:n1:sup': 19, + 'adj:pl:gen:n2:sup': 19, + 'adj:pl:gen:p1:sup': 19, + 'adj:pl:gen:p2:sup': 19, + 'adj:pl:gen:p3:sup': 19, + 'adj:pl:inst:f:sup': 19, + 'adj:pl:inst:m1:sup': 19, + 'adj:pl:inst:m2:sup': 19, + 'adj:pl:inst:m3:sup': 19, + 'adj:pl:inst:n1:sup': 19, + 'adj:pl:inst:n2:sup': 19, + 'adj:pl:inst:p1:sup': 19, + 'adj:pl:inst:p2:sup': 19, + 'adj:pl:inst:p3:sup': 19, + 'adj:pl:loc:f:sup': 19, + 'adj:pl:loc:m1:sup': 19, + 'adj:pl:loc:m2:sup': 19, + 'adj:pl:loc:m3:sup': 19, + 'adj:pl:loc:n1:sup': 19, + 'adj:pl:loc:n2:sup': 19, + 'adj:pl:loc:p1:sup': 19, + 'adj:pl:loc:p2:sup': 19, + 'adj:pl:loc:p3:sup': 19, + 'adj:pl:nom:f:sup': 19, + 'adj:pl:nom:m1:sup': 19, + 'adj:pl:nom:m2:sup': 19, + 'adj:pl:nom:m3:sup': 19, + 'adj:pl:nom:n1:sup': 19, + 'adj:pl:nom:n2:sup': 19, + 'adj:pl:nom:p1:sup': 19, + 'adj:pl:nom:p2:sup': 19, + 'adj:pl:nom:p3:sup': 19, + 'adj:sg:acc:f:sup': 19, + 'adj:sg:acc:m1:sup': 19, + 'adj:sg:acc:m2:sup': 19, + 'adj:sg:acc:m3:sup': 19, + 'adj:sg:acc:n1:sup': 19, + 'adj:sg:acc:n2:sup': 19, + 'adj:sg:dat:f:sup': 19, + 'adj:sg:dat:m1:sup': 19, + 'adj:sg:dat:m2:sup': 19, + 'adj:sg:dat:m3:sup': 19, + 'adj:sg:dat:n1:sup': 19, + 'adj:sg:dat:n2:sup': 19, + 'adj:sg:gen:f:sup': 19, + 'adj:sg:gen:m1:sup': 19, + 'adj:sg:gen:m2:sup': 19, + 'adj:sg:gen:m3:sup': 19, + 'adj:sg:gen:n1:sup': 19, + 'adj:sg:gen:n2:sup': 19, + 'adj:sg:inst:f:sup': 19, + 'adj:sg:inst:m1:sup': 19, + 'adj:sg:inst:m2:sup': 19, + 'adj:sg:inst:m3:sup': 19, + 'adj:sg:inst:n1:sup': 19, + 'adj:sg:inst:n2:sup': 19, + 'adj:sg:loc:f:sup': 19, + 'adj:sg:loc:m1:sup': 19, + 'adj:sg:loc:m2:sup': 19, + 'adj:sg:loc:m3:sup': 19, + 'adj:sg:loc:n1:sup': 19, + 'adj:sg:loc:n2:sup': 19, + 'adj:sg:nom:f:sup': 19, + 'adj:sg:nom:m1:sup': 19, + 'adj:sg:nom:m2:sup': 19, + 'adj:sg:nom:m3:sup': 19, + 'adj:sg:nom:n1:sup': 19, + 'adj:sg:nom:n2:sup': 19, + 'adv:sup': 19, + 'winien:sg:m1.m2.m3:imperf': 3, + 'winien:sg:f:imperf': 3, + 'winien:sg:n1.n2:imperf': 3, + 'winien:pl:m1.p1:imperf': 3, + 'winien:pl:m2.m3.f.n1.n2.p2.p3:imperf': 3, + 'adja': 15, + 'ger:sg:dat.loc:n2:imperf:neg': 18, + 'ger:sg:dat.loc:n2:imperf.perf:neg': 18, + 'ger:sg:dat.loc:n2:perf:neg': 18, + 'ger:sg:gen:n2:imperf:neg': 18, + 'ger:sg:gen:n2:imperf.perf:neg': 18, + 'ger:sg:gen:n2:perf:neg': 18, + 'ger:sg:inst:n2:imperf:neg': 18, + 'ger:sg:inst:n2:imperf.perf:neg': 18, + 'ger:sg:inst:n2:perf:neg': 18, + 'ger:sg:nom.acc:n2:imperf:neg': 18, + 'ger:sg:nom.acc:n2:imperf.perf:neg': 18, + 'ger:sg:nom.acc:n2:perf:neg': 18, + 'pact:pl:acc:m1.p1:imperf:neg': 18, + 'pact:pl:acc:m1.p1:imperf.perf:neg': 18, + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, + 'pact:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, + 'pact:pl:nom:m1.p1:imperf:neg': 18, + 'pact:pl:nom:m1.p1:imperf.perf:neg': 18, + 'pact:sg:acc.inst:f:imperf:neg': 18, + 'pact:sg:acc.inst:f:imperf.perf:neg': 18, + 'pact:sg:acc:m1.m2:imperf:neg': 18, + 'pact:sg:acc:m1.m2:imperf.perf:neg': 18, + 'pact:sg:acc:m3:imperf:neg': 18, + 'pact:sg:acc:m3:imperf.perf:neg': 18, + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:gen.dat.loc:f:imperf:neg': 18, + 'pact:sg:gen.dat.loc:f:imperf.perf:neg': 18, + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, + 'pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'pact:sg:nom.acc:n1.n2:imperf:neg': 18, + 'pact:sg:nom.acc:n1.n2:imperf.perf:neg': 18, + 'pact:sg:nom:f:imperf:neg': 18, + 'pact:sg:nom:f:imperf.perf:neg': 18, + 'pact:sg:nom:m1.m2.m3:imperf:neg': 18, + 'pact:sg:nom:m1.m2.m3:imperf.perf:neg': 18, + 'ppas:pl:acc:m1.p1:imperf:neg': 18, + 'ppas:pl:acc:m1.p1:imperf.perf:neg': 18, + 'ppas:pl:acc:m1.p1:perf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg': 18, + 'ppas:pl:nom.acc:m2.m3.f.n1.n2.p2.p3:perf:neg': 18, + 'ppas:pl:nom:m1.p1:imperf:neg': 18, + 'ppas:pl:nom:m1.p1:imperf.perf:neg': 18, + 'ppas:pl:nom:m1.p1:perf:neg': 18, + 'ppas:sg:acc.inst:f:imperf:neg': 18, + 'ppas:sg:acc.inst:f:imperf.perf:neg': 18, + 'ppas:sg:acc.inst:f:perf:neg': 18, + 'ppas:sg:acc:m1.m2:imperf:neg': 18, + 'ppas:sg:acc:m1.m2:imperf.perf:neg': 18, + 'ppas:sg:acc:m1.m2:perf:neg': 18, + 'ppas:sg:acc:m3:imperf:neg': 18, + 'ppas:sg:acc:m3:imperf.perf:neg': 18, + 'ppas:sg:acc:m3:perf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:gen.dat.loc:f:imperf:neg': 18, + 'ppas:sg:gen.dat.loc:f:imperf.perf:neg': 18, + 'ppas:sg:gen.dat.loc:f:perf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg': 18, + 'ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:imperf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:imperf.perf:neg': 18, + 'ppas:sg:nom.acc:n1.n2:perf:neg': 18, + 'ppas:sg:nom:f:imperf:neg': 18, + 'ppas:sg:nom:f:imperf.perf:neg': 18, + 'ppas:sg:nom:f:perf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:imperf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:imperf.perf:neg': 18, + 'ppas:sg:nom:m1.m2.m3:perf:neg': 18, + 'ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep': 8, + 'prep:acc': 6, + 'prep:acc:wok': 6, + 'prep:acc.inst': 6, + 'prep:acc.inst:wok': 6, + 'prep:inst.acc': 6, + 'prep:inst.acc:wok': 6, + 'prep:inst.gen.acc:wok': 6, + 'prep:acc.loc': 6, + 'prep:acc.loc:wok': 6, + 'prep:loc.acc': 6, + 'prep:loc.acc:wok': 6, + 'prep:gen': 6, + 'prep:gen.dat': 6, + 'prep:gen:wok': 6, + 'prep:gen.inst:wok': 6, + 'brev:pun': 9, + 'brev:npun': 9, + 'intrj': 9, + 'burk': 9, +} + def _sortLines(inputLines, encoder): logging.info('sorting input...') lines = list(inputLines) @@ -22,7 +414,8 @@ def _parseLines(inputLines, tagset, encoder): orth, base, tag, name = line.split(u'\t') tagnum = tagset.tag2tagnum[tag] namenum = tagset.name2namenum[name] - yield (orth, Interpretation(orth, base, tagnum, namenum, encoder)) + typenum = tag2typenum.get(tag, 0) + yield (orth, Interpretation(orth, base, tagnum, namenum, typenum, encoder)) def _mergeEntries(inputLines): prevOrth = None diff --git a/fsabuilder/fsa/encode.py b/fsabuilder/fsa/encode.py index 059ee4d..26c2a2e 100644 --- a/fsabuilder/fsa/encode.py +++ b/fsabuilder/fsa/encode.py @@ -61,11 +61,16 @@ class MorphEncoder(Encoder): res.append(firstByte) assert type(interpsList) == frozenset for interp in sorted(interpsList, key=lambda i: i.getSortKey()): + res.extend(self._encodeTypeNum(interp.typenum)) res.extend(self._encodeLemma(interp.lemma)) res.extend(self._encodeTagNum(interp.tagnum)) res.extend(self._encodeNameNum(interp.namenum)) return res + def _encodeTypeNum(self, typenum): + assert typenum >= 0 and typenum < 256 + return bytearray([typenum]) + def _encodeLemma(self, lemma): res = bytearray() assert lemma.cutLength < 256 and lemma.cutLength >= 0 diff --git a/fsabuilder/fsa/serializer.py b/fsabuilder/fsa/serializer.py index 0cf0b3d..1a285a0 100644 --- a/fsabuilder/fsa/serializer.py +++ b/fsabuilder/fsa/serializer.py @@ -40,7 +40,6 @@ class Serializer(object): raise NotImplementedError('Not implemented') def fsa2bytearray(self): - res = bytearray() res.extend(self.serializePrologue(self.serializeTagset(self.fsa.tagset))) self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 60f6ac9..b8dc4f7 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) add_executable (morfeusz2_analyze main.cpp) add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp) # Link the executable to the Hello library. target_link_libraries (morfeusz2_analyze morfeusz2) diff --git a/morfeusz/EncodedInterpretation.hpp b/morfeusz/EncodedInterpretation.hpp index 4f2d18f..f969cf6 100644 --- a/morfeusz/EncodedInterpretation.hpp +++ b/morfeusz/EncodedInterpretation.hpp @@ -28,8 +28,11 @@ struct EncodedLemma { */ struct EncodedInterpretation { EncodedLemma lemma; + int type; int tag; int nameClassifier; + int startNode; + int endNode; }; #endif /* INTERPRETATION_HPP */ diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp new file mode 100644 index 0000000..c355f2d --- /dev/null +++ b/morfeusz/InterpsGroup.hpp @@ -0,0 +1,49 @@ +/* + * File: GroupedInterpretations.hpp + * Author: lennyn + * + * Created on November 16, 2013, 7:58 PM + */ + +#ifndef GROUPEDINTERPRETATIONS_HPP +#define GROUPEDINTERPRETATIONS_HPP + +#include <vector> +#include <string> +#include "EncodedInterpretation.hpp" +#include "MorphInterpretation.hpp" +#include "Tagset.hpp" + +class InterpsGroup { +public: + + InterpsGroup() { + + } + + explicit InterpsGroup(const int type) + : type(type) { + + } + + std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) { + std::vector<MorphInterpretation> res; + for (EncodedInterpretation& ei: interps) { + res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset)); + } + return res; + } + + void addInterpretation(const EncodedInterpretation& interp) { + interps.push_back(interp); + } + + int type; + int startNode; + int endNode; +private: + std::vector<EncodedInterpretation> interps; +}; + +#endif /* GROUPEDINTERPRETATIONS_HPP */ + diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index bedb63c..1439f44 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -6,17 +6,18 @@ */ #include <string> +#include "fsa.hpp" #include "utils.hpp" #include "Morfeusz.hpp" #include "MorphDeserializer.hpp" -#include "encoding/CharsetConverter.hpp" +#include "charset/CharsetConverter.hpp" using namespace std; -static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) { - static Deserializer<vector<EncodedInterpretation>>* deserializer - = new MorphDeserializer(); - return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); +static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { + static Deserializer < vector < InterpsGroup >> *deserializer + = new MorphDeserializer(); + return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); } static CharsetConverter* initializeCharsetConverter() { @@ -26,7 +27,7 @@ static CharsetConverter* initializeCharsetConverter() { Morfeusz::Morfeusz(const string& filename) : fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { - + } //Morfeusz::Morfeusz(const Morfeusz& orig) { @@ -36,12 +37,57 @@ Morfeusz::~Morfeusz() { delete &this->fsa; } -AnalyzeResult Morfeusz::analyze(const std::string& text) { - const char* textStart = text.c_str(); - const char* textEnd = text.c_str() + text.length(); - AnalyzeResult res = { - ResultsIterator(textStart, textEnd, *this), - ResultsIterator(textEnd, textEnd, *this)}; - return res; +ResultsIterator Morfeusz::analyze(const std::string& text) { +// const char* textStart = text.c_str(); +// const char* textEnd = text.c_str() + text.length(); + return ResultsIterator(text, *this); +} + +ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) +: rawInput(text.c_str()), +morfeusz(morfeusz) { +} + +MorphInterpretation ResultsIterator::getNext() { +// if (resultsBuffer.empty()) { +// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer)); +// } +// startNode = resultsBuffer.back().getEndNode(); +// MorphInterpretation res = resultsBuffer.front(); +// resultsBuffer.pop_front(); +// return res; +} + +bool ResultsIterator::hasNext() { + return rawInput[0] != '\0' && resultsBuffer.empty(); } +//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const { +// assert(inputPtr[0] != '\0'); +// const char* start = inputPtr; +// StateType state = fsa->getInitialState(); +// int currNodeNum = startNodeNum; +// int codepoint = this->charsetConverter->next(inputPtr, inputEnd); +// assert(!isEndOfWord(codepoint)); +// while(!isEndOfWord(codepoint)) { +// feedState(state, codepoint); +// if (state.isAccepting()) { +// const char* currInputPtr = inputPtr; +// vector<EncodedInterpretation> startInterps = state.getValue(); +// filterOutNonGluableInterps(startInterps); +// if (!startInterps.empty()) { +// +// } +// vector<EncodedInterpretation> additionalInterps; +// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps); +// if (!additionalInterps.empty()) { +// for (EncodedInterpretation& interp: state.getValue()) { +// interp.startNode = currNodeNum; +// interp.endNode = currNodeNum + 1; +// interps.push_back(interp); +// } +// +// } +// } +// } +//} diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 9b5768a..e4671cb 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -9,53 +9,78 @@ #define MORFEUSZ_HPP #include <string> +#include <list> #include <vector> #include "EncodedInterpretation.hpp" #include "fsa.hpp" #include "MorphInterpretation.hpp" -#include "encoding/CharsetConverter.hpp" +#include "InterpsGroup.hpp" +#include "charset/CharsetConverter.hpp" class Morfeusz; -class AnalyzeResult; +//class AnalyzeResult; class ResultsIterator; +typedef FSA<std::vector<InterpsGroup>> FSAType; +typedef State<std::vector<InterpsGroup>> StateType; + class Morfeusz { public: explicit Morfeusz(const std::string& filename); virtual ~Morfeusz(); // Morfeusz(const Morfeusz& orig); - AnalyzeResult analyze(const std::string& text); + ResultsIterator analyze(const std::string& text); // Morfeusz(); + friend class ResultsIterator; private: - void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps); - const FSA<std::vector<EncodedInterpretation>>* fsa; + template <class OutputIterator> +// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const; + + int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const; + + const FSAType* fsa; CharsetConverter* charsetConverter; }; +#include "Morfeusz_impl.hpp" + class ResultsIterator { public: - ResultsIterator( - const char* startOfInput, - const char* endOfInput, - const Morfeusz& morfeusz); - virtual ~ResultsIterator(); -// ResultsIterator(int* x); - ResultsIterator(const ResultsIterator& mit); - ResultsIterator& operator++(); - ResultsIterator operator++(int); - bool operator==(const ResultsIterator& rhs); - bool operator!=(const ResultsIterator& rhs); - MorphInterpretation& operator*(); + ResultsIterator(const std::string& text, const Morfeusz& morfeusz); + MorphInterpretation getNext(); + bool hasNext(); private: const char* rawInput; - const char* endOfInput; + const Morfeusz& morfeusz; + std::list<MorphInterpretation> resultsBuffer; + int startNode; }; -struct AnalyzeResult { - ResultsIterator iterator; - const ResultsIterator end; -}; +//class ResultsIterator { +//public: +// ResultsIterator( +// const char* startOfInput, +// const char* endOfInput, +// const Morfeusz& morfeusz); +// virtual ~ResultsIterator(); +// ResultsIterator(const ResultsIterator& mit); +// ResultsIterator& operator++(); +// ResultsIterator operator++(int); +// bool operator==(const ResultsIterator& rhs); +// bool operator!=(const ResultsIterator& rhs); +// MorphInterpretation& operator*(); +//private: +// const char* rawInput; +// const char* endOfInput; +// const Morfeusz& morfeusz; +// vector<MorphInterpretation> resultsBuffer; +//}; + +//struct AnalyzeResult { +// ResultsIterator iterator; +// const ResultsIterator end; +//}; #endif /* MORFEUSZ_HPP */ diff --git a/morfeusz/Morfeusz_impl.hpp b/morfeusz/Morfeusz_impl.hpp new file mode 100644 index 0000000..aebf57f --- /dev/null +++ b/morfeusz/Morfeusz_impl.hpp @@ -0,0 +1,42 @@ +/* + * File: Morfeusz_impl.hpp + * Author: lennyn + * + * Created on November 15, 2013, 1:43 PM + */ + +#ifndef MORFEUSZ_IMPL_HPP +#define MORFEUSZ_IMPL_HPP + +#include <cassert> +#include "Morfeusz.hpp" + +//template <class OutputIterator> +//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const { +// if (inputData == inputEnd) { +// return; +// } +// const char* start = inputData; +// StateType state = fsa->getInitialState(); +// int currNodeNum = startNodeNum; +// do { +// int codepoint = this->charsetConverter->next(inputData, inputEnd); +// if (!isSpace(codepoint) && codepoint != 0) { +// feedAutomaton(state, codepoint); +// if (state.isAccepting()) { +// int currInput = inputData; +// vector<MorphInterpretation> additionalInterps; +// processOneWord( +// currInput, inputEnd, +// currNodeNum + 1, +// back_inserter(additionalInterps), false); +// if (!additionalInterps.empty()) { +// currNodeNum = additionalInterps.back().getEndNode(); +// } +// } +// } +// } +//} + +#endif /* MORFEUSZ_IMPL_HPP */ + diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp index 8660443..0c9a601 100644 --- a/morfeusz/MorphDeserializer.cpp +++ b/morfeusz/MorphDeserializer.cpp @@ -5,7 +5,10 @@ * Created on 12 listopad 2013, 15:31 */ +#include <map> #include "MorphDeserializer.hpp" +#include "EncodedInterpretation.hpp" +#include "InterpsGroup.hpp" MorphDeserializer::MorphDeserializer() { } @@ -25,6 +28,8 @@ static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { } static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { + interp.type = *ptr; + ptr++; deserializeLemma(ptr, interp.lemma); interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); ptr += 2; @@ -32,17 +37,58 @@ static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& ptr++; } -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { const unsigned char* currPtr = ptr; uint8_t interpsNum = *ptr; interps.clear(); interps.reserve(interpsNum); currPtr++; + // FIXME - to jest do poprawy + map<int, InterpsGroup> results; for (unsigned int i = 0; i < interpsNum; ++i) { EncodedInterpretation interp; deserializeInterp(currPtr, interp); - interps.push_back(interp); + if (results.count(interp.type) == 0) { + results[interp.type] = InterpsGroup(interp.type); + } + results[interp.type].addInterpretation(interp); +// interps.push_back(interp); + } + for (auto& kv: results) { + interps.push_back(kv.second); } return currPtr - ptr; } +//static void deserializeLemma(const unsigned char*& ptr, EncodedLemma& lemma) { +// // XXX uważać na poprawność danych +// lemma.suffixToCut = *ptr; +// ptr++; +// lemma.suffixToAdd = (const char*) ptr; +// ptr += strlen((const char*) ptr) + 1; +//} +// +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { +// interp.type = *ptr; +// ptr++; +// deserializeLemma(ptr, interp.lemma); +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); +// ptr += 2; +// interp.nameClassifier = *ptr; +// ptr++; +//} +// +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<EncodedInterpretation>& interps) const { +// const unsigned char* currPtr = ptr; +// uint8_t interpsNum = *ptr; +// interps.clear(); +// interps.reserve(interpsNum); +// currPtr++; +// for (unsigned int i = 0; i < interpsNum; ++i) { +// EncodedInterpretation interp; +// deserializeInterp(currPtr, interp); +// interps.push_back(interp); +// } +// return currPtr - ptr; +//} + diff --git a/morfeusz/MorphDeserializer.hpp b/morfeusz/MorphDeserializer.hpp index 5572193..cc1c646 100644 --- a/morfeusz/MorphDeserializer.hpp +++ b/morfeusz/MorphDeserializer.hpp @@ -10,19 +10,31 @@ #include <vector> #include "fsa.hpp" -#include "EncodedInterpretation.hpp" +#include "InterpsGroup.hpp" -class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { +class MorphDeserializer: public Deserializer<std::vector<InterpsGroup>> { public: MorphDeserializer(); MorphDeserializer(const MorphDeserializer& orig); virtual ~MorphDeserializer(); long deserialize( const unsigned char* ptr, - std::vector<EncodedInterpretation>& interps) const; + std::vector<InterpsGroup>& interps) const; private: }; +//class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { +//public: +// MorphDeserializer(); +// MorphDeserializer(const MorphDeserializer& orig); +// virtual ~MorphDeserializer(); +// long deserialize( +// const unsigned char* ptr, +// std::vector<EncodedInterpretation>& interps) const; +//private: +// +//}; + #endif /* MORPHDESERIALIZER_HPP */ diff --git a/morfeusz/MorphInterpretation.hpp b/morfeusz/MorphInterpretation.hpp index bbebd28..eab9c1a 100644 --- a/morfeusz/MorphInterpretation.hpp +++ b/morfeusz/MorphInterpretation.hpp @@ -36,8 +36,8 @@ private: std::string lemma; int tagnum; int namenum; - const std::string& tag; - const std::string& name; + std::string tag; + std::string name; }; #endif /* MORPHINTERPRETATION_HPP */ diff --git a/morfeusz/charset/CharsetConverter.cpp b/morfeusz/charset/CharsetConverter.cpp new file mode 100644 index 0000000..226353a --- /dev/null +++ b/morfeusz/charset/CharsetConverter.cpp @@ -0,0 +1,16 @@ +/* + * File: EncodingConverter.cpp + * Author: mlenart + * + * Created on 14 listopad 2013, 17:28 + */ + +#include "utf8.h" +#include "CharsetConverter.hpp" + +uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { + return utf8::next(it, end); +} +char* UTF8CharsetConverter::append(uint32_t cp, char* result) const { + return utf8::append(cp, result); +} diff --git a/morfeusz/charset/CharsetConverter.hpp b/morfeusz/charset/CharsetConverter.hpp new file mode 100644 index 0000000..a58b4c1 --- /dev/null +++ b/morfeusz/charset/CharsetConverter.hpp @@ -0,0 +1,47 @@ +/* + * File: EncodingConverter.hpp + * Author: mlenart + * + * Created on 14 listopad 2013, 17:28 + */ + +#ifndef ENCODINGCONVERTER_HPP +#define ENCODINGCONVERTER_HPP + +class CharsetConverter { +public: + virtual uint32_t next(const char*& it, const char* end) const = 0; + virtual char* append(uint32_t cp, char* result) const = 0; +private: +}; + +class UTF8CharsetConverter: public CharsetConverter { +public: + uint32_t next(const char*& it, const char* end) const; + char* append(uint32_t cp, char* result) const; +private: +}; + +class UTF16CharsetConverter: public CharsetConverter { +public: + uint32_t next(const char*& it, const char* end) const; + char* append(uint32_t cp, char* result) const; +private: +}; + +class UTF32CharsetConverter: public CharsetConverter { +public: + uint32_t next(const char*& it, const char* end) const; + char* append(uint32_t cp, char* result) const; +private: +}; + +class ISO8859_2_CharsetConverter: public CharsetConverter { +public: + uint32_t next(const char*& it, const char* end) const; + char* append(uint32_t cp, char* result) const; +private: +}; + +#endif /* ENCODINGCONVERTER_HPP */ + diff --git a/morfeusz/charset/charset_utils.hpp b/morfeusz/charset/charset_utils.hpp new file mode 100644 index 0000000..c845441 --- /dev/null +++ b/morfeusz/charset/charset_utils.hpp @@ -0,0 +1,14 @@ +/* + * File: charset_utils.hpp + * Author: lennyn + * + * Created on November 15, 2013, 1:57 PM + */ + +#ifndef CHARSET_UTILS_HPP +#define CHARSET_UTILS_HPP + + + +#endif /* CHARSET_UTILS_HPP */ + diff --git a/morfeusz/charset/utf8.h b/morfeusz/charset/utf8.h new file mode 100644 index 0000000..4e44514 --- /dev/null +++ b/morfeusz/charset/utf8.h @@ -0,0 +1,34 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "utf8/checked.h" +#include "utf8/unchecked.h" + +#endif // header guard diff --git a/morfeusz/charset/utf8/checked.h b/morfeusz/charset/utf8/checked.h new file mode 100644 index 0000000..1331155 --- /dev/null +++ b/morfeusz/charset/utf8/checked.h @@ -0,0 +1,327 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" +#include <stdexcept> + +namespace utf8 +{ + // Base for the exceptions that may be thrown from the library + class exception : public ::std::exception { + }; + + // Exceptions that may be thrown from the library functions. + class invalid_code_point : public exception { + uint32_t cp; + public: + invalid_code_point(uint32_t cp) : cp(cp) {} + virtual const char* what() const throw() { return "Invalid code point"; } + uint32_t code_point() const {return cp;} + }; + + class invalid_utf8 : public exception { + uint8_t u8; + public: + invalid_utf8 (uint8_t u) : u8(u) {} + virtual const char* what() const throw() { return "Invalid UTF-8"; } + uint8_t utf8_octet() const {return u8;} + }; + + class invalid_utf16 : public exception { + uint16_t u16; + public: + invalid_utf16 (uint16_t u) : u16(u) {} + virtual const char* what() const throw() { return "Invalid UTF-16"; } + uint16_t utf16_word() const {return u16;} + }; + + class not_enough_room : public exception { + public: + virtual const char* what() const throw() { return "Not enough space"; } + }; + + /// The library API - functions intended to be called by the users + + template <typename octet_iterator> + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (!utf8::internal::is_code_point_valid(cp)) + throw invalid_code_point(cp); + + if (cp < 0x80) // one octet + *(result++) = static_cast<uint8_t>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + return result; + } + + template <typename octet_iterator, typename output_iterator> + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) + { + while (start != end) { + octet_iterator sequence_start = start; + internal::utf_error err_code = utf8::internal::validate_next(start, end); + switch (err_code) { + case internal::UTF8_OK : + for (octet_iterator it = sequence_start; it != start; ++it) + *out++ = *it; + break; + case internal::NOT_ENOUGH_ROOM: + throw not_enough_room(); + case internal::INVALID_LEAD: + out = utf8::append (replacement, out); + ++start; + break; + case internal::INCOMPLETE_SEQUENCE: + case internal::OVERLONG_SEQUENCE: + case internal::INVALID_CODE_POINT: + out = utf8::append (replacement, out); + ++start; + // just one replacement mark for the sequence + while (start != end && utf8::internal::is_trail(*start)) + ++start; + break; + } + } + return out; + } + + template <typename octet_iterator, typename output_iterator> + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) + { + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); + return utf8::replace_invalid(start, end, out, replacement_marker); + } + + template <typename octet_iterator> + uint32_t next(octet_iterator& it, octet_iterator end) + { + uint32_t cp = 0; + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + switch (err_code) { + case internal::UTF8_OK : + break; + case internal::NOT_ENOUGH_ROOM : + throw not_enough_room(); + case internal::INVALID_LEAD : + case internal::INCOMPLETE_SEQUENCE : + case internal::OVERLONG_SEQUENCE : + throw invalid_utf8(*it); + case internal::INVALID_CODE_POINT : + throw invalid_code_point(cp); + } + return cp; + } + + template <typename octet_iterator> + uint32_t peek_next(octet_iterator it, octet_iterator end) + { + return utf8::next(it, end); + } + + template <typename octet_iterator> + uint32_t prior(octet_iterator& it, octet_iterator start) + { + // can't do much if it == start + if (it == start) + throw not_enough_room(); + + octet_iterator end = it; + // Go back until we hit either a lead octet or start + while (utf8::internal::is_trail(*(--it))) + if (it == start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + return utf8::peek_next(it, end); + } + + /// Deprecated in versions that include "prior" + template <typename octet_iterator> + uint32_t previous(octet_iterator& it, octet_iterator pass_start) + { + octet_iterator end = it; + while (utf8::internal::is_trail(*(--it))) + if (it == pass_start) + throw invalid_utf8(*it); // error - no lead byte in the sequence + octet_iterator temp = it; + return utf8::next(temp, end); + } + + template <typename octet_iterator, typename distance_type> + void advance (octet_iterator& it, distance_type n, octet_iterator end) + { + for (distance_type i = 0; i < n; ++i) + utf8::next(it, end); + } + + template <typename octet_iterator> + typename std::iterator_traits<octet_iterator>::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits<octet_iterator>::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::next(first, last); + return dist; + } + + template <typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + if (start != end) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + if (utf8::internal::is_trail_surrogate(trail_surrogate)) + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + else + throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); + } + else + throw invalid_utf16(static_cast<uint16_t>(cp)); + + } + // Lone trail surrogate + else if (utf8::internal::is_trail_surrogate(cp)) + throw invalid_utf16(static_cast<uint16_t>(cp)); + + result = utf8::append(cp, result); + } + return result; + } + + template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start != end) { + uint32_t cp = utf8::next(start, end); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast<uint16_t>(cp); + } + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::append(*(start++), result); + + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start != end) + (*result++) = utf8::next(start, end); + + return result; + } + + // The iterator class + template <typename octet_iterator> + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { + octet_iterator it; + octet_iterator range_start; + octet_iterator range_end; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it, + const octet_iterator& range_start, + const octet_iterator& range_end) : + it(octet_it), range_start(range_start), range_end(range_end) + { + if (it < range_start || it > range_end) + throw std::out_of_range("Invalid utf-8 iterator position"); + } + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::next(temp, range_end); + } + bool operator == (const iterator& rhs) const + { + if (range_start != rhs.range_start || range_end != rhs.range_end) + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + utf8::next(it, range_end); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + utf8::next(it, range_end); + return temp; + } + iterator& operator -- () + { + utf8::prior(it, range_start); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::prior(it, range_start); + return temp; + } + }; // class iterator + +} // namespace utf8 + +#endif //header guard + + diff --git a/morfeusz/charset/utf8/core.h b/morfeusz/charset/utf8/core.h new file mode 100644 index 0000000..d237583 --- /dev/null +++ b/morfeusz/charset/utf8/core.h @@ -0,0 +1,330 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include <iterator> +#include <cstdint> + +namespace utf8 +{ + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers + // You may need to change them to match your system. + // These typedefs have the same names as ones from cstdint, or boost/cstdint +// typedef unsigned char uint8_t; +// typedef unsigned short uint16_t; +// typedef unsigned int uint32_t; + +// Helper code - not intended to be directly called by the library users. May be changed at any time +namespace internal +{ + // Unicode constants + // Leading (high) surrogates: 0xd800 - 0xdbff + // Trailing (low) surrogates: 0xdc00 - 0xdfff + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; + + // Maximum valid value for a Unicode code point + const uint32_t CODE_POINT_MAX = 0x0010ffffu; + + template<typename octet_type> + inline uint8_t mask8(octet_type oc) + { + return static_cast<uint8_t>(0xff & oc); + } + template<typename u16_type> + inline uint16_t mask16(u16_type oc) + { + return static_cast<uint16_t>(0xffff & oc); + } + template<typename octet_type> + inline bool is_trail(octet_type oc) + { + return ((utf8::internal::mask8(oc) >> 6) == 0x2); + } + + template <typename u16> + inline bool is_lead_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); + } + + template <typename u16> + inline bool is_trail_surrogate(u16 cp) + { + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template <typename u16> + inline bool is_surrogate(u16 cp) + { + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); + } + + template <typename u32> + inline bool is_code_point_valid(u32 cp) + { + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); + } + + template <typename octet_iterator> + inline typename std::iterator_traits<octet_iterator>::difference_type + sequence_length(octet_iterator lead_it) + { + uint8_t lead = utf8::internal::mask8(*lead_it); + if (lead < 0x80) + return 1; + else if ((lead >> 5) == 0x6) + return 2; + else if ((lead >> 4) == 0xe) + return 3; + else if ((lead >> 3) == 0x1e) + return 4; + else + return 0; + } + + template <typename octet_difference_type> + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) + { + if (cp < 0x80) { + if (length != 1) + return true; + } + else if (cp < 0x800) { + if (length != 2) + return true; + } + else if (cp < 0x10000) { + if (length != 3) + return true; + } + + return false; + } + + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; + + /// Helper for get_sequence_x + template <typename octet_iterator> + utf_error increase_safely(octet_iterator& it, octet_iterator end) + { + if (++it == end) + return NOT_ENOUGH_ROOM; + + if (!utf8::internal::is_trail(*it)) + return INCOMPLETE_SEQUENCE; + + return UTF8_OK; + } + + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} + + /// get_sequence_x functions decode utf-8 sequences of the length x + template <typename octet_iterator> + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + template <typename octet_iterator> + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + if (it == end) + return NOT_ENOUGH_ROOM; + + code_point = utf8::internal::mask8(*it); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; + + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) + + code_point += (*it) & 0x3f; + + return UTF8_OK; + } + + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR + + template <typename octet_iterator> + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) + { + // Save the original value of it so we can go back in case of failure + // Of course, it does not make much sense with i.e. stream iterators + octet_iterator original_it = it; + + uint32_t cp = 0; + // Determine the sequence length based on the lead octet + typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type; + const octet_difference_type length = utf8::internal::sequence_length(it); + + // Get trail octets and calculate the code point + utf_error err = UTF8_OK; + switch (length) { + case 0: + return INVALID_LEAD; + case 1: + err = utf8::internal::get_sequence_1(it, end, cp); + break; + case 2: + err = utf8::internal::get_sequence_2(it, end, cp); + break; + case 3: + err = utf8::internal::get_sequence_3(it, end, cp); + break; + case 4: + err = utf8::internal::get_sequence_4(it, end, cp); + break; + } + + if (err == UTF8_OK) { + // Decoding succeeded. Now, security checks... + if (utf8::internal::is_code_point_valid(cp)) { + if (!utf8::internal::is_overlong_sequence(cp, length)){ + // Passed! Return here. + code_point = cp; + ++it; + return UTF8_OK; + } + else + err = OVERLONG_SEQUENCE; + } + else + err = INVALID_CODE_POINT; + } + + // Failure branch - restore the original value of the iterator + it = original_it; + return err; + } + + template <typename octet_iterator> + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { + uint32_t ignored; + return utf8::internal::validate_next(it, end, ignored); + } + +} // namespace internal + + /// The library API - functions intended to be called by the users + + // Byte order mark + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; + + template <typename octet_iterator> + octet_iterator find_invalid(octet_iterator start, octet_iterator end) + { + octet_iterator result = start; + while (result != end) { + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); + if (err_code != internal::UTF8_OK) + return result; + } + return result; + } + + template <typename octet_iterator> + inline bool is_valid(octet_iterator start, octet_iterator end) + { + return (utf8::find_invalid(start, end) == end); + } + + template <typename octet_iterator> + inline bool starts_with_bom (octet_iterator it, octet_iterator end) + { + return ( + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) + ); + } + + //Deprecated in release 2.3 + template <typename octet_iterator> + inline bool is_bom (octet_iterator it) + { + return ( + (utf8::internal::mask8(*it++)) == bom[0] && + (utf8::internal::mask8(*it++)) == bom[1] && + (utf8::internal::mask8(*it)) == bom[2] + ); + } +} // namespace utf8 + +#endif // header guard + + diff --git a/morfeusz/charset/utf8/unchecked.h b/morfeusz/charset/utf8/unchecked.h new file mode 100644 index 0000000..cb24271 --- /dev/null +++ b/morfeusz/charset/utf8/unchecked.h @@ -0,0 +1,228 @@ +// Copyright 2006 Nemanja Trifunovic + +/* +Permission is hereby granted, free of charge, to any person or organization +obtaining a copy of the software and accompanying documentation covered by +this license (the "Software") to use, reproduce, display, distribute, +execute, and transmit the Software, and to prepare derivative works of the +Software, and to permit third-parties to whom the Software is furnished to +do so, all subject to the following: + +The copyright notices in the Software and this entire statement, including +the above license grant, this restriction and the following disclaimer, +must be included in all copies of the Software, in whole or in part, and +all derivative works of the Software, unless such copies or derivative +works are solely in the form of machine-executable object code generated by +a source language processor. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. +*/ + + +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 + +#include "core.h" + +namespace utf8 +{ + namespace unchecked + { + template <typename octet_iterator> + octet_iterator append(uint32_t cp, octet_iterator result) + { + if (cp < 0x80) // one octet + *(result++) = static_cast<uint8_t>(cp); + else if (cp < 0x800) { // two octets + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else if (cp < 0x10000) { // three octets + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + else { // four octets + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); + } + return result; + } + + template <typename octet_iterator> + uint32_t next(octet_iterator& it) + { + uint32_t cp = utf8::internal::mask8(*it); + typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); + switch (length) { + case 1: + break; + case 2: + it++; + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); + break; + case 3: + ++it; + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); + ++it; + cp += (*it) & 0x3f; + break; + case 4: + ++it; + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); + ++it; + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; + ++it; + cp += (*it) & 0x3f; + break; + } + ++it; + return cp; + } + + template <typename octet_iterator> + uint32_t peek_next(octet_iterator it) + { + return utf8::unchecked::next(it); + } + + template <typename octet_iterator> + uint32_t prior(octet_iterator& it) + { + while (utf8::internal::is_trail(*(--it))) ; + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) + template <typename octet_iterator> + inline uint32_t previous(octet_iterator& it) + { + return utf8::unchecked::prior(it); + } + + template <typename octet_iterator, typename distance_type> + void advance (octet_iterator& it, distance_type n) + { + for (distance_type i = 0; i < n; ++i) + utf8::unchecked::next(it); + } + + template <typename octet_iterator> + typename std::iterator_traits<octet_iterator>::difference_type + distance (octet_iterator first, octet_iterator last) + { + typename std::iterator_traits<octet_iterator>::difference_type dist; + for (dist = 0; first < last; ++dist) + utf8::unchecked::next(first); + return dist; + } + + template <typename u16bit_iterator, typename octet_iterator> + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) + { + while (start != end) { + uint32_t cp = utf8::internal::mask16(*start++); + // Take care of surrogate pairs first + if (utf8::internal::is_lead_surrogate(cp)) { + uint32_t trail_surrogate = utf8::internal::mask16(*start++); + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; + } + result = utf8::unchecked::append(cp, result); + } + return result; + } + + template <typename u16bit_iterator, typename octet_iterator> + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) + { + while (start < end) { + uint32_t cp = utf8::unchecked::next(start); + if (cp > 0xffff) { //make a surrogate pair + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); + } + else + *result++ = static_cast<uint16_t>(cp); + } + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) + { + while (start != end) + result = utf8::unchecked::append(*(start++), result); + + return result; + } + + template <typename octet_iterator, typename u32bit_iterator> + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) + { + while (start < end) + (*result++) = utf8::unchecked::next(start); + + return result; + } + + // The iterator class + template <typename octet_iterator> + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { + octet_iterator it; + public: + iterator () {} + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} + // the default "big three" are OK + octet_iterator base () const { return it; } + uint32_t operator * () const + { + octet_iterator temp = it; + return utf8::unchecked::next(temp); + } + bool operator == (const iterator& rhs) const + { + return (it == rhs.it); + } + bool operator != (const iterator& rhs) const + { + return !(operator == (rhs)); + } + iterator& operator ++ () + { + ::std::advance(it, utf8::internal::sequence_length(it)); + return *this; + } + iterator operator ++ (int) + { + iterator temp = *this; + ::std::advance(it, utf8::internal::sequence_length(it)); + return temp; + } + iterator& operator -- () + { + utf8::unchecked::prior(it); + return *this; + } + iterator operator -- (int) + { + iterator temp = *this; + utf8::unchecked::prior(it); + return temp; + } + }; // class iterator + + } // namespace utf8::unchecked +} // namespace utf8 + + +#endif // header guard + diff --git a/morfeusz/encoding/CharsetConverter.cpp b/morfeusz/encoding/CharsetConverter.cpp deleted file mode 100644 index 358fb31..0000000 --- a/morfeusz/encoding/CharsetConverter.cpp +++ /dev/null @@ -1,16 +0,0 @@ -/* - * File: EncodingConverter.cpp - * Author: mlenart - * - * Created on 14 listopad 2013, 17:28 - */ - -#include "utf8.h" -#include "CharsetConverter.hpp" - -uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { - return utf8::next(it, end); -} -const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { - return utf8::append(cp, result); -} diff --git a/morfeusz/encoding/CharsetConverter.hpp b/morfeusz/encoding/CharsetConverter.hpp deleted file mode 100644 index fa25f78..0000000 --- a/morfeusz/encoding/CharsetConverter.hpp +++ /dev/null @@ -1,47 +0,0 @@ -/* - * File: EncodingConverter.hpp - * Author: mlenart - * - * Created on 14 listopad 2013, 17:28 - */ - -#ifndef ENCODINGCONVERTER_HPP -#define ENCODINGCONVERTER_HPP - -class CharsetConverter { -public: - virtual uint32_t next(const char*& it, const char* end) const = 0; - virtual const char* append(uint32_t cp, const char* result) const = 0; -private: -}; - -class UTF8CharsetConverter: public CharsetConverter { -public: - uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; -private: -}; - -class UTF16CharsetConverter: public CharsetConverter { -public: - uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; -private: -}; - -class UTF32CharsetConverter: public CharsetConverter { -public: - uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; -private: -}; - -class ISO8859_2_CharsetConverter: public CharsetConverter { -public: - uint32_t next(const char*& it, const char* end) const; - const char* append(uint32_t cp, const char* result) const; -private: -}; - -#endif /* ENCODINGCONVERTER_HPP */ - diff --git a/morfeusz/encoding/utf8.h b/morfeusz/encoding/utf8.h deleted file mode 100644 index 4e44514..0000000 --- a/morfeusz/encoding/utf8.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "utf8/checked.h" -#include "utf8/unchecked.h" - -#endif // header guard diff --git a/morfeusz/encoding/utf8/checked.h b/morfeusz/encoding/utf8/checked.h deleted file mode 100644 index 1331155..0000000 --- a/morfeusz/encoding/utf8/checked.h +++ /dev/null @@ -1,327 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" -#include <stdexcept> - -namespace utf8 -{ - // Base for the exceptions that may be thrown from the library - class exception : public ::std::exception { - }; - - // Exceptions that may be thrown from the library functions. - class invalid_code_point : public exception { - uint32_t cp; - public: - invalid_code_point(uint32_t cp) : cp(cp) {} - virtual const char* what() const throw() { return "Invalid code point"; } - uint32_t code_point() const {return cp;} - }; - - class invalid_utf8 : public exception { - uint8_t u8; - public: - invalid_utf8 (uint8_t u) : u8(u) {} - virtual const char* what() const throw() { return "Invalid UTF-8"; } - uint8_t utf8_octet() const {return u8;} - }; - - class invalid_utf16 : public exception { - uint16_t u16; - public: - invalid_utf16 (uint16_t u) : u16(u) {} - virtual const char* what() const throw() { return "Invalid UTF-16"; } - uint16_t utf16_word() const {return u16;} - }; - - class not_enough_room : public exception { - public: - virtual const char* what() const throw() { return "Not enough space"; } - }; - - /// The library API - functions intended to be called by the users - - template <typename octet_iterator> - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (!utf8::internal::is_code_point_valid(cp)) - throw invalid_code_point(cp); - - if (cp < 0x80) // one octet - *(result++) = static_cast<uint8_t>(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); - *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - return result; - } - - template <typename octet_iterator, typename output_iterator> - output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) - { - while (start != end) { - octet_iterator sequence_start = start; - internal::utf_error err_code = utf8::internal::validate_next(start, end); - switch (err_code) { - case internal::UTF8_OK : - for (octet_iterator it = sequence_start; it != start; ++it) - *out++ = *it; - break; - case internal::NOT_ENOUGH_ROOM: - throw not_enough_room(); - case internal::INVALID_LEAD: - out = utf8::append (replacement, out); - ++start; - break; - case internal::INCOMPLETE_SEQUENCE: - case internal::OVERLONG_SEQUENCE: - case internal::INVALID_CODE_POINT: - out = utf8::append (replacement, out); - ++start; - // just one replacement mark for the sequence - while (start != end && utf8::internal::is_trail(*start)) - ++start; - break; - } - } - return out; - } - - template <typename octet_iterator, typename output_iterator> - inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) - { - static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); - return utf8::replace_invalid(start, end, out, replacement_marker); - } - - template <typename octet_iterator> - uint32_t next(octet_iterator& it, octet_iterator end) - { - uint32_t cp = 0; - internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); - switch (err_code) { - case internal::UTF8_OK : - break; - case internal::NOT_ENOUGH_ROOM : - throw not_enough_room(); - case internal::INVALID_LEAD : - case internal::INCOMPLETE_SEQUENCE : - case internal::OVERLONG_SEQUENCE : - throw invalid_utf8(*it); - case internal::INVALID_CODE_POINT : - throw invalid_code_point(cp); - } - return cp; - } - - template <typename octet_iterator> - uint32_t peek_next(octet_iterator it, octet_iterator end) - { - return utf8::next(it, end); - } - - template <typename octet_iterator> - uint32_t prior(octet_iterator& it, octet_iterator start) - { - // can't do much if it == start - if (it == start) - throw not_enough_room(); - - octet_iterator end = it; - // Go back until we hit either a lead octet or start - while (utf8::internal::is_trail(*(--it))) - if (it == start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - return utf8::peek_next(it, end); - } - - /// Deprecated in versions that include "prior" - template <typename octet_iterator> - uint32_t previous(octet_iterator& it, octet_iterator pass_start) - { - octet_iterator end = it; - while (utf8::internal::is_trail(*(--it))) - if (it == pass_start) - throw invalid_utf8(*it); // error - no lead byte in the sequence - octet_iterator temp = it; - return utf8::next(temp, end); - } - - template <typename octet_iterator, typename distance_type> - void advance (octet_iterator& it, distance_type n, octet_iterator end) - { - for (distance_type i = 0; i < n; ++i) - utf8::next(it, end); - } - - template <typename octet_iterator> - typename std::iterator_traits<octet_iterator>::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits<octet_iterator>::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::next(first, last); - return dist; - } - - template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - if (start != end) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - if (utf8::internal::is_trail_surrogate(trail_surrogate)) - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - else - throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); - } - else - throw invalid_utf16(static_cast<uint16_t>(cp)); - - } - // Lone trail surrogate - else if (utf8::internal::is_trail_surrogate(cp)) - throw invalid_utf16(static_cast<uint16_t>(cp)); - - result = utf8::append(cp, result); - } - return result; - } - - template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start != end) { - uint32_t cp = utf8::next(start, end); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast<uint16_t>(cp); - } - return result; - } - - template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::append(*(start++), result); - - return result; - } - - template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start != end) - (*result++) = utf8::next(start, end); - - return result; - } - - // The iterator class - template <typename octet_iterator> - class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { - octet_iterator it; - octet_iterator range_start; - octet_iterator range_end; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it, - const octet_iterator& range_start, - const octet_iterator& range_end) : - it(octet_it), range_start(range_start), range_end(range_end) - { - if (it < range_start || it > range_end) - throw std::out_of_range("Invalid utf-8 iterator position"); - } - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::next(temp, range_end); - } - bool operator == (const iterator& rhs) const - { - if (range_start != rhs.range_start || range_end != rhs.range_end) - throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - utf8::next(it, range_end); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - utf8::next(it, range_end); - return temp; - } - iterator& operator -- () - { - utf8::prior(it, range_start); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::prior(it, range_start); - return temp; - } - }; // class iterator - -} // namespace utf8 - -#endif //header guard - - diff --git a/morfeusz/encoding/utf8/core.h b/morfeusz/encoding/utf8/core.h deleted file mode 100644 index d237583..0000000 --- a/morfeusz/encoding/utf8/core.h +++ /dev/null @@ -1,330 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include <iterator> -#include <cstdint> - -namespace utf8 -{ - // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers - // You may need to change them to match your system. - // These typedefs have the same names as ones from cstdint, or boost/cstdint -// typedef unsigned char uint8_t; -// typedef unsigned short uint16_t; -// typedef unsigned int uint32_t; - -// Helper code - not intended to be directly called by the library users. May be changed at any time -namespace internal -{ - // Unicode constants - // Leading (high) surrogates: 0xd800 - 0xdbff - // Trailing (low) surrogates: 0xdc00 - 0xdfff - const uint16_t LEAD_SURROGATE_MIN = 0xd800u; - const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; - const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; - const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; - const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); - const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; - - // Maximum valid value for a Unicode code point - const uint32_t CODE_POINT_MAX = 0x0010ffffu; - - template<typename octet_type> - inline uint8_t mask8(octet_type oc) - { - return static_cast<uint8_t>(0xff & oc); - } - template<typename u16_type> - inline uint16_t mask16(u16_type oc) - { - return static_cast<uint16_t>(0xffff & oc); - } - template<typename octet_type> - inline bool is_trail(octet_type oc) - { - return ((utf8::internal::mask8(oc) >> 6) == 0x2); - } - - template <typename u16> - inline bool is_lead_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); - } - - template <typename u16> - inline bool is_trail_surrogate(u16 cp) - { - return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template <typename u16> - inline bool is_surrogate(u16 cp) - { - return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); - } - - template <typename u32> - inline bool is_code_point_valid(u32 cp) - { - return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); - } - - template <typename octet_iterator> - inline typename std::iterator_traits<octet_iterator>::difference_type - sequence_length(octet_iterator lead_it) - { - uint8_t lead = utf8::internal::mask8(*lead_it); - if (lead < 0x80) - return 1; - else if ((lead >> 5) == 0x6) - return 2; - else if ((lead >> 4) == 0xe) - return 3; - else if ((lead >> 3) == 0x1e) - return 4; - else - return 0; - } - - template <typename octet_difference_type> - inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) - { - if (cp < 0x80) { - if (length != 1) - return true; - } - else if (cp < 0x800) { - if (length != 2) - return true; - } - else if (cp < 0x10000) { - if (length != 3) - return true; - } - - return false; - } - - enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; - - /// Helper for get_sequence_x - template <typename octet_iterator> - utf_error increase_safely(octet_iterator& it, octet_iterator end) - { - if (++it == end) - return NOT_ENOUGH_ROOM; - - if (!utf8::internal::is_trail(*it)) - return INCOMPLETE_SEQUENCE; - - return UTF8_OK; - } - - #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} - - /// get_sequence_x functions decode utf-8 sequences of the length x - template <typename octet_iterator> - utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - return UTF8_OK; - } - - template <typename octet_iterator> - utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); - - return UTF8_OK; - } - - template <typename octet_iterator> - utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - template <typename octet_iterator> - utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - if (it == end) - return NOT_ENOUGH_ROOM; - - code_point = utf8::internal::mask8(*it); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; - - UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) - - code_point += (*it) & 0x3f; - - return UTF8_OK; - } - - #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR - - template <typename octet_iterator> - utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) - { - // Save the original value of it so we can go back in case of failure - // Of course, it does not make much sense with i.e. stream iterators - octet_iterator original_it = it; - - uint32_t cp = 0; - // Determine the sequence length based on the lead octet - typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type; - const octet_difference_type length = utf8::internal::sequence_length(it); - - // Get trail octets and calculate the code point - utf_error err = UTF8_OK; - switch (length) { - case 0: - return INVALID_LEAD; - case 1: - err = utf8::internal::get_sequence_1(it, end, cp); - break; - case 2: - err = utf8::internal::get_sequence_2(it, end, cp); - break; - case 3: - err = utf8::internal::get_sequence_3(it, end, cp); - break; - case 4: - err = utf8::internal::get_sequence_4(it, end, cp); - break; - } - - if (err == UTF8_OK) { - // Decoding succeeded. Now, security checks... - if (utf8::internal::is_code_point_valid(cp)) { - if (!utf8::internal::is_overlong_sequence(cp, length)){ - // Passed! Return here. - code_point = cp; - ++it; - return UTF8_OK; - } - else - err = OVERLONG_SEQUENCE; - } - else - err = INVALID_CODE_POINT; - } - - // Failure branch - restore the original value of the iterator - it = original_it; - return err; - } - - template <typename octet_iterator> - inline utf_error validate_next(octet_iterator& it, octet_iterator end) { - uint32_t ignored; - return utf8::internal::validate_next(it, end, ignored); - } - -} // namespace internal - - /// The library API - functions intended to be called by the users - - // Byte order mark - const uint8_t bom[] = {0xef, 0xbb, 0xbf}; - - template <typename octet_iterator> - octet_iterator find_invalid(octet_iterator start, octet_iterator end) - { - octet_iterator result = start; - while (result != end) { - utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); - if (err_code != internal::UTF8_OK) - return result; - } - return result; - } - - template <typename octet_iterator> - inline bool is_valid(octet_iterator start, octet_iterator end) - { - return (utf8::find_invalid(start, end) == end); - } - - template <typename octet_iterator> - inline bool starts_with_bom (octet_iterator it, octet_iterator end) - { - return ( - ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && - ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && - ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) - ); - } - - //Deprecated in release 2.3 - template <typename octet_iterator> - inline bool is_bom (octet_iterator it) - { - return ( - (utf8::internal::mask8(*it++)) == bom[0] && - (utf8::internal::mask8(*it++)) == bom[1] && - (utf8::internal::mask8(*it)) == bom[2] - ); - } -} // namespace utf8 - -#endif // header guard - - diff --git a/morfeusz/encoding/utf8/unchecked.h b/morfeusz/encoding/utf8/unchecked.h deleted file mode 100644 index cb24271..0000000 --- a/morfeusz/encoding/utf8/unchecked.h +++ /dev/null @@ -1,228 +0,0 @@ -// Copyright 2006 Nemanja Trifunovic - -/* -Permission is hereby granted, free of charge, to any person or organization -obtaining a copy of the software and accompanying documentation covered by -this license (the "Software") to use, reproduce, display, distribute, -execute, and transmit the Software, and to prepare derivative works of the -Software, and to permit third-parties to whom the Software is furnished to -do so, all subject to the following: - -The copyright notices in the Software and this entire statement, including -the above license grant, this restriction and the following disclaimer, -must be included in all copies of the Software, in whole or in part, and -all derivative works of the Software, unless such copies or derivative -works are solely in the form of machine-executable object code generated by -a source language processor. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT -SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE -FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, -ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER -DEALINGS IN THE SOFTWARE. -*/ - - -#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 -#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 - -#include "core.h" - -namespace utf8 -{ - namespace unchecked - { - template <typename octet_iterator> - octet_iterator append(uint32_t cp, octet_iterator result) - { - if (cp < 0x80) // one octet - *(result++) = static_cast<uint8_t>(cp); - else if (cp < 0x800) { // two octets - *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else if (cp < 0x10000) { // three octets - *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - else { // four octets - *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); - *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); - *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); - *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); - } - return result; - } - - template <typename octet_iterator> - uint32_t next(octet_iterator& it) - { - uint32_t cp = utf8::internal::mask8(*it); - typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); - switch (length) { - case 1: - break; - case 2: - it++; - cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); - break; - case 3: - ++it; - cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); - ++it; - cp += (*it) & 0x3f; - break; - case 4: - ++it; - cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); - ++it; - cp += (utf8::internal::mask8(*it) << 6) & 0xfff; - ++it; - cp += (*it) & 0x3f; - break; - } - ++it; - return cp; - } - - template <typename octet_iterator> - uint32_t peek_next(octet_iterator it) - { - return utf8::unchecked::next(it); - } - - template <typename octet_iterator> - uint32_t prior(octet_iterator& it) - { - while (utf8::internal::is_trail(*(--it))) ; - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - - // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) - template <typename octet_iterator> - inline uint32_t previous(octet_iterator& it) - { - return utf8::unchecked::prior(it); - } - - template <typename octet_iterator, typename distance_type> - void advance (octet_iterator& it, distance_type n) - { - for (distance_type i = 0; i < n; ++i) - utf8::unchecked::next(it); - } - - template <typename octet_iterator> - typename std::iterator_traits<octet_iterator>::difference_type - distance (octet_iterator first, octet_iterator last) - { - typename std::iterator_traits<octet_iterator>::difference_type dist; - for (dist = 0; first < last; ++dist) - utf8::unchecked::next(first); - return dist; - } - - template <typename u16bit_iterator, typename octet_iterator> - octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) - { - while (start != end) { - uint32_t cp = utf8::internal::mask16(*start++); - // Take care of surrogate pairs first - if (utf8::internal::is_lead_surrogate(cp)) { - uint32_t trail_surrogate = utf8::internal::mask16(*start++); - cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; - } - result = utf8::unchecked::append(cp, result); - } - return result; - } - - template <typename u16bit_iterator, typename octet_iterator> - u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) - { - while (start < end) { - uint32_t cp = utf8::unchecked::next(start); - if (cp > 0xffff) { //make a surrogate pair - *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); - *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); - } - else - *result++ = static_cast<uint16_t>(cp); - } - return result; - } - - template <typename octet_iterator, typename u32bit_iterator> - octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) - { - while (start != end) - result = utf8::unchecked::append(*(start++), result); - - return result; - } - - template <typename octet_iterator, typename u32bit_iterator> - u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) - { - while (start < end) - (*result++) = utf8::unchecked::next(start); - - return result; - } - - // The iterator class - template <typename octet_iterator> - class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { - octet_iterator it; - public: - iterator () {} - explicit iterator (const octet_iterator& octet_it): it(octet_it) {} - // the default "big three" are OK - octet_iterator base () const { return it; } - uint32_t operator * () const - { - octet_iterator temp = it; - return utf8::unchecked::next(temp); - } - bool operator == (const iterator& rhs) const - { - return (it == rhs.it); - } - bool operator != (const iterator& rhs) const - { - return !(operator == (rhs)); - } - iterator& operator ++ () - { - ::std::advance(it, utf8::internal::sequence_length(it)); - return *this; - } - iterator operator ++ (int) - { - iterator temp = *this; - ::std::advance(it, utf8::internal::sequence_length(it)); - return temp; - } - iterator& operator -- () - { - utf8::unchecked::prior(it); - return *this; - } - iterator operator -- (int) - { - iterator temp = *this; - utf8::unchecked::prior(it); - return temp; - } - }; // class iterator - - } // namespace utf8::unchecked -} // namespace utf8 - - -#endif // header guard - diff --git a/morfeusz/test_morph.cpp b/morfeusz/test_morph.cpp index 7f66f39..1de8633 100644 --- a/morfeusz/test_morph.cpp +++ b/morfeusz/test_morph.cpp @@ -18,7 +18,7 @@ using namespace std; void doTest( - const FSA<vector<EncodedInterpretation>>& fsa, + const FSA<vector<InterpsGroup>>& fsa, const Tagset& tagset, // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, const char* fname) { @@ -32,14 +32,15 @@ void doTest( string lemma = splitVector[1]; string tag = splitVector[2]; string name = splitVector[3]; - vector<EncodedInterpretation> value2; + vector<InterpsGroup> value2; fsa.tryToRecognize(orth.c_str(), value2); DEBUG("recognized "+to_string(value2.size())); // vector<TaggedInterpretation> parsedValues; bool found = false; - for (EncodedInterpretation encodedInterp: value2) { + for (InterpsGroup gi: value2) + for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) { // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); - MorphInterpretation interp(0, 0, orth, encodedInterp, tagset); +// (0, 0, orth, encodedInterp, tagset); // parsedValues.push_back(parsedValue); // debug(orth, parsedValue); if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { @@ -62,10 +63,7 @@ int main(int argc, char** argv) { validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); const unsigned char* fsaData = readFile(argv[1]); MorphDeserializer deserializer; - DEBUG("will read FSA"); - FSA<vector<EncodedInterpretation>>* fsa = FSA<vector<EncodedInterpretation>>::getFSA(fsaData, deserializer); - DEBUG("DONE read FSA"); - DEBUG("will read tagset"); + FSA<vector<InterpsGroup>>* fsa = FSA<vector<InterpsGroup>>::getFSA(fsaData, deserializer); Tagset tagset(fsaData); // TaggedInterpretationsDecoder interpsDecoder(tagset); DEBUG("DONE read tagset"); diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index f4e3032..5db4a12 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -8,11 +8,13 @@ <in>test_speed.cpp</in> </df> <df root="morfeusz" name="1"> - <df name="encoding"> + <df name="charset"> <in>CharsetConverter.cpp</in> - <in>CharsetConverter.hpp</in> + <in>charset_utils.hpp</in> </df> + <in>InterpsGroup.hpp</in> <in>Morfeusz.cpp</in> + <in>Morfeusz_impl.hpp</in> <in>MorphDeserializer.cpp</in> <in>MorphInterpretation.cpp</in> <in>Tagset.cpp</in> @@ -51,11 +53,19 @@ <executablePath>build/fsa/test_dict</executablePath> </makeTool> </makefileType> - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> + <folder path="1"> <ccTool> <incDir> <pElem>fsa</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </folder> + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> </incDir> </ccTool> </item> @@ -80,86 +90,45 @@ </incDir> </ccTool> </item> + <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0"> + </item> <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> + <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0"> + </item> <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - <pElem>/usr/include/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/ext</pElem> - <pElem>/usr/include/c++/4.8</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/debug</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> - <pElem>/usr/include/c++/4.8/backward</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - <pElem>/usr/include/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/ext</pElem> - <pElem>/usr/include/c++/4.8</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> - <pElem>/usr/include/c++/4.8/debug</pElem> - <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> - <pElem>fsa</pElem> - <pElem>/usr/include/c++/4.8/backward</pElem> - <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> - <item path="morfeusz/encoding/CharsetConverter.cpp" + <item path="morfeusz/charset/CharsetConverter.cpp" ex="false" tool="1" - flavor2="0"> + flavor2="8"> + <ccTool> + </ccTool> </item> - <item path="morfeusz/encoding/CharsetConverter.hpp" - ex="false" - tool="3" - flavor2="0"> + <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0"> </item> <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> <ccTool> - <incDir> - <pElem>morfeusz</pElem> - </incDir> </ccTool> </item> <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> <ccTool> - <incDir> - <pElem>fsa</pElem> - <pElem>build/morfeusz</pElem> - </incDir> </ccTool> </item> </conf> -- libgit2 0.22.2