From f1e52ff44027610390237bd862449c85c4a044cc Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Sun, 16 Mar 2014 23:39:11 +0000 Subject: [PATCH] poprawienie czasu działania, przebudowanie analizatora tak, by nie powielać kodu w generatorze, poprawienie rozpoznawania pierwszego segmentu w grafie fleksyjnym --- CMakeLists.txt | 4 ++-- fsabuilder/morfeuszbuilder/fsa/common.py | 24 ++++++++++++------------ fsabuilder/morfeuszbuilder/fsa/convertinput.py | 4 ++-- fsabuilder/morfeuszbuilder/fsa/encode.py | 101 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------- fsabuilder/morfeuszbuilder/tagset/segtypes.py | 14 +++++++------- input/dodatki.tab | 10 ++++++++++ input/segmenty.dat | 1 + input/segmenty1.dat | 19 +++++++++++++++++++ morfeusz/EncodedInterpretation.hpp | 1 - morfeusz/Environment.cpp | 15 +++++++++------ morfeusz/Environment.hpp | 1 + morfeusz/FlexionGraph.cpp | 30 +++++++++++++++++++++++++----- morfeusz/InterpretedChunksDecoder.hpp | 167 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------- morfeusz/InterpsGroup.hpp | 34 ++++++++++++++++++---------------- morfeusz/Morfeusz.cpp | 22 ++++++++++------------ morfeusz/MorphDeserializer.cpp | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------------- nbproject/configurations.xml | 24 +++++++++++++++++++++--- 17 files changed, 411 insertions(+), 205 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index b495d78..6a6a3f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project (Morfeusz) set (Morfeusz_VERSION_MAJOR 2) set (Morfeusz_VERSION_MINOR 0) set (Morfeusz_VERSION_PATCH 0) -set (CMAKE_BUILD_TYPE "Debug") +set (CMAKE_BUILD_TYPE "Release") enable_testing() @@ -47,7 +47,7 @@ endif () # SEGMENT_RULES_FILE if ("${SEGMENT_RULES_FILE}" STREQUAL "") - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat") + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat") endif () message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules") diff --git a/fsabuilder/morfeuszbuilder/fsa/common.py b/fsabuilder/morfeuszbuilder/fsa/common.py index 5af6cba..d9bd84d 100644 --- a/fsabuilder/morfeuszbuilder/fsa/common.py +++ b/fsabuilder/morfeuszbuilder/fsa/common.py @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object): self.suffixToAdd = bestEncodedForm.suffixToAdd self.prefixToAdd = targetWord[:bestPrefixLength] -class Interpretation(object): +class Interpretation4Analyzer(object): def __init__(self, orth, base, tagnum, namenum, typenum): - self.lemma = EncodedForm(orth, base) + self.encodedForm = EncodedForm(orth, base) self.tagnum = tagnum self.namenum = namenum self.typenum = typenum def getSortKey(self): return ( - self.lemma.cutLength, - tuple(self.lemma.suffixToAdd), - tuple(self.lemma.casePattern), + self.encodedForm.cutLength, + tuple(self.encodedForm.suffixToAdd), + tuple(self.encodedForm.casePattern), self.tagnum, self.namenum) def __eq__(self, other): - if isinstance(other, Interpretation): + if isinstance(other, Interpretation4Analyzer): return self.getSortKey() == other.getSortKey() else: return False @@ -68,8 +68,8 @@ class Interpretation(object): class Interpretation4Generator(object): def __init__(self, orth, base, tagnum, namenum, typenum): - self.lemma = base - self.orth = EncodedFormWithPrefix(base, orth) + self.encodedForm = base + self.encodedForm = EncodedFormWithPrefix(base, orth) self.tagnum = tagnum self.namenum = namenum self.typenum = typenum @@ -77,9 +77,9 @@ class Interpretation4Generator(object): def getSortKey(self): return ( self.tagnum, - self.orth.cutLength, - tuple(self.orth.suffixToAdd), -# tuple(self.lemma.casePattern), + self.encodedForm.cutLength, + tuple(self.encodedForm.suffixToAdd), +# tuple(self.encodedForm.casePattern), self.namenum) def __eq__(self, other): @@ -92,7 +92,7 @@ class Interpretation4Generator(object): return hash(self.getSortKey()) def __unicode__(self): - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum) def __repr__(self): return unicode(self) diff --git a/fsabuilder/morfeuszbuilder/fsa/convertinput.py b/fsabuilder/morfeuszbuilder/fsa/convertinput.py index b7e0f88..60dcb35 100644 --- a/fsabuilder/morfeuszbuilder/fsa/convertinput.py +++ b/fsabuilder/morfeuszbuilder/fsa/convertinput.py @@ -4,7 +4,7 @@ Created on Oct 23, 2013 @author: mlenart ''' import logging -from common import Interpretation +from common import Interpretation4Analyzer from morfeuszbuilder.fsa.common import Interpretation4Generator def _mergeEntries(inputLines): @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object): tagnum = int(tagnum) namenum = int(namenum) typenum = int(typenum) - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum)) + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum)) def convert(self, inputLines): return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines)))) diff --git a/fsabuilder/morfeuszbuilder/fsa/encode.py b/fsabuilder/morfeuszbuilder/fsa/encode.py index 97db90b..566a67a 100644 --- a/fsabuilder/morfeuszbuilder/fsa/encode.py +++ b/fsabuilder/morfeuszbuilder/fsa/encode.py @@ -5,6 +5,7 @@ Created on Oct 23, 2013 ''' import logging +from morfeuszbuilder.utils import serializationUtils class Encoder(object): ''' @@ -96,6 +97,54 @@ class Encoder(object): def _encodeNameNum(self, namenum): assert namenum < 256 and namenum >= 0 return bytearray([namenum]) + + def _groupInterpsByType(self, interpsList): + res = {} + for interp in interpsList: + res.setdefault(interp.typenum, []) + res[interp.typenum].append(interp) + return res + + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix): + res = bytearray() + res.extend(self._encodeTypeNum(typenum)) + + encodedInterpsList = bytearray() + for interp in sorted(interpsList, key=lambda i: i.getSortKey()): + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) + encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) + + res.extend(serializationUtils.htons(len(encodedInterpsList))) + res.extend(encodedInterpsList) + return res + + def _doEncodeData(self, interpsList, withCasePattern, withPrefix): + + assert type(interpsList) == frozenset + + segnum2Interps = self._groupInterpsByType(interpsList) + + + res = bytearray() + firstByte = len(segnum2Interps) + assert firstByte < 256 + assert firstByte > 0 + res.append(firstByte) + + for typenum, interpsList in segnum2Interps.iteritems(): + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix)) + + +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum)) +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix)) +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum)) +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum)) + del interpsList +# res.extend(serializationUtils.htons(len(encodedInterpsList))) +# res.extend(encodedInterpsList) + return res class MorphEncoder(Encoder): @@ -106,19 +155,20 @@ class MorphEncoder(Encoder): self.LEMMA_MIXED_CASE = 2 def encodeData(self, interpsList): - res = bytearray() - firstByte = len(interpsList) - assert firstByte < 256 - assert firstByte > 0 - res.append(firstByte) - assert type(interpsList) == frozenset - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): - res.extend(self._encodeTypeNum(interp.typenum)) - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False)) - res.extend(self._encodeTagNum(interp.tagnum)) - res.extend(self._encodeNameNum(interp.namenum)) - del interpsList - return res + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False) +# res = bytearray() +# firstByte = len(interpsList) +# assert firstByte < 256 +# assert firstByte > 0 +# res.append(firstByte) +# assert type(interpsList) == frozenset +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): +# res.extend(self._encodeTypeNum(interp.typenum)) +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False)) +# res.extend(self._encodeTagNum(interp.tagnum)) +# res.extend(self._encodeNameNum(interp.namenum)) +# del interpsList +# return res class Encoder4Generator(Encoder): @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder): super(Encoder4Generator, self).__init__(encoding) def encodeData(self, interpsList): - res = bytearray() - firstByte = len(interpsList) - assert firstByte < 256 - assert firstByte > 0 - res.append(firstByte) - assert type(interpsList) == frozenset - for interp in sorted(interpsList, key=lambda i: i.getSortKey()): - res.extend(self._encodeTypeNum(interp.typenum)) - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True)) - res.extend(self._encodeTagNum(interp.tagnum)) - res.extend(self._encodeNameNum(interp.namenum)) - return res + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True) +# res = bytearray() +# firstByte = len(interpsList) +# assert firstByte < 256 +# assert firstByte > 0 +# res.append(firstByte) +# assert type(interpsList) == frozenset +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()): +# res.extend(self._encodeTypeNum(interp.typenum)) +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True)) +# res.extend(self._encodeTagNum(interp.tagnum)) +# res.extend(self._encodeNameNum(interp.namenum)) +# return res # # def decodeData(self, data): # diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py index b48ef56..9bfd32e 100644 --- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py +++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py @@ -106,7 +106,7 @@ class Segtypes(object): lineNum, re.match(r'[a-z_]+', segtype)) self._validate( - u'Pattern must contain lemma and part-of-speech fields', + u'Pattern must contain encodedForm and part-of-speech fields', lineNum, re.match(r'.+\:[a-z_]+', pattern, re.U)) @@ -146,13 +146,13 @@ class Segtypes(object): # index lexemes for p in self.patternsList: - if p.lemma: + if p.encodedForm: for tag in self.tagset.getAllTags(): tagnum = self.tagset.getTagnum4Tag(tag) - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum: - segnum = p.tryToMatch(p.lemma, tag) + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum: + segnum = p.tryToMatch(p.encodedForm, tag) if segnum != -1: - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum # logging.info('indexing segment type numbers - done') # self._debugSegnums() @@ -171,7 +171,7 @@ class Segtypes(object): class SegtypePattern(object): def __init__(self, lemma, pattern, segnum): - self.lemma = lemma + self.encodedForm = lemma self.pattern = pattern self.segnum = segnum @@ -181,7 +181,7 @@ class SegtypePattern(object): patterns2Match = [] patterns2Match.append(self.pattern.replace('%', '.*')) patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) - if (self.lemma is None or self.lemma == lemma) \ + if (self.encodedForm is None or self.encodedForm == lemma) \ and any([re.match(p, tag) for p in patterns2Match]): return self.segnum else: diff --git a/input/dodatki.tab b/input/dodatki.tab index 2905309..f2ce635 100644 --- a/input/dodatki.tab +++ b/input/dodatki.tab @@ -1,3 +1,13 @@ +0 0 dig +1 1 dig +2 2 dig +3 3 dig +4 4 dig +5 5 dig +6 6 dig +7 7 dig +8 8 dig +9 9 dig ń on ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep by by qub naj naj diff --git a/input/segmenty.dat b/input/segmenty.dat index 4c4794a..d8f28fe 100644 --- a/input/segmenty.dat +++ b/input/segmenty.dat @@ -142,6 +142,7 @@ samodz dywiz adj # Stopień najwyższy: # np. „naj·zieleńszy”, „naj·mądrzej” moze_interp( naj> adj_sup ) +moze_interp( nie> naj> adj_sup ) # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj moze_interp( praet_sg dywiz li) moze_interp( praet_pl dywiz li) diff --git a/input/segmenty1.dat b/input/segmenty1.dat index 8c23336..40eeee1 100644 --- a/input/segmenty1.dat +++ b/input/segmenty1.dat @@ -52,11 +52,14 @@ naj naj nie nie prefs prefs prefv prefv +prefa prefa dig dig adja adja adj adj:%:pos adj_sup adj:%:sup adj_sup adv:sup +adj_com adj:%:com +adj_com adj:%:com negat ger:%:neg negat pact:%:neg negat ppas:%:neg @@ -69,6 +72,22 @@ interp interp aglsg aglt:sg:% aglpl aglt:pl:% samodz % +praet_fin praet:% +praet_fin fin:% +li li:qub:% +nomina subst:% +nomina ger:% +nomina depr:% +adjectiva adj:% +adjectiva adv:% +adjectiva ppas:% +adjectiva pact:% +verba_imperf praet:%:imperf +verba_imperf fin:%:imperf +verba_imperf inf:imperf +verba_imperf imps:imperf +verba_imperf impt:imperf + [lexemes] z_aglt aby:comp diff --git a/morfeusz/EncodedInterpretation.hpp b/morfeusz/EncodedInterpretation.hpp index 984ad44..98724bb 100644 --- a/morfeusz/EncodedInterpretation.hpp +++ b/morfeusz/EncodedInterpretation.hpp @@ -28,7 +28,6 @@ struct EncodedForm { */ struct EncodedInterpretation { EncodedForm value; - unsigned char type; int tag; int nameClassifier; }; diff --git a/morfeusz/Environment.cpp b/morfeusz/Environment.cpp index dc67241..4eca606 100644 --- a/morfeusz/Environment.cpp +++ b/morfeusz/Environment.cpp @@ -13,10 +13,12 @@ //class InterpretedChunksDecoder4Analyzer; //class InterpretedChunksDecoder4Generator; -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { - static Deserializer < vector < InterpsGroup > > *deserializer +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) { + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer = new MorphDeserializer(); - return deserializer; + static Deserializer < vector < InterpsGroup > > *generatorDeserializer + = new MorphDeserializer(); + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer); } static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) { @@ -48,14 +50,15 @@ Environment::Environment( caseConverter(), tagset(fsaFileStartPtr), fsaFileStartPtr(fsaFileStartPtr), - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())), + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)), isFromFile(false), chunksDecoder( processorType == ANALYZER ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)) + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), + processorType(processorType) { } @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) { delete this->fsaFileStartPtr; } this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str()); - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer()); + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType)); this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr); this->isFromFile = true; } diff --git a/morfeusz/Environment.hpp b/morfeusz/Environment.hpp index 5a3f6a3..3158221 100644 --- a/morfeusz/Environment.hpp +++ b/morfeusz/Environment.hpp @@ -64,6 +64,7 @@ private: bool isFromFile; const InterpretedChunksDecoder* chunksDecoder; + MorfeuszProcessorType processorType; const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; }; diff --git a/morfeusz/FlexionGraph.cpp b/morfeusz/FlexionGraph.cpp index 50063e6..6309035 100644 --- a/morfeusz/FlexionGraph.cpp +++ b/morfeusz/FlexionGraph.cpp @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) { this->graph.push_back(vector<Edge>()); this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); } -// cerr << string(e.chunk.chunkStartPtr) << endl; assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); this->graph[0].push_back(e); } @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { this->graph[startNode].push_back(e); } +static inline bool chunkIsAtFront( + const InterpretedChunk& chunk, + const std::vector<InterpretedChunk>& path) { + unsigned int i; + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) { + } + assert(!path[i].orthWasShifted); + return &chunk == &(path[i]); +} + +static inline bool chunkIsAtBack( + const InterpretedChunk& chunk, + const std::vector<InterpretedChunk>& path) { + return &chunk == &(path.back()); +} + +static inline bool chunkIsTheOnlyOne( + const InterpretedChunk& chunk, + const std::vector<InterpretedChunk>& path) { + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); +} + void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { // debugPath(path); // debugGraph(this->graph); for (unsigned int i = 0; i < path.size(); i++) { const InterpretedChunk& chunk = path[i]; if (!chunk.orthWasShifted) { - if (&chunk == &(path.front()) - && &chunk == &(path.back())) { + if (chunkIsTheOnlyOne(chunk, path)) { Edge e = {chunk, UINT_MAX}; this->addStartEdge(e); } - else if (&chunk == &(path.front())) { + else if (chunkIsAtFront(chunk, path)) { Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()}; this->addStartEdge(e); } - else if (&chunk == &(path.back())) { + else if (chunkIsAtBack(chunk, path)) { Edge e = {chunk, UINT_MAX}; this->addMiddleEdge((unsigned int) this->graph.size(), e); } diff --git a/morfeusz/InterpretedChunksDecoder.hpp b/morfeusz/InterpretedChunksDecoder.hpp index ec2b4b0..d0a3d52 100644 --- a/morfeusz/InterpretedChunksDecoder.hpp +++ b/morfeusz/InterpretedChunksDecoder.hpp @@ -18,6 +18,10 @@ #include "charset/CaseConverter.hpp" #include "Environment.hpp" +const uint8_t LEMMA_ONLY_LOWER = 0; +const uint8_t LEMMA_UPPER_PREFIX = 1; +const uint8_t LEMMA_MIXED_CASE = 2; + class InterpretedChunksDecoder { public: @@ -30,22 +34,12 @@ public: unsigned int endNode, const InterpretedChunk& interpretedChunk, std::vector<MorphInterpretation>& out) const = 0; - - virtual ~InterpretedChunksDecoder() {} -protected: - - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); - decodeForm( - prefixChunk.lowercaseCodepoints, - prefixChunk.interpsGroup.interps[0].value, - decodedForm); - } + virtual ~InterpretedChunksDecoder() { } - + +protected: + virtual void decodeForm( const std::vector<uint32_t>& orth, const EncodedForm& form, @@ -55,9 +49,10 @@ protected: }; class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { - public: - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {} + + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { + } void decode( unsigned int startNode, @@ -65,22 +60,12 @@ public: const InterpretedChunk& interpretedChunk, std::vector<MorphInterpretation>& out) const { string orth; - string lemma; - convertPrefixes(interpretedChunk, orth, lemma); + string lemmaPrefix; + convertPrefixes(interpretedChunk, orth, lemmaPrefix); orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; - decodeForm( - interpretedChunk.lowercaseCodepoints, - ei.value, - lemma); - out.push_back(MorphInterpretation( - startNode, endNode, - orth, lemma, - ei.tag, - ei.nameClassifier, - env.getTagset(), - env.getCharsetConverter())); + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr)); } } @@ -104,36 +89,116 @@ protected: env.getCharsetConverter().append(cp, res); } } + +private: + + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const { + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); + const unsigned char* ptr = prefixChunk.interpsGroup.ptr; + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr); + decodedForm += mi.getLemma(); + } + } + + MorphInterpretation decodeMorphInterpretation( + unsigned int startNode, unsigned int endNode, + const string& orth, + const string& lemmaPrefix, + const InterpretedChunk& chunk, + const unsigned char*& ptr) const { + string lemma = lemmaPrefix; + EncodedInterpretation ei = this->decodeInterp(ptr); + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); + return MorphInterpretation( + startNode, endNode, + orth, lemma, + ei.tag, + ei.nameClassifier, + env.getTagset(), + env.getCharsetConverter()); + } + + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const { + lemma.suffixToCut = *ptr; + ptr++; + lemma.suffixToAdd = (const char*) ptr; + ptr += strlen((const char*) ptr) + 1; + assert(lemma.casePattern.size() == 0); + // lemma.casePattern.resize(MAX_WORD_SIZE, false); + uint8_t casePatternType = *ptr; + ptr++; + uint8_t prefixLength; + uint8_t patternLength; + switch (casePatternType) { + case LEMMA_ONLY_LOWER: + break; + case LEMMA_UPPER_PREFIX: + prefixLength = *ptr; + ptr++; + for (unsigned int i = 0; i < prefixLength; i++) { + // lemma.casePattern[i] = true; + lemma.casePattern.push_back(true); + } + // lemma.casePattern.resize(prefixLength, true); + break; + case LEMMA_MIXED_CASE: + patternLength = *ptr; + ptr++; + for (unsigned int i = 0; i < patternLength; i++) { + uint8_t idx = *ptr; + ptr++; + // lemma.casePattern[idx] = true; + lemma.casePattern.resize(idx + 1, false); + lemma.casePattern[idx] = true; + } + break; + } + } + + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const { + EncodedInterpretation interp; + decodeLemma(ptr, interp.value); + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); + ptr += 2; + interp.nameClassifier = *ptr; + ptr++; + return interp; + } }; class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { - public: - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {} + + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { + } void decode( unsigned int startNode, unsigned int endNode, const InterpretedChunk& interpretedChunk, std::vector<MorphInterpretation>& out) const { - string orth; - string lemma; - convertPrefixes(interpretedChunk, lemma, orth); - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; - decodeForm( - interpretedChunk.originalCodepoints, - ei.value, - orth); - out.push_back(MorphInterpretation( - startNode, endNode, - orth, lemma, - ei.tag, - ei.nameClassifier, - env.getTagset(), - env.getCharsetConverter())); - } + // string orth; + // string lemma; + // convertPrefixes(interpretedChunk, lemma, orth); + // size_t orthLength = orth.length(); + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; + // decodeForm( + // interpretedChunk.originalCodepoints, + // ei.value, + // orth); + // out.push_back(MorphInterpretation( + // startNode, endNode, + // orth, lemma, + // ei.tag, + // ei.nameClassifier, + // env.getTagset(), + // env.getCharsetConverter())); + // orth.erase(orthLength); + // } } private: diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp index 367e275..2227525 100644 --- a/morfeusz/InterpsGroup.hpp +++ b/morfeusz/InterpsGroup.hpp @@ -14,24 +14,26 @@ #include "MorphInterpretation.hpp" #include "Tagset.hpp" -class InterpsGroup { -public: - - InterpsGroup() { - - } - - explicit InterpsGroup(const unsigned char type) - : type(type) { - - } - - void addInterpretation(const EncodedInterpretation& interp) { - interps.push_back(interp); - } +struct InterpsGroup { +//public: +// +// InterpsGroup() { +// +// } +// +// explicit InterpsGroup(const unsigned char type) +// : type(type) { +// +// } +// +// void addInterpretation(const EncodedInterpretation& interp) { +// interps.push_back(interp); +// } unsigned char type; - std::vector<EncodedInterpretation> interps; + uint16_t size; + const unsigned char* ptr; +// std::vector<EncodedInterpretation> interps; }; #endif /* GROUPEDINTERPRETATIONS_HPP */ diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index a5b3bbd..45e7e01 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -82,7 +82,9 @@ void Morfeusz::processOneWord( FlexionGraph graph; const char* currInput = inputStart; const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); + if (!graph.empty()) { const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); int srcNode = startNodeNum; @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { from.prefixChunks.end()); to.prefixChunks.push_back(from); from.orthWasShifted = true; + to.chunkStartPtr = from.chunkStartPtr; } void Morfeusz::doProcessOneWord( @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord( SegrulesState segrulesState, vector<InterpretedChunk>& accum, FlexionGraph& graph) const { - cerr << "doAnalyzeOneWord " << inputData << endl; - bool endOfProcessing = inputData == inputEnd; +// cerr << "doAnalyzeOneWord " << inputData << endl; const char* currInput = inputData; - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd); - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); vector<uint32_t> originalCodepoints; vector<uint32_t> lowercaseCodepoints; StateType state = env.getFSA().getInitialState(); - while (!endOfProcessing) { - if (isEndOfWord(codepoint)) { - endOfProcessing = true; - } - cerr << "not end of word '" << string(currInput) << "'" << endl; + while (!isEndOfWord(codepoint)) { uint32_t lowerCP = env.getCaseConverter().toLower(codepoint); originalCodepoints.push_back(codepoint); lowercaseCodepoints.push_back(lowerCP); feedState(state, lowerCP, UTF8CharsetConverter()); codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); if (state.isAccepting()) { - cerr << "accepting" << endl; vector<InterpsGroup> val(state.getValue()); for (unsigned int i = 0; i < val.size(); i++) { InterpsGroup& ig = val[i]; @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord( it != newSegrulesStates.end(); ++it) { SegrulesState newSegrulesState = *it; +// if (newSegrulesState.shiftOrthFromPrevious) { +// +// } InterpretedChunk ic = { inputData, originalCodepoints, @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord( } accum.push_back(ic); if (isEndOfWord(codepoint)) { - cerr << "end of word inside " << currInput <<endl; if (newSegrulesState.accepting) graph.addPath(accum); } @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord( } } } + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); } - cerr << "end of word " << currInput << endl; inputData = currInput; } diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp index ba8452c..349f9fa 100644 --- a/morfeusz/MorphDeserializer.cpp +++ b/morfeusz/MorphDeserializer.cpp @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() { MorphDeserializer::~MorphDeserializer() { } -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { - // XXX uważać na poprawność danych - lemma.suffixToCut = *ptr; - ptr++; - lemma.suffixToAdd = (const char*) ptr; - ptr += strlen((const char*) ptr) + 1; - assert(lemma.casePattern.size() == 0); -// lemma.casePattern.resize(MAX_WORD_SIZE, false); - uint8_t casePatternType = *ptr; - ptr++; - uint8_t prefixLength; - uint8_t patternLength; - switch (casePatternType) { - case LEMMA_ONLY_LOWER: - break; - case LEMMA_UPPER_PREFIX: - prefixLength = *ptr; - ptr++; - for (unsigned int i = 0; i < prefixLength; i++) { -// lemma.casePattern[i] = true; - lemma.casePattern.push_back(true); - } -// lemma.casePattern.resize(prefixLength, true); - break; - case LEMMA_MIXED_CASE: - patternLength = *ptr; - ptr++; - for (unsigned int i = 0; i < patternLength; i++) { - uint8_t idx = *ptr; - ptr++; +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) { +// // XXX uważać na poprawność danych +// lemma.suffixToCut = *ptr; +// ptr++; +// lemma.suffixToAdd = (const char*) ptr; +// ptr += strlen((const char*) ptr) + 1; +// assert(lemma.casePattern.size() == 0); +//// lemma.casePattern.resize(MAX_WORD_SIZE, false); +// uint8_t casePatternType = *ptr; +// ptr++; +// uint8_t prefixLength; +// uint8_t patternLength; +// switch (casePatternType) { +// case LEMMA_ONLY_LOWER: +// break; +// case LEMMA_UPPER_PREFIX: +// prefixLength = *ptr; +// ptr++; +// for (unsigned int i = 0; i < prefixLength; i++) { +//// lemma.casePattern[i] = true; +// lemma.casePattern.push_back(true); +// } +//// lemma.casePattern.resize(prefixLength, true); +// break; +// case LEMMA_MIXED_CASE: +// patternLength = *ptr; +// ptr++; +// for (unsigned int i = 0; i < patternLength; i++) { +// uint8_t idx = *ptr; +// ptr++; +//// lemma.casePattern[idx] = true; +// lemma.casePattern.resize(idx + 1, false); // lemma.casePattern[idx] = true; - lemma.casePattern.resize(idx + 1, false); - lemma.casePattern[idx] = true; - } - break; - } -} - -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { - interp.type = *ptr; - ptr++; - deserializeLemma(ptr, interp.value); - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); - ptr += 2; - interp.nameClassifier = *ptr; - ptr++; -} +// } +// break; +// } +//} +// +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) { +// interp.type = *ptr; +// ptr++; +// deserializeLemma(ptr, interp.value); +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); +// ptr += 2; +// interp.nameClassifier = *ptr; +// ptr++; +//} long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { const unsigned char* currPtr = ptr; - uint8_t interpsNum = *ptr; - interps.clear(); - interps.reserve(interpsNum); + uint8_t interpTypesNum = *currPtr; currPtr++; - // FIXME - to jest do poprawy - map<int, InterpsGroup> results; - for (unsigned int i = 0; i < interpsNum; ++i) { - EncodedInterpretation interp; - deserializeInterp(currPtr, interp); - if (results.count(interp.type) == 0) { - results[interp.type] = InterpsGroup(interp.type); - } - results[interp.type].addInterpretation(interp); -// interps.push_back(interp); - } - map<int, InterpsGroup>::iterator it; - for (it = results.begin(); it != results.end(); ++it) { - interps.push_back((*it).second); + interps.clear(); + interps.reserve(interpTypesNum); + for (unsigned int i = 0; i < interpTypesNum; i++) { + InterpsGroup ig; + ig.type = *currPtr; + currPtr++; + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); + currPtr += 2; + ig.ptr = currPtr; + currPtr += ig.size; + interps.push_back(ig); } return currPtr - ptr; } + +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { +// const unsigned char* currPtr = ptr; +// uint8_t interpsNum = *ptr; +// interps.clear(); +// interps.reserve(interpsNum); +// currPtr++; +// // FIXME - to jest do poprawy +// map<int, InterpsGroup> results; +// for (unsigned int i = 0; i < interpsNum; ++i) { +// EncodedInterpretation interp; +// deserializeInterp(currPtr, interp); +// if (results.count(interp.type) == 0) { +// results[interp.type] = InterpsGroup(interp.type); +// } +// results[interp.type].addInterpretation(interp); +//// interps.push_back(interp); +// } +// map<int, InterpsGroup>::iterator it; +// for (it = results.begin(); it != results.end(); ++it) { +// interps.push_back((*it).second); +// } +// return currPtr - ptr; +//} diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index 6f9afea..98211d1 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -106,14 +106,20 @@ </makeTool> </makefileType> <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" ex="false" @@ -169,7 +175,7 @@ <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> + <ccTool> <incDir> <pElem>morfeusz</pElem> <pElem>morfeusz/build/morfeusz</pElem> @@ -180,7 +186,7 @@ </ccTool> </item> <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> + <ccTool> <incDir> <pElem>morfeusz</pElem> <pElem>morfeusz/build/morfeusz</pElem> @@ -273,7 +279,7 @@ <ccTool> <incDir> <pElem>morfeusz</pElem> - <pElem>/usr/lib/jvm/default-java/include</pElem> + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> </incDir> <preprocessorList> <Elem>libjmorfeusz_EXPORTS</Elem> @@ -408,18 +414,26 @@ </ccTool> </item> <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/CharsetConverter.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/charset/conversion_tables.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> @@ -508,8 +522,12 @@ ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="0"> -- libgit2 0.22.2