diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.py b/fsabuilder/morfeuszbuilder/fsa/serializer.py index 8a3610b..f9d4389 100644 --- a/fsabuilder/morfeuszbuilder/fsa/serializer.py +++ b/fsabuilder/morfeuszbuilder/fsa/serializer.py @@ -22,7 +22,7 @@ class Serializer(object): # get the Morfeusz file format version that is being encoded def getVersion(self): - return 12 + return 13 def serialize2CppFile(self, fname, generator, segmentationRulesData): res = [] diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py index 9bea7e8..9abe88c 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py @@ -5,12 +5,14 @@ Created on 20 lut 2014 ''' import logging from morfeuszbuilder.utils.serializationUtils import htons, htonl +from morfeuszbuilder.utils import serializationUtils class RulesManager(object): - def __init__(self, segtypes): + def __init__(self, segtypes, separatorsList): self.options2DFA = {} self.segtypes = segtypes + self.separatorsList = separatorsList self.defaultOptions = None def _options2Key(self, optionsMap): @@ -37,6 +39,7 @@ class RulesManager(object): def serialize(self): res = bytearray() + res.extend(self._serializeSeparatorsList()) dfasNum = len(self.options2DFA) assert dfasNum > 0 and dfasNum < 256 res.append(dfasNum) @@ -48,6 +51,13 @@ class RulesManager(object): logging.info('segmentation rules size: %s bytes', len(res)) return res + def _serializeSeparatorsList(self): + res = bytearray() + res.extend(serializationUtils.htons(len(self.separatorsList))) + for cp in sorted(self.separatorsList): + res.extend(serializationUtils.htonl(cp)) + return res + def _serializeOptionsMap(self, optionsMap): assert len(optionsMap) < 256 res = bytearray() diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 6472d48..d415384 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -3,7 +3,7 @@ from pyparsing import * ParserElement.enablePackrat() from morfeuszbuilder.tagset import segtypes from morfeuszbuilder.utils import configFile, exceptions -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars import codecs import re @@ -34,11 +34,22 @@ class RulesParser(object): def parse(self, filename): - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types']) + segtypesConfigFile = configFile.ConfigFile(filename, + [ + 'options', + 'combinations', + 'generator combinations', + 'tags', + 'lexemes', + 'segment types', + 'separator chars']) key2Defs = self._getKey2Defs(segtypesConfigFile) segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) + separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \ + if self.rulesType == RulesParser.PARSE4ANALYZER \ + else [] - res = rulesManager.RulesManager(segtypesHelper) + res = rulesManager.RulesManager(segtypesHelper, separatorsList) def2Key = {} for key, defs in key2Defs.iteritems(): diff --git a/input/segmenty.dat b/input/segmenty.dat index 28eba5b..6c6f271 100644 --- a/input/segmenty.dat +++ b/input/segmenty.dat @@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek ) # interpretacja znaków interpunkcyjnych # moze_interp(samodz interp) +[separator chars] +# , +44 + +# . +46 + +# ; +59 + [generator combinations] [segment types] diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index d326c17..69d3e69 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -62,7 +62,8 @@ void Morfeusz::processOneWord( const char*& inputStart, const char* inputEnd, int startNodeNum, - std::vector<MorphInterpretation>& results) const { + std::vector<MorphInterpretation>& results, + bool insideIgnHandler) const { while (inputStart != inputEnd && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { env.getCharsetConverter().next(inputStart, inputEnd); @@ -71,9 +72,9 @@ void Morfeusz::processOneWord( InflexionGraph graph; const char* currInput = inputStart; const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); - + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); - + if (!graph.empty()) { const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); int srcNode = startNodeNum; @@ -87,6 +88,12 @@ void Morfeusz::processOneWord( srcNode++; } } + else if (inputStart != inputEnd + && env.getProcessorType() == ANALYZER + && !insideIgnHandler) { + this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results); + // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); + } else if (inputStart != inputEnd) { this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); } @@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) { stringstream res; for (unsigned int i = 0; i < accum.size(); i++) { res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); -// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; } return res.str(); } @@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord( SegrulesState segrulesState, vector<InterpretedChunk>& accum, InflexionGraph& graph) const { -// if (this->options.debug) { -// cerr << "----------" << endl; -// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; -// } -// cerr << "doAnalyzeOneWord " << inputData << endl; + // if (this->options.debug) { + // cerr << "----------" << endl; + // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; + // } + // cerr << "doAnalyzeOneWord " << inputData << endl; const char* inputStart = inputData; const char* currInput = inputData; uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); @@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord( while (!isEndOfWord(codepoint)) { uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER - ? env.getCaseConverter().toLower(codepoint) - : codepoint; + ? env.getCaseConverter().toLower(codepoint) + : codepoint; originalCodepoints.push_back(codepoint); normalizedCodepoints.push_back(normalizedCodepoint); feedState(state, normalizedCodepoint, UTF8CharsetConverter()); @@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord( throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); } homonymId = string(currInput + 1, inputEnd); -// cerr << "homonym " << homonymId << endl; + // cerr << "homonym " << homonymId << endl; currInput = inputEnd; codepoint = 0x00; } @@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord( if (this->options.debug) { cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; } -// cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; set<SegrulesState> newSegrulesStates; env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); if (this->options.debug && newSegrulesStates.empty()) { cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; } -// cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; + // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; for ( set<SegrulesState>::iterator it = newSegrulesStates.begin(); it != newSegrulesStates.end(); @@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord( doShiftOrth(accum.back(), ic); } accum.push_back(ic); - if (isEndOfWord(codepoint) + if (isEndOfWord(codepoint) && newSegrulesState.accepting) { if (this->options.debug) { cerr << "ACCEPTING " << debugAccum(accum) << endl; @@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord( graph.addPath(accum, newSegrulesState.weak); } else if (!isEndOfWord(codepoint)) { -// cerr << "will process " << currInput << endl; + // cerr << "will process " << currInput << endl; const char* newCurrInput = currInput; doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); } @@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord( inputData = currInput; } +static inline bool isSeparator(uint32_t codepoint) { + return codepoint == 44; +} + +void Morfeusz::handleIgnChunk( + const Environment& env, + const char* inputStart, + const char* inputEnd, + int startNodeNum, + std::vector<MorphInterpretation>& results) const { + const char* currInput = inputStart; + const char* prevInput; + uint32_t codepoint; + bool separatorFound = false; + while (currInput != inputEnd) { + prevInput = currInput; + const char* nonSeparatorInputEnd = prevInput; + do { + codepoint = env.getCharsetConverter().next(currInput, inputEnd); + if (!isSeparator(codepoint)) { + nonSeparatorInputEnd = currInput; + } + } + while (currInput != inputEnd && !isSeparator(codepoint)); + + if (isSeparator(codepoint)) { + separatorFound = true; + if (nonSeparatorInputEnd != prevInput) { + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); + this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true); + startNode = results.empty() ? startNodeNum : results.back().getEndNode(); + this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true); + } + else { + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); + this->processOneWord(env, prevInput, currInput, startNode, results, true); + } + } + } + + // currInput == inputEnd + if (!isSeparator(codepoint)) { + if (separatorFound) { + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); + this->processOneWord(env, prevInput, inputEnd, startNode, results, true); + } + else { + this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results); + } + } +} + void Morfeusz::appendIgnotiumToResults( const Environment& env, const string& word, @@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results } // XXX - someday it should be improved + void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const { vector<MorphInterpretation> partRes; this->generate(lemma, partRes); diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index a62e167..77bc034 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -157,7 +157,8 @@ private: const char*& inputData, const char* inputEnd, int startNodeNum, - std::vector<MorphInterpretation>& result) const; + std::vector<MorphInterpretation>& result, + bool insideIgnHandler=false) const; void doProcessOneWord( const Environment& env, @@ -166,6 +167,13 @@ private: SegrulesState segrulesState, std::vector<InterpretedChunk>& accum, InflexionGraph& graph) const; + + void handleIgnChunk( + const Environment& env, + const char* inputStart, + const char* inputEnd, + int startNodeNum, + std::vector<MorphInterpretation>& results) const; void appendIgnotiumToResults( const Environment& env, diff --git a/morfeusz/fsa/const.cpp b/morfeusz/fsa/const.cpp index 8357881..24512ed 100644 --- a/morfeusz/fsa/const.cpp +++ b/morfeusz/fsa/const.cpp @@ -2,7 +2,7 @@ #include "const.hpp" extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; -extern const uint8_t VERSION_NUM = 12; +extern const uint8_t VERSION_NUM = 13; extern const unsigned int VERSION_NUM_OFFSET = 4; extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp index 47450b8..86755e7 100644 --- a/morfeusz/segrules/segrules.cpp +++ b/morfeusz/segrules/segrules.cpp @@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) { return res; } +static inline void ignoreSeparatorsList(const unsigned char*& ptr) { + uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); + ptr += 2; + ptr += 4 * listSize; +} + static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { const unsigned char* additionalDataPtr = ptr + FSA_DATA_OFFSET + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); - return additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; + const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; + ignoreSeparatorsList(res); + return res; } static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {