prawie dorobiona kompletna obsługa ignów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@147 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

prawie dorobiona kompletna obsługa ignów
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@147 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent 7351ce74
Showing 8 changed files with 131 additions and 24 deletions
fsabuilder/morfeuszbuilder/fsa/serializer.py
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
input/segmenty.dat
morfeusz/Morfeusz.cpp
morfeusz/Morfeusz.hpp
morfeusz/fsa/const.cpp
morfeusz/segrules/segrules.cpp
@@ -22,7 +22,7 @@ class Serializer(object):
     # get the Morfeusz file format version that is being encoded
     def getVersion(self):
-        return 12
+        return 13
     def serialize2CppFile(self, fname, generator, segmentationRulesData):
         res = []
@@ -5,12 +5,14 @@ Created on 20 lut 2014
 '''
 import logging
 from morfeuszbuilder.utils.serializationUtils import htons, htonl
+from morfeuszbuilder.utils import serializationUtils
 class RulesManager(object):
-    def __init__(self, segtypes):
+    def __init__(self, segtypes, separatorsList):
         self.options2DFA = {}
         self.segtypes = segtypes
+        self.separatorsList = separatorsList
         self.defaultOptions = None
     def _options2Key(self, optionsMap):
@@ -37,6 +39,7 @@ class RulesManager(object):
     def serialize(self):
         res = bytearray()
+        res.extend(self._serializeSeparatorsList())
         dfasNum = len(self.options2DFA)
         assert dfasNum > 0 and dfasNum < 256
         res.append(dfasNum)
@@ -48,6 +51,13 @@ class RulesManager(object):
         logging.info('segmentation rules size: %s bytes', len(res))
         return res
+    def _serializeSeparatorsList(self):  # encodes self.separatorsList as a count followed by its entries
+        res = bytearray()
+        res.extend(serializationUtils.htons(len(self.separatorsList)))  # entry count first, encoded with htons
+        for cp in sorted(self.separatorsList):  # entries are written in ascending order
+            res.extend(serializationUtils.htonl(cp))  # each entry is encoded with htonl
+        return res
+    
     def _serializeOptionsMap(self, optionsMap):
         assert len(optionsMap) < 256
         res = bytearray()
@@ -3,7 +3,7 @@ from pyparsing import *
 ParserElement.enablePackrat()
 from morfeuszbuilder.tagset import segtypes
 from morfeuszbuilder.utils import configFile, exceptions
-from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString
+from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
 import codecs
 import re
@@ -34,11 +34,22 @@ class RulesParser(object):
     def parse(self, filename):
-        segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types'])
+        segtypesConfigFile = configFile.ConfigFile(filename, 
+                                                   [
+                                                    'options', 
+                                                    'combinations', 
+                                                    'generator combinations', 
+                                                    'tags', 
+                                                    'lexemes', 
+                                                    'segment types', 
+                                                    'separator chars'])
         key2Defs = self._getKey2Defs(segtypesConfigFile)
         segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
+        separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \
+            if self.rulesType == RulesParser.PARSE4ANALYZER \
+            else []
-        res = rulesManager.RulesManager(segtypesHelper)
+        res = rulesManager.RulesManager(segtypesHelper, separatorsList)
         def2Key = {}
         for key, defs in key2Defs.iteritems():
@@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek )
 # interpretacja znaków interpunkcyjnych
 # moze_interp(samodz interp)
+[separator chars]
+# ,
+44
+
+# .
+46
+
+# ;
+59
+
 [generator combinations]
 [segment types]
@@ -62,7 +62,8 @@ void Morfeusz::processOneWord(
         const char*& inputStart,
         const char* inputEnd,
         int startNodeNum,
-        std::vector<MorphInterpretation>& results) const {
+        std::vector<MorphInterpretation>& results,
+        bool insideIgnHandler) const {
     while (inputStart != inputEnd
             && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
         env.getCharsetConverter().next(inputStart, inputEnd);
@@ -71,9 +72,9 @@ void Morfeusz::processOneWord(
     InflexionGraph graph;
     const char* currInput = inputStart;
     const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
-    
+
     doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
-    
+
     if (!graph.empty()) {
         const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
         int srcNode = startNodeNum;
@@ -87,6 +88,12 @@ void Morfeusz::processOneWord(
             srcNode++;
         }
     }
+    else if (inputStart != inputEnd 
+            && env.getProcessorType() == ANALYZER 
+            && !insideIgnHandler) {
+        this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
+        //        this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
+    }
     else if (inputStart != inputEnd) {
         this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
     }
@@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) {
     stringstream res;
     for (unsigned int i = 0; i < accum.size(); i++) {
         res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
-//        res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
+        //        res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
     }
     return res.str();
 }
@@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord(
         SegrulesState segrulesState,
         vector<InterpretedChunk>& accum,
         InflexionGraph& graph) const {
-//    if (this->options.debug) {
-//        cerr << "----------" << endl;
-//        cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
-//    }
-//    cerr << "doAnalyzeOneWord " << inputData << endl;
+    //    if (this->options.debug) {
+    //        cerr << "----------" << endl;
+    //        cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
+    //    }
+    //    cerr << "doAnalyzeOneWord " << inputData << endl;
     const char* inputStart = inputData;
     const char* currInput = inputData;
     uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
@@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord(
     while (!isEndOfWord(codepoint)) {
         uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
-            ? env.getCaseConverter().toLower(codepoint)
-            : codepoint;
+                ? env.getCaseConverter().toLower(codepoint)
+                : codepoint;
         originalCodepoints.push_back(codepoint);
         normalizedCodepoints.push_back(normalizedCodepoint);
         feedState(state, normalizedCodepoint, UTF8CharsetConverter());
@@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord(
                 throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
             }
             homonymId = string(currInput + 1, inputEnd);
-//            cerr << "homonym " << homonymId << endl;
+            //            cerr << "homonym " << homonymId << endl;
             currInput = inputEnd;
             codepoint = 0x00;
         }
@@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord(
                 if (this->options.debug) {
                     cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
                 }
-//                cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
+                //                cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
                 set<SegrulesState> newSegrulesStates;
                 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
                 if (this->options.debug && newSegrulesStates.empty()) {
                     cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
                 }
-//                cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
+                //                cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
                 for (
                         set<SegrulesState>::iterator it = newSegrulesStates.begin();
                         it != newSegrulesStates.end();
@@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord(
                         doShiftOrth(accum.back(), ic);
                     }
                     accum.push_back(ic);
-                    if (isEndOfWord(codepoint) 
+                    if (isEndOfWord(codepoint)
                             && newSegrulesState.accepting) {
                         if (this->options.debug) {
                             cerr << "ACCEPTING " << debugAccum(accum) << endl;
@@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord(
                         graph.addPath(accum, newSegrulesState.weak);
                     }
                     else if (!isEndOfWord(codepoint)) {
-//                        cerr << "will process " << currInput << endl;
+                        //                        cerr << "will process " << currInput << endl;
                         const char* newCurrInput = currInput;
                         doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
                     }
@@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord(
     inputData = currInput;
 }
+static inline bool isSeparator(uint32_t codepoint) {  // true only for code point 44
+    return codepoint == 44;  // NOTE(review): the [separator chars] section added in this change also lists 46 and 59, which are not matched here - confirm whether the hard-coded value is a placeholder
+}
+
+void Morfeusz::handleIgnChunk(  // splits a chunk at separator code points and processes each piece separately
+        const Environment& env,
+        const char* inputStart,  // start of the chunk; the visible caller passes a chunk for which its graph stayed empty
+        const char* inputEnd,  // one past the end of the chunk
+        int startNodeNum,  // used as the start node for as long as results is empty
+        std::vector<MorphInterpretation>& results) const {  // interpretations are appended here
+    const char* currInput = inputStart;
+    const char* prevInput;  // start of the piece currently being scanned; assigned only inside the loop below
+    uint32_t codepoint;  // last code point read; assigned only inside the loop below
+    bool separatorFound = false;
+    while (currInput != inputEnd) {
+        prevInput = currInput;
+        const char* nonSeparatorInputEnd = prevInput;  // position just after the last non-separator code point read in this piece
+        do {
+            codepoint = env.getCharsetConverter().next(currInput, inputEnd);  // reads one code point; the loop relies on next() moving currInput forward
+            if (!isSeparator(codepoint)) {
+                nonSeparatorInputEnd = currInput;
+            }
+        }
+        while (currInput != inputEnd && !isSeparator(codepoint));  // stop at the first separator or at the end of the chunk
+
+        if (isSeparator(codepoint)) {
+            separatorFound = true;
+            if (nonSeparatorInputEnd != prevInput) {  // some text precedes the separator
+                int startNode = results.empty() ? startNodeNum : results.back().getEndNode();  // continue from the end node of the last result, if there is one
+                this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true);  // the text before the separator; insideIgnHandler=true keeps processOneWord from calling handleIgnChunk again
+                startNode = results.empty() ? startNodeNum : results.back().getEndNode();
+                this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true);  // the separator itself
+            }
+            else {  // the piece consists of the separator only
+                int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
+                this->processOneWord(env, prevInput, currInput, startNode, results, true);
+            }
+        }
+    }
+
+    // currInput == inputEnd
+    if (!isSeparator(codepoint)) {  // NOTE(review): codepoint (and prevInput below) are unassigned here if the loop never ran, i.e. inputStart == inputEnd - confirm callers never pass an empty range
+        if (separatorFound) {
+            int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
+            this->processOneWord(env, prevInput, inputEnd, startNode, results, true);  // the trailing piece after the last separator
+        }
+        else {  // no separator anywhere in the chunk: report the whole chunk as an ignotium
+            this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results);
+        }
+    }
+}
+
 void Morfeusz::appendIgnotiumToResults(
         const Environment& env,
         const string& word,
@@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results
 }
 // XXX - someday it should be improved
+
 void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const {
     vector<MorphInterpretation> partRes;
     this->generate(lemma, partRes);
@@ -157,7 +157,8 @@ private:
             const char*& inputData,
             const char* inputEnd,
             int startNodeNum,
-            std::vector<MorphInterpretation>& result) const;
+            std::vector<MorphInterpretation>& result,
+            bool insideIgnHandler=false) const;
     void doProcessOneWord(
             const Environment& env,
@@ -166,6 +167,13 @@ private:
             SegrulesState segrulesState,
             std::vector<InterpretedChunk>& accum,
             InflexionGraph& graph) const;
+    
+    void handleIgnChunk(
+        const Environment& env,
+        const char* inputStart,
+        const char* inputEnd,
+        int startNodeNum,
+        std::vector<MorphInterpretation>& results) const;
     void appendIgnotiumToResults(
             const Environment& env,
@@ -2,7 +2,7 @@
 #include "const.hpp"
 extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
-extern const uint8_t VERSION_NUM = 12;
+extern const uint8_t VERSION_NUM = 13;
 extern const unsigned int VERSION_NUM_OFFSET = 4;
 extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
@@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) {
     return res;
 }
+static inline void ignoreSeparatorsList(const unsigned char*& ptr) {  // moves ptr past the serialized separators list without using its entries
+    uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));  // entry count, converted with ntohs; NOTE(review): read through a reinterpret_cast of a byte pointer - confirm ptr is suitably aligned at this offset
+    ptr += 2;  // skip the 2-byte count
+    ptr += 4 * listSize;  // skip the entries, 4 bytes each
+}
+
 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) {
     const unsigned char* additionalDataPtr = ptr 
         + FSA_DATA_OFFSET 
         + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
-    return additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
+    const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;  // same position the removed line used to return
+    ignoreSeparatorsList(res);  // then step over the separators list; its entries are not read here
+    return res;
 }
 static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {