ciąg dalszy porządków

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

ciąg dalszy porządków
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent d51aa0dd
Showing 8 changed files with 90 additions and 59 deletions
fsabuilder/morfeuszbuilder/fsa/encode.py
fsabuilder/morfeuszbuilder/fsa/serializer.py
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
morfeusz/CMakeLists.txt
morfeusz/Morfeusz.cpp
morfeusz/Tagset.cpp
morfeusz/charset/charset_utils.hpp
morfeusz/fsa/cfsa1_impl.hpp
@@ -127,18 +127,15 @@ class Encoder(object):
             if isAnalyzer:
                 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
             else:
-                serializeString(interp.homonymId, encodedInterpsList)
-                serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
+                encodedInterpsList.extend(serializeString(interp.homonymId))
+                encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd))
             encodedInterpsList.append(interp.encodedForm.cutLength)
-            serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
+            encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd))
             if isAnalyzer:
                 encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
             encodedInterpsList.extend(htons(interp.tagnum))
             encodedInterpsList.append(interp.namenum)
             encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
-            
-            if interp.encodedForm.suffixToAdd == 'bc':
-                print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
  
         res.extend(htons(len(encodedInterpsList)))
         res.extend(encodedInterpsList)
@@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer):
         return self.useArrays and state.serializeAsArray
  
     def stateData2bytearray(self, state):
-        assert state.transitionsNum < 64
+#         assert state.transitionsNum < 64
         res = bytearray()
         firstByte = 0
         if state.isAccepting():
             firstByte |= self.ACCEPTING_FLAG
         if self.stateShouldBeAnArray(state):
             firstByte |= self.ARRAY_FLAG
-        firstByte |= state.transitionsNum
-        assert firstByte < 256 and firstByte > 0
-        res.append(firstByte)
+        if state.transitionsNum < 63:
+            firstByte |= state.transitionsNum
+            res.append(firstByte)
+        else:
+            firstByte |= 63
+            res.append(firstByte)
+            res.append(state.transitionsNum)
+        
         if state.isAccepting():
             res.extend(state.encodedData)
         return res
@@ -23,6 +23,8 @@ def htonl(n):
     res.append(n & 0x000000FF)
     return res
  
-def serializeString(string, out):
+def serializeString(string):
+    out = bytearray()
     out.extend(string.encode('utf8'))
     out.append(0)
+    return out
@@ -9,7 +9,7 @@ add_custom_command (
 )
 add_custom_command (
         OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
-        COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE
+        COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
         DEPENDS "${INPUT_DICTIONARY}"
         COMMENT "Building default dictionary C++ file"
 )
@@ -74,7 +74,7 @@ void Morfeusz::processOneWord(
         std::vector<MorphInterpretation>& results,
         bool insideIgnHandler) const {
     while (inputStart != inputEnd
-            && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
+            && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
         env.getCharsetConverter().next(inputStart, inputEnd);
     }
     vector<InterpretedChunk> accum;
@@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord(
  
     StateType state = env.getFSA().getInitialState();
  
-    while (!isEndOfWord(codepoint)) {
+    while (!isWhitespace(codepoint)) {
         uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
                 ? env.getCaseConverter().toLower(codepoint)
                 : codepoint;
@@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord(
                             doShiftOrth(accum.back(), ic);
                         }
                         accum.push_back(ic);
-                        if (isEndOfWord(codepoint)
+                        if (isWhitespace(codepoint)
                                 && newSegrulesState.accepting) {
                             if (this->options.debug) {
                                 cerr << "ACCEPTING " << debugAccum(accum) << endl;
                             }
                             graph.addPath(accum, newSegrulesState.weak);
                         }
-                        else if (!isEndOfWord(codepoint)) {
+                        else if (!isWhitespace(codepoint)) {
                             //                        cerr << "will process " << currInput << endl;
                             const char* newCurrInput = currInput;
                             doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
@@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord(
                 }
             }
         }
-        codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
+        codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
     }
     inputData = currInput;
 }
@@ -9,7 +9,7 @@
 using namespace std;
  
 Tagset::Tagset(const unsigned char* ptr) {
-    uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
+    uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
     const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
     readTags(currPtr, this->tags);
     readTags(currPtr, this->names);
@@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() {
     return res;
 }
  
-inline bool isEndOfWord(uint32_t codepoint) {
+inline bool isWhitespace(uint32_t codepoint) {
     static std::vector<uint32_t> whitespaces(initializeWhitespaces());
     return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint);
-    //    return whitespaces.count(codepoint);
 }
  
 template <class StateClass>
@@ -11,35 +11,57 @@
 #include <vector>
  
 #include "fsa.hpp"
+#include "../deserializationUtils.hpp"
  
 using namespace std;
  
-#pragma pack(push, 1)  /* push current alignment to stack */
+static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
+static const unsigned char CFSA1_ARRAY_FLAG = 64;
+static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63;
+
+static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;
+
+static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
  
 struct StateData2 {
-    unsigned transitionsNum: 6;
-    unsigned array : 1;
-    unsigned accepting : 1;
+    unsigned int transitionsNum;
+    bool isArray;
+    bool isAccepting;
 };
  
 struct TransitionData2 {
-    unsigned offsetSize : 2;
-    unsigned shortLabel : 6;
+    unsigned int offsetSize;
+    unsigned int shortLabel;
 };
  
+static inline StateData2 readStateData(const unsigned char*& ptr) {
+    StateData2 res;
+    unsigned char firstByte = readInt8(ptr);
+    res.isArray = firstByte & CFSA1_ARRAY_FLAG;
+    res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
+    res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
+    if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
+        res.transitionsNum = readInt8(ptr);
+    }
+    return res;
+}
  
-#pragma pack(pop)   /* restore original alignment from stack */
-
-static const unsigned int INITIAL_STATE_OFFSET = 257;
+static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
+    TransitionData2 res;
+    unsigned char firstByte = readInt8(ptr);
+    res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
+    res.shortLabel = firstByte >> 2;
+    return res;
+}
  
 template <class T>
 vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
-    return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET);
+    return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
 }
  
 template <class T>
 CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
-: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer),
+: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
 label2ShortLabel(initializeChar2PopularCharIdx(ptr)) {
 }
  
@@ -52,10 +74,12 @@ template <class T>
 void CompressedFSA1<T>::reallyDoProceed(
         const unsigned char* statePtr,
         State<T>& state) const {
-    const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr);
-    if (sd->accepting) {
+    const unsigned char* currPtr = statePtr;
+    const StateData2 sd = readStateData(currPtr);
+    if (sd.isAccepting) {
         T object;
-        long size = this->deserializer.deserialize(statePtr + 1, object);
+        long size = this->deserializer.deserialize(currPtr, object);
+        //        long size = this->deserializer.deserialize(statePtr + 1, object);
         state.setNext(statePtr - this->initialStatePtr, object, size);
     }
     else {
@@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList(
         const unsigned char* ptr,
         const unsigned int transitionsNum,
         State<T>& state) const {
-    register const unsigned char* currPtr = ptr;
+    const unsigned char* currPtr = ptr;
     //    TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset);
     bool found = false;
     TransitionData2 td;
     for (unsigned int i = 0; i < transitionsNum; i++) {
-        //        const_cast<Counter*>(&counter)->increment(1);
-        td = *(reinterpret_cast<const TransitionData2*>(currPtr));
+        td = readTransitionFirstByte(currPtr);
         if (td.shortLabel == shortLabel) {
             if (shortLabel == 0) {
-                currPtr++;
-                char label = (char) *currPtr;
+                char label = static_cast<char>(readInt8(currPtr));
                 if (label == c) {
                     found = true;
                     break;
                 }
                 else {
-                    currPtr += td.offsetSize + 1;
+                    currPtr += td.offsetSize;
                 }
-            } else {
+            }
+            else {
                 found = true;
                 break;
             }
-        } 
+        }
         else {
             if (td.shortLabel == 0) {
                 currPtr++;
             }
-            currPtr += td.offsetSize + 1;
+            currPtr += td.offsetSize;
         }
     }
     if (!found) {
         state.setNextAsSink();
-    } 
+    }
     else {
-        currPtr++;
+        uint32_t offset;
         switch (td.offsetSize) {
             case 0:
+                offset = 0;
                 break;
             case 1:
-                currPtr += *currPtr + 1;
+                offset = readInt8(currPtr);
                 break;
             case 2:
-                currPtr += ntohs(*((const uint16_t*) currPtr)) + 2;
+                offset = readInt16(currPtr);
                 break;
             case 3:
-                currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3;
-                break;
+               offset = readInt16(currPtr);
+               offset <<= 8;
+               offset += readInt8(currPtr);
+               break;
         }
+        currPtr += offset;
         reallyDoProceed(currPtr, state);
     }
 }
@@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray(
  
 template <class T>
 void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
-    const unsigned char* fromPointer = this->initialStatePtr + state.getOffset();
+    const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
     unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
-    unsigned long transitionsTableOffset = 1;
+    //    unsigned long transitionsTableOffset = 1;
+    const StateData2 sd = readStateData(currPtr);
     if (state.isAccepting()) {
-        transitionsTableOffset += state.getValueSize();
+        currPtr += state.getValueSize();
     }
-    const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer);
-    if (sd->array) {
+
+    if (sd.isArray) {
         if (shortLabel > 0) {
             this->doProceedToNextByArray(
                     shortLabel,
-                    reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset),
+                    reinterpret_cast<const uint32_t*> (currPtr),
                     state);
         }
         else {
-            reallyDoProceed(fromPointer + transitionsTableOffset + 256, state);
+            reallyDoProceed(currPtr + 256, state);
             proceedToNext(c, state);
         }
-    } 
+    }
     else {
         this->doProceedToNextByList(
                 c,
                 shortLabel,
-                fromPointer + transitionsTableOffset,
-                sd->transitionsNum,
+                currPtr,
+                sd.transitionsNum,
                 state);
     }
 }