From 445dc6d2c31b6eaa134dfbea0f362f672b5716ee Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Tue, 22 Apr 2014 16:19:09 +0000 Subject: [PATCH] ciąg dalszy porządków --- fsabuilder/morfeuszbuilder/fsa/encode.py | 9 +++------ fsabuilder/morfeuszbuilder/fsa/serializer.py | 13 +++++++++---- fsabuilder/morfeuszbuilder/utils/serializationUtils.py | 4 +++- morfeusz/CMakeLists.txt | 2 +- morfeusz/Morfeusz.cpp | 10 +++++----- morfeusz/Tagset.cpp | 2 +- morfeusz/charset/charset_utils.hpp | 3 +-- morfeusz/fsa/cfsa1_impl.hpp | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------- 8 files changed, 90 insertions(+), 59 deletions(-) diff --git a/fsabuilder/morfeuszbuilder/fsa/encode.py b/fsabuilder/morfeuszbuilder/fsa/encode.py index 8aecd71..0a99954 100644 --- a/fsabuilder/morfeuszbuilder/fsa/encode.py +++ b/fsabuilder/morfeuszbuilder/fsa/encode.py @@ -127,18 +127,15 @@ class Encoder(object): if isAnalyzer: encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) else: - serializeString(interp.homonymId, encodedInterpsList) - serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) + encodedInterpsList.extend(serializeString(interp.homonymId)) + encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd)) encodedInterpsList.append(interp.encodedForm.cutLength) - serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) + encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd)) if isAnalyzer: encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) encodedInterpsList.extend(htons(interp.tagnum)) encodedInterpsList.append(interp.namenum) encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) - - if interp.encodedForm.suffixToAdd == 'bc': - print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList] res.extend(htons(len(encodedInterpsList))) res.extend(encodedInterpsList) diff --git a/fsabuilder/morfeuszbuilder/fsa/serializer.py b/fsabuilder/morfeuszbuilder/fsa/serializer.py index f655246..1096810 100644 --- a/fsabuilder/morfeuszbuilder/fsa/serializer.py +++ b/fsabuilder/morfeuszbuilder/fsa/serializer.py @@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer): return self.useArrays and state.serializeAsArray def stateData2bytearray(self, state): - assert state.transitionsNum < 64 +# assert state.transitionsNum < 64 res = bytearray() firstByte = 0 if state.isAccepting(): firstByte |= self.ACCEPTING_FLAG if self.stateShouldBeAnArray(state): firstByte |= self.ARRAY_FLAG - firstByte |= state.transitionsNum - assert firstByte < 256 and firstByte > 0 - res.append(firstByte) + if state.transitionsNum < 63: + firstByte |= state.transitionsNum + res.append(firstByte) + else: + firstByte |= 63 + res.append(firstByte) + res.append(state.transitionsNum) + if state.isAccepting(): res.extend(state.encodedData) return res diff --git a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py index 3a1cd54..38c5d16 100644 --- a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py +++ b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py @@ -23,6 +23,8 @@ def htonl(n): res.append(n & 0x000000FF) return res -def serializeString(string, out): +def serializeString(string): + out = bytearray() out.extend(string.encode('utf8')) out.append(0) + return out diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 2ff722a..37263f9 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -9,7 +9,7 @@ add_custom_command ( ) add_custom_command ( OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 DEPENDS "${INPUT_DICTIONARY}" COMMENT "Building default dictionary C++ file" ) diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index 55ce467..6786e26 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -74,7 +74,7 @@ void Morfeusz::processOneWord( std::vector<MorphInterpretation>& results, bool insideIgnHandler) const { while (inputStart != inputEnd - && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { + && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) { env.getCharsetConverter().next(inputStart, inputEnd); } vector<InterpretedChunk> accum; @@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord( StateType state = env.getFSA().getInitialState(); - while (!isEndOfWord(codepoint)) { + while (!isWhitespace(codepoint)) { uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER ? env.getCaseConverter().toLower(codepoint) : codepoint; @@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord( doShiftOrth(accum.back(), ic); } accum.push_back(ic); - if (isEndOfWord(codepoint) + if (isWhitespace(codepoint) && newSegrulesState.accepting) { if (this->options.debug) { cerr << "ACCEPTING " << debugAccum(accum) << endl; } graph.addPath(accum, newSegrulesState.weak); } - else if (!isEndOfWord(codepoint)) { + else if (!isWhitespace(codepoint)) { // cerr << "will process " << currInput << endl; const char* newCurrInput = currInput; doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); @@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord( } } } - codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); + codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); } inputData = currInput; } diff --git a/morfeusz/Tagset.cpp b/morfeusz/Tagset.cpp index 65f88de..9fb76b0 100644 --- a/morfeusz/Tagset.cpp +++ b/morfeusz/Tagset.cpp @@ -9,7 +9,7 @@ using namespace std; Tagset::Tagset(const unsigned char* ptr) { - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; readTags(currPtr, this->tags); readTags(currPtr, this->names); diff --git a/morfeusz/charset/charset_utils.hpp b/morfeusz/charset/charset_utils.hpp index 1e629a1..363d683 100644 --- a/morfeusz/charset/charset_utils.hpp +++ b/morfeusz/charset/charset_utils.hpp @@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() { return res; } -inline bool isEndOfWord(uint32_t codepoint) { +inline bool isWhitespace(uint32_t codepoint) { static std::vector<uint32_t> whitespaces(initializeWhitespaces()); return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); - // return whitespaces.count(codepoint); } template <class StateClass> diff --git a/morfeusz/fsa/cfsa1_impl.hpp b/morfeusz/fsa/cfsa1_impl.hpp index 00eebbe..d11ad20 100644 --- a/morfeusz/fsa/cfsa1_impl.hpp +++ b/morfeusz/fsa/cfsa1_impl.hpp @@ -11,35 +11,57 @@ #include <vector> #include "fsa.hpp" +#include "../deserializationUtils.hpp" using namespace std; -#pragma pack(push, 1) /* push current alignment to stack */ +static const unsigned char CFSA1_ACCEPTING_FLAG = 128; +static const unsigned char CFSA1_ARRAY_FLAG = 64; +static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63; + +static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3; + +static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257; struct StateData2 { - unsigned transitionsNum: 6; - unsigned array : 1; - unsigned accepting : 1; + unsigned int transitionsNum; + bool isArray; + bool isAccepting; }; struct TransitionData2 { - unsigned offsetSize : 2; - unsigned shortLabel : 6; + unsigned int offsetSize; + unsigned int shortLabel; }; +static inline StateData2 readStateData(const unsigned char*& ptr) { + StateData2 res; + unsigned char firstByte = readInt8(ptr); + res.isArray = firstByte & CFSA1_ARRAY_FLAG; + res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG; + res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK; + if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) { + res.transitionsNum = readInt8(ptr); + } + return res; +} -#pragma pack(pop) /* restore original alignment from stack */ - -static const unsigned int INITIAL_STATE_OFFSET = 257; +static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) { + TransitionData2 res; + unsigned char firstByte = readInt8(ptr); + res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK; + res.shortLabel = firstByte >> 2; + return res; +} template <class T> vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { - return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET); + return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET); } template <class T> CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer) -: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer), +: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer), label2ShortLabel(initializeChar2PopularCharIdx(ptr)) { } @@ -52,10 +74,12 @@ template <class T> void CompressedFSA1<T>::reallyDoProceed( const unsigned char* statePtr, State<T>& state) const { - const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr); - if (sd->accepting) { + const unsigned char* currPtr = statePtr; + const StateData2 sd = readStateData(currPtr); + if (sd.isAccepting) { T object; - long size = this->deserializer.deserialize(statePtr + 1, object); + long size = this->deserializer.deserialize(currPtr, object); + // long size = this->deserializer.deserialize(statePtr + 1, object); state.setNext(statePtr - this->initialStatePtr, object, size); } else { @@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList( const unsigned char* ptr, const unsigned int transitionsNum, State<T>& state) const { - register const unsigned char* currPtr = ptr; + const unsigned char* currPtr = ptr; // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); bool found = false; TransitionData2 td; for (unsigned int i = 0; i < transitionsNum; i++) { - // const_cast<Counter*>(&counter)->increment(1); - td = *(reinterpret_cast<const TransitionData2*>(currPtr)); + td = readTransitionFirstByte(currPtr); if (td.shortLabel == shortLabel) { if (shortLabel == 0) { - currPtr++; - char label = (char) *currPtr; + char label = static_cast<char>(readInt8(currPtr)); if (label == c) { found = true; break; } else { - currPtr += td.offsetSize + 1; + currPtr += td.offsetSize; } - } else { + } + else { found = true; break; } - } + } else { if (td.shortLabel == 0) { currPtr++; } - currPtr += td.offsetSize + 1; + currPtr += td.offsetSize; } } if (!found) { state.setNextAsSink(); - } + } else { - currPtr++; + uint32_t offset; switch (td.offsetSize) { case 0: + offset = 0; break; case 1: - currPtr += *currPtr + 1; + offset = readInt8(currPtr); break; case 2: - currPtr += ntohs(*((const uint16_t*) currPtr)) + 2; + offset = readInt16(currPtr); break; case 3: - currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3; - break; + offset = readInt16(currPtr); + offset <<= 8; + offset += readInt8(currPtr); + break; } + currPtr += offset; reallyDoProceed(currPtr, state); } } @@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray( template <class T> void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { - const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); + const unsigned char* currPtr = this->initialStatePtr + state.getOffset(); unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; - unsigned long transitionsTableOffset = 1; + // unsigned long transitionsTableOffset = 1; + const StateData2 sd = readStateData(currPtr); if (state.isAccepting()) { - transitionsTableOffset += state.getValueSize(); + currPtr += state.getValueSize(); } - const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer); - if (sd->array) { + + if (sd.isArray) { if (shortLabel > 0) { this->doProceedToNextByArray( shortLabel, - reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset), + reinterpret_cast<const uint32_t*> (currPtr), state); } else { - reallyDoProceed(fromPointer + transitionsTableOffset + 256, state); + reallyDoProceed(currPtr + 256, state); proceedToNext(c, state); } - } + } else { this->doProceedToNextByList( c, shortLabel, - fromPointer + transitionsTableOffset, - sd->transitionsNum, + currPtr, + sd.transitionsNum, state); } } -- libgit2 0.22.2