From d51aa0ddcd3289f7abd65d987df92d721cfa66e0 Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Tue, 22 Apr 2014 13:36:18 +0000 Subject: [PATCH] poprawienie buga w dodawaniu replacement char, ogólne porządki w kodzie --- CMakeLists.txt | 2 +- morfeusz/CMakeLists.txt | 4 ++-- morfeusz/Morfeusz.cpp | 27 +++++++++------------------ morfeusz/Morfeusz.hpp | 2 -- morfeusz/MorphDeserializer.cpp | 10 ++++------ morfeusz/Qualifiers.cpp | 4 ++-- morfeusz/charset/CharsetConverter.cpp | 45 ++++++++++++++++++++++++++------------------- morfeusz/charset/CharsetConverter.hpp | 4 +--- morfeusz/cli/cli.cpp | 5 +---- morfeusz/cli/cli.hpp | 13 ++----------- morfeusz/deserializationUtils.hpp | 5 +++++ morfeusz/endianness.hpp | 6 +----- morfeusz/fsa/fsa_impl.hpp | 2 -- morfeusz/fsa/simplefsa_impl.hpp | 1 - morfeusz/segrules/segrules.cpp | 14 ++++---------- 15 files changed, 58 insertions(+), 86 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 87c0a18..5ee9d6a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,7 +5,7 @@ project (Morfeusz) set (Morfeusz_VERSION_MAJOR 2) set (Morfeusz_VERSION_MINOR 0) set (Morfeusz_VERSION_PATCH 0) -#~ set (CMAKE_BUILD_TYPE "Release") +#~ set (CMAKE_BUILD_TYPE "Debug") enable_testing() diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 639e3a5..2ff722a 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -3,13 +3,13 @@ ########## generate default dictionary data ################# add_custom_command ( OUTPUT "${INPUT_DICTIONARY_CPP}" - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg DEPENDS "${INPUT_DICTIONARY}" COMMENT "Building default dictionary C++ file" ) add_custom_command ( OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE DEPENDS "${INPUT_DICTIONARY}" COMMENT "Building default dictionary C++ file" ) diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index b142d4f..55ce467 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) { Morfeusz::~Morfeusz() { } -string Morfeusz::prepareStringToProcess(const std::string& str) const { - string res; - res.reserve(str.size()); - utf8::replace_invalid(str.begin(), str.end(), back_inserter(res)); - return res; -} - void Morfeusz::processOneWord( const Environment& env, const char*& inputStart, @@ -88,9 +81,9 @@ void Morfeusz::processOneWord( InflexionGraph graph; const char* currInput = inputStart; const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); - + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); - + if (!graph.empty()) { const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); int srcNode = startNodeNum; @@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord( SegrulesState segrulesState, vector<InterpretedChunk>& accum, InflexionGraph& graph) const { - // if (this->options.debug) { - // cerr << "----------" << endl; - // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; - // } + if (this->options.debug) { + cerr << "----------" << endl; + cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; + } // cerr << "doAnalyzeOneWord " << inputData << endl; const char* inputStart = inputData; const char* currInput = inputData; @@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const { } void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { - string preparedText = this->prepareStringToProcess(text); - const char* input = preparedText.c_str(); - const char* inputEnd = input + preparedText.length(); + const char* input = text.c_str(); + const char* inputEnd = input + text.length(); while (input != inputEnd) { int startNode = results.empty() ? 0 : results.back().getEndNode(); this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); @@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const { return ResultsIterator(res); } -void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const { - string lemma = this->prepareStringToProcess(text); +void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const { const char* input = lemma.c_str(); const char* inputEnd = input + lemma.length(); int startNode = 0; diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 9ea6684..ecbc8fa 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -158,8 +158,6 @@ public: friend class ResultsIterator; private: - std::string prepareStringToProcess(const std::string& input) const; - void processOneWord( const Environment& env, const char*& inputData, diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp index f69421e..4fad880 100644 --- a/morfeusz/MorphDeserializer.cpp +++ b/morfeusz/MorphDeserializer.cpp @@ -11,6 +11,7 @@ #include "MorphDeserializer.hpp" #include "EncodedInterpretation.hpp" #include "InterpsGroup.hpp" +#include "deserializationUtils.hpp" //const uint8_t LEMMA_ONLY_LOWER = 0; //const uint8_t LEMMA_UPPER_PREFIX = 1; @@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() { long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { const unsigned char* currPtr = ptr; - uint8_t interpTypesNum = *currPtr; - currPtr++; + uint8_t interpTypesNum = readInt8(currPtr); interps.clear(); interps.reserve(interpTypesNum); for (unsigned int i = 0; i < interpTypesNum; i++) { InterpsGroup ig; - ig.type = *currPtr; - currPtr++; - ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); - currPtr += 2; + ig.type = readInt8(currPtr); + ig.size = readInt16(currPtr); ig.ptr = currPtr; currPtr += ig.size; interps.push_back(ig); diff --git a/morfeusz/Qualifiers.cpp b/morfeusz/Qualifiers.cpp index b76b3dd..8d285a4 100644 --- a/morfeusz/Qualifiers.cpp +++ b/morfeusz/Qualifiers.cpp @@ -14,7 +14,7 @@ using namespace std; Qualifiers::Qualifiers(const unsigned char* ptr): qualifiers() { - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; vector<string> _dupa; readTags(currPtr, _dupa); @@ -23,7 +23,7 @@ qualifiers() { uint16_t allCombinationsSize = readInt16(currPtr); this->qualifiers.reserve(allCombinationsSize); for (unsigned int i = 0; i < allCombinationsSize; i++) { - unsigned char qualsNum = *currPtr++; + unsigned char qualsNum = readInt8(currPtr); vector<string> quals; for (unsigned int j = 0; j < qualsNum; j++) { quals.push_back(readString(currPtr)); diff --git a/morfeusz/charset/CharsetConverter.cpp b/morfeusz/charset/CharsetConverter.cpp index 356bede..e7c8f5c 100644 --- a/morfeusz/charset/CharsetConverter.cpp +++ b/morfeusz/charset/CharsetConverter.cpp @@ -11,7 +11,7 @@ using namespace std; -const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7); +const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7); string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { string res; @@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { } CharsetConverter::~CharsetConverter() { - + } -uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const { - return utf8::unchecked::peek_next(it); +uint32_t CharsetConverter::peek(const char* it, const char* end) const { + return this->next(it, end); +} + +static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) { + uint32_t _dupa; + while (it != end && utf8::internal::validate_next(it, end, _dupa) != utf8::internal::UTF8_OK) { + it++; + } } uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { - return utf8::unchecked::next(it); -// catch (utf8::exception ex) { -// cerr << "WARNING: Replacing invalid character: " << hex << (uint16_t) *it << dec << " with replacement char: 0xFFFD" << endl; -// while (it != end && peek(it, end) == 0xFFFD) { -// utf8::unchecked::next(it); -// } -// return 0xFFFD; -// } + uint32_t cp = 0; + utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); + if (err_code == utf8::internal::UTF8_OK) { + return cp; + } + else { + cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl; + iterateThroughInvalidUtf8Sequence(it, end); + return 0xFFFD; + } } void UTF8CharsetConverter::append(uint32_t cp, string& result) const { @@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) { res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR); res[codepoint] = static_cast<char> (c); c++; - } while (c != 255); + } + while (c != 255); return res; } @@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) { // TODO - sprawdzanie zakresu -uint32_t OneByteCharsetConverter::peek(const char*& it, const char* end) const { - return this->array[static_cast<unsigned char> (*it)]; -} - uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const { - return this->array[static_cast<unsigned char> (*(it++))]; + return this->array[static_cast<unsigned char> (*it++)]; } void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { if (cp < this->codepoint2Char.size()) { result.push_back(this->codepoint2Char[cp]); - } else { + } + else { result.push_back(DEFAULT_UNDEFINED_CHAR); } } diff --git a/morfeusz/charset/CharsetConverter.hpp b/morfeusz/charset/CharsetConverter.hpp index f7f38ca..8035bb6 100644 --- a/morfeusz/charset/CharsetConverter.hpp +++ b/morfeusz/charset/CharsetConverter.hpp @@ -15,7 +15,7 @@ class CharsetConverter { public: - virtual uint32_t peek(const char*& it, const char* end) const = 0; + uint32_t peek(const char* it, const char* end) const; virtual uint32_t next(const char*& it, const char* end) const = 0; virtual void append(uint32_t cp, std::string& result) const = 0; virtual std::string fromUTF8(const std::string& input) const; @@ -28,7 +28,6 @@ private: class UTF8CharsetConverter : public CharsetConverter { public: - uint32_t peek(const char*& it, const char* end) const; uint32_t next(const char*& it, const char* end) const; void append(uint32_t cp, std::string& result) const; // std::string fromUTF8(const std::string& input) const; @@ -41,7 +40,6 @@ private: class OneByteCharsetConverter : public CharsetConverter { public: explicit OneByteCharsetConverter(const uint32_t* array); - uint32_t peek(const char*& it, const char* end) const; uint32_t next(const char*& it, const char* end) const; void append(uint32_t cp, std::string& result) const; private: diff --git a/morfeusz/cli/cli.cpp b/morfeusz/cli/cli.cpp index 8704b2f..8c83ec2 100644 --- a/morfeusz/cli/cli.cpp +++ b/morfeusz/cli/cli.cpp @@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { cerr << "setting case sensitive to FALSE" << endl; morfeusz.setCaseSensitive(false); } -#ifdef _WIN32 - morfeusz.setCharset(CP852); -#endif -#ifdef _WIN64 +#if defined(_WIN64) || defined(_WIN32) morfeusz.setCharset(CP852); #endif } diff --git a/morfeusz/cli/cli.hpp b/morfeusz/cli/cli.hpp index bbf4c55..b263aa1 100644 --- a/morfeusz/cli/cli.hpp +++ b/morfeusz/cli/cli.hpp @@ -8,14 +8,9 @@ #ifndef CLI_HPP #define CLI_HPP -#ifdef _WIN64 +#if defined(_WIN64) || defined(_WIN32) #define TMPDUPA_IN IN #define IN IN -#else -#ifdef _WIN32 -#define TMPDUPA_IN IN -#define IN IN -#endif #endif #include <iostream> @@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz); #pragma GCC diagnostic pop -#ifdef _WIN64 +#if defined(_WIN64) || defined(_WIN32) #define IN TMPDUPA_IN -#else -#ifdef _WIN32 -#define IN TMPDUPA_IN -#endif #endif #endif /* CLI_HPP */ diff --git a/morfeusz/deserializationUtils.hpp b/morfeusz/deserializationUtils.hpp index b198a99..7ac415d 100644 --- a/morfeusz/deserializationUtils.hpp +++ b/morfeusz/deserializationUtils.hpp @@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) { return res; } +inline uint32_t readInt32Const(const unsigned char* const currPtr) { + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr)); + return res; +} + inline std::string readString(const unsigned char*& currPtr) { std::string res((const char*) currPtr); currPtr += res.length(); diff --git a/morfeusz/endianness.hpp b/morfeusz/endianness.hpp index 65008b3..10a5ade 100644 --- a/morfeusz/endianness.hpp +++ b/morfeusz/endianness.hpp @@ -8,15 +8,11 @@ #ifndef ENDIANNESS_HPP #define ENDIANNESS_HPP -#ifdef _WIN64 -#include <winsock2.h> -#else -#ifdef _WIN32 +#if defined(_WIN64) || defined(_WIN32) #include <winsock2.h> #else #include <netinet/in.h> #endif -#endif #endif /* ENDIANNESS_HPP */ diff --git a/morfeusz/fsa/fsa_impl.hpp b/morfeusz/fsa/fsa_impl.hpp index ec89b85..19c29c1 100644 --- a/morfeusz/fsa/fsa_impl.hpp +++ b/morfeusz/fsa/fsa_impl.hpp @@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); -// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); - const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; switch (implementationNum) { case 0: diff --git a/morfeusz/fsa/simplefsa_impl.hpp b/morfeusz/fsa/simplefsa_impl.hpp index 606daa7..acdad5b 100644 --- a/morfeusz/fsa/simplefsa_impl.hpp +++ b/morfeusz/fsa/simplefsa_impl.hpp @@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { break; } } - // const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1); if (!found) { state.setNextAsSink(); } diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp index 7bd0619..e820176 100644 --- a/morfeusz/segrules/segrules.cpp +++ b/morfeusz/segrules/segrules.cpp @@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) { static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) { const unsigned char* additionalDataPtr = ptr + FSA_DATA_OFFSET - + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); - const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; + + readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); + const unsigned char* res = additionalDataPtr + readInt32Const(additionalDataPtr) + 4; return res; } static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { -// const unsigned char* additionalDataPtr = ptr -// + FSA_DATA_OFFSET -// + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); -// const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; const unsigned char* res = getSeparatorsListPtr(ptr); skipSeparatorsList(res); return res; @@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA( vector<uint32_t> getSeparatorsList(const unsigned char* ptr) { ptr = getSeparatorsListPtr(ptr); vector<uint32_t> res; - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); - ptr += 2; + uint16_t listSize = readInt16(ptr); for (unsigned int i = 0; i < listSize; i++) { - res.push_back(ntohl(*reinterpret_cast<const uint32_t*>(ptr))); - ptr += 4; + res.push_back(readInt32(ptr)); } return res; } -- libgit2 0.22.2