Commit d51aa0ddcd3289f7abd65d987df92d721cfa66e0

Authored by Michał Lenart
1 parent 39810b81

poprawienie buga w dodawaniu replacement char, ogólne porządki w kodzie

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@168 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
... ... @@ -5,7 +5,7 @@ project (Morfeusz)
5 5 set (Morfeusz_VERSION_MAJOR 2)
6 6 set (Morfeusz_VERSION_MINOR 0)
7 7 set (Morfeusz_VERSION_PATCH 0)
8   -#~ set (CMAKE_BUILD_TYPE "Release")
  8 +#~ set (CMAKE_BUILD_TYPE "Debug")
9 9  
10 10 enable_testing()
11 11  
... ...
morfeusz/CMakeLists.txt
... ... @@ -3,13 +3,13 @@
3 3 ########## generate default dictionary data #################
4 4 add_custom_command (
5 5 OUTPUT "${INPUT_DICTIONARY_CPP}"
6   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg
  6 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg
7 7 DEPENDS "${INPUT_DICTIONARY}"
8 8 COMMENT "Building default dictionary C++ file"
9 9 )
10 10 add_custom_command (
11 11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
12   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2
  12 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE
13 13 DEPENDS "${INPUT_DICTIONARY}"
14 14 COMMENT "Building default dictionary C++ file"
15 15 )
... ...
morfeusz/Morfeusz.cpp
... ... @@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) {
66 66 Morfeusz::~Morfeusz() {
67 67 }
68 68  
69   -string Morfeusz::prepareStringToProcess(const std::string& str) const {
70   - string res;
71   - res.reserve(str.size());
72   - utf8::replace_invalid(str.begin(), str.end(), back_inserter(res));
73   - return res;
74   -}
75   -
76 69 void Morfeusz::processOneWord(
77 70 const Environment& env,
78 71 const char*& inputStart,
... ... @@ -88,9 +81,9 @@ void Morfeusz::processOneWord(
88 81 InflexionGraph graph;
89 82 const char* currInput = inputStart;
90 83 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
91   -
  84 +
92 85 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
93   -
  86 +
94 87 if (!graph.empty()) {
95 88 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
96 89 int srcNode = startNodeNum;
... ... @@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord(
148 141 SegrulesState segrulesState,
149 142 vector<InterpretedChunk>& accum,
150 143 InflexionGraph& graph) const {
151   - // if (this->options.debug) {
152   - // cerr << "----------" << endl;
153   - // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
154   - // }
  144 + if (this->options.debug) {
  145 + cerr << "----------" << endl;
  146 + cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  147 + }
155 148 // cerr << "doAnalyzeOneWord " << inputData << endl;
156 149 const char* inputStart = inputData;
157 150 const char* currInput = inputData;
... ... @@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const {
303 296 }
304 297  
305 298 void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const {
306   - string preparedText = this->prepareStringToProcess(text);
307   - const char* input = preparedText.c_str();
308   - const char* inputEnd = input + preparedText.length();
  299 + const char* input = text.c_str();
  300 + const char* inputEnd = input + text.length();
309 301 while (input != inputEnd) {
310 302 int startNode = results.empty() ? 0 : results.back().getEndNode();
311 303 this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results);
... ... @@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const {
324 316 return ResultsIterator(res);
325 317 }
326 318  
327   -void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const {
328   - string lemma = this->prepareStringToProcess(text);
  319 +void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const {
329 320 const char* input = lemma.c_str();
330 321 const char* inputEnd = input + lemma.length();
331 322 int startNode = 0;
... ...
morfeusz/Morfeusz.hpp
... ... @@ -158,8 +158,6 @@ public:
158 158 friend class ResultsIterator;
159 159 private:
160 160  
161   - std::string prepareStringToProcess(const std::string& input) const;
162   -
163 161 void processOneWord(
164 162 const Environment& env,
165 163 const char*& inputData,
... ...
morfeusz/MorphDeserializer.cpp
... ... @@ -11,6 +11,7 @@
11 11 #include "MorphDeserializer.hpp"
12 12 #include "EncodedInterpretation.hpp"
13 13 #include "InterpsGroup.hpp"
  14 +#include "deserializationUtils.hpp"
14 15  
15 16 //const uint8_t LEMMA_ONLY_LOWER = 0;
16 17 //const uint8_t LEMMA_UPPER_PREFIX = 1;
... ... @@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() {
25 26  
26 27 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
27 28 const unsigned char* currPtr = ptr;
28   - uint8_t interpTypesNum = *currPtr;
29   - currPtr++;
  29 + uint8_t interpTypesNum = readInt8(currPtr);
30 30 interps.clear();
31 31 interps.reserve(interpTypesNum);
32 32 for (unsigned int i = 0; i < interpTypesNum; i++) {
33 33 InterpsGroup ig;
34   - ig.type = *currPtr;
35   - currPtr++;
36   - ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr)));
37   - currPtr += 2;
  34 + ig.type = readInt8(currPtr);
  35 + ig.size = readInt16(currPtr);
38 36 ig.ptr = currPtr;
39 37 currPtr += ig.size;
40 38 interps.push_back(ig);
... ...
morfeusz/Qualifiers.cpp
... ... @@ -14,7 +14,7 @@ using namespace std;
14 14  
15 15 Qualifiers::Qualifiers(const unsigned char* ptr):
16 16 qualifiers() {
17   - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
  17 + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
18 18 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
19 19 vector<string> _dupa;
20 20 readTags(currPtr, _dupa);
... ... @@ -23,7 +23,7 @@ qualifiers() {
23 23 uint16_t allCombinationsSize = readInt16(currPtr);
24 24 this->qualifiers.reserve(allCombinationsSize);
25 25 for (unsigned int i = 0; i < allCombinationsSize; i++) {
26   - unsigned char qualsNum = *currPtr++;
  26 + unsigned char qualsNum = readInt8(currPtr);
27 27 vector<string> quals;
28 28 for (unsigned int j = 0; j < qualsNum; j++) {
29 29 quals.push_back(readString(currPtr));
... ...
morfeusz/charset/CharsetConverter.cpp
... ... @@ -11,7 +11,7 @@
11 11  
12 12 using namespace std;
13 13  
14   -const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7);
  14 +const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7);
15 15  
16 16 string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
17 17 string res;
... ... @@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
22 22 }
23 23  
24 24 CharsetConverter::~CharsetConverter() {
25   -
  25 +
26 26 }
27 27  
28   -uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const {
29   - return utf8::unchecked::peek_next(it);
  28 +uint32_t CharsetConverter::peek(const char* it, const char* end) const {
  29 + return this->next(it, end);
  30 +}
  31 +
  32 +static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) {
  33 + uint32_t _dupa;
  34 + while (it != end && utf8::internal::validate_next(it, end, _dupa) != utf8::internal::UTF8_OK) {
  35 + it++;
  36 + }
30 37 }
31 38  
32 39 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
33   - return utf8::unchecked::next(it);
34   -// catch (utf8::exception ex) {
35   -// cerr << "WARNING: Replacing invalid character: " << hex << (uint16_t) *it << dec << " with replacement char: 0xFFFD" << endl;
36   -// while (it != end && peek(it, end) == 0xFFFD) {
37   -// utf8::unchecked::next(it);
38   -// }
39   -// return 0xFFFD;
40   -// }
  40 + uint32_t cp = 0;
  41 + utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
  42 + if (err_code == utf8::internal::UTF8_OK) {
  43 + return cp;
  44 + }
  45 + else {
  46 + cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl;
  47 + iterateThroughInvalidUtf8Sequence(it, end);
  48 + return 0xFFFD;
  49 + }
41 50 }
42 51  
43 52 void UTF8CharsetConverter::append(uint32_t cp, string& result) const {
... ... @@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) {
52 61 res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR);
53 62 res[codepoint] = static_cast<char> (c);
54 63 c++;
55   - } while (c != 255);
  64 + }
  65 + while (c != 255);
56 66 return res;
57 67 }
58 68  
... ... @@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) {
63 73  
64 74 // TODO - sprawdzanie zakresu
65 75  
66   -uint32_t OneByteCharsetConverter::peek(const char*& it, const char* end) const {
67   - return this->array[static_cast<unsigned char> (*it)];
68   -}
69   -
70 76 uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const {
71   - return this->array[static_cast<unsigned char> (*(it++))];
  77 + return this->array[static_cast<unsigned char> (*it++)];
72 78 }
73 79  
74 80 void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
75 81 if (cp < this->codepoint2Char.size()) {
76 82 result.push_back(this->codepoint2Char[cp]);
77   - } else {
  83 + }
  84 + else {
78 85 result.push_back(DEFAULT_UNDEFINED_CHAR);
79 86 }
80 87 }
... ...
morfeusz/charset/CharsetConverter.hpp
... ... @@ -15,7 +15,7 @@
15 15  
16 16 class CharsetConverter {
17 17 public:
18   - virtual uint32_t peek(const char*& it, const char* end) const = 0;
  18 + uint32_t peek(const char* it, const char* end) const;
19 19 virtual uint32_t next(const char*& it, const char* end) const = 0;
20 20 virtual void append(uint32_t cp, std::string& result) const = 0;
21 21 virtual std::string fromUTF8(const std::string& input) const;
... ... @@ -28,7 +28,6 @@ private:
28 28  
29 29 class UTF8CharsetConverter : public CharsetConverter {
30 30 public:
31   - uint32_t peek(const char*& it, const char* end) const;
32 31 uint32_t next(const char*& it, const char* end) const;
33 32 void append(uint32_t cp, std::string& result) const;
34 33 // std::string fromUTF8(const std::string& input) const;
... ... @@ -41,7 +40,6 @@ private:
41 40 class OneByteCharsetConverter : public CharsetConverter {
42 41 public:
43 42 explicit OneByteCharsetConverter(const uint32_t* array);
44   - uint32_t peek(const char*& it, const char* end) const;
45 43 uint32_t next(const char*& it, const char* end) const;
46 44 void append(uint32_t cp, std::string& result) const;
47 45 private:
... ...
morfeusz/cli/cli.cpp
... ... @@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
139 139 cerr << "setting case sensitive to FALSE" << endl;
140 140 morfeusz.setCaseSensitive(false);
141 141 }
142   -#ifdef _WIN32
143   - morfeusz.setCharset(CP852);
144   -#endif
145   -#ifdef _WIN64
  142 +#if defined(_WIN64) || defined(_WIN32)
146 143 morfeusz.setCharset(CP852);
147 144 #endif
148 145 }
... ...
morfeusz/cli/cli.hpp
... ... @@ -8,14 +8,9 @@
8 8 #ifndef CLI_HPP
9 9 #define CLI_HPP
10 10  
11   -#ifdef _WIN64
  11 +#if defined(_WIN64) || defined(_WIN32)
12 12 #define TMPDUPA_IN IN
13 13 #define IN IN
14   -#else
15   -#ifdef _WIN32
16   -#define TMPDUPA_IN IN
17   -#define IN IN
18   -#endif
19 14 #endif
20 15  
21 16 #include <iostream>
... ... @@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz);
40 35  
41 36 #pragma GCC diagnostic pop
42 37  
43   -#ifdef _WIN64
  38 +#if defined(_WIN64) || defined(_WIN32)
44 39 #define IN TMPDUPA_IN
45   -#else
46   -#ifdef _WIN32
47   -#define IN TMPDUPA_IN
48   -#endif
49 40 #endif
50 41  
51 42 #endif /* CLI_HPP */
... ...
morfeusz/deserializationUtils.hpp
... ... @@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) {
27 27 return res;
28 28 }
29 29  
  30 +inline uint32_t readInt32Const(const unsigned char* const currPtr) {
  31 + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
  32 + return res;
  33 +}
  34 +
30 35 inline std::string readString(const unsigned char*& currPtr) {
31 36 std::string res((const char*) currPtr);
32 37 currPtr += res.length();
... ...
morfeusz/endianness.hpp
... ... @@ -8,15 +8,11 @@
8 8 #ifndef ENDIANNESS_HPP
9 9 #define ENDIANNESS_HPP
10 10  
11   -#ifdef _WIN64
12   -#include <winsock2.h>
13   -#else
14   -#ifdef _WIN32
  11 +#if defined(_WIN64) || defined(_WIN32)
15 12 #include <winsock2.h>
16 13 #else
17 14 #include <netinet/in.h>
18 15 #endif
19   -#endif
20 16  
21 17 #endif /* ENDIANNESS_HPP */
22 18  
... ...
morfeusz/fsa/fsa_impl.hpp
... ... @@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
78 78  
79 79 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET);
80 80  
81   -// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET)));
82   -
83 81 const unsigned char* startPtr = ptr + FSA_DATA_OFFSET;
84 82 switch (implementationNum) {
85 83 case 0:
... ...
morfeusz/fsa/simplefsa_impl.hpp
... ... @@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
63 63 break;
64 64 }
65 65 }
66   - // const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1);
67 66 if (!found) {
68 67 state.setNextAsSink();
69 68 }
... ...
morfeusz/segrules/segrules.cpp
... ... @@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) {
14 14 static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) {
15 15 const unsigned char* additionalDataPtr = ptr
16 16 + FSA_DATA_OFFSET
17   - + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
18   - const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
  17 + + readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
  18 + const unsigned char* res = additionalDataPtr + readInt32Const(additionalDataPtr) + 4;
19 19 return res;
20 20 }
21 21  
22 22 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) {
23   -// const unsigned char* additionalDataPtr = ptr
24   -// + FSA_DATA_OFFSET
25   -// + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
26   -// const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
27 23 const unsigned char* res = getSeparatorsListPtr(ptr);
28 24 skipSeparatorsList(res);
29 25 return res;
... ... @@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA(
84 80 vector<uint32_t> getSeparatorsList(const unsigned char* ptr) {
85 81 ptr = getSeparatorsListPtr(ptr);
86 82 vector<uint32_t> res;
87   - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
88   - ptr += 2;
  83 + uint16_t listSize = readInt16(ptr);
89 84 for (unsigned int i = 0; i < listSize; i++) {
90   - res.push_back(ntohl(*reinterpret_cast<const uint32_t*>(ptr)));
91   - ptr += 4;
  85 + res.push_back(readInt32(ptr));
92 86 }
93 87 return res;
94 88 }
... ...