Commit d51aa0ddcd3289f7abd65d987df92d721cfa66e0
1 parent: 39810b81
poprawienie buga w dodawaniu replacement char, ogólne porządki w kodzie
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@168 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 15 changed files with 58 additions and 86 deletions
CMakeLists.txt
morfeusz/CMakeLists.txt
... | ... | @@ -3,13 +3,13 @@ |
3 | 3 | ########## generate default dictionary data ################# |
4 | 4 | add_custom_command ( |
5 | 5 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
6 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | |
6 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg | |
7 | 7 | DEPENDS "${INPUT_DICTIONARY}" |
8 | 8 | COMMENT "Building default dictionary C++ file" |
9 | 9 | ) |
10 | 10 | add_custom_command ( |
11 | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | |
12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE | |
13 | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | 14 | COMMENT "Building default dictionary C++ file" |
15 | 15 | ) |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) { |
66 | 66 | Morfeusz::~Morfeusz() { |
67 | 67 | } |
68 | 68 | |
69 | -string Morfeusz::prepareStringToProcess(const std::string& str) const { | |
70 | - string res; | |
71 | - res.reserve(str.size()); | |
72 | - utf8::replace_invalid(str.begin(), str.end(), back_inserter(res)); | |
73 | - return res; | |
74 | -} | |
75 | - | |
76 | 69 | void Morfeusz::processOneWord( |
77 | 70 | const Environment& env, |
78 | 71 | const char*& inputStart, |
... | ... | @@ -88,9 +81,9 @@ void Morfeusz::processOneWord( |
88 | 81 | InflexionGraph graph; |
89 | 82 | const char* currInput = inputStart; |
90 | 83 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
91 | - | |
84 | + | |
92 | 85 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
93 | - | |
86 | + | |
94 | 87 | if (!graph.empty()) { |
95 | 88 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
96 | 89 | int srcNode = startNodeNum; |
... | ... | @@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord( |
148 | 141 | SegrulesState segrulesState, |
149 | 142 | vector<InterpretedChunk>& accum, |
150 | 143 | InflexionGraph& graph) const { |
151 | - // if (this->options.debug) { | |
152 | - // cerr << "----------" << endl; | |
153 | - // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | |
154 | - // } | |
144 | + if (this->options.debug) { | |
145 | + cerr << "----------" << endl; | |
146 | + cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | |
147 | + } | |
155 | 148 | // cerr << "doAnalyzeOneWord " << inputData << endl; |
156 | 149 | const char* inputStart = inputData; |
157 | 150 | const char* currInput = inputData; |
... | ... | @@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const { |
303 | 296 | } |
304 | 297 | |
305 | 298 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { |
306 | - string preparedText = this->prepareStringToProcess(text); | |
307 | - const char* input = preparedText.c_str(); | |
308 | - const char* inputEnd = input + preparedText.length(); | |
299 | + const char* input = text.c_str(); | |
300 | + const char* inputEnd = input + text.length(); | |
309 | 301 | while (input != inputEnd) { |
310 | 302 | int startNode = results.empty() ? 0 : results.back().getEndNode(); |
311 | 303 | this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); |
... | ... | @@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const { |
324 | 316 | return ResultsIterator(res); |
325 | 317 | } |
326 | 318 | |
327 | -void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const { | |
328 | - string lemma = this->prepareStringToProcess(text); | |
319 | +void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const { | |
329 | 320 | const char* input = lemma.c_str(); |
330 | 321 | const char* inputEnd = input + lemma.length(); |
331 | 322 | int startNode = 0; |
... | ... |
morfeusz/Morfeusz.hpp
morfeusz/MorphDeserializer.cpp
... | ... | @@ -11,6 +11,7 @@ |
11 | 11 | #include "MorphDeserializer.hpp" |
12 | 12 | #include "EncodedInterpretation.hpp" |
13 | 13 | #include "InterpsGroup.hpp" |
14 | +#include "deserializationUtils.hpp" | |
14 | 15 | |
15 | 16 | //const uint8_t LEMMA_ONLY_LOWER = 0; |
16 | 17 | //const uint8_t LEMMA_UPPER_PREFIX = 1; |
... | ... | @@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() { |
25 | 26 | |
26 | 27 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { |
27 | 28 | const unsigned char* currPtr = ptr; |
28 | - uint8_t interpTypesNum = *currPtr; | |
29 | - currPtr++; | |
29 | + uint8_t interpTypesNum = readInt8(currPtr); | |
30 | 30 | interps.clear(); |
31 | 31 | interps.reserve(interpTypesNum); |
32 | 32 | for (unsigned int i = 0; i < interpTypesNum; i++) { |
33 | 33 | InterpsGroup ig; |
34 | - ig.type = *currPtr; | |
35 | - currPtr++; | |
36 | - ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); | |
37 | - currPtr += 2; | |
34 | + ig.type = readInt8(currPtr); | |
35 | + ig.size = readInt16(currPtr); | |
38 | 36 | ig.ptr = currPtr; |
39 | 37 | currPtr += ig.size; |
40 | 38 | interps.push_back(ig); |
... | ... |
morfeusz/Qualifiers.cpp
... | ... | @@ -14,7 +14,7 @@ using namespace std; |
14 | 14 | |
15 | 15 | Qualifiers::Qualifiers(const unsigned char* ptr): |
16 | 16 | qualifiers() { |
17 | - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | |
17 | + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); | |
18 | 18 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; |
19 | 19 | vector<string> _dupa; |
20 | 20 | readTags(currPtr, _dupa); |
... | ... | @@ -23,7 +23,7 @@ qualifiers() { |
23 | 23 | uint16_t allCombinationsSize = readInt16(currPtr); |
24 | 24 | this->qualifiers.reserve(allCombinationsSize); |
25 | 25 | for (unsigned int i = 0; i < allCombinationsSize; i++) { |
26 | - unsigned char qualsNum = *currPtr++; | |
26 | + unsigned char qualsNum = readInt8(currPtr); | |
27 | 27 | vector<string> quals; |
28 | 28 | for (unsigned int j = 0; j < qualsNum; j++) { |
29 | 29 | quals.push_back(readString(currPtr)); |
... | ... |
morfeusz/charset/CharsetConverter.cpp
... | ... | @@ -11,7 +11,7 @@ |
11 | 11 | |
12 | 12 | using namespace std; |
13 | 13 | |
14 | -const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7); | |
14 | +const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7); | |
15 | 15 | |
16 | 16 | string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { |
17 | 17 | string res; |
... | ... | @@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { |
22 | 22 | } |
23 | 23 | |
24 | 24 | CharsetConverter::~CharsetConverter() { |
25 | - | |
25 | + | |
26 | 26 | } |
27 | 27 | |
28 | -uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const { | |
29 | - return utf8::unchecked::peek_next(it); | |
28 | +uint32_t CharsetConverter::peek(const char* it, const char* end) const { | |
29 | + return this->next(it, end); | |
30 | +} | |
31 | + | |
32 | +static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) { | |
33 | + uint32_t _dupa; | |
34 | + while (it != end && utf8::internal::validate_next(it, end, _dupa) != utf8::internal::UTF8_OK) { | |
35 | + it++; | |
36 | + } | |
30 | 37 | } |
31 | 38 | |
32 | 39 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
33 | - return utf8::unchecked::next(it); | |
34 | -// catch (utf8::exception ex) { | |
35 | -// cerr << "WARNING: Replacing invalid character: " << hex << (uint16_t) *it << dec << " with replacement char: 0xFFFD" << endl; | |
36 | -// while (it != end && peek(it, end) == 0xFFFD) { | |
37 | -// utf8::unchecked::next(it); | |
38 | -// } | |
39 | -// return 0xFFFD; | |
40 | -// } | |
40 | + uint32_t cp = 0; | |
41 | + utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | |
42 | + if (err_code == utf8::internal::UTF8_OK) { | |
43 | + return cp; | |
44 | + } | |
45 | + else { | |
46 | + cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl; | |
47 | + iterateThroughInvalidUtf8Sequence(it, end); | |
48 | + return 0xFFFD; | |
49 | + } | |
41 | 50 | } |
42 | 51 | |
43 | 52 | void UTF8CharsetConverter::append(uint32_t cp, string& result) const { |
... | ... | @@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) { |
52 | 61 | res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR); |
53 | 62 | res[codepoint] = static_cast<char> (c); |
54 | 63 | c++; |
55 | - } while (c != 255); | |
64 | + } | |
65 | + while (c != 255); | |
56 | 66 | return res; |
57 | 67 | } |
58 | 68 | |
... | ... | @@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) { |
63 | 73 | |
64 | 74 | // TODO - sprawdzanie zakresu |
65 | 75 | |
66 | -uint32_t OneByteCharsetConverter::peek(const char*& it, const char* end) const { | |
67 | - return this->array[static_cast<unsigned char> (*it)]; | |
68 | -} | |
69 | - | |
70 | 76 | uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const { |
71 | - return this->array[static_cast<unsigned char> (*(it++))]; | |
77 | + return this->array[static_cast<unsigned char> (*it++)]; | |
72 | 78 | } |
73 | 79 | |
74 | 80 | void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { |
75 | 81 | if (cp < this->codepoint2Char.size()) { |
76 | 82 | result.push_back(this->codepoint2Char[cp]); |
77 | - } else { | |
83 | + } | |
84 | + else { | |
78 | 85 | result.push_back(DEFAULT_UNDEFINED_CHAR); |
79 | 86 | } |
80 | 87 | } |
... | ... |
morfeusz/charset/CharsetConverter.hpp
... | ... | @@ -15,7 +15,7 @@ |
15 | 15 | |
16 | 16 | class CharsetConverter { |
17 | 17 | public: |
18 | - virtual uint32_t peek(const char*& it, const char* end) const = 0; | |
18 | + uint32_t peek(const char* it, const char* end) const; | |
19 | 19 | virtual uint32_t next(const char*& it, const char* end) const = 0; |
20 | 20 | virtual void append(uint32_t cp, std::string& result) const = 0; |
21 | 21 | virtual std::string fromUTF8(const std::string& input) const; |
... | ... | @@ -28,7 +28,6 @@ private: |
28 | 28 | |
29 | 29 | class UTF8CharsetConverter : public CharsetConverter { |
30 | 30 | public: |
31 | - uint32_t peek(const char*& it, const char* end) const; | |
32 | 31 | uint32_t next(const char*& it, const char* end) const; |
33 | 32 | void append(uint32_t cp, std::string& result) const; |
34 | 33 | // std::string fromUTF8(const std::string& input) const; |
... | ... | @@ -41,7 +40,6 @@ private: |
41 | 40 | class OneByteCharsetConverter : public CharsetConverter { |
42 | 41 | public: |
43 | 42 | explicit OneByteCharsetConverter(const uint32_t* array); |
44 | - uint32_t peek(const char*& it, const char* end) const; | |
45 | 43 | uint32_t next(const char*& it, const char* end) const; |
46 | 44 | void append(uint32_t cp, std::string& result) const; |
47 | 45 | private: |
... | ... |
morfeusz/cli/cli.cpp
... | ... | @@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { |
139 | 139 | cerr << "setting case sensitive to FALSE" << endl; |
140 | 140 | morfeusz.setCaseSensitive(false); |
141 | 141 | } |
142 | -#ifdef _WIN32 | |
143 | - morfeusz.setCharset(CP852); | |
144 | -#endif | |
145 | -#ifdef _WIN64 | |
142 | +#if defined(_WIN64) || defined(_WIN32) | |
146 | 143 | morfeusz.setCharset(CP852); |
147 | 144 | #endif |
148 | 145 | } |
... | ... |
morfeusz/cli/cli.hpp
... | ... | @@ -8,14 +8,9 @@ |
8 | 8 | #ifndef CLI_HPP |
9 | 9 | #define CLI_HPP |
10 | 10 | |
11 | -#ifdef _WIN64 | |
11 | +#if defined(_WIN64) || defined(_WIN32) | |
12 | 12 | #define TMPDUPA_IN IN |
13 | 13 | #define IN IN |
14 | -#else | |
15 | -#ifdef _WIN32 | |
16 | -#define TMPDUPA_IN IN | |
17 | -#define IN IN | |
18 | -#endif | |
19 | 14 | #endif |
20 | 15 | |
21 | 16 | #include <iostream> |
... | ... | @@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz); |
40 | 35 | |
41 | 36 | #pragma GCC diagnostic pop |
42 | 37 | |
43 | -#ifdef _WIN64 | |
38 | +#if defined(_WIN64) || defined(_WIN32) | |
44 | 39 | #define IN TMPDUPA_IN |
45 | -#else | |
46 | -#ifdef _WIN32 | |
47 | -#define IN TMPDUPA_IN | |
48 | -#endif | |
49 | 40 | #endif |
50 | 41 | |
51 | 42 | #endif /* CLI_HPP */ |
... | ... |
morfeusz/deserializationUtils.hpp
... | ... | @@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) { |
27 | 27 | return res; |
28 | 28 | } |
29 | 29 | |
30 | +inline uint32_t readInt32Const(const unsigned char* const currPtr) { | |
31 | + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr)); | |
32 | + return res; | |
33 | +} | |
34 | + | |
30 | 35 | inline std::string readString(const unsigned char*& currPtr) { |
31 | 36 | std::string res((const char*) currPtr); |
32 | 37 | currPtr += res.length(); |
... | ... |
morfeusz/endianness.hpp
... | ... | @@ -8,15 +8,11 @@ |
8 | 8 | #ifndef ENDIANNESS_HPP |
9 | 9 | #define ENDIANNESS_HPP |
10 | 10 | |
11 | -#ifdef _WIN64 | |
12 | -#include <winsock2.h> | |
13 | -#else | |
14 | -#ifdef _WIN32 | |
11 | +#if defined(_WIN64) || defined(_WIN32) | |
15 | 12 | #include <winsock2.h> |
16 | 13 | #else |
17 | 14 | #include <netinet/in.h> |
18 | 15 | #endif |
19 | -#endif | |
20 | 16 | |
21 | 17 | #endif /* ENDIANNESS_HPP */ |
22 | 18 | |
... | ... |
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
78 | 78 | |
79 | 79 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
80 | 80 | |
81 | -// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); | |
82 | - | |
83 | 81 | const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; |
84 | 82 | switch (implementationNum) { |
85 | 83 | case 0: |
... | ... |
morfeusz/fsa/simplefsa_impl.hpp
morfeusz/segrules/segrules.cpp
... | ... | @@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) { |
14 | 14 | static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) { |
15 | 15 | const unsigned char* additionalDataPtr = ptr |
16 | 16 | + FSA_DATA_OFFSET |
17 | - + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | |
18 | - const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; | |
17 | + + readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); | |
18 | + const unsigned char* res = additionalDataPtr + readInt32Const(additionalDataPtr) + 4; | |
19 | 19 | return res; |
20 | 20 | } |
21 | 21 | |
22 | 22 | static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { |
23 | -// const unsigned char* additionalDataPtr = ptr | |
24 | -// + FSA_DATA_OFFSET | |
25 | -// + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | |
26 | -// const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | |
27 | 23 | const unsigned char* res = getSeparatorsListPtr(ptr); |
28 | 24 | skipSeparatorsList(res); |
29 | 25 | return res; |
... | ... | @@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA( |
84 | 80 | vector<uint32_t> getSeparatorsList(const unsigned char* ptr) { |
85 | 81 | ptr = getSeparatorsListPtr(ptr); |
86 | 82 | vector<uint32_t> res; |
87 | - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); | |
88 | - ptr += 2; | |
83 | + uint16_t listSize = readInt16(ptr); | |
89 | 84 | for (unsigned int i = 0; i < listSize; i++) { |
90 | - res.push_back(ntohl(*reinterpret_cast<const uint32_t*>(ptr))); | |
91 | - ptr += 4; | |
85 | + res.push_back(readInt32(ptr)); | |
92 | 86 | } |
93 | 87 | return res; |
94 | 88 | } |
... | ... |