Commit d51aa0ddcd3289f7abd65d987df92d721cfa66e0
1 parent: 39810b81
poprawienie buga w dodawaniu replacement char, ogólne porządki w kodzie
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@168 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 15 changed files with 58 additions and 86 deletions
CMakeLists.txt
@@ -5,7 +5,7 @@ project (Morfeusz) | @@ -5,7 +5,7 @@ project (Morfeusz) | ||
5 | set (Morfeusz_VERSION_MAJOR 2) | 5 | set (Morfeusz_VERSION_MAJOR 2) |
6 | set (Morfeusz_VERSION_MINOR 0) | 6 | set (Morfeusz_VERSION_MINOR 0) |
7 | set (Morfeusz_VERSION_PATCH 0) | 7 | set (Morfeusz_VERSION_PATCH 0) |
8 | -#~ set (CMAKE_BUILD_TYPE "Release") | 8 | +#~ set (CMAKE_BUILD_TYPE "Debug") |
9 | 9 | ||
10 | enable_testing() | 10 | enable_testing() |
11 | 11 |
morfeusz/CMakeLists.txt
@@ -3,13 +3,13 @@ | @@ -3,13 +3,13 @@ | ||
3 | ########## generate default dictionary data ################# | 3 | ########## generate default dictionary data ################# |
4 | add_custom_command ( | 4 | add_custom_command ( |
5 | OUTPUT "${INPUT_DICTIONARY_CPP}" | 5 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
6 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | 6 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg |
7 | DEPENDS "${INPUT_DICTIONARY}" | 7 | DEPENDS "${INPUT_DICTIONARY}" |
8 | COMMENT "Building default dictionary C++ file" | 8 | COMMENT "Building default dictionary C++ file" |
9 | ) | 9 | ) |
10 | add_custom_command ( | 10 | add_custom_command ( |
11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | 12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE |
13 | DEPENDS "${INPUT_DICTIONARY}" | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | COMMENT "Building default dictionary C++ file" | 14 | COMMENT "Building default dictionary C++ file" |
15 | ) | 15 | ) |
morfeusz/Morfeusz.cpp
@@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) { | @@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) { | ||
66 | Morfeusz::~Morfeusz() { | 66 | Morfeusz::~Morfeusz() { |
67 | } | 67 | } |
68 | 68 | ||
69 | -string Morfeusz::prepareStringToProcess(const std::string& str) const { | ||
70 | - string res; | ||
71 | - res.reserve(str.size()); | ||
72 | - utf8::replace_invalid(str.begin(), str.end(), back_inserter(res)); | ||
73 | - return res; | ||
74 | -} | ||
75 | - | ||
76 | void Morfeusz::processOneWord( | 69 | void Morfeusz::processOneWord( |
77 | const Environment& env, | 70 | const Environment& env, |
78 | const char*& inputStart, | 71 | const char*& inputStart, |
@@ -88,9 +81,9 @@ void Morfeusz::processOneWord( | @@ -88,9 +81,9 @@ void Morfeusz::processOneWord( | ||
88 | InflexionGraph graph; | 81 | InflexionGraph graph; |
89 | const char* currInput = inputStart; | 82 | const char* currInput = inputStart; |
90 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); | 83 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
91 | - | 84 | + |
92 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); | 85 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
93 | - | 86 | + |
94 | if (!graph.empty()) { | 87 | if (!graph.empty()) { |
95 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); | 88 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
96 | int srcNode = startNodeNum; | 89 | int srcNode = startNodeNum; |
@@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord( | @@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord( | ||
148 | SegrulesState segrulesState, | 141 | SegrulesState segrulesState, |
149 | vector<InterpretedChunk>& accum, | 142 | vector<InterpretedChunk>& accum, |
150 | InflexionGraph& graph) const { | 143 | InflexionGraph& graph) const { |
151 | - // if (this->options.debug) { | ||
152 | - // cerr << "----------" << endl; | ||
153 | - // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | ||
154 | - // } | 144 | + if (this->options.debug) { |
145 | + cerr << "----------" << endl; | ||
146 | + cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | ||
147 | + } | ||
155 | // cerr << "doAnalyzeOneWord " << inputData << endl; | 148 | // cerr << "doAnalyzeOneWord " << inputData << endl; |
156 | const char* inputStart = inputData; | 149 | const char* inputStart = inputData; |
157 | const char* currInput = inputData; | 150 | const char* currInput = inputData; |
@@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const { | @@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const { | ||
303 | } | 296 | } |
304 | 297 | ||
305 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { | 298 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { |
306 | - string preparedText = this->prepareStringToProcess(text); | ||
307 | - const char* input = preparedText.c_str(); | ||
308 | - const char* inputEnd = input + preparedText.length(); | 299 | + const char* input = text.c_str(); |
300 | + const char* inputEnd = input + text.length(); | ||
309 | while (input != inputEnd) { | 301 | while (input != inputEnd) { |
310 | int startNode = results.empty() ? 0 : results.back().getEndNode(); | 302 | int startNode = results.empty() ? 0 : results.back().getEndNode(); |
311 | this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); | 303 | this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); |
@@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const { | @@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const { | ||
324 | return ResultsIterator(res); | 316 | return ResultsIterator(res); |
325 | } | 317 | } |
326 | 318 | ||
327 | -void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const { | ||
328 | - string lemma = this->prepareStringToProcess(text); | 319 | +void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const { |
329 | const char* input = lemma.c_str(); | 320 | const char* input = lemma.c_str(); |
330 | const char* inputEnd = input + lemma.length(); | 321 | const char* inputEnd = input + lemma.length(); |
331 | int startNode = 0; | 322 | int startNode = 0; |
morfeusz/Morfeusz.hpp
@@ -158,8 +158,6 @@ public: | @@ -158,8 +158,6 @@ public: | ||
158 | friend class ResultsIterator; | 158 | friend class ResultsIterator; |
159 | private: | 159 | private: |
160 | 160 | ||
161 | - std::string prepareStringToProcess(const std::string& input) const; | ||
162 | - | ||
163 | void processOneWord( | 161 | void processOneWord( |
164 | const Environment& env, | 162 | const Environment& env, |
165 | const char*& inputData, | 163 | const char*& inputData, |
morfeusz/MorphDeserializer.cpp
@@ -11,6 +11,7 @@ | @@ -11,6 +11,7 @@ | ||
11 | #include "MorphDeserializer.hpp" | 11 | #include "MorphDeserializer.hpp" |
12 | #include "EncodedInterpretation.hpp" | 12 | #include "EncodedInterpretation.hpp" |
13 | #include "InterpsGroup.hpp" | 13 | #include "InterpsGroup.hpp" |
14 | +#include "deserializationUtils.hpp" | ||
14 | 15 | ||
15 | //const uint8_t LEMMA_ONLY_LOWER = 0; | 16 | //const uint8_t LEMMA_ONLY_LOWER = 0; |
16 | //const uint8_t LEMMA_UPPER_PREFIX = 1; | 17 | //const uint8_t LEMMA_UPPER_PREFIX = 1; |
@@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() { | @@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() { | ||
25 | 26 | ||
26 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { | 27 | long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { |
27 | const unsigned char* currPtr = ptr; | 28 | const unsigned char* currPtr = ptr; |
28 | - uint8_t interpTypesNum = *currPtr; | ||
29 | - currPtr++; | 29 | + uint8_t interpTypesNum = readInt8(currPtr); |
30 | interps.clear(); | 30 | interps.clear(); |
31 | interps.reserve(interpTypesNum); | 31 | interps.reserve(interpTypesNum); |
32 | for (unsigned int i = 0; i < interpTypesNum; i++) { | 32 | for (unsigned int i = 0; i < interpTypesNum; i++) { |
33 | InterpsGroup ig; | 33 | InterpsGroup ig; |
34 | - ig.type = *currPtr; | ||
35 | - currPtr++; | ||
36 | - ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr))); | ||
37 | - currPtr += 2; | 34 | + ig.type = readInt8(currPtr); |
35 | + ig.size = readInt16(currPtr); | ||
38 | ig.ptr = currPtr; | 36 | ig.ptr = currPtr; |
39 | currPtr += ig.size; | 37 | currPtr += ig.size; |
40 | interps.push_back(ig); | 38 | interps.push_back(ig); |
morfeusz/Qualifiers.cpp
@@ -14,7 +14,7 @@ using namespace std; | @@ -14,7 +14,7 @@ using namespace std; | ||
14 | 14 | ||
15 | Qualifiers::Qualifiers(const unsigned char* ptr): | 15 | Qualifiers::Qualifiers(const unsigned char* ptr): |
16 | qualifiers() { | 16 | qualifiers() { |
17 | - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | 17 | + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); |
18 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; | 18 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; |
19 | vector<string> _dupa; | 19 | vector<string> _dupa; |
20 | readTags(currPtr, _dupa); | 20 | readTags(currPtr, _dupa); |
@@ -23,7 +23,7 @@ qualifiers() { | @@ -23,7 +23,7 @@ qualifiers() { | ||
23 | uint16_t allCombinationsSize = readInt16(currPtr); | 23 | uint16_t allCombinationsSize = readInt16(currPtr); |
24 | this->qualifiers.reserve(allCombinationsSize); | 24 | this->qualifiers.reserve(allCombinationsSize); |
25 | for (unsigned int i = 0; i < allCombinationsSize; i++) { | 25 | for (unsigned int i = 0; i < allCombinationsSize; i++) { |
26 | - unsigned char qualsNum = *currPtr++; | 26 | + unsigned char qualsNum = readInt8(currPtr); |
27 | vector<string> quals; | 27 | vector<string> quals; |
28 | for (unsigned int j = 0; j < qualsNum; j++) { | 28 | for (unsigned int j = 0; j < qualsNum; j++) { |
29 | quals.push_back(readString(currPtr)); | 29 | quals.push_back(readString(currPtr)); |
morfeusz/charset/CharsetConverter.cpp
@@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
11 | 11 | ||
12 | using namespace std; | 12 | using namespace std; |
13 | 13 | ||
14 | -const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7); | 14 | +const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7); |
15 | 15 | ||
16 | string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { | 16 | string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { |
17 | string res; | 17 | string res; |
@@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { | @@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { | ||
22 | } | 22 | } |
23 | 23 | ||
24 | CharsetConverter::~CharsetConverter() { | 24 | CharsetConverter::~CharsetConverter() { |
25 | - | 25 | + |
26 | } | 26 | } |
27 | 27 | ||
28 | -uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const { | ||
29 | - return utf8::unchecked::peek_next(it); | 28 | +uint32_t CharsetConverter::peek(const char* it, const char* end) const { |
29 | + return this->next(it, end); | ||
30 | +} | ||
31 | + | ||
32 | +static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) { | ||
33 | + uint32_t _dupa; | ||
34 | + while (it != end && utf8::internal::validate_next(it, end, _dupa) != utf8::internal::UTF8_OK) { | ||
35 | + it++; | ||
36 | + } | ||
30 | } | 37 | } |
31 | 38 | ||
32 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { | 39 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
33 | - return utf8::unchecked::next(it); | ||
34 | -// catch (utf8::exception ex) { | ||
35 | -// cerr << "WARNING: Replacing invalid character: " << hex << (uint16_t) *it << dec << " with replacement char: 0xFFFD" << endl; | ||
36 | -// while (it != end && peek(it, end) == 0xFFFD) { | ||
37 | -// utf8::unchecked::next(it); | ||
38 | -// } | ||
39 | -// return 0xFFFD; | ||
40 | -// } | 40 | + uint32_t cp = 0; |
41 | + utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | ||
42 | + if (err_code == utf8::internal::UTF8_OK) { | ||
43 | + return cp; | ||
44 | + } | ||
45 | + else { | ||
46 | + cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl; | ||
47 | + iterateThroughInvalidUtf8Sequence(it, end); | ||
48 | + return 0xFFFD; | ||
49 | + } | ||
41 | } | 50 | } |
42 | 51 | ||
43 | void UTF8CharsetConverter::append(uint32_t cp, string& result) const { | 52 | void UTF8CharsetConverter::append(uint32_t cp, string& result) const { |
@@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) { | @@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) { | ||
52 | res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR); | 61 | res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR); |
53 | res[codepoint] = static_cast<char> (c); | 62 | res[codepoint] = static_cast<char> (c); |
54 | c++; | 63 | c++; |
55 | - } while (c != 255); | 64 | + } |
65 | + while (c != 255); | ||
56 | return res; | 66 | return res; |
57 | } | 67 | } |
58 | 68 | ||
@@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) { | @@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) { | ||
63 | 73 | ||
64 | // TODO - sprawdzanie zakresu | 74 | // TODO - sprawdzanie zakresu |
65 | 75 | ||
66 | -uint32_t OneByteCharsetConverter::peek(const char*& it, const char* end) const { | ||
67 | - return this->array[static_cast<unsigned char> (*it)]; | ||
68 | -} | ||
69 | - | ||
70 | uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const { | 76 | uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const { |
71 | - return this->array[static_cast<unsigned char> (*(it++))]; | 77 | + return this->array[static_cast<unsigned char> (*it++)]; |
72 | } | 78 | } |
73 | 79 | ||
74 | void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { | 80 | void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { |
75 | if (cp < this->codepoint2Char.size()) { | 81 | if (cp < this->codepoint2Char.size()) { |
76 | result.push_back(this->codepoint2Char[cp]); | 82 | result.push_back(this->codepoint2Char[cp]); |
77 | - } else { | 83 | + } |
84 | + else { | ||
78 | result.push_back(DEFAULT_UNDEFINED_CHAR); | 85 | result.push_back(DEFAULT_UNDEFINED_CHAR); |
79 | } | 86 | } |
80 | } | 87 | } |
morfeusz/charset/CharsetConverter.hpp
@@ -15,7 +15,7 @@ | @@ -15,7 +15,7 @@ | ||
15 | 15 | ||
16 | class CharsetConverter { | 16 | class CharsetConverter { |
17 | public: | 17 | public: |
18 | - virtual uint32_t peek(const char*& it, const char* end) const = 0; | 18 | + uint32_t peek(const char* it, const char* end) const; |
19 | virtual uint32_t next(const char*& it, const char* end) const = 0; | 19 | virtual uint32_t next(const char*& it, const char* end) const = 0; |
20 | virtual void append(uint32_t cp, std::string& result) const = 0; | 20 | virtual void append(uint32_t cp, std::string& result) const = 0; |
21 | virtual std::string fromUTF8(const std::string& input) const; | 21 | virtual std::string fromUTF8(const std::string& input) const; |
@@ -28,7 +28,6 @@ private: | @@ -28,7 +28,6 @@ private: | ||
28 | 28 | ||
29 | class UTF8CharsetConverter : public CharsetConverter { | 29 | class UTF8CharsetConverter : public CharsetConverter { |
30 | public: | 30 | public: |
31 | - uint32_t peek(const char*& it, const char* end) const; | ||
32 | uint32_t next(const char*& it, const char* end) const; | 31 | uint32_t next(const char*& it, const char* end) const; |
33 | void append(uint32_t cp, std::string& result) const; | 32 | void append(uint32_t cp, std::string& result) const; |
34 | // std::string fromUTF8(const std::string& input) const; | 33 | // std::string fromUTF8(const std::string& input) const; |
@@ -41,7 +40,6 @@ private: | @@ -41,7 +40,6 @@ private: | ||
41 | class OneByteCharsetConverter : public CharsetConverter { | 40 | class OneByteCharsetConverter : public CharsetConverter { |
42 | public: | 41 | public: |
43 | explicit OneByteCharsetConverter(const uint32_t* array); | 42 | explicit OneByteCharsetConverter(const uint32_t* array); |
44 | - uint32_t peek(const char*& it, const char* end) const; | ||
45 | uint32_t next(const char*& it, const char* end) const; | 43 | uint32_t next(const char*& it, const char* end) const; |
46 | void append(uint32_t cp, std::string& result) const; | 44 | void append(uint32_t cp, std::string& result) const; |
47 | private: | 45 | private: |
morfeusz/cli/cli.cpp
@@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { | @@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { | ||
139 | cerr << "setting case sensitive to FALSE" << endl; | 139 | cerr << "setting case sensitive to FALSE" << endl; |
140 | morfeusz.setCaseSensitive(false); | 140 | morfeusz.setCaseSensitive(false); |
141 | } | 141 | } |
142 | -#ifdef _WIN32 | ||
143 | - morfeusz.setCharset(CP852); | ||
144 | -#endif | ||
145 | -#ifdef _WIN64 | 142 | +#if defined(_WIN64) || defined(_WIN32) |
146 | morfeusz.setCharset(CP852); | 143 | morfeusz.setCharset(CP852); |
147 | #endif | 144 | #endif |
148 | } | 145 | } |
morfeusz/cli/cli.hpp
@@ -8,14 +8,9 @@ | @@ -8,14 +8,9 @@ | ||
8 | #ifndef CLI_HPP | 8 | #ifndef CLI_HPP |
9 | #define CLI_HPP | 9 | #define CLI_HPP |
10 | 10 | ||
11 | -#ifdef _WIN64 | 11 | +#if defined(_WIN64) || defined(_WIN32) |
12 | #define TMPDUPA_IN IN | 12 | #define TMPDUPA_IN IN |
13 | #define IN IN | 13 | #define IN IN |
14 | -#else | ||
15 | -#ifdef _WIN32 | ||
16 | -#define TMPDUPA_IN IN | ||
17 | -#define IN IN | ||
18 | -#endif | ||
19 | #endif | 14 | #endif |
20 | 15 | ||
21 | #include <iostream> | 16 | #include <iostream> |
@@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz); | @@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz); | ||
40 | 35 | ||
41 | #pragma GCC diagnostic pop | 36 | #pragma GCC diagnostic pop |
42 | 37 | ||
43 | -#ifdef _WIN64 | 38 | +#if defined(_WIN64) || defined(_WIN32) |
44 | #define IN TMPDUPA_IN | 39 | #define IN TMPDUPA_IN |
45 | -#else | ||
46 | -#ifdef _WIN32 | ||
47 | -#define IN TMPDUPA_IN | ||
48 | -#endif | ||
49 | #endif | 40 | #endif |
50 | 41 | ||
51 | #endif /* CLI_HPP */ | 42 | #endif /* CLI_HPP */ |
morfeusz/deserializationUtils.hpp
@@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) { | @@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) { | ||
27 | return res; | 27 | return res; |
28 | } | 28 | } |
29 | 29 | ||
30 | +inline uint32_t readInt32Const(const unsigned char* const currPtr) { | ||
31 | + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr)); | ||
32 | + return res; | ||
33 | +} | ||
34 | + | ||
30 | inline std::string readString(const unsigned char*& currPtr) { | 35 | inline std::string readString(const unsigned char*& currPtr) { |
31 | std::string res((const char*) currPtr); | 36 | std::string res((const char*) currPtr); |
32 | currPtr += res.length(); | 37 | currPtr += res.length(); |
morfeusz/endianness.hpp
@@ -8,15 +8,11 @@ | @@ -8,15 +8,11 @@ | ||
8 | #ifndef ENDIANNESS_HPP | 8 | #ifndef ENDIANNESS_HPP |
9 | #define ENDIANNESS_HPP | 9 | #define ENDIANNESS_HPP |
10 | 10 | ||
11 | -#ifdef _WIN64 | ||
12 | -#include <winsock2.h> | ||
13 | -#else | ||
14 | -#ifdef _WIN32 | 11 | +#if defined(_WIN64) || defined(_WIN32) |
15 | #include <winsock2.h> | 12 | #include <winsock2.h> |
16 | #else | 13 | #else |
17 | #include <netinet/in.h> | 14 | #include <netinet/in.h> |
18 | #endif | 15 | #endif |
19 | -#endif | ||
20 | 16 | ||
21 | #endif /* ENDIANNESS_HPP */ | 17 | #endif /* ENDIANNESS_HPP */ |
22 | 18 |
morfeusz/fsa/fsa_impl.hpp
@@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | @@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | ||
78 | 78 | ||
79 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); | 79 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
80 | 80 | ||
81 | -// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); | ||
82 | - | ||
83 | const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; | 81 | const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; |
84 | switch (implementationNum) { | 82 | switch (implementationNum) { |
85 | case 0: | 83 | case 0: |
morfeusz/fsa/simplefsa_impl.hpp
@@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { | @@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { | ||
63 | break; | 63 | break; |
64 | } | 64 | } |
65 | } | 65 | } |
66 | - // const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1); | ||
67 | if (!found) { | 66 | if (!found) { |
68 | state.setNextAsSink(); | 67 | state.setNextAsSink(); |
69 | } | 68 | } |
morfeusz/segrules/segrules.cpp
@@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) { | @@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) { | ||
14 | static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) { | 14 | static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) { |
15 | const unsigned char* additionalDataPtr = ptr | 15 | const unsigned char* additionalDataPtr = ptr |
16 | + FSA_DATA_OFFSET | 16 | + FSA_DATA_OFFSET |
17 | - + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | ||
18 | - const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; | 17 | + + readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); |
18 | + const unsigned char* res = additionalDataPtr + readInt32Const(additionalDataPtr) + 4; | ||
19 | return res; | 19 | return res; |
20 | } | 20 | } |
21 | 21 | ||
22 | static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { | 22 | static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { |
23 | -// const unsigned char* additionalDataPtr = ptr | ||
24 | -// + FSA_DATA_OFFSET | ||
25 | -// + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | ||
26 | -// const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | ||
27 | const unsigned char* res = getSeparatorsListPtr(ptr); | 23 | const unsigned char* res = getSeparatorsListPtr(ptr); |
28 | skipSeparatorsList(res); | 24 | skipSeparatorsList(res); |
29 | return res; | 25 | return res; |
@@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA( | @@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA( | ||
84 | vector<uint32_t> getSeparatorsList(const unsigned char* ptr) { | 80 | vector<uint32_t> getSeparatorsList(const unsigned char* ptr) { |
85 | ptr = getSeparatorsListPtr(ptr); | 81 | ptr = getSeparatorsListPtr(ptr); |
86 | vector<uint32_t> res; | 82 | vector<uint32_t> res; |
87 | - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); | ||
88 | - ptr += 2; | 83 | + uint16_t listSize = readInt16(ptr); |
89 | for (unsigned int i = 0; i < listSize; i++) { | 84 | for (unsigned int i = 0; i < listSize; i++) { |
90 | - res.push_back(ntohl(*reinterpret_cast<const uint32_t*>(ptr))); | ||
91 | - ptr += 4; | 85 | + res.push_back(readInt32(ptr)); |
92 | } | 86 | } |
93 | return res; | 87 | return res; |
94 | } | 88 | } |