Commit d51aa0ddcd3289f7abd65d987df92d721cfa66e0

Authored by Michał Lenart
1 parent 39810b81

poprawienie buga w dodawaniu replacement char, ogólne porządki w kodzie

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@168 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
@@ -5,7 +5,7 @@ project (Morfeusz) @@ -5,7 +5,7 @@ project (Morfeusz)
5 set (Morfeusz_VERSION_MAJOR 2) 5 set (Morfeusz_VERSION_MAJOR 2)
6 set (Morfeusz_VERSION_MINOR 0) 6 set (Morfeusz_VERSION_MINOR 0)
7 set (Morfeusz_VERSION_PATCH 0) 7 set (Morfeusz_VERSION_PATCH 0)
8 -#~ set (CMAKE_BUILD_TYPE "Release") 8 +#~ set (CMAKE_BUILD_TYPE "Debug")
9 9
10 enable_testing() 10 enable_testing()
11 11
morfeusz/CMakeLists.txt
@@ -3,13 +3,13 @@ @@ -3,13 +3,13 @@
3 ########## generate default dictionary data ################# 3 ########## generate default dictionary data #################
4 add_custom_command ( 4 add_custom_command (
5 OUTPUT "${INPUT_DICTIONARY_CPP}" 5 OUTPUT "${INPUT_DICTIONARY_CPP}"
6 - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg 6 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 --trim-supneg
7 DEPENDS "${INPUT_DICTIONARY}" 7 DEPENDS "${INPUT_DICTIONARY}"
8 COMMENT "Building default dictionary C++ file" 8 COMMENT "Building default dictionary C++ file"
9 ) 9 )
10 add_custom_command ( 10 add_custom_command (
11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" 11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
12 - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 12 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE
13 DEPENDS "${INPUT_DICTIONARY}" 13 DEPENDS "${INPUT_DICTIONARY}"
14 COMMENT "Building default dictionary C++ file" 14 COMMENT "Building default dictionary C++ file"
15 ) 15 )
morfeusz/Morfeusz.cpp
@@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) { @@ -66,13 +66,6 @@ void Morfeusz::setGeneratorFile(const string& filename) {
66 Morfeusz::~Morfeusz() { 66 Morfeusz::~Morfeusz() {
67 } 67 }
68 68
69 -string Morfeusz::prepareStringToProcess(const std::string& str) const {  
70 - string res;  
71 - res.reserve(str.size());  
72 - utf8::replace_invalid(str.begin(), str.end(), back_inserter(res));  
73 - return res;  
74 -}  
75 -  
76 void Morfeusz::processOneWord( 69 void Morfeusz::processOneWord(
77 const Environment& env, 70 const Environment& env,
78 const char*& inputStart, 71 const char*& inputStart,
@@ -88,9 +81,9 @@ void Morfeusz::processOneWord( @@ -88,9 +81,9 @@ void Morfeusz::processOneWord(
88 InflexionGraph graph; 81 InflexionGraph graph;
89 const char* currInput = inputStart; 82 const char* currInput = inputStart;
90 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); 83 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
91 - 84 +
92 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); 85 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
93 - 86 +
94 if (!graph.empty()) { 87 if (!graph.empty()) {
95 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); 88 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
96 int srcNode = startNodeNum; 89 int srcNode = startNodeNum;
@@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord( @@ -148,10 +141,10 @@ void Morfeusz::doProcessOneWord(
148 SegrulesState segrulesState, 141 SegrulesState segrulesState,
149 vector<InterpretedChunk>& accum, 142 vector<InterpretedChunk>& accum,
150 InflexionGraph& graph) const { 143 InflexionGraph& graph) const {
151 - // if (this->options.debug) {  
152 - // cerr << "----------" << endl;  
153 - // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;  
154 - // } 144 + if (this->options.debug) {
  145 + cerr << "----------" << endl;
  146 + cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  147 + }
155 // cerr << "doAnalyzeOneWord " << inputData << endl; 148 // cerr << "doAnalyzeOneWord " << inputData << endl;
156 const char* inputStart = inputData; 149 const char* inputStart = inputData;
157 const char* currInput = inputData; 150 const char* currInput = inputData;
@@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const { @@ -303,9 +296,8 @@ ResultsIterator Morfeusz::analyze(const string& text) const {
303 } 296 }
304 297
305 void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { 298 void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const {
306 - string preparedText = this->prepareStringToProcess(text);  
307 - const char* input = preparedText.c_str();  
308 - const char* inputEnd = input + preparedText.length(); 299 + const char* input = text.c_str();
  300 + const char* inputEnd = input + text.length();
309 while (input != inputEnd) { 301 while (input != inputEnd) {
310 int startNode = results.empty() ? 0 : results.back().getEndNode(); 302 int startNode = results.empty() ? 0 : results.back().getEndNode();
311 this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results); 303 this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results);
@@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const { @@ -324,8 +316,7 @@ ResultsIterator Morfeusz::generate(const string& text, int tagnum) const {
324 return ResultsIterator(res); 316 return ResultsIterator(res);
325 } 317 }
326 318
327 -void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const {  
328 - string lemma = this->prepareStringToProcess(text); 319 +void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const {
329 const char* input = lemma.c_str(); 320 const char* input = lemma.c_str();
330 const char* inputEnd = input + lemma.length(); 321 const char* inputEnd = input + lemma.length();
331 int startNode = 0; 322 int startNode = 0;
morfeusz/Morfeusz.hpp
@@ -158,8 +158,6 @@ public: @@ -158,8 +158,6 @@ public:
158 friend class ResultsIterator; 158 friend class ResultsIterator;
159 private: 159 private:
160 160
161 - std::string prepareStringToProcess(const std::string& input) const;  
162 -  
163 void processOneWord( 161 void processOneWord(
164 const Environment& env, 162 const Environment& env,
165 const char*& inputData, 163 const char*& inputData,
morfeusz/MorphDeserializer.cpp
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
11 #include "MorphDeserializer.hpp" 11 #include "MorphDeserializer.hpp"
12 #include "EncodedInterpretation.hpp" 12 #include "EncodedInterpretation.hpp"
13 #include "InterpsGroup.hpp" 13 #include "InterpsGroup.hpp"
  14 +#include "deserializationUtils.hpp"
14 15
15 //const uint8_t LEMMA_ONLY_LOWER = 0; 16 //const uint8_t LEMMA_ONLY_LOWER = 0;
16 //const uint8_t LEMMA_UPPER_PREFIX = 1; 17 //const uint8_t LEMMA_UPPER_PREFIX = 1;
@@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() { @@ -25,16 +26,13 @@ MorphDeserializer::~MorphDeserializer() {
25 26
26 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const { 27 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
27 const unsigned char* currPtr = ptr; 28 const unsigned char* currPtr = ptr;
28 - uint8_t interpTypesNum = *currPtr;  
29 - currPtr++; 29 + uint8_t interpTypesNum = readInt8(currPtr);
30 interps.clear(); 30 interps.clear();
31 interps.reserve(interpTypesNum); 31 interps.reserve(interpTypesNum);
32 for (unsigned int i = 0; i < interpTypesNum; i++) { 32 for (unsigned int i = 0; i < interpTypesNum; i++) {
33 InterpsGroup ig; 33 InterpsGroup ig;
34 - ig.type = *currPtr;  
35 - currPtr++;  
36 - ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr)));  
37 - currPtr += 2; 34 + ig.type = readInt8(currPtr);
  35 + ig.size = readInt16(currPtr);
38 ig.ptr = currPtr; 36 ig.ptr = currPtr;
39 currPtr += ig.size; 37 currPtr += ig.size;
40 interps.push_back(ig); 38 interps.push_back(ig);
morfeusz/Qualifiers.cpp
@@ -14,7 +14,7 @@ using namespace std; @@ -14,7 +14,7 @@ using namespace std;
14 14
15 Qualifiers::Qualifiers(const unsigned char* ptr): 15 Qualifiers::Qualifiers(const unsigned char* ptr):
16 qualifiers() { 16 qualifiers() {
17 - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); 17 + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
18 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; 18 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
19 vector<string> _dupa; 19 vector<string> _dupa;
20 readTags(currPtr, _dupa); 20 readTags(currPtr, _dupa);
@@ -23,7 +23,7 @@ qualifiers() { @@ -23,7 +23,7 @@ qualifiers() {
23 uint16_t allCombinationsSize = readInt16(currPtr); 23 uint16_t allCombinationsSize = readInt16(currPtr);
24 this->qualifiers.reserve(allCombinationsSize); 24 this->qualifiers.reserve(allCombinationsSize);
25 for (unsigned int i = 0; i < allCombinationsSize; i++) { 25 for (unsigned int i = 0; i < allCombinationsSize; i++) {
26 - unsigned char qualsNum = *currPtr++; 26 + unsigned char qualsNum = readInt8(currPtr);
27 vector<string> quals; 27 vector<string> quals;
28 for (unsigned int j = 0; j < qualsNum; j++) { 28 for (unsigned int j = 0; j < qualsNum; j++) {
29 quals.push_back(readString(currPtr)); 29 quals.push_back(readString(currPtr));
morfeusz/charset/CharsetConverter.cpp
@@ -11,7 +11,7 @@ @@ -11,7 +11,7 @@
11 11
12 using namespace std; 12 using namespace std;
13 13
14 -const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7); 14 +const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7);
15 15
16 string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { 16 string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
17 string res; 17 string res;
@@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const { @@ -22,22 +22,31 @@ string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
22 } 22 }
23 23
24 CharsetConverter::~CharsetConverter() { 24 CharsetConverter::~CharsetConverter() {
25 - 25 +
26 } 26 }
27 27
28 -uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const {  
29 - return utf8::unchecked::peek_next(it); 28 +uint32_t CharsetConverter::peek(const char* it, const char* end) const {
  29 + return this->next(it, end);
  30 +}
  31 +
  32 +static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) {
  33 + uint32_t _dupa;
  34 + while (it != end && utf8::internal::validate_next(it, end, _dupa) != utf8::internal::UTF8_OK) {
  35 + it++;
  36 + }
30 } 37 }
31 38
32 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { 39 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
33 - return utf8::unchecked::next(it);  
34 -// catch (utf8::exception ex) {  
35 -// cerr << "WARNING: Replacing invalid character: " << hex << (uint16_t) *it << dec << " with replacement char: 0xFFFD" << endl;  
36 -// while (it != end && peek(it, end) == 0xFFFD) {  
37 -// utf8::unchecked::next(it);  
38 -// }  
39 -// return 0xFFFD;  
40 -// } 40 + uint32_t cp = 0;
  41 + utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
  42 + if (err_code == utf8::internal::UTF8_OK) {
  43 + return cp;
  44 + }
  45 + else {
  46 + cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl;
  47 + iterateThroughInvalidUtf8Sequence(it, end);
  48 + return 0xFFFD;
  49 + }
41 } 50 }
42 51
43 void UTF8CharsetConverter::append(uint32_t cp, string& result) const { 52 void UTF8CharsetConverter::append(uint32_t cp, string& result) const {
@@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) { @@ -52,7 +61,8 @@ static vector<char> reverseArray(const uint32_t* array) {
52 res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR); 61 res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR);
53 res[codepoint] = static_cast<char> (c); 62 res[codepoint] = static_cast<char> (c);
54 c++; 63 c++;
55 - } while (c != 255); 64 + }
  65 + while (c != 255);
56 return res; 66 return res;
57 } 67 }
58 68
@@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) { @@ -63,18 +73,15 @@ codepoint2Char(reverseArray(array)) {
63 73
64 // TODO - sprawdzanie zakresu 74 // TODO - sprawdzanie zakresu
65 75
66 -uint32_t OneByteCharsetConverter::peek(const char*& it, const char* end) const {  
67 - return this->array[static_cast<unsigned char> (*it)];  
68 -}  
69 -  
70 uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const { 76 uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const {
71 - return this->array[static_cast<unsigned char> (*(it++))]; 77 + return this->array[static_cast<unsigned char> (*it++)];
72 } 78 }
73 79
74 void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { 80 void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
75 if (cp < this->codepoint2Char.size()) { 81 if (cp < this->codepoint2Char.size()) {
76 result.push_back(this->codepoint2Char[cp]); 82 result.push_back(this->codepoint2Char[cp]);
77 - } else { 83 + }
  84 + else {
78 result.push_back(DEFAULT_UNDEFINED_CHAR); 85 result.push_back(DEFAULT_UNDEFINED_CHAR);
79 } 86 }
80 } 87 }
morfeusz/charset/CharsetConverter.hpp
@@ -15,7 +15,7 @@ @@ -15,7 +15,7 @@
15 15
16 class CharsetConverter { 16 class CharsetConverter {
17 public: 17 public:
18 - virtual uint32_t peek(const char*& it, const char* end) const = 0; 18 + uint32_t peek(const char* it, const char* end) const;
19 virtual uint32_t next(const char*& it, const char* end) const = 0; 19 virtual uint32_t next(const char*& it, const char* end) const = 0;
20 virtual void append(uint32_t cp, std::string& result) const = 0; 20 virtual void append(uint32_t cp, std::string& result) const = 0;
21 virtual std::string fromUTF8(const std::string& input) const; 21 virtual std::string fromUTF8(const std::string& input) const;
@@ -28,7 +28,6 @@ private: @@ -28,7 +28,6 @@ private:
28 28
29 class UTF8CharsetConverter : public CharsetConverter { 29 class UTF8CharsetConverter : public CharsetConverter {
30 public: 30 public:
31 - uint32_t peek(const char*& it, const char* end) const;  
32 uint32_t next(const char*& it, const char* end) const; 31 uint32_t next(const char*& it, const char* end) const;
33 void append(uint32_t cp, std::string& result) const; 32 void append(uint32_t cp, std::string& result) const;
34 // std::string fromUTF8(const std::string& input) const; 33 // std::string fromUTF8(const std::string& input) const;
@@ -41,7 +40,6 @@ private: @@ -41,7 +40,6 @@ private:
41 class OneByteCharsetConverter : public CharsetConverter { 40 class OneByteCharsetConverter : public CharsetConverter {
42 public: 41 public:
43 explicit OneByteCharsetConverter(const uint32_t* array); 42 explicit OneByteCharsetConverter(const uint32_t* array);
44 - uint32_t peek(const char*& it, const char* end) const;  
45 uint32_t next(const char*& it, const char* end) const; 43 uint32_t next(const char*& it, const char* end) const;
46 void append(uint32_t cp, std::string& result) const; 44 void append(uint32_t cp, std::string& result) const;
47 private: 45 private:
morfeusz/cli/cli.cpp
@@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { @@ -139,10 +139,7 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
139 cerr << "setting case sensitive to FALSE" << endl; 139 cerr << "setting case sensitive to FALSE" << endl;
140 morfeusz.setCaseSensitive(false); 140 morfeusz.setCaseSensitive(false);
141 } 141 }
142 -#ifdef _WIN32  
143 - morfeusz.setCharset(CP852);  
144 -#endif  
145 -#ifdef _WIN64 142 +#if defined(_WIN64) || defined(_WIN32)
146 morfeusz.setCharset(CP852); 143 morfeusz.setCharset(CP852);
147 #endif 144 #endif
148 } 145 }
morfeusz/cli/cli.hpp
@@ -8,14 +8,9 @@ @@ -8,14 +8,9 @@
8 #ifndef CLI_HPP 8 #ifndef CLI_HPP
9 #define CLI_HPP 9 #define CLI_HPP
10 10
11 -#ifdef _WIN64 11 +#if defined(_WIN64) || defined(_WIN32)
12 #define TMPDUPA_IN IN 12 #define TMPDUPA_IN IN
13 #define IN IN 13 #define IN IN
14 -#else  
15 -#ifdef _WIN32  
16 -#define TMPDUPA_IN IN  
17 -#define IN IN  
18 -#endif  
19 #endif 14 #endif
20 15
21 #include <iostream> 16 #include <iostream>
@@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz); @@ -40,12 +35,8 @@ void initializeMorfeusz(ez::ezOptionParser& opt, Morfeusz& morfeusz);
40 35
41 #pragma GCC diagnostic pop 36 #pragma GCC diagnostic pop
42 37
43 -#ifdef _WIN64 38 +#if defined(_WIN64) || defined(_WIN32)
44 #define IN TMPDUPA_IN 39 #define IN TMPDUPA_IN
45 -#else  
46 -#ifdef _WIN32  
47 -#define IN TMPDUPA_IN  
48 -#endif  
49 #endif 40 #endif
50 41
51 #endif /* CLI_HPP */ 42 #endif /* CLI_HPP */
morfeusz/deserializationUtils.hpp
@@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) { @@ -27,6 +27,11 @@ inline uint32_t readInt32(const unsigned char*& currPtr) {
27 return res; 27 return res;
28 } 28 }
29 29
  30 +inline uint32_t readInt32Const(const unsigned char* const currPtr) {
  31 + uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
  32 + return res;
  33 +}
  34 +
30 inline std::string readString(const unsigned char*& currPtr) { 35 inline std::string readString(const unsigned char*& currPtr) {
31 std::string res((const char*) currPtr); 36 std::string res((const char*) currPtr);
32 currPtr += res.length(); 37 currPtr += res.length();
morfeusz/endianness.hpp
@@ -8,15 +8,11 @@ @@ -8,15 +8,11 @@
8 #ifndef ENDIANNESS_HPP 8 #ifndef ENDIANNESS_HPP
9 #define ENDIANNESS_HPP 9 #define ENDIANNESS_HPP
10 10
11 -#ifdef _WIN64  
12 -#include <winsock2.h>  
13 -#else  
14 -#ifdef _WIN32 11 +#if defined(_WIN64) || defined(_WIN32)
15 #include <winsock2.h> 12 #include <winsock2.h>
16 #else 13 #else
17 #include <netinet/in.h> 14 #include <netinet/in.h>
18 #endif 15 #endif
19 -#endif  
20 16
21 #endif /* ENDIANNESS_HPP */ 17 #endif /* ENDIANNESS_HPP */
22 18
morfeusz/fsa/fsa_impl.hpp
@@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial @@ -78,8 +78,6 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
78 78
79 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); 79 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET);
80 80
81 -// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET)));  
82 -  
83 const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; 81 const unsigned char* startPtr = ptr + FSA_DATA_OFFSET;
84 switch (implementationNum) { 82 switch (implementationNum) {
85 case 0: 83 case 0:
morfeusz/fsa/simplefsa_impl.hpp
@@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { @@ -63,7 +63,6 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
63 break; 63 break;
64 } 64 }
65 } 65 }
66 - // const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1);  
67 if (!found) { 66 if (!found) {
68 state.setNextAsSink(); 67 state.setNextAsSink();
69 } 68 }
morfeusz/segrules/segrules.cpp
@@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) { @@ -14,16 +14,12 @@ static inline void skipSeparatorsList(const unsigned char*& ptr) {
14 static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) { 14 static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr) {
15 const unsigned char* additionalDataPtr = ptr 15 const unsigned char* additionalDataPtr = ptr
16 + FSA_DATA_OFFSET 16 + FSA_DATA_OFFSET
17 - + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));  
18 - const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4; 17 + + readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
  18 + const unsigned char* res = additionalDataPtr + readInt32Const(additionalDataPtr) + 4;
19 return res; 19 return res;
20 } 20 }
21 21
22 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { 22 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) {
23 -// const unsigned char* additionalDataPtr = ptr  
24 -// + FSA_DATA_OFFSET  
25 -// + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));  
26 -// const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;  
27 const unsigned char* res = getSeparatorsListPtr(ptr); 23 const unsigned char* res = getSeparatorsListPtr(ptr);
28 skipSeparatorsList(res); 24 skipSeparatorsList(res);
29 return res; 25 return res;
@@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA( @@ -84,11 +80,9 @@ SegrulesFSA* getDefaultSegrulesFSA(
84 vector<uint32_t> getSeparatorsList(const unsigned char* ptr) { 80 vector<uint32_t> getSeparatorsList(const unsigned char* ptr) {
85 ptr = getSeparatorsListPtr(ptr); 81 ptr = getSeparatorsListPtr(ptr);
86 vector<uint32_t> res; 82 vector<uint32_t> res;
87 - uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));  
88 - ptr += 2; 83 + uint16_t listSize = readInt16(ptr);
89 for (unsigned int i = 0; i < listSize; i++) { 84 for (unsigned int i = 0; i < listSize; i++) {
90 - res.push_back(ntohl(*reinterpret_cast<const uint32_t*>(ptr)));  
91 - ptr += 4; 85 + res.push_back(readInt32(ptr));
92 } 86 }
93 return res; 87 return res;
94 } 88 }