Commit 445dc6d2c31b6eaa134dfbea0f362f672b5716ee

Authored by Michał Lenart
1 parent d51aa0dd

ciąg dalszy porządków

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -127,18 +127,15 @@ class Encoder(object): @@ -127,18 +127,15 @@ class Encoder(object):
127 if isAnalyzer: 127 if isAnalyzer:
128 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) 128 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
129 else: 129 else:
130 - serializeString(interp.homonymId, encodedInterpsList)  
131 - serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) 130 + encodedInterpsList.extend(serializeString(interp.homonymId))
  131 + encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd))
132 encodedInterpsList.append(interp.encodedForm.cutLength) 132 encodedInterpsList.append(interp.encodedForm.cutLength)
133 - serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) 133 + encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd))
134 if isAnalyzer: 134 if isAnalyzer:
135 encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) 135 encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
136 encodedInterpsList.extend(htons(interp.tagnum)) 136 encodedInterpsList.extend(htons(interp.tagnum))
137 encodedInterpsList.append(interp.namenum) 137 encodedInterpsList.append(interp.namenum)
138 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) 138 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
139 -  
140 - if interp.encodedForm.suffixToAdd == 'bc':  
141 - print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]  
142 139
143 res.extend(htons(len(encodedInterpsList))) 140 res.extend(htons(len(encodedInterpsList)))
144 res.extend(encodedInterpsList) 141 res.extend(encodedInterpsList)
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer): @@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer):
271 return self.useArrays and state.serializeAsArray 271 return self.useArrays and state.serializeAsArray
272 272
273 def stateData2bytearray(self, state): 273 def stateData2bytearray(self, state):
274 - assert state.transitionsNum < 64 274 +# assert state.transitionsNum < 64
275 res = bytearray() 275 res = bytearray()
276 firstByte = 0 276 firstByte = 0
277 if state.isAccepting(): 277 if state.isAccepting():
278 firstByte |= self.ACCEPTING_FLAG 278 firstByte |= self.ACCEPTING_FLAG
279 if self.stateShouldBeAnArray(state): 279 if self.stateShouldBeAnArray(state):
280 firstByte |= self.ARRAY_FLAG 280 firstByte |= self.ARRAY_FLAG
281 - firstByte |= state.transitionsNum  
282 - assert firstByte < 256 and firstByte > 0  
283 - res.append(firstByte) 281 + if state.transitionsNum < 63:
  282 + firstByte |= state.transitionsNum
  283 + res.append(firstByte)
  284 + else:
  285 + firstByte |= 63
  286 + res.append(firstByte)
  287 + res.append(state.transitionsNum)
  288 +
284 if state.isAccepting(): 289 if state.isAccepting():
285 res.extend(state.encodedData) 290 res.extend(state.encodedData)
286 return res 291 return res
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -23,6 +23,8 @@ def htonl(n): @@ -23,6 +23,8 @@ def htonl(n):
23 res.append(n & 0x000000FF) 23 res.append(n & 0x000000FF)
24 return res 24 return res
25 25
26 -def serializeString(string, out): 26 +def serializeString(string):
  27 + out = bytearray()
27 out.extend(string.encode('utf8')) 28 out.extend(string.encode('utf8'))
28 out.append(0) 29 out.append(0)
  30 + return out
morfeusz/CMakeLists.txt
@@ -9,7 +9,7 @@ add_custom_command ( @@ -9,7 +9,7 @@ add_custom_command (
9 ) 9 )
10 add_custom_command ( 10 add_custom_command (
11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" 11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
12 - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE 12 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
13 DEPENDS "${INPUT_DICTIONARY}" 13 DEPENDS "${INPUT_DICTIONARY}"
14 COMMENT "Building default dictionary C++ file" 14 COMMENT "Building default dictionary C++ file"
15 ) 15 )
morfeusz/Morfeusz.cpp
@@ -74,7 +74,7 @@ void Morfeusz::processOneWord( @@ -74,7 +74,7 @@ void Morfeusz::processOneWord(
74 std::vector<MorphInterpretation>& results, 74 std::vector<MorphInterpretation>& results,
75 bool insideIgnHandler) const { 75 bool insideIgnHandler) const {
76 while (inputStart != inputEnd 76 while (inputStart != inputEnd
77 - && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { 77 + && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
78 env.getCharsetConverter().next(inputStart, inputEnd); 78 env.getCharsetConverter().next(inputStart, inputEnd);
79 } 79 }
80 vector<InterpretedChunk> accum; 80 vector<InterpretedChunk> accum;
@@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord( @@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord(
154 154
155 StateType state = env.getFSA().getInitialState(); 155 StateType state = env.getFSA().getInitialState();
156 156
157 - while (!isEndOfWord(codepoint)) { 157 + while (!isWhitespace(codepoint)) {
158 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER 158 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
159 ? env.getCaseConverter().toLower(codepoint) 159 ? env.getCaseConverter().toLower(codepoint)
160 : codepoint; 160 : codepoint;
@@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord( @@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord(
210 doShiftOrth(accum.back(), ic); 210 doShiftOrth(accum.back(), ic);
211 } 211 }
212 accum.push_back(ic); 212 accum.push_back(ic);
213 - if (isEndOfWord(codepoint) 213 + if (isWhitespace(codepoint)
214 && newSegrulesState.accepting) { 214 && newSegrulesState.accepting) {
215 if (this->options.debug) { 215 if (this->options.debug) {
216 cerr << "ACCEPTING " << debugAccum(accum) << endl; 216 cerr << "ACCEPTING " << debugAccum(accum) << endl;
217 } 217 }
218 graph.addPath(accum, newSegrulesState.weak); 218 graph.addPath(accum, newSegrulesState.weak);
219 } 219 }
220 - else if (!isEndOfWord(codepoint)) { 220 + else if (!isWhitespace(codepoint)) {
221 // cerr << "will process " << currInput << endl; 221 // cerr << "will process " << currInput << endl;
222 const char* newCurrInput = currInput; 222 const char* newCurrInput = currInput;
223 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); 223 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
@@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord( @@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord(
227 } 227 }
228 } 228 }
229 } 229 }
230 - codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); 230 + codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
231 } 231 }
232 inputData = currInput; 232 inputData = currInput;
233 } 233 }
morfeusz/Tagset.cpp
@@ -9,7 +9,7 @@ @@ -9,7 +9,7 @@
9 using namespace std; 9 using namespace std;
10 10
11 Tagset::Tagset(const unsigned char* ptr) { 11 Tagset::Tagset(const unsigned char* ptr) {
12 - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); 12 + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
13 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; 13 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
14 readTags(currPtr, this->tags); 14 readTags(currPtr, this->tags);
15 readTags(currPtr, this->names); 15 readTags(currPtr, this->names);
morfeusz/charset/charset_utils.hpp
@@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() { @@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() {
48 return res; 48 return res;
49 } 49 }
50 50
51 -inline bool isEndOfWord(uint32_t codepoint) { 51 +inline bool isWhitespace(uint32_t codepoint) {
52 static std::vector<uint32_t> whitespaces(initializeWhitespaces()); 52 static std::vector<uint32_t> whitespaces(initializeWhitespaces());
53 return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); 53 return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint);
54 - // return whitespaces.count(codepoint);  
55 } 54 }
56 55
57 template <class StateClass> 56 template <class StateClass>
morfeusz/fsa/cfsa1_impl.hpp
@@ -11,35 +11,57 @@ @@ -11,35 +11,57 @@
11 #include <vector> 11 #include <vector>
12 12
13 #include "fsa.hpp" 13 #include "fsa.hpp"
  14 +#include "../deserializationUtils.hpp"
14 15
15 using namespace std; 16 using namespace std;
16 17
17 -#pragma pack(push, 1) /* push current alignment to stack */ 18 +static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
  19 +static const unsigned char CFSA1_ARRAY_FLAG = 64;
  20 +static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63;
  21 +
  22 +static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;
  23 +
  24 +static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
18 25
19 struct StateData2 { 26 struct StateData2 {
20 - unsigned transitionsNum: 6;  
21 - unsigned array : 1;  
22 - unsigned accepting : 1; 27 + unsigned int transitionsNum;
  28 + bool isArray;
  29 + bool isAccepting;
23 }; 30 };
24 31
25 struct TransitionData2 { 32 struct TransitionData2 {
26 - unsigned offsetSize : 2;  
27 - unsigned shortLabel : 6; 33 + unsigned int offsetSize;
  34 + unsigned int shortLabel;
28 }; 35 };
29 36
  37 +static inline StateData2 readStateData(const unsigned char*& ptr) {
  38 + StateData2 res;
  39 + unsigned char firstByte = readInt8(ptr);
  40 + res.isArray = firstByte & CFSA1_ARRAY_FLAG;
  41 + res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
  42 + res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
  43 + if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
  44 + res.transitionsNum = readInt8(ptr);
  45 + }
  46 + return res;
  47 +}
30 48
31 -#pragma pack(pop) /* restore original alignment from stack */  
32 -  
33 -static const unsigned int INITIAL_STATE_OFFSET = 257; 49 +static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
  50 + TransitionData2 res;
  51 + unsigned char firstByte = readInt8(ptr);
  52 + res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
  53 + res.shortLabel = firstByte >> 2;
  54 + return res;
  55 +}
34 56
35 template <class T> 57 template <class T>
36 vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { 58 vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
37 - return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET); 59 + return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
38 } 60 }
39 61
40 template <class T> 62 template <class T>
41 CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer) 63 CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
42 -: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer), 64 +: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
43 label2ShortLabel(initializeChar2PopularCharIdx(ptr)) { 65 label2ShortLabel(initializeChar2PopularCharIdx(ptr)) {
44 } 66 }
45 67
@@ -52,10 +74,12 @@ template <class T> @@ -52,10 +74,12 @@ template <class T>
52 void CompressedFSA1<T>::reallyDoProceed( 74 void CompressedFSA1<T>::reallyDoProceed(
53 const unsigned char* statePtr, 75 const unsigned char* statePtr,
54 State<T>& state) const { 76 State<T>& state) const {
55 - const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr);  
56 - if (sd->accepting) { 77 + const unsigned char* currPtr = statePtr;
  78 + const StateData2 sd = readStateData(currPtr);
  79 + if (sd.isAccepting) {
57 T object; 80 T object;
58 - long size = this->deserializer.deserialize(statePtr + 1, object); 81 + long size = this->deserializer.deserialize(currPtr, object);
  82 + // long size = this->deserializer.deserialize(statePtr + 1, object);
59 state.setNext(statePtr - this->initialStatePtr, object, size); 83 state.setNext(statePtr - this->initialStatePtr, object, size);
60 } 84 }
61 else { 85 else {
@@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList( @@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList(
70 const unsigned char* ptr, 94 const unsigned char* ptr,
71 const unsigned int transitionsNum, 95 const unsigned int transitionsNum,
72 State<T>& state) const { 96 State<T>& state) const {
73 - register const unsigned char* currPtr = ptr; 97 + const unsigned char* currPtr = ptr;
74 // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); 98 // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset);
75 bool found = false; 99 bool found = false;
76 TransitionData2 td; 100 TransitionData2 td;
77 for (unsigned int i = 0; i < transitionsNum; i++) { 101 for (unsigned int i = 0; i < transitionsNum; i++) {
78 - // const_cast<Counter*>(&counter)->increment(1);  
79 - td = *(reinterpret_cast<const TransitionData2*>(currPtr)); 102 + td = readTransitionFirstByte(currPtr);
80 if (td.shortLabel == shortLabel) { 103 if (td.shortLabel == shortLabel) {
81 if (shortLabel == 0) { 104 if (shortLabel == 0) {
82 - currPtr++;  
83 - char label = (char) *currPtr; 105 + char label = static_cast<char>(readInt8(currPtr));
84 if (label == c) { 106 if (label == c) {
85 found = true; 107 found = true;
86 break; 108 break;
87 } 109 }
88 else { 110 else {
89 - currPtr += td.offsetSize + 1; 111 + currPtr += td.offsetSize;
90 } 112 }
91 - } else { 113 + }
  114 + else {
92 found = true; 115 found = true;
93 break; 116 break;
94 } 117 }
95 - } 118 + }
96 else { 119 else {
97 if (td.shortLabel == 0) { 120 if (td.shortLabel == 0) {
98 currPtr++; 121 currPtr++;
99 } 122 }
100 - currPtr += td.offsetSize + 1; 123 + currPtr += td.offsetSize;
101 } 124 }
102 } 125 }
103 if (!found) { 126 if (!found) {
104 state.setNextAsSink(); 127 state.setNextAsSink();
105 - } 128 + }
106 else { 129 else {
107 - currPtr++; 130 + uint32_t offset;
108 switch (td.offsetSize) { 131 switch (td.offsetSize) {
109 case 0: 132 case 0:
  133 + offset = 0;
110 break; 134 break;
111 case 1: 135 case 1:
112 - currPtr += *currPtr + 1; 136 + offset = readInt8(currPtr);
113 break; 137 break;
114 case 2: 138 case 2:
115 - currPtr += ntohs(*((const uint16_t*) currPtr)) + 2; 139 + offset = readInt16(currPtr);
116 break; 140 break;
117 case 3: 141 case 3:
118 - currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3;  
119 - break; 142 + offset = readInt16(currPtr);
  143 + offset <<= 8;
  144 + offset += readInt8(currPtr);
  145 + break;
120 } 146 }
  147 + currPtr += offset;
121 reallyDoProceed(currPtr, state); 148 reallyDoProceed(currPtr, state);
122 } 149 }
123 } 150 }
@@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray( @@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray(
139 166
140 template <class T> 167 template <class T>
141 void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { 168 void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
142 - const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); 169 + const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
143 unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; 170 unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
144 - unsigned long transitionsTableOffset = 1; 171 + // unsigned long transitionsTableOffset = 1;
  172 + const StateData2 sd = readStateData(currPtr);
145 if (state.isAccepting()) { 173 if (state.isAccepting()) {
146 - transitionsTableOffset += state.getValueSize(); 174 + currPtr += state.getValueSize();
147 } 175 }
148 - const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer);  
149 - if (sd->array) { 176 +
  177 + if (sd.isArray) {
150 if (shortLabel > 0) { 178 if (shortLabel > 0) {
151 this->doProceedToNextByArray( 179 this->doProceedToNextByArray(
152 shortLabel, 180 shortLabel,
153 - reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset), 181 + reinterpret_cast<const uint32_t*> (currPtr),
154 state); 182 state);
155 } 183 }
156 else { 184 else {
157 - reallyDoProceed(fromPointer + transitionsTableOffset + 256, state); 185 + reallyDoProceed(currPtr + 256, state);
158 proceedToNext(c, state); 186 proceedToNext(c, state);
159 } 187 }
160 - } 188 + }
161 else { 189 else {
162 this->doProceedToNextByList( 190 this->doProceedToNextByList(
163 c, 191 c,
164 shortLabel, 192 shortLabel,
165 - fromPointer + transitionsTableOffset,  
166 - sd->transitionsNum, 193 + currPtr,
  194 + sd.transitionsNum,
167 state); 195 state);
168 } 196 }
169 } 197 }