Commit 445dc6d2c31b6eaa134dfbea0f362f672b5716ee

Authored by Michał Lenart
1 parent d51aa0dd

ciąg dalszy porządków

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -127,18 +127,15 @@ class Encoder(object):
127 127 if isAnalyzer:
128 128 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
129 129 else:
130   - serializeString(interp.homonymId, encodedInterpsList)
131   - serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
  130 + encodedInterpsList.extend(serializeString(interp.homonymId))
  131 + encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd))
132 132 encodedInterpsList.append(interp.encodedForm.cutLength)
133   - serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
  133 + encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd))
134 134 if isAnalyzer:
135 135 encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
136 136 encodedInterpsList.extend(htons(interp.tagnum))
137 137 encodedInterpsList.append(interp.namenum)
138 138 encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
139   -
140   - if interp.encodedForm.suffixToAdd == 'bc':
141   - print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
142 139  
143 140 res.extend(htons(len(encodedInterpsList)))
144 141 res.extend(encodedInterpsList)
... ...
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer):
271 271 return self.useArrays and state.serializeAsArray
272 272  
273 273 def stateData2bytearray(self, state):
274   - assert state.transitionsNum < 64
  274 +# assert state.transitionsNum < 64
275 275 res = bytearray()
276 276 firstByte = 0
277 277 if state.isAccepting():
278 278 firstByte |= self.ACCEPTING_FLAG
279 279 if self.stateShouldBeAnArray(state):
280 280 firstByte |= self.ARRAY_FLAG
281   - firstByte |= state.transitionsNum
282   - assert firstByte < 256 and firstByte > 0
283   - res.append(firstByte)
  281 + if state.transitionsNum < 63:
  282 + firstByte |= state.transitionsNum
  283 + res.append(firstByte)
  284 + else:
  285 + firstByte |= 63
  286 + res.append(firstByte)
  287 + res.append(state.transitionsNum)
  288 +
284 289 if state.isAccepting():
285 290 res.extend(state.encodedData)
286 291 return res
... ...
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
... ... @@ -23,6 +23,8 @@ def htonl(n):
23 23 res.append(n & 0x000000FF)
24 24 return res
25 25  
26   -def serializeString(string, out):
  26 +def serializeString(string):
  27 + out = bytearray()
27 28 out.extend(string.encode('utf8'))
28 29 out.append(0)
  30 + return out
... ...
morfeusz/CMakeLists.txt
... ... @@ -9,7 +9,7 @@ add_custom_command (
9 9 )
10 10 add_custom_command (
11 11 OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}"
12   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE
  12 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
13 13 DEPENDS "${INPUT_DICTIONARY}"
14 14 COMMENT "Building default dictionary C++ file"
15 15 )
... ...
morfeusz/Morfeusz.cpp
... ... @@ -74,7 +74,7 @@ void Morfeusz::processOneWord(
74 74 std::vector<MorphInterpretation>& results,
75 75 bool insideIgnHandler) const {
76 76 while (inputStart != inputEnd
77   - && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
  77 + && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
78 78 env.getCharsetConverter().next(inputStart, inputEnd);
79 79 }
80 80 vector<InterpretedChunk> accum;
... ... @@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord(
154 154  
155 155 StateType state = env.getFSA().getInitialState();
156 156  
157   - while (!isEndOfWord(codepoint)) {
  157 + while (!isWhitespace(codepoint)) {
158 158 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
159 159 ? env.getCaseConverter().toLower(codepoint)
160 160 : codepoint;
... ... @@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord(
210 210 doShiftOrth(accum.back(), ic);
211 211 }
212 212 accum.push_back(ic);
213   - if (isEndOfWord(codepoint)
  213 + if (isWhitespace(codepoint)
214 214 && newSegrulesState.accepting) {
215 215 if (this->options.debug) {
216 216 cerr << "ACCEPTING " << debugAccum(accum) << endl;
217 217 }
218 218 graph.addPath(accum, newSegrulesState.weak);
219 219 }
220   - else if (!isEndOfWord(codepoint)) {
  220 + else if (!isWhitespace(codepoint)) {
221 221 // cerr << "will process " << currInput << endl;
222 222 const char* newCurrInput = currInput;
223 223 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
... ... @@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord(
227 227 }
228 228 }
229 229 }
230   - codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
  230 + codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
231 231 }
232 232 inputData = currInput;
233 233 }
... ...
morfeusz/Tagset.cpp
... ... @@ -9,7 +9,7 @@
9 9 using namespace std;
10 10  
11 11 Tagset::Tagset(const unsigned char* ptr) {
12   - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
  12 + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET);
13 13 const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4;
14 14 readTags(currPtr, this->tags);
15 15 readTags(currPtr, this->names);
... ...
morfeusz/charset/charset_utils.hpp
... ... @@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() {
48 48 return res;
49 49 }
50 50  
51   -inline bool isEndOfWord(uint32_t codepoint) {
  51 +inline bool isWhitespace(uint32_t codepoint) {
52 52 static std::vector<uint32_t> whitespaces(initializeWhitespaces());
53 53 return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint);
54   - // return whitespaces.count(codepoint);
55 54 }
56 55  
57 56 template <class StateClass>
... ...
morfeusz/fsa/cfsa1_impl.hpp
... ... @@ -11,35 +11,57 @@
11 11 #include <vector>
12 12  
13 13 #include "fsa.hpp"
  14 +#include "../deserializationUtils.hpp"
14 15  
15 16 using namespace std;
16 17  
17   -#pragma pack(push, 1) /* push current alignment to stack */
  18 +static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
  19 +static const unsigned char CFSA1_ARRAY_FLAG = 64;
  20 +static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63;
  21 +
  22 +static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;
  23 +
  24 +static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
18 25  
19 26 struct StateData2 {
20   - unsigned transitionsNum: 6;
21   - unsigned array : 1;
22   - unsigned accepting : 1;
  27 + unsigned int transitionsNum;
  28 + bool isArray;
  29 + bool isAccepting;
23 30 };
24 31  
25 32 struct TransitionData2 {
26   - unsigned offsetSize : 2;
27   - unsigned shortLabel : 6;
  33 + unsigned int offsetSize;
  34 + unsigned int shortLabel;
28 35 };
29 36  
  37 +static inline StateData2 readStateData(const unsigned char*& ptr) {
  38 + StateData2 res;
  39 + unsigned char firstByte = readInt8(ptr);
  40 + res.isArray = firstByte & CFSA1_ARRAY_FLAG;
  41 + res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
  42 + res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
  43 + if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
  44 + res.transitionsNum = readInt8(ptr);
  45 + }
  46 + return res;
  47 +}
30 48  
31   -#pragma pack(pop) /* restore original alignment from stack */
32   -
33   -static const unsigned int INITIAL_STATE_OFFSET = 257;
  49 +static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
  50 + TransitionData2 res;
  51 + unsigned char firstByte = readInt8(ptr);
  52 + res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
  53 + res.shortLabel = firstByte >> 2;
  54 + return res;
  55 +}
34 56  
35 57 template <class T>
36 58 vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
37   - return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET);
  59 + return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
38 60 }
39 61  
40 62 template <class T>
41 63 CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
42   -: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer),
  64 +: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
43 65 label2ShortLabel(initializeChar2PopularCharIdx(ptr)) {
44 66 }
45 67  
... ... @@ -52,10 +74,12 @@ template <class T>
52 74 void CompressedFSA1<T>::reallyDoProceed(
53 75 const unsigned char* statePtr,
54 76 State<T>& state) const {
55   - const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr);
56   - if (sd->accepting) {
  77 + const unsigned char* currPtr = statePtr;
  78 + const StateData2 sd = readStateData(currPtr);
  79 + if (sd.isAccepting) {
57 80 T object;
58   - long size = this->deserializer.deserialize(statePtr + 1, object);
  81 + long size = this->deserializer.deserialize(currPtr, object);
  82 + // long size = this->deserializer.deserialize(statePtr + 1, object);
59 83 state.setNext(statePtr - this->initialStatePtr, object, size);
60 84 }
61 85 else {
... ... @@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList(
70 94 const unsigned char* ptr,
71 95 const unsigned int transitionsNum,
72 96 State<T>& state) const {
73   - register const unsigned char* currPtr = ptr;
  97 + const unsigned char* currPtr = ptr;
74 98 // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset);
75 99 bool found = false;
76 100 TransitionData2 td;
77 101 for (unsigned int i = 0; i < transitionsNum; i++) {
78   - // const_cast<Counter*>(&counter)->increment(1);
79   - td = *(reinterpret_cast<const TransitionData2*>(currPtr));
  102 + td = readTransitionFirstByte(currPtr);
80 103 if (td.shortLabel == shortLabel) {
81 104 if (shortLabel == 0) {
82   - currPtr++;
83   - char label = (char) *currPtr;
  105 + char label = static_cast<char>(readInt8(currPtr));
84 106 if (label == c) {
85 107 found = true;
86 108 break;
87 109 }
88 110 else {
89   - currPtr += td.offsetSize + 1;
  111 + currPtr += td.offsetSize;
90 112 }
91   - } else {
  113 + }
  114 + else {
92 115 found = true;
93 116 break;
94 117 }
95   - }
  118 + }
96 119 else {
97 120 if (td.shortLabel == 0) {
98 121 currPtr++;
99 122 }
100   - currPtr += td.offsetSize + 1;
  123 + currPtr += td.offsetSize;
101 124 }
102 125 }
103 126 if (!found) {
104 127 state.setNextAsSink();
105   - }
  128 + }
106 129 else {
107   - currPtr++;
  130 + uint32_t offset;
108 131 switch (td.offsetSize) {
109 132 case 0:
  133 + offset = 0;
110 134 break;
111 135 case 1:
112   - currPtr += *currPtr + 1;
  136 + offset = readInt8(currPtr);
113 137 break;
114 138 case 2:
115   - currPtr += ntohs(*((const uint16_t*) currPtr)) + 2;
  139 + offset = readInt16(currPtr);
116 140 break;
117 141 case 3:
118   - currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3;
119   - break;
  142 + offset = readInt16(currPtr);
  143 + offset <<= 8;
  144 + offset += readInt8(currPtr);
  145 + break;
120 146 }
  147 + currPtr += offset;
121 148 reallyDoProceed(currPtr, state);
122 149 }
123 150 }
... ... @@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray(
139 166  
140 167 template <class T>
141 168 void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
142   - const unsigned char* fromPointer = this->initialStatePtr + state.getOffset();
  169 + const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
143 170 unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
144   - unsigned long transitionsTableOffset = 1;
  171 + // unsigned long transitionsTableOffset = 1;
  172 + const StateData2 sd = readStateData(currPtr);
145 173 if (state.isAccepting()) {
146   - transitionsTableOffset += state.getValueSize();
  174 + currPtr += state.getValueSize();
147 175 }
148   - const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer);
149   - if (sd->array) {
  176 +
  177 + if (sd.isArray) {
150 178 if (shortLabel > 0) {
151 179 this->doProceedToNextByArray(
152 180 shortLabel,
153   - reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset),
  181 + reinterpret_cast<const uint32_t*> (currPtr),
154 182 state);
155 183 }
156 184 else {
157   - reallyDoProceed(fromPointer + transitionsTableOffset + 256, state);
  185 + reallyDoProceed(currPtr + 256, state);
158 186 proceedToNext(c, state);
159 187 }
160   - }
  188 + }
161 189 else {
162 190 this->doProceedToNextByList(
163 191 c,
164 192 shortLabel,
165   - fromPointer + transitionsTableOffset,
166   - sd->transitionsNum,
  193 + currPtr,
  194 + sd.transitionsNum,
167 195 state);
168 196 }
169 197 }
... ...