Commit 445dc6d2c31b6eaa134dfbea0f362f672b5716ee
1 parent: d51aa0dd
ciąg dalszy porządków
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 8 changed files with 90 additions and 59 deletions
fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -127,18 +127,15 @@ class Encoder(object): | @@ -127,18 +127,15 @@ class Encoder(object): | ||
127 | if isAnalyzer: | 127 | if isAnalyzer: |
128 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) | 128 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) |
129 | else: | 129 | else: |
130 | - serializeString(interp.homonymId, encodedInterpsList) | ||
131 | - serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) | 130 | + encodedInterpsList.extend(serializeString(interp.homonymId)) |
131 | + encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd)) | ||
132 | encodedInterpsList.append(interp.encodedForm.cutLength) | 132 | encodedInterpsList.append(interp.encodedForm.cutLength) |
133 | - serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) | 133 | + encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd)) |
134 | if isAnalyzer: | 134 | if isAnalyzer: |
135 | encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) | 135 | encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) |
136 | encodedInterpsList.extend(htons(interp.tagnum)) | 136 | encodedInterpsList.extend(htons(interp.tagnum)) |
137 | encodedInterpsList.append(interp.namenum) | 137 | encodedInterpsList.append(interp.namenum) |
138 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) | 138 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) |
139 | - | ||
140 | - if interp.encodedForm.suffixToAdd == 'bc': | ||
141 | - print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList] | ||
142 | 139 | ||
143 | res.extend(htons(len(encodedInterpsList))) | 140 | res.extend(htons(len(encodedInterpsList))) |
144 | res.extend(encodedInterpsList) | 141 | res.extend(encodedInterpsList) |
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer): | @@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer): | ||
271 | return self.useArrays and state.serializeAsArray | 271 | return self.useArrays and state.serializeAsArray |
272 | 272 | ||
273 | def stateData2bytearray(self, state): | 273 | def stateData2bytearray(self, state): |
274 | - assert state.transitionsNum < 64 | 274 | +# assert state.transitionsNum < 64 |
275 | res = bytearray() | 275 | res = bytearray() |
276 | firstByte = 0 | 276 | firstByte = 0 |
277 | if state.isAccepting(): | 277 | if state.isAccepting(): |
278 | firstByte |= self.ACCEPTING_FLAG | 278 | firstByte |= self.ACCEPTING_FLAG |
279 | if self.stateShouldBeAnArray(state): | 279 | if self.stateShouldBeAnArray(state): |
280 | firstByte |= self.ARRAY_FLAG | 280 | firstByte |= self.ARRAY_FLAG |
281 | - firstByte |= state.transitionsNum | ||
282 | - assert firstByte < 256 and firstByte > 0 | ||
283 | - res.append(firstByte) | 281 | + if state.transitionsNum < 63: |
282 | + firstByte |= state.transitionsNum | ||
283 | + res.append(firstByte) | ||
284 | + else: | ||
285 | + firstByte |= 63 | ||
286 | + res.append(firstByte) | ||
287 | + res.append(state.transitionsNum) | ||
288 | + | ||
284 | if state.isAccepting(): | 289 | if state.isAccepting(): |
285 | res.extend(state.encodedData) | 290 | res.extend(state.encodedData) |
286 | return res | 291 | return res |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -23,6 +23,8 @@ def htonl(n): | @@ -23,6 +23,8 @@ def htonl(n): | ||
23 | res.append(n & 0x000000FF) | 23 | res.append(n & 0x000000FF) |
24 | return res | 24 | return res |
25 | 25 | ||
26 | -def serializeString(string, out): | 26 | +def serializeString(string): |
27 | + out = bytearray() | ||
27 | out.extend(string.encode('utf8')) | 28 | out.extend(string.encode('utf8')) |
28 | out.append(0) | 29 | out.append(0) |
30 | + return out |
morfeusz/CMakeLists.txt
@@ -9,7 +9,7 @@ add_custom_command ( | @@ -9,7 +9,7 @@ add_custom_command ( | ||
9 | ) | 9 | ) |
10 | add_custom_command ( | 10 | add_custom_command ( |
11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE | 12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 |
13 | DEPENDS "${INPUT_DICTIONARY}" | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | COMMENT "Building default dictionary C++ file" | 14 | COMMENT "Building default dictionary C++ file" |
15 | ) | 15 | ) |
morfeusz/Morfeusz.cpp
@@ -74,7 +74,7 @@ void Morfeusz::processOneWord( | @@ -74,7 +74,7 @@ void Morfeusz::processOneWord( | ||
74 | std::vector<MorphInterpretation>& results, | 74 | std::vector<MorphInterpretation>& results, |
75 | bool insideIgnHandler) const { | 75 | bool insideIgnHandler) const { |
76 | while (inputStart != inputEnd | 76 | while (inputStart != inputEnd |
77 | - && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { | 77 | + && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) { |
78 | env.getCharsetConverter().next(inputStart, inputEnd); | 78 | env.getCharsetConverter().next(inputStart, inputEnd); |
79 | } | 79 | } |
80 | vector<InterpretedChunk> accum; | 80 | vector<InterpretedChunk> accum; |
@@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord( | @@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord( | ||
154 | 154 | ||
155 | StateType state = env.getFSA().getInitialState(); | 155 | StateType state = env.getFSA().getInitialState(); |
156 | 156 | ||
157 | - while (!isEndOfWord(codepoint)) { | 157 | + while (!isWhitespace(codepoint)) { |
158 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER | 158 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER |
159 | ? env.getCaseConverter().toLower(codepoint) | 159 | ? env.getCaseConverter().toLower(codepoint) |
160 | : codepoint; | 160 | : codepoint; |
@@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord( | @@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord( | ||
210 | doShiftOrth(accum.back(), ic); | 210 | doShiftOrth(accum.back(), ic); |
211 | } | 211 | } |
212 | accum.push_back(ic); | 212 | accum.push_back(ic); |
213 | - if (isEndOfWord(codepoint) | 213 | + if (isWhitespace(codepoint) |
214 | && newSegrulesState.accepting) { | 214 | && newSegrulesState.accepting) { |
215 | if (this->options.debug) { | 215 | if (this->options.debug) { |
216 | cerr << "ACCEPTING " << debugAccum(accum) << endl; | 216 | cerr << "ACCEPTING " << debugAccum(accum) << endl; |
217 | } | 217 | } |
218 | graph.addPath(accum, newSegrulesState.weak); | 218 | graph.addPath(accum, newSegrulesState.weak); |
219 | } | 219 | } |
220 | - else if (!isEndOfWord(codepoint)) { | 220 | + else if (!isWhitespace(codepoint)) { |
221 | // cerr << "will process " << currInput << endl; | 221 | // cerr << "will process " << currInput << endl; |
222 | const char* newCurrInput = currInput; | 222 | const char* newCurrInput = currInput; |
223 | doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); | 223 | doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); |
@@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord( | @@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord( | ||
227 | } | 227 | } |
228 | } | 228 | } |
229 | } | 229 | } |
230 | - codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | 230 | + codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
231 | } | 231 | } |
232 | inputData = currInput; | 232 | inputData = currInput; |
233 | } | 233 | } |
morfeusz/Tagset.cpp
@@ -9,7 +9,7 @@ | @@ -9,7 +9,7 @@ | ||
9 | using namespace std; | 9 | using namespace std; |
10 | 10 | ||
11 | Tagset::Tagset(const unsigned char* ptr) { | 11 | Tagset::Tagset(const unsigned char* ptr) { |
12 | - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | 12 | + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); |
13 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; | 13 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; |
14 | readTags(currPtr, this->tags); | 14 | readTags(currPtr, this->tags); |
15 | readTags(currPtr, this->names); | 15 | readTags(currPtr, this->names); |
morfeusz/charset/charset_utils.hpp
@@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() { | @@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() { | ||
48 | return res; | 48 | return res; |
49 | } | 49 | } |
50 | 50 | ||
51 | -inline bool isEndOfWord(uint32_t codepoint) { | 51 | +inline bool isWhitespace(uint32_t codepoint) { |
52 | static std::vector<uint32_t> whitespaces(initializeWhitespaces()); | 52 | static std::vector<uint32_t> whitespaces(initializeWhitespaces()); |
53 | return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); | 53 | return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); |
54 | - // return whitespaces.count(codepoint); | ||
55 | } | 54 | } |
56 | 55 | ||
57 | template <class StateClass> | 56 | template <class StateClass> |
morfeusz/fsa/cfsa1_impl.hpp
@@ -11,35 +11,57 @@ | @@ -11,35 +11,57 @@ | ||
11 | #include <vector> | 11 | #include <vector> |
12 | 12 | ||
13 | #include "fsa.hpp" | 13 | #include "fsa.hpp" |
14 | +#include "../deserializationUtils.hpp" | ||
14 | 15 | ||
15 | using namespace std; | 16 | using namespace std; |
16 | 17 | ||
17 | -#pragma pack(push, 1) /* push current alignment to stack */ | 18 | +static const unsigned char CFSA1_ACCEPTING_FLAG = 128; |
19 | +static const unsigned char CFSA1_ARRAY_FLAG = 64; | ||
20 | +static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63; | ||
21 | + | ||
22 | +static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3; | ||
23 | + | ||
24 | +static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257; | ||
18 | 25 | ||
19 | struct StateData2 { | 26 | struct StateData2 { |
20 | - unsigned transitionsNum: 6; | ||
21 | - unsigned array : 1; | ||
22 | - unsigned accepting : 1; | 27 | + unsigned int transitionsNum; |
28 | + bool isArray; | ||
29 | + bool isAccepting; | ||
23 | }; | 30 | }; |
24 | 31 | ||
25 | struct TransitionData2 { | 32 | struct TransitionData2 { |
26 | - unsigned offsetSize : 2; | ||
27 | - unsigned shortLabel : 6; | 33 | + unsigned int offsetSize; |
34 | + unsigned int shortLabel; | ||
28 | }; | 35 | }; |
29 | 36 | ||
37 | +static inline StateData2 readStateData(const unsigned char*& ptr) { | ||
38 | + StateData2 res; | ||
39 | + unsigned char firstByte = readInt8(ptr); | ||
40 | + res.isArray = firstByte & CFSA1_ARRAY_FLAG; | ||
41 | + res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG; | ||
42 | + res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK; | ||
43 | + if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) { | ||
44 | + res.transitionsNum = readInt8(ptr); | ||
45 | + } | ||
46 | + return res; | ||
47 | +} | ||
30 | 48 | ||
31 | -#pragma pack(pop) /* restore original alignment from stack */ | ||
32 | - | ||
33 | -static const unsigned int INITIAL_STATE_OFFSET = 257; | 49 | +static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) { |
50 | + TransitionData2 res; | ||
51 | + unsigned char firstByte = readInt8(ptr); | ||
52 | + res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK; | ||
53 | + res.shortLabel = firstByte >> 2; | ||
54 | + return res; | ||
55 | +} | ||
34 | 56 | ||
35 | template <class T> | 57 | template <class T> |
36 | vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { | 58 | vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { |
37 | - return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET); | 59 | + return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET); |
38 | } | 60 | } |
39 | 61 | ||
40 | template <class T> | 62 | template <class T> |
41 | CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer) | 63 | CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer) |
42 | -: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer), | 64 | +: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer), |
43 | label2ShortLabel(initializeChar2PopularCharIdx(ptr)) { | 65 | label2ShortLabel(initializeChar2PopularCharIdx(ptr)) { |
44 | } | 66 | } |
45 | 67 | ||
@@ -52,10 +74,12 @@ template <class T> | @@ -52,10 +74,12 @@ template <class T> | ||
52 | void CompressedFSA1<T>::reallyDoProceed( | 74 | void CompressedFSA1<T>::reallyDoProceed( |
53 | const unsigned char* statePtr, | 75 | const unsigned char* statePtr, |
54 | State<T>& state) const { | 76 | State<T>& state) const { |
55 | - const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr); | ||
56 | - if (sd->accepting) { | 77 | + const unsigned char* currPtr = statePtr; |
78 | + const StateData2 sd = readStateData(currPtr); | ||
79 | + if (sd.isAccepting) { | ||
57 | T object; | 80 | T object; |
58 | - long size = this->deserializer.deserialize(statePtr + 1, object); | 81 | + long size = this->deserializer.deserialize(currPtr, object); |
82 | + // long size = this->deserializer.deserialize(statePtr + 1, object); | ||
59 | state.setNext(statePtr - this->initialStatePtr, object, size); | 83 | state.setNext(statePtr - this->initialStatePtr, object, size); |
60 | } | 84 | } |
61 | else { | 85 | else { |
@@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList( | @@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList( | ||
70 | const unsigned char* ptr, | 94 | const unsigned char* ptr, |
71 | const unsigned int transitionsNum, | 95 | const unsigned int transitionsNum, |
72 | State<T>& state) const { | 96 | State<T>& state) const { |
73 | - register const unsigned char* currPtr = ptr; | 97 | + const unsigned char* currPtr = ptr; |
74 | // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); | 98 | // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); |
75 | bool found = false; | 99 | bool found = false; |
76 | TransitionData2 td; | 100 | TransitionData2 td; |
77 | for (unsigned int i = 0; i < transitionsNum; i++) { | 101 | for (unsigned int i = 0; i < transitionsNum; i++) { |
78 | - // const_cast<Counter*>(&counter)->increment(1); | ||
79 | - td = *(reinterpret_cast<const TransitionData2*>(currPtr)); | 102 | + td = readTransitionFirstByte(currPtr); |
80 | if (td.shortLabel == shortLabel) { | 103 | if (td.shortLabel == shortLabel) { |
81 | if (shortLabel == 0) { | 104 | if (shortLabel == 0) { |
82 | - currPtr++; | ||
83 | - char label = (char) *currPtr; | 105 | + char label = static_cast<char>(readInt8(currPtr)); |
84 | if (label == c) { | 106 | if (label == c) { |
85 | found = true; | 107 | found = true; |
86 | break; | 108 | break; |
87 | } | 109 | } |
88 | else { | 110 | else { |
89 | - currPtr += td.offsetSize + 1; | 111 | + currPtr += td.offsetSize; |
90 | } | 112 | } |
91 | - } else { | 113 | + } |
114 | + else { | ||
92 | found = true; | 115 | found = true; |
93 | break; | 116 | break; |
94 | } | 117 | } |
95 | - } | 118 | + } |
96 | else { | 119 | else { |
97 | if (td.shortLabel == 0) { | 120 | if (td.shortLabel == 0) { |
98 | currPtr++; | 121 | currPtr++; |
99 | } | 122 | } |
100 | - currPtr += td.offsetSize + 1; | 123 | + currPtr += td.offsetSize; |
101 | } | 124 | } |
102 | } | 125 | } |
103 | if (!found) { | 126 | if (!found) { |
104 | state.setNextAsSink(); | 127 | state.setNextAsSink(); |
105 | - } | 128 | + } |
106 | else { | 129 | else { |
107 | - currPtr++; | 130 | + uint32_t offset; |
108 | switch (td.offsetSize) { | 131 | switch (td.offsetSize) { |
109 | case 0: | 132 | case 0: |
133 | + offset = 0; | ||
110 | break; | 134 | break; |
111 | case 1: | 135 | case 1: |
112 | - currPtr += *currPtr + 1; | 136 | + offset = readInt8(currPtr); |
113 | break; | 137 | break; |
114 | case 2: | 138 | case 2: |
115 | - currPtr += ntohs(*((const uint16_t*) currPtr)) + 2; | 139 | + offset = readInt16(currPtr); |
116 | break; | 140 | break; |
117 | case 3: | 141 | case 3: |
118 | - currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3; | ||
119 | - break; | 142 | + offset = readInt16(currPtr); |
143 | + offset <<= 8; | ||
144 | + offset += readInt8(currPtr); | ||
145 | + break; | ||
120 | } | 146 | } |
147 | + currPtr += offset; | ||
121 | reallyDoProceed(currPtr, state); | 148 | reallyDoProceed(currPtr, state); |
122 | } | 149 | } |
123 | } | 150 | } |
@@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray( | @@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray( | ||
139 | 166 | ||
140 | template <class T> | 167 | template <class T> |
141 | void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { | 168 | void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { |
142 | - const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); | 169 | + const unsigned char* currPtr = this->initialStatePtr + state.getOffset(); |
143 | unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; | 170 | unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; |
144 | - unsigned long transitionsTableOffset = 1; | 171 | + // unsigned long transitionsTableOffset = 1; |
172 | + const StateData2 sd = readStateData(currPtr); | ||
145 | if (state.isAccepting()) { | 173 | if (state.isAccepting()) { |
146 | - transitionsTableOffset += state.getValueSize(); | 174 | + currPtr += state.getValueSize(); |
147 | } | 175 | } |
148 | - const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer); | ||
149 | - if (sd->array) { | 176 | + |
177 | + if (sd.isArray) { | ||
150 | if (shortLabel > 0) { | 178 | if (shortLabel > 0) { |
151 | this->doProceedToNextByArray( | 179 | this->doProceedToNextByArray( |
152 | shortLabel, | 180 | shortLabel, |
153 | - reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset), | 181 | + reinterpret_cast<const uint32_t*> (currPtr), |
154 | state); | 182 | state); |
155 | } | 183 | } |
156 | else { | 184 | else { |
157 | - reallyDoProceed(fromPointer + transitionsTableOffset + 256, state); | 185 | + reallyDoProceed(currPtr + 256, state); |
158 | proceedToNext(c, state); | 186 | proceedToNext(c, state); |
159 | } | 187 | } |
160 | - } | 188 | + } |
161 | else { | 189 | else { |
162 | this->doProceedToNextByList( | 190 | this->doProceedToNextByList( |
163 | c, | 191 | c, |
164 | shortLabel, | 192 | shortLabel, |
165 | - fromPointer + transitionsTableOffset, | ||
166 | - sd->transitionsNum, | 193 | + currPtr, |
194 | + sd.transitionsNum, | ||
167 | state); | 195 | state); |
168 | } | 196 | } |
169 | } | 197 | } |