Commit 445dc6d2c31b6eaa134dfbea0f362f672b5716ee (1 parent: d51aa0dd)
ciąg dalszy porządków
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@169 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 8 changed files with 90 additions and 59 deletions
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -127,18 +127,15 @@ class Encoder(object): |
127 | 127 | if isAnalyzer: |
128 | 128 | encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern)) |
129 | 129 | else: |
130 | - serializeString(interp.homonymId, encodedInterpsList) | |
131 | - serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList) | |
130 | + encodedInterpsList.extend(serializeString(interp.homonymId)) | |
131 | + encodedInterpsList.extend(serializeString(interp.encodedForm.prefixToAdd)) | |
132 | 132 | encodedInterpsList.append(interp.encodedForm.cutLength) |
133 | - serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList) | |
133 | + encodedInterpsList.extend(serializeString(interp.encodedForm.suffixToAdd)) | |
134 | 134 | if isAnalyzer: |
135 | 135 | encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern)) |
136 | 136 | encodedInterpsList.extend(htons(interp.tagnum)) |
137 | 137 | encodedInterpsList.append(interp.namenum) |
138 | 138 | encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers)) |
139 | - | |
140 | - if interp.encodedForm.suffixToAdd == 'bc': | |
141 | - print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList] | |
142 | 139 | |
143 | 140 | res.extend(htons(len(encodedInterpsList))) |
144 | 141 | res.extend(encodedInterpsList) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -271,16 +271,21 @@ class VLengthSerializer1(Serializer): |
271 | 271 | return self.useArrays and state.serializeAsArray |
272 | 272 | |
273 | 273 | def stateData2bytearray(self, state): |
274 | - assert state.transitionsNum < 64 | |
274 | +# assert state.transitionsNum < 64 | |
275 | 275 | res = bytearray() |
276 | 276 | firstByte = 0 |
277 | 277 | if state.isAccepting(): |
278 | 278 | firstByte |= self.ACCEPTING_FLAG |
279 | 279 | if self.stateShouldBeAnArray(state): |
280 | 280 | firstByte |= self.ARRAY_FLAG |
281 | - firstByte |= state.transitionsNum | |
282 | - assert firstByte < 256 and firstByte > 0 | |
283 | - res.append(firstByte) | |
281 | + if state.transitionsNum < 63: | |
282 | + firstByte |= state.transitionsNum | |
283 | + res.append(firstByte) | |
284 | + else: | |
285 | + firstByte |= 63 | |
286 | + res.append(firstByte) | |
287 | + res.append(state.transitionsNum) | |
288 | + | |
284 | 289 | if state.isAccepting(): |
285 | 290 | res.extend(state.encodedData) |
286 | 291 | return res |
... | ... |
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
morfeusz/CMakeLists.txt
... | ... | @@ -9,7 +9,7 @@ add_custom_command ( |
9 | 9 | ) |
10 | 10 | add_custom_command ( |
11 | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE | |
12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 | |
13 | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | 14 | COMMENT "Building default dictionary C++ file" |
15 | 15 | ) |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -74,7 +74,7 @@ void Morfeusz::processOneWord( |
74 | 74 | std::vector<MorphInterpretation>& results, |
75 | 75 | bool insideIgnHandler) const { |
76 | 76 | while (inputStart != inputEnd |
77 | - && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { | |
77 | + && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) { | |
78 | 78 | env.getCharsetConverter().next(inputStart, inputEnd); |
79 | 79 | } |
80 | 80 | vector<InterpretedChunk> accum; |
... | ... | @@ -154,7 +154,7 @@ void Morfeusz::doProcessOneWord( |
154 | 154 | |
155 | 155 | StateType state = env.getFSA().getInitialState(); |
156 | 156 | |
157 | - while (!isEndOfWord(codepoint)) { | |
157 | + while (!isWhitespace(codepoint)) { | |
158 | 158 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER |
159 | 159 | ? env.getCaseConverter().toLower(codepoint) |
160 | 160 | : codepoint; |
... | ... | @@ -210,14 +210,14 @@ void Morfeusz::doProcessOneWord( |
210 | 210 | doShiftOrth(accum.back(), ic); |
211 | 211 | } |
212 | 212 | accum.push_back(ic); |
213 | - if (isEndOfWord(codepoint) | |
213 | + if (isWhitespace(codepoint) | |
214 | 214 | && newSegrulesState.accepting) { |
215 | 215 | if (this->options.debug) { |
216 | 216 | cerr << "ACCEPTING " << debugAccum(accum) << endl; |
217 | 217 | } |
218 | 218 | graph.addPath(accum, newSegrulesState.weak); |
219 | 219 | } |
220 | - else if (!isEndOfWord(codepoint)) { | |
220 | + else if (!isWhitespace(codepoint)) { | |
221 | 221 | // cerr << "will process " << currInput << endl; |
222 | 222 | const char* newCurrInput = currInput; |
223 | 223 | doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); |
... | ... | @@ -227,7 +227,7 @@ void Morfeusz::doProcessOneWord( |
227 | 227 | } |
228 | 228 | } |
229 | 229 | } |
230 | - codepoint = currInput == inputEnd || isEndOfWord(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
230 | + codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | |
231 | 231 | } |
232 | 232 | inputData = currInput; |
233 | 233 | } |
... | ... |
morfeusz/Tagset.cpp
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | using namespace std; |
10 | 10 | |
11 | 11 | Tagset::Tagset(const unsigned char* ptr) { |
12 | - uint32_t fsaSize = htonl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); | |
12 | + uint32_t fsaSize = readInt32Const(ptr + FSA_DATA_SIZE_OFFSET); | |
13 | 13 | const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; |
14 | 14 | readTags(currPtr, this->tags); |
15 | 15 | readTags(currPtr, this->names); |
... | ... |
morfeusz/charset/charset_utils.hpp
... | ... | @@ -48,10 +48,9 @@ static inline std::vector<uint32_t> initializeWhitespaces() { |
48 | 48 | return res; |
49 | 49 | } |
50 | 50 | |
51 | -inline bool isEndOfWord(uint32_t codepoint) { | |
51 | +inline bool isWhitespace(uint32_t codepoint) { | |
52 | 52 | static std::vector<uint32_t> whitespaces(initializeWhitespaces()); |
53 | 53 | return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); |
54 | - // return whitespaces.count(codepoint); | |
55 | 54 | } |
56 | 55 | |
57 | 56 | template <class StateClass> |
... | ... |
morfeusz/fsa/cfsa1_impl.hpp
... | ... | @@ -11,35 +11,57 @@ |
11 | 11 | #include <vector> |
12 | 12 | |
13 | 13 | #include "fsa.hpp" |
14 | +#include "../deserializationUtils.hpp" | |
14 | 15 | |
15 | 16 | using namespace std; |
16 | 17 | |
17 | -#pragma pack(push, 1) /* push current alignment to stack */ | |
18 | +static const unsigned char CFSA1_ACCEPTING_FLAG = 128; | |
19 | +static const unsigned char CFSA1_ARRAY_FLAG = 64; | |
20 | +static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 63; | |
21 | + | |
22 | +static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3; | |
23 | + | |
24 | +static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257; | |
18 | 25 | |
19 | 26 | struct StateData2 { |
20 | - unsigned transitionsNum: 6; | |
21 | - unsigned array : 1; | |
22 | - unsigned accepting : 1; | |
27 | + unsigned int transitionsNum; | |
28 | + bool isArray; | |
29 | + bool isAccepting; | |
23 | 30 | }; |
24 | 31 | |
25 | 32 | struct TransitionData2 { |
26 | - unsigned offsetSize : 2; | |
27 | - unsigned shortLabel : 6; | |
33 | + unsigned int offsetSize; | |
34 | + unsigned int shortLabel; | |
28 | 35 | }; |
29 | 36 | |
37 | +static inline StateData2 readStateData(const unsigned char*& ptr) { | |
38 | + StateData2 res; | |
39 | + unsigned char firstByte = readInt8(ptr); | |
40 | + res.isArray = firstByte & CFSA1_ARRAY_FLAG; | |
41 | + res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG; | |
42 | + res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK; | |
43 | + if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) { | |
44 | + res.transitionsNum = readInt8(ptr); | |
45 | + } | |
46 | + return res; | |
47 | +} | |
30 | 48 | |
31 | -#pragma pack(pop) /* restore original alignment from stack */ | |
32 | - | |
33 | -static const unsigned int INITIAL_STATE_OFFSET = 257; | |
49 | +static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) { | |
50 | + TransitionData2 res; | |
51 | + unsigned char firstByte = readInt8(ptr); | |
52 | + res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK; | |
53 | + res.shortLabel = firstByte >> 2; | |
54 | + return res; | |
55 | +} | |
34 | 56 | |
35 | 57 | template <class T> |
36 | 58 | vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { |
37 | - return vector<unsigned char>(ptr, ptr + INITIAL_STATE_OFFSET); | |
59 | + return vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET); | |
38 | 60 | } |
39 | 61 | |
40 | 62 | template <class T> |
41 | 63 | CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer) |
42 | -: FSA<T>(ptr + INITIAL_STATE_OFFSET, deserializer), | |
64 | +: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer), | |
43 | 65 | label2ShortLabel(initializeChar2PopularCharIdx(ptr)) { |
44 | 66 | } |
45 | 67 | |
... | ... | @@ -52,10 +74,12 @@ template <class T> |
52 | 74 | void CompressedFSA1<T>::reallyDoProceed( |
53 | 75 | const unsigned char* statePtr, |
54 | 76 | State<T>& state) const { |
55 | - const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr); | |
56 | - if (sd->accepting) { | |
77 | + const unsigned char* currPtr = statePtr; | |
78 | + const StateData2 sd = readStateData(currPtr); | |
79 | + if (sd.isAccepting) { | |
57 | 80 | T object; |
58 | - long size = this->deserializer.deserialize(statePtr + 1, object); | |
81 | + long size = this->deserializer.deserialize(currPtr, object); | |
82 | + // long size = this->deserializer.deserialize(statePtr + 1, object); | |
59 | 83 | state.setNext(statePtr - this->initialStatePtr, object, size); |
60 | 84 | } |
61 | 85 | else { |
... | ... | @@ -70,54 +94,57 @@ void CompressedFSA1<T>::doProceedToNextByList( |
70 | 94 | const unsigned char* ptr, |
71 | 95 | const unsigned int transitionsNum, |
72 | 96 | State<T>& state) const { |
73 | - register const unsigned char* currPtr = ptr; | |
97 | + const unsigned char* currPtr = ptr; | |
74 | 98 | // TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); |
75 | 99 | bool found = false; |
76 | 100 | TransitionData2 td; |
77 | 101 | for (unsigned int i = 0; i < transitionsNum; i++) { |
78 | - // const_cast<Counter*>(&counter)->increment(1); | |
79 | - td = *(reinterpret_cast<const TransitionData2*>(currPtr)); | |
102 | + td = readTransitionFirstByte(currPtr); | |
80 | 103 | if (td.shortLabel == shortLabel) { |
81 | 104 | if (shortLabel == 0) { |
82 | - currPtr++; | |
83 | - char label = (char) *currPtr; | |
105 | + char label = static_cast<char>(readInt8(currPtr)); | |
84 | 106 | if (label == c) { |
85 | 107 | found = true; |
86 | 108 | break; |
87 | 109 | } |
88 | 110 | else { |
89 | - currPtr += td.offsetSize + 1; | |
111 | + currPtr += td.offsetSize; | |
90 | 112 | } |
91 | - } else { | |
113 | + } | |
114 | + else { | |
92 | 115 | found = true; |
93 | 116 | break; |
94 | 117 | } |
95 | - } | |
118 | + } | |
96 | 119 | else { |
97 | 120 | if (td.shortLabel == 0) { |
98 | 121 | currPtr++; |
99 | 122 | } |
100 | - currPtr += td.offsetSize + 1; | |
123 | + currPtr += td.offsetSize; | |
101 | 124 | } |
102 | 125 | } |
103 | 126 | if (!found) { |
104 | 127 | state.setNextAsSink(); |
105 | - } | |
128 | + } | |
106 | 129 | else { |
107 | - currPtr++; | |
130 | + uint32_t offset; | |
108 | 131 | switch (td.offsetSize) { |
109 | 132 | case 0: |
133 | + offset = 0; | |
110 | 134 | break; |
111 | 135 | case 1: |
112 | - currPtr += *currPtr + 1; | |
136 | + offset = readInt8(currPtr); | |
113 | 137 | break; |
114 | 138 | case 2: |
115 | - currPtr += ntohs(*((const uint16_t*) currPtr)) + 2; | |
139 | + offset = readInt16(currPtr); | |
116 | 140 | break; |
117 | 141 | case 3: |
118 | - currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3; | |
119 | - break; | |
142 | + offset = readInt16(currPtr); | |
143 | + offset <<= 8; | |
144 | + offset += readInt8(currPtr); | |
145 | + break; | |
120 | 146 | } |
147 | + currPtr += offset; | |
121 | 148 | reallyDoProceed(currPtr, state); |
122 | 149 | } |
123 | 150 | } |
... | ... | @@ -139,31 +166,32 @@ void CompressedFSA1<T>::doProceedToNextByArray( |
139 | 166 | |
140 | 167 | template <class T> |
141 | 168 | void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { |
142 | - const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); | |
169 | + const unsigned char* currPtr = this->initialStatePtr + state.getOffset(); | |
143 | 170 | unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; |
144 | - unsigned long transitionsTableOffset = 1; | |
171 | + // unsigned long transitionsTableOffset = 1; | |
172 | + const StateData2 sd = readStateData(currPtr); | |
145 | 173 | if (state.isAccepting()) { |
146 | - transitionsTableOffset += state.getValueSize(); | |
174 | + currPtr += state.getValueSize(); | |
147 | 175 | } |
148 | - const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer); | |
149 | - if (sd->array) { | |
176 | + | |
177 | + if (sd.isArray) { | |
150 | 178 | if (shortLabel > 0) { |
151 | 179 | this->doProceedToNextByArray( |
152 | 180 | shortLabel, |
153 | - reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset), | |
181 | + reinterpret_cast<const uint32_t*> (currPtr), | |
154 | 182 | state); |
155 | 183 | } |
156 | 184 | else { |
157 | - reallyDoProceed(fromPointer + transitionsTableOffset + 256, state); | |
185 | + reallyDoProceed(currPtr + 256, state); | |
158 | 186 | proceedToNext(c, state); |
159 | 187 | } |
160 | - } | |
188 | + } | |
161 | 189 | else { |
162 | 190 | this->doProceedToNextByList( |
163 | 191 | c, |
164 | 192 | shortLabel, |
165 | - fromPointer + transitionsTableOffset, | |
166 | - sd->transitionsNum, | |
193 | + currPtr, | |
194 | + sd.transitionsNum, | |
167 | 195 | state); |
168 | 196 | } |
169 | 197 | } |
... | ... |