Commit f3f17708743d360a3957135091e4059a4ed135ef
1 parent
c9317018
dalsze próby optymalizacji, poprawa działania operatora ">" w segmentacji (przenosi orth a nie lemat)
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@180 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 13 changed files with 152 additions and 133 deletions
CMakeLists.txt
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -41,7 +41,7 @@ class FSA(object): |
41 | 41 | self.n += 1 |
42 | 42 | |
43 | 43 | # debug |
44 | - if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0 or word.startswith('naj'): | |
44 | + if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0: | |
45 | 45 | logging.info(u'%d %s' % (self.n, word)) |
46 | 46 | for label in encodedWord: |
47 | 47 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
... | ... |
morfeusz/CasePatternHelper.hpp
... | ... | @@ -74,29 +74,20 @@ public: |
74 | 74 | |
75 | 75 | std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { |
76 | 76 | std::vector<bool> res; |
77 | - uint8_t casePatternType = *ptr; | |
78 | - ptr++; | |
77 | + uint8_t casePatternType = *ptr++; | |
79 | 78 | uint8_t prefixLength; |
80 | 79 | uint8_t patternLength; |
81 | 80 | switch (casePatternType) { |
82 | 81 | case LEMMA_ONLY_LOWER: |
83 | 82 | break; |
84 | 83 | case LEMMA_UPPER_PREFIX: |
85 | - prefixLength = *ptr; | |
86 | - ptr++; | |
87 | - for (unsigned int i = 0; i < prefixLength; i++) { | |
88 | - // lemma.casePattern[i] = true; | |
89 | - res.push_back(true); | |
90 | - } | |
91 | - // lemma.casePattern.resize(prefixLength, true); | |
84 | + prefixLength = *ptr++; | |
85 | + res.resize(prefixLength, true); | |
92 | 86 | break; |
93 | 87 | case LEMMA_MIXED_CASE: |
94 | - patternLength = *ptr; | |
95 | - ptr++; | |
88 | + patternLength = *ptr++; | |
96 | 89 | for (unsigned int i = 0; i < patternLength; i++) { |
97 | - uint8_t idx = *ptr; | |
98 | - ptr++; | |
99 | - // lemma.casePattern[idx] = true; | |
90 | + uint8_t idx = *ptr++; | |
100 | 91 | res.resize(idx + 1, false); |
101 | 92 | res[idx] = true; |
102 | 93 | } |
... | ... |
morfeusz/Environment.cpp
... | ... | @@ -38,10 +38,6 @@ Environment::Environment( |
38 | 38 | MorfeuszProcessorType processorType, |
39 | 39 | const unsigned char* fsaFileStartPtr) |
40 | 40 | : currentCharsetConverter(getCharsetConverter(charset)), |
41 | -utf8CharsetConverter(), | |
42 | -isoCharsetConverter(), | |
43 | -cp1250CharsetConverter(), | |
44 | -cp852CharsetConverter(), | |
45 | 41 | caseConverter(), |
46 | 42 | tagset(fsaFileStartPtr), |
47 | 43 | qualifiers(fsaFileStartPtr), |
... | ... | @@ -63,13 +59,13 @@ casePatternHelper() { |
63 | 59 | const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { |
64 | 60 | switch (charset) { |
65 | 61 | case UTF8: |
66 | - return &this->utf8CharsetConverter; | |
62 | + return &UTF8CharsetConverter::getInstance(); | |
67 | 63 | case ISO8859_2: |
68 | - return &this->isoCharsetConverter; | |
64 | + return &ISO8859_2_CharsetConverter::getInstance(); | |
69 | 65 | case CP1250: |
70 | - return &this->cp1250CharsetConverter; | |
66 | + return &Windows_1250_CharsetConverter::getInstance(); | |
71 | 67 | case CP852: |
72 | - return &this->cp852CharsetConverter; | |
68 | + return &CP852_CharsetConverter::getInstance(); | |
73 | 69 | default: |
74 | 70 | throw MorfeuszException("invalid charset"); |
75 | 71 | } |
... | ... |
morfeusz/Environment.hpp
... | ... | @@ -65,10 +65,6 @@ public: |
65 | 65 | virtual ~Environment(); |
66 | 66 | private: |
67 | 67 | const CharsetConverter* currentCharsetConverter; |
68 | - const UTF8CharsetConverter utf8CharsetConverter; | |
69 | - const ISO8859_2_CharsetConverter isoCharsetConverter; | |
70 | - const Windows_1250_CharsetConverter cp1250CharsetConverter; | |
71 | - const CP852_CharsetConverter cp852CharsetConverter; | |
72 | 68 | const CaseConverter caseConverter; |
73 | 69 | Tagset tagset; |
74 | 70 | Qualifiers qualifiers; |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -62,7 +62,7 @@ public: |
62 | 62 | orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
63 | 63 | const unsigned char* currPtr = interpretedChunk.interpsPtr; |
64 | 64 | while (currPtr < interpretedChunk.interpsEndPtr) { |
65 | - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out); | |
65 | + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); | |
66 | 66 | } |
67 | 67 | } |
68 | 68 | } |
... | ... | @@ -72,6 +72,7 @@ protected: |
72 | 72 | void decodeForm( |
73 | 73 | const vector<uint32_t>& orth, |
74 | 74 | const EncodedForm& lemma, |
75 | + bool forPrefix, | |
75 | 76 | string& res) const { |
76 | 77 | for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { |
77 | 78 | uint32_t cp = |
... | ... | @@ -80,11 +81,13 @@ protected: |
80 | 81 | : orth[i]; |
81 | 82 | env.getCharsetConverter().append(cp, res); |
82 | 83 | } |
83 | - const char* suffixPtr = lemma.suffixToAdd.c_str(); | |
84 | - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | |
85 | - while (suffixPtr != suffixEnd) { | |
86 | - uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
87 | - env.getCharsetConverter().append(cp, res); | |
84 | + if (!forPrefix) { | |
85 | + const char* suffixPtr = lemma.suffixToAdd.c_str(); | |
86 | + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | |
87 | + while (suffixPtr != suffixEnd) { | |
88 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
89 | + env.getCharsetConverter().append(cp, res); | |
90 | + } | |
88 | 91 | } |
89 | 92 | } |
90 | 93 | |
... | ... | @@ -97,12 +100,10 @@ protected: |
97 | 100 | assert(encodedForm.casePattern.size() == 0); |
98 | 101 | if (isLemmaOnlyLower(compressionByte)) { |
99 | 102 | encodedForm.casePattern = std::vector<bool>(); |
100 | - } | |
101 | - else if (isLemmaOnlyTitle(compressionByte)) { | |
103 | + } else if (isLemmaOnlyTitle(compressionByte)) { | |
102 | 104 | encodedForm.casePattern = std::vector<bool>(); |
103 | 105 | encodedForm.casePattern.push_back(true); |
104 | - } | |
105 | - else { | |
106 | + } else { | |
106 | 107 | encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); |
107 | 108 | } |
108 | 109 | } |
... | ... | @@ -110,11 +111,9 @@ protected: |
110 | 111 | EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { |
111 | 112 | EncodedInterpretation interp; |
112 | 113 | if (isOrthOnlyLower(compressionByte)) { |
113 | - } | |
114 | - else if (isOrthOnlyTitle(compressionByte)) { | |
114 | + } else if (isOrthOnlyTitle(compressionByte)) { | |
115 | 115 | interp.orthCasePattern.push_back(true); |
116 | - } | |
117 | - else { | |
116 | + } else { | |
118 | 117 | interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); |
119 | 118 | } |
120 | 119 | deserializeEncodedForm(ptr, compressionByte, interp.value); |
... | ... | @@ -129,8 +128,7 @@ private: |
129 | 128 | vector<string> splitRes(split(lemma, ':')); |
130 | 129 | if (splitRes.size() == 2) { |
131 | 130 | return make_pair(splitRes[0], splitRes[1]); |
132 | - } | |
133 | - else { | |
131 | + } else { | |
134 | 132 | return make_pair(lemma, ""); |
135 | 133 | } |
136 | 134 | } |
... | ... | @@ -140,17 +138,18 @@ private: |
140 | 138 | const string& orth, |
141 | 139 | const string& lemmaPrefix, |
142 | 140 | const InterpretedChunk& chunk, |
141 | + bool forPrefix, | |
143 | 142 | const unsigned char*& ptr, |
144 | 143 | std::vector<MorphInterpretation>& out) const { |
145 | 144 | string lemma = lemmaPrefix; |
146 | 145 | EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); |
147 | - this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
146 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma); | |
148 | 147 | if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) { |
149 | 148 | // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); |
150 | 149 | out.push_back(MorphInterpretation( |
151 | 150 | startNode, endNode, |
152 | 151 | orth, lemma, |
153 | -// "", | |
152 | + // "", | |
154 | 153 | ei.tag, |
155 | 154 | ei.nameClassifier, |
156 | 155 | ei.qualifiers, |
... | ... | @@ -165,11 +164,10 @@ private: |
165 | 164 | const unsigned char* ptr = prefixChunk.interpsPtr; |
166 | 165 | std::vector<MorphInterpretation> mi; |
167 | 166 | // env.getCasePatternHelper().skipCasePattern(ptr); |
168 | - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi); | |
167 | + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); | |
169 | 168 | if (!mi.empty()) { |
170 | 169 | lemmaPrefix += mi[0].getLemma(); |
171 | - } | |
172 | - else { | |
170 | + } else { | |
173 | 171 | return false; |
174 | 172 | } |
175 | 173 | } |
... | ... | @@ -227,7 +225,7 @@ private: |
227 | 225 | return MorphInterpretation( |
228 | 226 | startNode, endNode, |
229 | 227 | orth, lemma + HOMONYM_SEPARATOR + ei.homonymId, |
230 | -// ei.homonymId, | |
228 | + // ei.homonymId, | |
231 | 229 | ei.tag, |
232 | 230 | ei.nameClassifier, |
233 | 231 | ei.qualifiers, |
... | ... | @@ -245,7 +243,7 @@ private: |
245 | 243 | const char* suffixPtr = orth.suffixToAdd.c_str(); |
246 | 244 | const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); |
247 | 245 | while (suffixPtr != suffixEnd) { |
248 | - uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
246 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
249 | 247 | env.getCharsetConverter().append(cp, res); |
250 | 248 | } |
251 | 249 | } |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -39,7 +39,7 @@ Morfeusz::Morfeusz() |
39 | 39 | generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), |
40 | 40 | options(createDefaultOptions()), |
41 | 41 | accum(), |
42 | -graph(){ | |
42 | +graph() { | |
43 | 43 | analyzerEnv.setCaseSensitive(options.caseSensitive); |
44 | 44 | generatorEnv.setCaseSensitive(false); |
45 | 45 | } |
... | ... | @@ -85,14 +85,14 @@ void Morfeusz::processOneWord( |
85 | 85 | && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) { |
86 | 86 | env.getCharsetConverter().next(inputStart, inputEnd); |
87 | 87 | } |
88 | - | |
88 | + | |
89 | 89 | accum.clear(); |
90 | 90 | graph.clear(); |
91 | - | |
91 | + | |
92 | 92 | const char* currInput = inputStart; |
93 | 93 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
94 | 94 | |
95 | - doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); | |
95 | + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState); | |
96 | 96 | |
97 | 97 | if (!graph.empty()) { |
98 | 98 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
... | ... | @@ -111,7 +111,6 @@ void Morfeusz::processOneWord( |
111 | 111 | && env.getProcessorType() == ANALYZER |
112 | 112 | && !insideIgnHandler) { |
113 | 113 | this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results); |
114 | - // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); | |
115 | 114 | } |
116 | 115 | else if (inputStart != inputEnd) { |
117 | 116 | this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); |
... | ... | @@ -144,25 +143,44 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) { |
144 | 143 | return res.str(); |
145 | 144 | } |
146 | 145 | |
146 | +static inline void feedStateDirectly( | |
147 | + StateType& state, | |
148 | + const char* inputStart, | |
149 | + const char* inputEnd) { | |
150 | + const char* currInput = inputStart; | |
151 | + while (currInput != inputEnd && !state.isSink()) { | |
152 | + state.proceedToNext(*currInput++); | |
153 | + } | |
154 | +} | |
155 | + | |
156 | +static inline void feedState( | |
157 | + StateType& state, | |
158 | + int codepoint) { | |
159 | + std::string chars; | |
160 | + UTF8CharsetConverter::getInstance().append(codepoint, chars); | |
161 | + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { | |
162 | + state.proceedToNext(chars[i]); | |
163 | + } | |
164 | +} | |
165 | + | |
147 | 166 | void Morfeusz::doProcessOneWord( |
148 | 167 | const Environment& env, |
149 | 168 | const char*& inputData, |
150 | 169 | const char* inputEnd, |
151 | - SegrulesState segrulesState, | |
152 | - vector<InterpretedChunk>& accum, | |
153 | - InflexionGraph& graph) const { | |
170 | + SegrulesState segrulesState) const { | |
154 | 171 | if (this->options.debug) { |
155 | 172 | cerr << "----------" << endl; |
156 | 173 | cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; |
157 | 174 | } |
158 | 175 | // cerr << "doAnalyzeOneWord " << inputData << endl; |
159 | 176 | const char* inputStart = inputData; |
177 | + const char* prevInput = inputData; | |
160 | 178 | const char* currInput = inputData; |
161 | 179 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
162 | 180 | bool currCodepointIsWhitespace = isWhitespace(codepoint); |
163 | 181 | vector<uint32_t> originalCodepoints; |
164 | 182 | vector<uint32_t> normalizedCodepoints; |
165 | - | |
183 | + | |
166 | 184 | originalCodepoints.reserve(16); |
167 | 185 | normalizedCodepoints.reserve(16); |
168 | 186 | |
... | ... | @@ -174,7 +192,13 @@ void Morfeusz::doProcessOneWord( |
174 | 192 | : codepoint; |
175 | 193 | originalCodepoints.push_back(codepoint); |
176 | 194 | normalizedCodepoints.push_back(normalizedCodepoint); |
177 | - feedState(state, normalizedCodepoint, UTF8CharsetConverter()); | |
195 | + if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { | |
196 | + feedStateDirectly(state, prevInput, currInput); | |
197 | + } | |
198 | + else { | |
199 | + feedState(state, normalizedCodepoint); | |
200 | + } | |
201 | + | |
178 | 202 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); |
179 | 203 | currCodepointIsWhitespace = isWhitespace(codepoint); |
180 | 204 | string homonymId; |
... | ... | @@ -184,6 +208,7 @@ void Morfeusz::doProcessOneWord( |
184 | 208 | } |
185 | 209 | homonymId = string(currInput + 1, inputEnd); |
186 | 210 | // cerr << "homonym " << homonymId << endl; |
211 | + prevInput = currInput; | |
187 | 212 | currInput = inputEnd; |
188 | 213 | codepoint = 0x00; |
189 | 214 | currCodepointIsWhitespace = true; |
... | ... | @@ -195,9 +220,8 @@ void Morfeusz::doProcessOneWord( |
195 | 220 | if (this->options.debug) { |
196 | 221 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
197 | 222 | } |
198 | - vector<SegrulesState> newSegrulesStates; | |
199 | - env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
200 | - if (!newSegrulesStates.empty() | |
223 | + vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); | |
224 | + if (!newSegrulesStates.empty() | |
201 | 225 | && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { |
202 | 226 | |
203 | 227 | for ( |
... | ... | @@ -225,17 +249,18 @@ void Morfeusz::doProcessOneWord( |
225 | 249 | doShiftOrth(accum.back(), ic); |
226 | 250 | } |
227 | 251 | accum.push_back(ic); |
228 | - if (currCodepointIsWhitespace | |
229 | - && newSegrulesState.accepting) { | |
252 | + if (currCodepointIsWhitespace) { | |
253 | + assert(newSegrulesState.accepting); | |
230 | 254 | if (this->options.debug) { |
231 | 255 | cerr << "ACCEPTING " << debugAccum(accum) << endl; |
232 | 256 | } |
233 | 257 | graph.addPath(accum, newSegrulesState.weak); |
234 | 258 | } |
235 | - else if (!currCodepointIsWhitespace) { | |
259 | + else { | |
260 | + assert(!newSegrulesState.sink); | |
236 | 261 | // cerr << "will process " << currInput << endl; |
237 | 262 | const char* newCurrInput = currInput; |
238 | - doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); | |
263 | + doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState); | |
239 | 264 | } |
240 | 265 | accum.pop_back(); |
241 | 266 | } |
... | ... | @@ -246,6 +271,7 @@ void Morfeusz::doProcessOneWord( |
246 | 271 | } |
247 | 272 | } |
248 | 273 | } |
274 | + prevInput = currInput; | |
249 | 275 | codepoint = currInput == inputEnd || currCodepointIsWhitespace ? 0x00 : env.getCharsetConverter().next(currInput, inputEnd); |
250 | 276 | } |
251 | 277 | inputData = currInput; |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -170,9 +170,7 @@ private: |
170 | 170 | const Environment& env, |
171 | 171 | const char*& inputData, |
172 | 172 | const char* inputEnd, |
173 | - SegrulesState segrulesState, | |
174 | - std::vector<InterpretedChunk>& accum, | |
175 | - InflexionGraph& graph) const; | |
173 | + SegrulesState segrulesState) const; | |
176 | 174 | |
177 | 175 | void handleIgnChunk( |
178 | 176 | const Environment& env, |
... | ... |
morfeusz/charset/CharsetConverter.cpp
... | ... | @@ -36,6 +36,13 @@ static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char |
36 | 36 | } |
37 | 37 | } |
38 | 38 | |
39 | +const UTF8CharsetConverter& UTF8CharsetConverter::getInstance() { | |
40 | + static UTF8CharsetConverter instance; | |
41 | + return instance; | |
42 | +} | |
43 | + | |
44 | +UTF8CharsetConverter::UTF8CharsetConverter() {} | |
45 | + | |
39 | 46 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
40 | 47 | uint32_t cp = 0; |
41 | 48 | utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); |
... | ... | @@ -86,14 +93,29 @@ void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const { |
86 | 93 | } |
87 | 94 | } |
88 | 95 | |
96 | +const ISO8859_2_CharsetConverter& ISO8859_2_CharsetConverter::getInstance() { | |
97 | + static ISO8859_2_CharsetConverter instance; | |
98 | + return instance; | |
99 | +} | |
100 | + | |
89 | 101 | ISO8859_2_CharsetConverter::ISO8859_2_CharsetConverter() |
90 | 102 | : OneByteCharsetConverter(ISO_8859_2_TO_CODEPOINT) { |
91 | 103 | } |
92 | 104 | |
105 | +const Windows_1250_CharsetConverter& Windows_1250_CharsetConverter::getInstance() { | |
106 | + static Windows_1250_CharsetConverter instance; | |
107 | + return instance; | |
108 | +} | |
109 | + | |
93 | 110 | Windows_1250_CharsetConverter::Windows_1250_CharsetConverter() |
94 | 111 | : OneByteCharsetConverter(WINDOWS_1250_TO_CODEPOINT) { |
95 | 112 | } |
96 | 113 | |
114 | +const CP852_CharsetConverter& CP852_CharsetConverter::getInstance() { | |
115 | + static CP852_CharsetConverter instance; | |
116 | + return instance; | |
117 | +} | |
118 | + | |
97 | 119 | CP852_CharsetConverter::CP852_CharsetConverter() |
98 | 120 | : OneByteCharsetConverter(CP852_TO_CODEPOINT) { |
99 | 121 | } |
... | ... |
morfeusz/charset/CharsetConverter.hpp
... | ... | @@ -21,17 +21,23 @@ public: |
21 | 21 | virtual std::string fromUTF8(const std::string& input) const; |
22 | 22 | |
23 | 23 | std::string toString(const std::vector<uint32_t>& codepoints) const; |
24 | - | |
24 | + | |
25 | 25 | virtual ~CharsetConverter(); |
26 | 26 | private: |
27 | 27 | }; |
28 | 28 | |
29 | 29 | class UTF8CharsetConverter : public CharsetConverter { |
30 | 30 | public: |
31 | + | |
32 | + static const UTF8CharsetConverter& getInstance(); | |
33 | + | |
31 | 34 | uint32_t next(const char*& it, const char* end) const; |
32 | 35 | void append(uint32_t cp, std::string& result) const; |
33 | 36 | std::string fromUTF8(const std::string& input) const; |
34 | 37 | private: |
38 | + UTF8CharsetConverter(); | |
39 | + UTF8CharsetConverter(const UTF8CharsetConverter&); // do not implement | |
40 | + void operator=(const UTF8CharsetConverter&); // do not implement | |
35 | 41 | }; |
36 | 42 | |
37 | 43 | /* |
... | ... | @@ -49,20 +55,29 @@ private: |
49 | 55 | |
50 | 56 | class ISO8859_2_CharsetConverter : public OneByteCharsetConverter { |
51 | 57 | public: |
52 | - ISO8859_2_CharsetConverter(); | |
58 | + static const ISO8859_2_CharsetConverter& getInstance(); | |
53 | 59 | private: |
60 | + ISO8859_2_CharsetConverter(); | |
61 | + ISO8859_2_CharsetConverter(const ISO8859_2_CharsetConverter&); // do not implement | |
62 | + void operator=(const ISO8859_2_CharsetConverter&); // do not implement | |
54 | 63 | }; |
55 | 64 | |
56 | 65 | class Windows_1250_CharsetConverter : public OneByteCharsetConverter { |
57 | 66 | public: |
58 | - Windows_1250_CharsetConverter(); | |
67 | + static const Windows_1250_CharsetConverter& getInstance(); | |
59 | 68 | private: |
69 | + Windows_1250_CharsetConverter(); | |
70 | + Windows_1250_CharsetConverter(const Windows_1250_CharsetConverter&); // do not implement | |
71 | + void operator=(const Windows_1250_CharsetConverter&); // do not implement | |
60 | 72 | }; |
61 | 73 | |
62 | 74 | class CP852_CharsetConverter : public OneByteCharsetConverter { |
63 | 75 | public: |
64 | - CP852_CharsetConverter(); | |
76 | + static const CP852_CharsetConverter& getInstance(); | |
65 | 77 | private: |
78 | + CP852_CharsetConverter(); | |
79 | + CP852_CharsetConverter(const CP852_CharsetConverter&); // do not implement | |
80 | + void operator=(const CP852_CharsetConverter&); // do not implement | |
66 | 81 | }; |
67 | 82 | |
68 | 83 | #endif /* ENCODINGCONVERTER_HPP */ |
... | ... |
morfeusz/charset/charset_utils.hpp
... | ... | @@ -53,17 +53,5 @@ inline bool isWhitespace(uint32_t codepoint) { |
53 | 53 | return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint); |
54 | 54 | } |
55 | 55 | |
56 | -template <class StateClass> | |
57 | -void feedState( | |
58 | - StateClass& state, | |
59 | - int codepoint, | |
60 | - const CharsetConverter& charsetConverter) { | |
61 | - std::string chars; | |
62 | - charsetConverter.append(codepoint, chars); | |
63 | - for (unsigned int i = 0; i < chars.length(); i++) { | |
64 | - state.proceedToNext(chars[i]); | |
65 | - } | |
66 | -} | |
67 | - | |
68 | 56 | #endif /* CHARSET_UTILS_HPP */ |
69 | 57 | |
... | ... |
morfeusz/segrules/SegrulesFSA.hpp
... | ... | @@ -17,42 +17,48 @@ struct SegrulesState { |
17 | 17 | bool accepting; |
18 | 18 | bool weak; |
19 | 19 | bool shiftOrthFromPrevious; |
20 | + bool sink; | |
20 | 21 | }; |
21 | 22 | |
22 | -inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) | |
23 | -{ | |
24 | - return s1.offset < s2.offset; | |
23 | +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) { | |
24 | + return s1.offset < s2.offset; | |
25 | 25 | } |
26 | 26 | |
27 | 27 | class SegrulesFSA { |
28 | 28 | public: |
29 | - SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) { | |
30 | - SegrulesState state = {0, false, false, false}; | |
29 | + | |
30 | + SegrulesFSA(const unsigned char* ptr) : initialState(), ptr(ptr) { | |
31 | + SegrulesState state = {0, false, false, false, false}; | |
31 | 32 | initialState = state; |
32 | 33 | } |
33 | - | |
34 | - void proceedToNext( | |
35 | - const unsigned char segnum, | |
36 | - const SegrulesState state, | |
37 | - std::vector<SegrulesState>& newStates) const { | |
38 | - | |
39 | - const unsigned char* currPtr = ptr + state.offset; | |
40 | - currPtr++; | |
34 | + | |
35 | + std::vector<SegrulesState> proceedToNext( | |
36 | + const unsigned char segnum, | |
37 | + const SegrulesState state, | |
38 | + bool atEndOfWord) const { | |
39 | + std::vector<SegrulesState> res; | |
40 | + const unsigned char* currPtr = ptr + state.offset + 1; | |
41 | 41 | const unsigned char transitionsNum = *currPtr++; |
42 | 42 | for (unsigned int i = 0; i < transitionsNum; i++) { |
43 | 43 | if (*currPtr == segnum) { |
44 | - newStates.push_back(this->transition2State(currPtr)); | |
44 | + SegrulesState newState = this->transition2State(currPtr); | |
45 | + if ((atEndOfWord && newState.accepting) | |
46 | + || (!atEndOfWord && !newState.sink)) { | |
47 | + res.push_back(newState); | |
48 | + } | |
45 | 49 | } |
46 | 50 | currPtr += 4; |
47 | 51 | } |
52 | + return res; | |
48 | 53 | } |
49 | - | |
50 | - virtual ~SegrulesFSA() {} | |
51 | - | |
54 | + | |
55 | + virtual ~SegrulesFSA() { | |
56 | + } | |
57 | + | |
52 | 58 | SegrulesState initialState; |
53 | 59 | private: |
54 | 60 | const unsigned char* ptr; |
55 | - | |
61 | + | |
56 | 62 | SegrulesState transition2State(const unsigned char* transitionPtr) const { |
57 | 63 | unsigned char ACCEPTING_FLAG = 1; |
58 | 64 | unsigned char WEAK_FLAG = 2; |
... | ... | @@ -62,6 +68,7 @@ private: |
62 | 68 | res.offset = readInt16(transitionPtr); |
63 | 69 | res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG; |
64 | 70 | res.weak = *(ptr + res.offset) & WEAK_FLAG; |
71 | + res.sink = *(ptr + res.offset + 1) == 0; | |
65 | 72 | return res; |
66 | 73 | } |
67 | 74 | }; |
... | ... |
nbproject/configurations.xml
... | ... | @@ -26,9 +26,6 @@ |
26 | 26 | <df name="cli"> |
27 | 27 | <in>cli.cpp</in> |
28 | 28 | </df> |
29 | - <df name="data"> | |
30 | - <in>default_fsa.cpp</in> | |
31 | - </df> | |
32 | 29 | <df name="fsa"> |
33 | 30 | <in>const.cpp</in> |
34 | 31 | <in>test_not_recognize.cpp</in> |
... | ... | @@ -133,8 +130,6 @@ |
133 | 130 | </ccTool> |
134 | 131 | </item> |
135 | 132 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
136 | - <ccTool flags="1"> | |
137 | - </ccTool> | |
138 | 133 | </item> |
139 | 134 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
140 | 135 | ex="false" |
... | ... | @@ -236,25 +231,6 @@ |
236 | 231 | </preprocessorList> |
237 | 232 | </ccTool> |
238 | 233 | </folder> |
239 | - <folder path="0/data"> | |
240 | - <ccTool> | |
241 | - <incDir> | |
242 | - <pElem>build</pElem> | |
243 | - <pElem>build/morfeusz</pElem> | |
244 | - </incDir> | |
245 | - <preprocessorList> | |
246 | - <Elem>NDEBUG</Elem> | |
247 | - <Elem>_OPTIMIZE__=1</Elem> | |
248 | - <Elem>__PIC__=2</Elem> | |
249 | - <Elem>__pic__=2</Elem> | |
250 | - <Elem>libmorfeusz_EXPORTS</Elem> | |
251 | - </preprocessorList> | |
252 | - <undefinedList> | |
253 | - <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
254 | - <Elem>__NO_INLINE__</Elem> | |
255 | - </undefinedList> | |
256 | - </ccTool> | |
257 | - </folder> | |
258 | 234 | <folder path="0/segrules"> |
259 | 235 | <ccTool> |
260 | 236 | <incDir> |
... | ... | @@ -307,7 +283,7 @@ |
307 | 283 | <ccTool> |
308 | 284 | <incDir> |
309 | 285 | <pElem>morfeusz</pElem> |
310 | - <pElem>/usr/lib/jvm/default-java/include</pElem> | |
286 | + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
311 | 287 | </incDir> |
312 | 288 | <preprocessorList> |
313 | 289 | <Elem>NDEBUG</Elem> |
... | ... | @@ -373,13 +349,14 @@ |
373 | 349 | </ccTool> |
374 | 350 | </item> |
375 | 351 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="4"> |
376 | - <ccTool flags="2"> | |
352 | + <ccTool flags="1"> | |
377 | 353 | <incDir> |
378 | 354 | <pElem>build</pElem> |
379 | 355 | <pElem>morfeusz</pElem> |
380 | 356 | <pElem>build/morfeusz</pElem> |
381 | 357 | </incDir> |
382 | 358 | <preprocessorList> |
359 | + <Elem>NDEBUG</Elem> | |
383 | 360 | <Elem>libmorfeusz_EXPORTS</Elem> |
384 | 361 | </preprocessorList> |
385 | 362 | </ccTool> |
... | ... | @@ -410,13 +387,14 @@ |
410 | 387 | </ccTool> |
411 | 388 | </item> |
412 | 389 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> |
413 | - <ccTool flags="2"> | |
390 | + <ccTool flags="1"> | |
414 | 391 | <incDir> |
415 | 392 | <pElem>build</pElem> |
416 | 393 | <pElem>morfeusz</pElem> |
417 | 394 | <pElem>build/morfeusz</pElem> |
418 | 395 | </incDir> |
419 | 396 | <preprocessorList> |
397 | + <Elem>NDEBUG</Elem> | |
420 | 398 | <Elem>libmorfeusz_EXPORTS</Elem> |
421 | 399 | </preprocessorList> |
422 | 400 | </ccTool> |
... | ... | @@ -427,6 +405,11 @@ |
427 | 405 | ex="false" |
428 | 406 | tool="1" |
429 | 407 | flavor2="4"> |
408 | + <ccTool flags="1"> | |
409 | + <preprocessorList> | |
410 | + <Elem>NDEBUG</Elem> | |
411 | + </preprocessorList> | |
412 | + </ccTool> | |
430 | 413 | </item> |
431 | 414 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
432 | 415 | </item> |
... | ... | @@ -436,8 +419,6 @@ |
436 | 419 | flavor2="4"> |
437 | 420 | </item> |
438 | 421 | <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> |
439 | - <ccTool flags="1"> | |
440 | - </ccTool> | |
441 | 422 | </item> |
442 | 423 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
443 | 424 | <ccTool flags="1"> |
... | ... |