Commit f80dea595a7fb0c3ef6f9dea0075249a41c6f86b
1 parent
f3f17708
dalsza optymalizacja kodu
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@181 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 21 changed files with 572 additions and 434 deletions
morfeusz/CMakeLists.txt
morfeusz/CasePatternHelper.hpp
... | ... | @@ -12,6 +12,9 @@ |
12 | 12 | #include "InterpsGroup.hpp" |
13 | 13 | #include "CasePatternHelper.hpp" |
14 | 14 | #include "compressionByteUtils.hpp" |
15 | +#include "Environment.hpp" | |
16 | + | |
17 | +class Environment; | |
15 | 18 | |
16 | 19 | class CasePatternHelper { |
17 | 20 | public: |
... | ... | @@ -39,64 +42,17 @@ public: |
39 | 42 | } |
40 | 43 | |
41 | 44 | bool checkInterpsGroupOrthCasePatterns( |
42 | - const std::vector<uint32_t>& lowercaseCodepoints, | |
43 | - const std::vector<uint32_t>& originalCodepoints, | |
44 | - const InterpsGroup& ig) const { | |
45 | - const unsigned char* currPtr = ig.ptr; | |
46 | - unsigned char compressionByte = *currPtr++; | |
47 | - if (!this->caseSensitive) { | |
48 | - return true; | |
49 | - } | |
50 | - else if (isOrthOnlyLower(compressionByte)) { | |
51 | - return true; | |
52 | - } | |
53 | - else if (isOrthOnlyTitle(compressionByte)) { | |
54 | - return lowercaseCodepoints[0] != originalCodepoints[0]; | |
55 | - } | |
56 | - else { | |
57 | - unsigned char casePatternsNum = *currPtr++; | |
58 | - if (casePatternsNum == 0) { | |
59 | - return true; | |
60 | - } | |
61 | - else { | |
62 | - for (unsigned int i = 0; i < casePatternsNum; i++) { | |
63 | - if (checkCasePattern( | |
64 | - lowercaseCodepoints, | |
65 | - originalCodepoints, | |
66 | - deserializeOneCasePattern(currPtr))) { | |
67 | - return true; | |
68 | - } | |
69 | - } | |
70 | - return false; | |
71 | - } | |
72 | - } | |
73 | - } | |
45 | + const Environment& env, | |
46 | + const char* orthStart, | |
47 | + const char* orthEnd, | |
48 | + const InterpsGroup& ig) const; | |
74 | 49 | |
75 | - std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { | |
76 | - std::vector<bool> res; | |
77 | - uint8_t casePatternType = *ptr++; | |
78 | - uint8_t prefixLength; | |
79 | - uint8_t patternLength; | |
80 | - switch (casePatternType) { | |
81 | - case LEMMA_ONLY_LOWER: | |
82 | - break; | |
83 | - case LEMMA_UPPER_PREFIX: | |
84 | - prefixLength = *ptr++; | |
85 | - res.resize(prefixLength, true); | |
86 | - break; | |
87 | - case LEMMA_MIXED_CASE: | |
88 | - patternLength = *ptr++; | |
89 | - for (unsigned int i = 0; i < patternLength; i++) { | |
90 | - uint8_t idx = *ptr++; | |
91 | - res.resize(idx + 1, false); | |
92 | - res[idx] = true; | |
93 | - } | |
94 | - break; | |
95 | - } | |
96 | - return res; | |
97 | - } | |
50 | + static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr); | |
98 | 51 | private: |
99 | 52 | bool caseSensitive; |
53 | + | |
54 | + mutable vector<uint32_t> orthCodepoints; | |
55 | + mutable vector<uint32_t> normalizedCodepoints; | |
100 | 56 | |
101 | 57 | static const uint8_t LEMMA_ONLY_LOWER = 0; |
102 | 58 | static const uint8_t LEMMA_UPPER_PREFIX = 1; |
... | ... |
morfeusz/Environment.cpp
... | ... | @@ -8,9 +8,11 @@ |
8 | 8 | #include <vector> |
9 | 9 | #include <algorithm> |
10 | 10 | #include "Environment.hpp" |
11 | -#include "InterpretedChunksDecoder.hpp" | |
11 | +#include "decoder/InterpretedChunksDecoder.hpp" | |
12 | 12 | #include "MorphDeserializer.hpp" |
13 | 13 | #include "exceptions.hpp" |
14 | +#include "decoder/InterpretedChunksDecoder4Analyzer.hpp" | |
15 | +#include "decoder/InterpretedChunksDecoder4Generator.hpp" | |
14 | 16 | |
15 | 17 | //class InterpretedChunksDecoder4Analyzer; |
16 | 18 | //class InterpretedChunksDecoder4Generator; |
... | ... | @@ -53,7 +55,7 @@ processorType == ANALYZER |
53 | 55 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) |
54 | 56 | : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), |
55 | 57 | processorType(processorType), |
56 | -casePatternHelper() { | |
58 | +casePatternHelper(new CasePatternHelper()) { | |
57 | 59 | } |
58 | 60 | |
59 | 61 | const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { |
... | ... | @@ -78,6 +80,7 @@ Environment::~Environment() { |
78 | 80 | delete this->fsaFileStartPtr; |
79 | 81 | } |
80 | 82 | delete this->chunksDecoder; |
83 | + delete this->casePatternHelper; | |
81 | 84 | } |
82 | 85 | |
83 | 86 | void Environment::setCharset(MorfeuszCharset charset) { |
... | ... | @@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const { |
146 | 149 | } |
147 | 150 | |
148 | 151 | void Environment::setCaseSensitive(bool caseSensitive) { |
149 | - this->casePatternHelper.setCaseSensitive(caseSensitive); | |
152 | + this->casePatternHelper->setCaseSensitive(caseSensitive); | |
150 | 153 | } |
151 | 154 | |
152 | 155 | const CasePatternHelper& Environment::getCasePatternHelper() const { |
153 | - return this->casePatternHelper; | |
156 | + return *this->casePatternHelper; | |
154 | 157 | } |
155 | 158 | |
156 | 159 | const Qualifiers& Environment::getQualifiersHelper() const { |
... | ... |
morfeusz/Environment.hpp
... | ... | @@ -11,6 +11,7 @@ |
11 | 11 | #include <vector> |
12 | 12 | |
13 | 13 | class InterpretedChunksDecoder; |
14 | +class CasePatternHelper; | |
14 | 15 | |
15 | 16 | #include "charset/CaseConverter.hpp" |
16 | 17 | #include "charset/CharsetConverter.hpp" |
... | ... | @@ -79,7 +80,7 @@ private: |
79 | 80 | |
80 | 81 | const InterpretedChunksDecoder* chunksDecoder; |
81 | 82 | MorfeuszProcessorType processorType; |
82 | - CasePatternHelper casePatternHelper; | |
83 | + CasePatternHelper* casePatternHelper; | |
83 | 84 | |
84 | 85 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; |
85 | 86 | }; |
... | ... |
morfeusz/InflexionGraph.cpp
... | ... | @@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea |
78 | 78 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
79 | 79 | } |
80 | 80 | else { |
81 | - Edge e = {chunk, (int) this->graph.size() + 1}; | |
81 | + Edge e = {chunk, (unsigned long) this->graph.size() + 1}; | |
82 | 82 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
83 | 83 | } |
84 | 84 | } |
... | ... | @@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I |
117 | 117 | for (unsigned int i = 0; i < edges.size(); i++) { |
118 | 118 | const InflexionGraph::Edge& e1 = edges[i]; |
119 | 119 | if (e1.chunk.textStartPtr == e.chunk.textStartPtr |
120 | - && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints | |
120 | + && e1.chunk.textStartPtr == e.chunk.textStartPtr | |
121 | + && e1.chunk.textEndPtr == e.chunk.textEndPtr | |
121 | 122 | && e1.chunk.segmentType == e.chunk.segmentType |
122 | 123 | && e1.nextNode == e.nextNode) { |
123 | 124 | return true; |
... | ... |
morfeusz/InflexionGraph.hpp
morfeusz/InterpretedChunk.hpp
... | ... | @@ -15,8 +15,6 @@ struct InterpretedChunk { |
15 | 15 | unsigned char segmentType; |
16 | 16 | const char* textStartPtr; |
17 | 17 | const char* textEndPtr; |
18 | - std::vector<uint32_t> originalCodepoints; | |
19 | - std::vector<uint32_t> lowercaseCodepoints; | |
20 | 18 | const unsigned char* interpsGroupPtr; |
21 | 19 | const unsigned char* interpsPtr; |
22 | 20 | const unsigned char* interpsEndPtr; |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp deleted
1 | -/* | |
2 | - * File: InterpsGroupDecoder.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on November 22, 2013, 10:35 PM | |
6 | - */ | |
7 | - | |
8 | -#ifndef INTERPSGROUPDECODER_HPP | |
9 | -#define INTERPSGROUPDECODER_HPP | |
10 | - | |
11 | -#include <string> | |
12 | -#include <vector> | |
13 | -#include <utility> | |
14 | - | |
15 | -#include "charset/CharsetConverter.hpp" | |
16 | -#include "EncodedInterpretation.hpp" | |
17 | -#include "InterpretedChunk.hpp" | |
18 | -#include "EncodedInterpretation.hpp" | |
19 | -#include "charset/CaseConverter.hpp" | |
20 | -#include "Environment.hpp" | |
21 | -#include "MorphInterpretation.hpp" | |
22 | -#include "CasePatternHelper.hpp" | |
23 | -#include "deserializationUtils.hpp" | |
24 | -#include "compressionByteUtils.hpp" | |
25 | -#include "const.hpp" | |
26 | - | |
27 | -class InterpretedChunksDecoder { | |
28 | -public: | |
29 | - | |
30 | - InterpretedChunksDecoder(const Environment& env) | |
31 | - : env(env) { | |
32 | - } | |
33 | - | |
34 | - virtual ~InterpretedChunksDecoder() { | |
35 | - } | |
36 | - | |
37 | - virtual void decode( | |
38 | - unsigned int startNode, | |
39 | - unsigned int endNode, | |
40 | - const InterpretedChunk& interpretedChunk, | |
41 | - std::vector<MorphInterpretation>& out) const = 0; | |
42 | - | |
43 | -protected: | |
44 | - | |
45 | - const Environment& env; | |
46 | -}; | |
47 | - | |
48 | -class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | |
49 | -public: | |
50 | - | |
51 | - InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | |
52 | - } | |
53 | - | |
54 | - void decode( | |
55 | - unsigned int startNode, | |
56 | - unsigned int endNode, | |
57 | - const InterpretedChunk& interpretedChunk, | |
58 | - std::vector<MorphInterpretation>& out) const { | |
59 | - string orth; | |
60 | - string lemmaPrefix; | |
61 | - if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { | |
62 | - orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
63 | - const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
64 | - while (currPtr < interpretedChunk.interpsEndPtr) { | |
65 | - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); | |
66 | - } | |
67 | - } | |
68 | - } | |
69 | - | |
70 | -protected: | |
71 | - | |
72 | - void decodeForm( | |
73 | - const vector<uint32_t>& orth, | |
74 | - const EncodedForm& lemma, | |
75 | - bool forPrefix, | |
76 | - string& res) const { | |
77 | - for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { | |
78 | - uint32_t cp = | |
79 | - (i < lemma.casePattern.size() && lemma.casePattern[i]) | |
80 | - ? env.getCaseConverter().toTitle(orth[i]) | |
81 | - : orth[i]; | |
82 | - env.getCharsetConverter().append(cp, res); | |
83 | - } | |
84 | - if (!forPrefix) { | |
85 | - const char* suffixPtr = lemma.suffixToAdd.c_str(); | |
86 | - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | |
87 | - while (suffixPtr != suffixEnd) { | |
88 | - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
89 | - env.getCharsetConverter().append(cp, res); | |
90 | - } | |
91 | - } | |
92 | - } | |
93 | - | |
94 | - void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { | |
95 | - encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) | |
96 | - ? getPrefixCutLength(compressionByte) | |
97 | - : readInt8(ptr); | |
98 | - encodedForm.suffixToCut = readInt8(ptr); | |
99 | - encodedForm.suffixToAdd = readString(ptr); | |
100 | - assert(encodedForm.casePattern.size() == 0); | |
101 | - if (isLemmaOnlyLower(compressionByte)) { | |
102 | - encodedForm.casePattern = std::vector<bool>(); | |
103 | - } else if (isLemmaOnlyTitle(compressionByte)) { | |
104 | - encodedForm.casePattern = std::vector<bool>(); | |
105 | - encodedForm.casePattern.push_back(true); | |
106 | - } else { | |
107 | - encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
108 | - } | |
109 | - } | |
110 | - | |
111 | - EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { | |
112 | - EncodedInterpretation interp; | |
113 | - if (isOrthOnlyLower(compressionByte)) { | |
114 | - } else if (isOrthOnlyTitle(compressionByte)) { | |
115 | - interp.orthCasePattern.push_back(true); | |
116 | - } else { | |
117 | - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
118 | - } | |
119 | - deserializeEncodedForm(ptr, compressionByte, interp.value); | |
120 | - interp.tag = readInt16(ptr); | |
121 | - interp.nameClassifier = *ptr++; | |
122 | - interp.qualifiers = readInt16(ptr); | |
123 | - return interp; | |
124 | - } | |
125 | -private: | |
126 | - | |
127 | - pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { | |
128 | - vector<string> splitRes(split(lemma, ':')); | |
129 | - if (splitRes.size() == 2) { | |
130 | - return make_pair(splitRes[0], splitRes[1]); | |
131 | - } else { | |
132 | - return make_pair(lemma, ""); | |
133 | - } | |
134 | - } | |
135 | - | |
136 | - void decodeMorphInterpretation( | |
137 | - unsigned int startNode, unsigned int endNode, | |
138 | - const string& orth, | |
139 | - const string& lemmaPrefix, | |
140 | - const InterpretedChunk& chunk, | |
141 | - bool forPrefix, | |
142 | - const unsigned char*& ptr, | |
143 | - std::vector<MorphInterpretation>& out) const { | |
144 | - string lemma = lemmaPrefix; | |
145 | - EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); | |
146 | - this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma); | |
147 | - if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) { | |
148 | - // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); | |
149 | - out.push_back(MorphInterpretation( | |
150 | - startNode, endNode, | |
151 | - orth, lemma, | |
152 | - // "", | |
153 | - ei.tag, | |
154 | - ei.nameClassifier, | |
155 | - ei.qualifiers, | |
156 | - env)); | |
157 | - } | |
158 | - } | |
159 | - | |
160 | - bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { | |
161 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
162 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
163 | - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
164 | - const unsigned char* ptr = prefixChunk.interpsPtr; | |
165 | - std::vector<MorphInterpretation> mi; | |
166 | - // env.getCasePatternHelper().skipCasePattern(ptr); | |
167 | - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); | |
168 | - if (!mi.empty()) { | |
169 | - lemmaPrefix += mi[0].getLemma(); | |
170 | - } else { | |
171 | - return false; | |
172 | - } | |
173 | - } | |
174 | - return true; | |
175 | - } | |
176 | -}; | |
177 | - | |
178 | -class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | |
179 | -public: | |
180 | - | |
181 | - InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | |
182 | - } | |
183 | - | |
184 | - void decode( | |
185 | - unsigned int startNode, | |
186 | - unsigned int endNode, | |
187 | - const InterpretedChunk& interpretedChunk, | |
188 | - std::vector<MorphInterpretation>& out) const { | |
189 | - string orthPrefix; | |
190 | - string lemma; | |
191 | - convertPrefixes(interpretedChunk, orthPrefix, lemma); | |
192 | - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
193 | - const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
194 | - while (currPtr < interpretedChunk.interpsEndPtr) { | |
195 | - MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | |
196 | - // cerr << mi.toString(false) << endl; | |
197 | - // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | |
198 | - if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) { | |
199 | - out.push_back(mi); | |
200 | - } | |
201 | - } | |
202 | - } | |
203 | - | |
204 | -private: | |
205 | - | |
206 | - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const { | |
207 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
208 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
209 | - lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | |
210 | - const unsigned char* ptr = prefixChunk.interpsPtr; | |
211 | - MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); | |
212 | - orthPrefix += mi.getOrth(); | |
213 | - } | |
214 | - } | |
215 | - | |
216 | - MorphInterpretation decodeMorphInterpretation( | |
217 | - unsigned int startNode, unsigned int endNode, | |
218 | - const string& orthPrefix, | |
219 | - const string& lemma, | |
220 | - const InterpretedChunk& chunk, | |
221 | - const unsigned char*& ptr) const { | |
222 | - string orth = orthPrefix; | |
223 | - EncodedInterpretation ei = this->deserializeInterp(ptr); | |
224 | - this->decodeForm(chunk.originalCodepoints, ei.value, orth); | |
225 | - return MorphInterpretation( | |
226 | - startNode, endNode, | |
227 | - orth, lemma + HOMONYM_SEPARATOR + ei.homonymId, | |
228 | - // ei.homonymId, | |
229 | - ei.tag, | |
230 | - ei.nameClassifier, | |
231 | - ei.qualifiers, | |
232 | - env); | |
233 | - } | |
234 | - | |
235 | - void decodeForm( | |
236 | - const vector<uint32_t>& lemma, | |
237 | - const EncodedForm& orth, | |
238 | - string& res) const { | |
239 | - res += orth.prefixToAdd; | |
240 | - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | |
241 | - env.getCharsetConverter().append(lemma[i], res); | |
242 | - } | |
243 | - const char* suffixPtr = orth.suffixToAdd.c_str(); | |
244 | - const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | |
245 | - while (suffixPtr != suffixEnd) { | |
246 | - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
247 | - env.getCharsetConverter().append(cp, res); | |
248 | - } | |
249 | - } | |
250 | - | |
251 | - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | |
252 | - EncodedInterpretation interp; | |
253 | - interp.homonymId = readString(ptr); | |
254 | - interp.value.prefixToAdd = readString(ptr); | |
255 | - interp.value.suffixToCut = readInt8(ptr); | |
256 | - interp.value.suffixToAdd = readString(ptr); | |
257 | - interp.tag = readInt16(ptr); | |
258 | - interp.nameClassifier = readInt8(ptr); | |
259 | - interp.qualifiers = readInt16(ptr); | |
260 | - return interp; | |
261 | - } | |
262 | -}; | |
263 | - | |
264 | -#endif /* INTERPSGROUPDECODER_HPP */ | |
265 | - |
morfeusz/Morfeusz.cpp
... | ... | @@ -12,7 +12,7 @@ |
12 | 12 | #include "data/default_fsa.hpp" |
13 | 13 | #include "Morfeusz.hpp" |
14 | 14 | #include "MorphDeserializer.hpp" |
15 | -#include "InterpretedChunksDecoder.hpp" | |
15 | +#include "decoder/InterpretedChunksDecoder.hpp" | |
16 | 16 | #include "charset/CharsetConverter.hpp" |
17 | 17 | #include "charset/charset_utils.hpp" |
18 | 18 | #include "charset/CaseConverter.hpp" |
... | ... | @@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() { |
34 | 34 | return res; |
35 | 35 | } |
36 | 36 | |
37 | +static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | |
38 | + to.prefixChunks.insert( | |
39 | + to.prefixChunks.begin(), | |
40 | + from.prefixChunks.begin(), | |
41 | + from.prefixChunks.end()); | |
42 | + to.prefixChunks.push_back(from); | |
43 | + to.textStartPtr = from.textStartPtr; | |
44 | + from.orthWasShifted = true; | |
45 | +} | |
46 | + | |
47 | +static string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | |
48 | + stringstream res; | |
49 | + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | |
50 | + return res.str(); | |
51 | +} | |
52 | + | |
53 | +static string debugAccum(vector<InterpretedChunk>& accum) { | |
54 | + stringstream res; | |
55 | + for (unsigned int i = 0; i < accum.size(); i++) { | |
56 | + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | |
57 | + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | |
58 | + } | |
59 | + return res.str(); | |
60 | +} | |
61 | + | |
62 | +static void feedStateDirectly( | |
63 | + StateType& state, | |
64 | + const char* inputStart, | |
65 | + const char* inputEnd) { | |
66 | + const char* currInput = inputStart; | |
67 | + while (currInput != inputEnd && !state.isSink()) { | |
68 | + state.proceedToNext(*currInput++); | |
69 | + } | |
70 | +} | |
71 | + | |
72 | +static void feedState( | |
73 | + StateType& state, | |
74 | + int codepoint) { | |
75 | + std::string chars; | |
76 | + UTF8CharsetConverter::getInstance().append(codepoint, chars); | |
77 | + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { | |
78 | + state.proceedToNext(chars[i]); | |
79 | + } | |
80 | +} | |
81 | + | |
37 | 82 | Morfeusz::Morfeusz() |
38 | 83 | : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), |
39 | 84 | generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), |
... | ... | @@ -97,11 +142,12 @@ void Morfeusz::processOneWord( |
97 | 142 | if (!graph.empty()) { |
98 | 143 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
99 | 144 | int srcNode = startNodeNum; |
100 | - for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) { | |
101 | - const vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i]; | |
145 | + const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph(); | |
146 | + for (unsigned int i = 0; i < theGraph.size(); i++) { | |
147 | + const vector<InflexionGraph::Edge>& edges = theGraph[i]; | |
102 | 148 | for (unsigned int j = 0; j < edges.size(); j++) { |
103 | 149 | const InflexionGraph::Edge& e = edges[j]; |
104 | - int targetNode = startNodeNum + e.nextNode; | |
150 | + unsigned long targetNode = startNodeNum + e.nextNode; | |
105 | 151 | interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); |
106 | 152 | } |
107 | 153 | srcNode++; |
... | ... | @@ -118,56 +164,11 @@ void Morfeusz::processOneWord( |
118 | 164 | inputStart = currInput; |
119 | 165 | } |
120 | 166 | |
121 | -static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | |
122 | - to.prefixChunks.insert( | |
123 | - to.prefixChunks.begin(), | |
124 | - from.prefixChunks.begin(), | |
125 | - from.prefixChunks.end()); | |
126 | - to.prefixChunks.push_back(from); | |
127 | - from.orthWasShifted = true; | |
128 | - to.textStartPtr = from.textStartPtr; | |
129 | -} | |
130 | - | |
131 | -static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | |
132 | - stringstream res; | |
133 | - res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | |
134 | - return res.str(); | |
135 | -} | |
136 | - | |
137 | -static inline string debugAccum(vector<InterpretedChunk>& accum) { | |
138 | - stringstream res; | |
139 | - for (unsigned int i = 0; i < accum.size(); i++) { | |
140 | - res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | |
141 | - // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | |
142 | - } | |
143 | - return res.str(); | |
144 | -} | |
145 | - | |
146 | -static inline void feedStateDirectly( | |
147 | - StateType& state, | |
148 | - const char* inputStart, | |
149 | - const char* inputEnd) { | |
150 | - const char* currInput = inputStart; | |
151 | - while (currInput != inputEnd && !state.isSink()) { | |
152 | - state.proceedToNext(*currInput++); | |
153 | - } | |
154 | -} | |
155 | - | |
156 | -static inline void feedState( | |
157 | - StateType& state, | |
158 | - int codepoint) { | |
159 | - std::string chars; | |
160 | - UTF8CharsetConverter::getInstance().append(codepoint, chars); | |
161 | - for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { | |
162 | - state.proceedToNext(chars[i]); | |
163 | - } | |
164 | -} | |
165 | - | |
166 | 167 | void Morfeusz::doProcessOneWord( |
167 | 168 | const Environment& env, |
168 | 169 | const char*& inputData, |
169 | 170 | const char* inputEnd, |
170 | - SegrulesState segrulesState) const { | |
171 | + const SegrulesState& segrulesState) const { | |
171 | 172 | if (this->options.debug) { |
172 | 173 | cerr << "----------" << endl; |
173 | 174 | cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; |
... | ... | @@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord( |
178 | 179 | const char* currInput = inputData; |
179 | 180 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
180 | 181 | bool currCodepointIsWhitespace = isWhitespace(codepoint); |
181 | - vector<uint32_t> originalCodepoints; | |
182 | - vector<uint32_t> normalizedCodepoints; | |
183 | - | |
184 | - originalCodepoints.reserve(16); | |
185 | - normalizedCodepoints.reserve(16); | |
186 | 182 | |
187 | 183 | StateType state = env.getFSA().getInitialState(); |
188 | 184 | |
... | ... | @@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord( |
190 | 186 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER |
191 | 187 | ? env.getCaseConverter().toLower(codepoint) |
192 | 188 | : codepoint; |
193 | - originalCodepoints.push_back(codepoint); | |
194 | - normalizedCodepoints.push_back(normalizedCodepoint); | |
195 | 189 | if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { |
196 | 190 | feedStateDirectly(state, prevInput, currInput); |
197 | 191 | } |
... | ... | @@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord( |
203 | 197 | currCodepointIsWhitespace = isWhitespace(codepoint); |
204 | 198 | string homonymId; |
205 | 199 | if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { |
206 | - if (originalCodepoints.size() == 1) { | |
207 | - throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); | |
208 | - } | |
209 | 200 | homonymId = string(currInput + 1, inputEnd); |
210 | - // cerr << "homonym " << homonymId << endl; | |
211 | 201 | prevInput = currInput; |
212 | 202 | currInput = inputEnd; |
213 | 203 | codepoint = 0x00; |
214 | 204 | currCodepointIsWhitespace = true; |
215 | 205 | } |
216 | 206 | if (state.isAccepting()) { |
217 | - vector<InterpsGroup> val(state.getValue()); | |
218 | - for (unsigned int i = 0; i < val.size(); i++) { | |
219 | - InterpsGroup& ig = val[i]; | |
207 | +// vector<InterpsGroup> val(state.getValue()); | |
208 | + for (unsigned int i = 0; i < state.getValue().size(); i++) { | |
209 | + const InterpsGroup& ig = state.getValue()[i]; | |
220 | 210 | if (this->options.debug) { |
221 | 211 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
222 | 212 | } |
223 | - vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); | |
213 | + const vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); | |
224 | 214 | if (!newSegrulesStates.empty() |
225 | - && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { | |
226 | - | |
227 | - for ( | |
228 | - vector<SegrulesState>::iterator it = newSegrulesStates.begin(); | |
229 | - it != newSegrulesStates.end(); | |
230 | - ++it) { | |
231 | - SegrulesState newSegrulesState = *it; | |
215 | + && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, inputStart, currInput, ig)) { | |
216 | + for (unsigned int i = 0; i < newSegrulesStates.size(); i++) { | |
217 | + const SegrulesState& newSegrulesState = newSegrulesStates[i]; | |
232 | 218 | const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); |
233 | 219 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; |
234 | - InterpretedChunk ic = { | |
235 | - ig.type, | |
236 | - inputStart, | |
237 | - currInput, | |
238 | - originalCodepoints, | |
239 | - normalizedCodepoints, | |
240 | - ig.ptr, | |
241 | - interpsPtr, | |
242 | - interpsEndPtr, | |
243 | - newSegrulesState.shiftOrthFromPrevious, | |
244 | - false, | |
245 | - vector<InterpretedChunk>(), | |
246 | - homonymId | |
247 | - }; | |
220 | + InterpretedChunk ic; | |
221 | + ic.segmentType = ig.type; | |
222 | + ic.textStartPtr = inputStart; | |
223 | + ic.textEndPtr = currInput; | |
224 | + ic.interpsGroupPtr = ig.ptr; | |
225 | + ic.interpsPtr = interpsPtr; | |
226 | + ic.interpsEndPtr = interpsEndPtr; | |
227 | + ic.shiftOrth = newSegrulesState.shiftOrthFromPrevious; | |
228 | + ic.orthWasShifted = false; | |
229 | + ic.requiredHomonymId = homonymId; | |
230 | + | |
248 | 231 | if (!accum.empty() && accum.back().shiftOrth) { |
249 | 232 | doShiftOrth(accum.back(), ic); |
250 | 233 | } |
... | ... | @@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord( |
266 | 249 | } |
267 | 250 | } |
268 | 251 | else if (this->options.debug) { |
269 | - cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; | |
252 | +// cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; | |
270 | 253 | cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; |
271 | 254 | } |
272 | 255 | } |
... | ... |
morfeusz/Morfeusz.hpp
morfeusz/decoder/InterpretedChunksDecoder.hpp
0 → 100644
1 | +/* | |
2 | + * File: InterpsGroupDecoder.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on November 22, 2013, 10:35 PM | |
6 | + */ | |
7 | + | |
8 | +#ifndef INTERPSGROUPDECODER_HPP | |
9 | +#define INTERPSGROUPDECODER_HPP | |
10 | + | |
11 | +#include <string> | |
12 | +#include <vector> | |
13 | +#include <utility> | |
14 | + | |
15 | +#include "charset/CharsetConverter.hpp" | |
16 | +#include "EncodedInterpretation.hpp" | |
17 | +#include "InterpretedChunk.hpp" | |
18 | +#include "EncodedInterpretation.hpp" | |
19 | +#include "charset/CaseConverter.hpp" | |
20 | +#include "Environment.hpp" | |
21 | +#include "MorphInterpretation.hpp" | |
22 | +#include "CasePatternHelper.hpp" | |
23 | +#include "deserializationUtils.hpp" | |
24 | +#include "compressionByteUtils.hpp" | |
25 | +#include "const.hpp" | |
26 | + | |
27 | +class InterpretedChunksDecoder { | |
28 | +public: | |
29 | + | |
30 | + InterpretedChunksDecoder(const Environment& env): env(env) { | |
31 | + } | |
32 | + | |
33 | + virtual ~InterpretedChunksDecoder() { | |
34 | + } | |
35 | + | |
36 | + virtual void decode( | |
37 | + unsigned int startNode, | |
38 | + unsigned int endNode, | |
39 | + const InterpretedChunk& interpretedChunk, | |
40 | + std::vector<MorphInterpretation>& out) const = 0; | |
41 | + | |
42 | +protected: | |
43 | + | |
44 | + const Environment& env; | |
45 | +}; | |
46 | + | |
47 | +#endif /* INTERPSGROUPDECODER_HPP */ | |
48 | + | |
... | ... |
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp
0 → 100644
1 | +/* | |
2 | + * File: InterpretedChunksDecoder4Analyzer.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 15 maj 2014, 15:28 | |
6 | + */ | |
7 | + | |
8 | +#include "InterpretedChunksDecoder4Analyzer.hpp" | |
9 | +#include <string> | |
10 | + | |
11 | +using namespace std; | |
12 | + | |
13 | +InterpretedChunksDecoder4Analyzer::InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | |
14 | +} | |
15 | + | |
16 | +void InterpretedChunksDecoder4Analyzer::decode( | |
17 | + unsigned int startNode, | |
18 | + unsigned int endNode, | |
19 | + const InterpretedChunk& interpretedChunk, | |
20 | + std::vector<MorphInterpretation>& out) const { | |
21 | + string orth; | |
22 | + string lemmaPrefix; | |
23 | + if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { | |
24 | + // orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
25 | + orth.insert(orth.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr); | |
26 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
27 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
28 | + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); | |
29 | + } | |
30 | + } | |
31 | +} | |
32 | + | |
33 | +void InterpretedChunksDecoder4Analyzer::decodeLemma( | |
34 | + const vector<uint32_t>& orth, | |
35 | + const EncodedForm& lemma, | |
36 | + bool forPrefix, | |
37 | + string& res) const { | |
38 | + for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { | |
39 | + uint32_t cp = | |
40 | + (i < lemma.casePattern.size() && lemma.casePattern[i]) | |
41 | + ? env.getCaseConverter().toTitle(orth[i]) | |
42 | + : orth[i]; | |
43 | + env.getCharsetConverter().append(cp, res); | |
44 | + } | |
45 | + if (!forPrefix) { | |
46 | + const char* suffixPtr = lemma.suffixToAdd.c_str(); | |
47 | + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | |
48 | + while (suffixPtr != suffixEnd) { | |
49 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
50 | + env.getCharsetConverter().append(cp, res); | |
51 | + } | |
52 | + } | |
53 | +} | |
54 | + | |
55 | +void InterpretedChunksDecoder4Analyzer::deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { | |
56 | + encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) | |
57 | + ? getPrefixCutLength(compressionByte) | |
58 | + : readInt8(ptr); | |
59 | + encodedForm.suffixToCut = readInt8(ptr); | |
60 | + encodedForm.suffixToAdd = readString(ptr); | |
61 | + assert(encodedForm.casePattern.size() == 0); | |
62 | + if (isLemmaOnlyLower(compressionByte)) { | |
63 | +// encodedForm.casePattern = std::vector<bool>(); | |
64 | + } | |
65 | + else if (isLemmaOnlyTitle(compressionByte)) { | |
66 | +// encodedForm.casePattern = std::vector<bool>(); | |
67 | + encodedForm.casePattern.push_back(true); | |
68 | + } | |
69 | + else { | |
70 | + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
71 | + } | |
72 | +} | |
73 | + | |
74 | +EncodedInterpretation InterpretedChunksDecoder4Analyzer::deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { | |
75 | + EncodedInterpretation interp; | |
76 | + if (isOrthOnlyLower(compressionByte)) { | |
77 | + } | |
78 | + else if (isOrthOnlyTitle(compressionByte)) { | |
79 | + interp.orthCasePattern.push_back(true); | |
80 | + } | |
81 | + else { | |
82 | + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
83 | + } | |
84 | + deserializeEncodedForm(ptr, compressionByte, interp.value); | |
85 | + interp.tag = readInt16(ptr); | |
86 | + interp.nameClassifier = *ptr++; | |
87 | + interp.qualifiers = readInt16(ptr); | |
88 | + return interp; | |
89 | +} | |
90 | + | |
91 | +void InterpretedChunksDecoder4Analyzer::decodeMorphInterpretation( | |
92 | + unsigned int startNode, unsigned int endNode, | |
93 | + const string& orth, | |
94 | + const string& lemmaPrefix, | |
95 | + const InterpretedChunk& chunk, | |
96 | + bool forPrefix, | |
97 | + const unsigned char*& ptr, | |
98 | + std::vector<MorphInterpretation>& out) const { | |
99 | + string lemma(lemmaPrefix); | |
100 | + orthCodepoints.clear(); | |
101 | + normalizedCodepoints.clear(); | |
102 | + const char* currPtr = chunk.textStartPtr; | |
103 | + while (currPtr != chunk.textEndPtr) { | |
104 | + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr); | |
105 | + orthCodepoints.push_back(cp); | |
106 | + normalizedCodepoints.push_back(env.getCaseConverter().toLower(cp)); | |
107 | + } | |
108 | + EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); | |
109 | + if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, orthCodepoints, ei.orthCasePattern)) { | |
110 | + this->decodeLemma(normalizedCodepoints, ei.value, forPrefix, lemma); | |
111 | + // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); | |
112 | + out.push_back(MorphInterpretation( | |
113 | + startNode, endNode, | |
114 | + orth, lemma, | |
115 | + // "", | |
116 | + ei.tag, | |
117 | + ei.nameClassifier, | |
118 | + ei.qualifiers, | |
119 | + env)); | |
120 | + } | |
121 | +} | |
122 | + | |
123 | +bool InterpretedChunksDecoder4Analyzer::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { | |
124 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
125 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
126 | + orth.insert(orth.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr); | |
127 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
128 | + std::vector<MorphInterpretation> mi; | |
129 | + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); | |
130 | + if (!mi.empty()) { | |
131 | + lemmaPrefix += mi[0].getLemma(); | |
132 | + } | |
133 | + else { | |
134 | + return false; | |
135 | + } | |
136 | + } | |
137 | + return true; | |
138 | +} | |
... | ... |
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp
0 → 100644
1 | +/* | |
2 | + * File: InterpretedChunksDecoder4Analyzer.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 15 maj 2014, 15:28 | |
6 | + */ | |
7 | + | |
8 | +#ifndef INTERPRETEDCHUNKSDECODER4ANALYZER_HPP | |
9 | +#define INTERPRETEDCHUNKSDECODER4ANALYZER_HPP | |
10 | + | |
11 | +#include "InterpretedChunksDecoder.hpp" | |
12 | + | |
13 | +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | |
14 | +public: | |
15 | + | |
16 | + InterpretedChunksDecoder4Analyzer(const Environment& env); | |
17 | + | |
18 | + void decode( | |
19 | + unsigned int startNode, | |
20 | + unsigned int endNode, | |
21 | + const InterpretedChunk& interpretedChunk, | |
22 | + std::vector<MorphInterpretation>& out) const; | |
23 | + | |
24 | +private: | |
25 | + | |
26 | + void decodeLemma( | |
27 | + const vector<uint32_t>& orth, | |
28 | + const EncodedForm& lemma, | |
29 | + bool forPrefix, | |
30 | + string& res) const; | |
31 | + | |
32 | + void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const; | |
33 | + | |
34 | + EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const; | |
35 | + | |
36 | + void decodeMorphInterpretation( | |
37 | + unsigned int startNode, unsigned int endNode, | |
38 | + const string& orth, | |
39 | + const string& lemmaPrefix, | |
40 | + const InterpretedChunk& chunk, | |
41 | + bool forPrefix, | |
42 | + const unsigned char*& ptr, | |
43 | + std::vector<MorphInterpretation>& out) const; | |
44 | + | |
45 | + bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const; | |
46 | + | |
47 | + mutable std::vector<uint32_t> orthCodepoints; | |
48 | + mutable std::vector<uint32_t> normalizedCodepoints; | |
49 | +}; | |
50 | + | |
51 | +#endif /* INTERPRETEDCHUNKSDECODER4ANALYZER_HPP */ | |
52 | + | |
... | ... |
morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp
0 → 100644
1 | +/* | |
2 | + * File: InterpretedChunksDecoder4Generator.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 15 maj 2014, 15:28 | |
6 | + */ | |
7 | + | |
8 | +#include "InterpretedChunksDecoder4Generator.hpp" | |
9 | +#include <string> | |
10 | +#include <vector> | |
11 | + | |
12 | +using namespace std; | |
13 | + | |
14 | +InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | |
15 | +} | |
16 | + | |
17 | +void InterpretedChunksDecoder4Generator::decode( | |
18 | + unsigned int startNode, | |
19 | + unsigned int endNode, | |
20 | + const InterpretedChunk& interpretedChunk, | |
21 | + std::vector<MorphInterpretation>& out) const { | |
22 | + string orthPrefix; | |
23 | + string lemma; | |
24 | + convertPrefixes(interpretedChunk, orthPrefix, lemma); | |
25 | + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
26 | + lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr); | |
27 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
28 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
29 | + MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | |
30 | + // cerr << mi.toString(false) << endl; | |
31 | + // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | |
32 | + if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) { | |
33 | + out.push_back(mi); | |
34 | + } | |
35 | + } | |
36 | +} | |
37 | + | |
38 | +void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const { | |
39 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | |
40 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | |
41 | + lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr); | |
42 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
43 | + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); | |
44 | + orthPrefix += mi.getOrth(); | |
45 | + } | |
46 | +} | |
47 | + | |
48 | +MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation( | |
49 | + unsigned int startNode, unsigned int endNode, | |
50 | + const string& orthPrefix, | |
51 | + const string& lemma, | |
52 | + const InterpretedChunk& chunk, | |
53 | + const unsigned char*& ptr) const { | |
54 | + string orth = orthPrefix; | |
55 | + EncodedInterpretation ei = this->deserializeInterp(ptr); | |
56 | + codepoints.clear(); | |
57 | + const char* currPtr = chunk.textStartPtr; | |
58 | + while (currPtr != chunk.textEndPtr) { | |
59 | + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr); | |
60 | + codepoints.push_back(cp); | |
61 | + } | |
62 | + this->decodeForm(codepoints, ei.value, orth); | |
63 | + return MorphInterpretation( | |
64 | + startNode, endNode, | |
65 | + orth, ei.homonymId.empty() ? lemma : (lemma + HOMONYM_SEPARATOR + ei.homonymId), | |
66 | + // ei.homonymId, | |
67 | + ei.tag, | |
68 | + ei.nameClassifier, | |
69 | + ei.qualifiers, | |
70 | + env); | |
71 | +} | |
72 | + | |
73 | +void InterpretedChunksDecoder4Generator::decodeForm( | |
74 | + const vector<uint32_t>& lemma, | |
75 | + const EncodedForm& orth, | |
76 | + string& res) const { | |
77 | + res += orth.prefixToAdd; | |
78 | + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | |
79 | + env.getCharsetConverter().append(lemma[i], res); | |
80 | + } | |
81 | + const char* suffixPtr = orth.suffixToAdd.c_str(); | |
82 | + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | |
83 | + while (suffixPtr != suffixEnd) { | |
84 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | |
85 | + env.getCharsetConverter().append(cp, res); | |
86 | + } | |
87 | +} | |
88 | + | |
89 | +EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const { | |
90 | + EncodedInterpretation interp; | |
91 | + interp.homonymId = readString(ptr); | |
92 | + interp.value.prefixToAdd = readString(ptr); | |
93 | + interp.value.suffixToCut = readInt8(ptr); | |
94 | + interp.value.suffixToAdd = readString(ptr); | |
95 | + interp.tag = readInt16(ptr); | |
96 | + interp.nameClassifier = readInt8(ptr); | |
97 | + interp.qualifiers = readInt16(ptr); | |
98 | + return interp; | |
99 | +} | |
... | ... |
morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp
0 → 100644
1 | +/* | |
2 | + * File: InterpretedChunksDecoder4Generator.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 15 maj 2014, 15:28 | |
6 | + */ | |
7 | + | |
8 | +#ifndef INTERPRETEDCHUNKSDECODER4GENERATOR_HPP | |
9 | +#define INTERPRETEDCHUNKSDECODER4GENERATOR_HPP | |
10 | + | |
11 | +#include "InterpretedChunksDecoder.hpp" | |
12 | + | |
13 | +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | |
14 | +public: | |
15 | + | |
16 | + InterpretedChunksDecoder4Generator(const Environment& env); | |
17 | + | |
18 | + void decode( | |
19 | + unsigned int startNode, | |
20 | + unsigned int endNode, | |
21 | + const InterpretedChunk& interpretedChunk, | |
22 | + std::vector<MorphInterpretation>& out) const; | |
23 | + | |
24 | +private: | |
25 | + | |
26 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const; | |
27 | + | |
28 | + MorphInterpretation decodeMorphInterpretation( | |
29 | + unsigned int startNode, unsigned int endNode, | |
30 | + const string& orthPrefix, | |
31 | + const string& lemma, | |
32 | + const InterpretedChunk& chunk, | |
33 | + const unsigned char*& ptr) const; | |
34 | + | |
35 | + void decodeForm( | |
36 | + const vector<uint32_t>& lemma, | |
37 | + const EncodedForm& orth, | |
38 | + string& res) const; | |
39 | + | |
40 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const; | |
41 | + | |
42 | + mutable std::vector<uint32_t> codepoints; | |
43 | +}; | |
44 | + | |
45 | + | |
46 | +#endif /* INTERPRETEDCHUNKSDECODER4GENERATOR_HPP */ | |
47 | + | |
... | ... |
morfeusz/fsa/fsa.hpp
morfeusz/fsa/state_impl.hpp
morfeusz/morfeusz_analyzer.cpp
... | ... | @@ -43,11 +43,20 @@ int main(int argc, const char** argv) { |
43 | 43 | else if (prevStart != -1) { |
44 | 44 | printf("; "); |
45 | 45 | } |
46 | - printf("%s", mi.toString(true).c_str()); | |
47 | -// printf("%d,%d,%s,%s,%s,%s", | |
48 | -// mi.getStartNode(), mi.getEndNode(), | |
49 | -// mi.getOrth().c_str(), lemmaToShow.c_str(), | |
50 | -// mi.getTag().c_str(), lemmaToShow.c_str()); | |
46 | +// printf("%s", mi.toString(true).c_str()); | |
47 | + printf("%d,%d,%s,%s,%s", | |
48 | + mi.getStartNode(), mi.getEndNode(), | |
49 | + mi.getOrth().c_str(), mi.getLemma().c_str(), | |
50 | + mi.getTag().c_str()); | |
51 | + if (!mi.getName().empty()) { | |
52 | + printf(",%s", mi.getName().c_str()); | |
53 | + } | |
54 | + if (!mi.getQualifiers().empty()) { | |
55 | + printf(",%s", mi.getQualifiers()[0].c_str()); | |
56 | + for (unsigned int i = 1; i < mi.getQualifiers().size(); i++) { | |
57 | + printf("|%s", mi.getQualifiers()[i].c_str()); | |
58 | + } | |
59 | + } | |
51 | 60 | prevStart = mi.getStartNode(); |
52 | 61 | prevEnd = mi.getEndNode(); |
53 | 62 | } |
... | ... |
morfeusz/segrules/SegrulesFSA.hpp
... | ... | @@ -34,12 +34,12 @@ public: |
34 | 34 | |
35 | 35 | std::vector<SegrulesState> proceedToNext( |
36 | 36 | const unsigned char segnum, |
37 | - const SegrulesState state, | |
37 | + const SegrulesState& state, | |
38 | 38 | bool atEndOfWord) const { |
39 | 39 | std::vector<SegrulesState> res; |
40 | 40 | const unsigned char* currPtr = ptr + state.offset + 1; |
41 | 41 | const unsigned char transitionsNum = *currPtr++; |
42 | - for (unsigned int i = 0; i < transitionsNum; i++) { | |
42 | + for (int i = 0; i < transitionsNum; i++) { | |
43 | 43 | if (*currPtr == segnum) { |
44 | 44 | SegrulesState newState = this->transition2State(currPtr); |
45 | 45 | if ((atEndOfWord && newState.accepting) |
... | ... |
nbproject/configurations.xml
... | ... | @@ -130,6 +130,8 @@ |
130 | 130 | </ccTool> |
131 | 131 | </item> |
132 | 132 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
133 | + <ccTool flags="1"> | |
134 | + </ccTool> | |
133 | 135 | </item> |
134 | 136 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
135 | 137 | ex="false" |
... | ... | @@ -239,6 +241,7 @@ |
239 | 241 | <pElem>build/morfeusz</pElem> |
240 | 242 | </incDir> |
241 | 243 | <preprocessorList> |
244 | + <Elem>NDEBUG</Elem> | |
242 | 245 | <Elem>libmorfeusz_EXPORTS</Elem> |
243 | 246 | </preprocessorList> |
244 | 247 | </ccTool> |
... | ... | @@ -283,7 +286,7 @@ |
283 | 286 | <ccTool> |
284 | 287 | <incDir> |
285 | 288 | <pElem>morfeusz</pElem> |
286 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
289 | + <pElem>/usr/lib/jvm/default-java/include</pElem> | |
287 | 290 | </incDir> |
288 | 291 | <preprocessorList> |
289 | 292 | <Elem>NDEBUG</Elem> |
... | ... | @@ -310,6 +313,19 @@ |
310 | 313 | </undefinedList> |
311 | 314 | </ccTool> |
312 | 315 | </folder> |
316 | + <item path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4"> | |
317 | + <ccTool flags="1"> | |
318 | + <incDir> | |
319 | + <pElem>build</pElem> | |
320 | + <pElem>morfeusz</pElem> | |
321 | + <pElem>build/morfeusz</pElem> | |
322 | + </incDir> | |
323 | + <preprocessorList> | |
324 | + <Elem>NDEBUG</Elem> | |
325 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
326 | + </preprocessorList> | |
327 | + </ccTool> | |
328 | + </item> | |
313 | 329 | <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> |
314 | 330 | <ccTool flags="1"> |
315 | 331 | <incDir> |
... | ... | @@ -387,40 +403,75 @@ |
387 | 403 | </ccTool> |
388 | 404 | </item> |
389 | 405 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> |
390 | - <ccTool flags="1"> | |
406 | + <ccTool flags="2"> | |
391 | 407 | <incDir> |
392 | 408 | <pElem>build</pElem> |
393 | 409 | <pElem>morfeusz</pElem> |
394 | 410 | <pElem>build/morfeusz</pElem> |
395 | 411 | </incDir> |
396 | 412 | <preprocessorList> |
397 | - <Elem>NDEBUG</Elem> | |
398 | 413 | <Elem>libmorfeusz_EXPORTS</Elem> |
399 | 414 | </preprocessorList> |
400 | 415 | </ccTool> |
401 | 416 | </item> |
402 | 417 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
418 | + <ccTool flags="2"> | |
419 | + </ccTool> | |
403 | 420 | </item> |
404 | 421 | <item path="morfeusz/charset/CharsetConverter.cpp" |
405 | 422 | ex="false" |
406 | 423 | tool="1" |
407 | 424 | flavor2="4"> |
408 | - <ccTool flags="1"> | |
409 | - <preprocessorList> | |
410 | - <Elem>NDEBUG</Elem> | |
411 | - </preprocessorList> | |
425 | + <ccTool flags="2"> | |
412 | 426 | </ccTool> |
413 | 427 | </item> |
414 | 428 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
429 | + <ccTool flags="2"> | |
430 | + </ccTool> | |
415 | 431 | </item> |
416 | 432 | <item path="morfeusz/charset/conversion_tables.cpp" |
417 | 433 | ex="false" |
418 | 434 | tool="1" |
419 | 435 | flavor2="4"> |
436 | + <ccTool flags="2"> | |
437 | + </ccTool> | |
420 | 438 | </item> |
421 | 439 | <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> |
440 | + <ccTool flags="1"> | |
441 | + </ccTool> | |
422 | 442 | </item> |
423 | 443 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
444 | + <ccTool flags="2"> | |
445 | + <incDir> | |
446 | + <pElem>build</pElem> | |
447 | + <pElem>morfeusz</pElem> | |
448 | + <pElem>build/morfeusz</pElem> | |
449 | + </incDir> | |
450 | + <preprocessorList> | |
451 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
452 | + </preprocessorList> | |
453 | + </ccTool> | |
454 | + </item> | |
455 | + <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp" | |
456 | + ex="false" | |
457 | + tool="1" | |
458 | + flavor2="4"> | |
459 | + <ccTool flags="1"> | |
460 | + <incDir> | |
461 | + <pElem>build</pElem> | |
462 | + <pElem>morfeusz</pElem> | |
463 | + <pElem>build/morfeusz</pElem> | |
464 | + </incDir> | |
465 | + <preprocessorList> | |
466 | + <Elem>NDEBUG</Elem> | |
467 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
468 | + </preprocessorList> | |
469 | + </ccTool> | |
470 | + </item> | |
471 | + <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp" | |
472 | + ex="false" | |
473 | + tool="1" | |
474 | + flavor2="4"> | |
424 | 475 | <ccTool flags="1"> |
425 | 476 | <incDir> |
426 | 477 | <pElem>build</pElem> |
... | ... | @@ -509,6 +560,8 @@ |
509 | 560 | </ccTool> |
510 | 561 | </item> |
511 | 562 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
563 | + <ccTool flags="1"> | |
564 | + </ccTool> | |
512 | 565 | </item> |
513 | 566 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
514 | 567 | <ccTool flags="0"> |
... | ... |
profile.sh
0 → 100755
#!/bin/bash
#
# Builds morfeusz in a fresh ./profbuild directory, linked against libprofiler,
# and runs the analyzer once on a fixed input with CPU profiling enabled.
# The profile is written to /tmp/morfeusz.prof.

# Stop at the first failing step: without this a failed cd/cmake/make would
# still run the later commands in the wrong directory or on a stale binary.
set -e

rm -rf profbuild
mkdir -p profbuild
cd profbuild
cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" ..
make -j4

# Remove any profile left by a previous run before enabling the profiler.
rm -f /tmp/morfeusz.prof
export LD_PRELOAD="/usr/lib/libprofiler.so"
export CPUPROFILE="/tmp/morfeusz.prof"
morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents10k > /dev/null

# To inspect the result (run from the repository root):
### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof
... | ... |