Commit f80dea595a7fb0c3ef6f9dea0075249a41c6f86b

Authored by Michał Lenart
1 parent f3f17708

dalsza optymalizacja kodu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@181 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
morfeusz/CMakeLists.txt
@@ -38,6 +38,9 @@ set(SRC_FILES @@ -38,6 +38,9 @@ set(SRC_FILES
38 charset/conversion_tables.cpp 38 charset/conversion_tables.cpp
39 cli/cli.cpp 39 cli/cli.cpp
40 segrules/segrules.cpp 40 segrules/segrules.cpp
  41 + CasePatternHelper.cpp
  42 + decoder/InterpretedChunksDecoder4Analyzer.cpp
  43 + decoder/InterpretedChunksDecoder4Generator.cpp
41 ) 44 )
42 45
43 set(INCLUDE_FILES 46 set(INCLUDE_FILES
morfeusz/CasePatternHelper.hpp
@@ -12,6 +12,9 @@ @@ -12,6 +12,9 @@
12 #include "InterpsGroup.hpp" 12 #include "InterpsGroup.hpp"
13 #include "CasePatternHelper.hpp" 13 #include "CasePatternHelper.hpp"
14 #include "compressionByteUtils.hpp" 14 #include "compressionByteUtils.hpp"
  15 +#include "Environment.hpp"
  16 +
  17 +class Environment;
15 18
16 class CasePatternHelper { 19 class CasePatternHelper {
17 public: 20 public:
@@ -39,64 +42,17 @@ public: @@ -39,64 +42,17 @@ public:
39 } 42 }
40 43
41 bool checkInterpsGroupOrthCasePatterns( 44 bool checkInterpsGroupOrthCasePatterns(
42 - const std::vector<uint32_t>& lowercaseCodepoints,  
43 - const std::vector<uint32_t>& originalCodepoints,  
44 - const InterpsGroup& ig) const {  
45 - const unsigned char* currPtr = ig.ptr;  
46 - unsigned char compressionByte = *currPtr++;  
47 - if (!this->caseSensitive) {  
48 - return true;  
49 - }  
50 - else if (isOrthOnlyLower(compressionByte)) {  
51 - return true;  
52 - }  
53 - else if (isOrthOnlyTitle(compressionByte)) {  
54 - return lowercaseCodepoints[0] != originalCodepoints[0];  
55 - }  
56 - else {  
57 - unsigned char casePatternsNum = *currPtr++;  
58 - if (casePatternsNum == 0) {  
59 - return true;  
60 - }  
61 - else {  
62 - for (unsigned int i = 0; i < casePatternsNum; i++) {  
63 - if (checkCasePattern(  
64 - lowercaseCodepoints,  
65 - originalCodepoints,  
66 - deserializeOneCasePattern(currPtr))) {  
67 - return true;  
68 - }  
69 - }  
70 - return false;  
71 - }  
72 - }  
73 - } 45 + const Environment& env,
  46 + const char* orthStart,
  47 + const char* orthEnd,
  48 + const InterpsGroup& ig) const;
74 49
75 - std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {  
76 - std::vector<bool> res;  
77 - uint8_t casePatternType = *ptr++;  
78 - uint8_t prefixLength;  
79 - uint8_t patternLength;  
80 - switch (casePatternType) {  
81 - case LEMMA_ONLY_LOWER:  
82 - break;  
83 - case LEMMA_UPPER_PREFIX:  
84 - prefixLength = *ptr++;  
85 - res.resize(prefixLength, true);  
86 - break;  
87 - case LEMMA_MIXED_CASE:  
88 - patternLength = *ptr++;  
89 - for (unsigned int i = 0; i < patternLength; i++) {  
90 - uint8_t idx = *ptr++;  
91 - res.resize(idx + 1, false);  
92 - res[idx] = true;  
93 - }  
94 - break;  
95 - }  
96 - return res;  
97 - } 50 + static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr);
98 private: 51 private:
99 bool caseSensitive; 52 bool caseSensitive;
  53 +
  54 + mutable vector<uint32_t> orthCodepoints;
  55 + mutable vector<uint32_t> normalizedCodepoints;
100 56
101 static const uint8_t LEMMA_ONLY_LOWER = 0; 57 static const uint8_t LEMMA_ONLY_LOWER = 0;
102 static const uint8_t LEMMA_UPPER_PREFIX = 1; 58 static const uint8_t LEMMA_UPPER_PREFIX = 1;
morfeusz/Environment.cpp
@@ -8,9 +8,11 @@ @@ -8,9 +8,11 @@
8 #include <vector> 8 #include <vector>
9 #include <algorithm> 9 #include <algorithm>
10 #include "Environment.hpp" 10 #include "Environment.hpp"
11 -#include "InterpretedChunksDecoder.hpp" 11 +#include "decoder/InterpretedChunksDecoder.hpp"
12 #include "MorphDeserializer.hpp" 12 #include "MorphDeserializer.hpp"
13 #include "exceptions.hpp" 13 #include "exceptions.hpp"
  14 +#include "decoder/InterpretedChunksDecoder4Analyzer.hpp"
  15 +#include "decoder/InterpretedChunksDecoder4Generator.hpp"
14 16
15 //class InterpretedChunksDecoder4Analyzer; 17 //class InterpretedChunksDecoder4Analyzer;
16 //class InterpretedChunksDecoder4Generator; 18 //class InterpretedChunksDecoder4Generator;
@@ -53,7 +55,7 @@ processorType == ANALYZER @@ -53,7 +55,7 @@ processorType == ANALYZER
53 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) 55 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
54 : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), 56 : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
55 processorType(processorType), 57 processorType(processorType),
56 -casePatternHelper() { 58 +casePatternHelper(new CasePatternHelper()) {
57 } 59 }
58 60
59 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { 61 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
@@ -78,6 +80,7 @@ Environment::~Environment() { @@ -78,6 +80,7 @@ Environment::~Environment() {
78 delete this->fsaFileStartPtr; 80 delete this->fsaFileStartPtr;
79 } 81 }
80 delete this->chunksDecoder; 82 delete this->chunksDecoder;
  83 + delete this->casePatternHelper;
81 } 84 }
82 85
83 void Environment::setCharset(MorfeuszCharset charset) { 86 void Environment::setCharset(MorfeuszCharset charset) {
@@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const { @@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const {
146 } 149 }
147 150
148 void Environment::setCaseSensitive(bool caseSensitive) { 151 void Environment::setCaseSensitive(bool caseSensitive) {
149 - this->casePatternHelper.setCaseSensitive(caseSensitive); 152 + this->casePatternHelper->setCaseSensitive(caseSensitive);
150 } 153 }
151 154
152 const CasePatternHelper& Environment::getCasePatternHelper() const { 155 const CasePatternHelper& Environment::getCasePatternHelper() const {
153 - return this->casePatternHelper; 156 + return *this->casePatternHelper;
154 } 157 }
155 158
156 const Qualifiers& Environment::getQualifiersHelper() const { 159 const Qualifiers& Environment::getQualifiersHelper() const {
morfeusz/Environment.hpp
@@ -11,6 +11,7 @@ @@ -11,6 +11,7 @@
11 #include <vector> 11 #include <vector>
12 12
13 class InterpretedChunksDecoder; 13 class InterpretedChunksDecoder;
  14 +class CasePatternHelper;
14 15
15 #include "charset/CaseConverter.hpp" 16 #include "charset/CaseConverter.hpp"
16 #include "charset/CharsetConverter.hpp" 17 #include "charset/CharsetConverter.hpp"
@@ -79,7 +80,7 @@ private: @@ -79,7 +80,7 @@ private:
79 80
80 const InterpretedChunksDecoder* chunksDecoder; 81 const InterpretedChunksDecoder* chunksDecoder;
81 MorfeuszProcessorType processorType; 82 MorfeuszProcessorType processorType;
82 - CasePatternHelper casePatternHelper; 83 + CasePatternHelper* casePatternHelper;
83 84
84 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; 85 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
85 }; 86 };
morfeusz/InflexionGraph.cpp
@@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea @@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea
78 this->addMiddleEdge((unsigned int) this->graph.size(), e); 78 this->addMiddleEdge((unsigned int) this->graph.size(), e);
79 } 79 }
80 else { 80 else {
81 - Edge e = {chunk, (int) this->graph.size() + 1}; 81 + Edge e = {chunk, (unsigned long) this->graph.size() + 1};
82 this->addMiddleEdge((unsigned int) this->graph.size(), e); 82 this->addMiddleEdge((unsigned int) this->graph.size(), e);
83 } 83 }
84 } 84 }
@@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I @@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I
117 for (unsigned int i = 0; i < edges.size(); i++) { 117 for (unsigned int i = 0; i < edges.size(); i++) {
118 const InflexionGraph::Edge& e1 = edges[i]; 118 const InflexionGraph::Edge& e1 = edges[i];
119 if (e1.chunk.textStartPtr == e.chunk.textStartPtr 119 if (e1.chunk.textStartPtr == e.chunk.textStartPtr
120 - && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints 120 + && e1.chunk.textStartPtr == e.chunk.textStartPtr
  121 + && e1.chunk.textEndPtr == e.chunk.textEndPtr
121 && e1.chunk.segmentType == e.chunk.segmentType 122 && e1.chunk.segmentType == e.chunk.segmentType
122 && e1.nextNode == e.nextNode) { 123 && e1.nextNode == e.nextNode) {
123 return true; 124 return true;
morfeusz/InflexionGraph.hpp
@@ -22,7 +22,7 @@ public: @@ -22,7 +22,7 @@ public:
22 22
23 struct Edge { 23 struct Edge {
24 InterpretedChunk chunk; 24 InterpretedChunk chunk;
25 - unsigned int nextNode; 25 + unsigned long nextNode;
26 }; 26 };
27 27
28 void addPath(const std::vector<InterpretedChunk>& path, bool weak); 28 void addPath(const std::vector<InterpretedChunk>& path, bool weak);
morfeusz/InterpretedChunk.hpp
@@ -15,8 +15,6 @@ struct InterpretedChunk { @@ -15,8 +15,6 @@ struct InterpretedChunk {
15 unsigned char segmentType; 15 unsigned char segmentType;
16 const char* textStartPtr; 16 const char* textStartPtr;
17 const char* textEndPtr; 17 const char* textEndPtr;
18 - std::vector<uint32_t> originalCodepoints;  
19 - std::vector<uint32_t> lowercaseCodepoints;  
20 const unsigned char* interpsGroupPtr; 18 const unsigned char* interpsGroupPtr;
21 const unsigned char* interpsPtr; 19 const unsigned char* interpsPtr;
22 const unsigned char* interpsEndPtr; 20 const unsigned char* interpsEndPtr;
morfeusz/InterpretedChunksDecoder.hpp deleted
1 -/*  
2 - * File: InterpsGroupDecoder.hpp  
3 - * Author: mlenart  
4 - *  
5 - * Created on November 22, 2013, 10:35 PM  
6 - */  
7 -  
8 -#ifndef INTERPSGROUPDECODER_HPP  
9 -#define INTERPSGROUPDECODER_HPP  
10 -  
11 -#include <string>  
12 -#include <vector>  
13 -#include <utility>  
14 -  
15 -#include "charset/CharsetConverter.hpp"  
16 -#include "EncodedInterpretation.hpp"  
17 -#include "InterpretedChunk.hpp"  
18 -#include "EncodedInterpretation.hpp"  
19 -#include "charset/CaseConverter.hpp"  
20 -#include "Environment.hpp"  
21 -#include "MorphInterpretation.hpp"  
22 -#include "CasePatternHelper.hpp"  
23 -#include "deserializationUtils.hpp"  
24 -#include "compressionByteUtils.hpp"  
25 -#include "const.hpp"  
26 -  
27 -class InterpretedChunksDecoder {  
28 -public:  
29 -  
30 - InterpretedChunksDecoder(const Environment& env)  
31 - : env(env) {  
32 - }  
33 -  
34 - virtual ~InterpretedChunksDecoder() {  
35 - }  
36 -  
37 - virtual void decode(  
38 - unsigned int startNode,  
39 - unsigned int endNode,  
40 - const InterpretedChunk& interpretedChunk,  
41 - std::vector<MorphInterpretation>& out) const = 0;  
42 -  
43 -protected:  
44 -  
45 - const Environment& env;  
46 -};  
47 -  
48 -class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {  
49 -public:  
50 -  
51 - InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {  
52 - }  
53 -  
54 - void decode(  
55 - unsigned int startNode,  
56 - unsigned int endNode,  
57 - const InterpretedChunk& interpretedChunk,  
58 - std::vector<MorphInterpretation>& out) const {  
59 - string orth;  
60 - string lemmaPrefix;  
61 - if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) {  
62 - orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);  
63 - const unsigned char* currPtr = interpretedChunk.interpsPtr;  
64 - while (currPtr < interpretedChunk.interpsEndPtr) {  
65 - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);  
66 - }  
67 - }  
68 - }  
69 -  
70 -protected:  
71 -  
72 - void decodeForm(  
73 - const vector<uint32_t>& orth,  
74 - const EncodedForm& lemma,  
75 - bool forPrefix,  
76 - string& res) const {  
77 - for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {  
78 - uint32_t cp =  
79 - (i < lemma.casePattern.size() && lemma.casePattern[i])  
80 - ? env.getCaseConverter().toTitle(orth[i])  
81 - : orth[i];  
82 - env.getCharsetConverter().append(cp, res);  
83 - }  
84 - if (!forPrefix) {  
85 - const char* suffixPtr = lemma.suffixToAdd.c_str();  
86 - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();  
87 - while (suffixPtr != suffixEnd) {  
88 - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);  
89 - env.getCharsetConverter().append(cp, res);  
90 - }  
91 - }  
92 - }  
93 -  
94 - void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const {  
95 - encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte)  
96 - ? getPrefixCutLength(compressionByte)  
97 - : readInt8(ptr);  
98 - encodedForm.suffixToCut = readInt8(ptr);  
99 - encodedForm.suffixToAdd = readString(ptr);  
100 - assert(encodedForm.casePattern.size() == 0);  
101 - if (isLemmaOnlyLower(compressionByte)) {  
102 - encodedForm.casePattern = std::vector<bool>();  
103 - } else if (isLemmaOnlyTitle(compressionByte)) {  
104 - encodedForm.casePattern = std::vector<bool>();  
105 - encodedForm.casePattern.push_back(true);  
106 - } else {  
107 - encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);  
108 - }  
109 - }  
110 -  
111 - EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {  
112 - EncodedInterpretation interp;  
113 - if (isOrthOnlyLower(compressionByte)) {  
114 - } else if (isOrthOnlyTitle(compressionByte)) {  
115 - interp.orthCasePattern.push_back(true);  
116 - } else {  
117 - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);  
118 - }  
119 - deserializeEncodedForm(ptr, compressionByte, interp.value);  
120 - interp.tag = readInt16(ptr);  
121 - interp.nameClassifier = *ptr++;  
122 - interp.qualifiers = readInt16(ptr);  
123 - return interp;  
124 - }  
125 -private:  
126 -  
127 - pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {  
128 - vector<string> splitRes(split(lemma, ':'));  
129 - if (splitRes.size() == 2) {  
130 - return make_pair(splitRes[0], splitRes[1]);  
131 - } else {  
132 - return make_pair(lemma, "");  
133 - }  
134 - }  
135 -  
136 - void decodeMorphInterpretation(  
137 - unsigned int startNode, unsigned int endNode,  
138 - const string& orth,  
139 - const string& lemmaPrefix,  
140 - const InterpretedChunk& chunk,  
141 - bool forPrefix,  
142 - const unsigned char*& ptr,  
143 - std::vector<MorphInterpretation>& out) const {  
144 - string lemma = lemmaPrefix;  
145 - EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);  
146 - this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma);  
147 - if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) {  
148 - // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);  
149 - out.push_back(MorphInterpretation(  
150 - startNode, endNode,  
151 - orth, lemma,  
152 - // "",  
153 - ei.tag,  
154 - ei.nameClassifier,  
155 - ei.qualifiers,  
156 - env));  
157 - }  
158 - }  
159 -  
160 - bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const {  
161 - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {  
162 - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];  
163 - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);  
164 - const unsigned char* ptr = prefixChunk.interpsPtr;  
165 - std::vector<MorphInterpretation> mi;  
166 - // env.getCasePatternHelper().skipCasePattern(ptr);  
167 - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);  
168 - if (!mi.empty()) {  
169 - lemmaPrefix += mi[0].getLemma();  
170 - } else {  
171 - return false;  
172 - }  
173 - }  
174 - return true;  
175 - }  
176 -};  
177 -  
178 -class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {  
179 -public:  
180 -  
181 - InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {  
182 - }  
183 -  
184 - void decode(  
185 - unsigned int startNode,  
186 - unsigned int endNode,  
187 - const InterpretedChunk& interpretedChunk,  
188 - std::vector<MorphInterpretation>& out) const {  
189 - string orthPrefix;  
190 - string lemma;  
191 - convertPrefixes(interpretedChunk, orthPrefix, lemma);  
192 - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);  
193 - const unsigned char* currPtr = interpretedChunk.interpsPtr;  
194 - while (currPtr < interpretedChunk.interpsEndPtr) {  
195 - MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);  
196 - // cerr << mi.toString(false) << endl;  
197 - // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;  
198 - if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {  
199 - out.push_back(mi);  
200 - }  
201 - }  
202 - }  
203 -  
204 -private:  
205 -  
206 - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {  
207 - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {  
208 - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];  
209 - lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);  
210 - const unsigned char* ptr = prefixChunk.interpsPtr;  
211 - MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);  
212 - orthPrefix += mi.getOrth();  
213 - }  
214 - }  
215 -  
216 - MorphInterpretation decodeMorphInterpretation(  
217 - unsigned int startNode, unsigned int endNode,  
218 - const string& orthPrefix,  
219 - const string& lemma,  
220 - const InterpretedChunk& chunk,  
221 - const unsigned char*& ptr) const {  
222 - string orth = orthPrefix;  
223 - EncodedInterpretation ei = this->deserializeInterp(ptr);  
224 - this->decodeForm(chunk.originalCodepoints, ei.value, orth);  
225 - return MorphInterpretation(  
226 - startNode, endNode,  
227 - orth, lemma + HOMONYM_SEPARATOR + ei.homonymId,  
228 - // ei.homonymId,  
229 - ei.tag,  
230 - ei.nameClassifier,  
231 - ei.qualifiers,  
232 - env);  
233 - }  
234 -  
235 - void decodeForm(  
236 - const vector<uint32_t>& lemma,  
237 - const EncodedForm& orth,  
238 - string& res) const {  
239 - res += orth.prefixToAdd;  
240 - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) {  
241 - env.getCharsetConverter().append(lemma[i], res);  
242 - }  
243 - const char* suffixPtr = orth.suffixToAdd.c_str();  
244 - const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();  
245 - while (suffixPtr != suffixEnd) {  
246 - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);  
247 - env.getCharsetConverter().append(cp, res);  
248 - }  
249 - }  
250 -  
251 - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {  
252 - EncodedInterpretation interp;  
253 - interp.homonymId = readString(ptr);  
254 - interp.value.prefixToAdd = readString(ptr);  
255 - interp.value.suffixToCut = readInt8(ptr);  
256 - interp.value.suffixToAdd = readString(ptr);  
257 - interp.tag = readInt16(ptr);  
258 - interp.nameClassifier = readInt8(ptr);  
259 - interp.qualifiers = readInt16(ptr);  
260 - return interp;  
261 - }  
262 -};  
263 -  
264 -#endif /* INTERPSGROUPDECODER_HPP */  
265 -  
morfeusz/Morfeusz.cpp
@@ -12,7 +12,7 @@ @@ -12,7 +12,7 @@
12 #include "data/default_fsa.hpp" 12 #include "data/default_fsa.hpp"
13 #include "Morfeusz.hpp" 13 #include "Morfeusz.hpp"
14 #include "MorphDeserializer.hpp" 14 #include "MorphDeserializer.hpp"
15 -#include "InterpretedChunksDecoder.hpp" 15 +#include "decoder/InterpretedChunksDecoder.hpp"
16 #include "charset/CharsetConverter.hpp" 16 #include "charset/CharsetConverter.hpp"
17 #include "charset/charset_utils.hpp" 17 #include "charset/charset_utils.hpp"
18 #include "charset/CaseConverter.hpp" 18 #include "charset/CaseConverter.hpp"
@@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() { @@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() {
34 return res; 34 return res;
35 } 35 }
36 36
  37 +static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
  38 + to.prefixChunks.insert(
  39 + to.prefixChunks.begin(),
  40 + from.prefixChunks.begin(),
  41 + from.prefixChunks.end());
  42 + to.prefixChunks.push_back(from);
  43 + to.textStartPtr = from.textStartPtr;
  44 + from.orthWasShifted = true;
  45 +}
  46 +
  47 +static string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
  48 + stringstream res;
  49 + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";
  50 + return res.str();
  51 +}
  52 +
  53 +static string debugAccum(vector<InterpretedChunk>& accum) {
  54 + stringstream res;
  55 + for (unsigned int i = 0; i < accum.size(); i++) {
  56 + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);
  57 + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
  58 + }
  59 + return res.str();
  60 +}
  61 +
  62 +static void feedStateDirectly(
  63 + StateType& state,
  64 + const char* inputStart,
  65 + const char* inputEnd) {
  66 + const char* currInput = inputStart;
  67 + while (currInput != inputEnd && !state.isSink()) {
  68 + state.proceedToNext(*currInput++);
  69 + }
  70 +}
  71 +
  72 +static void feedState(
  73 + StateType& state,
  74 + int codepoint) {
  75 + std::string chars;
  76 + UTF8CharsetConverter::getInstance().append(codepoint, chars);
  77 + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {
  78 + state.proceedToNext(chars[i]);
  79 + }
  80 +}
  81 +
37 Morfeusz::Morfeusz() 82 Morfeusz::Morfeusz()
38 : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), 83 : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA),
39 generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), 84 generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
@@ -97,11 +142,12 @@ void Morfeusz::processOneWord( @@ -97,11 +142,12 @@ void Morfeusz::processOneWord(
97 if (!graph.empty()) { 142 if (!graph.empty()) {
98 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); 143 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
99 int srcNode = startNodeNum; 144 int srcNode = startNodeNum;
100 - for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) {  
101 - const vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i]; 145 + const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph();
  146 + for (unsigned int i = 0; i < theGraph.size(); i++) {
  147 + const vector<InflexionGraph::Edge>& edges = theGraph[i];
102 for (unsigned int j = 0; j < edges.size(); j++) { 148 for (unsigned int j = 0; j < edges.size(); j++) {
103 const InflexionGraph::Edge& e = edges[j]; 149 const InflexionGraph::Edge& e = edges[j];
104 - int targetNode = startNodeNum + e.nextNode; 150 + unsigned long targetNode = startNodeNum + e.nextNode;
105 interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); 151 interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results);
106 } 152 }
107 srcNode++; 153 srcNode++;
@@ -118,56 +164,11 @@ void Morfeusz::processOneWord( @@ -118,56 +164,11 @@ void Morfeusz::processOneWord(
118 inputStart = currInput; 164 inputStart = currInput;
119 } 165 }
120 166
121 -static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {  
122 - to.prefixChunks.insert(  
123 - to.prefixChunks.begin(),  
124 - from.prefixChunks.begin(),  
125 - from.prefixChunks.end());  
126 - to.prefixChunks.push_back(from);  
127 - from.orthWasShifted = true;  
128 - to.textStartPtr = from.textStartPtr;  
129 -}  
130 -  
131 -static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {  
132 - stringstream res;  
133 - res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";  
134 - return res.str();  
135 -}  
136 -  
137 -static inline string debugAccum(vector<InterpretedChunk>& accum) {  
138 - stringstream res;  
139 - for (unsigned int i = 0; i < accum.size(); i++) {  
140 - res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);  
141 - // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";  
142 - }  
143 - return res.str();  
144 -}  
145 -  
146 -static inline void feedStateDirectly(  
147 - StateType& state,  
148 - const char* inputStart,  
149 - const char* inputEnd) {  
150 - const char* currInput = inputStart;  
151 - while (currInput != inputEnd && !state.isSink()) {  
152 - state.proceedToNext(*currInput++);  
153 - }  
154 -}  
155 -  
156 -static inline void feedState(  
157 - StateType& state,  
158 - int codepoint) {  
159 - std::string chars;  
160 - UTF8CharsetConverter::getInstance().append(codepoint, chars);  
161 - for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {  
162 - state.proceedToNext(chars[i]);  
163 - }  
164 -}  
165 -  
166 void Morfeusz::doProcessOneWord( 167 void Morfeusz::doProcessOneWord(
167 const Environment& env, 168 const Environment& env,
168 const char*& inputData, 169 const char*& inputData,
169 const char* inputEnd, 170 const char* inputEnd,
170 - SegrulesState segrulesState) const { 171 + const SegrulesState& segrulesState) const {
171 if (this->options.debug) { 172 if (this->options.debug) {
172 cerr << "----------" << endl; 173 cerr << "----------" << endl;
173 cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; 174 cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
@@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord( @@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord(
178 const char* currInput = inputData; 179 const char* currInput = inputData;
179 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); 180 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
180 bool currCodepointIsWhitespace = isWhitespace(codepoint); 181 bool currCodepointIsWhitespace = isWhitespace(codepoint);
181 - vector<uint32_t> originalCodepoints;  
182 - vector<uint32_t> normalizedCodepoints;  
183 -  
184 - originalCodepoints.reserve(16);  
185 - normalizedCodepoints.reserve(16);  
186 182
187 StateType state = env.getFSA().getInitialState(); 183 StateType state = env.getFSA().getInitialState();
188 184
@@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord( @@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord(
190 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER 186 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
191 ? env.getCaseConverter().toLower(codepoint) 187 ? env.getCaseConverter().toLower(codepoint)
192 : codepoint; 188 : codepoint;
193 - originalCodepoints.push_back(codepoint);  
194 - normalizedCodepoints.push_back(normalizedCodepoint);  
195 if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { 189 if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
196 feedStateDirectly(state, prevInput, currInput); 190 feedStateDirectly(state, prevInput, currInput);
197 } 191 }
@@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord( @@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord(
203 currCodepointIsWhitespace = isWhitespace(codepoint); 197 currCodepointIsWhitespace = isWhitespace(codepoint);
204 string homonymId; 198 string homonymId;
205 if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { 199 if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) {
206 - if (originalCodepoints.size() == 1) {  
207 - throw MorfeuszException("Lemma of length > 1 cannot start with a colon");  
208 - }  
209 homonymId = string(currInput + 1, inputEnd); 200 homonymId = string(currInput + 1, inputEnd);
210 - // cerr << "homonym " << homonymId << endl;  
211 prevInput = currInput; 201 prevInput = currInput;
212 currInput = inputEnd; 202 currInput = inputEnd;
213 codepoint = 0x00; 203 codepoint = 0x00;
214 currCodepointIsWhitespace = true; 204 currCodepointIsWhitespace = true;
215 } 205 }
216 if (state.isAccepting()) { 206 if (state.isAccepting()) {
217 - vector<InterpsGroup> val(state.getValue());  
218 - for (unsigned int i = 0; i < val.size(); i++) {  
219 - InterpsGroup& ig = val[i]; 207 +// vector<InterpsGroup> val(state.getValue());
  208 + for (unsigned int i = 0; i < state.getValue().size(); i++) {
  209 + const InterpsGroup& ig = state.getValue()[i];
220 if (this->options.debug) { 210 if (this->options.debug) {
221 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; 211 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
222 } 212 }
223 - vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); 213 + const vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace);
224 if (!newSegrulesStates.empty() 214 if (!newSegrulesStates.empty()
225 - && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {  
226 -  
227 - for (  
228 - vector<SegrulesState>::iterator it = newSegrulesStates.begin();  
229 - it != newSegrulesStates.end();  
230 - ++it) {  
231 - SegrulesState newSegrulesState = *it; 215 + && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, inputStart, currInput, ig)) {
  216 + for (unsigned int i = 0; i < newSegrulesStates.size(); i++) {
  217 + const SegrulesState& newSegrulesState = newSegrulesStates[i];
232 const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); 218 const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
233 const unsigned char* interpsEndPtr = ig.ptr + ig.size; 219 const unsigned char* interpsEndPtr = ig.ptr + ig.size;
234 - InterpretedChunk ic = {  
235 - ig.type,  
236 - inputStart,  
237 - currInput,  
238 - originalCodepoints,  
239 - normalizedCodepoints,  
240 - ig.ptr,  
241 - interpsPtr,  
242 - interpsEndPtr,  
243 - newSegrulesState.shiftOrthFromPrevious,  
244 - false,  
245 - vector<InterpretedChunk>(),  
246 - homonymId  
247 - }; 220 + InterpretedChunk ic;
  221 + ic.segmentType = ig.type;
  222 + ic.textStartPtr = inputStart;
  223 + ic.textEndPtr = currInput;
  224 + ic.interpsGroupPtr = ig.ptr;
  225 + ic.interpsPtr = interpsPtr;
  226 + ic.interpsEndPtr = interpsEndPtr;
  227 + ic.shiftOrth = newSegrulesState.shiftOrthFromPrevious;
  228 + ic.orthWasShifted = false;
  229 + ic.requiredHomonymId = homonymId;
  230 +
248 if (!accum.empty() && accum.back().shiftOrth) { 231 if (!accum.empty() && accum.back().shiftOrth) {
249 doShiftOrth(accum.back(), ic); 232 doShiftOrth(accum.back(), ic);
250 } 233 }
@@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord( @@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord(
266 } 249 }
267 } 250 }
268 else if (this->options.debug) { 251 else if (this->options.debug) {
269 - cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; 252 +// cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl;
270 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; 253 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
271 } 254 }
272 } 255 }
morfeusz/Morfeusz.hpp
@@ -170,7 +170,7 @@ private: @@ -170,7 +170,7 @@ private:
170 const Environment& env, 170 const Environment& env,
171 const char*& inputData, 171 const char*& inputData,
172 const char* inputEnd, 172 const char* inputEnd,
173 - SegrulesState segrulesState) const; 173 + const SegrulesState& segrulesState) const;
174 174
175 void handleIgnChunk( 175 void handleIgnChunk(
176 const Environment& env, 176 const Environment& env,
morfeusz/decoder/InterpretedChunksDecoder.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on November 22, 2013, 10:35 PM
  6 + */
  7 +
  8 +#ifndef INTERPSGROUPDECODER_HPP
  9 +#define INTERPSGROUPDECODER_HPP
  10 +
  11 +#include <string>
  12 +#include <vector>
  13 +#include <utility>
  14 +
  15 +#include "charset/CharsetConverter.hpp"
  16 +#include "EncodedInterpretation.hpp"
  17 +#include "InterpretedChunk.hpp"
  18 +#include "EncodedInterpretation.hpp"
  19 +#include "charset/CaseConverter.hpp"
  20 +#include "Environment.hpp"
  21 +#include "MorphInterpretation.hpp"
  22 +#include "CasePatternHelper.hpp"
  23 +#include "deserializationUtils.hpp"
  24 +#include "compressionByteUtils.hpp"
  25 +#include "const.hpp"
  26 +
  27 +class InterpretedChunksDecoder {
  28 +public:
  29 +
  30 + InterpretedChunksDecoder(const Environment& env): env(env) {
  31 + }
  32 +
  33 + virtual ~InterpretedChunksDecoder() {
  34 + }
  35 +
  36 + virtual void decode(
  37 + unsigned int startNode,
  38 + unsigned int endNode,
  39 + const InterpretedChunk& interpretedChunk,
  40 + std::vector<MorphInterpretation>& out) const = 0;
  41 +
  42 +protected:
  43 +
  44 + const Environment& env;
  45 +};
  46 +
  47 +#endif /* INTERPSGROUPDECODER_HPP */
  48 +
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Analyzer.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#include "InterpretedChunksDecoder4Analyzer.hpp"
  9 +#include <string>
  10 +
  11 +using namespace std;
  12 +
  13 +InterpretedChunksDecoder4Analyzer::InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
  14 +}
  15 +
  16 +void InterpretedChunksDecoder4Analyzer::decode(
  17 + unsigned int startNode,
  18 + unsigned int endNode,
  19 + const InterpretedChunk& interpretedChunk,
  20 + std::vector<MorphInterpretation>& out) const {
  21 + string orth;
  22 + string lemmaPrefix;
  23 + if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) {
  24 + // orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  25 + orth.insert(orth.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
  26 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  27 + while (currPtr < interpretedChunk.interpsEndPtr) {
  28 + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);
  29 + }
  30 + }
  31 +}
  32 +
  33 +void InterpretedChunksDecoder4Analyzer::decodeLemma(
  34 + const vector<uint32_t>& orth,
  35 + const EncodedForm& lemma,
  36 + bool forPrefix,
  37 + string& res) const {
  38 + for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {
  39 + uint32_t cp =
  40 + (i < lemma.casePattern.size() && lemma.casePattern[i])
  41 + ? env.getCaseConverter().toTitle(orth[i])
  42 + : orth[i];
  43 + env.getCharsetConverter().append(cp, res);
  44 + }
  45 + if (!forPrefix) {
  46 + const char* suffixPtr = lemma.suffixToAdd.c_str();
  47 + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
  48 + while (suffixPtr != suffixEnd) {
  49 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
  50 + env.getCharsetConverter().append(cp, res);
  51 + }
  52 + }
  53 +}
  54 +
  55 +void InterpretedChunksDecoder4Analyzer::deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const {
  56 + encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte)
  57 + ? getPrefixCutLength(compressionByte)
  58 + : readInt8(ptr);
  59 + encodedForm.suffixToCut = readInt8(ptr);
  60 + encodedForm.suffixToAdd = readString(ptr);
  61 + assert(encodedForm.casePattern.size() == 0);
  62 + if (isLemmaOnlyLower(compressionByte)) {
  63 +// encodedForm.casePattern = std::vector<bool>();
  64 + }
  65 + else if (isLemmaOnlyTitle(compressionByte)) {
  66 +// encodedForm.casePattern = std::vector<bool>();
  67 + encodedForm.casePattern.push_back(true);
  68 + }
  69 + else {
  70 + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  71 + }
  72 +}
  73 +
  74 +EncodedInterpretation InterpretedChunksDecoder4Analyzer::deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {
  75 + EncodedInterpretation interp;
  76 + if (isOrthOnlyLower(compressionByte)) {
  77 + }
  78 + else if (isOrthOnlyTitle(compressionByte)) {
  79 + interp.orthCasePattern.push_back(true);
  80 + }
  81 + else {
  82 + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  83 + }
  84 + deserializeEncodedForm(ptr, compressionByte, interp.value);
  85 + interp.tag = readInt16(ptr);
  86 + interp.nameClassifier = *ptr++;
  87 + interp.qualifiers = readInt16(ptr);
  88 + return interp;
  89 +}
  90 +
  91 +void InterpretedChunksDecoder4Analyzer::decodeMorphInterpretation(
  92 + unsigned int startNode, unsigned int endNode,
  93 + const string& orth,
  94 + const string& lemmaPrefix,
  95 + const InterpretedChunk& chunk,
  96 + bool forPrefix,
  97 + const unsigned char*& ptr,
  98 + std::vector<MorphInterpretation>& out) const {
  99 + string lemma(lemmaPrefix);
  100 + orthCodepoints.clear();
  101 + normalizedCodepoints.clear();
  102 + const char* currPtr = chunk.textStartPtr;
  103 + while (currPtr != chunk.textEndPtr) {
  104 + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr);
  105 + orthCodepoints.push_back(cp);
  106 + normalizedCodepoints.push_back(env.getCaseConverter().toLower(cp));
  107 + }
  108 + EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);
  109 + if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, orthCodepoints, ei.orthCasePattern)) {
  110 + this->decodeLemma(normalizedCodepoints, ei.value, forPrefix, lemma);
  111 + // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
  112 + out.push_back(MorphInterpretation(
  113 + startNode, endNode,
  114 + orth, lemma,
  115 + // "",
  116 + ei.tag,
  117 + ei.nameClassifier,
  118 + ei.qualifiers,
  119 + env));
  120 + }
  121 +}
  122 +
  123 +bool InterpretedChunksDecoder4Analyzer::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const {
  124 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  125 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  126 + orth.insert(orth.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
  127 + const unsigned char* ptr = prefixChunk.interpsPtr;
  128 + std::vector<MorphInterpretation> mi;
  129 + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);
  130 + if (!mi.empty()) {
  131 + lemmaPrefix += mi[0].getLemma();
  132 + }
  133 + else {
  134 + return false;
  135 + }
  136 + }
  137 + return true;
  138 +}
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Analyzer.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#ifndef INTERPRETEDCHUNKSDECODER4ANALYZER_HPP
  9 +#define INTERPRETEDCHUNKSDECODER4ANALYZER_HPP
  10 +
  11 +#include "InterpretedChunksDecoder.hpp"
  12 +
  13 +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
  14 +public:
  15 +
  16 + InterpretedChunksDecoder4Analyzer(const Environment& env);
  17 +
  18 + void decode(
  19 + unsigned int startNode,
  20 + unsigned int endNode,
  21 + const InterpretedChunk& interpretedChunk,
  22 + std::vector<MorphInterpretation>& out) const;
  23 +
  24 +private:
  25 +
  26 + void decodeLemma(
  27 + const vector<uint32_t>& orth,
  28 + const EncodedForm& lemma,
  29 + bool forPrefix,
  30 + string& res) const;
  31 +
  32 + void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const;
  33 +
  34 + EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const;
  35 +
  36 + void decodeMorphInterpretation(
  37 + unsigned int startNode, unsigned int endNode,
  38 + const string& orth,
  39 + const string& lemmaPrefix,
  40 + const InterpretedChunk& chunk,
  41 + bool forPrefix,
  42 + const unsigned char*& ptr,
  43 + std::vector<MorphInterpretation>& out) const;
  44 +
  45 + bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const;
  46 +
  47 + mutable std::vector<uint32_t> orthCodepoints;
  48 + mutable std::vector<uint32_t> normalizedCodepoints;
  49 +};
  50 +
  51 +#endif /* INTERPRETEDCHUNKSDECODER4ANALYZER_HPP */
  52 +
morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Generator.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#include "InterpretedChunksDecoder4Generator.hpp"
  9 +#include <string>
  10 +#include <vector>
  11 +
  12 +using namespace std;
  13 +
  14 +InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
  15 +}
  16 +
  17 +void InterpretedChunksDecoder4Generator::decode(
  18 + unsigned int startNode,
  19 + unsigned int endNode,
  20 + const InterpretedChunk& interpretedChunk,
  21 + std::vector<MorphInterpretation>& out) const {
  22 + string orthPrefix;
  23 + string lemma;
  24 + convertPrefixes(interpretedChunk, orthPrefix, lemma);
  25 + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  26 + lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
  27 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  28 + while (currPtr < interpretedChunk.interpsEndPtr) {
  29 + MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
  30 + // cerr << mi.toString(false) << endl;
  31 + // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
  32 + if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {
  33 + out.push_back(mi);
  34 + }
  35 + }
  36 +}
  37 +
  38 +void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {
  39 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  40 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  41 + lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
  42 + const unsigned char* ptr = prefixChunk.interpsPtr;
  43 + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
  44 + orthPrefix += mi.getOrth();
  45 + }
  46 +}
  47 +
  48 +MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation(
  49 + unsigned int startNode, unsigned int endNode,
  50 + const string& orthPrefix,
  51 + const string& lemma,
  52 + const InterpretedChunk& chunk,
  53 + const unsigned char*& ptr) const {
  54 + string orth = orthPrefix;
  55 + EncodedInterpretation ei = this->deserializeInterp(ptr);
  56 + codepoints.clear();
  57 + const char* currPtr = chunk.textStartPtr;
  58 + while (currPtr != chunk.textEndPtr) {
  59 + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr);
  60 + codepoints.push_back(cp);
  61 + }
  62 + this->decodeForm(codepoints, ei.value, orth);
  63 + return MorphInterpretation(
  64 + startNode, endNode,
  65 + orth, ei.homonymId.empty() ? lemma : (lemma + HOMONYM_SEPARATOR + ei.homonymId),
  66 + // ei.homonymId,
  67 + ei.tag,
  68 + ei.nameClassifier,
  69 + ei.qualifiers,
  70 + env);
  71 +}
  72 +
  73 +void InterpretedChunksDecoder4Generator::decodeForm(
  74 + const vector<uint32_t>& lemma,
  75 + const EncodedForm& orth,
  76 + string& res) const {
  77 + res += orth.prefixToAdd;
  78 + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) {
  79 + env.getCharsetConverter().append(lemma[i], res);
  80 + }
  81 + const char* suffixPtr = orth.suffixToAdd.c_str();
  82 + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
  83 + while (suffixPtr != suffixEnd) {
  84 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
  85 + env.getCharsetConverter().append(cp, res);
  86 + }
  87 +}
  88 +
  89 +EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const {
  90 + EncodedInterpretation interp;
  91 + interp.homonymId = readString(ptr);
  92 + interp.value.prefixToAdd = readString(ptr);
  93 + interp.value.suffixToCut = readInt8(ptr);
  94 + interp.value.suffixToAdd = readString(ptr);
  95 + interp.tag = readInt16(ptr);
  96 + interp.nameClassifier = readInt8(ptr);
  97 + interp.qualifiers = readInt16(ptr);
  98 + return interp;
  99 +}
morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Generator.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#ifndef INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
  9 +#define INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
  10 +
  11 +#include "InterpretedChunksDecoder.hpp"
  12 +
  13 +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
  14 +public:
  15 +
  16 + InterpretedChunksDecoder4Generator(const Environment& env);
  17 +
  18 + void decode(
  19 + unsigned int startNode,
  20 + unsigned int endNode,
  21 + const InterpretedChunk& interpretedChunk,
  22 + std::vector<MorphInterpretation>& out) const;
  23 +
  24 +private:
  25 +
  26 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const;
  27 +
  28 + MorphInterpretation decodeMorphInterpretation(
  29 + unsigned int startNode, unsigned int endNode,
  30 + const string& orthPrefix,
  31 + const string& lemma,
  32 + const InterpretedChunk& chunk,
  33 + const unsigned char*& ptr) const;
  34 +
  35 + void decodeForm(
  36 + const vector<uint32_t>& lemma,
  37 + const EncodedForm& orth,
  38 + string& res) const;
  39 +
  40 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const;
  41 +
  42 + mutable std::vector<uint32_t> codepoints;
  43 +};
  44 +
  45 +
  46 +#endif /* INTERPRETEDCHUNKSDECODER4GENERATOR_HPP */
  47 +
morfeusz/fsa/fsa.hpp
@@ -167,7 +167,7 @@ public: @@ -167,7 +167,7 @@ public:
167 * Makes sense only for accepting states. 167 * Makes sense only for accepting states.
168 * For non-accepting states is throws an exception. 168 * For non-accepting states is throws an exception.
169 */ 169 */
170 - T getValue() const; 170 + const T& getValue() const;
171 171
172 unsigned char getLastTransitionValue() const; 172 unsigned char getLastTransitionValue() const;
173 173
morfeusz/fsa/state_impl.hpp
@@ -46,7 +46,7 @@ unsigned long State&lt;T&gt;::getOffset() const { @@ -46,7 +46,7 @@ unsigned long State&lt;T&gt;::getOffset() const {
46 } 46 }
47 47
48 template <class T> 48 template <class T>
49 -T State<T>::getValue() const { 49 +const T& State<T>::getValue() const {
50 assert(this->isAccepting()); 50 assert(this->isAccepting());
51 return this->value; 51 return this->value;
52 } 52 }
morfeusz/morfeusz_analyzer.cpp
@@ -43,11 +43,20 @@ int main(int argc, const char** argv) { @@ -43,11 +43,20 @@ int main(int argc, const char** argv) {
43 else if (prevStart != -1) { 43 else if (prevStart != -1) {
44 printf("; "); 44 printf("; ");
45 } 45 }
46 - printf("%s", mi.toString(true).c_str());  
47 -// printf("%d,%d,%s,%s,%s,%s",  
48 -// mi.getStartNode(), mi.getEndNode(),  
49 -// mi.getOrth().c_str(), lemmaToShow.c_str(),  
50 -// mi.getTag().c_str(), lemmaToShow.c_str()); 46 +// printf("%s", mi.toString(true).c_str());
  47 + printf("%d,%d,%s,%s,%s",
  48 + mi.getStartNode(), mi.getEndNode(),
  49 + mi.getOrth().c_str(), mi.getLemma().c_str(),
  50 + mi.getTag().c_str());
  51 + if (!mi.getName().empty()) {
  52 + printf(",%s", mi.getName().c_str());
  53 + }
  54 + if (!mi.getQualifiers().empty()) {
  55 + printf(",%s", mi.getQualifiers()[0].c_str());
  56 + for (unsigned int i = 1; i < mi.getQualifiers().size(); i++) {
  57 + printf("|%s", mi.getQualifiers()[i].c_str());
  58 + }
  59 + }
51 prevStart = mi.getStartNode(); 60 prevStart = mi.getStartNode();
52 prevEnd = mi.getEndNode(); 61 prevEnd = mi.getEndNode();
53 } 62 }
morfeusz/segrules/SegrulesFSA.hpp
@@ -34,12 +34,12 @@ public: @@ -34,12 +34,12 @@ public:
34 34
35 std::vector<SegrulesState> proceedToNext( 35 std::vector<SegrulesState> proceedToNext(
36 const unsigned char segnum, 36 const unsigned char segnum,
37 - const SegrulesState state, 37 + const SegrulesState& state,
38 bool atEndOfWord) const { 38 bool atEndOfWord) const {
39 std::vector<SegrulesState> res; 39 std::vector<SegrulesState> res;
40 const unsigned char* currPtr = ptr + state.offset + 1; 40 const unsigned char* currPtr = ptr + state.offset + 1;
41 const unsigned char transitionsNum = *currPtr++; 41 const unsigned char transitionsNum = *currPtr++;
42 - for (unsigned int i = 0; i < transitionsNum; i++) { 42 + for (int i = 0; i < transitionsNum; i++) {
43 if (*currPtr == segnum) { 43 if (*currPtr == segnum) {
44 SegrulesState newState = this->transition2State(currPtr); 44 SegrulesState newState = this->transition2State(currPtr);
45 if ((atEndOfWord && newState.accepting) 45 if ((atEndOfWord && newState.accepting)
nbproject/configurations.xml
@@ -130,6 +130,8 @@ @@ -130,6 +130,8 @@
130 </ccTool> 130 </ccTool>
131 </item> 131 </item>
132 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> 132 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  133 + <ccTool flags="1">
  134 + </ccTool>
133 </item> 135 </item>
134 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" 136 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
135 ex="false" 137 ex="false"
@@ -239,6 +241,7 @@ @@ -239,6 +241,7 @@
239 <pElem>build/morfeusz</pElem> 241 <pElem>build/morfeusz</pElem>
240 </incDir> 242 </incDir>
241 <preprocessorList> 243 <preprocessorList>
  244 + <Elem>NDEBUG</Elem>
242 <Elem>libmorfeusz_EXPORTS</Elem> 245 <Elem>libmorfeusz_EXPORTS</Elem>
243 </preprocessorList> 246 </preprocessorList>
244 </ccTool> 247 </ccTool>
@@ -283,7 +286,7 @@ @@ -283,7 +286,7 @@
283 <ccTool> 286 <ccTool>
284 <incDir> 287 <incDir>
285 <pElem>morfeusz</pElem> 288 <pElem>morfeusz</pElem>
286 - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> 289 + <pElem>/usr/lib/jvm/default-java/include</pElem>
287 </incDir> 290 </incDir>
288 <preprocessorList> 291 <preprocessorList>
289 <Elem>NDEBUG</Elem> 292 <Elem>NDEBUG</Elem>
@@ -310,6 +313,19 @@ @@ -310,6 +313,19 @@
310 </undefinedList> 313 </undefinedList>
311 </ccTool> 314 </ccTool>
312 </folder> 315 </folder>
  316 + <item path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4">
  317 + <ccTool flags="1">
  318 + <incDir>
  319 + <pElem>build</pElem>
  320 + <pElem>morfeusz</pElem>
  321 + <pElem>build/morfeusz</pElem>
  322 + </incDir>
  323 + <preprocessorList>
  324 + <Elem>NDEBUG</Elem>
  325 + <Elem>libmorfeusz_EXPORTS</Elem>
  326 + </preprocessorList>
  327 + </ccTool>
  328 + </item>
313 <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> 329 <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4">
314 <ccTool flags="1"> 330 <ccTool flags="1">
315 <incDir> 331 <incDir>
@@ -387,40 +403,75 @@ @@ -387,40 +403,75 @@
387 </ccTool> 403 </ccTool>
388 </item> 404 </item>
389 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> 405 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
390 - <ccTool flags="1"> 406 + <ccTool flags="2">
391 <incDir> 407 <incDir>
392 <pElem>build</pElem> 408 <pElem>build</pElem>
393 <pElem>morfeusz</pElem> 409 <pElem>morfeusz</pElem>
394 <pElem>build/morfeusz</pElem> 410 <pElem>build/morfeusz</pElem>
395 </incDir> 411 </incDir>
396 <preprocessorList> 412 <preprocessorList>
397 - <Elem>NDEBUG</Elem>  
398 <Elem>libmorfeusz_EXPORTS</Elem> 413 <Elem>libmorfeusz_EXPORTS</Elem>
399 </preprocessorList> 414 </preprocessorList>
400 </ccTool> 415 </ccTool>
401 </item> 416 </item>
402 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> 417 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  418 + <ccTool flags="2">
  419 + </ccTool>
403 </item> 420 </item>
404 <item path="morfeusz/charset/CharsetConverter.cpp" 421 <item path="morfeusz/charset/CharsetConverter.cpp"
405 ex="false" 422 ex="false"
406 tool="1" 423 tool="1"
407 flavor2="4"> 424 flavor2="4">
408 - <ccTool flags="1">  
409 - <preprocessorList>  
410 - <Elem>NDEBUG</Elem>  
411 - </preprocessorList> 425 + <ccTool flags="2">
412 </ccTool> 426 </ccTool>
413 </item> 427 </item>
414 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> 428 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  429 + <ccTool flags="2">
  430 + </ccTool>
415 </item> 431 </item>
416 <item path="morfeusz/charset/conversion_tables.cpp" 432 <item path="morfeusz/charset/conversion_tables.cpp"
417 ex="false" 433 ex="false"
418 tool="1" 434 tool="1"
419 flavor2="4"> 435 flavor2="4">
  436 + <ccTool flags="2">
  437 + </ccTool>
420 </item> 438 </item>
421 <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> 439 <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4">
  440 + <ccTool flags="1">
  441 + </ccTool>
422 </item> 442 </item>
423 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> 443 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
  444 + <ccTool flags="2">
  445 + <incDir>
  446 + <pElem>build</pElem>
  447 + <pElem>morfeusz</pElem>
  448 + <pElem>build/morfeusz</pElem>
  449 + </incDir>
  450 + <preprocessorList>
  451 + <Elem>libmorfeusz_EXPORTS</Elem>
  452 + </preprocessorList>
  453 + </ccTool>
  454 + </item>
  455 + <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp"
  456 + ex="false"
  457 + tool="1"
  458 + flavor2="4">
  459 + <ccTool flags="1">
  460 + <incDir>
  461 + <pElem>build</pElem>
  462 + <pElem>morfeusz</pElem>
  463 + <pElem>build/morfeusz</pElem>
  464 + </incDir>
  465 + <preprocessorList>
  466 + <Elem>NDEBUG</Elem>
  467 + <Elem>libmorfeusz_EXPORTS</Elem>
  468 + </preprocessorList>
  469 + </ccTool>
  470 + </item>
  471 + <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp"
  472 + ex="false"
  473 + tool="1"
  474 + flavor2="4">
424 <ccTool flags="1"> 475 <ccTool flags="1">
425 <incDir> 476 <incDir>
426 <pElem>build</pElem> 477 <pElem>build</pElem>
@@ -509,6 +560,8 @@ @@ -509,6 +560,8 @@
509 </ccTool> 560 </ccTool>
510 </item> 561 </item>
511 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> 562 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  563 + <ccTool flags="1">
  564 + </ccTool>
512 </item> 565 </item>
513 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> 566 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
514 <ccTool flags="0"> 567 <ccTool flags="0">
profile.sh 0 → 100755
#!/bin/bash
#
# Builds the project from scratch in ./profbuild with libprofiler linked in,
# then runs morfeusz_analyzer on a fixed input file with CPU profiling enabled.
# The profile is written to /tmp/morfeusz.prof.

# Stop at the first failing step. Without this a failed `cd` would run cmake
# and make in the wrong directory, and a failed build would still go on to
# profile a stale or missing binary.
set -euo pipefail

rm -rf profbuild
mkdir -p profbuild
cd profbuild
cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" ..
make -j4

# Remove any previous profile so the file found afterwards comes from this run.
rm -f /tmp/morfeusz.prof
export LD_PRELOAD="/usr/lib/libprofiler.so"
export CPUPROFILE="/tmp/morfeusz.prof"
morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents10k > /dev/null

# To inspect the result, run from the project root:
### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof