Commit f80dea595a7fb0c3ef6f9dea0075249a41c6f86b (1 parent: f3f17708)
dalsza optymalizacja kodu
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@181 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 21 changed files with 572 additions and 434 deletions.
morfeusz/CMakeLists.txt
@@ -38,6 +38,9 @@ set(SRC_FILES | @@ -38,6 +38,9 @@ set(SRC_FILES | ||
38 | charset/conversion_tables.cpp | 38 | charset/conversion_tables.cpp |
39 | cli/cli.cpp | 39 | cli/cli.cpp |
40 | segrules/segrules.cpp | 40 | segrules/segrules.cpp |
41 | + CasePatternHelper.cpp | ||
42 | + decoder/InterpretedChunksDecoder4Analyzer.cpp | ||
43 | + decoder/InterpretedChunksDecoder4Generator.cpp | ||
41 | ) | 44 | ) |
42 | 45 | ||
43 | set(INCLUDE_FILES | 46 | set(INCLUDE_FILES |
morfeusz/CasePatternHelper.hpp
@@ -12,6 +12,9 @@ | @@ -12,6 +12,9 @@ | ||
12 | #include "InterpsGroup.hpp" | 12 | #include "InterpsGroup.hpp" |
13 | #include "CasePatternHelper.hpp" | 13 | #include "CasePatternHelper.hpp" |
14 | #include "compressionByteUtils.hpp" | 14 | #include "compressionByteUtils.hpp" |
15 | +#include "Environment.hpp" | ||
16 | + | ||
17 | +class Environment; | ||
15 | 18 | ||
16 | class CasePatternHelper { | 19 | class CasePatternHelper { |
17 | public: | 20 | public: |
@@ -39,64 +42,17 @@ public: | @@ -39,64 +42,17 @@ public: | ||
39 | } | 42 | } |
40 | 43 | ||
41 | bool checkInterpsGroupOrthCasePatterns( | 44 | bool checkInterpsGroupOrthCasePatterns( |
42 | - const std::vector<uint32_t>& lowercaseCodepoints, | ||
43 | - const std::vector<uint32_t>& originalCodepoints, | ||
44 | - const InterpsGroup& ig) const { | ||
45 | - const unsigned char* currPtr = ig.ptr; | ||
46 | - unsigned char compressionByte = *currPtr++; | ||
47 | - if (!this->caseSensitive) { | ||
48 | - return true; | ||
49 | - } | ||
50 | - else if (isOrthOnlyLower(compressionByte)) { | ||
51 | - return true; | ||
52 | - } | ||
53 | - else if (isOrthOnlyTitle(compressionByte)) { | ||
54 | - return lowercaseCodepoints[0] != originalCodepoints[0]; | ||
55 | - } | ||
56 | - else { | ||
57 | - unsigned char casePatternsNum = *currPtr++; | ||
58 | - if (casePatternsNum == 0) { | ||
59 | - return true; | ||
60 | - } | ||
61 | - else { | ||
62 | - for (unsigned int i = 0; i < casePatternsNum; i++) { | ||
63 | - if (checkCasePattern( | ||
64 | - lowercaseCodepoints, | ||
65 | - originalCodepoints, | ||
66 | - deserializeOneCasePattern(currPtr))) { | ||
67 | - return true; | ||
68 | - } | ||
69 | - } | ||
70 | - return false; | ||
71 | - } | ||
72 | - } | ||
73 | - } | 45 | + const Environment& env, |
46 | + const char* orthStart, | ||
47 | + const char* orthEnd, | ||
48 | + const InterpsGroup& ig) const; | ||
74 | 49 | ||
75 | - std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { | ||
76 | - std::vector<bool> res; | ||
77 | - uint8_t casePatternType = *ptr++; | ||
78 | - uint8_t prefixLength; | ||
79 | - uint8_t patternLength; | ||
80 | - switch (casePatternType) { | ||
81 | - case LEMMA_ONLY_LOWER: | ||
82 | - break; | ||
83 | - case LEMMA_UPPER_PREFIX: | ||
84 | - prefixLength = *ptr++; | ||
85 | - res.resize(prefixLength, true); | ||
86 | - break; | ||
87 | - case LEMMA_MIXED_CASE: | ||
88 | - patternLength = *ptr++; | ||
89 | - for (unsigned int i = 0; i < patternLength; i++) { | ||
90 | - uint8_t idx = *ptr++; | ||
91 | - res.resize(idx + 1, false); | ||
92 | - res[idx] = true; | ||
93 | - } | ||
94 | - break; | ||
95 | - } | ||
96 | - return res; | ||
97 | - } | 50 | + static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr); |
98 | private: | 51 | private: |
99 | bool caseSensitive; | 52 | bool caseSensitive; |
53 | + | ||
54 | + mutable vector<uint32_t> orthCodepoints; | ||
55 | + mutable vector<uint32_t> normalizedCodepoints; | ||
100 | 56 | ||
101 | static const uint8_t LEMMA_ONLY_LOWER = 0; | 57 | static const uint8_t LEMMA_ONLY_LOWER = 0; |
102 | static const uint8_t LEMMA_UPPER_PREFIX = 1; | 58 | static const uint8_t LEMMA_UPPER_PREFIX = 1; |
morfeusz/Environment.cpp
@@ -8,9 +8,11 @@ | @@ -8,9 +8,11 @@ | ||
8 | #include <vector> | 8 | #include <vector> |
9 | #include <algorithm> | 9 | #include <algorithm> |
10 | #include "Environment.hpp" | 10 | #include "Environment.hpp" |
11 | -#include "InterpretedChunksDecoder.hpp" | 11 | +#include "decoder/InterpretedChunksDecoder.hpp" |
12 | #include "MorphDeserializer.hpp" | 12 | #include "MorphDeserializer.hpp" |
13 | #include "exceptions.hpp" | 13 | #include "exceptions.hpp" |
14 | +#include "decoder/InterpretedChunksDecoder4Analyzer.hpp" | ||
15 | +#include "decoder/InterpretedChunksDecoder4Generator.hpp" | ||
14 | 16 | ||
15 | //class InterpretedChunksDecoder4Analyzer; | 17 | //class InterpretedChunksDecoder4Analyzer; |
16 | //class InterpretedChunksDecoder4Generator; | 18 | //class InterpretedChunksDecoder4Generator; |
@@ -53,7 +55,7 @@ processorType == ANALYZER | @@ -53,7 +55,7 @@ processorType == ANALYZER | ||
53 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) | 55 | ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) |
54 | : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), | 56 | : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), |
55 | processorType(processorType), | 57 | processorType(processorType), |
56 | -casePatternHelper() { | 58 | +casePatternHelper(new CasePatternHelper()) { |
57 | } | 59 | } |
58 | 60 | ||
59 | const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { | 61 | const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { |
@@ -78,6 +80,7 @@ Environment::~Environment() { | @@ -78,6 +80,7 @@ Environment::~Environment() { | ||
78 | delete this->fsaFileStartPtr; | 80 | delete this->fsaFileStartPtr; |
79 | } | 81 | } |
80 | delete this->chunksDecoder; | 82 | delete this->chunksDecoder; |
83 | + delete this->casePatternHelper; | ||
81 | } | 84 | } |
82 | 85 | ||
83 | void Environment::setCharset(MorfeuszCharset charset) { | 86 | void Environment::setCharset(MorfeuszCharset charset) { |
@@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const { | @@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const { | ||
146 | } | 149 | } |
147 | 150 | ||
148 | void Environment::setCaseSensitive(bool caseSensitive) { | 151 | void Environment::setCaseSensitive(bool caseSensitive) { |
149 | - this->casePatternHelper.setCaseSensitive(caseSensitive); | 152 | + this->casePatternHelper->setCaseSensitive(caseSensitive); |
150 | } | 153 | } |
151 | 154 | ||
152 | const CasePatternHelper& Environment::getCasePatternHelper() const { | 155 | const CasePatternHelper& Environment::getCasePatternHelper() const { |
153 | - return this->casePatternHelper; | 156 | + return *this->casePatternHelper; |
154 | } | 157 | } |
155 | 158 | ||
156 | const Qualifiers& Environment::getQualifiersHelper() const { | 159 | const Qualifiers& Environment::getQualifiersHelper() const { |
morfeusz/Environment.hpp
@@ -11,6 +11,7 @@ | @@ -11,6 +11,7 @@ | ||
11 | #include <vector> | 11 | #include <vector> |
12 | 12 | ||
13 | class InterpretedChunksDecoder; | 13 | class InterpretedChunksDecoder; |
14 | +class CasePatternHelper; | ||
14 | 15 | ||
15 | #include "charset/CaseConverter.hpp" | 16 | #include "charset/CaseConverter.hpp" |
16 | #include "charset/CharsetConverter.hpp" | 17 | #include "charset/CharsetConverter.hpp" |
@@ -79,7 +80,7 @@ private: | @@ -79,7 +80,7 @@ private: | ||
79 | 80 | ||
80 | const InterpretedChunksDecoder* chunksDecoder; | 81 | const InterpretedChunksDecoder* chunksDecoder; |
81 | MorfeuszProcessorType processorType; | 82 | MorfeuszProcessorType processorType; |
82 | - CasePatternHelper casePatternHelper; | 83 | + CasePatternHelper* casePatternHelper; |
83 | 84 | ||
84 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; | 85 | const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; |
85 | }; | 86 | }; |
morfeusz/InflexionGraph.cpp
@@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea | @@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea | ||
78 | this->addMiddleEdge((unsigned int) this->graph.size(), e); | 78 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
79 | } | 79 | } |
80 | else { | 80 | else { |
81 | - Edge e = {chunk, (int) this->graph.size() + 1}; | 81 | + Edge e = {chunk, (unsigned long) this->graph.size() + 1}; |
82 | this->addMiddleEdge((unsigned int) this->graph.size(), e); | 82 | this->addMiddleEdge((unsigned int) this->graph.size(), e); |
83 | } | 83 | } |
84 | } | 84 | } |
@@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I | @@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I | ||
117 | for (unsigned int i = 0; i < edges.size(); i++) { | 117 | for (unsigned int i = 0; i < edges.size(); i++) { |
118 | const InflexionGraph::Edge& e1 = edges[i]; | 118 | const InflexionGraph::Edge& e1 = edges[i]; |
119 | if (e1.chunk.textStartPtr == e.chunk.textStartPtr | 119 | if (e1.chunk.textStartPtr == e.chunk.textStartPtr |
120 | - && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints | 120 | + && e1.chunk.textStartPtr == e.chunk.textStartPtr |
121 | + && e1.chunk.textEndPtr == e.chunk.textEndPtr | ||
121 | && e1.chunk.segmentType == e.chunk.segmentType | 122 | && e1.chunk.segmentType == e.chunk.segmentType |
122 | && e1.nextNode == e.nextNode) { | 123 | && e1.nextNode == e.nextNode) { |
123 | return true; | 124 | return true; |
morfeusz/InflexionGraph.hpp
@@ -22,7 +22,7 @@ public: | @@ -22,7 +22,7 @@ public: | ||
22 | 22 | ||
23 | struct Edge { | 23 | struct Edge { |
24 | InterpretedChunk chunk; | 24 | InterpretedChunk chunk; |
25 | - unsigned int nextNode; | 25 | + unsigned long nextNode; |
26 | }; | 26 | }; |
27 | 27 | ||
28 | void addPath(const std::vector<InterpretedChunk>& path, bool weak); | 28 | void addPath(const std::vector<InterpretedChunk>& path, bool weak); |
morfeusz/InterpretedChunk.hpp
@@ -15,8 +15,6 @@ struct InterpretedChunk { | @@ -15,8 +15,6 @@ struct InterpretedChunk { | ||
15 | unsigned char segmentType; | 15 | unsigned char segmentType; |
16 | const char* textStartPtr; | 16 | const char* textStartPtr; |
17 | const char* textEndPtr; | 17 | const char* textEndPtr; |
18 | - std::vector<uint32_t> originalCodepoints; | ||
19 | - std::vector<uint32_t> lowercaseCodepoints; | ||
20 | const unsigned char* interpsGroupPtr; | 18 | const unsigned char* interpsGroupPtr; |
21 | const unsigned char* interpsPtr; | 19 | const unsigned char* interpsPtr; |
22 | const unsigned char* interpsEndPtr; | 20 | const unsigned char* interpsEndPtr; |
morfeusz/InterpretedChunksDecoder.hpp deleted
1 | -/* | ||
2 | - * File: InterpsGroupDecoder.hpp | ||
3 | - * Author: mlenart | ||
4 | - * | ||
5 | - * Created on November 22, 2013, 10:35 PM | ||
6 | - */ | ||
7 | - | ||
8 | -#ifndef INTERPSGROUPDECODER_HPP | ||
9 | -#define INTERPSGROUPDECODER_HPP | ||
10 | - | ||
11 | -#include <string> | ||
12 | -#include <vector> | ||
13 | -#include <utility> | ||
14 | - | ||
15 | -#include "charset/CharsetConverter.hpp" | ||
16 | -#include "EncodedInterpretation.hpp" | ||
17 | -#include "InterpretedChunk.hpp" | ||
18 | -#include "EncodedInterpretation.hpp" | ||
19 | -#include "charset/CaseConverter.hpp" | ||
20 | -#include "Environment.hpp" | ||
21 | -#include "MorphInterpretation.hpp" | ||
22 | -#include "CasePatternHelper.hpp" | ||
23 | -#include "deserializationUtils.hpp" | ||
24 | -#include "compressionByteUtils.hpp" | ||
25 | -#include "const.hpp" | ||
26 | - | ||
27 | -class InterpretedChunksDecoder { | ||
28 | -public: | ||
29 | - | ||
30 | - InterpretedChunksDecoder(const Environment& env) | ||
31 | - : env(env) { | ||
32 | - } | ||
33 | - | ||
34 | - virtual ~InterpretedChunksDecoder() { | ||
35 | - } | ||
36 | - | ||
37 | - virtual void decode( | ||
38 | - unsigned int startNode, | ||
39 | - unsigned int endNode, | ||
40 | - const InterpretedChunk& interpretedChunk, | ||
41 | - std::vector<MorphInterpretation>& out) const = 0; | ||
42 | - | ||
43 | -protected: | ||
44 | - | ||
45 | - const Environment& env; | ||
46 | -}; | ||
47 | - | ||
48 | -class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | ||
49 | -public: | ||
50 | - | ||
51 | - InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | ||
52 | - } | ||
53 | - | ||
54 | - void decode( | ||
55 | - unsigned int startNode, | ||
56 | - unsigned int endNode, | ||
57 | - const InterpretedChunk& interpretedChunk, | ||
58 | - std::vector<MorphInterpretation>& out) const { | ||
59 | - string orth; | ||
60 | - string lemmaPrefix; | ||
61 | - if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { | ||
62 | - orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
63 | - const unsigned char* currPtr = interpretedChunk.interpsPtr; | ||
64 | - while (currPtr < interpretedChunk.interpsEndPtr) { | ||
65 | - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); | ||
66 | - } | ||
67 | - } | ||
68 | - } | ||
69 | - | ||
70 | -protected: | ||
71 | - | ||
72 | - void decodeForm( | ||
73 | - const vector<uint32_t>& orth, | ||
74 | - const EncodedForm& lemma, | ||
75 | - bool forPrefix, | ||
76 | - string& res) const { | ||
77 | - for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { | ||
78 | - uint32_t cp = | ||
79 | - (i < lemma.casePattern.size() && lemma.casePattern[i]) | ||
80 | - ? env.getCaseConverter().toTitle(orth[i]) | ||
81 | - : orth[i]; | ||
82 | - env.getCharsetConverter().append(cp, res); | ||
83 | - } | ||
84 | - if (!forPrefix) { | ||
85 | - const char* suffixPtr = lemma.suffixToAdd.c_str(); | ||
86 | - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | ||
87 | - while (suffixPtr != suffixEnd) { | ||
88 | - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | ||
89 | - env.getCharsetConverter().append(cp, res); | ||
90 | - } | ||
91 | - } | ||
92 | - } | ||
93 | - | ||
94 | - void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { | ||
95 | - encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) | ||
96 | - ? getPrefixCutLength(compressionByte) | ||
97 | - : readInt8(ptr); | ||
98 | - encodedForm.suffixToCut = readInt8(ptr); | ||
99 | - encodedForm.suffixToAdd = readString(ptr); | ||
100 | - assert(encodedForm.casePattern.size() == 0); | ||
101 | - if (isLemmaOnlyLower(compressionByte)) { | ||
102 | - encodedForm.casePattern = std::vector<bool>(); | ||
103 | - } else if (isLemmaOnlyTitle(compressionByte)) { | ||
104 | - encodedForm.casePattern = std::vector<bool>(); | ||
105 | - encodedForm.casePattern.push_back(true); | ||
106 | - } else { | ||
107 | - encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
108 | - } | ||
109 | - } | ||
110 | - | ||
111 | - EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { | ||
112 | - EncodedInterpretation interp; | ||
113 | - if (isOrthOnlyLower(compressionByte)) { | ||
114 | - } else if (isOrthOnlyTitle(compressionByte)) { | ||
115 | - interp.orthCasePattern.push_back(true); | ||
116 | - } else { | ||
117 | - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
118 | - } | ||
119 | - deserializeEncodedForm(ptr, compressionByte, interp.value); | ||
120 | - interp.tag = readInt16(ptr); | ||
121 | - interp.nameClassifier = *ptr++; | ||
122 | - interp.qualifiers = readInt16(ptr); | ||
123 | - return interp; | ||
124 | - } | ||
125 | -private: | ||
126 | - | ||
127 | - pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { | ||
128 | - vector<string> splitRes(split(lemma, ':')); | ||
129 | - if (splitRes.size() == 2) { | ||
130 | - return make_pair(splitRes[0], splitRes[1]); | ||
131 | - } else { | ||
132 | - return make_pair(lemma, ""); | ||
133 | - } | ||
134 | - } | ||
135 | - | ||
136 | - void decodeMorphInterpretation( | ||
137 | - unsigned int startNode, unsigned int endNode, | ||
138 | - const string& orth, | ||
139 | - const string& lemmaPrefix, | ||
140 | - const InterpretedChunk& chunk, | ||
141 | - bool forPrefix, | ||
142 | - const unsigned char*& ptr, | ||
143 | - std::vector<MorphInterpretation>& out) const { | ||
144 | - string lemma = lemmaPrefix; | ||
145 | - EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); | ||
146 | - this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma); | ||
147 | - if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) { | ||
148 | - // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); | ||
149 | - out.push_back(MorphInterpretation( | ||
150 | - startNode, endNode, | ||
151 | - orth, lemma, | ||
152 | - // "", | ||
153 | - ei.tag, | ||
154 | - ei.nameClassifier, | ||
155 | - ei.qualifiers, | ||
156 | - env)); | ||
157 | - } | ||
158 | - } | ||
159 | - | ||
160 | - bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { | ||
161 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
162 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
163 | - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | ||
164 | - const unsigned char* ptr = prefixChunk.interpsPtr; | ||
165 | - std::vector<MorphInterpretation> mi; | ||
166 | - // env.getCasePatternHelper().skipCasePattern(ptr); | ||
167 | - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); | ||
168 | - if (!mi.empty()) { | ||
169 | - lemmaPrefix += mi[0].getLemma(); | ||
170 | - } else { | ||
171 | - return false; | ||
172 | - } | ||
173 | - } | ||
174 | - return true; | ||
175 | - } | ||
176 | -}; | ||
177 | - | ||
178 | -class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | ||
179 | -public: | ||
180 | - | ||
181 | - InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | ||
182 | - } | ||
183 | - | ||
184 | - void decode( | ||
185 | - unsigned int startNode, | ||
186 | - unsigned int endNode, | ||
187 | - const InterpretedChunk& interpretedChunk, | ||
188 | - std::vector<MorphInterpretation>& out) const { | ||
189 | - string orthPrefix; | ||
190 | - string lemma; | ||
191 | - convertPrefixes(interpretedChunk, orthPrefix, lemma); | ||
192 | - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
193 | - const unsigned char* currPtr = interpretedChunk.interpsPtr; | ||
194 | - while (currPtr < interpretedChunk.interpsEndPtr) { | ||
195 | - MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | ||
196 | - // cerr << mi.toString(false) << endl; | ||
197 | - // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | ||
198 | - if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) { | ||
199 | - out.push_back(mi); | ||
200 | - } | ||
201 | - } | ||
202 | - } | ||
203 | - | ||
204 | -private: | ||
205 | - | ||
206 | - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const { | ||
207 | - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
208 | - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
209 | - lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); | ||
210 | - const unsigned char* ptr = prefixChunk.interpsPtr; | ||
211 | - MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); | ||
212 | - orthPrefix += mi.getOrth(); | ||
213 | - } | ||
214 | - } | ||
215 | - | ||
216 | - MorphInterpretation decodeMorphInterpretation( | ||
217 | - unsigned int startNode, unsigned int endNode, | ||
218 | - const string& orthPrefix, | ||
219 | - const string& lemma, | ||
220 | - const InterpretedChunk& chunk, | ||
221 | - const unsigned char*& ptr) const { | ||
222 | - string orth = orthPrefix; | ||
223 | - EncodedInterpretation ei = this->deserializeInterp(ptr); | ||
224 | - this->decodeForm(chunk.originalCodepoints, ei.value, orth); | ||
225 | - return MorphInterpretation( | ||
226 | - startNode, endNode, | ||
227 | - orth, lemma + HOMONYM_SEPARATOR + ei.homonymId, | ||
228 | - // ei.homonymId, | ||
229 | - ei.tag, | ||
230 | - ei.nameClassifier, | ||
231 | - ei.qualifiers, | ||
232 | - env); | ||
233 | - } | ||
234 | - | ||
235 | - void decodeForm( | ||
236 | - const vector<uint32_t>& lemma, | ||
237 | - const EncodedForm& orth, | ||
238 | - string& res) const { | ||
239 | - res += orth.prefixToAdd; | ||
240 | - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | ||
241 | - env.getCharsetConverter().append(lemma[i], res); | ||
242 | - } | ||
243 | - const char* suffixPtr = orth.suffixToAdd.c_str(); | ||
244 | - const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | ||
245 | - while (suffixPtr != suffixEnd) { | ||
246 | - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | ||
247 | - env.getCharsetConverter().append(cp, res); | ||
248 | - } | ||
249 | - } | ||
250 | - | ||
251 | - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { | ||
252 | - EncodedInterpretation interp; | ||
253 | - interp.homonymId = readString(ptr); | ||
254 | - interp.value.prefixToAdd = readString(ptr); | ||
255 | - interp.value.suffixToCut = readInt8(ptr); | ||
256 | - interp.value.suffixToAdd = readString(ptr); | ||
257 | - interp.tag = readInt16(ptr); | ||
258 | - interp.nameClassifier = readInt8(ptr); | ||
259 | - interp.qualifiers = readInt16(ptr); | ||
260 | - return interp; | ||
261 | - } | ||
262 | -}; | ||
263 | - | ||
264 | -#endif /* INTERPSGROUPDECODER_HPP */ | ||
265 | - |
morfeusz/Morfeusz.cpp
@@ -12,7 +12,7 @@ | @@ -12,7 +12,7 @@ | ||
12 | #include "data/default_fsa.hpp" | 12 | #include "data/default_fsa.hpp" |
13 | #include "Morfeusz.hpp" | 13 | #include "Morfeusz.hpp" |
14 | #include "MorphDeserializer.hpp" | 14 | #include "MorphDeserializer.hpp" |
15 | -#include "InterpretedChunksDecoder.hpp" | 15 | +#include "decoder/InterpretedChunksDecoder.hpp" |
16 | #include "charset/CharsetConverter.hpp" | 16 | #include "charset/CharsetConverter.hpp" |
17 | #include "charset/charset_utils.hpp" | 17 | #include "charset/charset_utils.hpp" |
18 | #include "charset/CaseConverter.hpp" | 18 | #include "charset/CaseConverter.hpp" |
@@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() { | @@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() { | ||
34 | return res; | 34 | return res; |
35 | } | 35 | } |
36 | 36 | ||
37 | +static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | ||
38 | + to.prefixChunks.insert( | ||
39 | + to.prefixChunks.begin(), | ||
40 | + from.prefixChunks.begin(), | ||
41 | + from.prefixChunks.end()); | ||
42 | + to.prefixChunks.push_back(from); | ||
43 | + to.textStartPtr = from.textStartPtr; | ||
44 | + from.orthWasShifted = true; | ||
45 | +} | ||
46 | + | ||
47 | +static string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | ||
48 | + stringstream res; | ||
49 | + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | ||
50 | + return res.str(); | ||
51 | +} | ||
52 | + | ||
53 | +static string debugAccum(vector<InterpretedChunk>& accum) { | ||
54 | + stringstream res; | ||
55 | + for (unsigned int i = 0; i < accum.size(); i++) { | ||
56 | + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | ||
57 | + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | ||
58 | + } | ||
59 | + return res.str(); | ||
60 | +} | ||
61 | + | ||
62 | +static void feedStateDirectly( | ||
63 | + StateType& state, | ||
64 | + const char* inputStart, | ||
65 | + const char* inputEnd) { | ||
66 | + const char* currInput = inputStart; | ||
67 | + while (currInput != inputEnd && !state.isSink()) { | ||
68 | + state.proceedToNext(*currInput++); | ||
69 | + } | ||
70 | +} | ||
71 | + | ||
72 | +static void feedState( | ||
73 | + StateType& state, | ||
74 | + int codepoint) { | ||
75 | + std::string chars; | ||
76 | + UTF8CharsetConverter::getInstance().append(codepoint, chars); | ||
77 | + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { | ||
78 | + state.proceedToNext(chars[i]); | ||
79 | + } | ||
80 | +} | ||
81 | + | ||
37 | Morfeusz::Morfeusz() | 82 | Morfeusz::Morfeusz() |
38 | : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), | 83 | : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), |
39 | generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), | 84 | generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), |
@@ -97,11 +142,12 @@ void Morfeusz::processOneWord( | @@ -97,11 +142,12 @@ void Morfeusz::processOneWord( | ||
97 | if (!graph.empty()) { | 142 | if (!graph.empty()) { |
98 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); | 143 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
99 | int srcNode = startNodeNum; | 144 | int srcNode = startNodeNum; |
100 | - for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) { | ||
101 | - const vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i]; | 145 | + const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph(); |
146 | + for (unsigned int i = 0; i < theGraph.size(); i++) { | ||
147 | + const vector<InflexionGraph::Edge>& edges = theGraph[i]; | ||
102 | for (unsigned int j = 0; j < edges.size(); j++) { | 148 | for (unsigned int j = 0; j < edges.size(); j++) { |
103 | const InflexionGraph::Edge& e = edges[j]; | 149 | const InflexionGraph::Edge& e = edges[j]; |
104 | - int targetNode = startNodeNum + e.nextNode; | 150 | + unsigned long targetNode = startNodeNum + e.nextNode; |
105 | interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); | 151 | interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); |
106 | } | 152 | } |
107 | srcNode++; | 153 | srcNode++; |
@@ -118,56 +164,11 @@ void Morfeusz::processOneWord( | @@ -118,56 +164,11 @@ void Morfeusz::processOneWord( | ||
118 | inputStart = currInput; | 164 | inputStart = currInput; |
119 | } | 165 | } |
120 | 166 | ||
121 | -static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | ||
122 | - to.prefixChunks.insert( | ||
123 | - to.prefixChunks.begin(), | ||
124 | - from.prefixChunks.begin(), | ||
125 | - from.prefixChunks.end()); | ||
126 | - to.prefixChunks.push_back(from); | ||
127 | - from.orthWasShifted = true; | ||
128 | - to.textStartPtr = from.textStartPtr; | ||
129 | -} | ||
130 | - | ||
131 | -static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | ||
132 | - stringstream res; | ||
133 | - res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | ||
134 | - return res.str(); | ||
135 | -} | ||
136 | - | ||
137 | -static inline string debugAccum(vector<InterpretedChunk>& accum) { | ||
138 | - stringstream res; | ||
139 | - for (unsigned int i = 0; i < accum.size(); i++) { | ||
140 | - res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | ||
141 | - // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | ||
142 | - } | ||
143 | - return res.str(); | ||
144 | -} | ||
145 | - | ||
146 | -static inline void feedStateDirectly( | ||
147 | - StateType& state, | ||
148 | - const char* inputStart, | ||
149 | - const char* inputEnd) { | ||
150 | - const char* currInput = inputStart; | ||
151 | - while (currInput != inputEnd && !state.isSink()) { | ||
152 | - state.proceedToNext(*currInput++); | ||
153 | - } | ||
154 | -} | ||
155 | - | ||
156 | -static inline void feedState( | ||
157 | - StateType& state, | ||
158 | - int codepoint) { | ||
159 | - std::string chars; | ||
160 | - UTF8CharsetConverter::getInstance().append(codepoint, chars); | ||
161 | - for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { | ||
162 | - state.proceedToNext(chars[i]); | ||
163 | - } | ||
164 | -} | ||
165 | - | ||
166 | void Morfeusz::doProcessOneWord( | 167 | void Morfeusz::doProcessOneWord( |
167 | const Environment& env, | 168 | const Environment& env, |
168 | const char*& inputData, | 169 | const char*& inputData, |
169 | const char* inputEnd, | 170 | const char* inputEnd, |
170 | - SegrulesState segrulesState) const { | 171 | + const SegrulesState& segrulesState) const { |
171 | if (this->options.debug) { | 172 | if (this->options.debug) { |
172 | cerr << "----------" << endl; | 173 | cerr << "----------" << endl; |
173 | cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | 174 | cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; |
@@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord( | @@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord( | ||
178 | const char* currInput = inputData; | 179 | const char* currInput = inputData; |
179 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | 180 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
180 | bool currCodepointIsWhitespace = isWhitespace(codepoint); | 181 | bool currCodepointIsWhitespace = isWhitespace(codepoint); |
181 | - vector<uint32_t> originalCodepoints; | ||
182 | - vector<uint32_t> normalizedCodepoints; | ||
183 | - | ||
184 | - originalCodepoints.reserve(16); | ||
185 | - normalizedCodepoints.reserve(16); | ||
186 | 182 | ||
187 | StateType state = env.getFSA().getInitialState(); | 183 | StateType state = env.getFSA().getInitialState(); |
188 | 184 | ||
@@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord( | @@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord( | ||
190 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER | 186 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER |
191 | ? env.getCaseConverter().toLower(codepoint) | 187 | ? env.getCaseConverter().toLower(codepoint) |
192 | : codepoint; | 188 | : codepoint; |
193 | - originalCodepoints.push_back(codepoint); | ||
194 | - normalizedCodepoints.push_back(normalizedCodepoint); | ||
195 | if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { | 189 | if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { |
196 | feedStateDirectly(state, prevInput, currInput); | 190 | feedStateDirectly(state, prevInput, currInput); |
197 | } | 191 | } |
@@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord( | @@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord( | ||
203 | currCodepointIsWhitespace = isWhitespace(codepoint); | 197 | currCodepointIsWhitespace = isWhitespace(codepoint); |
204 | string homonymId; | 198 | string homonymId; |
205 | if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { | 199 | if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { |
206 | - if (originalCodepoints.size() == 1) { | ||
207 | - throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); | ||
208 | - } | ||
209 | homonymId = string(currInput + 1, inputEnd); | 200 | homonymId = string(currInput + 1, inputEnd); |
210 | - // cerr << "homonym " << homonymId << endl; | ||
211 | prevInput = currInput; | 201 | prevInput = currInput; |
212 | currInput = inputEnd; | 202 | currInput = inputEnd; |
213 | codepoint = 0x00; | 203 | codepoint = 0x00; |
214 | currCodepointIsWhitespace = true; | 204 | currCodepointIsWhitespace = true; |
215 | } | 205 | } |
216 | if (state.isAccepting()) { | 206 | if (state.isAccepting()) { |
217 | - vector<InterpsGroup> val(state.getValue()); | ||
218 | - for (unsigned int i = 0; i < val.size(); i++) { | ||
219 | - InterpsGroup& ig = val[i]; | 207 | +// vector<InterpsGroup> val(state.getValue()); |
208 | + for (unsigned int i = 0; i < state.getValue().size(); i++) { | ||
209 | + const InterpsGroup& ig = state.getValue()[i]; | ||
220 | if (this->options.debug) { | 210 | if (this->options.debug) { |
221 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; | 211 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
222 | } | 212 | } |
223 | - vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); | 213 | + const vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); |
224 | if (!newSegrulesStates.empty() | 214 | if (!newSegrulesStates.empty() |
225 | - && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { | ||
226 | - | ||
227 | - for ( | ||
228 | - vector<SegrulesState>::iterator it = newSegrulesStates.begin(); | ||
229 | - it != newSegrulesStates.end(); | ||
230 | - ++it) { | ||
231 | - SegrulesState newSegrulesState = *it; | 215 | + && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, inputStart, currInput, ig)) { |
216 | + for (unsigned int i = 0; i < newSegrulesStates.size(); i++) { | ||
217 | + const SegrulesState& newSegrulesState = newSegrulesStates[i]; | ||
232 | const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); | 218 | const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); |
233 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; | 219 | const unsigned char* interpsEndPtr = ig.ptr + ig.size; |
234 | - InterpretedChunk ic = { | ||
235 | - ig.type, | ||
236 | - inputStart, | ||
237 | - currInput, | ||
238 | - originalCodepoints, | ||
239 | - normalizedCodepoints, | ||
240 | - ig.ptr, | ||
241 | - interpsPtr, | ||
242 | - interpsEndPtr, | ||
243 | - newSegrulesState.shiftOrthFromPrevious, | ||
244 | - false, | ||
245 | - vector<InterpretedChunk>(), | ||
246 | - homonymId | ||
247 | - }; | 220 | + InterpretedChunk ic; |
221 | + ic.segmentType = ig.type; | ||
222 | + ic.textStartPtr = inputStart; | ||
223 | + ic.textEndPtr = currInput; | ||
224 | + ic.interpsGroupPtr = ig.ptr; | ||
225 | + ic.interpsPtr = interpsPtr; | ||
226 | + ic.interpsEndPtr = interpsEndPtr; | ||
227 | + ic.shiftOrth = newSegrulesState.shiftOrthFromPrevious; | ||
228 | + ic.orthWasShifted = false; | ||
229 | + ic.requiredHomonymId = homonymId; | ||
230 | + | ||
248 | if (!accum.empty() && accum.back().shiftOrth) { | 231 | if (!accum.empty() && accum.back().shiftOrth) { |
249 | doShiftOrth(accum.back(), ic); | 232 | doShiftOrth(accum.back(), ic); |
250 | } | 233 | } |
@@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord( | @@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord( | ||
266 | } | 249 | } |
267 | } | 250 | } |
268 | else if (this->options.debug) { | 251 | else if (this->options.debug) { |
269 | - cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; | 252 | +// cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; |
270 | cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | 253 | cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; |
271 | } | 254 | } |
272 | } | 255 | } |
morfeusz/Morfeusz.hpp
@@ -170,7 +170,7 @@ private: | @@ -170,7 +170,7 @@ private: | ||
170 | const Environment& env, | 170 | const Environment& env, |
171 | const char*& inputData, | 171 | const char*& inputData, |
172 | const char* inputEnd, | 172 | const char* inputEnd, |
173 | - SegrulesState segrulesState) const; | 173 | + const SegrulesState& segrulesState) const; |
174 | 174 | ||
175 | void handleIgnChunk( | 175 | void handleIgnChunk( |
176 | const Environment& env, | 176 | const Environment& env, |
morfeusz/decoder/InterpretedChunksDecoder.hpp
0 → 100644
1 | +/* | ||
2 | + * File: InterpretedChunksDecoder.hpp | |
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on November 22, 2013, 10:35 PM | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef INTERPSGROUPDECODER_HPP | ||
9 | +#define INTERPSGROUPDECODER_HPP | ||
10 | + | ||
11 | +#include <string> | ||
12 | +#include <vector> | ||
13 | +#include <utility> | ||
14 | + | ||
15 | +#include "charset/CharsetConverter.hpp" | ||
16 | +#include "EncodedInterpretation.hpp" | ||
17 | +#include "InterpretedChunk.hpp" | ||
18 | +#include "EncodedInterpretation.hpp" | ||
19 | +#include "charset/CaseConverter.hpp" | ||
20 | +#include "Environment.hpp" | ||
21 | +#include "MorphInterpretation.hpp" | ||
22 | +#include "CasePatternHelper.hpp" | ||
23 | +#include "deserializationUtils.hpp" | ||
24 | +#include "compressionByteUtils.hpp" | ||
25 | +#include "const.hpp" | ||
26 | + | ||
27 | +class InterpretedChunksDecoder { | ||
28 | +public: | ||
29 | + | ||
30 | + InterpretedChunksDecoder(const Environment& env): env(env) { | ||
31 | + } | ||
32 | + | ||
33 | + virtual ~InterpretedChunksDecoder() { | ||
34 | + } | ||
35 | + | ||
36 | + virtual void decode( | ||
37 | + unsigned int startNode, | ||
38 | + unsigned int endNode, | ||
39 | + const InterpretedChunk& interpretedChunk, | ||
40 | + std::vector<MorphInterpretation>& out) const = 0; | ||
41 | + | ||
42 | +protected: | ||
43 | + | ||
44 | + const Environment& env; | ||
45 | +}; | ||
46 | + | ||
47 | +#endif /* INTERPSGROUPDECODER_HPP */ | ||
48 | + |
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp
0 → 100644
1 | +/* | ||
2 | + * File: InterpretedChunksDecoder4Analyzer.cpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 15 maj 2014, 15:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#include "InterpretedChunksDecoder4Analyzer.hpp" | ||
9 | +#include <string> | ||
10 | + | ||
11 | +using namespace std; | ||
12 | + | ||
13 | +InterpretedChunksDecoder4Analyzer::InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { | ||
14 | +} | ||
15 | + | ||
16 | +void InterpretedChunksDecoder4Analyzer::decode( | ||
17 | + unsigned int startNode, | ||
18 | + unsigned int endNode, | ||
19 | + const InterpretedChunk& interpretedChunk, | ||
20 | + std::vector<MorphInterpretation>& out) const { | ||
21 | + string orth; | ||
22 | + string lemmaPrefix; | ||
23 | + if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { | ||
24 | + // orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
25 | + orth.insert(orth.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr); | ||
26 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | ||
27 | + while (currPtr < interpretedChunk.interpsEndPtr) { | ||
28 | + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); | ||
29 | + } | ||
30 | + } | ||
31 | +} | ||
32 | + | ||
33 | +void InterpretedChunksDecoder4Analyzer::decodeLemma( | ||
34 | + const vector<uint32_t>& orth, | ||
35 | + const EncodedForm& lemma, | ||
36 | + bool forPrefix, | ||
37 | + string& res) const { | ||
38 | + for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { | ||
39 | + uint32_t cp = | ||
40 | + (i < lemma.casePattern.size() && lemma.casePattern[i]) | ||
41 | + ? env.getCaseConverter().toTitle(orth[i]) | ||
42 | + : orth[i]; | ||
43 | + env.getCharsetConverter().append(cp, res); | ||
44 | + } | ||
45 | + if (!forPrefix) { | ||
46 | + const char* suffixPtr = lemma.suffixToAdd.c_str(); | ||
47 | + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); | ||
48 | + while (suffixPtr != suffixEnd) { | ||
49 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | ||
50 | + env.getCharsetConverter().append(cp, res); | ||
51 | + } | ||
52 | + } | ||
53 | +} | ||
54 | + | ||
55 | +void InterpretedChunksDecoder4Analyzer::deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { | ||
56 | + encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) | ||
57 | + ? getPrefixCutLength(compressionByte) | ||
58 | + : readInt8(ptr); | ||
59 | + encodedForm.suffixToCut = readInt8(ptr); | ||
60 | + encodedForm.suffixToAdd = readString(ptr); | ||
61 | + assert(encodedForm.casePattern.size() == 0); | ||
62 | + if (isLemmaOnlyLower(compressionByte)) { | ||
63 | +// encodedForm.casePattern = std::vector<bool>(); | ||
64 | + } | ||
65 | + else if (isLemmaOnlyTitle(compressionByte)) { | ||
66 | +// encodedForm.casePattern = std::vector<bool>(); | ||
67 | + encodedForm.casePattern.push_back(true); | ||
68 | + } | ||
69 | + else { | ||
70 | + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
71 | + } | ||
72 | +} | ||
73 | + | ||
74 | +EncodedInterpretation InterpretedChunksDecoder4Analyzer::deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { | ||
75 | + EncodedInterpretation interp; | ||
76 | + if (isOrthOnlyLower(compressionByte)) { | ||
77 | + } | ||
78 | + else if (isOrthOnlyTitle(compressionByte)) { | ||
79 | + interp.orthCasePattern.push_back(true); | ||
80 | + } | ||
81 | + else { | ||
82 | + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); | ||
83 | + } | ||
84 | + deserializeEncodedForm(ptr, compressionByte, interp.value); | ||
85 | + interp.tag = readInt16(ptr); | ||
86 | + interp.nameClassifier = *ptr++; | ||
87 | + interp.qualifiers = readInt16(ptr); | ||
88 | + return interp; | ||
89 | +} | ||
90 | + | ||
91 | +void InterpretedChunksDecoder4Analyzer::decodeMorphInterpretation( | ||
92 | + unsigned int startNode, unsigned int endNode, | ||
93 | + const string& orth, | ||
94 | + const string& lemmaPrefix, | ||
95 | + const InterpretedChunk& chunk, | ||
96 | + bool forPrefix, | ||
97 | + const unsigned char*& ptr, | ||
98 | + std::vector<MorphInterpretation>& out) const { | ||
99 | + string lemma(lemmaPrefix); | ||
100 | + orthCodepoints.clear(); | ||
101 | + normalizedCodepoints.clear(); | ||
102 | + const char* currPtr = chunk.textStartPtr; | ||
103 | + while (currPtr != chunk.textEndPtr) { | ||
104 | + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr); | ||
105 | + orthCodepoints.push_back(cp); | ||
106 | + normalizedCodepoints.push_back(env.getCaseConverter().toLower(cp)); | ||
107 | + } | ||
108 | + EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); | ||
109 | + if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, orthCodepoints, ei.orthCasePattern)) { | ||
110 | + this->decodeLemma(normalizedCodepoints, ei.value, forPrefix, lemma); | ||
111 | + // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); | ||
112 | + out.push_back(MorphInterpretation( | ||
113 | + startNode, endNode, | ||
114 | + orth, lemma, | ||
115 | + // "", | ||
116 | + ei.tag, | ||
117 | + ei.nameClassifier, | ||
118 | + ei.qualifiers, | ||
119 | + env)); | ||
120 | + } | ||
121 | +} | ||
122 | + | ||
123 | +bool InterpretedChunksDecoder4Analyzer::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { | ||
124 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
125 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
126 | + orth.insert(orth.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr); | ||
127 | + const unsigned char* ptr = prefixChunk.interpsPtr; | ||
128 | + std::vector<MorphInterpretation> mi; | ||
129 | + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); | ||
130 | + if (!mi.empty()) { | ||
131 | + lemmaPrefix += mi[0].getLemma(); | ||
132 | + } | ||
133 | + else { | ||
134 | + return false; | ||
135 | + } | ||
136 | + } | ||
137 | + return true; | ||
138 | +} |
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp
0 → 100644
1 | +/* | ||
2 | + * File: InterpretedChunksDecoder4Analyzer.hpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 15 maj 2014, 15:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef INTERPRETEDCHUNKSDECODER4ANALYZER_HPP | ||
9 | +#define INTERPRETEDCHUNKSDECODER4ANALYZER_HPP | ||
10 | + | ||
11 | +#include "InterpretedChunksDecoder.hpp" | ||
12 | + | ||
13 | +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { | ||
14 | +public: | ||
15 | + | ||
16 | + InterpretedChunksDecoder4Analyzer(const Environment& env); | ||
17 | + | ||
18 | + void decode( | ||
19 | + unsigned int startNode, | ||
20 | + unsigned int endNode, | ||
21 | + const InterpretedChunk& interpretedChunk, | ||
22 | + std::vector<MorphInterpretation>& out) const; | ||
23 | + | ||
24 | +private: | ||
25 | + | ||
26 | + void decodeLemma( | ||
27 | + const vector<uint32_t>& orth, | ||
28 | + const EncodedForm& lemma, | ||
29 | + bool forPrefix, | ||
30 | + string& res) const; | ||
31 | + | ||
32 | + void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const; | ||
33 | + | ||
34 | + EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const; | ||
35 | + | ||
36 | + void decodeMorphInterpretation( | ||
37 | + unsigned int startNode, unsigned int endNode, | ||
38 | + const string& orth, | ||
39 | + const string& lemmaPrefix, | ||
40 | + const InterpretedChunk& chunk, | ||
41 | + bool forPrefix, | ||
42 | + const unsigned char*& ptr, | ||
43 | + std::vector<MorphInterpretation>& out) const; | ||
44 | + | ||
45 | + bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const; | ||
46 | + | ||
47 | + mutable std::vector<uint32_t> orthCodepoints; | ||
48 | + mutable std::vector<uint32_t> normalizedCodepoints; | ||
49 | +}; | ||
50 | + | ||
51 | +#endif /* INTERPRETEDCHUNKSDECODER4ANALYZER_HPP */ | ||
52 | + |
morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp
0 → 100644
1 | +/* | ||
2 | + * File: InterpretedChunksDecoder4Generator.cpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 15 maj 2014, 15:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#include "InterpretedChunksDecoder4Generator.hpp" | ||
9 | +#include <string> | ||
10 | +#include <vector> | ||
11 | + | ||
12 | +using namespace std; | ||
13 | + | ||
14 | +InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { | ||
15 | +} | ||
16 | + | ||
17 | +void InterpretedChunksDecoder4Generator::decode( | ||
18 | + unsigned int startNode, | ||
19 | + unsigned int endNode, | ||
20 | + const InterpretedChunk& interpretedChunk, | ||
21 | + std::vector<MorphInterpretation>& out) const { | ||
22 | + string orthPrefix; | ||
23 | + string lemma; | ||
24 | + convertPrefixes(interpretedChunk, orthPrefix, lemma); | ||
25 | + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | ||
26 | + lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr); | ||
27 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | ||
28 | + while (currPtr < interpretedChunk.interpsEndPtr) { | ||
29 | + MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | ||
30 | + // cerr << mi.toString(false) << endl; | ||
31 | + // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | ||
32 | + if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) { | ||
33 | + out.push_back(mi); | ||
34 | + } | ||
35 | + } | ||
36 | +} | ||
37 | + | ||
38 | +void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const { | ||
39 | + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { | ||
40 | + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; | ||
41 | + lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr); | ||
42 | + const unsigned char* ptr = prefixChunk.interpsPtr; | ||
43 | + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); | ||
44 | + orthPrefix += mi.getOrth(); | ||
45 | + } | ||
46 | +} | ||
47 | + | ||
48 | +MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation( | ||
49 | + unsigned int startNode, unsigned int endNode, | ||
50 | + const string& orthPrefix, | ||
51 | + const string& lemma, | ||
52 | + const InterpretedChunk& chunk, | ||
53 | + const unsigned char*& ptr) const { | ||
54 | + string orth = orthPrefix; | ||
55 | + EncodedInterpretation ei = this->deserializeInterp(ptr); | ||
56 | + codepoints.clear(); | ||
57 | + const char* currPtr = chunk.textStartPtr; | ||
58 | + while (currPtr != chunk.textEndPtr) { | ||
59 | + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr); | ||
60 | + codepoints.push_back(cp); | ||
61 | + } | ||
62 | + this->decodeForm(codepoints, ei.value, orth); | ||
63 | + return MorphInterpretation( | ||
64 | + startNode, endNode, | ||
65 | + orth, ei.homonymId.empty() ? lemma : (lemma + HOMONYM_SEPARATOR + ei.homonymId), | ||
66 | + // ei.homonymId, | ||
67 | + ei.tag, | ||
68 | + ei.nameClassifier, | ||
69 | + ei.qualifiers, | ||
70 | + env); | ||
71 | +} | ||
72 | + | ||
73 | +void InterpretedChunksDecoder4Generator::decodeForm( | ||
74 | + const vector<uint32_t>& lemma, | ||
75 | + const EncodedForm& orth, | ||
76 | + string& res) const { | ||
77 | + res += orth.prefixToAdd; | ||
78 | + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | ||
79 | + env.getCharsetConverter().append(lemma[i], res); | ||
80 | + } | ||
81 | + const char* suffixPtr = orth.suffixToAdd.c_str(); | ||
82 | + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | ||
83 | + while (suffixPtr != suffixEnd) { | ||
84 | + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); | ||
85 | + env.getCharsetConverter().append(cp, res); | ||
86 | + } | ||
87 | +} | ||
88 | + | ||
89 | +EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const { | ||
90 | + EncodedInterpretation interp; | ||
91 | + interp.homonymId = readString(ptr); | ||
92 | + interp.value.prefixToAdd = readString(ptr); | ||
93 | + interp.value.suffixToCut = readInt8(ptr); | ||
94 | + interp.value.suffixToAdd = readString(ptr); | ||
95 | + interp.tag = readInt16(ptr); | ||
96 | + interp.nameClassifier = readInt8(ptr); | ||
97 | + interp.qualifiers = readInt16(ptr); | ||
98 | + return interp; | ||
99 | +} |
morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp
0 → 100644
1 | +/* | ||
2 | + * File: InterpretedChunksDecoder4Generator.hpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 15 maj 2014, 15:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef INTERPRETEDCHUNKSDECODER4GENERATOR_HPP | ||
9 | +#define INTERPRETEDCHUNKSDECODER4GENERATOR_HPP | ||
10 | + | ||
11 | +#include "InterpretedChunksDecoder.hpp" | ||
12 | + | ||
13 | +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { | ||
14 | +public: | ||
15 | + | ||
16 | + InterpretedChunksDecoder4Generator(const Environment& env); | ||
17 | + | ||
18 | + void decode( | ||
19 | + unsigned int startNode, | ||
20 | + unsigned int endNode, | ||
21 | + const InterpretedChunk& interpretedChunk, | ||
22 | + std::vector<MorphInterpretation>& out) const; | ||
23 | + | ||
24 | +private: | ||
25 | + | ||
26 | + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const; | ||
27 | + | ||
28 | + MorphInterpretation decodeMorphInterpretation( | ||
29 | + unsigned int startNode, unsigned int endNode, | ||
30 | + const string& orthPrefix, | ||
31 | + const string& lemma, | ||
32 | + const InterpretedChunk& chunk, | ||
33 | + const unsigned char*& ptr) const; | ||
34 | + | ||
35 | + void decodeForm( | ||
36 | + const vector<uint32_t>& lemma, | ||
37 | + const EncodedForm& orth, | ||
38 | + string& res) const; | ||
39 | + | ||
40 | + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const; | ||
41 | + | ||
42 | + mutable std::vector<uint32_t> codepoints; | ||
43 | +}; | ||
44 | + | ||
45 | + | ||
46 | +#endif /* INTERPRETEDCHUNKSDECODER4GENERATOR_HPP */ | ||
47 | + |
morfeusz/fsa/fsa.hpp
@@ -167,7 +167,7 @@ public: | @@ -167,7 +167,7 @@ public: | ||
167 | * Makes sense only for accepting states. | 167 | * Makes sense only for accepting states. |
168 | * For non-accepting states it throws an exception. | 168 | * For non-accepting states it throws an exception. |
169 | */ | 169 | */ |
170 | - T getValue() const; | 170 | + const T& getValue() const; |
171 | 171 | ||
172 | unsigned char getLastTransitionValue() const; | 172 | unsigned char getLastTransitionValue() const; |
173 | 173 |
morfeusz/fsa/state_impl.hpp
@@ -46,7 +46,7 @@ unsigned long State<T>::getOffset() const { | @@ -46,7 +46,7 @@ unsigned long State<T>::getOffset() const { | ||
46 | } | 46 | } |
47 | 47 | ||
48 | template <class T> | 48 | template <class T> |
49 | -T State<T>::getValue() const { | 49 | +const T& State<T>::getValue() const { |
50 | assert(this->isAccepting()); | 50 | assert(this->isAccepting()); |
51 | return this->value; | 51 | return this->value; |
52 | } | 52 | } |
morfeusz/morfeusz_analyzer.cpp
@@ -43,11 +43,20 @@ int main(int argc, const char** argv) { | @@ -43,11 +43,20 @@ int main(int argc, const char** argv) { | ||
43 | else if (prevStart != -1) { | 43 | else if (prevStart != -1) { |
44 | printf("; "); | 44 | printf("; "); |
45 | } | 45 | } |
46 | - printf("%s", mi.toString(true).c_str()); | ||
47 | -// printf("%d,%d,%s,%s,%s,%s", | ||
48 | -// mi.getStartNode(), mi.getEndNode(), | ||
49 | -// mi.getOrth().c_str(), lemmaToShow.c_str(), | ||
50 | -// mi.getTag().c_str(), lemmaToShow.c_str()); | 46 | +// printf("%s", mi.toString(true).c_str()); |
47 | + printf("%d,%d,%s,%s,%s", | ||
48 | + mi.getStartNode(), mi.getEndNode(), | ||
49 | + mi.getOrth().c_str(), mi.getLemma().c_str(), | ||
50 | + mi.getTag().c_str()); | ||
51 | + if (!mi.getName().empty()) { | ||
52 | + printf(",%s", mi.getName().c_str()); | ||
53 | + } | ||
54 | + if (!mi.getQualifiers().empty()) { | ||
55 | + printf(",%s", mi.getQualifiers()[0].c_str()); | ||
56 | + for (unsigned int i = 1; i < mi.getQualifiers().size(); i++) { | ||
57 | + printf("|%s", mi.getQualifiers()[i].c_str()); | ||
58 | + } | ||
59 | + } | ||
51 | prevStart = mi.getStartNode(); | 60 | prevStart = mi.getStartNode(); |
52 | prevEnd = mi.getEndNode(); | 61 | prevEnd = mi.getEndNode(); |
53 | } | 62 | } |
morfeusz/segrules/SegrulesFSA.hpp
@@ -34,12 +34,12 @@ public: | @@ -34,12 +34,12 @@ public: | ||
34 | 34 | ||
35 | std::vector<SegrulesState> proceedToNext( | 35 | std::vector<SegrulesState> proceedToNext( |
36 | const unsigned char segnum, | 36 | const unsigned char segnum, |
37 | - const SegrulesState state, | 37 | + const SegrulesState& state, |
38 | bool atEndOfWord) const { | 38 | bool atEndOfWord) const { |
39 | std::vector<SegrulesState> res; | 39 | std::vector<SegrulesState> res; |
40 | const unsigned char* currPtr = ptr + state.offset + 1; | 40 | const unsigned char* currPtr = ptr + state.offset + 1; |
41 | const unsigned char transitionsNum = *currPtr++; | 41 | const unsigned char transitionsNum = *currPtr++; |
42 | - for (unsigned int i = 0; i < transitionsNum; i++) { | 42 | + for (int i = 0; i < transitionsNum; i++) { |
43 | if (*currPtr == segnum) { | 43 | if (*currPtr == segnum) { |
44 | SegrulesState newState = this->transition2State(currPtr); | 44 | SegrulesState newState = this->transition2State(currPtr); |
45 | if ((atEndOfWord && newState.accepting) | 45 | if ((atEndOfWord && newState.accepting) |
nbproject/configurations.xml
@@ -130,6 +130,8 @@ | @@ -130,6 +130,8 @@ | ||
130 | </ccTool> | 130 | </ccTool> |
131 | </item> | 131 | </item> |
132 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> | 132 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
133 | + <ccTool flags="1"> | ||
134 | + </ccTool> | ||
133 | </item> | 135 | </item> |
134 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" | 136 | <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" |
135 | ex="false" | 137 | ex="false" |
@@ -239,6 +241,7 @@ | @@ -239,6 +241,7 @@ | ||
239 | <pElem>build/morfeusz</pElem> | 241 | <pElem>build/morfeusz</pElem> |
240 | </incDir> | 242 | </incDir> |
241 | <preprocessorList> | 243 | <preprocessorList> |
244 | + <Elem>NDEBUG</Elem> | ||
242 | <Elem>libmorfeusz_EXPORTS</Elem> | 245 | <Elem>libmorfeusz_EXPORTS</Elem> |
243 | </preprocessorList> | 246 | </preprocessorList> |
244 | </ccTool> | 247 | </ccTool> |
@@ -283,7 +286,7 @@ | @@ -283,7 +286,7 @@ | ||
283 | <ccTool> | 286 | <ccTool> |
284 | <incDir> | 287 | <incDir> |
285 | <pElem>morfeusz</pElem> | 288 | <pElem>morfeusz</pElem> |
286 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | 289 | + <pElem>/usr/lib/jvm/default-java/include</pElem> |
287 | </incDir> | 290 | </incDir> |
288 | <preprocessorList> | 291 | <preprocessorList> |
289 | <Elem>NDEBUG</Elem> | 292 | <Elem>NDEBUG</Elem> |
@@ -310,6 +313,19 @@ | @@ -310,6 +313,19 @@ | ||
310 | </undefinedList> | 313 | </undefinedList> |
311 | </ccTool> | 314 | </ccTool> |
312 | </folder> | 315 | </folder> |
316 | + <item path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4"> | ||
317 | + <ccTool flags="1"> | ||
318 | + <incDir> | ||
319 | + <pElem>build</pElem> | ||
320 | + <pElem>morfeusz</pElem> | ||
321 | + <pElem>build/morfeusz</pElem> | ||
322 | + </incDir> | ||
323 | + <preprocessorList> | ||
324 | + <Elem>NDEBUG</Elem> | ||
325 | + <Elem>libmorfeusz_EXPORTS</Elem> | ||
326 | + </preprocessorList> | ||
327 | + </ccTool> | ||
328 | + </item> | ||
313 | <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> | 329 | <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> |
314 | <ccTool flags="1"> | 330 | <ccTool flags="1"> |
315 | <incDir> | 331 | <incDir> |
@@ -387,40 +403,75 @@ | @@ -387,40 +403,75 @@ | ||
387 | </ccTool> | 403 | </ccTool> |
388 | </item> | 404 | </item> |
389 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> | 405 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> |
390 | - <ccTool flags="1"> | 406 | + <ccTool flags="2"> |
391 | <incDir> | 407 | <incDir> |
392 | <pElem>build</pElem> | 408 | <pElem>build</pElem> |
393 | <pElem>morfeusz</pElem> | 409 | <pElem>morfeusz</pElem> |
394 | <pElem>build/morfeusz</pElem> | 410 | <pElem>build/morfeusz</pElem> |
395 | </incDir> | 411 | </incDir> |
396 | <preprocessorList> | 412 | <preprocessorList> |
397 | - <Elem>NDEBUG</Elem> | ||
398 | <Elem>libmorfeusz_EXPORTS</Elem> | 413 | <Elem>libmorfeusz_EXPORTS</Elem> |
399 | </preprocessorList> | 414 | </preprocessorList> |
400 | </ccTool> | 415 | </ccTool> |
401 | </item> | 416 | </item> |
402 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> | 417 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
418 | + <ccTool flags="2"> | ||
419 | + </ccTool> | ||
403 | </item> | 420 | </item> |
404 | <item path="morfeusz/charset/CharsetConverter.cpp" | 421 | <item path="morfeusz/charset/CharsetConverter.cpp" |
405 | ex="false" | 422 | ex="false" |
406 | tool="1" | 423 | tool="1" |
407 | flavor2="4"> | 424 | flavor2="4"> |
408 | - <ccTool flags="1"> | ||
409 | - <preprocessorList> | ||
410 | - <Elem>NDEBUG</Elem> | ||
411 | - </preprocessorList> | 425 | + <ccTool flags="2"> |
412 | </ccTool> | 426 | </ccTool> |
413 | </item> | 427 | </item> |
414 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> | 428 | <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> |
429 | + <ccTool flags="2"> | ||
430 | + </ccTool> | ||
415 | </item> | 431 | </item> |
416 | <item path="morfeusz/charset/conversion_tables.cpp" | 432 | <item path="morfeusz/charset/conversion_tables.cpp" |
417 | ex="false" | 433 | ex="false" |
418 | tool="1" | 434 | tool="1" |
419 | flavor2="4"> | 435 | flavor2="4"> |
436 | + <ccTool flags="2"> | ||
437 | + </ccTool> | ||
420 | </item> | 438 | </item> |
421 | <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> | 439 | <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> |
440 | + <ccTool flags="1"> | ||
441 | + </ccTool> | ||
422 | </item> | 442 | </item> |
423 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> | 443 | <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> |
444 | + <ccTool flags="2"> | ||
445 | + <incDir> | ||
446 | + <pElem>build</pElem> | ||
447 | + <pElem>morfeusz</pElem> | ||
448 | + <pElem>build/morfeusz</pElem> | ||
449 | + </incDir> | ||
450 | + <preprocessorList> | ||
451 | + <Elem>libmorfeusz_EXPORTS</Elem> | ||
452 | + </preprocessorList> | ||
453 | + </ccTool> | ||
454 | + </item> | ||
455 | + <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp" | ||
456 | + ex="false" | ||
457 | + tool="1" | ||
458 | + flavor2="4"> | ||
459 | + <ccTool flags="1"> | ||
460 | + <incDir> | ||
461 | + <pElem>build</pElem> | ||
462 | + <pElem>morfeusz</pElem> | ||
463 | + <pElem>build/morfeusz</pElem> | ||
464 | + </incDir> | ||
465 | + <preprocessorList> | ||
466 | + <Elem>NDEBUG</Elem> | ||
467 | + <Elem>libmorfeusz_EXPORTS</Elem> | ||
468 | + </preprocessorList> | ||
469 | + </ccTool> | ||
470 | + </item> | ||
471 | + <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp" | ||
472 | + ex="false" | ||
473 | + tool="1" | ||
474 | + flavor2="4"> | ||
424 | <ccTool flags="1"> | 475 | <ccTool flags="1"> |
425 | <incDir> | 476 | <incDir> |
426 | <pElem>build</pElem> | 477 | <pElem>build</pElem> |
@@ -509,6 +560,8 @@ | @@ -509,6 +560,8 @@ | ||
509 | </ccTool> | 560 | </ccTool> |
510 | </item> | 561 | </item> |
511 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> | 562 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
563 | + <ccTool flags="1"> | ||
564 | + </ccTool> | ||
512 | </item> | 565 | </item> |
513 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> | 566 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
514 | <ccTool flags="0"> | 567 | <ccTool flags="0"> |
profile.sh
0 → 100755
1 | +#!/bin/bash | ||
2 | + | ||
3 | +rm -rf profbuild | ||
4 | +mkdir -p profbuild | ||
5 | +cd profbuild | ||
6 | +cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" .. | ||
7 | +make -j4 | ||
8 | +rm -f /tmp/morfeusz.prof | ||
9 | +export LD_PRELOAD="/usr/lib/libprofiler.so" | ||
10 | +export CPUPROFILE="/tmp/morfeusz.prof" | ||
11 | +morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents10k > /dev/null | ||
12 | +### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof |