Commit f80dea595a7fb0c3ef6f9dea0075249a41c6f86b

Authored by Michał Lenart
1 parent f3f17708

dalsza optymalizacja kodu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@181 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
morfeusz/CMakeLists.txt
... ... @@ -38,6 +38,9 @@ set(SRC_FILES
38 38 charset/conversion_tables.cpp
39 39 cli/cli.cpp
40 40 segrules/segrules.cpp
  41 + CasePatternHelper.cpp
  42 + decoder/InterpretedChunksDecoder4Analyzer.cpp
  43 + decoder/InterpretedChunksDecoder4Generator.cpp
41 44 )
42 45  
43 46 set(INCLUDE_FILES
... ...
morfeusz/CasePatternHelper.hpp
... ... @@ -12,6 +12,9 @@
12 12 #include "InterpsGroup.hpp"
13 13 #include "CasePatternHelper.hpp"
14 14 #include "compressionByteUtils.hpp"
  15 +#include "Environment.hpp"
  16 +
  17 +class Environment;
15 18  
16 19 class CasePatternHelper {
17 20 public:
... ... @@ -39,64 +42,17 @@ public:
39 42 }
40 43  
41 44 bool checkInterpsGroupOrthCasePatterns(
42   - const std::vector<uint32_t>& lowercaseCodepoints,
43   - const std::vector<uint32_t>& originalCodepoints,
44   - const InterpsGroup& ig) const {
45   - const unsigned char* currPtr = ig.ptr;
46   - unsigned char compressionByte = *currPtr++;
47   - if (!this->caseSensitive) {
48   - return true;
49   - }
50   - else if (isOrthOnlyLower(compressionByte)) {
51   - return true;
52   - }
53   - else if (isOrthOnlyTitle(compressionByte)) {
54   - return lowercaseCodepoints[0] != originalCodepoints[0];
55   - }
56   - else {
57   - unsigned char casePatternsNum = *currPtr++;
58   - if (casePatternsNum == 0) {
59   - return true;
60   - }
61   - else {
62   - for (unsigned int i = 0; i < casePatternsNum; i++) {
63   - if (checkCasePattern(
64   - lowercaseCodepoints,
65   - originalCodepoints,
66   - deserializeOneCasePattern(currPtr))) {
67   - return true;
68   - }
69   - }
70   - return false;
71   - }
72   - }
73   - }
  45 + const Environment& env,
  46 + const char* orthStart,
  47 + const char* orthEnd,
  48 + const InterpsGroup& ig) const;
74 49  
75   - std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
76   - std::vector<bool> res;
77   - uint8_t casePatternType = *ptr++;
78   - uint8_t prefixLength;
79   - uint8_t patternLength;
80   - switch (casePatternType) {
81   - case LEMMA_ONLY_LOWER:
82   - break;
83   - case LEMMA_UPPER_PREFIX:
84   - prefixLength = *ptr++;
85   - res.resize(prefixLength, true);
86   - break;
87   - case LEMMA_MIXED_CASE:
88   - patternLength = *ptr++;
89   - for (unsigned int i = 0; i < patternLength; i++) {
90   - uint8_t idx = *ptr++;
91   - res.resize(idx + 1, false);
92   - res[idx] = true;
93   - }
94   - break;
95   - }
96   - return res;
97   - }
  50 + static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr);
98 51 private:
99 52 bool caseSensitive;
  53 +
  54 + mutable vector<uint32_t> orthCodepoints;
  55 + mutable vector<uint32_t> normalizedCodepoints;
100 56  
101 57 static const uint8_t LEMMA_ONLY_LOWER = 0;
102 58 static const uint8_t LEMMA_UPPER_PREFIX = 1;
... ...
morfeusz/Environment.cpp
... ... @@ -8,9 +8,11 @@
8 8 #include <vector>
9 9 #include <algorithm>
10 10 #include "Environment.hpp"
11   -#include "InterpretedChunksDecoder.hpp"
  11 +#include "decoder/InterpretedChunksDecoder.hpp"
12 12 #include "MorphDeserializer.hpp"
13 13 #include "exceptions.hpp"
  14 +#include "decoder/InterpretedChunksDecoder4Analyzer.hpp"
  15 +#include "decoder/InterpretedChunksDecoder4Generator.hpp"
14 16  
15 17 //class InterpretedChunksDecoder4Analyzer;
16 18 //class InterpretedChunksDecoder4Generator;
... ... @@ -53,7 +55,7 @@ processorType == ANALYZER
53 55 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
54 56 : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
55 57 processorType(processorType),
56   -casePatternHelper() {
  58 +casePatternHelper(new CasePatternHelper()) {
57 59 }
58 60  
59 61 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
... ... @@ -78,6 +80,7 @@ Environment::~Environment() {
78 80 delete this->fsaFileStartPtr;
79 81 }
80 82 delete this->chunksDecoder;
  83 + delete this->casePatternHelper;
81 84 }
82 85  
83 86 void Environment::setCharset(MorfeuszCharset charset) {
... ... @@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const {
146 149 }
147 150  
148 151 void Environment::setCaseSensitive(bool caseSensitive) {
149   - this->casePatternHelper.setCaseSensitive(caseSensitive);
  152 + this->casePatternHelper->setCaseSensitive(caseSensitive);
150 153 }
151 154  
152 155 const CasePatternHelper& Environment::getCasePatternHelper() const {
153   - return this->casePatternHelper;
  156 + return *this->casePatternHelper;
154 157 }
155 158  
156 159 const Qualifiers& Environment::getQualifiersHelper() const {
... ...
morfeusz/Environment.hpp
... ... @@ -11,6 +11,7 @@
11 11 #include <vector>
12 12  
13 13 class InterpretedChunksDecoder;
  14 +class CasePatternHelper;
14 15  
15 16 #include "charset/CaseConverter.hpp"
16 17 #include "charset/CharsetConverter.hpp"
... ... @@ -79,7 +80,7 @@ private:
79 80  
80 81 const InterpretedChunksDecoder* chunksDecoder;
81 82 MorfeuszProcessorType processorType;
82   - CasePatternHelper casePatternHelper;
  83 + CasePatternHelper* casePatternHelper;
83 84  
84 85 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
85 86 };
... ...
morfeusz/InflexionGraph.cpp
... ... @@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea
78 78 this->addMiddleEdge((unsigned int) this->graph.size(), e);
79 79 }
80 80 else {
81   - Edge e = {chunk, (int) this->graph.size() + 1};
  81 + Edge e = {chunk, (unsigned long) this->graph.size() + 1};
82 82 this->addMiddleEdge((unsigned int) this->graph.size(), e);
83 83 }
84 84 }
... ... @@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I
117 117 for (unsigned int i = 0; i < edges.size(); i++) {
118 118 const InflexionGraph::Edge& e1 = edges[i];
119 119 if (e1.chunk.textStartPtr == e.chunk.textStartPtr
120   - && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints
  120 + && e1.chunk.textStartPtr == e.chunk.textStartPtr
  121 + && e1.chunk.textEndPtr == e.chunk.textEndPtr
121 122 && e1.chunk.segmentType == e.chunk.segmentType
122 123 && e1.nextNode == e.nextNode) {
123 124 return true;
... ...
morfeusz/InflexionGraph.hpp
... ... @@ -22,7 +22,7 @@ public:
22 22  
23 23 struct Edge {
24 24 InterpretedChunk chunk;
25   - unsigned int nextNode;
  25 + unsigned long nextNode;
26 26 };
27 27  
28 28 void addPath(const std::vector<InterpretedChunk>& path, bool weak);
... ...
morfeusz/InterpretedChunk.hpp
... ... @@ -15,8 +15,6 @@ struct InterpretedChunk {
15 15 unsigned char segmentType;
16 16 const char* textStartPtr;
17 17 const char* textEndPtr;
18   - std::vector<uint32_t> originalCodepoints;
19   - std::vector<uint32_t> lowercaseCodepoints;
20 18 const unsigned char* interpsGroupPtr;
21 19 const unsigned char* interpsPtr;
22 20 const unsigned char* interpsEndPtr;
... ...
morfeusz/InterpretedChunksDecoder.hpp deleted
1   -/*
2   - * File: InterpsGroupDecoder.hpp
3   - * Author: mlenart
4   - *
5   - * Created on November 22, 2013, 10:35 PM
6   - */
7   -
8   -#ifndef INTERPSGROUPDECODER_HPP
9   -#define INTERPSGROUPDECODER_HPP
10   -
11   -#include <string>
12   -#include <vector>
13   -#include <utility>
14   -
15   -#include "charset/CharsetConverter.hpp"
16   -#include "EncodedInterpretation.hpp"
17   -#include "InterpretedChunk.hpp"
18   -#include "EncodedInterpretation.hpp"
19   -#include "charset/CaseConverter.hpp"
20   -#include "Environment.hpp"
21   -#include "MorphInterpretation.hpp"
22   -#include "CasePatternHelper.hpp"
23   -#include "deserializationUtils.hpp"
24   -#include "compressionByteUtils.hpp"
25   -#include "const.hpp"
26   -
27   -class InterpretedChunksDecoder {
28   -public:
29   -
30   - InterpretedChunksDecoder(const Environment& env)
31   - : env(env) {
32   - }
33   -
34   - virtual ~InterpretedChunksDecoder() {
35   - }
36   -
37   - virtual void decode(
38   - unsigned int startNode,
39   - unsigned int endNode,
40   - const InterpretedChunk& interpretedChunk,
41   - std::vector<MorphInterpretation>& out) const = 0;
42   -
43   -protected:
44   -
45   - const Environment& env;
46   -};
47   -
48   -class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
49   -public:
50   -
51   - InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
52   - }
53   -
54   - void decode(
55   - unsigned int startNode,
56   - unsigned int endNode,
57   - const InterpretedChunk& interpretedChunk,
58   - std::vector<MorphInterpretation>& out) const {
59   - string orth;
60   - string lemmaPrefix;
61   - if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) {
62   - orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
63   - const unsigned char* currPtr = interpretedChunk.interpsPtr;
64   - while (currPtr < interpretedChunk.interpsEndPtr) {
65   - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);
66   - }
67   - }
68   - }
69   -
70   -protected:
71   -
72   - void decodeForm(
73   - const vector<uint32_t>& orth,
74   - const EncodedForm& lemma,
75   - bool forPrefix,
76   - string& res) const {
77   - for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {
78   - uint32_t cp =
79   - (i < lemma.casePattern.size() && lemma.casePattern[i])
80   - ? env.getCaseConverter().toTitle(orth[i])
81   - : orth[i];
82   - env.getCharsetConverter().append(cp, res);
83   - }
84   - if (!forPrefix) {
85   - const char* suffixPtr = lemma.suffixToAdd.c_str();
86   - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
87   - while (suffixPtr != suffixEnd) {
88   - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
89   - env.getCharsetConverter().append(cp, res);
90   - }
91   - }
92   - }
93   -
94   - void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const {
95   - encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte)
96   - ? getPrefixCutLength(compressionByte)
97   - : readInt8(ptr);
98   - encodedForm.suffixToCut = readInt8(ptr);
99   - encodedForm.suffixToAdd = readString(ptr);
100   - assert(encodedForm.casePattern.size() == 0);
101   - if (isLemmaOnlyLower(compressionByte)) {
102   - encodedForm.casePattern = std::vector<bool>();
103   - } else if (isLemmaOnlyTitle(compressionByte)) {
104   - encodedForm.casePattern = std::vector<bool>();
105   - encodedForm.casePattern.push_back(true);
106   - } else {
107   - encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
108   - }
109   - }
110   -
111   - EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {
112   - EncodedInterpretation interp;
113   - if (isOrthOnlyLower(compressionByte)) {
114   - } else if (isOrthOnlyTitle(compressionByte)) {
115   - interp.orthCasePattern.push_back(true);
116   - } else {
117   - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
118   - }
119   - deserializeEncodedForm(ptr, compressionByte, interp.value);
120   - interp.tag = readInt16(ptr);
121   - interp.nameClassifier = *ptr++;
122   - interp.qualifiers = readInt16(ptr);
123   - return interp;
124   - }
125   -private:
126   -
127   - pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
128   - vector<string> splitRes(split(lemma, ':'));
129   - if (splitRes.size() == 2) {
130   - return make_pair(splitRes[0], splitRes[1]);
131   - } else {
132   - return make_pair(lemma, "");
133   - }
134   - }
135   -
136   - void decodeMorphInterpretation(
137   - unsigned int startNode, unsigned int endNode,
138   - const string& orth,
139   - const string& lemmaPrefix,
140   - const InterpretedChunk& chunk,
141   - bool forPrefix,
142   - const unsigned char*& ptr,
143   - std::vector<MorphInterpretation>& out) const {
144   - string lemma = lemmaPrefix;
145   - EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);
146   - this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma);
147   - if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) {
148   - // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
149   - out.push_back(MorphInterpretation(
150   - startNode, endNode,
151   - orth, lemma,
152   - // "",
153   - ei.tag,
154   - ei.nameClassifier,
155   - ei.qualifiers,
156   - env));
157   - }
158   - }
159   -
160   - bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const {
161   - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
162   - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
163   - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
164   - const unsigned char* ptr = prefixChunk.interpsPtr;
165   - std::vector<MorphInterpretation> mi;
166   - // env.getCasePatternHelper().skipCasePattern(ptr);
167   - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);
168   - if (!mi.empty()) {
169   - lemmaPrefix += mi[0].getLemma();
170   - } else {
171   - return false;
172   - }
173   - }
174   - return true;
175   - }
176   -};
177   -
178   -class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
179   -public:
180   -
181   - InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
182   - }
183   -
184   - void decode(
185   - unsigned int startNode,
186   - unsigned int endNode,
187   - const InterpretedChunk& interpretedChunk,
188   - std::vector<MorphInterpretation>& out) const {
189   - string orthPrefix;
190   - string lemma;
191   - convertPrefixes(interpretedChunk, orthPrefix, lemma);
192   - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
193   - const unsigned char* currPtr = interpretedChunk.interpsPtr;
194   - while (currPtr < interpretedChunk.interpsEndPtr) {
195   - MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
196   - // cerr << mi.toString(false) << endl;
197   - // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
198   - if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {
199   - out.push_back(mi);
200   - }
201   - }
202   - }
203   -
204   -private:
205   -
206   - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {
207   - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
208   - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
209   - lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
210   - const unsigned char* ptr = prefixChunk.interpsPtr;
211   - MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
212   - orthPrefix += mi.getOrth();
213   - }
214   - }
215   -
216   - MorphInterpretation decodeMorphInterpretation(
217   - unsigned int startNode, unsigned int endNode,
218   - const string& orthPrefix,
219   - const string& lemma,
220   - const InterpretedChunk& chunk,
221   - const unsigned char*& ptr) const {
222   - string orth = orthPrefix;
223   - EncodedInterpretation ei = this->deserializeInterp(ptr);
224   - this->decodeForm(chunk.originalCodepoints, ei.value, orth);
225   - return MorphInterpretation(
226   - startNode, endNode,
227   - orth, lemma + HOMONYM_SEPARATOR + ei.homonymId,
228   - // ei.homonymId,
229   - ei.tag,
230   - ei.nameClassifier,
231   - ei.qualifiers,
232   - env);
233   - }
234   -
235   - void decodeForm(
236   - const vector<uint32_t>& lemma,
237   - const EncodedForm& orth,
238   - string& res) const {
239   - res += orth.prefixToAdd;
240   - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) {
241   - env.getCharsetConverter().append(lemma[i], res);
242   - }
243   - const char* suffixPtr = orth.suffixToAdd.c_str();
244   - const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
245   - while (suffixPtr != suffixEnd) {
246   - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
247   - env.getCharsetConverter().append(cp, res);
248   - }
249   - }
250   -
251   - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
252   - EncodedInterpretation interp;
253   - interp.homonymId = readString(ptr);
254   - interp.value.prefixToAdd = readString(ptr);
255   - interp.value.suffixToCut = readInt8(ptr);
256   - interp.value.suffixToAdd = readString(ptr);
257   - interp.tag = readInt16(ptr);
258   - interp.nameClassifier = readInt8(ptr);
259   - interp.qualifiers = readInt16(ptr);
260   - return interp;
261   - }
262   -};
263   -
264   -#endif /* INTERPSGROUPDECODER_HPP */
265   -
morfeusz/Morfeusz.cpp
... ... @@ -12,7 +12,7 @@
12 12 #include "data/default_fsa.hpp"
13 13 #include "Morfeusz.hpp"
14 14 #include "MorphDeserializer.hpp"
15   -#include "InterpretedChunksDecoder.hpp"
  15 +#include "decoder/InterpretedChunksDecoder.hpp"
16 16 #include "charset/CharsetConverter.hpp"
17 17 #include "charset/charset_utils.hpp"
18 18 #include "charset/CaseConverter.hpp"
... ... @@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() {
34 34 return res;
35 35 }
36 36  
  37 +static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
  38 + to.prefixChunks.insert(
  39 + to.prefixChunks.begin(),
  40 + from.prefixChunks.begin(),
  41 + from.prefixChunks.end());
  42 + to.prefixChunks.push_back(from);
  43 + to.textStartPtr = from.textStartPtr;
  44 + from.orthWasShifted = true;
  45 +}
  46 +
  47 +static string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
  48 + stringstream res;
  49 + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";
  50 + return res.str();
  51 +}
  52 +
  53 +static string debugAccum(vector<InterpretedChunk>& accum) {
  54 + stringstream res;
  55 + for (unsigned int i = 0; i < accum.size(); i++) {
  56 + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);
  57 + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
  58 + }
  59 + return res.str();
  60 +}
  61 +
  62 +static void feedStateDirectly(
  63 + StateType& state,
  64 + const char* inputStart,
  65 + const char* inputEnd) {
  66 + const char* currInput = inputStart;
  67 + while (currInput != inputEnd && !state.isSink()) {
  68 + state.proceedToNext(*currInput++);
  69 + }
  70 +}
  71 +
  72 +static void feedState(
  73 + StateType& state,
  74 + int codepoint) {
  75 + std::string chars;
  76 + UTF8CharsetConverter::getInstance().append(codepoint, chars);
  77 + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {
  78 + state.proceedToNext(chars[i]);
  79 + }
  80 +}
  81 +
37 82 Morfeusz::Morfeusz()
38 83 : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA),
39 84 generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
... ... @@ -97,11 +142,12 @@ void Morfeusz::processOneWord(
97 142 if (!graph.empty()) {
98 143 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
99 144 int srcNode = startNodeNum;
100   - for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) {
101   - const vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i];
  145 + const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph();
  146 + for (unsigned int i = 0; i < theGraph.size(); i++) {
  147 + const vector<InflexionGraph::Edge>& edges = theGraph[i];
102 148 for (unsigned int j = 0; j < edges.size(); j++) {
103 149 const InflexionGraph::Edge& e = edges[j];
104   - int targetNode = startNodeNum + e.nextNode;
  150 + unsigned long targetNode = startNodeNum + e.nextNode;
105 151 interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results);
106 152 }
107 153 srcNode++;
... ... @@ -118,56 +164,11 @@ void Morfeusz::processOneWord(
118 164 inputStart = currInput;
119 165 }
120 166  
121   -static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
122   - to.prefixChunks.insert(
123   - to.prefixChunks.begin(),
124   - from.prefixChunks.begin(),
125   - from.prefixChunks.end());
126   - to.prefixChunks.push_back(from);
127   - from.orthWasShifted = true;
128   - to.textStartPtr = from.textStartPtr;
129   -}
130   -
131   -static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
132   - stringstream res;
133   - res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";
134   - return res.str();
135   -}
136   -
137   -static inline string debugAccum(vector<InterpretedChunk>& accum) {
138   - stringstream res;
139   - for (unsigned int i = 0; i < accum.size(); i++) {
140   - res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);
141   - // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
142   - }
143   - return res.str();
144   -}
145   -
146   -static inline void feedStateDirectly(
147   - StateType& state,
148   - const char* inputStart,
149   - const char* inputEnd) {
150   - const char* currInput = inputStart;
151   - while (currInput != inputEnd && !state.isSink()) {
152   - state.proceedToNext(*currInput++);
153   - }
154   -}
155   -
156   -static inline void feedState(
157   - StateType& state,
158   - int codepoint) {
159   - std::string chars;
160   - UTF8CharsetConverter::getInstance().append(codepoint, chars);
161   - for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {
162   - state.proceedToNext(chars[i]);
163   - }
164   -}
165   -
166 167 void Morfeusz::doProcessOneWord(
167 168 const Environment& env,
168 169 const char*& inputData,
169 170 const char* inputEnd,
170   - SegrulesState segrulesState) const {
  171 + const SegrulesState& segrulesState) const {
171 172 if (this->options.debug) {
172 173 cerr << "----------" << endl;
173 174 cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
... ... @@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord(
178 179 const char* currInput = inputData;
179 180 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
180 181 bool currCodepointIsWhitespace = isWhitespace(codepoint);
181   - vector<uint32_t> originalCodepoints;
182   - vector<uint32_t> normalizedCodepoints;
183   -
184   - originalCodepoints.reserve(16);
185   - normalizedCodepoints.reserve(16);
186 182  
187 183 StateType state = env.getFSA().getInitialState();
188 184  
... ... @@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord(
190 186 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
191 187 ? env.getCaseConverter().toLower(codepoint)
192 188 : codepoint;
193   - originalCodepoints.push_back(codepoint);
194   - normalizedCodepoints.push_back(normalizedCodepoint);
195 189 if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
196 190 feedStateDirectly(state, prevInput, currInput);
197 191 }
... ... @@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord(
203 197 currCodepointIsWhitespace = isWhitespace(codepoint);
204 198 string homonymId;
205 199 if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) {
206   - if (originalCodepoints.size() == 1) {
207   - throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
208   - }
209 200 homonymId = string(currInput + 1, inputEnd);
210   - // cerr << "homonym " << homonymId << endl;
211 201 prevInput = currInput;
212 202 currInput = inputEnd;
213 203 codepoint = 0x00;
214 204 currCodepointIsWhitespace = true;
215 205 }
216 206 if (state.isAccepting()) {
217   - vector<InterpsGroup> val(state.getValue());
218   - for (unsigned int i = 0; i < val.size(); i++) {
219   - InterpsGroup& ig = val[i];
  207 +// vector<InterpsGroup> val(state.getValue());
  208 + for (unsigned int i = 0; i < state.getValue().size(); i++) {
  209 + const InterpsGroup& ig = state.getValue()[i];
220 210 if (this->options.debug) {
221 211 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
222 212 }
223   - vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace);
  213 + const vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace);
224 214 if (!newSegrulesStates.empty()
225   - && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
226   -
227   - for (
228   - vector<SegrulesState>::iterator it = newSegrulesStates.begin();
229   - it != newSegrulesStates.end();
230   - ++it) {
231   - SegrulesState newSegrulesState = *it;
  215 + && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, inputStart, currInput, ig)) {
  216 + for (unsigned int i = 0; i < newSegrulesStates.size(); i++) {
  217 + const SegrulesState& newSegrulesState = newSegrulesStates[i];
232 218 const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
233 219 const unsigned char* interpsEndPtr = ig.ptr + ig.size;
234   - InterpretedChunk ic = {
235   - ig.type,
236   - inputStart,
237   - currInput,
238   - originalCodepoints,
239   - normalizedCodepoints,
240   - ig.ptr,
241   - interpsPtr,
242   - interpsEndPtr,
243   - newSegrulesState.shiftOrthFromPrevious,
244   - false,
245   - vector<InterpretedChunk>(),
246   - homonymId
247   - };
  220 + InterpretedChunk ic;
  221 + ic.segmentType = ig.type;
  222 + ic.textStartPtr = inputStart;
  223 + ic.textEndPtr = currInput;
  224 + ic.interpsGroupPtr = ig.ptr;
  225 + ic.interpsPtr = interpsPtr;
  226 + ic.interpsEndPtr = interpsEndPtr;
  227 + ic.shiftOrth = newSegrulesState.shiftOrthFromPrevious;
  228 + ic.orthWasShifted = false;
  229 + ic.requiredHomonymId = homonymId;
  230 +
248 231 if (!accum.empty() && accum.back().shiftOrth) {
249 232 doShiftOrth(accum.back(), ic);
250 233 }
... ... @@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord(
266 249 }
267 250 }
268 251 else if (this->options.debug) {
269   - cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl;
  252 +// cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl;
270 253 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
271 254 }
272 255 }
... ...
morfeusz/Morfeusz.hpp
... ... @@ -170,7 +170,7 @@ private:
170 170 const Environment& env,
171 171 const char*& inputData,
172 172 const char* inputEnd,
173   - SegrulesState segrulesState) const;
  173 + const SegrulesState& segrulesState) const;
174 174  
175 175 void handleIgnChunk(
176 176 const Environment& env,
... ...
morfeusz/decoder/InterpretedChunksDecoder.hpp 0 → 100644
  1 +/*
  2 + * File: InterpsGroupDecoder.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on November 22, 2013, 10:35 PM
  6 + */
  7 +
  8 +#ifndef INTERPSGROUPDECODER_HPP
  9 +#define INTERPSGROUPDECODER_HPP
  10 +
  11 +#include <string>
  12 +#include <vector>
  13 +#include <utility>
  14 +
  15 +#include "charset/CharsetConverter.hpp"
  16 +#include "EncodedInterpretation.hpp"
  17 +#include "InterpretedChunk.hpp"
  18 +#include "EncodedInterpretation.hpp"
  19 +#include "charset/CaseConverter.hpp"
  20 +#include "Environment.hpp"
  21 +#include "MorphInterpretation.hpp"
  22 +#include "CasePatternHelper.hpp"
  23 +#include "deserializationUtils.hpp"
  24 +#include "compressionByteUtils.hpp"
  25 +#include "const.hpp"
  26 +
  27 +class InterpretedChunksDecoder {
  28 +public:
  29 +
  30 + InterpretedChunksDecoder(const Environment& env): env(env) {
  31 + }
  32 +
  33 + virtual ~InterpretedChunksDecoder() {
  34 + }
  35 +
  36 + virtual void decode(
  37 + unsigned int startNode,
  38 + unsigned int endNode,
  39 + const InterpretedChunk& interpretedChunk,
  40 + std::vector<MorphInterpretation>& out) const = 0;
  41 +
  42 +protected:
  43 +
  44 + const Environment& env;
  45 +};
  46 +
  47 +#endif /* INTERPSGROUPDECODER_HPP */
  48 +
... ...
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Analyzer.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#include "InterpretedChunksDecoder4Analyzer.hpp"
  9 +#include <string>
  10 +
  11 +using namespace std;
  12 +
  13 +InterpretedChunksDecoder4Analyzer::InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
  14 +}
  15 +
  16 +void InterpretedChunksDecoder4Analyzer::decode(
  17 + unsigned int startNode,
  18 + unsigned int endNode,
  19 + const InterpretedChunk& interpretedChunk,
  20 + std::vector<MorphInterpretation>& out) const {
  21 + string orth;
  22 + string lemmaPrefix;
  23 + if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) {
  24 + // orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  25 + orth.insert(orth.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
  26 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  27 + while (currPtr < interpretedChunk.interpsEndPtr) {
  28 + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);
  29 + }
  30 + }
  31 +}
  32 +
  33 +void InterpretedChunksDecoder4Analyzer::decodeLemma(
  34 + const vector<uint32_t>& orth,
  35 + const EncodedForm& lemma,
  36 + bool forPrefix,
  37 + string& res) const {
  38 + for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {
  39 + uint32_t cp =
  40 + (i < lemma.casePattern.size() && lemma.casePattern[i])
  41 + ? env.getCaseConverter().toTitle(orth[i])
  42 + : orth[i];
  43 + env.getCharsetConverter().append(cp, res);
  44 + }
  45 + if (!forPrefix) {
  46 + const char* suffixPtr = lemma.suffixToAdd.c_str();
  47 + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
  48 + while (suffixPtr != suffixEnd) {
  49 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
  50 + env.getCharsetConverter().append(cp, res);
  51 + }
  52 + }
  53 +}
  54 +
  55 +void InterpretedChunksDecoder4Analyzer::deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const {
  56 + encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte)
  57 + ? getPrefixCutLength(compressionByte)
  58 + : readInt8(ptr);
  59 + encodedForm.suffixToCut = readInt8(ptr);
  60 + encodedForm.suffixToAdd = readString(ptr);
  61 + assert(encodedForm.casePattern.size() == 0);
  62 + if (isLemmaOnlyLower(compressionByte)) {
  63 +// encodedForm.casePattern = std::vector<bool>();
  64 + }
  65 + else if (isLemmaOnlyTitle(compressionByte)) {
  66 +// encodedForm.casePattern = std::vector<bool>();
  67 + encodedForm.casePattern.push_back(true);
  68 + }
  69 + else {
  70 + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  71 + }
  72 +}
  73 +
  74 +EncodedInterpretation InterpretedChunksDecoder4Analyzer::deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {
  75 + EncodedInterpretation interp;
  76 + if (isOrthOnlyLower(compressionByte)) {
  77 + }
  78 + else if (isOrthOnlyTitle(compressionByte)) {
  79 + interp.orthCasePattern.push_back(true);
  80 + }
  81 + else {
  82 + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
  83 + }
  84 + deserializeEncodedForm(ptr, compressionByte, interp.value);
  85 + interp.tag = readInt16(ptr);
  86 + interp.nameClassifier = *ptr++;
  87 + interp.qualifiers = readInt16(ptr);
  88 + return interp;
  89 +}
  90 +
  91 +void InterpretedChunksDecoder4Analyzer::decodeMorphInterpretation(
  92 + unsigned int startNode, unsigned int endNode,
  93 + const string& orth,
  94 + const string& lemmaPrefix,
  95 + const InterpretedChunk& chunk,
  96 + bool forPrefix,
  97 + const unsigned char*& ptr,
  98 + std::vector<MorphInterpretation>& out) const {
  99 + string lemma(lemmaPrefix);
  100 + orthCodepoints.clear();
  101 + normalizedCodepoints.clear();
  102 + const char* currPtr = chunk.textStartPtr;
  103 + while (currPtr != chunk.textEndPtr) {
  104 + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr);
  105 + orthCodepoints.push_back(cp);
  106 + normalizedCodepoints.push_back(env.getCaseConverter().toLower(cp));
  107 + }
  108 + EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);
  109 + if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, orthCodepoints, ei.orthCasePattern)) {
  110 + this->decodeLemma(normalizedCodepoints, ei.value, forPrefix, lemma);
  111 + // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
  112 + out.push_back(MorphInterpretation(
  113 + startNode, endNode,
  114 + orth, lemma,
  115 + // "",
  116 + ei.tag,
  117 + ei.nameClassifier,
  118 + ei.qualifiers,
  119 + env));
  120 + }
  121 +}
  122 +
  123 +bool InterpretedChunksDecoder4Analyzer::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const {
  124 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  125 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  126 + orth.insert(orth.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
  127 + const unsigned char* ptr = prefixChunk.interpsPtr;
  128 + std::vector<MorphInterpretation> mi;
  129 + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);
  130 + if (!mi.empty()) {
  131 + lemmaPrefix += mi[0].getLemma();
  132 + }
  133 + else {
  134 + return false;
  135 + }
  136 + }
  137 + return true;
  138 +}
... ...
morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Analyzer.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#ifndef INTERPRETEDCHUNKSDECODER4ANALYZER_HPP
  9 +#define INTERPRETEDCHUNKSDECODER4ANALYZER_HPP
  10 +
  11 +#include "InterpretedChunksDecoder.hpp"
  12 +
  13 +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
  14 +public:
  15 +
  16 + InterpretedChunksDecoder4Analyzer(const Environment& env);
  17 +
  18 + void decode(
  19 + unsigned int startNode,
  20 + unsigned int endNode,
  21 + const InterpretedChunk& interpretedChunk,
  22 + std::vector<MorphInterpretation>& out) const;
  23 +
  24 +private:
  25 +
  26 + void decodeLemma(
  27 + const vector<uint32_t>& orth,
  28 + const EncodedForm& lemma,
  29 + bool forPrefix,
  30 + string& res) const;
  31 +
  32 + void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const;
  33 +
  34 + EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const;
  35 +
  36 + void decodeMorphInterpretation(
  37 + unsigned int startNode, unsigned int endNode,
  38 + const string& orth,
  39 + const string& lemmaPrefix,
  40 + const InterpretedChunk& chunk,
  41 + bool forPrefix,
  42 + const unsigned char*& ptr,
  43 + std::vector<MorphInterpretation>& out) const;
  44 +
  45 + bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const;
  46 +
  47 + mutable std::vector<uint32_t> orthCodepoints;
  48 + mutable std::vector<uint32_t> normalizedCodepoints;
  49 +};
  50 +
  51 +#endif /* INTERPRETEDCHUNKSDECODER4ANALYZER_HPP */
  52 +
... ...
morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Generator.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#include "InterpretedChunksDecoder4Generator.hpp"
  9 +#include <string>
  10 +#include <vector>
  11 +
  12 +using namespace std;
  13 +
  14 +InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
  15 +}
  16 +
  17 +void InterpretedChunksDecoder4Generator::decode(
  18 + unsigned int startNode,
  19 + unsigned int endNode,
  20 + const InterpretedChunk& interpretedChunk,
  21 + std::vector<MorphInterpretation>& out) const {
  22 + string orthPrefix;
  23 + string lemma;
  24 + convertPrefixes(interpretedChunk, orthPrefix, lemma);
  25 + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  26 + lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
  27 + const unsigned char* currPtr = interpretedChunk.interpsPtr;
  28 + while (currPtr < interpretedChunk.interpsEndPtr) {
  29 + MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
  30 + // cerr << mi.toString(false) << endl;
  31 + // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
  32 + if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {
  33 + out.push_back(mi);
  34 + }
  35 + }
  36 +}
  37 +
  38 +void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {
  39 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  40 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  41 + lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
  42 + const unsigned char* ptr = prefixChunk.interpsPtr;
  43 + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
  44 + orthPrefix += mi.getOrth();
  45 + }
  46 +}
  47 +
  48 +MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation(
  49 + unsigned int startNode, unsigned int endNode,
  50 + const string& orthPrefix,
  51 + const string& lemma,
  52 + const InterpretedChunk& chunk,
  53 + const unsigned char*& ptr) const {
  54 + string orth = orthPrefix;
  55 + EncodedInterpretation ei = this->deserializeInterp(ptr);
  56 + codepoints.clear();
  57 + const char* currPtr = chunk.textStartPtr;
  58 + while (currPtr != chunk.textEndPtr) {
  59 + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr);
  60 + codepoints.push_back(cp);
  61 + }
  62 + this->decodeForm(codepoints, ei.value, orth);
  63 + return MorphInterpretation(
  64 + startNode, endNode,
  65 + orth, ei.homonymId.empty() ? lemma : (lemma + HOMONYM_SEPARATOR + ei.homonymId),
  66 + // ei.homonymId,
  67 + ei.tag,
  68 + ei.nameClassifier,
  69 + ei.qualifiers,
  70 + env);
  71 +}
  72 +
  73 +void InterpretedChunksDecoder4Generator::decodeForm(
  74 + const vector<uint32_t>& lemma,
  75 + const EncodedForm& orth,
  76 + string& res) const {
  77 + res += orth.prefixToAdd;
  78 + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) {
  79 + env.getCharsetConverter().append(lemma[i], res);
  80 + }
  81 + const char* suffixPtr = orth.suffixToAdd.c_str();
  82 + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
  83 + while (suffixPtr != suffixEnd) {
  84 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
  85 + env.getCharsetConverter().append(cp, res);
  86 + }
  87 +}
  88 +
  89 +EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const {
  90 + EncodedInterpretation interp;
  91 + interp.homonymId = readString(ptr);
  92 + interp.value.prefixToAdd = readString(ptr);
  93 + interp.value.suffixToCut = readInt8(ptr);
  94 + interp.value.suffixToAdd = readString(ptr);
  95 + interp.tag = readInt16(ptr);
  96 + interp.nameClassifier = readInt8(ptr);
  97 + interp.qualifiers = readInt16(ptr);
  98 + return interp;
  99 +}
... ...
morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunksDecoder4Generator.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 15 maj 2014, 15:28
  6 + */
  7 +
  8 +#ifndef INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
  9 +#define INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
  10 +
  11 +#include "InterpretedChunksDecoder.hpp"
  12 +
  13 +class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
  14 +public:
  15 +
  16 + InterpretedChunksDecoder4Generator(const Environment& env);
  17 +
  18 + void decode(
  19 + unsigned int startNode,
  20 + unsigned int endNode,
  21 + const InterpretedChunk& interpretedChunk,
  22 + std::vector<MorphInterpretation>& out) const;
  23 +
  24 +private:
  25 +
  26 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const;
  27 +
  28 + MorphInterpretation decodeMorphInterpretation(
  29 + unsigned int startNode, unsigned int endNode,
  30 + const string& orthPrefix,
  31 + const string& lemma,
  32 + const InterpretedChunk& chunk,
  33 + const unsigned char*& ptr) const;
  34 +
  35 + void decodeForm(
  36 + const vector<uint32_t>& lemma,
  37 + const EncodedForm& orth,
  38 + string& res) const;
  39 +
  40 + EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const;
  41 +
  42 + mutable std::vector<uint32_t> codepoints;
  43 +};
  44 +
  45 +
  46 +#endif /* INTERPRETEDCHUNKSDECODER4GENERATOR_HPP */
  47 +
... ...
morfeusz/fsa/fsa.hpp
... ... @@ -167,7 +167,7 @@ public:
167 167 * Makes sense only for accepting states.
168 168 * For non-accepting states it throws an exception.
169 169 */
170   - T getValue() const;
  170 + const T& getValue() const;
171 171  
172 172 unsigned char getLastTransitionValue() const;
173 173  
... ...
morfeusz/fsa/state_impl.hpp
... ... @@ -46,7 +46,7 @@ unsigned long State&lt;T&gt;::getOffset() const {
46 46 }
47 47  
48 48 template <class T>
49   -T State<T>::getValue() const {
  49 +const T& State<T>::getValue() const {
50 50 assert(this->isAccepting());
51 51 return this->value;
52 52 }
... ...
morfeusz/morfeusz_analyzer.cpp
... ... @@ -43,11 +43,20 @@ int main(int argc, const char** argv) {
43 43 else if (prevStart != -1) {
44 44 printf("; ");
45 45 }
46   - printf("%s", mi.toString(true).c_str());
47   -// printf("%d,%d,%s,%s,%s,%s",
48   -// mi.getStartNode(), mi.getEndNode(),
49   -// mi.getOrth().c_str(), lemmaToShow.c_str(),
50   -// mi.getTag().c_str(), lemmaToShow.c_str());
  46 +// printf("%s", mi.toString(true).c_str());
  47 + printf("%d,%d,%s,%s,%s",
  48 + mi.getStartNode(), mi.getEndNode(),
  49 + mi.getOrth().c_str(), mi.getLemma().c_str(),
  50 + mi.getTag().c_str());
  51 + if (!mi.getName().empty()) {
  52 + printf(",%s", mi.getName().c_str());
  53 + }
  54 + if (!mi.getQualifiers().empty()) {
  55 + printf(",%s", mi.getQualifiers()[0].c_str());
  56 + for (unsigned int i = 1; i < mi.getQualifiers().size(); i++) {
  57 + printf("|%s", mi.getQualifiers()[i].c_str());
  58 + }
  59 + }
51 60 prevStart = mi.getStartNode();
52 61 prevEnd = mi.getEndNode();
53 62 }
... ...
morfeusz/segrules/SegrulesFSA.hpp
... ... @@ -34,12 +34,12 @@ public:
34 34  
35 35 std::vector<SegrulesState> proceedToNext(
36 36 const unsigned char segnum,
37   - const SegrulesState state,
  37 + const SegrulesState& state,
38 38 bool atEndOfWord) const {
39 39 std::vector<SegrulesState> res;
40 40 const unsigned char* currPtr = ptr + state.offset + 1;
41 41 const unsigned char transitionsNum = *currPtr++;
42   - for (unsigned int i = 0; i < transitionsNum; i++) {
  42 + for (int i = 0; i < transitionsNum; i++) {
43 43 if (*currPtr == segnum) {
44 44 SegrulesState newState = this->transition2State(currPtr);
45 45 if ((atEndOfWord && newState.accepting)
... ...
nbproject/configurations.xml
... ... @@ -130,6 +130,8 @@
130 130 </ccTool>
131 131 </item>
132 132 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  133 + <ccTool flags="1">
  134 + </ccTool>
133 135 </item>
134 136 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
135 137 ex="false"
... ... @@ -239,6 +241,7 @@
239 241 <pElem>build/morfeusz</pElem>
240 242 </incDir>
241 243 <preprocessorList>
  244 + <Elem>NDEBUG</Elem>
242 245 <Elem>libmorfeusz_EXPORTS</Elem>
243 246 </preprocessorList>
244 247 </ccTool>
... ... @@ -283,7 +286,7 @@
283 286 <ccTool>
284 287 <incDir>
285 288 <pElem>morfeusz</pElem>
286   - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
  289 + <pElem>/usr/lib/jvm/default-java/include</pElem>
287 290 </incDir>
288 291 <preprocessorList>
289 292 <Elem>NDEBUG</Elem>
... ... @@ -310,6 +313,19 @@
310 313 </undefinedList>
311 314 </ccTool>
312 315 </folder>
  316 + <item path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4">
  317 + <ccTool flags="1">
  318 + <incDir>
  319 + <pElem>build</pElem>
  320 + <pElem>morfeusz</pElem>
  321 + <pElem>build/morfeusz</pElem>
  322 + </incDir>
  323 + <preprocessorList>
  324 + <Elem>NDEBUG</Elem>
  325 + <Elem>libmorfeusz_EXPORTS</Elem>
  326 + </preprocessorList>
  327 + </ccTool>
  328 + </item>
313 329 <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4">
314 330 <ccTool flags="1">
315 331 <incDir>
... ... @@ -387,40 +403,75 @@
387 403 </ccTool>
388 404 </item>
389 405 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
390   - <ccTool flags="1">
  406 + <ccTool flags="2">
391 407 <incDir>
392 408 <pElem>build</pElem>
393 409 <pElem>morfeusz</pElem>
394 410 <pElem>build/morfeusz</pElem>
395 411 </incDir>
396 412 <preprocessorList>
397   - <Elem>NDEBUG</Elem>
398 413 <Elem>libmorfeusz_EXPORTS</Elem>
399 414 </preprocessorList>
400 415 </ccTool>
401 416 </item>
402 417 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  418 + <ccTool flags="2">
  419 + </ccTool>
403 420 </item>
404 421 <item path="morfeusz/charset/CharsetConverter.cpp"
405 422 ex="false"
406 423 tool="1"
407 424 flavor2="4">
408   - <ccTool flags="1">
409   - <preprocessorList>
410   - <Elem>NDEBUG</Elem>
411   - </preprocessorList>
  425 + <ccTool flags="2">
412 426 </ccTool>
413 427 </item>
414 428 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  429 + <ccTool flags="2">
  430 + </ccTool>
415 431 </item>
416 432 <item path="morfeusz/charset/conversion_tables.cpp"
417 433 ex="false"
418 434 tool="1"
419 435 flavor2="4">
  436 + <ccTool flags="2">
  437 + </ccTool>
420 438 </item>
421 439 <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4">
  440 + <ccTool flags="1">
  441 + </ccTool>
422 442 </item>
423 443 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
  444 + <ccTool flags="2">
  445 + <incDir>
  446 + <pElem>build</pElem>
  447 + <pElem>morfeusz</pElem>
  448 + <pElem>build/morfeusz</pElem>
  449 + </incDir>
  450 + <preprocessorList>
  451 + <Elem>libmorfeusz_EXPORTS</Elem>
  452 + </preprocessorList>
  453 + </ccTool>
  454 + </item>
  455 + <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp"
  456 + ex="false"
  457 + tool="1"
  458 + flavor2="4">
  459 + <ccTool flags="1">
  460 + <incDir>
  461 + <pElem>build</pElem>
  462 + <pElem>morfeusz</pElem>
  463 + <pElem>build/morfeusz</pElem>
  464 + </incDir>
  465 + <preprocessorList>
  466 + <Elem>NDEBUG</Elem>
  467 + <Elem>libmorfeusz_EXPORTS</Elem>
  468 + </preprocessorList>
  469 + </ccTool>
  470 + </item>
  471 + <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp"
  472 + ex="false"
  473 + tool="1"
  474 + flavor2="4">
424 475 <ccTool flags="1">
425 476 <incDir>
426 477 <pElem>build</pElem>
... ... @@ -509,6 +560,8 @@
509 560 </ccTool>
510 561 </item>
511 562 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  563 + <ccTool flags="1">
  564 + </ccTool>
512 565 </item>
513 566 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
514 567 <ccTool flags="0">
... ...
profile.sh 0 → 100755
  1 +#!/bin/bash
  2 +
  3 +rm -rf profbuild
  4 +mkdir -p profbuild
  5 +cd profbuild
  6 +cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" ..
  7 +make -j4
  8 +rm -f /tmp/morfeusz.prof
  9 +export LD_PRELOAD="/usr/lib/libprofiler.so"
  10 +export CPUPROFILE="/tmp/morfeusz.prof"
  11 +morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents10k > /dev/null
  12 +### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof
... ...