Commit f3f17708743d360a3957135091e4059a4ed135ef

Authored by Michał Lenart
1 parent c9317018

dalsze próby optymalizacji, poprawa działania operatora ">" w segmentacji (przenosi orth a nie lemat)

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@180 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt
... ... @@ -5,6 +5,7 @@ project (Morfeusz)
5 5 set (Morfeusz_VERSION_MAJOR 2)
6 6 set (Morfeusz_VERSION_MINOR 0)
7 7 set (Morfeusz_VERSION_PATCH 0)
  8 +set (CMAKE_BUILD_TYPE Release)
8 9  
9 10 enable_testing()
10 11  
... ...
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -41,7 +41,7 @@ class FSA(object):
41 41 self.n += 1
42 42  
43 43 # debug
44   - if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0 or word.startswith('naj'):
  44 + if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
45 45 logging.info(u'%d %s' % (self.n, word))
46 46 for label in encodedWord:
47 47 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
... ...
morfeusz/CasePatternHelper.hpp
... ... @@ -74,29 +74,20 @@ public:
74 74  
75 75 std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
76 76 std::vector<bool> res;
77   - uint8_t casePatternType = *ptr;
78   - ptr++;
  77 + uint8_t casePatternType = *ptr++;
79 78 uint8_t prefixLength;
80 79 uint8_t patternLength;
81 80 switch (casePatternType) {
82 81 case LEMMA_ONLY_LOWER:
83 82 break;
84 83 case LEMMA_UPPER_PREFIX:
85   - prefixLength = *ptr;
86   - ptr++;
87   - for (unsigned int i = 0; i < prefixLength; i++) {
88   - // lemma.casePattern[i] = true;
89   - res.push_back(true);
90   - }
91   - // lemma.casePattern.resize(prefixLength, true);
  84 + prefixLength = *ptr++;
  85 + res.resize(prefixLength, true);
92 86 break;
93 87 case LEMMA_MIXED_CASE:
94   - patternLength = *ptr;
95   - ptr++;
  88 + patternLength = *ptr++;
96 89 for (unsigned int i = 0; i < patternLength; i++) {
97   - uint8_t idx = *ptr;
98   - ptr++;
99   - // lemma.casePattern[idx] = true;
  90 + uint8_t idx = *ptr++;
100 91 res.resize(idx + 1, false);
101 92 res[idx] = true;
102 93 }
... ...
morfeusz/Environment.cpp
... ... @@ -38,10 +38,6 @@ Environment::Environment(
38 38 MorfeuszProcessorType processorType,
39 39 const unsigned char* fsaFileStartPtr)
40 40 : currentCharsetConverter(getCharsetConverter(charset)),
41   -utf8CharsetConverter(),
42   -isoCharsetConverter(),
43   -cp1250CharsetConverter(),
44   -cp852CharsetConverter(),
45 41 caseConverter(),
46 42 tagset(fsaFileStartPtr),
47 43 qualifiers(fsaFileStartPtr),
... ... @@ -63,13 +59,13 @@ casePatternHelper() {
63 59 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
64 60 switch (charset) {
65 61 case UTF8:
66   - return &this->utf8CharsetConverter;
  62 + return &UTF8CharsetConverter::getInstance();
67 63 case ISO8859_2:
68   - return &this->isoCharsetConverter;
  64 + return &ISO8859_2_CharsetConverter::getInstance();
69 65 case CP1250:
70   - return &this->cp1250CharsetConverter;
  66 + return &Windows_1250_CharsetConverter::getInstance();
71 67 case CP852:
72   - return &this->cp852CharsetConverter;
  68 + return &CP852_CharsetConverter::getInstance();
73 69 default:
74 70 throw MorfeuszException("invalid charset");
75 71 }
... ...
morfeusz/Environment.hpp
... ... @@ -65,10 +65,6 @@ public:
65 65 virtual ~Environment();
66 66 private:
67 67 const CharsetConverter* currentCharsetConverter;
68   - const UTF8CharsetConverter utf8CharsetConverter;
69   - const ISO8859_2_CharsetConverter isoCharsetConverter;
70   - const Windows_1250_CharsetConverter cp1250CharsetConverter;
71   - const CP852_CharsetConverter cp852CharsetConverter;
72 68 const CaseConverter caseConverter;
73 69 Tagset tagset;
74 70 Qualifiers qualifiers;
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -62,7 +62,7 @@ public:
62 62 orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
63 63 const unsigned char* currPtr = interpretedChunk.interpsPtr;
64 64 while (currPtr < interpretedChunk.interpsEndPtr) {
65   - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out);
  65 + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);
66 66 }
67 67 }
68 68 }
... ... @@ -72,6 +72,7 @@ protected:
72 72 void decodeForm(
73 73 const vector<uint32_t>& orth,
74 74 const EncodedForm& lemma,
  75 + bool forPrefix,
75 76 string& res) const {
76 77 for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {
77 78 uint32_t cp =
... ... @@ -80,11 +81,13 @@ protected:
80 81 : orth[i];
81 82 env.getCharsetConverter().append(cp, res);
82 83 }
83   - const char* suffixPtr = lemma.suffixToAdd.c_str();
84   - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
85   - while (suffixPtr != suffixEnd) {
86   - uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
87   - env.getCharsetConverter().append(cp, res);
  84 + if (!forPrefix) {
  85 + const char* suffixPtr = lemma.suffixToAdd.c_str();
  86 + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
  87 + while (suffixPtr != suffixEnd) {
  88 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
  89 + env.getCharsetConverter().append(cp, res);
  90 + }
88 91 }
89 92 }
90 93  
... ... @@ -97,12 +100,10 @@ protected:
97 100 assert(encodedForm.casePattern.size() == 0);
98 101 if (isLemmaOnlyLower(compressionByte)) {
99 102 encodedForm.casePattern = std::vector<bool>();
100   - }
101   - else if (isLemmaOnlyTitle(compressionByte)) {
  103 + } else if (isLemmaOnlyTitle(compressionByte)) {
102 104 encodedForm.casePattern = std::vector<bool>();
103 105 encodedForm.casePattern.push_back(true);
104   - }
105   - else {
  106 + } else {
106 107 encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
107 108 }
108 109 }
... ... @@ -110,11 +111,9 @@ protected:
110 111 EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {
111 112 EncodedInterpretation interp;
112 113 if (isOrthOnlyLower(compressionByte)) {
113   - }
114   - else if (isOrthOnlyTitle(compressionByte)) {
  114 + } else if (isOrthOnlyTitle(compressionByte)) {
115 115 interp.orthCasePattern.push_back(true);
116   - }
117   - else {
  116 + } else {
118 117 interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
119 118 }
120 119 deserializeEncodedForm(ptr, compressionByte, interp.value);
... ... @@ -129,8 +128,7 @@ private:
129 128 vector<string> splitRes(split(lemma, ':'));
130 129 if (splitRes.size() == 2) {
131 130 return make_pair(splitRes[0], splitRes[1]);
132   - }
133   - else {
  131 + } else {
134 132 return make_pair(lemma, "");
135 133 }
136 134 }
... ... @@ -140,17 +138,18 @@ private:
140 138 const string& orth,
141 139 const string& lemmaPrefix,
142 140 const InterpretedChunk& chunk,
  141 + bool forPrefix,
143 142 const unsigned char*& ptr,
144 143 std::vector<MorphInterpretation>& out) const {
145 144 string lemma = lemmaPrefix;
146 145 EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);
147   - this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
  146 + this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma);
148 147 if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) {
149 148 // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
150 149 out.push_back(MorphInterpretation(
151 150 startNode, endNode,
152 151 orth, lemma,
153   -// "",
  152 + // "",
154 153 ei.tag,
155 154 ei.nameClassifier,
156 155 ei.qualifiers,
... ... @@ -165,11 +164,10 @@ private:
165 164 const unsigned char* ptr = prefixChunk.interpsPtr;
166 165 std::vector<MorphInterpretation> mi;
167 166 // env.getCasePatternHelper().skipCasePattern(ptr);
168   - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi);
  167 + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);
169 168 if (!mi.empty()) {
170 169 lemmaPrefix += mi[0].getLemma();
171   - }
172   - else {
  170 + } else {
173 171 return false;
174 172 }
175 173 }
... ... @@ -227,7 +225,7 @@ private:
227 225 return MorphInterpretation(
228 226 startNode, endNode,
229 227 orth, lemma + HOMONYM_SEPARATOR + ei.homonymId,
230   -// ei.homonymId,
  228 + // ei.homonymId,
231 229 ei.tag,
232 230 ei.nameClassifier,
233 231 ei.qualifiers,
... ... @@ -245,7 +243,7 @@ private:
245 243 const char* suffixPtr = orth.suffixToAdd.c_str();
246 244 const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
247 245 while (suffixPtr != suffixEnd) {
248   - uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
  246 + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
249 247 env.getCharsetConverter().append(cp, res);
250 248 }
251 249 }
... ...
morfeusz/Morfeusz.cpp
... ... @@ -39,7 +39,7 @@ Morfeusz::Morfeusz()
39 39 generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
40 40 options(createDefaultOptions()),
41 41 accum(),
42   -graph(){
  42 +graph() {
43 43 analyzerEnv.setCaseSensitive(options.caseSensitive);
44 44 generatorEnv.setCaseSensitive(false);
45 45 }
... ... @@ -85,14 +85,14 @@ void Morfeusz::processOneWord(
85 85 && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
86 86 env.getCharsetConverter().next(inputStart, inputEnd);
87 87 }
88   -
  88 +
89 89 accum.clear();
90 90 graph.clear();
91   -
  91 +
92 92 const char* currInput = inputStart;
93 93 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
94 94  
95   - doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
  95 + doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState);
96 96  
97 97 if (!graph.empty()) {
98 98 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
... ... @@ -111,7 +111,6 @@ void Morfeusz::processOneWord(
111 111 && env.getProcessorType() == ANALYZER
112 112 && !insideIgnHandler) {
113 113 this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
114   - // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
115 114 }
116 115 else if (inputStart != inputEnd) {
117 116 this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
... ... @@ -144,25 +143,44 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) {
144 143 return res.str();
145 144 }
146 145  
  // Feeds the bytes in [inputStart, inputEnd) to `state` one at a time, unmodified,
  // and stops early as soon as `state.isSink()` becomes true.
  //
  // state      - automaton state advanced in place via proceedToNext().
  // inputStart - first byte to feed.
  // inputEnd   - one past the last byte to feed.
  //
  // NOTE(review): the call site visible in this diff uses this only when the codepoint
  // equals its normalized form and the environment's converter is the UTF-8 singleton,
  // presumably so the raw input bytes can stand in for a re-encoded codepoint; confirm.
  146 +static inline void feedStateDirectly(
  147 + StateType& state,
  148 + const char* inputStart,
  149 + const char* inputEnd) {
  150 + const char* currInput = inputStart;
  151 + while (currInput != inputEnd && !state.isSink()) {
  152 + state.proceedToNext(*currInput++);
  153 + }
  154 +}
  155 +
  // Appends `codepoint` to a temporary string through the UTF8CharsetConverter singleton
  // and feeds each resulting char to `state`, stopping once `state.isSink()` is true.
  //
  // state     - automaton state advanced in place via proceedToNext().
  // codepoint - the code point to encode and feed.
  156 +static inline void feedState(
  157 + StateType& state,
  158 + int codepoint) {
  159 + std::string chars;
  160 + UTF8CharsetConverter::getInstance().append(codepoint, chars);
  // The isSink() check in the loop condition skips the remaining chars once the state is a sink.
  161 + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {
  162 + state.proceedToNext(chars[i]);
  163 + }
  164 +}
  165 +
147 166 void Morfeusz::doProcessOneWord(
148 167 const Environment& env,
149 168 const char*& inputData,
150 169 const char* inputEnd,
151   - SegrulesState segrulesState,
152   - vector<InterpretedChunk>& accum,
153   - InflexionGraph& graph) const {
  170 + SegrulesState segrulesState) const {
154 171 if (this->options.debug) {
155 172 cerr << "----------" << endl;
156 173 cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
157 174 }
158 175 // cerr << "doAnalyzeOneWord " << inputData << endl;
159 176 const char* inputStart = inputData;
  177 + const char* prevInput = inputData;
160 178 const char* currInput = inputData;
161 179 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
162 180 bool currCodepointIsWhitespace = isWhitespace(codepoint);
163 181 vector<uint32_t> originalCodepoints;
164 182 vector<uint32_t> normalizedCodepoints;
165   -
  183 +
166 184 originalCodepoints.reserve(16);
167 185 normalizedCodepoints.reserve(16);
168 186  
... ... @@ -174,7 +192,13 @@ void Morfeusz::doProcessOneWord(
174 192 : codepoint;
175 193 originalCodepoints.push_back(codepoint);
176 194 normalizedCodepoints.push_back(normalizedCodepoint);
177   - feedState(state, normalizedCodepoint, UTF8CharsetConverter());
  195 + if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
  196 + feedStateDirectly(state, prevInput, currInput);
  197 + }
  198 + else {
  199 + feedState(state, normalizedCodepoint);
  200 + }
  201 +
178 202 codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
179 203 currCodepointIsWhitespace = isWhitespace(codepoint);
180 204 string homonymId;
... ... @@ -184,6 +208,7 @@ void Morfeusz::doProcessOneWord(
184 208 }
185 209 homonymId = string(currInput + 1, inputEnd);
186 210 // cerr << "homonym " << homonymId << endl;
  211 + prevInput = currInput;
187 212 currInput = inputEnd;
188 213 codepoint = 0x00;
189 214 currCodepointIsWhitespace = true;
... ... @@ -195,9 +220,8 @@ void Morfeusz::doProcessOneWord(
195 220 if (this->options.debug) {
196 221 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
197 222 }
198   - vector<SegrulesState> newSegrulesStates;
199   - env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
200   - if (!newSegrulesStates.empty()
  223 + vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace);
  224 + if (!newSegrulesStates.empty()
201 225 && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
202 226  
203 227 for (
... ... @@ -225,17 +249,18 @@ void Morfeusz::doProcessOneWord(
225 249 doShiftOrth(accum.back(), ic);
226 250 }
227 251 accum.push_back(ic);
228   - if (currCodepointIsWhitespace
229   - && newSegrulesState.accepting) {
  252 + if (currCodepointIsWhitespace) {
  253 + assert(newSegrulesState.accepting);
230 254 if (this->options.debug) {
231 255 cerr << "ACCEPTING " << debugAccum(accum) << endl;
232 256 }
233 257 graph.addPath(accum, newSegrulesState.weak);
234 258 }
235   - else if (!currCodepointIsWhitespace) {
  259 + else {
  260 + assert(!newSegrulesState.sink);
236 261 // cerr << "will process " << currInput << endl;
237 262 const char* newCurrInput = currInput;
238   - doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
  263 + doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState);
239 264 }
240 265 accum.pop_back();
241 266 }
... ... @@ -246,6 +271,7 @@ void Morfeusz::doProcessOneWord(
246 271 }
247 272 }
248 273 }
  274 + prevInput = currInput;
249 275 codepoint = currInput == inputEnd || currCodepointIsWhitespace ? 0x00 : env.getCharsetConverter().next(currInput, inputEnd);
250 276 }
251 277 inputData = currInput;
... ...
morfeusz/Morfeusz.hpp
... ... @@ -170,9 +170,7 @@ private:
170 170 const Environment& env,
171 171 const char*& inputData,
172 172 const char* inputEnd,
173   - SegrulesState segrulesState,
174   - std::vector<InterpretedChunk>& accum,
175   - InflexionGraph& graph) const;
  173 + SegrulesState segrulesState) const;
176 174  
177 175 void handleIgnChunk(
178 176 const Environment& env,
... ...
morfeusz/charset/CharsetConverter.cpp
... ... @@ -36,6 +36,13 @@ static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char
36 36 }
37 37 }
38 38  
  // Returns a const reference to the single UTF8CharsetConverter, held as a
  // function-local static so every caller gets the same object.
  39 +const UTF8CharsetConverter& UTF8CharsetConverter::getInstance() {
  40 + static UTF8CharsetConverter instance;
  41 + return instance;
  42 +}
  43 +
  // Default constructor with an empty body; the header in this commit declares it private,
  // so instances are obtained through getInstance().
  44 +UTF8CharsetConverter::UTF8CharsetConverter() {}
  45 +
39 46 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
40 47 uint32_t cp = 0;
41 48 utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
... ... @@ -86,14 +93,29 @@ void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
86 93 }
87 94 }
88 95  
  // Returns a const reference to the single ISO8859_2_CharsetConverter, held as a
  // function-local static so every caller gets the same object.
  96 +const ISO8859_2_CharsetConverter& ISO8859_2_CharsetConverter::getInstance() {
  97 + static ISO8859_2_CharsetConverter instance;
  98 + return instance;
  99 +}
  100 +
89 101 ISO8859_2_CharsetConverter::ISO8859_2_CharsetConverter()
90 102 : OneByteCharsetConverter(ISO_8859_2_TO_CODEPOINT) {
91 103 }
92 104  
  // Returns a const reference to the single Windows_1250_CharsetConverter, held as a
  // function-local static so every caller gets the same object.
  105 +const Windows_1250_CharsetConverter& Windows_1250_CharsetConverter::getInstance() {
  106 + static Windows_1250_CharsetConverter instance;
  107 + return instance;
  108 +}
  109 +
93 110 Windows_1250_CharsetConverter::Windows_1250_CharsetConverter()
94 111 : OneByteCharsetConverter(WINDOWS_1250_TO_CODEPOINT) {
95 112 }
96 113  
  // Returns a const reference to the single CP852_CharsetConverter, held as a
  // function-local static so every caller gets the same object.
  114 +const CP852_CharsetConverter& CP852_CharsetConverter::getInstance() {
  115 + static CP852_CharsetConverter instance;
  116 + return instance;
  117 +}
  118 +
97 119 CP852_CharsetConverter::CP852_CharsetConverter()
98 120 : OneByteCharsetConverter(CP852_TO_CODEPOINT) {
99 121 }
... ...
morfeusz/charset/CharsetConverter.hpp
... ... @@ -21,17 +21,23 @@ public:
21 21 virtual std::string fromUTF8(const std::string& input) const;
22 22  
23 23 std::string toString(const std::vector<uint32_t>& codepoints) const;
24   -
  24 +
25 25 virtual ~CharsetConverter();
26 26 private:
27 27 };
28 28  
// CharsetConverter subclass for UTF-8 input and output.
// Construction and copying are private, so code outside the class obtains the
// converter only through getInstance().
29 29 class UTF8CharsetConverter : public CharsetConverter {
30 30 public:
  31 +
  // Accessor for the shared instance; returns it by const reference.
  32 + static const UTF8CharsetConverter& getInstance();
  33 +
31 34 uint32_t next(const char*& it, const char* end) const;
32 35 void append(uint32_t cp, std::string& result) const;
33 36 std::string fromUTF8(const std::string& input) const;
34 37 private:
  // Private constructor; the copy constructor and assignment below are declared
  // but intentionally left without a definition to forbid copying.
  38 + UTF8CharsetConverter();
  39 + UTF8CharsetConverter(const UTF8CharsetConverter&); // do not implement
  40 + void operator=(const UTF8CharsetConverter&); // do not implement
35 41 };
36 42  
37 43 /*
... ... @@ -49,20 +55,29 @@ private:
49 55  
50 56 class ISO8859_2_CharsetConverter : public OneByteCharsetConverter {
51 57 public:
52   - ISO8859_2_CharsetConverter();
  58 + static const ISO8859_2_CharsetConverter& getInstance();
53 59 private:
  60 + ISO8859_2_CharsetConverter();
  61 + ISO8859_2_CharsetConverter(const ISO8859_2_CharsetConverter&); // do not implement
  62 + void operator=(const ISO8859_2_CharsetConverter&); // do not implement
54 63 };
55 64  
56 65 class Windows_1250_CharsetConverter : public OneByteCharsetConverter {
57 66 public:
58   - Windows_1250_CharsetConverter();
  67 + static const Windows_1250_CharsetConverter& getInstance();
59 68 private:
  69 + Windows_1250_CharsetConverter();
  70 + Windows_1250_CharsetConverter(const Windows_1250_CharsetConverter&); // do not implement
  71 + void operator=(const Windows_1250_CharsetConverter&); // do not implement
60 72 };
61 73  
62 74 class CP852_CharsetConverter : public OneByteCharsetConverter {
63 75 public:
64   - CP852_CharsetConverter();
  76 + static const CP852_CharsetConverter& getInstance();
65 77 private:
  78 + CP852_CharsetConverter();
  79 + CP852_CharsetConverter(const CP852_CharsetConverter&); // do not implement
  80 + void operator=(const CP852_CharsetConverter&); // do not implement
66 81 };
67 82  
68 83 #endif /* ENCODINGCONVERTER_HPP */
... ...
morfeusz/charset/charset_utils.hpp
... ... @@ -53,17 +53,5 @@ inline bool isWhitespace(uint32_t codepoint) {
53 53 return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint);
54 54 }
55 55  
56   -template <class StateClass>
57   -void feedState(
58   - StateClass& state,
59   - int codepoint,
60   - const CharsetConverter& charsetConverter) {
61   - std::string chars;
62   - charsetConverter.append(codepoint, chars);
63   - for (unsigned int i = 0; i < chars.length(); i++) {
64   - state.proceedToNext(chars[i]);
65   - }
66   -}
67   -
68 56 #endif /* CHARSET_UTILS_HPP */
69 57  
... ...
morfeusz/segrules/SegrulesFSA.hpp
... ... @@ -17,42 +17,48 @@ struct SegrulesState {
17 17 bool accepting;
18 18 bool weak;
19 19 bool shiftOrthFromPrevious;
  20 + bool sink;
20 21 };
21 22  
22   -inline bool operator<(const SegrulesState& s1, const SegrulesState& s2)
23   -{
24   - return s1.offset < s2.offset;
  23 +inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) {
  24 + return s1.offset < s2.offset;
25 25 }
26 26  
27 27 class SegrulesFSA {
28 28 public:
29   - SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) {
30   - SegrulesState state = {0, false, false, false};
  29 +
  30 + SegrulesFSA(const unsigned char* ptr) : initialState(), ptr(ptr) {
  31 + SegrulesState state = {0, false, false, false, false};
31 32 initialState = state;
32 33 }
33   -
34   - void proceedToNext(
35   - const unsigned char segnum,
36   - const SegrulesState state,
37   - std::vector<SegrulesState>& newStates) const {
38   -
39   - const unsigned char* currPtr = ptr + state.offset;
40   - currPtr++;
  34 +
  35 + std::vector<SegrulesState> proceedToNext(
  36 + const unsigned char segnum,
  37 + const SegrulesState state,
  38 + bool atEndOfWord) const {
  39 + std::vector<SegrulesState> res;
  40 + const unsigned char* currPtr = ptr + state.offset + 1;
41 41 const unsigned char transitionsNum = *currPtr++;
42 42 for (unsigned int i = 0; i < transitionsNum; i++) {
43 43 if (*currPtr == segnum) {
44   - newStates.push_back(this->transition2State(currPtr));
  44 + SegrulesState newState = this->transition2State(currPtr);
  45 + if ((atEndOfWord && newState.accepting)
  46 + || (!atEndOfWord && !newState.sink)) {
  47 + res.push_back(newState);
  48 + }
45 49 }
46 50 currPtr += 4;
47 51 }
  52 + return res;
48 53 }
49   -
50   - virtual ~SegrulesFSA() {}
51   -
  54 +
  55 + virtual ~SegrulesFSA() {
  56 + }
  57 +
52 58 SegrulesState initialState;
53 59 private:
54 60 const unsigned char* ptr;
55   -
  61 +
56 62 SegrulesState transition2State(const unsigned char* transitionPtr) const {
57 63 unsigned char ACCEPTING_FLAG = 1;
58 64 unsigned char WEAK_FLAG = 2;
... ... @@ -62,6 +68,7 @@ private:
62 68 res.offset = readInt16(transitionPtr);
63 69 res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
64 70 res.weak = *(ptr + res.offset) & WEAK_FLAG;
  71 + res.sink = *(ptr + res.offset + 1) == 0;
65 72 return res;
66 73 }
67 74 };
... ...
nbproject/configurations.xml
... ... @@ -26,9 +26,6 @@
26 26 <df name="cli">
27 27 <in>cli.cpp</in>
28 28 </df>
29   - <df name="data">
30   - <in>default_fsa.cpp</in>
31   - </df>
32 29 <df name="fsa">
33 30 <in>const.cpp</in>
34 31 <in>test_not_recognize.cpp</in>
... ... @@ -133,8 +130,6 @@
133 130 </ccTool>
134 131 </item>
135 132 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
136   - <ccTool flags="1">
137   - </ccTool>
138 133 </item>
139 134 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
140 135 ex="false"
... ... @@ -236,25 +231,6 @@
236 231 </preprocessorList>
237 232 </ccTool>
238 233 </folder>
239   - <folder path="0/data">
240   - <ccTool>
241   - <incDir>
242   - <pElem>build</pElem>
243   - <pElem>build/morfeusz</pElem>
244   - </incDir>
245   - <preprocessorList>
246   - <Elem>NDEBUG</Elem>
247   - <Elem>_OPTIMIZE__=1</Elem>
248   - <Elem>__PIC__=2</Elem>
249   - <Elem>__pic__=2</Elem>
250   - <Elem>libmorfeusz_EXPORTS</Elem>
251   - </preprocessorList>
252   - <undefinedList>
253   - <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem>
254   - <Elem>__NO_INLINE__</Elem>
255   - </undefinedList>
256   - </ccTool>
257   - </folder>
258 234 <folder path="0/segrules">
259 235 <ccTool>
260 236 <incDir>
... ... @@ -307,7 +283,7 @@
307 283 <ccTool>
308 284 <incDir>
309 285 <pElem>morfeusz</pElem>
310   - <pElem>/usr/lib/jvm/default-java/include</pElem>
  286 + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
311 287 </incDir>
312 288 <preprocessorList>
313 289 <Elem>NDEBUG</Elem>
... ... @@ -373,13 +349,14 @@
373 349 </ccTool>
374 350 </item>
375 351 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="4">
376   - <ccTool flags="2">
  352 + <ccTool flags="1">
377 353 <incDir>
378 354 <pElem>build</pElem>
379 355 <pElem>morfeusz</pElem>
380 356 <pElem>build/morfeusz</pElem>
381 357 </incDir>
382 358 <preprocessorList>
  359 + <Elem>NDEBUG</Elem>
383 360 <Elem>libmorfeusz_EXPORTS</Elem>
384 361 </preprocessorList>
385 362 </ccTool>
... ... @@ -410,13 +387,14 @@
410 387 </ccTool>
411 388 </item>
412 389 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
413   - <ccTool flags="2">
  390 + <ccTool flags="1">
414 391 <incDir>
415 392 <pElem>build</pElem>
416 393 <pElem>morfeusz</pElem>
417 394 <pElem>build/morfeusz</pElem>
418 395 </incDir>
419 396 <preprocessorList>
  397 + <Elem>NDEBUG</Elem>
420 398 <Elem>libmorfeusz_EXPORTS</Elem>
421 399 </preprocessorList>
422 400 </ccTool>
... ... @@ -427,6 +405,11 @@
427 405 ex="false"
428 406 tool="1"
429 407 flavor2="4">
  408 + <ccTool flags="1">
  409 + <preprocessorList>
  410 + <Elem>NDEBUG</Elem>
  411 + </preprocessorList>
  412 + </ccTool>
430 413 </item>
431 414 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
432 415 </item>
... ... @@ -436,8 +419,6 @@
436 419 flavor2="4">
437 420 </item>
438 421 <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4">
439   - <ccTool flags="1">
440   - </ccTool>
441 422 </item>
442 423 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
443 424 <ccTool flags="1">
... ...