Commit 4a94bb689bf0122f2270624056cb168e739d1063
1 parent: 7351ce74
prawie dorobiona kompletna obsługa ignów
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@147 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 8 changed files with 131 additions and 24 deletions
fsabuilder/morfeuszbuilder/fsa/serializer.py
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... | ... | @@ -5,12 +5,14 @@ Created on 20 lut 2014 |
5 | 5 | ''' |
6 | 6 | import logging |
7 | 7 | from morfeuszbuilder.utils.serializationUtils import htons, htonl |
8 | +from morfeuszbuilder.utils import serializationUtils | |
8 | 9 | |
9 | 10 | class RulesManager(object): |
10 | 11 | |
11 | - def __init__(self, segtypes): | |
12 | + def __init__(self, segtypes, separatorsList): | |
12 | 13 | self.options2DFA = {} |
13 | 14 | self.segtypes = segtypes |
15 | + self.separatorsList = separatorsList | |
14 | 16 | self.defaultOptions = None |
15 | 17 | |
16 | 18 | def _options2Key(self, optionsMap): |
... | ... | @@ -37,6 +39,7 @@ class RulesManager(object): |
37 | 39 | |
38 | 40 | def serialize(self): |
39 | 41 | res = bytearray() |
42 | + res.extend(self._serializeSeparatorsList()) | |
40 | 43 | dfasNum = len(self.options2DFA) |
41 | 44 | assert dfasNum > 0 and dfasNum < 256 |
42 | 45 | res.append(dfasNum) |
... | ... | @@ -48,6 +51,13 @@ class RulesManager(object): |
48 | 51 | logging.info('segmentation rules size: %s bytes', len(res)) |
49 | 52 | return res |
50 | 53 | |
54 | + def _serializeSeparatorsList(self): | |
55 | + res = bytearray() | |
56 | + res.extend(serializationUtils.htons(len(self.separatorsList))) | |
57 | + for cp in sorted(self.separatorsList): | |
58 | + res.extend(serializationUtils.htonl(cp)) | |
59 | + return res | |
60 | + | |
51 | 61 | def _serializeOptionsMap(self, optionsMap): |
52 | 62 | assert len(optionsMap) < 256 |
53 | 63 | res = bytearray() |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -3,7 +3,7 @@ from pyparsing import * |
3 | 3 | ParserElement.enablePackrat() |
4 | 4 | from morfeuszbuilder.tagset import segtypes |
5 | 5 | from morfeuszbuilder.utils import configFile, exceptions |
6 | -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString | |
6 | +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars | |
7 | 7 | import codecs |
8 | 8 | import re |
9 | 9 | |
... | ... | @@ -34,11 +34,22 @@ class RulesParser(object): |
34 | 34 | |
35 | 35 | def parse(self, filename): |
36 | 36 | |
37 | - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types']) | |
37 | + segtypesConfigFile = configFile.ConfigFile(filename, | |
38 | + [ | |
39 | + 'options', | |
40 | + 'combinations', | |
41 | + 'generator combinations', | |
42 | + 'tags', | |
43 | + 'lexemes', | |
44 | + 'segment types', | |
45 | + 'separator chars']) | |
38 | 46 | key2Defs = self._getKey2Defs(segtypesConfigFile) |
39 | 47 | segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) |
48 | + separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \ | |
49 | + if self.rulesType == RulesParser.PARSE4ANALYZER \ | |
50 | + else [] | |
40 | 51 | |
41 | - res = rulesManager.RulesManager(segtypesHelper) | |
52 | + res = rulesManager.RulesManager(segtypesHelper, separatorsList) | |
42 | 53 | |
43 | 54 | def2Key = {} |
44 | 55 | for key, defs in key2Defs.iteritems(): |
... | ... |
input/segmenty.dat
... | ... | @@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek ) |
243 | 243 | # interpretacja znaków interpunkcyjnych |
244 | 244 | # moze_interp(samodz interp) |
245 | 245 | |
246 | +[separator chars] | |
247 | +# , | |
248 | +44 | |
249 | + | |
250 | +# . | |
251 | +46 | |
252 | + | |
253 | +# ; | |
254 | +59 | |
255 | + | |
246 | 256 | [generator combinations] |
247 | 257 | |
248 | 258 | [segment types] |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -62,7 +62,8 @@ void Morfeusz::processOneWord( |
62 | 62 | const char*& inputStart, |
63 | 63 | const char* inputEnd, |
64 | 64 | int startNodeNum, |
65 | - std::vector<MorphInterpretation>& results) const { | |
65 | + std::vector<MorphInterpretation>& results, | |
66 | + bool insideIgnHandler) const { | |
66 | 67 | while (inputStart != inputEnd |
67 | 68 | && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { |
68 | 69 | env.getCharsetConverter().next(inputStart, inputEnd); |
... | ... | @@ -71,9 +72,9 @@ void Morfeusz::processOneWord( |
71 | 72 | InflexionGraph graph; |
72 | 73 | const char* currInput = inputStart; |
73 | 74 | const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); |
74 | - | |
75 | + | |
75 | 76 | doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); |
76 | - | |
77 | + | |
77 | 78 | if (!graph.empty()) { |
78 | 79 | const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); |
79 | 80 | int srcNode = startNodeNum; |
... | ... | @@ -87,6 +88,12 @@ void Morfeusz::processOneWord( |
87 | 88 | srcNode++; |
88 | 89 | } |
89 | 90 | } |
91 | + else if (inputStart != inputEnd | |
92 | + && env.getProcessorType() == ANALYZER | |
93 | + && !insideIgnHandler) { | |
94 | + this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results); | |
95 | + // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); | |
96 | + } | |
90 | 97 | else if (inputStart != inputEnd) { |
91 | 98 | this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); |
92 | 99 | } |
... | ... | @@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) { |
113 | 120 | stringstream res; |
114 | 121 | for (unsigned int i = 0; i < accum.size(); i++) { |
115 | 122 | res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); |
116 | -// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | |
123 | + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | |
117 | 124 | } |
118 | 125 | return res.str(); |
119 | 126 | } |
... | ... | @@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord( |
125 | 132 | SegrulesState segrulesState, |
126 | 133 | vector<InterpretedChunk>& accum, |
127 | 134 | InflexionGraph& graph) const { |
128 | -// if (this->options.debug) { | |
129 | -// cerr << "----------" << endl; | |
130 | -// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | |
131 | -// } | |
132 | -// cerr << "doAnalyzeOneWord " << inputData << endl; | |
135 | + // if (this->options.debug) { | |
136 | + // cerr << "----------" << endl; | |
137 | + // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | |
138 | + // } | |
139 | + // cerr << "doAnalyzeOneWord " << inputData << endl; | |
133 | 140 | const char* inputStart = inputData; |
134 | 141 | const char* currInput = inputData; |
135 | 142 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
... | ... | @@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord( |
140 | 147 | |
141 | 148 | while (!isEndOfWord(codepoint)) { |
142 | 149 | uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER |
143 | - ? env.getCaseConverter().toLower(codepoint) | |
144 | - : codepoint; | |
150 | + ? env.getCaseConverter().toLower(codepoint) | |
151 | + : codepoint; | |
145 | 152 | originalCodepoints.push_back(codepoint); |
146 | 153 | normalizedCodepoints.push_back(normalizedCodepoint); |
147 | 154 | feedState(state, normalizedCodepoint, UTF8CharsetConverter()); |
... | ... | @@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord( |
152 | 159 | throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); |
153 | 160 | } |
154 | 161 | homonymId = string(currInput + 1, inputEnd); |
155 | -// cerr << "homonym " << homonymId << endl; | |
162 | + // cerr << "homonym " << homonymId << endl; | |
156 | 163 | currInput = inputEnd; |
157 | 164 | codepoint = 0x00; |
158 | 165 | } |
... | ... | @@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord( |
163 | 170 | if (this->options.debug) { |
164 | 171 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
165 | 172 | } |
166 | -// cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
173 | + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
167 | 174 | set<SegrulesState> newSegrulesStates; |
168 | 175 | env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); |
169 | 176 | if (this->options.debug && newSegrulesStates.empty()) { |
170 | 177 | cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; |
171 | 178 | } |
172 | -// cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; | |
179 | + // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; | |
173 | 180 | for ( |
174 | 181 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); |
175 | 182 | it != newSegrulesStates.end(); |
... | ... | @@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord( |
190 | 197 | doShiftOrth(accum.back(), ic); |
191 | 198 | } |
192 | 199 | accum.push_back(ic); |
193 | - if (isEndOfWord(codepoint) | |
200 | + if (isEndOfWord(codepoint) | |
194 | 201 | && newSegrulesState.accepting) { |
195 | 202 | if (this->options.debug) { |
196 | 203 | cerr << "ACCEPTING " << debugAccum(accum) << endl; |
... | ... | @@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord( |
198 | 205 | graph.addPath(accum, newSegrulesState.weak); |
199 | 206 | } |
200 | 207 | else if (!isEndOfWord(codepoint)) { |
201 | -// cerr << "will process " << currInput << endl; | |
208 | + // cerr << "will process " << currInput << endl; | |
202 | 209 | const char* newCurrInput = currInput; |
203 | 210 | doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); |
204 | 211 | } |
... | ... | @@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord( |
211 | 218 | inputData = currInput; |
212 | 219 | } |
213 | 220 | |
221 | +static inline bool isSeparator(uint32_t codepoint) { | |
222 | + return codepoint == 44; | |
223 | +} | |
224 | + | |
225 | +void Morfeusz::handleIgnChunk( | |
226 | + const Environment& env, | |
227 | + const char* inputStart, | |
228 | + const char* inputEnd, | |
229 | + int startNodeNum, | |
230 | + std::vector<MorphInterpretation>& results) const { | |
231 | + const char* currInput = inputStart; | |
232 | + const char* prevInput; | |
233 | + uint32_t codepoint; | |
234 | + bool separatorFound = false; | |
235 | + while (currInput != inputEnd) { | |
236 | + prevInput = currInput; | |
237 | + const char* nonSeparatorInputEnd = prevInput; | |
238 | + do { | |
239 | + codepoint = env.getCharsetConverter().next(currInput, inputEnd); | |
240 | + if (!isSeparator(codepoint)) { | |
241 | + nonSeparatorInputEnd = currInput; | |
242 | + } | |
243 | + } | |
244 | + while (currInput != inputEnd && !isSeparator(codepoint)); | |
245 | + | |
246 | + if (isSeparator(codepoint)) { | |
247 | + separatorFound = true; | |
248 | + if (nonSeparatorInputEnd != prevInput) { | |
249 | + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); | |
250 | + this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true); | |
251 | + startNode = results.empty() ? startNodeNum : results.back().getEndNode(); | |
252 | + this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true); | |
253 | + } | |
254 | + else { | |
255 | + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); | |
256 | + this->processOneWord(env, prevInput, currInput, startNode, results, true); | |
257 | + } | |
258 | + } | |
259 | + } | |
260 | + | |
261 | + // currInput == inputEnd | |
262 | + if (!isSeparator(codepoint)) { | |
263 | + if (separatorFound) { | |
264 | + int startNode = results.empty() ? startNodeNum : results.back().getEndNode(); | |
265 | + this->processOneWord(env, prevInput, inputEnd, startNode, results, true); | |
266 | + } | |
267 | + else { | |
268 | + this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results); | |
269 | + } | |
270 | + } | |
271 | +} | |
272 | + | |
214 | 273 | void Morfeusz::appendIgnotiumToResults( |
215 | 274 | const Environment& env, |
216 | 275 | const string& word, |
... | ... | @@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results |
260 | 319 | } |
261 | 320 | |
262 | 321 | // XXX - someday it should be improved |
322 | + | |
263 | 323 | void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const { |
264 | 324 | vector<MorphInterpretation> partRes; |
265 | 325 | this->generate(lemma, partRes); |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -157,7 +157,8 @@ private: |
157 | 157 | const char*& inputData, |
158 | 158 | const char* inputEnd, |
159 | 159 | int startNodeNum, |
160 | - std::vector<MorphInterpretation>& result) const; | |
160 | + std::vector<MorphInterpretation>& result, | |
161 | + bool insideIgnHandler=false) const; | |
161 | 162 | |
162 | 163 | void doProcessOneWord( |
163 | 164 | const Environment& env, |
... | ... | @@ -166,6 +167,13 @@ private: |
166 | 167 | SegrulesState segrulesState, |
167 | 168 | std::vector<InterpretedChunk>& accum, |
168 | 169 | InflexionGraph& graph) const; |
170 | + | |
171 | + void handleIgnChunk( | |
172 | + const Environment& env, | |
173 | + const char* inputStart, | |
174 | + const char* inputEnd, | |
175 | + int startNodeNum, | |
176 | + std::vector<MorphInterpretation>& results) const; | |
169 | 177 | |
170 | 178 | void appendIgnotiumToResults( |
171 | 179 | const Environment& env, |
... | ... |
morfeusz/fsa/const.cpp
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | #include "const.hpp" |
3 | 3 | |
4 | 4 | extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; |
5 | -extern const uint8_t VERSION_NUM = 12; | |
5 | +extern const uint8_t VERSION_NUM = 13; | |
6 | 6 | |
7 | 7 | extern const unsigned int VERSION_NUM_OFFSET = 4; |
8 | 8 | extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; |
... | ... |
morfeusz/segrules/segrules.cpp
... | ... | @@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) { |
18 | 18 | return res; |
19 | 19 | } |
20 | 20 | |
21 | +static inline void ignoreSeparatorsList(const unsigned char*& ptr) { | |
22 | + uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr)); | |
23 | + ptr += 2; | |
24 | + ptr += 4 * listSize; | |
25 | +} | |
26 | + | |
21 | 27 | static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { |
22 | 28 | const unsigned char* additionalDataPtr = ptr |
23 | 29 | + FSA_DATA_OFFSET |
24 | 30 | + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); |
25 | - return additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | |
31 | + const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; | |
32 | + ignoreSeparatorsList(res); | |
33 | + return res; | |
26 | 34 | } |
27 | 35 | |
28 | 36 | static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { |
... | ... |