Commit 4a94bb689bf0122f2270624056cb168e739d1063

Authored by Michał Lenart
1 parent 7351ce74

prawie dorobiona kompletna obsługa ignów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@147 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/serializer.py
... ... @@ -22,7 +22,7 @@ class Serializer(object):
22 22  
23 23 # get the Morfeusz file format version that is being encoded
24 24 def getVersion(self):
25   - return 12
  25 + return 13
26 26  
27 27 def serialize2CppFile(self, fname, generator, segmentationRulesData):
28 28 res = []
... ...
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
... ... @@ -5,12 +5,14 @@ Created on 20 lut 2014
5 5 '''
6 6 import logging
7 7 from morfeuszbuilder.utils.serializationUtils import htons, htonl
  8 +from morfeuszbuilder.utils import serializationUtils
8 9  
9 10 class RulesManager(object):
10 11  
11   - def __init__(self, segtypes):
  12 + def __init__(self, segtypes, separatorsList):
12 13 self.options2DFA = {}
13 14 self.segtypes = segtypes
  15 + self.separatorsList = separatorsList
14 16 self.defaultOptions = None
15 17  
16 18 def _options2Key(self, optionsMap):
... ... @@ -37,6 +39,7 @@ class RulesManager(object):
37 39  
38 40 def serialize(self):
39 41 res = bytearray()
  42 + res.extend(self._serializeSeparatorsList())
40 43 dfasNum = len(self.options2DFA)
41 44 assert dfasNum > 0 and dfasNum < 256
42 45 res.append(dfasNum)
... ... @@ -48,6 +51,13 @@ class RulesManager(object):
48 51 logging.info('segmentation rules size: %s bytes', len(res))
49 52 return res
50 53  
  54 + def _serializeSeparatorsList(self):
  55 + res = bytearray()
  56 + res.extend(serializationUtils.htons(len(self.separatorsList)))
  57 + for cp in sorted(self.separatorsList):
  58 + res.extend(serializationUtils.htonl(cp))
  59 + return res
  60 +
51 61 def _serializeOptionsMap(self, optionsMap):
52 62 assert len(optionsMap) < 256
53 63 res = bytearray()
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -3,7 +3,7 @@ from pyparsing import *
3 3 ParserElement.enablePackrat()
4 4 from morfeuszbuilder.tagset import segtypes
5 5 from morfeuszbuilder.utils import configFile, exceptions
6   -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString
  6 +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
7 7 import codecs
8 8 import re
9 9  
... ... @@ -34,11 +34,22 @@ class RulesParser(object):
34 34  
35 35 def parse(self, filename):
36 36  
37   - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types'])
  37 + segtypesConfigFile = configFile.ConfigFile(filename,
  38 + [
  39 + 'options',
  40 + 'combinations',
  41 + 'generator combinations',
  42 + 'tags',
  43 + 'lexemes',
  44 + 'segment types',
  45 + 'separator chars'])
38 46 key2Defs = self._getKey2Defs(segtypesConfigFile)
39 47 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
  48 + separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \
  49 + if self.rulesType == RulesParser.PARSE4ANALYZER \
  50 + else []
40 51  
41   - res = rulesManager.RulesManager(segtypesHelper)
  52 + res = rulesManager.RulesManager(segtypesHelper, separatorsList)
42 53  
43 54 def2Key = {}
44 55 for key, defs in key2Defs.iteritems():
... ...
input/segmenty.dat
... ... @@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek )
243 243 # interpretacja znaków interpunkcyjnych
244 244 # moze_interp(samodz interp)
245 245  
  246 +[separator chars]
  247 +# ,
  248 +44
  249 +
  250 +# .
  251 +46
  252 +
  253 +# ;
  254 +59
  255 +
246 256 [generator combinations]
247 257  
248 258 [segment types]
... ...
morfeusz/Morfeusz.cpp
... ... @@ -62,7 +62,8 @@ void Morfeusz::processOneWord(
62 62 const char*& inputStart,
63 63 const char* inputEnd,
64 64 int startNodeNum,
65   - std::vector<MorphInterpretation>& results) const {
  65 + std::vector<MorphInterpretation>& results,
  66 + bool insideIgnHandler) const {
66 67 while (inputStart != inputEnd
67 68 && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
68 69 env.getCharsetConverter().next(inputStart, inputEnd);
... ... @@ -71,9 +72,9 @@ void Morfeusz::processOneWord(
71 72 InflexionGraph graph;
72 73 const char* currInput = inputStart;
73 74 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
74   -
  75 +
75 76 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
76   -
  77 +
77 78 if (!graph.empty()) {
78 79 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
79 80 int srcNode = startNodeNum;
... ... @@ -87,6 +88,12 @@ void Morfeusz::processOneWord(
87 88 srcNode++;
88 89 }
89 90 }
  91 + else if (inputStart != inputEnd
  92 + && env.getProcessorType() == ANALYZER
  93 + && !insideIgnHandler) {
  94 + this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
  95 + // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
  96 + }
90 97 else if (inputStart != inputEnd) {
91 98 this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
92 99 }
... ... @@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) {
113 120 stringstream res;
114 121 for (unsigned int i = 0; i < accum.size(); i++) {
115 122 res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
116   -// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
  123 + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
117 124 }
118 125 return res.str();
119 126 }
... ... @@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord(
125 132 SegrulesState segrulesState,
126 133 vector<InterpretedChunk>& accum,
127 134 InflexionGraph& graph) const {
128   -// if (this->options.debug) {
129   -// cerr << "----------" << endl;
130   -// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
131   -// }
132   -// cerr << "doAnalyzeOneWord " << inputData << endl;
  135 + // if (this->options.debug) {
  136 + // cerr << "----------" << endl;
  137 + // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  138 + // }
  139 + // cerr << "doAnalyzeOneWord " << inputData << endl;
133 140 const char* inputStart = inputData;
134 141 const char* currInput = inputData;
135 142 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
... ... @@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord(
140 147  
141 148 while (!isEndOfWord(codepoint)) {
142 149 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
143   - ? env.getCaseConverter().toLower(codepoint)
144   - : codepoint;
  150 + ? env.getCaseConverter().toLower(codepoint)
  151 + : codepoint;
145 152 originalCodepoints.push_back(codepoint);
146 153 normalizedCodepoints.push_back(normalizedCodepoint);
147 154 feedState(state, normalizedCodepoint, UTF8CharsetConverter());
... ... @@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord(
152 159 throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
153 160 }
154 161 homonymId = string(currInput + 1, inputEnd);
155   -// cerr << "homonym " << homonymId << endl;
  162 + // cerr << "homonym " << homonymId << endl;
156 163 currInput = inputEnd;
157 164 codepoint = 0x00;
158 165 }
... ... @@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord(
163 170 if (this->options.debug) {
164 171 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
165 172 }
166   -// cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
  173 + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
167 174 set<SegrulesState> newSegrulesStates;
168 175 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
169 176 if (this->options.debug && newSegrulesStates.empty()) {
170 177 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
171 178 }
172   -// cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
  179 + // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
173 180 for (
174 181 set<SegrulesState>::iterator it = newSegrulesStates.begin();
175 182 it != newSegrulesStates.end();
... ... @@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord(
190 197 doShiftOrth(accum.back(), ic);
191 198 }
192 199 accum.push_back(ic);
193   - if (isEndOfWord(codepoint)
  200 + if (isEndOfWord(codepoint)
194 201 && newSegrulesState.accepting) {
195 202 if (this->options.debug) {
196 203 cerr << "ACCEPTING " << debugAccum(accum) << endl;
... ... @@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord(
198 205 graph.addPath(accum, newSegrulesState.weak);
199 206 }
200 207 else if (!isEndOfWord(codepoint)) {
201   -// cerr << "will process " << currInput << endl;
  208 + // cerr << "will process " << currInput << endl;
202 209 const char* newCurrInput = currInput;
203 210 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
204 211 }
... ... @@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord(
211 218 inputData = currInput;
212 219 }
213 220  
/**
 * Tells whether a code point is one of the separator characters around which
 * an unrecognised chunk gets split.
 *
 * The set mirrors the [separator chars] section of input/segmenty.dat:
 * ',' (44), '.' (46) and ';' (59).
 * NOTE(review): the dictionary already carries this list in serialized form,
 * but the reader currently skips it, so the values are repeated here; keep
 * both places in sync until the list is actually deserialized.
 *
 * @param codepoint Unicode code point to test
 * @return true when the code point is a separator, false otherwise
 */
static inline bool isSeparator(uint32_t codepoint) {
    switch (codepoint) {
        case 44: // ','
        case 46: // '.'
        case 59: // ';'
            return true;
        default:
            return false;
    }
}
  224 +
  225 +void Morfeusz::handleIgnChunk(
  226 + const Environment& env,
  227 + const char* inputStart,
  228 + const char* inputEnd,
  229 + int startNodeNum,
  230 + std::vector<MorphInterpretation>& results) const {
  231 + const char* currInput = inputStart;
  232 + const char* prevInput;
  233 + uint32_t codepoint;
  234 + bool separatorFound = false;
  235 + while (currInput != inputEnd) {
  236 + prevInput = currInput;
  237 + const char* nonSeparatorInputEnd = prevInput;
  238 + do {
  239 + codepoint = env.getCharsetConverter().next(currInput, inputEnd);
  240 + if (!isSeparator(codepoint)) {
  241 + nonSeparatorInputEnd = currInput;
  242 + }
  243 + }
  244 + while (currInput != inputEnd && !isSeparator(codepoint));
  245 +
  246 + if (isSeparator(codepoint)) {
  247 + separatorFound = true;
  248 + if (nonSeparatorInputEnd != prevInput) {
  249 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  250 + this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true);
  251 + startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  252 + this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true);
  253 + }
  254 + else {
  255 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  256 + this->processOneWord(env, prevInput, currInput, startNode, results, true);
  257 + }
  258 + }
  259 + }
  260 +
  261 + // currInput == inputEnd
  262 + if (!isSeparator(codepoint)) {
  263 + if (separatorFound) {
  264 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  265 + this->processOneWord(env, prevInput, inputEnd, startNode, results, true);
  266 + }
  267 + else {
  268 + this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results);
  269 + }
  270 + }
  271 +}
  272 +
214 273 void Morfeusz::appendIgnotiumToResults(
215 274 const Environment& env,
216 275 const string& word,
... ... @@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results
260 319 }
261 320  
262 321 // XXX - someday it should be improved
  322 +
263 323 void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const {
264 324 vector<MorphInterpretation> partRes;
265 325 this->generate(lemma, partRes);
... ...
morfeusz/Morfeusz.hpp
... ... @@ -157,7 +157,8 @@ private:
157 157 const char*& inputData,
158 158 const char* inputEnd,
159 159 int startNodeNum,
160   - std::vector<MorphInterpretation>& result) const;
  160 + std::vector<MorphInterpretation>& result,
  161 + bool insideIgnHandler=false) const;
161 162  
162 163 void doProcessOneWord(
163 164 const Environment& env,
... ... @@ -166,6 +167,13 @@ private:
166 167 SegrulesState segrulesState,
167 168 std::vector<InterpretedChunk>& accum,
168 169 InflexionGraph& graph) const;
  170 +
  171 + void handleIgnChunk(
  172 + const Environment& env,
  173 + const char* inputStart,
  174 + const char* inputEnd,
  175 + int startNodeNum,
  176 + std::vector<MorphInterpretation>& results) const;
169 177  
170 178 void appendIgnotiumToResults(
171 179 const Environment& env,
... ...
morfeusz/fsa/const.cpp
... ... @@ -2,7 +2,7 @@
2 2 #include "const.hpp"
3 3  
4 4 extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
5   -extern const uint8_t VERSION_NUM = 12;
  5 +extern const uint8_t VERSION_NUM = 13;
6 6  
7 7 extern const unsigned int VERSION_NUM_OFFSET = 4;
8 8 extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
... ...
morfeusz/segrules/segrules.cpp
... ... @@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) {
18 18 return res;
19 19 }
20 20  
/**
 * Advances ptr past the serialized separators list without interpreting it.
 *
 * Layout being skipped: a 16-bit entry count in network (big-endian) byte
 * order, followed by that many 4-byte entries.
 *
 * @param ptr in: first byte of the list; out: first byte after the list
 */
static inline void ignoreSeparatorsList(const unsigned char*& ptr) {
    // Assemble the big-endian count byte by byte: ptr may sit at any byte
    // offset, so reading it through a uint16_t* could be a misaligned access.
    const uint16_t listSize = static_cast<uint16_t>((ptr[0] << 8) | ptr[1]);
    ptr += 2;
    ptr += 4 * listSize;
}
  26 +
21 27 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) {
22 28 const unsigned char* additionalDataPtr = ptr
23 29 + FSA_DATA_OFFSET
24 30 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
25   - return additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
  31 + const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
  32 + ignoreSeparatorsList(res);
  33 + return res;
26 34 }
27 35  
28 36 static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
... ...