Commit 4a94bb689bf0122f2270624056cb168e739d1063

Authored by Michał Lenart
1 parent 7351ce74

prawie dorobiona kompletna obsługa ignów

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@147 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/serializer.py
@@ -22,7 +22,7 @@ class Serializer(object): @@ -22,7 +22,7 @@ class Serializer(object):
22 22
23 # get the Morfeusz file format version that is being encoded 23 # get the Morfeusz file format version that is being encoded
24 def getVersion(self): 24 def getVersion(self):
25 - return 12 25 + return 13
26 26
27 def serialize2CppFile(self, fname, generator, segmentationRulesData): 27 def serialize2CppFile(self, fname, generator, segmentationRulesData):
28 res = [] 28 res = []
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -5,12 +5,14 @@ Created on 20 lut 2014 @@ -5,12 +5,14 @@ Created on 20 lut 2014
5 ''' 5 '''
6 import logging 6 import logging
7 from morfeuszbuilder.utils.serializationUtils import htons, htonl 7 from morfeuszbuilder.utils.serializationUtils import htons, htonl
  8 +from morfeuszbuilder.utils import serializationUtils
8 9
9 class RulesManager(object): 10 class RulesManager(object):
10 11
11 - def __init__(self, segtypes): 12 + def __init__(self, segtypes, separatorsList):
12 self.options2DFA = {} 13 self.options2DFA = {}
13 self.segtypes = segtypes 14 self.segtypes = segtypes
  15 + self.separatorsList = separatorsList
14 self.defaultOptions = None 16 self.defaultOptions = None
15 17
16 def _options2Key(self, optionsMap): 18 def _options2Key(self, optionsMap):
@@ -37,6 +39,7 @@ class RulesManager(object): @@ -37,6 +39,7 @@ class RulesManager(object):
37 39
38 def serialize(self): 40 def serialize(self):
39 res = bytearray() 41 res = bytearray()
  42 + res.extend(self._serializeSeparatorsList())
40 dfasNum = len(self.options2DFA) 43 dfasNum = len(self.options2DFA)
41 assert dfasNum > 0 and dfasNum < 256 44 assert dfasNum > 0 and dfasNum < 256
42 res.append(dfasNum) 45 res.append(dfasNum)
@@ -48,6 +51,13 @@ class RulesManager(object): @@ -48,6 +51,13 @@ class RulesManager(object):
48 logging.info('segmentation rules size: %s bytes', len(res)) 51 logging.info('segmentation rules size: %s bytes', len(res))
49 return res 52 return res
50 53
  54 + def _serializeSeparatorsList(self):
  55 + res = bytearray()
  56 + res.extend(serializationUtils.htons(len(self.separatorsList)))
  57 + for cp in sorted(self.separatorsList):
  58 + res.extend(serializationUtils.htonl(cp))
  59 + return res
  60 +
51 def _serializeOptionsMap(self, optionsMap): 61 def _serializeOptionsMap(self, optionsMap):
52 assert len(optionsMap) < 256 62 assert len(optionsMap) < 256
53 res = bytearray() 63 res = bytearray()
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -3,7 +3,7 @@ from pyparsing import * @@ -3,7 +3,7 @@ from pyparsing import *
3 ParserElement.enablePackrat() 3 ParserElement.enablePackrat()
4 from morfeuszbuilder.tagset import segtypes 4 from morfeuszbuilder.tagset import segtypes
5 from morfeuszbuilder.utils import configFile, exceptions 5 from morfeuszbuilder.utils import configFile, exceptions
6 -from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString 6 +from morfeuszbuilder.segrules import preprocessor, rules, rulesManager, pyparseString, separatorChars
7 import codecs 7 import codecs
8 import re 8 import re
9 9
@@ -34,11 +34,22 @@ class RulesParser(object): @@ -34,11 +34,22 @@ class RulesParser(object):
34 34
35 def parse(self, filename): 35 def parse(self, filename):
36 36
37 - segtypesConfigFile = configFile.ConfigFile(filename, ['options', 'combinations', 'generator combinations', 'tags', 'lexemes', 'segment types']) 37 + segtypesConfigFile = configFile.ConfigFile(filename,
  38 + [
  39 + 'options',
  40 + 'combinations',
  41 + 'generator combinations',
  42 + 'tags',
  43 + 'lexemes',
  44 + 'segment types',
  45 + 'separator chars'])
38 key2Defs = self._getKey2Defs(segtypesConfigFile) 46 key2Defs = self._getKey2Defs(segtypesConfigFile)
39 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile) 47 segtypesHelper = segtypes.Segtypes(self.tagset, segtypesConfigFile)
  48 + separatorsList = separatorChars.parseSeparatorChars(segtypesConfigFile) \
  49 + if self.rulesType == RulesParser.PARSE4ANALYZER \
  50 + else []
40 51
41 - res = rulesManager.RulesManager(segtypesHelper) 52 + res = rulesManager.RulesManager(segtypesHelper, separatorsList)
42 53
43 def2Key = {} 54 def2Key = {}
44 for key, defs in key2Defs.iteritems(): 55 for key, defs in key2Defs.iteritems():
input/segmenty.dat
@@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek ) @@ -243,6 +243,16 @@ moze_interp( dig>+ dywiz> latek )
243 # interpretacja znaków interpunkcyjnych 243 # interpretacja znaków interpunkcyjnych
244 # moze_interp(samodz interp) 244 # moze_interp(samodz interp)
245 245
  246 +[separator chars]
  247 +# ,
  248 +44
  249 +
  250 +# .
  251 +46
  252 +
  253 +# ;
  254 +59
  255 +
246 [generator combinations] 256 [generator combinations]
247 257
248 [segment types] 258 [segment types]
morfeusz/Morfeusz.cpp
@@ -62,7 +62,8 @@ void Morfeusz::processOneWord( @@ -62,7 +62,8 @@ void Morfeusz::processOneWord(
62 const char*& inputStart, 62 const char*& inputStart,
63 const char* inputEnd, 63 const char* inputEnd,
64 int startNodeNum, 64 int startNodeNum,
65 - std::vector<MorphInterpretation>& results) const { 65 + std::vector<MorphInterpretation>& results,
  66 + bool insideIgnHandler) const {
66 while (inputStart != inputEnd 67 while (inputStart != inputEnd
67 && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) { 68 && isEndOfWord(env.getCharsetConverter().peek(inputStart, inputEnd))) {
68 env.getCharsetConverter().next(inputStart, inputEnd); 69 env.getCharsetConverter().next(inputStart, inputEnd);
@@ -71,9 +72,9 @@ void Morfeusz::processOneWord( @@ -71,9 +72,9 @@ void Morfeusz::processOneWord(
71 InflexionGraph graph; 72 InflexionGraph graph;
72 const char* currInput = inputStart; 73 const char* currInput = inputStart;
73 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA(); 74 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
74 - 75 +
75 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph); 76 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
76 - 77 +
77 if (!graph.empty()) { 78 if (!graph.empty()) {
78 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); 79 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
79 int srcNode = startNodeNum; 80 int srcNode = startNodeNum;
@@ -87,6 +88,12 @@ void Morfeusz::processOneWord( @@ -87,6 +88,12 @@ void Morfeusz::processOneWord(
87 srcNode++; 88 srcNode++;
88 } 89 }
89 } 90 }
  91 + else if (inputStart != inputEnd
  92 + && env.getProcessorType() == ANALYZER
  93 + && !insideIgnHandler) {
  94 + this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
  95 + // this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
  96 + }
90 else if (inputStart != inputEnd) { 97 else if (inputStart != inputEnd) {
91 this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results); 98 this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
92 } 99 }
@@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) { @@ -113,7 +120,7 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) {
113 stringstream res; 120 stringstream res;
114 for (unsigned int i = 0; i < accum.size(); i++) { 121 for (unsigned int i = 0; i < accum.size(); i++) {
115 res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); 122 res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
116 -// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; 123 + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
117 } 124 }
118 return res.str(); 125 return res.str();
119 } 126 }
@@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord( @@ -125,11 +132,11 @@ void Morfeusz::doProcessOneWord(
125 SegrulesState segrulesState, 132 SegrulesState segrulesState,
126 vector<InterpretedChunk>& accum, 133 vector<InterpretedChunk>& accum,
127 InflexionGraph& graph) const { 134 InflexionGraph& graph) const {
128 -// if (this->options.debug) {  
129 -// cerr << "----------" << endl;  
130 -// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;  
131 -// }  
132 -// cerr << "doAnalyzeOneWord " << inputData << endl; 135 + // if (this->options.debug) {
  136 + // cerr << "----------" << endl;
  137 + // cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  138 + // }
  139 + // cerr << "doAnalyzeOneWord " << inputData << endl;
133 const char* inputStart = inputData; 140 const char* inputStart = inputData;
134 const char* currInput = inputData; 141 const char* currInput = inputData;
135 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); 142 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
@@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord( @@ -140,8 +147,8 @@ void Morfeusz::doProcessOneWord(
140 147
141 while (!isEndOfWord(codepoint)) { 148 while (!isEndOfWord(codepoint)) {
142 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER 149 uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
143 - ? env.getCaseConverter().toLower(codepoint)  
144 - : codepoint; 150 + ? env.getCaseConverter().toLower(codepoint)
  151 + : codepoint;
145 originalCodepoints.push_back(codepoint); 152 originalCodepoints.push_back(codepoint);
146 normalizedCodepoints.push_back(normalizedCodepoint); 153 normalizedCodepoints.push_back(normalizedCodepoint);
147 feedState(state, normalizedCodepoint, UTF8CharsetConverter()); 154 feedState(state, normalizedCodepoint, UTF8CharsetConverter());
@@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord( @@ -152,7 +159,7 @@ void Morfeusz::doProcessOneWord(
152 throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); 159 throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
153 } 160 }
154 homonymId = string(currInput + 1, inputEnd); 161 homonymId = string(currInput + 1, inputEnd);
155 -// cerr << "homonym " << homonymId << endl; 162 + // cerr << "homonym " << homonymId << endl;
156 currInput = inputEnd; 163 currInput = inputEnd;
157 codepoint = 0x00; 164 codepoint = 0x00;
158 } 165 }
@@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord( @@ -163,13 +170,13 @@ void Morfeusz::doProcessOneWord(
163 if (this->options.debug) { 170 if (this->options.debug) {
164 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; 171 cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
165 } 172 }
166 -// cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; 173 + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
167 set<SegrulesState> newSegrulesStates; 174 set<SegrulesState> newSegrulesStates;
168 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); 175 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
169 if (this->options.debug && newSegrulesStates.empty()) { 176 if (this->options.debug && newSegrulesStates.empty()) {
170 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; 177 cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
171 } 178 }
172 -// cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; 179 + // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
173 for ( 180 for (
174 set<SegrulesState>::iterator it = newSegrulesStates.begin(); 181 set<SegrulesState>::iterator it = newSegrulesStates.begin();
175 it != newSegrulesStates.end(); 182 it != newSegrulesStates.end();
@@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord( @@ -190,7 +197,7 @@ void Morfeusz::doProcessOneWord(
190 doShiftOrth(accum.back(), ic); 197 doShiftOrth(accum.back(), ic);
191 } 198 }
192 accum.push_back(ic); 199 accum.push_back(ic);
193 - if (isEndOfWord(codepoint) 200 + if (isEndOfWord(codepoint)
194 && newSegrulesState.accepting) { 201 && newSegrulesState.accepting) {
195 if (this->options.debug) { 202 if (this->options.debug) {
196 cerr << "ACCEPTING " << debugAccum(accum) << endl; 203 cerr << "ACCEPTING " << debugAccum(accum) << endl;
@@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord( @@ -198,7 +205,7 @@ void Morfeusz::doProcessOneWord(
198 graph.addPath(accum, newSegrulesState.weak); 205 graph.addPath(accum, newSegrulesState.weak);
199 } 206 }
200 else if (!isEndOfWord(codepoint)) { 207 else if (!isEndOfWord(codepoint)) {
201 -// cerr << "will process " << currInput << endl; 208 + // cerr << "will process " << currInput << endl;
202 const char* newCurrInput = currInput; 209 const char* newCurrInput = currInput;
203 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); 210 doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
204 } 211 }
@@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord( @@ -211,6 +218,58 @@ void Morfeusz::doProcessOneWord(
211 inputData = currInput; 218 inputData = currInput;
212 } 219 }
213 220
  221 +static inline bool isSeparator(uint32_t codepoint) {
  222 + return codepoint == 44;
  223 +}
  224 +
  225 +void Morfeusz::handleIgnChunk(
  226 + const Environment& env,
  227 + const char* inputStart,
  228 + const char* inputEnd,
  229 + int startNodeNum,
  230 + std::vector<MorphInterpretation>& results) const {
  231 + const char* currInput = inputStart;
  232 + const char* prevInput;
  233 + uint32_t codepoint;
  234 + bool separatorFound = false;
  235 + while (currInput != inputEnd) {
  236 + prevInput = currInput;
  237 + const char* nonSeparatorInputEnd = prevInput;
  238 + do {
  239 + codepoint = env.getCharsetConverter().next(currInput, inputEnd);
  240 + if (!isSeparator(codepoint)) {
  241 + nonSeparatorInputEnd = currInput;
  242 + }
  243 + }
  244 + while (currInput != inputEnd && !isSeparator(codepoint));
  245 +
  246 + if (isSeparator(codepoint)) {
  247 + separatorFound = true;
  248 + if (nonSeparatorInputEnd != prevInput) {
  249 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  250 + this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true);
  251 + startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  252 + this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true);
  253 + }
  254 + else {
  255 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  256 + this->processOneWord(env, prevInput, currInput, startNode, results, true);
  257 + }
  258 + }
  259 + }
  260 +
  261 + // currInput == inputEnd
  262 + if (!isSeparator(codepoint)) {
  263 + if (separatorFound) {
  264 + int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
  265 + this->processOneWord(env, prevInput, inputEnd, startNode, results, true);
  266 + }
  267 + else {
  268 + this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results);
  269 + }
  270 + }
  271 +}
  272 +
214 void Morfeusz::appendIgnotiumToResults( 273 void Morfeusz::appendIgnotiumToResults(
215 const Environment& env, 274 const Environment& env,
216 const string& word, 275 const string& word,
@@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results @@ -260,6 +319,7 @@ void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results
260 } 319 }
261 320
262 // XXX - someday it should be improved 321 // XXX - someday it should be improved
  322 +
263 void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const { 323 void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const {
264 vector<MorphInterpretation> partRes; 324 vector<MorphInterpretation> partRes;
265 this->generate(lemma, partRes); 325 this->generate(lemma, partRes);
morfeusz/Morfeusz.hpp
@@ -157,7 +157,8 @@ private: @@ -157,7 +157,8 @@ private:
157 const char*& inputData, 157 const char*& inputData,
158 const char* inputEnd, 158 const char* inputEnd,
159 int startNodeNum, 159 int startNodeNum,
160 - std::vector<MorphInterpretation>& result) const; 160 + std::vector<MorphInterpretation>& result,
  161 + bool insideIgnHandler=false) const;
161 162
162 void doProcessOneWord( 163 void doProcessOneWord(
163 const Environment& env, 164 const Environment& env,
@@ -166,6 +167,13 @@ private: @@ -166,6 +167,13 @@ private:
166 SegrulesState segrulesState, 167 SegrulesState segrulesState,
167 std::vector<InterpretedChunk>& accum, 168 std::vector<InterpretedChunk>& accum,
168 InflexionGraph& graph) const; 169 InflexionGraph& graph) const;
  170 +
  171 + void handleIgnChunk(
  172 + const Environment& env,
  173 + const char* inputStart,
  174 + const char* inputEnd,
  175 + int startNodeNum,
  176 + std::vector<MorphInterpretation>& results) const;
169 177
170 void appendIgnotiumToResults( 178 void appendIgnotiumToResults(
171 const Environment& env, 179 const Environment& env,
morfeusz/fsa/const.cpp
@@ -2,7 +2,7 @@ @@ -2,7 +2,7 @@
2 #include "const.hpp" 2 #include "const.hpp"
3 3
4 extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; 4 extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
5 -extern const uint8_t VERSION_NUM = 12; 5 +extern const uint8_t VERSION_NUM = 13;
6 6
7 extern const unsigned int VERSION_NUM_OFFSET = 4; 7 extern const unsigned int VERSION_NUM_OFFSET = 4;
8 extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; 8 extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
morfeusz/segrules/segrules.cpp
@@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) { @@ -18,11 +18,19 @@ static inline string deserializeString(const unsigned char*& ptr) {
18 return res; 18 return res;
19 } 19 }
20 20
  21 +static inline void ignoreSeparatorsList(const unsigned char*& ptr) {
  22 + uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
  23 + ptr += 2;
  24 + ptr += 4 * listSize;
  25 +}
  26 +
21 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) { 27 static inline const unsigned char* getFSAsMapPtr(const unsigned char* ptr) {
22 const unsigned char* additionalDataPtr = ptr 28 const unsigned char* additionalDataPtr = ptr
23 + FSA_DATA_OFFSET 29 + FSA_DATA_OFFSET
24 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); 30 + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
25 - return additionalDataPtr + deserializeUint32(additionalDataPtr) + 4; 31 + const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
  32 + ignoreSeparatorsList(res);
  33 + return res;
26 } 34 }
27 35
28 static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) { 36 static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {