|
1
2
|
/*
 * File:   Morfeusz.cpp
 * Author: mlenart
 *
 * Created on November 13, 2013, 5:21 PM
 */
|
|
8
|
#include <string>
|
|
9
|
#include <iostream>
|
|
10
|
#include "fsa/fsa.hpp"
|
|
11
|
#include "utils.hpp"
|
|
12
|
#include "data/default_fsa.hpp"
|
|
13
|
#include "Morfeusz.hpp"
|
|
14
|
#include "MorphDeserializer.hpp"
|
|
15
|
#include "InterpretedChunksDecoder.hpp"
|
|
16
|
#include "charset/CharsetConverter.hpp"
|
|
17
|
#include "charset/charset_utils.hpp"
|
|
18
|
#include "charset/CaseConverter.hpp"
|
|
19
|
#include "segrules/segrules.hpp"
|
|
20
|
#include "const.hpp"
|
|
21
|
#include "deserializationUtils.hpp"
|
|
22
|
#include "charset/utf8.h"
|
|
23
24
|
// TODO: implement a copy constructor that works the way it should
|
|
25
|
|
|
26
27
|
using namespace std;
|
|
28
29
30
31
|
/**
 * Builds the option set a freshly constructed Morfeusz starts with:
 * debug output off, UTF8 encoding, case-sensitive matching.
 */
static MorfeuszOptions createDefaultOptions() {
    MorfeuszOptions defaults;
    defaults.debug = false;
    defaults.encoding = UTF8;
    defaults.caseSensitive = true;
    return defaults;
}
|
|
36
|
/**
 * Default constructor.
 *
 * Creates the analyzer environment from DEFAULT_FSA and the generator
 * environment from DEFAULT_SYNTH_FSA (both with DEFAULT_MORFEUSZ_CHARSET),
 * loads the default options and then configures case sensitivity:
 * the analyzer follows the options, the generator is set to case-insensitive.
 */
Morfeusz::Morfeusz()
        : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA),
          generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
          options(createDefaultOptions()) {
    this->analyzerEnv.setCaseSensitive(this->options.caseSensitive);
    this->generatorEnv.setCaseSensitive(false);
}
|
|
44
45
46
47
48
49
50
51
52
53
54
55
56
57
|
/**
 * Returns the pointer at which the interpretations of `ig` start.
 *
 * For a non-analyzer environment this is simply `ig.ptr`. For the analyzer
 * the first byte holds the number of case patterns; each of them is passed
 * through deserializeOneCasePattern() and the resulting pointer is returned.
 *
 * NOTE(review): deserializeOneCasePattern() presumably advances the pointer it
 * is given (taken by reference) - confirm against CasePatternHelper.
 */
inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
    if (env.getProcessorType() != ANALYZER) {
        return ig.ptr;
    }

    const unsigned char* cursor = ig.ptr;
    const unsigned char casePatternsNum = *cursor;
    ++cursor;
    for (unsigned int remaining = casePatternsNum; remaining > 0; --remaining) {
        env.getCasePatternHelper().deserializeOneCasePattern(cursor);
    }
    return cursor;
}
|
|
58
|
void Morfeusz::setAnalyzerFile(const string& filename) {
|
|
59
|
this->analyzerEnv.setFSAFile(filename);
|
|
60
61
|
}
|
|
62
|
void Morfeusz::setGeneratorFile(const string& filename) {
|
|
63
|
this->generatorEnv.setFSAFile(filename);
|
|
64
65
|
}
|
|
66
|
/** Destructor; performs no explicit work of its own. */
Morfeusz::~Morfeusz() {
}
|
|
69
70
|
void Morfeusz::processOneWord(
const Environment& env,
|
|
71
|
const char*& inputStart,
|
|
72
|
const char* inputEnd,
|
|
73
|
int startNodeNum,
|
|
74
75
|
std::vector<MorphInterpretation>& results,
bool insideIgnHandler) const {
|
|
76
|
while (inputStart != inputEnd
|
|
77
|
&& isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
|
|
78
|
env.getCharsetConverter().next(inputStart, inputEnd);
|
|
79
|
}
|
|
80
|
vector<InterpretedChunk> accum;
|
|
81
|
InflexionGraph graph;
|
|
82
|
const char* currInput = inputStart;
|
|
83
|
const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
|
|
84
|
|
|
85
|
doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
|
|
86
|
|
|
87
|
if (!graph.empty()) {
|
|
88
|
const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
|
|
89
|
int srcNode = startNodeNum;
|
|
90
|
for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) {
|
|
91
|
vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i];
|
|
92
|
for (unsigned int j = 0; j < edges.size(); j++) {
|
|
93
|
InflexionGraph::Edge& e = edges[j];
|
|
94
|
int targetNode = startNodeNum + e.nextNode;
|
|
95
|
interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results);
|
|
96
97
98
|
}
srcNode++;
}
|
|
99
|
}
|
|
100
101
|
else if (inputStart != inputEnd
&& env.getProcessorType() == ANALYZER
|
|
102
103
104
105
|
&& !insideIgnHandler) {
this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
// this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
}
|
|
106
|
else if (inputStart != inputEnd) {
|
|
107
|
this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
|
|
108
109
|
}
inputStart = currInput;
|
|
110
111
|
}
|
|
112
113
|
/**
 * Moves the orthographic form of chunk `from` onto chunk `to`.
 *
 * The prefix chunks of `from` are placed in front of those already held by
 * `to`, a copy of `from` itself is appended after them, `to` takes over the
 * text start of `from`, and `from` is flagged as shifted.
 */
static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
    vector<InterpretedChunk>& targetPrefixes = to.prefixChunks;
    targetPrefixes.insert(
            targetPrefixes.begin(),
            from.prefixChunks.begin(),
            from.prefixChunks.end());
    // The copy is taken before the flag below is set.
    targetPrefixes.push_back(from);

    to.textStartPtr = from.textStartPtr;
    from.orthWasShifted = true;
}
|
|
122
123
124
125
126
127
128
129
130
|
/**
 * Formats one recognized group for debug output as "(<type>, <text>), ",
 * where <type> is printed as a number and <text> is the byte range
 * [startPtr, endPtr).
 */
static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
    const string text(startPtr, endPtr);
    const int typeAsNumber = type;

    stringstream out;
    out << "(" << typeAsNumber << ", " << text << "), ";
    return out.str();
}
static inline string debugAccum(vector<InterpretedChunk>& accum) {
stringstream res;
for (unsigned int i = 0; i < accum.size(); i++) {
|
|
131
|
res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr);
|
|
132
|
// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
|
|
133
134
135
136
|
}
return res.str();
}
|
|
137
138
|
void Morfeusz::doProcessOneWord(
const Environment& env,
|
|
139
140
|
const char*& inputData,
const char* inputEnd,
|
|
141
|
SegrulesState segrulesState,
|
|
142
|
vector<InterpretedChunk>& accum,
|
|
143
|
InflexionGraph& graph) const {
|
|
144
145
146
147
|
if (this->options.debug) {
cerr << "----------" << endl;
cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
}
|
|
148
|
// cerr << "doAnalyzeOneWord " << inputData << endl;
|
|
149
|
const char* inputStart = inputData;
|
|
150
|
const char* currInput = inputData;
|
|
151
|
uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
|
|
152
|
vector<uint32_t> originalCodepoints;
|
|
153
|
vector<uint32_t> normalizedCodepoints;
|
|
154
|
|
|
155
|
StateType state = env.getFSA().getInitialState();
|
|
156
|
|
|
157
|
while (!isWhitespace(codepoint)) {
|
|
158
|
uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
|
|
159
160
|
? env.getCaseConverter().toLower(codepoint)
: codepoint;
|
|
161
|
originalCodepoints.push_back(codepoint);
|
|
162
163
|
normalizedCodepoints.push_back(normalizedCodepoint);
feedState(state, normalizedCodepoint, UTF8CharsetConverter());
|
|
164
|
codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
|
|
165
166
167
168
169
170
|
string homonymId;
if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) {
if (originalCodepoints.size() == 1) {
throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
}
homonymId = string(currInput + 1, inputEnd);
|
|
171
|
// cerr << "homonym " << homonymId << endl;
|
|
172
173
174
|
currInput = inputEnd;
codepoint = 0x00;
}
|
|
175
176
177
178
|
if (state.isAccepting()) {
vector<InterpsGroup> val(state.getValue());
for (unsigned int i = 0; i < val.size(); i++) {
InterpsGroup& ig = val[i];
|
|
179
180
181
|
if (this->options.debug) {
cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
}
|
|
182
183
184
185
186
187
|
set<SegrulesState> newSegrulesStates;
env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
if (this->options.debug && newSegrulesStates.empty()) {
cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
}
if (!newSegrulesStates.empty() && env.getCasePatternHelper().checkInterpsGroupCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
|
|
188
|
|
|
189
190
191
192
193
|
for (
set<SegrulesState>::iterator it = newSegrulesStates.begin();
it != newSegrulesStates.end();
++it) {
SegrulesState newSegrulesState = *it;
|
|
194
|
const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
|
|
195
|
const unsigned char* interpsEndPtr = ig.ptr + ig.size;
|
|
196
|
InterpretedChunk ic = {
|
|
197
|
ig.type,
|
|
198
199
200
201
|
inputStart,
currInput,
originalCodepoints,
normalizedCodepoints,
|
|
202
203
|
interpsPtr,
interpsEndPtr,
|
|
204
205
206
207
208
209
210
|
newSegrulesState.shiftOrthFromPrevious,
false,
vector<InterpretedChunk>(),
homonymId
};
if (!accum.empty() && accum.back().shiftOrth) {
doShiftOrth(accum.back(), ic);
|
|
211
|
}
|
|
212
|
accum.push_back(ic);
|
|
213
|
if (isWhitespace(codepoint)
|
|
214
215
216
217
218
219
|
&& newSegrulesState.accepting) {
if (this->options.debug) {
cerr << "ACCEPTING " << debugAccum(accum) << endl;
}
graph.addPath(accum, newSegrulesState.weak);
}
|
|
220
|
else if (!isWhitespace(codepoint)) {
|
|
221
222
223
224
225
|
// cerr << "will process " << currInput << endl;
const char* newCurrInput = currInput;
doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
}
accum.pop_back();
|
|
226
|
}
|
|
227
|
}
|
|
228
|
}
|
|
229
|
}
|
|
230
|
codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
|
|
231
232
|
}
inputData = currInput;
|
|
233
234
|
}
|
|
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
|
void Morfeusz::handleIgnChunk(
const Environment& env,
const char* inputStart,
const char* inputEnd,
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
const char* currInput = inputStart;
const char* prevInput;
uint32_t codepoint;
bool separatorFound = false;
while (currInput != inputEnd) {
prevInput = currInput;
const char* nonSeparatorInputEnd = prevInput;
do {
codepoint = env.getCharsetConverter().next(currInput, inputEnd);
|
|
250
|
if (!env.isSeparator(codepoint)) {
|
|
251
252
253
|
nonSeparatorInputEnd = currInput;
}
}
|
|
254
|
while (currInput != inputEnd && !env.isSeparator(codepoint));
|
|
255
|
|
|
256
|
if (env.isSeparator(codepoint)) {
|
|
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
|
separatorFound = true;
if (nonSeparatorInputEnd != prevInput) {
int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true);
startNode = results.empty() ? startNodeNum : results.back().getEndNode();
this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true);
}
else {
int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
this->processOneWord(env, prevInput, currInput, startNode, results, true);
}
}
}
// currInput == inputEnd
|
|
272
|
if (!env.isSeparator(codepoint)) {
|
|
273
274
275
276
277
278
279
280
281
282
|
if (separatorFound) {
int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
this->processOneWord(env, prevInput, inputEnd, startNode, results, true);
}
else {
this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results);
}
}
}
|
|
283
|
void Morfeusz::appendIgnotiumToResults(
|
|
284
|
const Environment& env,
|
|
285
286
287
|
const string& word,
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
|
|
288
|
MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env);
|
|
289
290
291
|
results.push_back(interp);
}
|
|
292
|
/**
 * Analyzes `text` and returns an iterator over the resulting interpretations.
 */
ResultsIterator Morfeusz::analyze(const string& text) const {
    vector<MorphInterpretation> interpretations;
    analyze(text, interpretations);
    return ResultsIterator(interpretations);
}
|
|
298
|
void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const {
|
|
299
300
|
const char* input = text.c_str();
const char* inputEnd = input + text.length();
|
|
301
302
|
while (input != inputEnd) {
int startNode = results.empty() ? 0 : results.back().getEndNode();
|
|
303
|
this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results);
|
|
304
|
}
|
|
305
306
|
}
|
|
307
308
309
310
311
312
|
/**
 * Generates forms for `text` and returns an iterator over them.
 */
ResultsIterator Morfeusz::generate(const string& text) const {
    vector<MorphInterpretation> generated;
    generate(text, generated);
    return ResultsIterator(generated);
}
|
|
313
314
315
316
317
318
|
/**
 * Generates forms for `text`, keeps those whose tag number equals `tagnum`,
 * and returns an iterator over them.
 */
ResultsIterator Morfeusz::generate(const string& text, int tagnum) const {
    vector<MorphInterpretation> generated;
    generate(text, tagnum, generated);
    return ResultsIterator(generated);
}
|
|
319
|
void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const {
|
|
320
321
|
const char* input = lemma.c_str();
const char* inputEnd = input + lemma.length();
|
|
322
323
324
325
|
int startNode = 0;
this->processOneWord(this->generatorEnv, input, inputEnd, startNode, results);
if (input != inputEnd) {
throw MorfeuszException("Input contains more than one word");
|
|
326
|
}
|
|
327
328
|
}
|
|
329
|
// XXX - someday it should be improved
|
|
330
|
|
|
331
332
333
334
335
336
337
338
339
340
|
void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const {
vector<MorphInterpretation> partRes;
this->generate(lemma, partRes);
for (unsigned int i = 0; i < partRes.size(); i++) {
if (partRes[i].getTagnum() == tagnum) {
result.push_back(partRes[i]);
}
}
}
|
|
341
342
|
/**
 * Sets the character encoding: stores it in the options and applies it to
 * both the analyzer and the generator environment.
 */
void Morfeusz::setCharset(MorfeuszCharset charset) {
    options.encoding = charset;
    analyzerEnv.setCharset(charset);
    generatorEnv.setCharset(charset);
}
|
|
347
348
349
350
351
352
353
354
355
356
|
void Morfeusz::setAggl(const std::string& aggl) {
this->analyzerEnv.setSegrulesOption("aggl", aggl);
this->generatorEnv.setSegrulesOption("aggl", aggl);
}
void Morfeusz::setPraet(const std::string& praet) {
this->analyzerEnv.setSegrulesOption("praet", praet);
this->generatorEnv.setSegrulesOption("praet", praet);
}
|
|
357
358
|
void Morfeusz::setCaseSensitive(bool caseSensitive) {
this->options.caseSensitive = caseSensitive;
|
|
359
|
this->analyzerEnv.setCaseSensitive(caseSensitive);
|
|
360
361
|
}
|
|
362
363
364
365
|
void Morfeusz::setDebug(bool debug) {
this->options.debug = debug;
}
|
|
366
|
/**
 * Creates an iterator holding its own copy of the interpretations in `res`,
 * in the same order.
 */
ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) {
    // The buffer starts out empty, so inserting at end() keeps the order of `res`.
    this->resultsBuffer.insert(this->resultsBuffer.end(), res.begin(), res.end());
}
/**
 * Removes the next interpretation from the buffer and returns it.
 *
 * @return the interpretation that was at the front of the buffer
 * @throws MorfeuszException when no interpretations are left; calling
 *         front()/pop_front() on an empty buffer would be undefined behaviour,
 *         so this case is reported explicitly. Check hasNext() first.
 */
MorphInterpretation ResultsIterator::getNext() {
    if (this->resultsBuffer.empty()) {
        throw MorfeuszException("No more interpretations available");
    }
    MorphInterpretation res = this->resultsBuffer.front();
    this->resultsBuffer.pop_front();
    return res;
}
bool ResultsIterator::hasNext() {
|
|
377
|
return !resultsBuffer.empty();
|
|
378
|
}
|