|
1
2
3
4
5
6
7
8
9
|
/*
* File: Morfeusz.cpp
* Author: mlenart
*
* Created on November 13, 2013, 5:21 PM
*/
#include <string>
#include <iostream>
|
|
10
|
#include <vector>
|
|
11
|
#include <cstring>
|
|
12
|
#include <stdexcept>
|
|
13
14
|
#include "fsa/fsa.hpp"
#include "utils.hpp"
|
|
15
|
#include "MorfeuszImpl.hpp"
|
|
16
17
18
19
20
21
22
|
#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp"
#include "charset/CharsetConverter.hpp"
#include "charset/charset_utils.hpp"
#include "case/CaseConverter.hpp"
#include "segrules/segrules.hpp"
#include "const.hpp"
#include "charset/utf8.h"
|
|
23
|
#include "ChunkBounds.hpp"
|
|
24
|
#include "DictionariesRepository.hpp"
|
|
25
26
27
28
29
30
31
|
// TODO: implement a copy constructor that works the way it should
using namespace std;
namespace morfeusz {
|
|
32
33
|
/**
 * Builds the option set that a newly constructed MorfeuszImpl starts with:
 * conditionally case-sensitive matching, UTF-8 encoding, separate token
 * numbering, whitespace skipping and debug output switched off.
 */
static MorfeuszOptions createDefaultOptions() {
    MorfeuszOptions defaults;
    defaults.debug = false;
    defaults.whitespaceHandling = SKIP_WHITESPACES;
    defaults.tokenNumbering = SEPARATE_NUMBERING;
    defaults.encoding = UTF8;
    defaults.caseHandling = CONDITIONALLY_CASE_SENSITIVE;
    return defaults;
}
|
|
41
|
|
|
42
43
44
45
|
/**
 * Formats one interpretations group for debug output.
 *
 * @param type     segment type, printed as a decimal number
 * @param startPtr first byte of the text covered by the group
 * @param endPtr   one past the last byte of that text
 * @return a string of the form "(<type>, <text>), "
 */
static std::string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
    const std::string text(startPtr, endPtr);
    std::ostringstream out;
    out << "(" << static_cast<int>(type) << ", " << text << "), ";
    return out.str();
}
|
|
48
49
50
|
static string debugAccum(vector<InterpretedChunk>& accum) {
stringstream res;
for (unsigned int i = 0; i < accum.size(); i++) {
|
|
51
|
res << debugInterpsGroup(accum[i].segmentType, accum[i].textNoPrefixesStartPtr, accum[i].textEndPtr);
|
|
52
53
|
}
return res.str();
|
|
54
55
|
}
|
|
56
|
/**
 * Moves the orthography of chunk `from` onto chunk `to`: `to` takes over
 * `from`'s prefix chunks, gets a copy of `from` appended as one more prefix
 * chunk and starts at `from`'s text start. `from` is then flagged as shifted.
 * doUnshiftOrth() reverses the prefix-chunk exchange.
 */
static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
    // from.prefixChunks are ignored anyway; they are swapped back in doUnshiftOrth
    from.prefixChunks.swap(to.prefixChunks);
    // the copy is taken before the flag below is set, so the stored prefix is unflagged
    to.prefixChunks.push_back(from);
    from.orthWasShifted = true;
    to.textStartPtr = from.textStartPtr;
}
|
|
62
63
64
65
66
|
/**
 * Reverses the prefix-chunk exchange made by doShiftOrth(): the prefix chunks
 * go back from `to` to `from`, and the last one (the copy of `from` that
 * doShiftOrth() appended) is dropped.
 */
static void doUnshiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
    from.prefixChunks.swap(to.prefixChunks);
    from.prefixChunks.pop_back();
}
|
|
67
|
|
|
68
69
70
71
72
73
74
75
76
|
/**
 * Feeds the raw bytes [inputStart, inputEnd) to the automaton state one at a
 * time, stopping early as soon as the state becomes a sink.
 */
static void feedStateDirectly(
        const FSAType& fsa,
        StateType& state,
        const char* inputStart,
        const char* inputEnd) {
    for (const char* cursor = inputStart; cursor != inputEnd; ++cursor) {
        if (state.isSink()) {
            break;
        }
        state.proceedToNext(fsa, *cursor);
    }
}
|
|
79
80
81
82
83
84
85
86
87
|
/**
 * Encodes the codepoint with the UTF-8 converter and feeds the resulting
 * bytes to the automaton state, stopping early once the state is a sink.
 */
static void feedStateIndirectly(
        const FSAType& fsa,
        StateType& state,
        uint32_t codepoint) {
    std::string encoded;
    UTF8CharsetConverter::getInstance().append(codepoint, encoded);
    for (std::string::const_iterator it = encoded.begin();
            it != encoded.end() && !state.isSink();
            ++it) {
        state.proceedToNext(fsa, *it);
    }
}
|
|
89
90
91
92
93
94
95
|
static void feedState(
const Environment& env,
StateType& state,
TextReader& reader) {
if (reader.peek() == reader.normalizedPeek() && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
feedStateDirectly(env.getFSA(), state, reader.getCurrPtr(), reader.getNextPtr());
|
|
96
|
} else {
|
|
97
98
|
feedStateIndirectly(env.getFSA(), state, reader.normalizedPeek());
}
|
|
99
100
|
}
|
|
101
102
103
104
105
106
107
108
|
/**
 * Creates an InterpretedChunk for the interpretations group `ig`, covering the
 * text from the reader's word start up to its current position.
 *
 * @param ig        interpretations group the chunk refers to
 * @param reader    reader positioned just after the recognized text
 * @param shiftOrth stored in the chunk's shiftOrth flag
 * @param homonymId required homonym id; when non-empty, the text end is moved
 *                  back by the id's length plus one byte
 */
static InterpretedChunk createChunk(
        const InterpsGroup& ig,
        const TextReader& reader,
        bool shiftOrth,
        const std::string& homonymId) {
    InterpretedChunk chunk;

    // text span
    chunk.textStartPtr = reader.getWordStartPtr(); // may be changed later in doShiftOrth(...)
    chunk.textNoPrefixesStartPtr = chunk.textStartPtr;
    if (homonymId.empty()) {
        chunk.textEndPtr = reader.getCurrPtr();
    } else {
        chunk.textEndPtr = reader.getCurrPtr() - homonymId.length() - 1;
    }
    chunk.codepointsNum = reader.getCodepointsRead();

    // interpretations data
    chunk.segmentType = ig.type;
    chunk.interpsGroupPtr = ig.ptr;
    chunk.interpsEndPtr = ig.ptr + ig.size;
    chunk.requiredHomonymId = homonymId;

    // flags
    chunk.shiftOrth = shiftOrth;
    chunk.orthWasShifted = false;
    chunk.forceIgnoreCase = false;
    return chunk;
}
|
|
121
|
|
|
122
123
|
/**
 * Creates an instance bound to the dictionary `dictName`.
 *
 * Both environments are always constructed; the third constructor argument of
 * each depends on `usage` (presumably whether that environment is usable -
 * confirm against Environment). Options start from createDefaultOptions().
 */
MorfeuszImpl::MorfeuszImpl(const std::string& dictName, MorfeuszUsage usage)
    : currDictionary(dictName)
    , usage(usage)
    , analyzerEnv(dictName, ANALYZER, usage != GENERATE_ONLY)
    , generatorEnv(dictName, GENERATOR, usage != ANALYSE_ONLY)
    , options(createDefaultOptions())
    , accum()
    , notMatchingCaseSegs(0)
    , graph()
    , nextNodeNum(0)
{
    // the generator is never case sensitive; the analyzer follows the options
    generatorEnv.setCaseSensitive(false);
    analyzerEnv.setCaseSensitive(options.caseHandling != IGNORE_CASE);
}
|
|
135
136
137
138
139
|
/**
 * Returns a heap-allocated copy of this instance made with the copy
 * constructor.
 */
Morfeusz* MorfeuszImpl::clone() const {
    MorfeuszImpl* const duplicate = new MorfeuszImpl(*this);
    return duplicate;
}
|
|
140
141
142
143
144
145
146
147
|
string MorfeuszImpl::getDictID() const {
return getAnyEnvironment().getCurrentDictionary()->id;
}
string MorfeuszImpl::getDictCopyright() const {
return getAnyEnvironment().getCurrentDictionary()->copyright;
}
|
|
148
|
void MorfeuszImpl::setDictionary(const string& dictName) {
|
|
149
150
151
152
153
154
|
if (dictName != currDictionary) {
doSetDictionary(dictName);
currDictionary = dictName;
|
|
155
|
}
|
|
156
157
158
159
160
161
|
}
void MorfeuszImpl::doSetDictionary(const string& dictName) {
switch (usage) {
case BOTH_ANALYSE_AND_GENERATE:
{
|
|
162
163
|
const Dictionary* analyzerDict = DictionariesRepository::getInstance().getDictionary(dictName, ANALYZER);
const Dictionary* generatorDict = DictionariesRepository::getInstance().getDictionary(dictName, GENERATOR);
|
|
164
165
166
|
if (analyzerDict->isCompatibleWith(*generatorDict)) {
analyzerEnv.setDictionary(analyzerDict);
generatorEnv.setDictionary(generatorDict);
|
|
167
168
|
}
else {
|
|
169
170
171
172
173
|
throw MorfeuszException("Analyzer and generator dictionaries are incompatible");
}
}
break;
case ANALYSE_ONLY:
|
|
174
|
analyzerEnv.setDictionary(DictionariesRepository::getInstance().getDictionary(dictName, ANALYZER));
|
|
175
176
|
break;
case GENERATE_ONLY:
|
|
177
|
generatorEnv.setDictionary(DictionariesRepository::getInstance().getDictionary(dictName, GENERATOR));
|
|
178
|
break;
|
|
179
|
}
|
|
180
|
}
|
|
181
|
|
|
182
183
184
|
/**
 * Returns the analyzer environment when it is usable, and the generator
 * environment otherwise.
 */
const Environment& MorfeuszImpl::getAnyEnvironment() const {
    return analyzerEnv.isUsable() ? analyzerEnv : generatorEnv;
}
|
|
190
|
const set<string>& MorfeuszImpl::getAvailableAgglOptions() const {
|
|
191
|
return getAnyEnvironment().getAvailableAgglOptions();
|
|
192
|
}
|
|
193
|
|
|
194
|
const set<string>& MorfeuszImpl::getAvailablePraetOptions() const {
|
|
195
|
return getAnyEnvironment().getAvailablePraetOptions();
|
|
196
197
|
}
|
|
198
|
/** Nothing to release explicitly; members clean up after themselves. */
MorfeuszImpl::~MorfeuszImpl() {}
|
|
200
|
|
|
201
202
203
204
205
206
|
const char* getWordEndPtr(const TextReader& reader, const Environment& env) {
TextReader tmpReader(reader.getCurrPtr(), reader.getEndPtr(), env);
while (!tmpReader.isAtEnd() && !tmpReader.isAtWhitespace()) {
tmpReader.next();
}
return tmpReader.getCurrPtr();
|
|
207
208
|
}
|
|
209
|
bool MorfeuszImpl::handleWhitespacesAtBeginning(
|
|
210
211
212
213
|
const Environment& env,
TextReader& reader,
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
|
|
214
215
|
if (env.getProcessorType() == ANALYZER) {
switch (options.whitespaceHandling) {
|
|
216
|
case KEEP_WHITESPACES:
|
|
217
218
219
220
221
222
223
224
225
|
{
bool res = reader.isAtWhitespace() && !reader.isAtEnd();
if (res) {
processWhitespacesChunk(reader, startNodeNum, results);
}
reader.markChunkStartsHere();
reader.markWordStartsHere();
return res;
}
|
|
226
|
case APPEND_WHITESPACES:
|
|
227
228
229
230
|
reader.markChunkStartsHere();
reader.skipWhitespaces();
reader.markWordStartsHere();
return false;
|
|
231
|
case SKIP_WHITESPACES:
|
|
232
233
234
235
236
237
|
reader.skipWhitespaces();
reader.markChunkStartsHere();
reader.markWordStartsHere();
return false;
default:
break;
|
|
238
239
|
}
}
|
|
240
|
|
|
241
242
|
return false;
}
|
|
243
|
|
|
244
|
const char* MorfeuszImpl::handleWhitespacesAtEnd(
|
|
245
246
247
|
const Environment& env,
TextReader& reader) const {
if (env.getProcessorType() == ANALYZER
|
|
248
|
&& options.whitespaceHandling == APPEND_WHITESPACES) {
|
|
249
250
|
reader.skipWhitespaces();
}
|
|
251
|
return reader.getCurrPtr();
|
|
252
|
}
|
|
253
|
|
|
254
|
void MorfeuszImpl::processOneWord(
|
|
255
256
257
|
const Environment& env,
TextReader& reader,
int startNodeNum,
|
|
258
|
vector<MorphInterpretation>& results,
|
|
259
|
bool insideIgnHandler) const {
|
|
260
|
if (handleWhitespacesAtBeginning(env, reader, startNodeNum, results)) {
|
|
261
|
startNodeNum = results.back().endNode;
|
|
262
|
}
|
|
263
|
|
|
264
265
266
|
if (reader.isAtEnd()) {
return;
}
|
|
267
|
accum.resize(0);
|
|
268
269
|
notMatchingCaseSegs = 0;
graph.clear();
|
|
270
|
|
|
271
|
const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
|
|
272
|
|
|
273
274
275
276
277
|
doProcessOneWord(env, reader, segrulesFSA.initialState);
while (reader.isInsideAWord()) {
reader.next();
}
|
|
278
|
|
|
279
280
281
282
283
|
ChunkBounds chunkBounds;
chunkBounds.chunkStartPtr = reader.getChunkStartPtr();
chunkBounds.wordStartPtr = reader.getWordStartPtr();
chunkBounds.wordEndPtr = reader.getCurrPtr();
chunkBounds.chunkEndPtr = handleWhitespacesAtEnd(env, reader);
|
|
284
|
|
|
285
286
287
288
289
290
291
292
293
294
|
if (!graph.empty()) {
const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
int srcNode = startNodeNum;
const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph();
size_t initialResultsSize = results.size();
for (unsigned int i = 0; i < theGraph.size(); i++) {
const vector<InflexionGraph::Edge>& edges = theGraph[i];
for (unsigned int j = 0; j < edges.size(); j++) {
const InflexionGraph::Edge& e = edges[j];
unsigned int targetNode = startNodeNum + e.nextNode;
|
|
295
|
InterpretedChunk ic = e.chunk;
|
|
296
|
ic.chunkStartPtr =
|
|
297
298
299
|
ic.textStartPtr == reader.getWordStartPtr()
? reader.getChunkStartPtr()
: ic.textStartPtr;
|
|
300
301
302
303
|
ic.chunkEndPtr =
ic.textEndPtr == chunkBounds.wordEndPtr
? chunkBounds.chunkEndPtr
: ic.textEndPtr;
|
|
304
|
interpretedChunksDecoder.decode(srcNode, targetNode, ic, results);
|
|
305
306
307
308
|
}
srcNode++;
}
if (results.size() == initialResultsSize) {
|
|
309
|
this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
|
|
310
|
}
|
|
311
312
|
}
else if (env.getProcessorType() == ANALYZER
|
|
313
|
&& !insideIgnHandler) {
|
|
314
|
this->handleIgnChunk(env, chunkBounds, startNodeNum, results);
|
|
315
316
|
}
else {
|
|
317
|
this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
|
|
318
|
}
|
|
319
320
|
}
|
|
321
|
void MorfeuszImpl::doProcessOneWord(
|
|
322
323
324
325
326
|
const Environment& env,
TextReader& reader,
const SegrulesState& segrulesState) const {
if (this->options.debug) {
cerr << "----------" << endl;
|
|
327
|
cerr << "doProcessOneWord: '" << reader.getCurrPtr() << "', already recognized: " << debugAccum(accum) << endl;
|
|
328
|
}
|
|
329
|
StateType state = env.getFSA().getInitialState();
|
|
330
|
string homonymId;
|
|
331
332
333
334
335
|
while (!reader.isAtWhitespace()) {
feedState(env, state, reader);
if (state.isSink()) {
return;
}
|
|
336
|
reader.next();
|
|
337
338
339
340
341
342
343
|
if (env.getProcessorType() == GENERATOR && reader.getCurrPtr() != reader.getEndPtr() && reader.peek() == (uint32_t) HOMONYM_SEPARATOR) {
homonymId = env.getCharsetConverter().fromUTF8(string(reader.getCurrPtr() + 1, reader.getEndPtr()));
reader.proceedToEnd();
}
if (state.isAccepting()) {
InterpsGroupsReader& igReader = const_cast<InterpsGroupsReader&> (state.getValue());
while (igReader.hasNext()) {
|
|
344
|
processInterpsGroup(env, reader, reader.isAtWhitespace(), segrulesState, homonymId, igReader.getNext());
|
|
345
346
|
}
}
|
|
347
|
}
|
|
348
349
|
}
|
|
350
|
void MorfeuszImpl::processInterpsGroup(
|
|
351
352
353
354
355
|
const Environment& env,
const TextReader& reader,
bool isAtWhitespace,
const SegrulesState& segrulesState,
const string& homonymId,
|
|
356
|
const InterpsGroup& ig) const {
|
|
357
|
if (this->options.debug) {
|
|
358
|
std::cerr << "processInterpsGroup, segmentType=" << (int) ig.type << std::endl;
|
|
359
|
}
|
|
360
|
bool caseMatches = env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, reader.getWordStartPtr(), reader.getCurrPtr(), ig);
|
|
361
|
if (caseMatches || options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
|
|
362
|
SegrulesState newSegrulesState;
|
|
363
364
|
env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, isAtWhitespace, newSegrulesState);
if (!newSegrulesState.failed) {
|
|
365
|
|
|
366
367
368
369
370
371
372
373
374
375
376
377
|
InterpretedChunk ic(
createChunk(ig, reader, newSegrulesState.shiftOrthFromPrevious, homonymId));
processInterpretedChunk(
env,
reader,
isAtWhitespace,
caseMatches,
newSegrulesState,
ic);
}
else if (this->options.debug) {
|
|
378
|
std::cerr << "NOT ACCEPTING (segmentation)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl;
|
|
379
|
}
|
|
380
381
|
}
else if (this->options.debug) {
|
|
382
383
384
385
|
std::cerr << "NOT ACCEPTING (case)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl;
}
}
|
|
386
|
void MorfeuszImpl::processInterpretedChunk(
|
|
387
388
389
390
391
392
|
const Environment& env,
const TextReader& reader,
bool isAtWhitespace,
bool caseMatches,
const SegrulesState& newSegrulesState,
InterpretedChunk& ic) const {
|
|
393
|
bool orthShifted = false;
|
|
394
395
|
if (!accum.empty() && accum.back().shiftOrth) {
doShiftOrth(accum.back(), ic);
|
|
396
|
orthShifted = true;
|
|
397
|
}
|
|
398
|
if (!caseMatches && options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
|
|
399
400
401
402
403
404
405
|
notMatchingCaseSegs++;
ic.forceIgnoreCase = true;
}
accum.push_back(ic);
if (isAtWhitespace) {
assert(newSegrulesState.accepting);
if (this->options.debug) {
|
|
406
|
cerr << "ACCEPTING " << debugAccum(accum) << " prefixChunks: " << debugAccum(accum.back().prefixChunks) << endl;
|
|
407
408
|
}
graph.addPath(accum, newSegrulesState.weak || notMatchingCaseSegs > 0);
|
|
409
410
|
}
else {
|
|
411
412
413
414
415
|
assert(!newSegrulesState.sink);
TextReader newReader(reader.getCurrPtr(), reader.getEndPtr(), env);
doProcessOneWord(env, newReader, newSegrulesState);
}
accum.pop_back();
|
|
416
417
418
|
if (orthShifted) {
doUnshiftOrth(accum.back(), ic);
}
|
|
419
|
if (!caseMatches && options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
|
|
420
|
notMatchingCaseSegs--;
|
|
421
|
}
|
|
422
|
}
|
|
423
|
|
|
424
|
void MorfeuszImpl::processWhitespacesChunk(
|
|
425
426
427
428
|
TextReader& reader,
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
string orth(reader.readWhitespacesChunk());
|
|
429
|
results.push_back(MorphInterpretation::createWhitespace(startNodeNum, startNodeNum + 1, orth));
|
|
430
|
}
|
|
431
|
|
|
432
|
void MorfeuszImpl::handleIgnChunk(
|
|
433
|
const Environment& env,
|
|
434
|
const ChunkBounds& chunkBounds,
|
|
435
436
|
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
|
|
437
|
const char* currInput = chunkBounds.chunkStartPtr;
|
|
438
|
const char* prevInput = currInput;
|
|
439
440
|
uint32_t codepoint = 0x00;
bool separatorFound = false;
|
|
441
|
while (currInput != chunkBounds.chunkEndPtr) {
|
|
442
443
444
|
prevInput = currInput;
const char* nonSeparatorInputEnd = prevInput;
do {
|
|
445
|
codepoint = env.getCharsetConverter().next(currInput, chunkBounds.chunkEndPtr);
|
|
446
447
448
|
if (!env.isSeparator(codepoint)) {
nonSeparatorInputEnd = currInput;
}
|
|
449
|
} while (currInput != chunkBounds.chunkEndPtr && !env.isSeparator(codepoint));
|
|
450
|
|
|
451
452
453
|
if (env.isSeparator(codepoint)) {
separatorFound = true;
if (nonSeparatorInputEnd != prevInput) {
|
|
454
|
// there are non-separators + separators
|
|
455
|
|
|
456
|
int startNode = results.empty() ? startNodeNum : results.back().endNode;
|
|
457
|
// process part before separators
|
|
458
459
460
|
TextReader newReader1(prevInput, nonSeparatorInputEnd, env);
notMatchingCaseSegs = 0;
this->processOneWord(env, newReader1, startNode, results, true);
|
|
461
|
|
|
462
463
464
465
|
// process separators part
if (currInput == chunkBounds.wordEndPtr) {
currInput = chunkBounds.chunkEndPtr;
}
|
|
466
|
startNode = results.empty() ? startNodeNum : results.back().endNode;
|
|
467
468
|
TextReader newReader2(nonSeparatorInputEnd, currInput, env);
this->processOneWord(env, newReader2, startNode, results, true);
|
|
469
|
} else {
|
|
470
471
472
473
|
// there are only separators
if (currInput == chunkBounds.wordEndPtr) {
currInput = chunkBounds.chunkEndPtr;
}
|
|
474
|
int startNode = results.empty() ? startNodeNum : results.back().endNode;
|
|
475
476
477
478
|
TextReader newReader3(prevInput, currInput, env);
notMatchingCaseSegs = 0;
this->processOneWord(env, newReader3, startNode, results, true);
}
|
|
479
|
}
|
|
480
481
|
}
|
|
482
|
// currInput == chunkBounds.chunkEndPtr
|
|
483
484
|
if (!env.isSeparator(codepoint)) {
if (separatorFound) {
|
|
485
|
// process part after separators
|
|
486
|
int startNode = results.empty() ? startNodeNum : results.back().endNode;
|
|
487
|
TextReader newReader4(prevInput, chunkBounds.chunkEndPtr, env);
|
|
488
|
this->processOneWord(env, newReader4, startNode, results, true);
|
|
489
|
} else {
|
|
490
|
this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
|
|
491
492
493
494
|
}
}
}
|
|
495
|
void MorfeuszImpl::appendIgnotiumToResults(
|
|
496
|
const Environment& env,
|
|
497
|
const ChunkBounds& chunkBounds,
|
|
498
499
|
int startNodeNum,
std::vector<MorphInterpretation>& results) const {
|
|
500
501
|
string orth(chunkBounds.chunkStartPtr, chunkBounds.chunkEndPtr);
string lemma(chunkBounds.wordStartPtr, chunkBounds.wordEndPtr);
|
|
502
|
results.push_back(MorphInterpretation::createIgn(startNodeNum, startNodeNum + 1, orth, lemma));
|
|
503
504
|
}
|
|
505
|
void MorfeuszImpl::analyseOneWord(
|
|
506
507
508
509
|
TextReader& reader,
vector<MorphInterpretation>& results) const {
this->processOneWord(this->analyzerEnv, reader, nextNodeNum, results);
if (!results.empty()) {
|
|
510
|
nextNodeNum = results.back().endNode;
|
|
511
|
}
|
|
512
|
}
|
|
513
|
|
|
514
|
void MorfeuszImpl::adjustTokensCounter() const {
|
|
515
|
if (options.tokenNumbering == SEPARATE_NUMBERING) {
|
|
516
|
nextNodeNum = 0;
|
|
517
|
}
|
|
518
|
}
|
|
519
|
|
|
520
|
/**
 * Starts a lazy analysis of `text` and returns an iterator over the results.
 *
 * The text is copied into a freshly allocated, NUL-terminated buffer which is
 * handed to the iterator together with the flag `true` (presumably "iterator
 * owns the buffer" - confirm against ResultsIteratorImpl).
 *
 * The copy is made with memcpy over text.length() bytes. strcpy would stop at
 * the first embedded '\0' while the end pointer still covers the full length,
 * leaving the tail of the buffer uninitialized.
 *
 * @param text input text in the currently configured encoding
 * @return a new ResultsIterator allocated with `new`
 * @throws MorfeuszException if this instance cannot analyse
 */
ResultsIterator* MorfeuszImpl::analyse(const std::string& text) const {
    ensureIsAnalyzer();
    adjustTokensCounter();

    const size_t length = text.length();
    char* textCopy = new char[length + 1];
    std::memcpy(textCopy, text.data(), length);
    textCopy[length] = '\0';

    try {
        return new ResultsIteratorImpl(*this, textCopy, textCopy + length, true);
    }
    catch (...) {
        // the iterator was never constructed, so nobody else will free the copy
        delete[] textCopy;
        throw;
    }
}
|
|
529
|
|
|
530
|
/**
 * Starts a lazy analysis of the NUL-terminated string `text`, working on a
 * private copy so the caller may release `text` right after the call.
 *
 * The copy is handed to the iterator together with the flag `true`
 * (presumably "iterator owns the buffer" - confirm against
 * ResultsIteratorImpl).
 *
 * @param text NUL-terminated input in the currently configured encoding;
 *             must not be null
 * @return a new ResultsIterator allocated with `new`
 * @throws MorfeuszException if this instance cannot analyse
 */
ResultsIterator* MorfeuszImpl::analyseWithCopy(const char* text) const {
    ensureIsAnalyzer();
    adjustTokensCounter();

    // size_t, not long: strlen's result is unsigned and may not fit a signed long
    const size_t length = std::strlen(text);
    char* textCopy = new char[length + 1];
    // the length is already known, so copy it (plus the terminator) in one pass
    std::memcpy(textCopy, text, length + 1);

    try {
        return new ResultsIteratorImpl(*this, textCopy, textCopy + length, true);
    }
    catch (...) {
        // the iterator was never constructed, so nobody else will free the copy
        delete[] textCopy;
        throw;
    }
}
|
|
540
|
|
|
541
|
/**
 * Starts a lazy analysis of the NUL-terminated string `text` without copying
 * it: the iterator receives pointers into the caller's buffer and the flag
 * `false`.
 *
 * @throws MorfeuszException if this instance cannot analyse
 */
ResultsIterator* MorfeuszImpl::analyse(const char* text) const {
    ensureIsAnalyzer();
    adjustTokensCounter();
    const char* const textEnd = text + strlen(text);
    return new ResultsIteratorImpl(*this, text, textEnd, false);
}
|
|
548
|
|
|
549
|
void MorfeuszImpl::analyse(const string& text, vector<MorphInterpretation>& results) const {
|
|
550
551
552
|
ensureIsAnalyzer();
|
|
553
554
555
|
adjustTokensCounter();
TextReader reader(text, this->analyzerEnv);
while (!reader.isAtEnd()) {
|
|
556
|
analyseOneWord(reader, results);
|
|
557
|
}
|
|
558
|
}
|
|
559
|
|
|
560
|
void MorfeuszImpl::generate(const string& lemma, vector<MorphInterpretation>& results) const {
|
|
561
562
563
|
ensureIsGenerator();
|
|
564
565
566
567
568
569
570
571
|
const char* input = lemma.c_str();
const char* inputEnd = input + lemma.length();
int startNode = 0;
TextReader reader(input, inputEnd, this->generatorEnv);
this->processOneWord(this->generatorEnv, reader, startNode, results);
if (reader.getCurrPtr() != reader.getEndPtr()) {
throw MorfeuszException("Input contains more than one word");
}
|
|
572
573
|
}
|
|
574
|
void MorfeuszImpl::generate(const std::string& lemma, int tagId, vector<MorphInterpretation>& result) const {
|
|
575
576
|
ensureIsGenerator();
|
|
577
|
|
|
578
579
580
|
if (tagId >= this->generatorEnv.getIdResolver().getTagsCount()) {
throw MorfeuszException("Invalid tagId (outside of tagset)");
}
|
|
581
|
|
|
582
583
584
|
vector<MorphInterpretation> partRes;
this->generate(lemma, partRes);
for (unsigned int i = 0; i < partRes.size(); i++) {
|
|
585
|
// XXX - someday it should be improved
|
|
586
|
if (partRes[i].tagId == tagId) {
|
|
587
588
|
result.push_back(partRes[i]);
}
|
|
589
590
591
|
}
}
|
|
592
|
/** Sets the text encoding in both environments and records it in the options. */
void MorfeuszImpl::setCharset(Charset charset) {
    analyzerEnv.setCharset(charset);
    generatorEnv.setCharset(charset);
    options.encoding = charset;
}
|
|
597
598
599
600
|
/** Returns the encoding recorded in the options. */
Charset MorfeuszImpl::getCharset() const {
    return options.encoding;
}
|
|
601
|
|
|
602
|
void MorfeuszImpl::setAggl(const std::string& aggl) {
|
|
603
604
605
|
this->analyzerEnv.setSegrulesOption("aggl", aggl);
this->generatorEnv.setSegrulesOption("aggl", aggl);
}
|
|
606
607
608
609
|
string MorfeuszImpl::getAggl() const {
return getAnyEnvironment().getSegrulesOption("aggl");
}
|
|
610
|
|
|
611
|
void MorfeuszImpl::setPraet(const std::string& praet) {
|
|
612
613
614
|
this->analyzerEnv.setSegrulesOption("praet", praet);
this->generatorEnv.setSegrulesOption("praet", praet);
}
|
|
615
616
617
618
|
string MorfeuszImpl::getPraet() const {
return getAnyEnvironment().getSegrulesOption("praet");
}
|
|
619
|
|
|
620
|
/**
 * Sets the case-handling mode. The analyzer environment becomes case
 * sensitive for every mode except IGNORE_CASE; the generator environment is
 * not touched.
 *
 * @throws std::invalid_argument if `caseHandling` is not one of the known values
 */
void MorfeuszImpl::setCaseHandling(CaseHandling caseHandling) {
    const bool known =
            caseHandling == IGNORE_CASE
            || caseHandling == CONDITIONALLY_CASE_SENSITIVE
            || caseHandling == STRICTLY_CASE_SENSITIVE;
    if (!known) {
        throw std::invalid_argument("Invalid caseHandling option");
    }
    options.caseHandling = caseHandling;
    analyzerEnv.setCaseSensitive(caseHandling != IGNORE_CASE);
}
|
|
632
633
634
635
|
/** Returns the case-handling mode recorded in the options. */
CaseHandling MorfeuszImpl::getCaseHandling() const {
    return options.caseHandling;
}
|
|
636
|
|
|
637
|
/**
 * Sets the token-numbering mode and restarts node numbering from 0.
 *
 * @throws std::invalid_argument if `tokenNumbering` is not one of the known values
 */
void MorfeuszImpl::setTokenNumbering(TokenNumbering tokenNumbering) {
    const bool known =
            tokenNumbering == SEPARATE_NUMBERING
            || tokenNumbering == CONTINUOUS_NUMBERING;
    if (!known) {
        throw std::invalid_argument("Invalid tokenNumbering option");
    }
    options.tokenNumbering = tokenNumbering;
    nextNodeNum = 0;
}
|
|
648
649
650
651
|
/** Returns the token-numbering mode recorded in the options. */
TokenNumbering MorfeuszImpl::getTokenNumbering() const {
    return options.tokenNumbering;
}
|
|
652
|
|
|
653
|
/**
 * Sets the whitespace-handling mode.
 *
 * @throws std::invalid_argument if `whitespaceHandling` is not one of the known values
 */
void MorfeuszImpl::setWhitespaceHandling(WhitespaceHandling whitespaceHandling) {
    const bool known =
            whitespaceHandling == SKIP_WHITESPACES
            || whitespaceHandling == APPEND_WHITESPACES
            || whitespaceHandling == KEEP_WHITESPACES;
    if (!known) {
        throw std::invalid_argument("Invalid whitespaceHandling option");
    }
    options.whitespaceHandling = whitespaceHandling;
}
|
|
664
665
666
667
|
/** Returns the whitespace-handling mode recorded in the options. */
WhitespaceHandling MorfeuszImpl::getWhitespaceHandling() const {
    return options.whitespaceHandling;
}
|
|
668
|
|
|
669
|
void MorfeuszImpl::setDebug(bool debug) {
|
|
670
671
|
this->options.debug = debug;
}
|
|
672
|
|
|
673
674
675
676
677
678
|
/**
 * Returns the id resolver of the analyzer environment when that environment
 * is usable, and the generator environment's resolver otherwise.
 */
const IdResolver& MorfeuszImpl::getIdResolver() const {
    // getAnyEnvironment() makes exactly this analyzer-first choice
    return getAnyEnvironment().getIdResolver();
}
|
|
681
682
683
684
|
/**
 * Guard for analysis entry points.
 *
 * @throws MorfeuszException unless `usage` is ANALYSE_ONLY or BOTH_ANALYSE_AND_GENERATE
 */
void MorfeuszImpl::ensureIsAnalyzer() const {
    const bool canAnalyse = usage == ANALYSE_ONLY || usage == BOTH_ANALYSE_AND_GENERATE;
    if (!canAnalyse) {
        throw MorfeuszException("Cannot analyse with given Morfeusz instance.");
    }
}
|
|
686
687
688
689
690
|
/**
 * Guard for generation entry points.
 *
 * @throws MorfeuszException unless `usage` is GENERATE_ONLY or BOTH_ANALYSE_AND_GENERATE
 */
void MorfeuszImpl::ensureIsGenerator() const {
    const bool canGenerate = usage == GENERATE_ONLY || usage == BOTH_ANALYSE_AND_GENERATE;
    if (!canGenerate) {
        throw MorfeuszException("Cannot generate with given Morfeusz instance.");
    }
}
|
|
692
|
}
|