Blame view

morfeusz/MorfeuszImpl.cpp 25.4 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
/* 
 * File:   MorfeuszImpl.cpp
 * Author: mlenart
 * 
 * Created on November 13, 2013, 5:21 PM
 */

#include <string>
#include <iostream>
Michał Lenart authored
10
#include <vector>
Michał Lenart authored
11
#include <cstring>
Michał Lenart authored
12
#include <stdexcept>
Michał Lenart authored
13
14
#include "fsa/fsa.hpp"
#include "utils.hpp"
Michał Lenart authored
15
#include "MorfeuszImpl.hpp"
Michał Lenart authored
16
17
18
19
20
21
22
#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp"
#include "charset/CharsetConverter.hpp"
#include "charset/charset_utils.hpp"
#include "case/CaseConverter.hpp"
#include "segrules/segrules.hpp"
#include "const.hpp"
#include "charset/utf8.h"
Michał Lenart authored
23
#include "ChunkBounds.hpp"
Michał Lenart authored
24
#include "DictionariesRepository.hpp"
Michał Lenart authored
25
26
27
28
29
30
31

// TODO - a copy constructor that works the way it should

using namespace std;

namespace morfeusz {
Michał Lenart authored
32
33
    /**
     * Builds the option set a freshly constructed MorfeuszImpl starts with.
     *
     * @return options with conditional case sensitivity, UTF8 encoding,
     *         separate token numbering, whitespace skipping and debug off
     */
    static MorfeuszOptions createDefaultOptions() {
        MorfeuszOptions defaults;
        defaults.debug = false;
        defaults.whitespaceHandling = SKIP_WHITESPACES;
        defaults.tokenNumbering = SEPARATE_NUMBERING;
        defaults.encoding = UTF8;
        defaults.caseHandling = CONDITIONALLY_CASE_SENSITIVE;
        return defaults;
    }
Michał Lenart authored
41
Michał Lenart authored
42
43
44
45
    /**
     * Formats one segment as "(<type>, <text>), " for debug traces.
     *
     * @param type     segment type; printed as a decimal number
     * @param startPtr first byte of the segment text
     * @param endPtr   one past the last byte of the segment text
     * @return the formatted fragment, including the trailing ", "
     */
    static std::string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
        const std::string text(startPtr, endPtr);
        std::stringstream out;
        out << "(" << static_cast<int>(type) << ", " << text << "), ";
        return out.str();
    }
Michał Lenart authored
48
49
50
    /**
     * Concatenates the debug form of every chunk in the given list, in order.
     * Each chunk is rendered by debugInterpsGroup() from its segment type and
     * the text range [textNoPrefixesStartPtr, textEndPtr).
     *
     * @param accum chunks to render (not modified)
     * @return the concatenated debug fragments; empty for an empty list
     */
    static string debugAccum(vector<InterpretedChunk>& accum) {
        stringstream out;
        for (vector<InterpretedChunk>::const_iterator chunk = accum.begin(); chunk != accum.end(); ++chunk) {
            out << debugInterpsGroup(chunk->segmentType, chunk->textNoPrefixesStartPtr, chunk->textEndPtr);
        }
        return out.str();
    }
Michał Lenart authored
56
    /**
     * Moves the orth of chunk `from` onto chunk `to`: `to` takes over the
     * prefix chunks of `from`, gets a copy of `from` appended to them and
     * starts its text where `from` started.
     *
     * The prefix lists are swapped rather than copied; doUnshiftOrth() swaps
     * them back. The copy of `from` is taken before its orthWasShifted flag is
     * raised, so the stored copy still has the flag's previous value.
     */
    static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
        to.textStartPtr = from.textStartPtr;
        // from.prefixChunks are ignored anyway while shifted
        from.prefixChunks.swap(to.prefixChunks);
        to.prefixChunks.push_back(from);
        from.orthWasShifted = true;
    }
Michał Lenart authored
62
63
64
65
66

    /**
     * Reverts the prefix-list exchange made by doShiftOrth(): swaps the prefix
     * chunks of the two chunks back and drops the last entry from the list
     * that ends up in `from` (the copy doShiftOrth() appended).
     */
    static void doUnshiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
        from.prefixChunks.swap(to.prefixChunks);
        from.prefixChunks.pop_back();
    }
Michał Lenart authored
67
Michał Lenart authored
68
69
70
71
72
73
74
75
76
    /**
     * Advances the automaton state over the raw bytes [inputStart, inputEnd),
     * stopping early as soon as the state becomes a sink.
     */
    static void feedStateDirectly(
            const FSAType& fsa,
            StateType& state,
            const char* inputStart,
            const char* inputEnd) {
        for (const char* cursor = inputStart; cursor != inputEnd && !state.isSink(); ++cursor) {
            state.proceedToNext(fsa, *cursor);
        }
    }
Michał Lenart authored
79
80
81
82
83
84
85
86
87
    /**
     * Advances the automaton state over a single codepoint: the codepoint is
     * first appended to a temporary string by the UTF8 charset converter and
     * the resulting bytes are fed one by one until the state becomes a sink.
     */
    static void feedStateIndirectly(
            const FSAType& fsa,
            StateType& state,
            uint32_t codepoint) {
        std::string encoded;
        UTF8CharsetConverter::getInstance().append(codepoint, encoded);
        std::string::const_iterator byteIt = encoded.begin();
        while (byteIt != encoded.end() && !state.isSink()) {
            state.proceedToNext(fsa, *byteIt);
            ++byteIt;
        }
    }
Michał Lenart authored
89
90
91
92
93
94
95

    static void feedState(
            const Environment& env,
            StateType& state,
            TextReader& reader) {
        if (reader.peek() == reader.normalizedPeek() && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
            feedStateDirectly(env.getFSA(), state, reader.getCurrPtr(), reader.getNextPtr());
Michał Lenart authored
96
        } else {
Michał Lenart authored
97
98
            feedStateIndirectly(env.getFSA(), state, reader.normalizedPeek());
        }
Michał Lenart authored
99
100
    }
Michał Lenart authored
101
102
103
104
105
106
107
108
    /**
     * Builds an InterpretedChunk for the interpretations group `ig`, covering
     * the text from the reader's word start up to its current position.
     *
     * @param ig        interpretations group the chunk refers to
     * @param reader    reader positioned right after the recognized text
     * @param shiftOrth stored in the chunk's shiftOrth field
     * @param homonymId required homonym id; when non-empty, the text end is
     *                  moved back by its length plus one
     *                  (presumably the separator character - TODO confirm)
     * @return the initialized chunk
     */
    static InterpretedChunk createChunk(
            const InterpsGroup& ig,
            const TextReader& reader,
            bool shiftOrth,
            const string& homonymId) {
        InterpretedChunk chunk;

        // interpretations data
        chunk.segmentType = ig.type;
        chunk.interpsGroupPtr = ig.ptr;
        chunk.interpsEndPtr = ig.ptr + ig.size;

        // text range
        chunk.textStartPtr = reader.getWordStartPtr(); // may be changed later in doShiftOrth(...) function
        chunk.textNoPrefixesStartPtr = chunk.textStartPtr;
        const char* textEnd = reader.getCurrPtr();
        if (!homonymId.empty()) {
            textEnd -= homonymId.length() + 1;
        }
        chunk.textEndPtr = textEnd;
        chunk.codepointsNum = reader.getCodepointsRead();

        // flags
        chunk.shiftOrth = shiftOrth;
        chunk.orthWasShifted = false;
        chunk.forceIgnoreCase = false;
        chunk.requiredHomonymId = homonymId;
        return chunk;
    }
Michał Lenart authored
121
Michał Lenart authored
122
123
    /**
     * Creates an instance bound to the dictionary `dictName`.
     *
     * Both environments are constructed for that dictionary; the third
     * constructor argument is false for the side excluded by `usage`
     * (presumably a "usable" flag - TODO confirm against Environment).
     * Options start from createDefaultOptions(); the analyzer's case
     * sensitivity follows the case handling option, the generator is always
     * case-insensitive.
     */
    MorfeuszImpl::MorfeuszImpl(const string& dictName, MorfeuszUsage usage)
    : currDictionary(dictName),
      usage(usage),
      analyzerEnv(dictName, ANALYZER, usage != GENERATE_ONLY),
      generatorEnv(dictName, GENERATOR, usage != ANALYSE_ONLY),
      options(createDefaultOptions()),
      accum(),
      notMatchingCaseSegs(0),
      graph(),
      nextNodeNum(0)
    {
        const bool analyzerCaseSensitive = (options.caseHandling != IGNORE_CASE);
        analyzerEnv.setCaseSensitive(analyzerCaseSensitive);
        generatorEnv.setCaseSensitive(false);
    }
Michał Lenart authored
135
136
137
138
139

    /**
     * Creates a heap-allocated copy of this instance via the copy constructor.
     *
     * @return pointer to the new copy, allocated with `new`
     */
    Morfeusz* MorfeuszImpl::clone() const {
        MorfeuszImpl* copy = new MorfeuszImpl(*this);
        return copy;
    }
Michał Lenart authored
140
141
142
143
144
145
146
147
    /**
     * @return the `id` of the dictionary currently set in whichever
     *         environment getAnyEnvironment() selects
     */
    string MorfeuszImpl::getDictID() const {
        const Environment& env = getAnyEnvironment();
        return env.getCurrentDictionary()->id;
    }

    /**
     * @return the `copyright` of the dictionary currently set in whichever
     *         environment getAnyEnvironment() selects
     */
    string MorfeuszImpl::getDictCopyright() const {
        const Environment& env = getAnyEnvironment();
        return env.getCurrentDictionary()->copyright;
    }
Michał Lenart authored
148
    void MorfeuszImpl::setDictionary(const string& dictName) {
Michał Lenart authored
149
150
151
152
153
154

        if (dictName != currDictionary) {

            doSetDictionary(dictName);

            currDictionary = dictName;
Michał Lenart authored
155
        }
Michał Lenart authored
156
157
158
159
160
161
    }

    void MorfeuszImpl::doSetDictionary(const string& dictName) {
        switch (usage) {
            case BOTH_ANALYSE_AND_GENERATE:
            {
Michał Lenart authored
162
163
                const Dictionary* analyzerDict = DictionariesRepository::getInstance().getDictionary(dictName, ANALYZER);
                const Dictionary* generatorDict = DictionariesRepository::getInstance().getDictionary(dictName, GENERATOR);
Michał Lenart authored
164
165
166
                if (analyzerDict->isCompatibleWith(*generatorDict)) {
                    analyzerEnv.setDictionary(analyzerDict);
                    generatorEnv.setDictionary(generatorDict);
Michał Lenart authored
167
168
                } 
                else {
Michał Lenart authored
169
170
171
172
173
                    throw MorfeuszException("Analyzer and generator dictionaries are incompatible");
                }
            }
                break;
            case ANALYSE_ONLY:
Michał Lenart authored
174
                analyzerEnv.setDictionary(DictionariesRepository::getInstance().getDictionary(dictName, ANALYZER));
Michał Lenart authored
175
176
                break;
            case GENERATE_ONLY:
Michał Lenart authored
177
                generatorEnv.setDictionary(DictionariesRepository::getInstance().getDictionary(dictName, GENERATOR));
Michał Lenart authored
178
                break;
Michał Lenart authored
179
        }
Michał Lenart authored
180
    }
Michał Lenart authored
181
Michał Lenart authored
182
183
184
    /**
     * @return the analyzer environment when it reports itself usable,
     *         the generator environment otherwise
     */
    const Environment& MorfeuszImpl::getAnyEnvironment() const {
        return analyzerEnv.isUsable() ? analyzerEnv : generatorEnv;
    }
Michał Lenart authored
190
    /**
     * @return the set of available "aggl" option values, as reported by the
     *         environment chosen by getAnyEnvironment()
     */
    const set<string>& MorfeuszImpl::getAvailableAgglOptions() const {
        const Environment& env = getAnyEnvironment();
        return env.getAvailableAgglOptions();
    }
Michał Lenart authored
193
Michał Lenart authored
194
    /**
     * @return the set of available "praet" option values, as reported by the
     *         environment chosen by getAnyEnvironment()
     */
    const set<string>& MorfeuszImpl::getAvailablePraetOptions() const {
        const Environment& env = getAnyEnvironment();
        return env.getAvailablePraetOptions();
    }
Michał Lenart authored
198
    /** Destructor; the body performs no explicit cleanup. */
    MorfeuszImpl::~MorfeuszImpl() {}
Michał Lenart authored
200
Michał Lenart authored
201
202
203
204
205
206
    const char* getWordEndPtr(const TextReader& reader, const Environment& env) {
        TextReader tmpReader(reader.getCurrPtr(), reader.getEndPtr(), env);
        while (!tmpReader.isAtEnd() && !tmpReader.isAtWhitespace()) {
            tmpReader.next();
        }
        return tmpReader.getCurrPtr();
Michał Lenart authored
207
208
    }
Michał Lenart authored
209
    bool MorfeuszImpl::handleWhitespacesAtBeginning(
Michał Lenart authored
210
211
212
213
            const Environment& env,
            TextReader& reader,
            int startNodeNum,
            std::vector<MorphInterpretation>& results) const {
Michał Lenart authored
214
215
        if (env.getProcessorType() == ANALYZER) {
            switch (options.whitespaceHandling) {
Michał Lenart authored
216
                case KEEP_WHITESPACES:
Michał Lenart authored
217
218
219
220
221
222
223
224
225
                {
                    bool res = reader.isAtWhitespace() && !reader.isAtEnd();
                    if (res) {
                        processWhitespacesChunk(reader, startNodeNum, results);
                    }
                    reader.markChunkStartsHere();
                    reader.markWordStartsHere();
                    return res;
                }
Michał Lenart authored
226
                case APPEND_WHITESPACES:
Michał Lenart authored
227
228
229
230
                    reader.markChunkStartsHere();
                    reader.skipWhitespaces();
                    reader.markWordStartsHere();
                    return false;
Michał Lenart authored
231
                case SKIP_WHITESPACES:
Michał Lenart authored
232
233
234
235
236
237
                    reader.skipWhitespaces();
                    reader.markChunkStartsHere();
                    reader.markWordStartsHere();
                    return false;
                default:
                    break;
Michał Lenart authored
238
239
            }
        }
Michał Lenart authored
240
Michał Lenart authored
241
242
        return false;
    }
Michał Lenart authored
243
Michał Lenart authored
244
    const char* MorfeuszImpl::handleWhitespacesAtEnd(
Michał Lenart authored
245
246
247
            const Environment& env,
            TextReader& reader) const {
        if (env.getProcessorType() == ANALYZER
Michał Lenart authored
248
                && options.whitespaceHandling == APPEND_WHITESPACES) {
Michał Lenart authored
249
250
            reader.skipWhitespaces();
        }
Michał Lenart authored
251
        return reader.getCurrPtr();
Michał Lenart authored
252
    }
Michał Lenart authored
253
Michał Lenart authored
254
    void MorfeuszImpl::processOneWord(
Michał Lenart authored
255
256
257
            const Environment& env,
            TextReader& reader,
            int startNodeNum,
Michał Lenart authored
258
            vector<MorphInterpretation>& results,
Michał Lenart authored
259
            bool insideIgnHandler) const {
Michał Lenart authored
260
        if (handleWhitespacesAtBeginning(env, reader, startNodeNum, results)) {
Michał Lenart authored
261
            startNodeNum = results.back().endNode;
Michał Lenart authored
262
        }
Michał Lenart authored
263
Michał Lenart authored
264
265
266
        if (reader.isAtEnd()) {
            return;
        }
Michał Lenart authored
267
        accum.resize(0);
Michał Lenart authored
268
269
        notMatchingCaseSegs = 0;
        graph.clear();
Michał Lenart authored
270
Michał Lenart authored
271
        const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
Michał Lenart authored
272
Michał Lenart authored
273
274
275
276
277
        doProcessOneWord(env, reader, segrulesFSA.initialState);

        while (reader.isInsideAWord()) {
            reader.next();
        }
Michał Lenart authored
278
Michał Lenart authored
279
280
281
282
283
        ChunkBounds chunkBounds;
        chunkBounds.chunkStartPtr = reader.getChunkStartPtr();
        chunkBounds.wordStartPtr = reader.getWordStartPtr();
        chunkBounds.wordEndPtr = reader.getCurrPtr();
        chunkBounds.chunkEndPtr = handleWhitespacesAtEnd(env, reader);
Michał Lenart authored
284
Michał Lenart authored
285
286
287
288
289
290
291
292
293
294
        if (!graph.empty()) {
            const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
            int srcNode = startNodeNum;
            const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph();
            size_t initialResultsSize = results.size();
            for (unsigned int i = 0; i < theGraph.size(); i++) {
                const vector<InflexionGraph::Edge>& edges = theGraph[i];
                for (unsigned int j = 0; j < edges.size(); j++) {
                    const InflexionGraph::Edge& e = edges[j];
                    unsigned int targetNode = startNodeNum + e.nextNode;
Michał Lenart authored
295
                    InterpretedChunk ic = e.chunk;
Michał Lenart authored
296
                    ic.chunkStartPtr =
Michał Lenart authored
297
298
299
                            ic.textStartPtr == reader.getWordStartPtr()
                            ? reader.getChunkStartPtr()
                            : ic.textStartPtr;
Michał Lenart authored
300
301
302
303
                    ic.chunkEndPtr = 
                            ic.textEndPtr == chunkBounds.wordEndPtr
                            ? chunkBounds.chunkEndPtr
                            : ic.textEndPtr;
Michał Lenart authored
304
                    interpretedChunksDecoder.decode(srcNode, targetNode, ic, results);
Michał Lenart authored
305
306
307
308
                }
                srcNode++;
            }
            if (results.size() == initialResultsSize) {
Michał Lenart authored
309
                this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
Michał Lenart authored
310
            }
Michał Lenart authored
311
312
        } 
        else if (env.getProcessorType() == ANALYZER
Michał Lenart authored
313
                && !insideIgnHandler) {
Michał Lenart authored
314
            this->handleIgnChunk(env, chunkBounds, startNodeNum, results);
Michał Lenart authored
315
316
        } 
        else {
Michał Lenart authored
317
            this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
Michał Lenart authored
318
        }
Michał Lenart authored
319
320
    }
Michał Lenart authored
321
    void MorfeuszImpl::doProcessOneWord(
Michał Lenart authored
322
323
324
325
326
            const Environment& env,
            TextReader& reader,
            const SegrulesState& segrulesState) const {
        if (this->options.debug) {
            cerr << "----------" << endl;
Michał Lenart authored
327
            cerr << "doProcessOneWord: '" << reader.getCurrPtr() << "', already recognized: " << debugAccum(accum) << endl;
Michał Lenart authored
328
        }
Michał Lenart authored
329
        StateType state = env.getFSA().getInitialState();
Michał Lenart authored
330
        string homonymId;
Michał Lenart authored
331
332
333
334
335
        while (!reader.isAtWhitespace()) {
            feedState(env, state, reader);
            if (state.isSink()) {
                return;
            }
Michał Lenart authored
336
            reader.next();
Michał Lenart authored
337
338
339
340
341
342
343
            if (env.getProcessorType() == GENERATOR && reader.getCurrPtr() != reader.getEndPtr() && reader.peek() == (uint32_t) HOMONYM_SEPARATOR) {
                homonymId = env.getCharsetConverter().fromUTF8(string(reader.getCurrPtr() + 1, reader.getEndPtr()));
                reader.proceedToEnd();
            }
            if (state.isAccepting()) {
                InterpsGroupsReader& igReader = const_cast<InterpsGroupsReader&> (state.getValue());
                while (igReader.hasNext()) {
Michał Lenart authored
344
                    processInterpsGroup(env, reader, reader.isAtWhitespace(), segrulesState, homonymId, igReader.getNext());
Michał Lenart authored
345
346
                }
            }
Michał Lenart authored
347
        }
Michał Lenart authored
348
349
    }
Michał Lenart authored
350
    void MorfeuszImpl::processInterpsGroup(
Michał Lenart authored
351
352
353
354
355
            const Environment& env,
            const TextReader& reader,
            bool isAtWhitespace,
            const SegrulesState& segrulesState,
            const string& homonymId,
Michał Lenart authored
356
            const InterpsGroup& ig) const {
Michał Lenart authored
357
        if (this->options.debug) {
Michał Lenart authored
358
            std::cerr << "processInterpsGroup, segmentType=" << (int) ig.type << std::endl;
Michał Lenart authored
359
        }
Michał Lenart authored
360
        bool caseMatches = env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, reader.getWordStartPtr(), reader.getCurrPtr(), ig);
Michał Lenart authored
361
        if (caseMatches || options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
Michał Lenart authored
362
            SegrulesState newSegrulesState;
Michał Lenart authored
363
364
            env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, isAtWhitespace, newSegrulesState);
            if (!newSegrulesState.failed) {
Michał Lenart authored
365
Michał Lenart authored
366
367
368
369
370
371
372
373
374
375
376
377
                InterpretedChunk ic(
                        createChunk(ig, reader, newSegrulesState.shiftOrthFromPrevious, homonymId));

                processInterpretedChunk(
                        env,
                        reader,
                        isAtWhitespace,
                        caseMatches,
                        newSegrulesState,
                        ic);
            }
            else if (this->options.debug) {
Michał Lenart authored
378
                std::cerr << "NOT ACCEPTING (segmentation)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl;
Michał Lenart authored
379
            }
Michał Lenart authored
380
381
        } 
        else if (this->options.debug) {
Michał Lenart authored
382
383
384
385
            std::cerr << "NOT ACCEPTING (case)" << debugAccum(accum) << debugInterpsGroup(ig.type, reader.getWordStartPtr(), reader.getCurrPtr()) << std::endl;
        }
    }
Michał Lenart authored
386
    void MorfeuszImpl::processInterpretedChunk(
Michał Lenart authored
387
388
389
390
391
392
            const Environment& env,
            const TextReader& reader,
            bool isAtWhitespace,
            bool caseMatches,
            const SegrulesState& newSegrulesState,
            InterpretedChunk& ic) const {
Michał Lenart authored
393
        bool orthShifted = false;
Michał Lenart authored
394
395
        if (!accum.empty() && accum.back().shiftOrth) {
            doShiftOrth(accum.back(), ic);
Michał Lenart authored
396
            orthShifted = true;
Michał Lenart authored
397
        }
Michał Lenart authored
398
        if (!caseMatches && options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
Michał Lenart authored
399
400
401
402
403
404
405
            notMatchingCaseSegs++;
            ic.forceIgnoreCase = true;
        }
        accum.push_back(ic);
        if (isAtWhitespace) {
            assert(newSegrulesState.accepting);
            if (this->options.debug) {
Michał Lenart authored
406
                cerr << "ACCEPTING " << debugAccum(accum) << " prefixChunks: " << debugAccum(accum.back().prefixChunks) << endl;
Michał Lenart authored
407
408
            }
            graph.addPath(accum, newSegrulesState.weak || notMatchingCaseSegs > 0);
Michał Lenart authored
409
410
        } 
        else {
Michał Lenart authored
411
412
413
414
415
            assert(!newSegrulesState.sink);
            TextReader newReader(reader.getCurrPtr(), reader.getEndPtr(), env);
            doProcessOneWord(env, newReader, newSegrulesState);
        }
        accum.pop_back();
Michał Lenart authored
416
417
418
        if (orthShifted) {
            doUnshiftOrth(accum.back(), ic);
        }
Michał Lenart authored
419
        if (!caseMatches && options.caseHandling == CONDITIONALLY_CASE_SENSITIVE) {
Michał Lenart authored
420
            notMatchingCaseSegs--;
Michał Lenart authored
421
        }
Michał Lenart authored
422
    }
Michał Lenart authored
423
Michał Lenart authored
424
    void MorfeuszImpl::processWhitespacesChunk(
Michał Lenart authored
425
426
427
428
            TextReader& reader,
            int startNodeNum,
            std::vector<MorphInterpretation>& results) const {
        string orth(reader.readWhitespacesChunk());
Michał Lenart authored
429
        results.push_back(MorphInterpretation::createWhitespace(startNodeNum, startNodeNum + 1, orth));
Michał Lenart authored
430
    }
Michał Lenart authored
431
Michał Lenart authored
432
    void MorfeuszImpl::handleIgnChunk(
Michał Lenart authored
433
            const Environment& env,
Michał Lenart authored
434
            const ChunkBounds& chunkBounds,
Michał Lenart authored
435
436
            int startNodeNum,
            std::vector<MorphInterpretation>& results) const {
Michał Lenart authored
437
        const char* currInput = chunkBounds.chunkStartPtr;
Michał Lenart authored
438
        const char* prevInput = currInput;
Michał Lenart authored
439
440
        uint32_t codepoint = 0x00;
        bool separatorFound = false;
Michał Lenart authored
441
        while (currInput != chunkBounds.chunkEndPtr) {
Michał Lenart authored
442
443
444
            prevInput = currInput;
            const char* nonSeparatorInputEnd = prevInput;
            do {
Michał Lenart authored
445
                codepoint = env.getCharsetConverter().next(currInput, chunkBounds.chunkEndPtr);
Michał Lenart authored
446
447
448
                if (!env.isSeparator(codepoint)) {
                    nonSeparatorInputEnd = currInput;
                }
Michał Lenart authored
449
            } while (currInput != chunkBounds.chunkEndPtr && !env.isSeparator(codepoint));
Michał Lenart authored
450
Michał Lenart authored
451
452
453
            if (env.isSeparator(codepoint)) {
                separatorFound = true;
                if (nonSeparatorInputEnd != prevInput) {
Michał Lenart authored
454
                    // there are non-separators + separators
Michał Lenart authored
455
Michał Lenart authored
456
                    int startNode = results.empty() ? startNodeNum : results.back().endNode;
Michał Lenart authored
457
                    // process part before separators
Michał Lenart authored
458
459
460
                    TextReader newReader1(prevInput, nonSeparatorInputEnd, env);
                    notMatchingCaseSegs = 0;
                    this->processOneWord(env, newReader1, startNode, results, true);
Michał Lenart authored
461
Michał Lenart authored
462
463
464
465
                    // process separators part
                    if (currInput == chunkBounds.wordEndPtr) {
                        currInput = chunkBounds.chunkEndPtr;
                    }
Michał Lenart authored
466
                    startNode = results.empty() ? startNodeNum : results.back().endNode;
Michał Lenart authored
467
468
                    TextReader newReader2(nonSeparatorInputEnd, currInput, env);
                    this->processOneWord(env, newReader2, startNode, results, true);
Michał Lenart authored
469
                } else {
Michał Lenart authored
470
471
472
473
                    // there are only separators
                    if (currInput == chunkBounds.wordEndPtr) {
                        currInput = chunkBounds.chunkEndPtr;
                    }
Michał Lenart authored
474
                    int startNode = results.empty() ? startNodeNum : results.back().endNode;
Michał Lenart authored
475
476
477
478
                    TextReader newReader3(prevInput, currInput, env);
                    notMatchingCaseSegs = 0;
                    this->processOneWord(env, newReader3, startNode, results, true);
                }
Michał Lenart authored
479
            }
Michał Lenart authored
480
481
        }
Michał Lenart authored
482
        // currInput == chunkBounds.chunkEndPtr
Michał Lenart authored
483
484
        if (!env.isSeparator(codepoint)) {
            if (separatorFound) {
Michał Lenart authored
485
                // process part after separators
Michał Lenart authored
486
                int startNode = results.empty() ? startNodeNum : results.back().endNode;
Michał Lenart authored
487
                TextReader newReader4(prevInput, chunkBounds.chunkEndPtr, env);
Michał Lenart authored
488
                this->processOneWord(env, newReader4, startNode, results, true);
Michał Lenart authored
489
            } else {
Michał Lenart authored
490
                this->appendIgnotiumToResults(env, chunkBounds, startNodeNum, results);
Michał Lenart authored
491
492
493
494
            }
        }
    }
Michał Lenart authored
495
    void MorfeuszImpl::appendIgnotiumToResults(
Michał Lenart authored
496
            const Environment& env,
Michał Lenart authored
497
            const ChunkBounds& chunkBounds,
Michał Lenart authored
498
499
            int startNodeNum,
            std::vector<MorphInterpretation>& results) const {
Michał Lenart authored
500
501
        string orth(chunkBounds.chunkStartPtr, chunkBounds.chunkEndPtr);
        string lemma(chunkBounds.wordStartPtr, chunkBounds.wordEndPtr);
Michał Lenart authored
502
        results.push_back(MorphInterpretation::createIgn(startNodeNum, startNodeNum + 1, orth, lemma));
Michał Lenart authored
503
504
    }
Michał Lenart authored
505
    void MorfeuszImpl::analyseOneWord(
Michał Lenart authored
506
507
508
509
            TextReader& reader,
            vector<MorphInterpretation>& results) const {
        this->processOneWord(this->analyzerEnv, reader, nextNodeNum, results);
        if (!results.empty()) {
Michał Lenart authored
510
            nextNodeNum = results.back().endNode;
Michał Lenart authored
511
        }
Michał Lenart authored
512
    }
Michał Lenart authored
513
Michał Lenart authored
514
    void MorfeuszImpl::adjustTokensCounter() const {
Michał Lenart authored
515
        if (options.tokenNumbering == SEPARATE_NUMBERING) {
Michał Lenart authored
516
            nextNodeNum = 0;
Michał Lenart authored
517
        }
Michał Lenart authored
518
    }
Michał Lenart authored
519
Michał Lenart authored
520
    /**
     * Starts a lazy analysis of `text` and returns an iterator over the
     * results.
     *
     * The text is copied into a freshly allocated, NUL-terminated buffer that
     * is handed to the iterator together with `true` as the last argument
     * (presumably an "iterator owns the buffer" flag - TODO confirm against
     * ResultsIteratorImpl).
     *
     * The copy is made by length, not with strcpy(): strcpy() would stop at
     * the first embedded NUL byte while the end pointer still covers
     * text.length() bytes, leaving the tail of the buffer uninitialized.
     *
     * @param text text to analyse
     * @return a new iterator allocated with `new`
     * @throws MorfeuszException when this instance cannot analyse
     */
    ResultsIterator* MorfeuszImpl::analyse(const string& text) const {

        ensureIsAnalyzer();
        adjustTokensCounter();
        const size_t length = text.length();
        char* textCopy = new char[length + 1];
        std::memcpy(textCopy, text.data(), length);
        textCopy[length] = '\0';
        return new ResultsIteratorImpl(*this, textCopy, textCopy + length, true);
    }
Michał Lenart authored
529
Michał Lenart authored
530
    /**
     * Starts a lazy analysis of the NUL-terminated `text`, working on a
     * private copy of it, and returns an iterator over the results.
     *
     * The copy (terminator included) is handed to the iterator together with
     * `true` as the last argument (presumably an "iterator owns the buffer"
     * flag - TODO confirm against ResultsIteratorImpl).
     *
     * The length is kept in size_t, the type strlen() returns, instead of
     * being narrowed into a signed long.
     *
     * @param text NUL-terminated text to analyse; must not be null
     * @return a new iterator allocated with `new`
     * @throws MorfeuszException when this instance cannot analyse
     */
    ResultsIterator* MorfeuszImpl::analyseWithCopy(const char* text) const {

        ensureIsAnalyzer();
        adjustTokensCounter();
        const size_t length = std::strlen(text);
        char* textCopy = new char[length + 1];
        // length + 1 also copies the terminating NUL
        std::memcpy(textCopy, text, length + 1);
        return new ResultsIteratorImpl(*this, textCopy, textCopy + length, true);
    }
Michał Lenart authored
540
Michał Lenart authored
541
    /**
     * Starts a lazy analysis of the NUL-terminated `text` without copying it
     * and returns an iterator over the results. The iterator is given the
     * caller's buffer and `false` as the last argument, so `text` must stay
     * valid while the iterator is in use
     * (presumably the flag means "do not own the buffer" - TODO confirm).
     *
     * @return a new iterator allocated with `new`
     * @throws MorfeuszException when this instance cannot analyse
     */
    ResultsIterator* MorfeuszImpl::analyse(const char* text) const {
        ensureIsAnalyzer();
        adjustTokensCounter();
        const char* const textEnd = text + strlen(text);
        return new ResultsIteratorImpl(*this, text, textEnd, false);
    }
Michał Lenart authored
548
Michał Lenart authored
549
    void MorfeuszImpl::analyse(const string& text, vector<MorphInterpretation>& results) const {
Michał Lenart authored
550
551
552

        ensureIsAnalyzer();
Michał Lenart authored
553
554
555
        adjustTokensCounter();
        TextReader reader(text, this->analyzerEnv);
        while (!reader.isAtEnd()) {
Michał Lenart authored
556
            analyseOneWord(reader, results);
Michał Lenart authored
557
        }
Michał Lenart authored
558
    }
Michał Lenart authored
559
Michał Lenart authored
560
    void MorfeuszImpl::generate(const string& lemma, vector<MorphInterpretation>& results) const {
Michał Lenart authored
561
562
563

        ensureIsGenerator();
Michał Lenart authored
564
565
566
567
568
569
570
571
        const char* input = lemma.c_str();
        const char* inputEnd = input + lemma.length();
        int startNode = 0;
        TextReader reader(input, inputEnd, this->generatorEnv);
        this->processOneWord(this->generatorEnv, reader, startNode, results);
        if (reader.getCurrPtr() != reader.getEndPtr()) {
            throw MorfeuszException("Input contains more than one word");
        }
Michał Lenart authored
572
573
    }
Michał Lenart authored
574
    void MorfeuszImpl::generate(const std::string& lemma, int tagId, vector<MorphInterpretation>& result) const {
Michał Lenart authored
575
576

        ensureIsGenerator();
Michał Lenart authored
577
Michał Lenart authored
578
579
580
        if (tagId >= this->generatorEnv.getIdResolver().getTagsCount()) {
            throw MorfeuszException("Invalid tagId (outside of tagset)");
        }
Michał Lenart authored
581
Michał Lenart authored
582
583
584
        vector<MorphInterpretation> partRes;
        this->generate(lemma, partRes);
        for (unsigned int i = 0; i < partRes.size(); i++) {
Michał Lenart authored
585
            // XXX - someday it should be improved
Michał Lenart authored
586
            if (partRes[i].tagId == tagId) {
Michał Lenart authored
587
588
                result.push_back(partRes[i]);
            }
Michał Lenart authored
589
590
591
        }
    }
Michał Lenart authored
592
    /**
     * Sets the character encoding: applies it to both the analyzer and the
     * generator environment and stores it in the options.
     */
    void MorfeuszImpl::setCharset(Charset charset) {
        options.encoding = charset;
        analyzerEnv.setCharset(charset);
        generatorEnv.setCharset(charset);
    }
Michał Lenart authored
597
598
599
600

    /** @return the encoding stored in the options */
    Charset MorfeuszImpl::getCharset() const {
        return options.encoding;
    }
Michał Lenart authored
601
Michał Lenart authored
602
    void MorfeuszImpl::setAggl(const std::string& aggl) {
Michał Lenart authored
603
604
605
        this->analyzerEnv.setSegrulesOption("aggl", aggl);
        this->generatorEnv.setSegrulesOption("aggl", aggl);
    }
Michał Lenart authored
606
607
608
609

    /**
     * @return the "aggl" segmentation rules option of the environment chosen
     *         by getAnyEnvironment()
     */
    string MorfeuszImpl::getAggl() const {
        const Environment& env = getAnyEnvironment();
        return env.getSegrulesOption("aggl");
    }
Michał Lenart authored
610
Michał Lenart authored
611
    void MorfeuszImpl::setPraet(const std::string& praet) {
Michał Lenart authored
612
613
614
        this->analyzerEnv.setSegrulesOption("praet", praet);
        this->generatorEnv.setSegrulesOption("praet", praet);
    }
Michał Lenart authored
615
616
617
618

    /**
     * @return the "praet" segmentation rules option of the environment chosen
     *         by getAnyEnvironment()
     */
    string MorfeuszImpl::getPraet() const {
        const Environment& env = getAnyEnvironment();
        return env.getSegrulesOption("praet");
    }
Michał Lenart authored
619
Michał Lenart authored
620
    /**
     * Sets the case handling mode. The analyzer environment becomes
     * case-sensitive for every mode except IGNORE_CASE; the generator
     * environment is not touched.
     *
     * @throws std::invalid_argument when `caseHandling` is none of
     *         IGNORE_CASE, CONDITIONALLY_CASE_SENSITIVE, STRICTLY_CASE_SENSITIVE
     */
    void MorfeuszImpl::setCaseHandling(CaseHandling caseHandling) {
        const bool recognised =
                caseHandling == IGNORE_CASE
                || caseHandling == CONDITIONALLY_CASE_SENSITIVE
                || caseHandling == STRICTLY_CASE_SENSITIVE;
        if (!recognised) {
            throw std::invalid_argument("Invalid caseHandling option");
        }
        options.caseHandling = caseHandling;
        analyzerEnv.setCaseSensitive(caseHandling != IGNORE_CASE);
    }
Michał Lenart authored
632
633
634
635

    /** @return the case handling mode stored in the options */
    CaseHandling MorfeuszImpl::getCaseHandling() const {
        return options.caseHandling;
    }
Michał Lenart authored
636
Michał Lenart authored
637
    /**
     * Sets the token numbering mode and restarts node numbering from 0.
     *
     * @throws std::invalid_argument when `tokenNumbering` is neither
     *         SEPARATE_NUMBERING nor CONTINUOUS_NUMBERING
     */
    void MorfeuszImpl::setTokenNumbering(TokenNumbering tokenNumbering) {
        const bool recognised =
                tokenNumbering == SEPARATE_NUMBERING
                || tokenNumbering == CONTINUOUS_NUMBERING;
        if (!recognised) {
            throw std::invalid_argument("Invalid tokenNumbering option");
        }
        options.tokenNumbering = tokenNumbering;
        nextNodeNum = 0;
    }
Michał Lenart authored
648
649
650
651

    /** @return the token numbering mode stored in the options */
    TokenNumbering MorfeuszImpl::getTokenNumbering() const {
        return options.tokenNumbering;
    }
Michał Lenart authored
652
Michał Lenart authored
653
    /**
     * Sets the whitespace handling mode.
     *
     * @throws std::invalid_argument when `whitespaceHandling` is none of
     *         SKIP_WHITESPACES, APPEND_WHITESPACES, KEEP_WHITESPACES
     */
    void MorfeuszImpl::setWhitespaceHandling(WhitespaceHandling whitespaceHandling) {
        const bool recognised =
                whitespaceHandling == SKIP_WHITESPACES
                || whitespaceHandling == APPEND_WHITESPACES
                || whitespaceHandling == KEEP_WHITESPACES;
        if (!recognised) {
            throw std::invalid_argument("Invalid whitespaceHandling option");
        }
        options.whitespaceHandling = whitespaceHandling;
    }
Michał Lenart authored
664
665
666
667

    /** @return the whitespace handling mode stored in the options */
    WhitespaceHandling MorfeuszImpl::getWhitespaceHandling() const {
        return options.whitespaceHandling;
    }
Michał Lenart authored
668
Michał Lenart authored
669
    void MorfeuszImpl::setDebug(bool debug) {
Michał Lenart authored
670
671
        this->options.debug = debug;
    }
Michał Lenart authored
672
Michał Lenart authored
673
674
675
676
677
678
    /**
     * @return the id resolver of the analyzer environment when that
     *         environment is usable, of the generator environment otherwise
     *
     * The choice of environment is delegated to getAnyEnvironment(), which
     * implements exactly this rule, instead of being repeated here.
     */
    const IdResolver& MorfeuszImpl::getIdResolver() const {
        return getAnyEnvironment().getIdResolver();
    }
Michał Lenart authored
681
682
683
684
    /**
     * Verifies that this instance was created for analysis.
     *
     * @throws MorfeuszException unless usage is ANALYSE_ONLY or
     *         BOTH_ANALYSE_AND_GENERATE
     */
    void MorfeuszImpl::ensureIsAnalyzer() const {
        const bool canAnalyse = (usage == ANALYSE_ONLY || usage == BOTH_ANALYSE_AND_GENERATE);
        if (!canAnalyse) {
            throw MorfeuszException("Cannot analyse with given Morfeusz instance.");
        }
    }
Michał Lenart authored
686
687
688
689
690

    /**
     * Verifies that this instance was created for generation.
     *
     * @throws MorfeuszException unless usage is GENERATE_ONLY or
     *         BOTH_ANALYSE_AND_GENERATE
     */
    void MorfeuszImpl::ensureIsGenerator() const {
        const bool canGenerate = (usage == GENERATE_ONLY || usage == BOTH_ANALYSE_AND_GENERATE);
        if (!canGenerate) {
            throw MorfeuszException("Cannot generate with given Morfeusz instance.");
        }
    }
Michał Lenart authored
692
}