Blame view

morfeusz/Morfeusz.cpp 14.2 KB
Michał Lenart authored
1
2
/* 
 * File:   Morfeusz.cpp
Michał Lenart authored
3
 * Author: mlenart
Michał Lenart authored
4
5
6
7
 * 
 * Created on November 13, 2013, 5:21 PM
 */
Michał Lenart authored
8
#include <string>
Michał Lenart authored
9
#include <iostream>
Michał Lenart authored
10
#include "fsa/fsa.hpp"
Michał Lenart authored
11
#include "utils.hpp"
Michał Lenart authored
12
#include "data/default_fsa.hpp"
Michał Lenart authored
13
#include "Morfeusz.hpp"
Michał Lenart authored
14
#include "MorphDeserializer.hpp"
Michał Lenart authored
15
#include "InterpretedChunksDecoder.hpp"
Michał Lenart authored
16
#include "charset/CharsetConverter.hpp"
Michał Lenart authored
17
#include "charset/charset_utils.hpp"
Michał Lenart authored
18
#include "charset/CaseConverter.hpp"
Michał Lenart authored
19
#include "segrules/segrules.hpp"
Michał Lenart authored
20
#include "const.hpp"
Michał Lenart authored
21
#include "deserializationUtils.hpp"
Michał Lenart authored
22
#include "charset/utf8.h"
Michał Lenart authored
23
24

// TODO - a copy constructor that works The-Way-It-Should
Michał Lenart authored
25
Michał Lenart authored
26
27
using namespace std;
Michał Lenart authored
28
29
30
31
/**
 * Builds the option set used by a freshly constructed Morfeusz object:
 * case-sensitive matching, UTF8 encoding and debug output switched off.
 *
 * @return the populated options structure
 */
static MorfeuszOptions createDefaultOptions() {
    MorfeuszOptions defaults;
    defaults.debug = false;
    defaults.encoding = UTF8;
    defaults.caseSensitive = true;
    return defaults;
}
Michał Lenart authored
36
/**
 * Default constructor: both environments are created with the default
 * charset and their built-in automata, and the options start from
 * createDefaultOptions().
 */
Morfeusz::Morfeusz()
    : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA),
      generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
      options(createDefaultOptions()) {
    // The analyzer follows the configured option ...
    this->analyzerEnv.setCaseSensitive(this->options.caseSensitive);
    // ... while the generator is always switched to case-insensitive mode.
    this->generatorEnv.setCaseSensitive(false);
}
Michał Lenart authored
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/**
 * Returns the pointer at which the interpretations of an InterpsGroup begin.
 *
 * For any processor type other than ANALYZER this is simply ig.ptr.
 * For the analyzer the first byte holds the number of case patterns; each of
 * them is run through deserializeOneCasePattern() and the pointer left
 * behind after the last one is returned.
 * NOTE(review): this relies on deserializeOneCasePattern() advancing the
 * pointer it is given (presumably taken by reference) - confirm in
 * CasePatternHelper.
 *
 * @param env environment supplying the processor type and case pattern helper
 * @param ig  interpretations group whose payload is inspected
 * @return pointer to the first byte following the case patterns (analyzer)
 *         or ig.ptr unchanged (otherwise)
 */
inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
    const unsigned char* cursor = ig.ptr;
    if (env.getProcessorType() != ANALYZER) {
        return cursor;
    }
    // First byte: how many case patterns precede the interpretations.
    unsigned int patternsLeft = *cursor++;
    while (patternsLeft > 0) {
        env.getCasePatternHelper().deserializeOneCasePattern(cursor);
        --patternsLeft;
    }
    return cursor;
}
Michał Lenart authored
58
void Morfeusz::setAnalyzerFile(const string& filename) {
Michał Lenart authored
59
    this->analyzerEnv.setFSAFile(filename);
Michał Lenart authored
60
61
}
Michał Lenart authored
62
void Morfeusz::setGeneratorFile(const string& filename) {
Michał Lenart authored
63
    this->generatorEnv.setFSAFile(filename);
Michał Lenart authored
64
65
}
Michał Lenart authored
66
/**
 * Destructor. Performs no explicit cleanup of its own.
 */
Morfeusz::~Morfeusz() {
    // intentionally empty
}
Michał Lenart authored
69
70
void Morfeusz::processOneWord(
        const Environment& env,
Michał Lenart authored
71
        const char*& inputStart,
Michał Lenart authored
72
        const char* inputEnd,
Michał Lenart authored
73
        int startNodeNum,
Michał Lenart authored
74
75
        std::vector<MorphInterpretation>& results,
        bool insideIgnHandler) const {
Michał Lenart authored
76
    while (inputStart != inputEnd
Michał Lenart authored
77
            && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
Michał Lenart authored
78
        env.getCharsetConverter().next(inputStart, inputEnd);
Michał Lenart authored
79
    }
Michał Lenart authored
80
    vector<InterpretedChunk> accum;
Michał Lenart authored
81
    InflexionGraph graph;
Michał Lenart authored
82
    const char* currInput = inputStart;
Michał Lenart authored
83
    const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
Michał Lenart authored
84
Michał Lenart authored
85
    doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
Michał Lenart authored
86
Michał Lenart authored
87
    if (!graph.empty()) {
Michał Lenart authored
88
        const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
Michał Lenart authored
89
        int srcNode = startNodeNum;
Michał Lenart authored
90
        for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) {
Michał Lenart authored
91
            vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i];
Michał Lenart authored
92
            for (unsigned int j = 0; j < edges.size(); j++) {
Michał Lenart authored
93
                InflexionGraph::Edge& e = edges[j];
Michał Lenart authored
94
                int targetNode = startNodeNum + e.nextNode;
Michał Lenart authored
95
                interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results);
Michał Lenart authored
96
97
98
            }
            srcNode++;
        }
Michał Lenart authored
99
    }
Michał Lenart authored
100
101
    else if (inputStart != inputEnd
            && env.getProcessorType() == ANALYZER
Michał Lenart authored
102
103
104
105
            && !insideIgnHandler) {
        this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
        //        this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
    }
Michał Lenart authored
106
    else if (inputStart != inputEnd) {
Michał Lenart authored
107
        this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
Michał Lenart authored
108
109
    }
    inputStart = currInput;
Michał Lenart authored
110
111
}
Michał Lenart authored
112
113
/**
 * Moves the orthographic form of chunk `from` onto the following chunk `to`.
 *
 * After the call `to.prefixChunks` holds: the prefix chunks of `from`, then
 * whatever `to` already had, then a copy of `from` itself. `to` takes over
 * the text start pointer of `from`, and `from` is flagged as shifted.
 *
 * @param from chunk whose orth is being shifted (gets orthWasShifted = true)
 * @param to   chunk receiving the shifted orth
 */
static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
    to.textStartPtr = from.textStartPtr;
    to.prefixChunks.insert(to.prefixChunks.begin(), from.prefixChunks.begin(), from.prefixChunks.end());
    // The copy stored here is taken before `from` is marked as shifted.
    to.prefixChunks.push_back(from);
    from.orthWasShifted = true;
}
Michał Lenart authored
122
123
124
125
126
127
128
129
130
/**
 * Formats one recognized segment for debug output as "(<type>, <text>), ".
 *
 * @param type     segment type, printed as a decimal number
 * @param startPtr first character of the segment text
 * @param endPtr   one past the last character of the segment text
 * @return the formatted description
 */
static inline std::string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
    const std::string segmentText(startPtr, endPtr);
    std::ostringstream out;
    out << "(" << static_cast<int>(type) << ", " << segmentText << "), ";
    return out.str();
}

/**
 * Formats all chunks accumulated so far for debug output by concatenating
 * the debugInterpsGroup() description of each of them, in order.
 *
 * @param accum chunks recognized so far
 * @return concatenated descriptions (empty string for an empty vector)
 */
static inline string debugAccum(vector<InterpretedChunk>& accum) {
    string description;
    for (vector<InterpretedChunk>::const_iterator chunk = accum.begin(); chunk != accum.end(); ++chunk) {
        description += debugInterpsGroup(chunk->segmentType, chunk->textStartPtr, chunk->textEndPtr);
    }
    return description;
}
Michał Lenart authored
137
138
void Morfeusz::doProcessOneWord(
        const Environment& env,
Michał Lenart authored
139
140
        const char*& inputData,
        const char* inputEnd,
Michał Lenart authored
141
        SegrulesState segrulesState,
Michał Lenart authored
142
        vector<InterpretedChunk>& accum,
Michał Lenart authored
143
        InflexionGraph& graph) const {
Michał Lenart authored
144
145
146
147
    if (this->options.debug) {
        cerr << "----------" << endl;
        cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
    }
Michał Lenart authored
148
    //    cerr << "doAnalyzeOneWord " << inputData << endl;
Michał Lenart authored
149
    const char* inputStart = inputData;
Michał Lenart authored
150
    const char* currInput = inputData;
Michał Lenart authored
151
    uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
Michał Lenart authored
152
    vector<uint32_t> originalCodepoints;
Michał Lenart authored
153
    vector<uint32_t> normalizedCodepoints;
Michał Lenart authored
154
Michał Lenart authored
155
    StateType state = env.getFSA().getInitialState();
Michał Lenart authored
156
Michał Lenart authored
157
    while (!isWhitespace(codepoint)) {
Michał Lenart authored
158
        uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER
Michał Lenart authored
159
160
                ? env.getCaseConverter().toLower(codepoint)
                : codepoint;
Michał Lenart authored
161
        originalCodepoints.push_back(codepoint);
Michał Lenart authored
162
163
        normalizedCodepoints.push_back(normalizedCodepoint);
        feedState(state, normalizedCodepoint, UTF8CharsetConverter());
Michał Lenart authored
164
        codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
Michał Lenart authored
165
166
167
168
169
170
        string homonymId;
        if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) {
            if (originalCodepoints.size() == 1) {
                throw MorfeuszException("Lemma of length > 1 cannot start with a colon");
            }
            homonymId = string(currInput + 1, inputEnd);
Michał Lenart authored
171
            //            cerr << "homonym " << homonymId << endl;
Michał Lenart authored
172
173
174
            currInput = inputEnd;
            codepoint = 0x00;
        }
Michał Lenart authored
175
176
177
178
        if (state.isAccepting()) {
            vector<InterpsGroup> val(state.getValue());
            for (unsigned int i = 0; i < val.size(); i++) {
                InterpsGroup& ig = val[i];
Michał Lenart authored
179
180
181
                if (this->options.debug) {
                    cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
                }
Michał Lenart authored
182
183
184
185
186
187
                set<SegrulesState> newSegrulesStates;
                env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
                if (this->options.debug && newSegrulesStates.empty()) {
                    cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
                }
                if (!newSegrulesStates.empty() && env.getCasePatternHelper().checkInterpsGroupCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
Michał Lenart authored
188
Michał Lenart authored
189
190
191
192
193
                    for (
                            set<SegrulesState>::iterator it = newSegrulesStates.begin();
                            it != newSegrulesStates.end();
                            ++it) {
                        SegrulesState newSegrulesState = *it;
Michał Lenart authored
194
                        const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
Michał Lenart authored
195
                        const unsigned char* interpsEndPtr = ig.ptr + ig.size;
Michał Lenart authored
196
                        InterpretedChunk ic = {
Michał Lenart authored
197
                            ig.type,
Michał Lenart authored
198
199
200
201
                            inputStart,
                            currInput,
                            originalCodepoints,
                            normalizedCodepoints,
Michał Lenart authored
202
203
                            interpsPtr,
                            interpsEndPtr,
Michał Lenart authored
204
205
206
207
208
209
210
                            newSegrulesState.shiftOrthFromPrevious,
                            false,
                            vector<InterpretedChunk>(),
                            homonymId
                        };
                        if (!accum.empty() && accum.back().shiftOrth) {
                            doShiftOrth(accum.back(), ic);
Michał Lenart authored
211
                        }
Michał Lenart authored
212
                        accum.push_back(ic);
Michał Lenart authored
213
                        if (isWhitespace(codepoint)
Michał Lenart authored
214
215
216
217
218
219
                                && newSegrulesState.accepting) {
                            if (this->options.debug) {
                                cerr << "ACCEPTING " << debugAccum(accum) << endl;
                            }
                            graph.addPath(accum, newSegrulesState.weak);
                        }
Michał Lenart authored
220
                        else if (!isWhitespace(codepoint)) {
Michał Lenart authored
221
222
223
224
225
                            //                        cerr << "will process " << currInput << endl;
                            const char* newCurrInput = currInput;
                            doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
                        }
                        accum.pop_back();
Michał Lenart authored
226
                    }
Michał Lenart authored
227
                }
Michał Lenart authored
228
            }
Michał Lenart authored
229
        }
Michał Lenart authored
230
        codepoint = currInput == inputEnd || isWhitespace(codepoint) ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
Michał Lenart authored
231
232
    }
    inputData = currInput;
Michał Lenart authored
233
234
}
Michał Lenart authored
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
void Morfeusz::handleIgnChunk(
        const Environment& env,
        const char* inputStart,
        const char* inputEnd,
        int startNodeNum,
        std::vector<MorphInterpretation>& results) const {
    const char* currInput = inputStart;
    const char* prevInput;
    uint32_t codepoint;
    bool separatorFound = false;
    while (currInput != inputEnd) {
        prevInput = currInput;
        const char* nonSeparatorInputEnd = prevInput;
        do {
            codepoint = env.getCharsetConverter().next(currInput, inputEnd);
Michał Lenart authored
250
            if (!env.isSeparator(codepoint)) {
Michał Lenart authored
251
252
253
                nonSeparatorInputEnd = currInput;
            }
        }
Michał Lenart authored
254
        while (currInput != inputEnd && !env.isSeparator(codepoint));
Michał Lenart authored
255
Michał Lenart authored
256
        if (env.isSeparator(codepoint)) {
Michał Lenart authored
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
            separatorFound = true;
            if (nonSeparatorInputEnd != prevInput) {
                int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
                this->processOneWord(env, prevInput, nonSeparatorInputEnd, startNode, results, true);
                startNode = results.empty() ? startNodeNum : results.back().getEndNode();
                this->processOneWord(env, nonSeparatorInputEnd, currInput, startNode, results, true);
            }
            else {
                int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
                this->processOneWord(env, prevInput, currInput, startNode, results, true);
            }
        }
    }

    // currInput == inputEnd
Michał Lenart authored
272
    if (!env.isSeparator(codepoint)) {
Michał Lenart authored
273
274
275
276
277
278
279
280
281
282
        if (separatorFound) {
            int startNode = results.empty() ? startNodeNum : results.back().getEndNode();
            this->processOneWord(env, prevInput, inputEnd, startNode, results, true);
        }
        else {
            this->appendIgnotiumToResults(env, string(inputStart, inputEnd), startNodeNum, results);
        }
    }
}
Michał Lenart authored
283
void Morfeusz::appendIgnotiumToResults(
Michał Lenart authored
284
        const Environment& env,
Michał Lenart authored
285
286
287
        const string& word,
        int startNodeNum,
        std::vector<MorphInterpretation>& results) const {
Michał Lenart authored
288
    MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env);
Michał Lenart authored
289
290
291
    results.push_back(interp);
}
Michał Lenart authored
292
/**
 * Analyzes the given text and returns an iterator over all interpretations.
 *
 * @param text text to analyze
 * @return iterator over a copy of the produced interpretations
 */
ResultsIterator Morfeusz::analyze(const string& text) const {
    vector<MorphInterpretation> interpretations;
    analyze(text, interpretations);
    return ResultsIterator(interpretations);
}
Michał Lenart authored
298
void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const {
Michał Lenart authored
299
300
    const char* input = text.c_str();
    const char* inputEnd = input + text.length();
Michał Lenart authored
301
302
    while (input != inputEnd) {
        int startNode = results.empty() ? 0 : results.back().getEndNode();
Michał Lenart authored
303
        this->processOneWord(this->analyzerEnv, input, inputEnd, startNode, results);
Michał Lenart authored
304
    }
Michał Lenart authored
305
306
}
Michał Lenart authored
307
308
309
310
311
312
/**
 * Runs generation for the given lemma and returns an iterator over the
 * results.
 *
 * @param text lemma to generate forms for
 * @return iterator over a copy of the produced interpretations
 */
ResultsIterator Morfeusz::generate(const string& text) const {
    vector<MorphInterpretation> generated;
    generate(text, generated);
    return ResultsIterator(generated);
}
Michał Lenart authored
313
314
315
316
317
318
/**
 * Runs generation for the given lemma, restricted to one tag number, and
 * returns an iterator over the results.
 *
 * @param text   lemma to generate forms for
 * @param tagnum tag number the results are filtered by
 * @return iterator over a copy of the matching interpretations
 */
ResultsIterator Morfeusz::generate(const string& text, int tagnum) const {
    vector<MorphInterpretation> generated;
    generate(text, tagnum, generated);
    return ResultsIterator(generated);
}
Michał Lenart authored
319
void Morfeusz::generate(const string& lemma, vector<MorphInterpretation>& results) const {
Michał Lenart authored
320
321
    const char* input = lemma.c_str();
    const char* inputEnd = input + lemma.length();
Michał Lenart authored
322
323
324
325
    int startNode = 0;
    this->processOneWord(this->generatorEnv, input, inputEnd, startNode, results);
    if (input != inputEnd) {
        throw MorfeuszException("Input contains more than one word");
Michał Lenart authored
326
    }
Michał Lenart authored
327
328
}
Michał Lenart authored
329
// XXX - someday it should be improved
Michał Lenart authored
330
Michał Lenart authored
331
332
333
334
335
336
337
338
339
340
void Morfeusz::generate(const std::string& lemma, int tagnum, vector<MorphInterpretation>& result) const {
    vector<MorphInterpretation> partRes;
    this->generate(lemma, partRes);
    for (unsigned int i = 0; i < partRes.size(); i++) {
        if (partRes[i].getTagnum() == tagnum) {
            result.push_back(partRes[i]);
        }
    }
}
Michał Lenart authored
341
342
/**
 * Changes the text encoding: stores it in the options and applies it to
 * both the analyzer and the generator environment.
 *
 * @param charset encoding to use from now on
 */
void Morfeusz::setCharset(MorfeuszCharset charset) {
    options.encoding = charset;
    analyzerEnv.setCharset(charset);
    generatorEnv.setCharset(charset);
}
Michał Lenart authored
347
348
349
350
351
352
353
354
355
356
void Morfeusz::setAggl(const std::string& aggl) {
    this->analyzerEnv.setSegrulesOption("aggl", aggl);
    this->generatorEnv.setSegrulesOption("aggl", aggl);
}

void Morfeusz::setPraet(const std::string& praet) {
    this->analyzerEnv.setSegrulesOption("praet", praet);
    this->generatorEnv.setSegrulesOption("praet", praet);
}
Michał Lenart authored
357
358
void Morfeusz::setCaseSensitive(bool caseSensitive) {
    this->options.caseSensitive = caseSensitive;
Michał Lenart authored
359
    this->analyzerEnv.setCaseSensitive(caseSensitive);
Michał Lenart authored
360
361
}
Michał Lenart authored
362
363
364
365
void Morfeusz::setDebug(bool debug) {
    this->options.debug = debug;
}
Michał Lenart authored
366
/**
 * Creates an iterator over a copy of the given interpretations; the
 * original vector is not referenced afterwards.
 *
 * @param res interpretations to iterate over
 */
ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) {
    this->resultsBuffer.insert(this->resultsBuffer.begin(), res.begin(), res.end());
}

/**
 * Removes the first buffered interpretation and returns it.
 *
 * No emptiness check is made here; callers are expected to consult
 * hasNext() first.
 *
 * @return the interpretation that was at the front of the buffer
 */
MorphInterpretation ResultsIterator::getNext() {
    MorphInterpretation head = resultsBuffer.front();
    resultsBuffer.pop_front();
    return head;
}

bool ResultsIterator::hasNext() {
Michał Lenart authored
377
    return !resultsBuffer.empty();
Michał Lenart authored
378
}