Generator.cpp
3.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/*
* File: Generator.cpp
* Author: mlenart
*
* Created on 21 January 2014, 14:38
*/
#include <string>
#include <iostream>
#include "charset/charset_utils.hpp"
#include "MorphInterpretation.hpp"
#include "Generator.hpp"
#include "Environment.hpp"
using namespace std;
/**
 * Creates a generator over the automaton data located at ptr.
 *
 * Initializes the deserializer from env, obtains the automaton through
 * SynthFSAType::getFSA(ptr, deserializer), and stores env and ptr in the
 * corresponding members.
 *
 * @param ptr start of the generator automaton data; passed to
 *            SynthFSAType::getFSA and kept in generatorPtr
 * @param env environment stored in the env member; other methods of this
 *            class read its charset converter and tagset
 */
Generator::Generator(
const unsigned char* ptr,
const Environment& env)
: deserializer(env),
// The fsa initializer reads the deserializer member. Members are initialized
// in declaration order, not in the order written here.
// NOTE(review): confirm in Generator.hpp that deserializer is declared before fsa.
fsa(SynthFSAType::getFSA(ptr, deserializer)),
env(env),
generatorPtr(ptr) {
}
/**
 * Releases the automaton held in fsa.
 *
 * setGeneratorPtr() deletes this->fsa before installing a replacement, which
 * makes this object responsible for the automaton's lifetime; without the
 * delete below the last automaton would never be freed.
 *
 * NOTE(review): this assumes Generator instances are not copied (a copy would
 * share the same fsa pointer) — confirm against Generator.hpp.
 */
Generator::~Generator() {
    delete this->fsa;
}
void Generator::setGeneratorPtr(const unsigned char* ptr) {
delete this->fsa;
this->generatorPtr = ptr;
this->fsa = SynthFSAType::getFSA(ptr, deserializer);
}
/**
 * Appends str to res, one code point at a time.
 *
 * Each code point is read from str with a UTF8CharsetConverter and written to
 * res with the charset converter of the environment.
 *
 * @param str text to append, read through UTF8CharsetConverter
 * @param res output string that receives the converted code points
 */
void Generator::appendString(const string& str, string& res) const {
    const char* cursor = str.c_str();
    const char* limit = cursor + str.length();
    for (; cursor != limit;) {
        // next() moves the cursor forward past the code point it returns.
        uint32_t codepoint = UTF8CharsetConverter().next(cursor, limit);
        env.getCharsetConverter().append(codepoint, res);
    }
}
/**
 * Builds a surface form from a lemma and an encoded prefix/suffix edit.
 *
 * The result is orth.prefixToAdd, followed by the lemma code points with the
 * last orth.suffixToCut of them dropped, followed by orth.suffixToAdd. All
 * pieces are written through the charset converter of the environment.
 *
 * @param orth  edit to apply: prefixToAdd, suffixToCut and suffixToAdd
 * @param lemma lemma given as code points
 * @return the assembled string; if suffixToCut is at least lemma.size(),
 *         no lemma code points are kept
 */
std::string Generator::decodeOrth(
        const EncodedOrth& orth,
        const std::vector<uint32_t>& lemma) const {
    typedef std::vector<uint32_t>::size_type Index;

    // lemma.size() - suffixToCut is unsigned arithmetic: if suffixToCut were
    // larger than the lemma, the difference would wrap to a huge value and the
    // loop would read far past the end of the vector. Clamp to zero instead.
    const Index cut = static_cast<Index>(orth.suffixToCut);
    const Index kept = cut < lemma.size() ? lemma.size() - cut : 0;

    string res;
    this->appendString(orth.prefixToAdd, res);
    for (Index i = 0; i < kept; ++i) {
        uint32_t cp = lemma[i];
        env.getCharsetConverter().append(cp, res);
    }
    this->appendString(orth.suffixToAdd, res);
    return res;
}
void Generator::decodeRes(
const std::vector<EncodedGeneratorInterpretation>& encodedRes,
const std::string& lemma,
const std::vector<uint32_t>& lemmaCodepoints,
std::vector<MorphInterpretation>& result) const {
for (unsigned int i = 0; i < encodedRes.size(); i++) {
EncodedGeneratorInterpretation egi = encodedRes[i];
string decodedOrth = this->decodeOrth(egi.orth, lemmaCodepoints);
MorphInterpretation mi(
0, 0,
decodedOrth, lemma,
egi.tag,
egi.nameClassifier,
env.getTagset(),
env.getCharsetConverter());
result.push_back(mi);
}
}
void Generator::generate(const string& lemma, vector<MorphInterpretation>& result) const {
const char* currInput = lemma.c_str();
const char* inputEnd = currInput + lemma.length();
vector<uint32_t> codepoints;
SynthStateType state = this->fsa->getInitialState();
while (currInput != inputEnd && !state.isSink()) {
uint32_t codepoint = this->env.getCharsetConverter().next(currInput, inputEnd);
feedState(state, codepoint, this->env.getCharsetConverter());
codepoints.push_back(codepoint);
}
if (state.isAccepting()) {
vector<EncodedGeneratorInterpretation> encodedRes = state.getValue();
decodeRes(encodedRes, lemma, codepoints, result);
}
}