InterpretedChunksDecoder4Generator.cpp
4.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
/*
* File: InterpretedChunksDecoder4Generator.cpp
* Author: mlenart
*
* Created on 15 May 2014, 15:28
*/
#include "InterpretedChunksDecoder4Generator.hpp"
#include <string>
#include <vector>
using namespace std;
namespace morfeusz {
/**
 * Creates a decoder for the generator.
 *
 * The constructor body is empty: the only work done here is forwarding
 * env to the InterpretedChunksDecoder base-class constructor.
 *
 * @param env environment handed on to the base class
 */
InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env)
    : InterpretedChunksDecoder(env) {}
void InterpretedChunksDecoder4Generator::decode(
unsigned int startNode,
unsigned int endNode,
const InterpretedChunk& interpretedChunk,
std::vector<MorphInterpretation>& out) const {
string orthPrefix;
string lemma;
convertPrefixes(interpretedChunk, orthPrefix, lemma);
// lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
const unsigned char* currPtr = getInterpretationsPtr(interpretedChunk.interpsGroupPtr);
while (currPtr < interpretedChunk.interpsEndPtr) {
MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
// cerr << mi.toString(false) << endl;
// cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {
out.push_back(mi);
}
}
}
void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {
for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
// lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
const unsigned char* ptr = getInterpretationsPtr(interpretedChunk.interpsGroupPtr);
MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
// orthPrefix += mi.getOrth();
}
}
/**
 * Decodes a single serialized interpretation and builds the resulting
 * MorphInterpretation.
 *
 * Steps, in order:
 *  1. deserialize one EncodedInterpretation from ptr (ptr is taken by
 *     reference and handed on to deserializeInterp),
 *  2. convert the chunk text [textStartPtr, textEndPtr) into code points,
 *  3. apply the encoded form to those code points, appending to a copy of
 *     orthPrefix.
 *
 * The lemma stored in the result is lemma itself when the decoded homonym id
 * is empty, otherwise lemma + HOMONYM_SEPARATOR + homonym id.
 *
 * @param startNode  start node stored in the result
 * @param endNode    end node stored in the result
 * @param orthPrefix text the produced orth starts with
 * @param lemma      base lemma
 * @param chunk      chunk providing the text to transform
 * @param ptr        read position in the serialized interpretation data
 * @return the decoded interpretation
 */
MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation(
        unsigned int startNode, unsigned int endNode,
        const string& orthPrefix,
        const string& lemma,
        const InterpretedChunk& chunk,
        const unsigned char*& ptr) const {
    string orth = orthPrefix;
    EncodedInterpretation encoded = this->deserializeInterp(ptr);

    // codepoints is not declared in this function (presumably a scratch buffer
    // member of the class); it is emptied and refilled on every call.
    codepoints.clear();
    for (const char* textCursor = chunk.textStartPtr; textCursor != chunk.textEndPtr;) {
        // next() receives the cursor and the end of the text; the loop relies
        // on it advancing the cursor.
        const uint32_t codepoint = env.getCharsetConverter().next(textCursor, chunk.textEndPtr);
        codepoints.push_back(codepoint);
    }
    this->decodeForm(codepoints, encoded.value, orth);

    // Attach the homonym id to the lemma only when there is one.
    string lemmaWithHomonym = lemma;
    if (!encoded.homonymId.empty()) {
        lemmaWithHomonym = lemma + HOMONYM_SEPARATOR + encoded.homonymId;
    }

    return MorphInterpretation(
            startNode, endNode,
            orth, lemmaWithHomonym,
            encoded.tag,
            encoded.nameClassifier,
            encoded.qualifiers,
            env);
}
/**
 * Applies an encoded form to a lemma and appends the result to res.
 *
 * Appended to res, in order:
 *  1. orth.prefixToAdd, as is,
 *  2. the lemma code points with the last orth.suffixToCut of them dropped,
 *     each written through the environment's charset converter,
 *  3. orth.suffixToAdd, read code point by code point with
 *     UTF8CharsetConverter and written through the environment's charset
 *     converter.
 *
 * If orth.suffixToCut is greater than or equal to the lemma length, no lemma
 * code point is emitted.
 *
 * @param lemma code points of the lemma
 * @param orth  encoded transformation (prefix to add, suffix to cut/add)
 * @param res   output string; the decoded form is appended to it
 */
void InterpretedChunksDecoder4Generator::decodeForm(
        const vector<uint32_t>& lemma,
        const EncodedForm& orth,
        string& res) const {
    res += orth.prefixToAdd;

    // lemma.size() is unsigned, so "size - suffixToCut" would wrap around to a
    // huge value when the cut is longer than the lemma and the loop would read
    // far past the end of the vector. Clamp the kept length at zero instead.
    const size_t lemmaLength = lemma.size();
    const size_t suffixToCut = static_cast<size_t>(orth.suffixToCut);
    const size_t keptLength = suffixToCut < lemmaLength ? lemmaLength - suffixToCut : 0;
    for (size_t i = 0; i < keptLength; i++) {
        env.getCharsetConverter().append(lemma[i], res);
    }

    // The suffix is decoded with the UTF-8 converter and re-encoded with the
    // environment's converter.
    const char* suffixPtr = orth.suffixToAdd.c_str();
    const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
    while (suffixPtr != suffixEnd) {
        uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
        env.getCharsetConverter().append(cp, res);
    }
}
/**
 * Reads one EncodedInterpretation from the serialized data at ptr.
 *
 * Fields are read strictly in this order, each through the named reader:
 *   homonymId          - readString
 *   value.prefixToAdd  - readString
 *   value.suffixToCut  - readInt8
 *   value.suffixToAdd  - readString
 *   tag                - readInt16
 *   nameClassifier     - readInt8
 *   qualifiers         - readInt16
 *
 * @param ptr read position, passed by reference to every reader call
 * @return the populated interpretation record
 */
EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const {
    EncodedInterpretation decoded;

    // Homonym identifier comes first.
    decoded.homonymId = readString(ptr);

    // Encoded form: prefix to add, number of trailing units to cut, suffix to add.
    decoded.value.prefixToAdd = readString(ptr);
    decoded.value.suffixToCut = readInt8(ptr);
    decoded.value.suffixToAdd = readString(ptr);

    // Remaining numeric fields.
    decoded.tag = readInt16(ptr);
    decoded.nameClassifier = readInt8(ptr);
    decoded.qualifiers = readInt16(ptr);

    return decoded;
}
}