Morfeusz.cpp
4.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
/*
* File: Morfeusz.cpp
* Author: mlenart
*
* Created on November 13, 2013, 5:21 PM
*/
#include <stdexcept>
#include <string>
#include "fsa.hpp"
#include "utils.hpp"
#include "Morfeusz.hpp"
#include "MorphDeserializer.hpp"
#include "charset/CharsetConverter.hpp"
#include "charset/charset_utils.hpp"
// TODO: provide a properly working copy constructor
using namespace std;
/**
 * Builds the automaton for the given file by calling FSA::getFSA().
 *
 * A single MorphDeserializer is created on the first call and reused by all
 * later calls; it is never deleted.
 *
 * @param filename passed straight through to FSA::getFSA()
 * @return whatever FSA::getFSA() returns for this file and deserializer
 */
static FSA<vector<InterpsGroup>>* initializeFSA(const string& filename) {
    typedef vector<InterpsGroup> Interps;
    static Deserializer<Interps>* sharedDeserializer = new MorphDeserializer();
    return FSA<Interps>::getFSA(filename, *sharedDeserializer);
}
/**
 * Returns the shared UTF-8 charset converter.
 *
 * The converter is allocated once, on the first call; every call returns the
 * same pointer.
 */
static CharsetConverter* initializeCharsetConverter() {
    static CharsetConverter* const sharedConverter = new UTF8CharsetConverter();
    return sharedConverter;
}
/**
 * Creates an analyzer backed by the automaton obtained for `filename` and by
 * the shared UTF-8 charset converter.
 *
 * @param filename forwarded to initializeFSA()
 */
Morfeusz::Morfeusz(const string& filename)
    : fsa(initializeFSA(filename)),
      charsetConverter(initializeCharsetConverter())
{
}
/**
 * Releases the automaton held by this object.
 *
 * `fsa` and `charsetConverter` are pointer members, so the pointee must be
 * deleted, not the address of the member itself (which is not heap memory).
 */
Morfeusz::~Morfeusz() {
    // NOTE(review): assumes FSA::getFSA() hands back an object this instance
    // owns exclusively -- confirm against fsa.hpp.
    delete this->fsa;
    // charsetConverter points at the function-static instance created in
    // initializeCharsetConverter(), which is shared by every Morfeusz object;
    // deleting it here would free it once per instance.
}
void Morfeusz::processOneWord(
const char*& inputData,
const char* inputEnd,
const int startNodeNum,
std::vector<MorphInterpretation>& results) const {
vector<InterpretedChunk> accum;
FlexionGraph graph(startNodeNum);
const char* currInput = inputData;
doProcessOneWord(currInput, inputEnd, accum, graph);
graph.appendToResults(this->tagset, results);
inputData = currInput;
}
/**
 * Recursive worker for processOneWord(): walks the automaton over the input
 * and, for every accepting state reached, tries to match the rest of the word
 * as further chunks.
 *
 * @param inputData start of the part of the input not matched yet; taken by
 *                  reference but never assigned to inside this function
 * @param inputEnd  end bound handed to the charset converter
 * @param accum     chunks matched so far on the current recursion path; one
 *                  chunk is pushed before and popped after each recursive call
 * @param graph     receives `accum` through addPath() when end of word is
 *                  reached with at least one chunk accumulated
 */
void Morfeusz::doProcessOneWord(
const char*& inputData,
const char* inputEnd,
vector<InterpretedChunk>& accum,
FlexionGraph& graph) const {
// Work on a local cursor; the caller's pointer is left untouched.
const char* currInput = inputData;
// Every call (i.e. every chunk) starts from the automaton's initial state.
StateType state = this->fsa->getInitialState();
// Presumably decodes one code point and advances currInput -- confirm in CharsetConverter.
int codepoint = this->charsetConverter->next(currInput, inputEnd);
// End of word directly after at least one matched chunk: the path is complete.
if (!accum.empty() && isEndOfWord(codepoint)) {
graph.addPath(accum);
}
else
// Otherwise consume code points until end of word.
while (!isEndOfWord(codepoint)) {
this->feedState(state, codepoint);
// NOTE(review): the next code point is fetched BEFORE the accepting check below.
// If next() advances currInput, the chunk length and the recursion start
// include that look-ahead code point -- confirm this is intended.
codepoint = this->charsetConverter->next(currInput, inputEnd);
if (state.isAccepting()) {
// Try every interpretations group attached to the accepting state.
for (InterpsGroup& ig : state.getValue()) {
// Initialized from (chunk start, distance from start to cursor, group);
// field meanings presumed from the initializer order -- verify against InterpretedChunk.
InterpretedChunk ic = {inputData, currInput - inputData, ig};
accum.push_back(ic);
// Match the remainder of the word as further chunks.
doProcessOneWord(currInput, inputEnd, accum, graph);
// Backtrack so the next group starts from the same prefix.
accum.pop_back();
}
}
}
}
/**
 * Advances `state` by one code point.
 *
 * The charset converter's append() fills a char buffer for the code point,
 * and each char of that buffer is fed to the state in order.
 *
 * @param state     automaton state to advance in place
 * @param codepoint code point handed to the converter's append()
 */
void Morfeusz::feedState(
        StateType& state,
        const int codepoint) const {
    vector<char> encoded;
    this->charsetConverter->append(codepoint, encoded);

    for (vector<char>::size_type i = 0; i < encoded.size(); ++i) {
        char unit = encoded[i];
        state.proceedToNext(unit);
    }
}
/**
 * Creates an iterator over the analysis results for `text`.
 *
 * The ResultsIterator constructor in this file keeps the pointer returned by
 * text.c_str(), so `text` has to stay alive and unmodified for as long as the
 * returned iterator is used.
 *
 * @param text input to analyze
 * @return iterator bound to `text` and to this analyzer
 */
ResultsIterator Morfeusz::analyze(const std::string& text) {
    ResultsIterator iterator(text, *this);
    return iterator;
}
/**
 * Binds the iterator to an input string and an analyzer.
 *
 * Only a pointer into `text`'s character buffer is stored, not a copy, so the
 * string must outlive the iterator.
 *
 * @param text     input whose null-terminated buffer is iterated over
 * @param morfeusz analyzer stored in the `morfeusz` member
 */
ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
    : rawInput(text.data()), morfeusz(morfeusz)
{
}
/**
 * Returns the next interpretation.
 *
 * Not implemented yet: the intended logic is kept below as a commented-out
 * sketch. Previously the function fell off its end without returning a value,
 * which is undefined behaviour for a non-void function; it now fails loudly.
 *
 * @throws std::logic_error always, until the implementation is completed
 */
MorphInterpretation ResultsIterator::getNext() {
    // Intended implementation (sketch, does not match the current
    // processOneWord() signature):
    // if (resultsBuffer.empty()) {
    //     morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
    // }
    // startNode = resultsBuffer.back().getEndNode();
    // MorphInterpretation res = resultsBuffer.front();
    // resultsBuffer.pop_front();
    // return res;
    throw std::logic_error("ResultsIterator::getNext() is not implemented");
}
/**
 * Tells whether another interpretation can be produced.
 *
 * There is something left when results are already buffered, or when the
 * input has not been consumed up to its terminating '\0'. The previous
 * condition required the buffer to be empty, so it reported "no more results"
 * exactly when buffered results were waiting.
 *
 * @return true if the buffer is non-empty or unread input remains
 */
bool ResultsIterator::hasNext() {
    return !resultsBuffer.empty() || rawInput[0] != '\0';
}
//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const {
// assert(inputPtr[0] != '\0');
// const char* start = inputPtr;
// StateType state = fsa->getInitialState();
// int currNodeNum = startNodeNum;
// int codepoint = this->charsetConverter->next(inputPtr, inputEnd);
// assert(!isEndOfWord(codepoint));
// while(!isEndOfWord(codepoint)) {
// feedState(state, codepoint);
// if (state.isAccepting()) {
// const char* currInputPtr = inputPtr;
// vector<EncodedInterpretation> startInterps = state.getValue();
// filterOutNonGluableInterps(startInterps);
// if (!startInterps.empty()) {
//
// }
// vector<EncodedInterpretation> additionalInterps;
// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps);
// if (!additionalInterps.empty()) {
// for (EncodedInterpretation& interp: state.getValue()) {
// interp.startNode = currNodeNum;
// interp.endNode = currNodeNum + 1;
// interps.push_back(interp);
// }
//
// }
// }
// }
//}