Morfeusz.cpp
5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
/*
* File: Morfeusz.cpp
* Author: mlenart
*
* Created on November 13, 2013, 5:21 PM
*/
#include <iostream>
#include <stdexcept>
#include <string>
#include "fsa.hpp"
#include "utils.hpp"
#include "Morfeusz.hpp"
#include "MorphDeserializer.hpp"
#include "charset/CharsetConverter.hpp"
#include "charset/charset_utils.hpp"
// TODO: implement a copy constructor that works correctly
using namespace std;
/**
 * Obtains the automaton for the given file through FSA::getFSA.
 *
 * Writes "initialize FSA" to stderr on every call. One MorphDeserializer is
 * allocated the first time this function runs, kept in a function-local
 * static, and passed by reference to FSA::getFSA on this and every later
 * call; it is never deleted here.
 *
 * @param filename file name forwarded unchanged to FSA::getFSA.
 * @return the pointer returned by FSA::getFSA.
 */
static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) {
    typedef vector<InterpsGroup> InterpsGroups;

    cerr << "initialize FSA" << endl;
    static Deserializer<InterpsGroups>* morphDeserializer = new MorphDeserializer();
    return FSA<InterpsGroups>::getFSA(filename, *morphDeserializer);
}
/**
 * Returns the charset converter shared by all callers.
 *
 * Writes "initialize charset converter" to stderr on every call. A
 * UTF8CharsetConverter is allocated on the first call only and stored in a
 * function-local static, so every call returns the same pointer.
 *
 * @return pointer to the single UTF8CharsetConverter instance.
 */
static CharsetConverter* initializeCharsetConverter() {
    cerr << "initialize charset converter" << endl;
    static CharsetConverter* sharedConverter = new UTF8CharsetConverter();
    return sharedConverter;
}
/**
 * Returns the tagset shared by all callers.
 *
 * Writes "initialize tagset" to stderr on every call. On the first call the
 * result of readFile(filename.c_str()) is used to construct a Tagset, which
 * is stored in a function-local static. Because that static is initialised
 * only once, later calls return the same Tagset and their `filename`
 * argument is not read.
 *
 * @param filename file name passed to readFile on the first call.
 * @return pointer to the single Tagset instance.
 */
static Tagset* initializeTagset(const string& filename) {
    cerr << "initialize tagset" << endl;
    static Tagset* sharedTagset = new Tagset(readFile(filename.c_str()));
    return sharedTagset;
}
/**
 * Creates an analyser backed by the given file.
 *
 * The members are filled from the three file-local helpers: `fsa` from
 * initializeFSA(filename), `charsetConverter` from
 * initializeCharsetConverter() and `tagset` from initializeTagset(filename).
 * The same `filename` is handed to both the FSA and the tagset helper.
 *
 * @param filename file name forwarded to initializeFSA and initializeTagset.
 */
Morfeusz::Morfeusz(const string& filename)
    : fsa(initializeFSA(filename)),
      charsetConverter(initializeCharsetConverter()),
      tagset(initializeTagset(filename)) {
    // All work happens in the member initialisers above.
}
/**
 * Releases the automaton held by this instance.
 *
 * `fsa` and `charsetConverter` are pointer members (both are dereferenced
 * with `->` elsewhere in this file). The previous code applied `delete` to
 * the ADDRESS of each member (`delete &this->fsa`), i.e. to storage inside
 * this object that was never obtained from `new`, which is undefined
 * behaviour.
 *
 * NOTE(review): this assumes FSA::getFSA hands out a pointer that this
 * instance owns — confirm. Until a proper copy constructor exists, copying a
 * Morfeusz would make two objects delete the same FSA.
 */
Morfeusz::~Morfeusz() {
    // Delete the pointed-to automaton, not the member that stores the pointer.
    delete this->fsa;

    // charsetConverter is deliberately NOT deleted: initializeCharsetConverter()
    // returns one function-local static instance shared by every Morfeusz, so
    // deleting it here would leave the other instances (and any Morfeusz
    // created later) with a dangling pointer and cause a double delete.
}
void Morfeusz::processOneWord(
const char*& inputData,
const char* inputEnd,
const int startNodeNum,
std::vector<MorphInterpretation>& results) const {
vector<InterpretedChunk> accum;
FlexionGraph graph(startNodeNum);
const char* currInput = inputData;
doProcessOneWord(currInput, inputEnd, accum, graph);
graph.appendToResults(*this->tagset, results);
inputData = currInput;
}
void Morfeusz::doProcessOneWord(
const char*& inputData,
const char* inputEnd,
vector<InterpretedChunk>& accum,
FlexionGraph& graph) const {
const char* currInput = inputData;
StateType state = this->fsa->getInitialState();
int codepoint = this->charsetConverter->next(currInput, inputEnd);
if (!accum.empty() && isEndOfWord(codepoint)) {
graph.addPath(accum);
}
else
while (!isEndOfWord(codepoint)) {
this->feedState(state, codepoint);
codepoint = this->charsetConverter->next(currInput, inputEnd);
if (state.isAccepting()) {
for (InterpsGroup& ig : state.getValue()) {
InterpretedChunk ic = {inputData, currInput - inputData, ig};
accum.push_back(ic);
doProcessOneWord(currInput, inputEnd, accum, graph);
accum.pop_back();
}
}
}
}
/**
 * Advances `state` by one codepoint.
 *
 * The charset converter appends its representation of `codepoint` to a
 * temporary char buffer, and the state is then moved forward once per char
 * in that buffer, in order.
 *
 * @param state     automaton state to advance (modified in place).
 * @param codepoint codepoint handed to CharsetConverter::append.
 */
void Morfeusz::feedState(
        StateType& state,
        const int codepoint) const {
    vector<char> encoded;
    this->charsetConverter->append(codepoint, encoded);

    for (vector<char>::size_type i = 0; i < encoded.size(); ++i) {
        char c = encoded[i];
        state.proceedToNext(c);
    }
}
/**
 * Creates an iterator over the analysis results for `text`.
 *
 * The iterator is constructed from `text` and a reference to this object.
 * The ResultsIterator constructor in this file keeps `text.c_str()`, so
 * `text` has to outlive the returned iterator and must not be modified
 * while the iterator is in use.
 *
 * @param text input to analyse.
 * @return a ResultsIterator bound to `text` and to this Morfeusz.
 */
ResultsIterator Morfeusz::analyze(const string& text) {
    const Morfeusz& self = *this;
    return ResultsIterator(text, self);
}
void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) {
}
/**
 * Binds the iterator to an input text and an analyser.
 *
 * Stores `text.c_str()` in `rawInput` and `morfeusz` in the member of the
 * same name. Only the raw character pointer is kept, not a copy of the
 * string, so `text` must stay alive and unmodified for as long as
 * `rawInput` is used.
 *
 * @param text     input whose character buffer the iterator will read.
 * @param morfeusz analyser stored in the `morfeusz` member.
 */
ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
    : rawInput(text.c_str()),
      morfeusz(morfeusz) {
    // Members are fully set up by the initialiser list.
}
/**
 * Returns the next interpretation — not implemented yet.
 *
 * The function used to have no return statement at all; flowing off the end
 * of a function with a non-void return type is undefined behaviour. Until
 * the real implementation exists, calling it fails loudly instead.
 *
 * @return never returns normally.
 * @throws std::logic_error always.
 */
MorphInterpretation ResultsIterator::getNext() {
    // Sketch of the intended implementation, kept from the original source:
    // if (resultsBuffer.empty()) {
    // morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
    // }
    // startNode = resultsBuffer.back().getEndNode();
    // MorphInterpretation res = resultsBuffer.front();
    // resultsBuffer.pop_front();
    // return res;
    throw std::logic_error("ResultsIterator::getNext() is not implemented");
}
/**
 * Reports whether the iterator considers more results to be available.
 *
 * @return true only when the first character at `rawInput` is not the
 *         terminating '\0' AND `resultsBuffer` is empty; false otherwise.
 *
 * NOTE(review): with this condition a non-empty `resultsBuffer` yields
 * false, which looks inverted for a "has next" query — confirm the intended
 * logic once getNext() is implemented.
 */
bool ResultsIterator::hasNext() {
    // Exhausted input: the buffer is not consulted in this case.
    if (rawInput[0] == '\0') {
        return false;
    }
    return resultsBuffer.empty();
}
//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const {
// assert(inputPtr[0] != '\0');
// const char* start = inputPtr;
// StateType state = fsa->getInitialState();
// int currNodeNum = startNodeNum;
// int codepoint = this->charsetConverter->next(inputPtr, inputEnd);
// assert(!isEndOfWord(codepoint));
// while(!isEndOfWord(codepoint)) {
// feedState(state, codepoint);
// if (state.isAccepting()) {
// const char* currInputPtr = inputPtr;
// vector<EncodedInterpretation> startInterps = state.getValue();
// filterOutNonGluableInterps(startInterps);
// if (!startInterps.empty()) {
//
// }
// vector<EncodedInterpretation> additionalInterps;
// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps);
// if (!additionalInterps.empty()) {
// for (EncodedInterpretation& interp: state.getValue()) {
// interp.startNode = currNodeNum;
// interp.endNode = currNodeNum + 1;
// interps.push_back(interp);
// }
//
// }
// }
// }
//}