Morfeusz.cpp
2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
/*
* File: Morfeusz.cpp
* Author: mlenart
*
* Created on November 13, 2013, 5:21 PM
*/
#include <string>
#include "fsa.hpp"
#include "utils.hpp"
#include "Morfeusz.hpp"
#include "MorphDeserializer.hpp"
#include "charset/CharsetConverter.hpp"
using namespace std;
static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) {
static Deserializer < vector < InterpsGroup >> *deserializer
= new MorphDeserializer();
return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer);
}
static CharsetConverter* initializeCharsetConverter() {
static CharsetConverter* converter = new UTF8CharsetConverter();
return converter;
}
Morfeusz::Morfeusz(const string& filename)
: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) {
}
//Morfeusz::Morfeusz(const Morfeusz& orig) {
//}
Morfeusz::~Morfeusz() {
delete &this->fsa;
}
ResultsIterator Morfeusz::analyze(const std::string& text) {
// const char* textStart = text.c_str();
// const char* textEnd = text.c_str() + text.length();
return ResultsIterator(text, *this);
}
ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
: rawInput(text.c_str()),
morfeusz(morfeusz) {
}
MorphInterpretation ResultsIterator::getNext() {
// if (resultsBuffer.empty()) {
// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
// }
// startNode = resultsBuffer.back().getEndNode();
// MorphInterpretation res = resultsBuffer.front();
// resultsBuffer.pop_front();
// return res;
}
bool ResultsIterator::hasNext() {
return rawInput[0] != '\0' && resultsBuffer.empty();
}
//int Morfeusz::doProcessOneWord(const char*& inputPtr, const char* inputEnd, int startNodeNum, std::vector<EncodedInterpretation>& interps) const {
// assert(inputPtr[0] != '\0');
// const char* start = inputPtr;
// StateType state = fsa->getInitialState();
// int currNodeNum = startNodeNum;
// int codepoint = this->charsetConverter->next(inputPtr, inputEnd);
// assert(!isEndOfWord(codepoint));
// while(!isEndOfWord(codepoint)) {
// feedState(state, codepoint);
// if (state.isAccepting()) {
// const char* currInputPtr = inputPtr;
// vector<EncodedInterpretation> startInterps = state.getValue();
// filterOutNonGluableInterps(startInterps);
// if (!startInterps.empty()) {
//
// }
// vector<EncodedInterpretation> additionalInterps;
// int nextNodeNum = doProcessOneWord(currInputPtr, inputEnd, currNodeNum + 1, additionalInterps);
// if (!additionalInterps.empty()) {
// for (EncodedInterpretation& interp: state.getValue()) {
// interp.startNode = currNodeNum;
// interp.endNode = currNodeNum + 1;
// interps.push_back(interp);
// }
//
// }
// }
// }
//}