|
1
2
3
4
5
6
7
|
/*
 * File:   Environment.cpp
 * Author: mlenart
 *
 * Created on 22 January 2014, 12:08
 */
|
|
8
9
|
#include <vector>
#include <algorithm>
|
|
10
|
#include "Environment.hpp"
|
|
11
|
#include "deserialization/MorphDeserializer.hpp"
|
|
12
|
#include "exceptions.hpp"
|
|
13
14
15
|
#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp"
|
|
16
|
|
|
17
18
|
namespace morfeusz {
|
|
19
20
|
/**
 * Returns the deserializer to be used with the main automaton for the given
 * processor type.
 *
 * Both deserializers are function-local statics created on the first call and
 * never deleted, so the returned reference does not dangle.
 *
 * @param processorType ANALYZER selects the analyzer instance, any other
 *                      value selects the generator instance
 * @return reference to the selected deserializer
 */
static Deserializer<InterpsGroupsReader>& initializeDeserializer(MorfeuszProcessorType processorType) {
    static Deserializer<InterpsGroupsReader>* forAnalyzer = new MorphDeserializer();
    static Deserializer<InterpsGroupsReader>* forGenerator = new MorphDeserializer();
    if (processorType == ANALYZER) {
        return *forAnalyzer;
    }
    return *forGenerator;
}
/**
 * Deletes every SegrulesFSA object the map points to and then empties the map.
 *
 * @param fsasMap map whose mapped pointers are deleted; it is empty on return
 */
static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
    typedef std::map<SegrulesOptions, SegrulesFSA*>::iterator FSAsIterator;
    FSAsIterator curr = fsasMap.begin();
    while (curr != fsasMap.end()) {
        delete curr->second;
        ++curr;
    }
    // Drop the entries so that no deleted pointer stays reachable.
    fsasMap.clear();
}
|
|
37
|
/**
 * Creates an environment working on dictionary data that is already in memory.
 *
 * Everything derived from the dictionary (tagset, qualifiers, main automaton,
 * separators list, segmentation rules automata and their default options) is
 * built from fsaFileStartPtr. isFromFile starts as false, so the destructor
 * will not delete this buffer nor the segmentation rules automata.
 *
 * @param charset         charset whose converter becomes the current one
 * @param processorType   ANALYZER selects the analyzer chunks decoder, any
 *                        other value selects the generator one
 * @param fsaFileStartPtr start of the in-memory dictionary data
 * @throws MorfeuszException when charset is not one of the supported values
 */
Environment::Environment(
        Charset charset,
        MorfeuszProcessorType processorType,
        const unsigned char* fsaFileStartPtr)
: currentCharsetConverter(getCharsetConverter(charset)),
  caseConverter(),
  tagset(fsaFileStartPtr),
  qualifiers(fsaFileStartPtr),
  fsaFileStartPtr(fsaFileStartPtr),
  fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
  separatorsList(getSeparatorsList(fsaFileStartPtr)),
  segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
  currSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)),
  currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap, fsaFileStartPtr)),
  isFromFile(false),
  // NOTE(review): the decoder receives *this while the object is still being
  // constructed - presumably it only stores the reference; confirm.
  chunksDecoder(
        processorType == ANALYZER
        ? static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Analyzer(*this))
        : static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Generator(*this))),
  processorType(processorType),
  casePatternHelper(new CasePatternHelper()) {
}
|
|
60
|
/**
 * Maps a charset identifier to the matching converter instance.
 *
 * @param charset one of UTF8, ISO8859_2, CP1250, CP852
 * @return pointer to the converter obtained from its getInstance()
 * @throws MorfeuszException for any other charset value
 */
const CharsetConverter* Environment::getCharsetConverter(Charset charset) const {
    if (charset == UTF8) {
        return &UTF8CharsetConverter::getInstance();
    }
    if (charset == ISO8859_2) {
        return &ISO8859_2_CharsetConverter::getInstance();
    }
    if (charset == CP1250) {
        return &Windows_1250_CharsetConverter::getInstance();
    }
    if (charset == CP852) {
        return &CP852_CharsetConverter::getInstance();
    }
    throw MorfeuszException("invalid charset");
}
/**
 * Releases the objects owned by this environment.
 *
 * The main automaton, the chunks decoder and the case pattern helper are
 * always deleted. The segmentation rules automata and the dictionary buffer
 * are deleted only when isFromFile is set.
 */
Environment::~Environment() {
    delete fsa;
    if (isFromFile) {
        deleteSegrulesFSAs(segrulesFSAsMap);
        // NOTE(review): plain delete - verify that readFile() does not
        // allocate the buffer with new[].
        delete fsaFileStartPtr;
    }
    delete chunksDecoder;
    delete casePatternHelper;
}
|
|
85
|
/**
 * Switches the current charset converter to the one matching the charset.
 *
 * @param charset charset to use from now on
 * @throws MorfeuszException when the charset is not supported; the current
 *         converter is left unchanged in that case
 */
void Environment::setCharset(Charset charset) {
    const CharsetConverter* converter = getCharsetConverter(charset);
    currentCharsetConverter = converter;
}
/**
 * @return the charset converter that is currently selected
 */
const CharsetConverter& Environment::getCharsetConverter() const {
    return *currentCharsetConverter;
}
|
|
93
94
|
/**
 * @return the case converter held by this environment
 */
const CaseConverter& Environment::getCaseConverter() const {
    return caseConverter;
}
|
|
97
98
|
/**
 * Replaces the tagset of this environment with a copy of the given one.
 *
 * @param tagset tagset to copy from
 */
void Environment::setTagset(const Tagset& tagset) {
    // The parameter hides the member of the same name, hence the explicit this.
    (*this).tagset = tagset;
}
|
|
101
102
|
/**
 * @return the tagset currently held by this environment
 */
const Tagset& Environment::getTagset() const {
    return tagset;
}
|
|
105
|
void Environment::setDictionaryFile(const std::string& filename) {
|
|
106
107
108
109
110
111
|
if (this->isFromFile) {
delete this->fsa;
deleteSegrulesFSAs(this->segrulesFSAsMap);
delete this->fsaFileStartPtr;
}
this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
|
|
112
|
this->fsa = FSA< InterpsGroupsReader > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
|
|
113
|
this->separatorsList = getSeparatorsList(fsaFileStartPtr);
|
|
114
|
this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
|
|
115
|
this->currSegrulesFSA = getDefaultSegrulesFSA(this->segrulesFSAsMap, this->fsaFileStartPtr);
|
|
116
|
this->isFromFile = true;
|
|
117
|
this->tagset = Tagset(fsaFileStartPtr);
|
|
118
|
this->qualifiers = Qualifiers(fsaFileStartPtr);
|
|
119
120
|
}
|
|
121
122
123
124
125
126
127
128
129
130
|
/**
 * @return the segmentation rules automaton that is currently selected
 */
const SegrulesFSA& Environment::getCurrentSegrulesFSA() const {
    return *currSegrulesFSA;
}
/**
 * @return the main automaton of the current dictionary
 */
const FSAType& Environment::getFSA() const {
    return *fsa;
}
/**
 * @return the chunks decoder created in the constructor for this
 *         environment's processor type
 */
const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const {
    return *chunksDecoder;
}
|
|
132
133
134
135
136
137
138
139
140
141
142
143
144
|
void Environment::setSegrulesOption(const std::string& option, const std::string& value) {
if (this->currSegrulesOptions.find(option) == this->currSegrulesOptions.end()) {
throw MorfeuszException("Invalid segmentation option '"+option+"'");
}
SegrulesOptions prevOptions = this->currSegrulesOptions;
this->currSegrulesOptions[option] = value;
if (this->segrulesFSAsMap.find(this->currSegrulesOptions) == this->segrulesFSAsMap.end()) {
this->currSegrulesOptions = prevOptions;
throw MorfeuszException("Invalid '"+option+"' option value: '"+value+"'");
}
this->currSegrulesFSA = this->segrulesFSAsMap.find(this->currSegrulesOptions)->second;
}
|
|
145
146
147
148
|
/**
 * @return the processor type this environment was constructed with
 */
MorfeuszProcessorType Environment::getProcessorType() const {
    return processorType;
}
|
|
149
150
|
void Environment::setCaseSensitive(bool caseSensitive) {
|
|
151
|
this->casePatternHelper->setCaseSensitive(caseSensitive);
|
|
152
153
154
|
}
/**
 * @return the case pattern helper owned by this environment
 */
const CasePatternHelper& Environment::getCasePatternHelper() const {
    return *casePatternHelper;
}
|
|
157
|
|
|
158
159
160
161
|
/**
 * @return the qualifiers built from the current dictionary data
 */
const Qualifiers& Environment::getQualifiersHelper() const {
    return qualifiers;
}
|
|
162
163
164
165
166
167
|
bool Environment::isSeparator(uint32_t codepoint) const {
return binary_search(
this->separatorsList.begin(),
this->separatorsList.end(),
codepoint);
}
|
|
168
169
|
}
|