Blame view

morfeusz/Environment.cpp 5.59 KB
Michał Lenart authored
1
2
3
4
5
6
7
/* 
 * File:   Environment.cpp
 * Author: mlenart
 * 
 * Created on 22 styczeń 2014, 12:08
 */
Michał Lenart authored
8
9
#include <vector>
#include <algorithm>
Michał Lenart authored
10
#include "Environment.hpp"
Michał Lenart authored
11
#include "deserialization/MorphDeserializer.hpp"
Michał Lenart authored
12
#include "exceptions.hpp"
Michał Lenart authored
13
14
15
#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp"
Michał Lenart authored
16
Michał Lenart authored
17
18
namespace morfeusz {
Michał Lenart authored
19
20
/**
 * Returns the deserializer used when building the FSA for the given
 * processor type.
 *
 * Two separate MorphDeserializer instances are kept, one handed out for
 * ANALYZER and one for every other processor type. Both are function-local
 * statics created on the first call and never deleted.
 */
static Deserializer<InterpsGroupsReader>& initializeDeserializer(MorfeuszProcessorType processorType) {
    static Deserializer<InterpsGroupsReader>* forAnalyzer = new MorphDeserializer();
    static Deserializer<InterpsGroupsReader>* forGenerator = new MorphDeserializer();
    if (processorType == ANALYZER) {
        return *forAnalyzer;
    }
    return *forGenerator;
}

/**
 * Deletes every SegrulesFSA pointed to by the map's values and then empties
 * the map, so no stale pointers remain in it afterwards.
 */
static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
    typedef std::map<SegrulesOptions, SegrulesFSA*>::iterator FsaEntryIterator;

    FsaEntryIterator entry = fsasMap.begin();
    while (entry != fsasMap.end()) {
        delete entry->second;
        ++entry;
    }
    fsasMap.clear();
}
Michał Lenart authored
37
/**
 * Builds an environment on top of an in-memory dictionary image.
 *
 * @param charset          initial charset; resolved through getCharsetConverter(Charset)
 * @param processorType    ANALYZER selects the analyzer chunks decoder, any
 *                         other value selects the generator one
 * @param fsaFileStartPtr  start of the dictionary data. isFromFile starts out
 *                         false, so the destructor does not free this buffer.
 * @throws MorfeuszException if the charset is not recognised
 */
Environment::Environment(
        Charset charset,
        MorfeuszProcessorType processorType,
        const unsigned char* fsaFileStartPtr)
    : currentCharsetConverter(getCharsetConverter(charset)),
      caseConverter(),
      tagset(fsaFileStartPtr),
      qualifiers(fsaFileStartPtr),
      fsaFileStartPtr(fsaFileStartPtr),
      fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
      separatorsList(getSeparatorsList(fsaFileStartPtr)),
      segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
      currSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)),
      // Reads the segrulesFSAsMap member initialised just above.
      // NOTE(review): this relies on segrulesFSAsMap being declared before
      // currSegrulesFSA in the class - confirm in Environment.hpp.
      currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap, fsaFileStartPtr)),
      isFromFile(false),
      // Both decoders receive *this while the object is still under construction.
      chunksDecoder(
              processorType == ANALYZER
                  ? static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Analyzer(*this))
                  : static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Generator(*this))),
      processorType(processorType),
      casePatternHelper(new CasePatternHelper()) {
}
Michał Lenart authored
60
/**
 * Maps a Charset value to the corresponding converter instance.
 *
 * @param charset one of UTF8, ISO8859_2, CP1250, CP852
 * @return pointer to the object returned by the converter's getInstance()
 * @throws MorfeuszException ("invalid charset") for any other value
 */
const CharsetConverter* Environment::getCharsetConverter(Charset charset) const {
    const CharsetConverter* converter = 0;
    switch (charset) {
        case UTF8:
            converter = &UTF8CharsetConverter::getInstance();
            break;
        case ISO8859_2:
            converter = &ISO8859_2_CharsetConverter::getInstance();
            break;
        case CP1250:
            converter = &Windows_1250_CharsetConverter::getInstance();
            break;
        case CP852:
            converter = &CP852_CharsetConverter::getInstance();
            break;
        default:
            throw MorfeuszException("invalid charset");
    }
    return converter;
}

/**
 * Releases the resources held by the environment.
 *
 * The FSA, the chunks decoder and the case pattern helper are always deleted.
 * The segmentation-rules FSAs and the dictionary buffer are released only
 * when the dictionary was loaded through setDictionaryFile() (isFromFile).
 */
Environment::~Environment() {
    delete fsa;
    if (isFromFile) {
        // NOTE(review): the segrules FSAs built by the constructor are not
        // freed when isFromFile is false - confirm whether that is intended.
        deleteSegrulesFSAs(segrulesFSAsMap);
        // NOTE(review): the buffer comes from readFile<unsigned char>(); if
        // that allocates with new[], this should be delete[] - confirm.
        delete fsaFileStartPtr;
    }
    delete chunksDecoder;
    delete casePatternHelper;
}
Michał Lenart authored
85
/**
 * Switches the active charset converter.
 *
 * @throws MorfeuszException if the charset is not recognised; the previously
 *         selected converter stays in place in that case, because the lookup
 *         happens before the assignment.
 */
void Environment::setCharset(Charset charset) {
    const CharsetConverter* selected = getCharsetConverter(charset);
    currentCharsetConverter = selected;
}

/** Returns the currently selected charset converter. */
const CharsetConverter& Environment::getCharsetConverter() const {
    return *currentCharsetConverter;
}
Michał Lenart authored
93
94
/** Returns a reference to the case converter held by this environment. */
const CaseConverter& Environment::getCaseConverter() const {
    return caseConverter;
}
Michał Lenart authored
97
98
/** Replaces the environment's tagset with a copy of the given one. */
void Environment::setTagset(const Tagset& tagset) {
    // The parameter shadows the member, hence the explicit this->.
    this->tagset = tagset;
}
Michał Lenart authored
101
102
/** Returns a reference to the tagset currently held by this environment. */
const Tagset& Environment::getTagset() const {
    return tagset;
}
Michał Lenart authored
105
void Environment::setDictionaryFile(const std::string& filename) {
Michał Lenart authored
106
107
108
109
110
111
    if (this->isFromFile) {
        delete this->fsa;
        deleteSegrulesFSAs(this->segrulesFSAsMap);
        delete this->fsaFileStartPtr;
    }
    this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
Michał Lenart authored
112
    this->fsa = FSA< InterpsGroupsReader > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
Michał Lenart authored
113
    this->separatorsList = getSeparatorsList(fsaFileStartPtr);
Michał Lenart authored
114
    this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
Michał Lenart authored
115
    this->currSegrulesFSA = getDefaultSegrulesFSA(this->segrulesFSAsMap, this->fsaFileStartPtr);
Michał Lenart authored
116
    this->isFromFile = true;
Michał Lenart authored
117
    this->tagset = Tagset(fsaFileStartPtr);
Michał Lenart authored
118
    this->qualifiers = Qualifiers(fsaFileStartPtr);
Michał Lenart authored
119
120
}
Michał Lenart authored
121
122
123
124
125
126
127
128
129
130
/** Returns the segmentation-rules FSA selected by the current options. */
const SegrulesFSA& Environment::getCurrentSegrulesFSA() const {
    return *currSegrulesFSA;
}

/** Returns the FSA built from the current dictionary data. */
const FSAType& Environment::getFSA() const {
    return *fsa;
}

/**
 * Returns the chunks decoder created in the constructor (the analyzer or
 * generator variant, depending on the processor type).
 */
const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const {
    return *chunksDecoder;
}
Michał Lenart authored
132
133
134
135
136
137
138
139
140
141
142
143
144

void Environment::setSegrulesOption(const std::string& option, const std::string& value) {
    if (this->currSegrulesOptions.find(option) == this->currSegrulesOptions.end()) {
        throw MorfeuszException("Invalid segmentation option '"+option+"'");
    }
    SegrulesOptions prevOptions = this->currSegrulesOptions;
    this->currSegrulesOptions[option] = value;
    if (this->segrulesFSAsMap.find(this->currSegrulesOptions) == this->segrulesFSAsMap.end()) {
        this->currSegrulesOptions = prevOptions;
        throw MorfeuszException("Invalid '"+option+"' option value: '"+value+"'");
    }
    this->currSegrulesFSA = this->segrulesFSAsMap.find(this->currSegrulesOptions)->second;
}
Michał Lenart authored
145
146
147
148

/** Returns the processor type this environment was constructed with. */
MorfeuszProcessorType Environment::getProcessorType() const {
    return processorType;
}
Michał Lenart authored
149
150

void Environment::setCaseSensitive(bool caseSensitive) {
Michał Lenart authored
151
    this->casePatternHelper->setCaseSensitive(caseSensitive);
Michał Lenart authored
152
153
154
}

/** Returns the case pattern helper owned by this environment. */
const CasePatternHelper& Environment::getCasePatternHelper() const {
    return *casePatternHelper;
}
Michał Lenart authored
157
Michał Lenart authored
158
159
160
161
/** Returns a reference to the qualifiers read from the current dictionary data. */
const Qualifiers& Environment::getQualifiersHelper() const {
    return qualifiers;
}
Michał Lenart authored
162
163
164
165
166
167
bool Environment::isSeparator(uint32_t codepoint) const {
    return binary_search(
            this->separatorsList.begin(), 
            this->separatorsList.end(), 
            codepoint);
}
Michał Lenart authored
168
169

}