Blame view

morfeusz/Environment.cpp 5.61 KB
Michał Lenart authored
1
2
3
4
5
6
7
/* 
 * File:   Environment.cpp
 * Author: mlenart
 * 
 * Created on 22 January 2014, 12:08
 */
Michał Lenart authored
8
9
#include <vector>
#include <algorithm>
Michał Lenart authored
10
#include "Environment.hpp"
Michał Lenart authored
11
#include "decoder/InterpretedChunksDecoder.hpp"
Michał Lenart authored
12
#include "deserializer/MorphDeserializer.hpp"
Michał Lenart authored
13
#include "exceptions.hpp"
Michał Lenart authored
14
15
#include "decoder/InterpretedChunksDecoder4Analyzer.hpp"
#include "decoder/InterpretedChunksDecoder4Generator.hpp"
Michał Lenart authored
16
Michał Lenart authored
17
18
19
//class InterpretedChunksDecoder4Analyzer;
//class InterpretedChunksDecoder4Generator;
Michał Lenart authored
20
21
/**
 * Returns the deserializer to be used for the given processor type.
 *
 * Two MorphDeserializer instances are kept in function-local statics, one
 * for the analyzer and one for the generator. Both are created the first
 * time this function runs and this function never deletes them.
 *
 * @param processorType ANALYZER selects the analyzer instance; any other
 *                      value selects the generator instance.
 * @return reference to the selected deserializer.
 */
static Deserializer<InterpsGroupsReader>& initializeDeserializer(MorfeuszProcessorType processorType) {
    static Deserializer<InterpsGroupsReader>* forAnalyzer = new MorphDeserializer();
    static Deserializer<InterpsGroupsReader>* forGenerator = new MorphDeserializer();

    if (processorType == ANALYZER) {
        return *forAnalyzer;
    }
    return *forGenerator;
}

/**
 * Deletes every SegrulesFSA object stored as a value in the map and then
 * empties the map, so no dangling pointers remain in it.
 *
 * @param fsasMap map whose values are deleted; it is empty on return.
 */
static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
    typedef std::map<SegrulesOptions, SegrulesFSA*>::iterator EntryIterator;

    EntryIterator entry = fsasMap.begin();
    while (entry != fsasMap.end()) {
        delete entry->second;
        ++entry;
    }
    fsasMap.clear();
}
Michał Lenart authored
38
/**
 * Builds an environment on top of dictionary data that is already in memory.
 *
 * @param charset         charset whose converter becomes the current one.
 * @param processorType   ANALYZER selects the analyzer chunks decoder; any
 *                        other value selects the generator decoder.
 * @param fsaFileStartPtr start of the dictionary data; tagset, qualifiers,
 *                        the FSA, the separators list and the segmentation
 *                        rules are all read from it.
 *
 * isFromFile starts as false, so the destructor will not delete the buffer
 * passed in here.
 *
 * NOTE(review): currSegrulesFSA is computed from segrulesFSAsMap and the
 * decoders are handed *this before construction has finished; both rely on
 * the member declaration order in the header - confirm.
 */
Environment::Environment(
        MorfeuszCharset charset,
        MorfeuszProcessorType processorType,
        const unsigned char* fsaFileStartPtr)
        : currentCharsetConverter(getCharsetConverter(charset)),
          caseConverter(),
          tagset(fsaFileStartPtr),
          qualifiers(fsaFileStartPtr),
          fsaFileStartPtr(fsaFileStartPtr),
          fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
          separatorsList(getSeparatorsList(fsaFileStartPtr)),
          segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
          currSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)),
          currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap, fsaFileStartPtr)),
          isFromFile(false),
          chunksDecoder(
                  processorType == ANALYZER
                  ? static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Analyzer(*this))
                  : static_cast<InterpretedChunksDecoder*>(new InterpretedChunksDecoder4Generator(*this))),
          processorType(processorType),
          casePatternHelper(new CasePatternHelper()) {
}

/**
 * Maps a charset identifier to the shared converter instance for it.
 *
 * @param charset one of UTF8, ISO8859_2, CP1250 or CP852.
 * @return pointer to the converter obtained from the matching
 *         getInstance() call.
 * @throws MorfeuszException when charset is none of the values above.
 */
const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
    if (charset == UTF8) {
        return &UTF8CharsetConverter::getInstance();
    }
    if (charset == ISO8859_2) {
        return &ISO8859_2_CharsetConverter::getInstance();
    }
    if (charset == CP1250) {
        return &Windows_1250_CharsetConverter::getInstance();
    }
    if (charset == CP852) {
        return &CP852_CharsetConverter::getInstance();
    }
    throw MorfeuszException("invalid charset");
}

/**
 * Releases everything this environment allocated.
 *
 * The FSA, the chunks decoder and the case pattern helper are always
 * deleted. The segmentation-rule automata are now always deleted as well:
 * the constructor fills segrulesFSAsMap with createSegrulesFSAsMap(), the
 * same function whose results are deleted on the from-file path, so
 * skipping them when no file was ever loaded leaked every entry.
 *
 * The raw dictionary buffer is deleted only when it came from setFSAFile()
 * (isFromFile is true); a buffer passed to the constructor is left alone.
 */
Environment::~Environment() {
    delete this->fsa;
    // Automata go before the buffer they were built from.
    deleteSegrulesFSAs(this->segrulesFSAsMap);
    if (this->isFromFile) {
        // NOTE(review): if readFile<unsigned char>() allocates with new[],
        // this has to be delete[] - confirm against readFile's definition.
        delete this->fsaFileStartPtr;
    }
    delete this->chunksDecoder;
    delete this->casePatternHelper;
}

/**
 * Makes the converter for the given charset the current one.
 *
 * @param charset charset to switch to.
 * @throws MorfeuszException when the charset is not recognised (raised by
 *         the charset-to-converter lookup); the current converter is then
 *         left unchanged.
 */
void Environment::setCharset(MorfeuszCharset charset) {
    const CharsetConverter* selected = getCharsetConverter(charset);
    currentCharsetConverter = selected;
}

/**
 * @return the charset converter that is currently selected.
 */
const CharsetConverter& Environment::getCharsetConverter() const {
    return *currentCharsetConverter;
}
Michał Lenart authored
94
95
/**
 * @return the case converter owned by this environment.
 */
const CaseConverter& Environment::getCaseConverter() const {
    return caseConverter;
}
Michał Lenart authored
98
99
/**
 * Replaces the environment's tagset with a copy of the given one.
 *
 * @param tagset tagset to copy from.
 */
void Environment::setTagset(const Tagset& tagset) {
    // The parameter shadows the member, hence the explicit this->.
    (*this).tagset = tagset;
}
Michał Lenart authored
102
103
/**
 * @return the tagset currently held by this environment.
 */
const Tagset& Environment::getTagset() const {
    return tagset;
}
Michał Lenart authored
106
107
108
109
110
111
112
void Environment::setFSAFile(const std::string& filename) {
    if (this->isFromFile) {
        delete this->fsa;
        deleteSegrulesFSAs(this->segrulesFSAsMap);
        delete this->fsaFileStartPtr;
    }
    this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
Michał Lenart authored
113
    this->fsa = FSA< InterpsGroupsReader > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
Michał Lenart authored
114
    this->separatorsList = getSeparatorsList(fsaFileStartPtr);
Michał Lenart authored
115
    this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
Michał Lenart authored
116
    this->currSegrulesFSA = getDefaultSegrulesFSA(this->segrulesFSAsMap, this->fsaFileStartPtr);
Michał Lenart authored
117
    this->isFromFile = true;
Michał Lenart authored
118
    this->tagset = Tagset(fsaFileStartPtr);
Michał Lenart authored
119
    this->qualifiers = Qualifiers(fsaFileStartPtr);
Michał Lenart authored
120
121
}
Michał Lenart authored
122
123
124
125
126
127
128
129
130
131
/**
 * @return the segmentation-rules automaton that is currently selected.
 */
const SegrulesFSA& Environment::getCurrentSegrulesFSA() const {
    return *currSegrulesFSA;
}

/**
 * @return the FSA currently held by this environment.
 */
const FSAType& Environment::getFSA() const {
    return *fsa;
}

/**
 * @return the chunks decoder created for this environment's processor type.
 */
const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const {
    return *chunksDecoder;
}
Michał Lenart authored
133
134
135
136
137
138
139
140
141
142
143
144
145

void Environment::setSegrulesOption(const std::string& option, const std::string& value) {
    if (this->currSegrulesOptions.find(option) == this->currSegrulesOptions.end()) {
        throw MorfeuszException("Invalid segmentation option '"+option+"'");
    }
    SegrulesOptions prevOptions = this->currSegrulesOptions;
    this->currSegrulesOptions[option] = value;
    if (this->segrulesFSAsMap.find(this->currSegrulesOptions) == this->segrulesFSAsMap.end()) {
        this->currSegrulesOptions = prevOptions;
        throw MorfeuszException("Invalid '"+option+"' option value: '"+value+"'");
    }
    this->currSegrulesFSA = this->segrulesFSAsMap.find(this->currSegrulesOptions)->second;
}
Michał Lenart authored
146
147
148
149

/**
 * @return the processor type this environment was constructed with.
 */
MorfeuszProcessorType Environment::getProcessorType() const {
    return processorType;
}
Michał Lenart authored
150
151

void Environment::setCaseSensitive(bool caseSensitive) {
Michał Lenart authored
152
    this->casePatternHelper->setCaseSensitive(caseSensitive);
Michał Lenart authored
153
154
155
}

/**
 * @return the case pattern helper owned by this environment.
 */
const CasePatternHelper& Environment::getCasePatternHelper() const {
    return *casePatternHelper;
}
Michał Lenart authored
158
Michał Lenart authored
159
160
161
162
/**
 * @return the qualifiers object currently held by this environment.
 */
const Qualifiers& Environment::getQualifiersHelper() const {
    return qualifiers;
}
Michał Lenart authored
163
164
165
166
167
168
/**
 * Tells whether the given code point is present in the separators list.
 *
 * The lookup is a binary search, so it is only correct while
 * separatorsList is sorted in ascending order.
 * NOTE(review): presumably getSeparatorsList() returns sorted data - confirm.
 *
 * The call is spelled std::binary_search explicitly so that it does not
 * depend on argument-dependent lookup or on a using-directive pulled in by
 * some header.
 *
 * @param codepoint code point to look for.
 * @return true when codepoint is found in the separators list.
 */
bool Environment::isSeparator(uint32_t codepoint) const {
    return std::binary_search(
            this->separatorsList.begin(),
            this->separatorsList.end(),
            codepoint);
}