|
1
2
3
4
5
6
7
8
9
10
|
/*
* File: Environment.hpp
* Author: mlenart
*
* Created on 22 January 2014, 12:08
*/
#ifndef ENVIRONMENT_HPP
#define ENVIRONMENT_HPP
|
|
11
12
|
#include <stdint.h>

#include <map>
#include <string>
#include <vector>

#include "case/CaseConverter.hpp"
#include "charset/CharsetConverter.hpp"
#include "fsa/fsa.hpp"
#include "segrules/segrules.hpp"
#include "const.hpp"
#include "Tagset.hpp"
#include "InterpsGroup.hpp"
#include "case/CasePatternHelper.hpp"
#include "Qualifiers.hpp"
#include "deserialization/InterpsGroupsReader.hpp"
|
|
23
|
|
|
24
25
26
27
|
namespace morfeusz {
class InterpretedChunksDecoder;
class CasePatternHelper;
|
|
28
|
struct InterpsGroup;
|
|
29
|
typedef FSA<InterpsGroupsReader> FSAType;
|
|
30
|
|
|
31
32
33
34
35
|
/**
* This class contains data required for morphological analysis/synthesis.
* It contains references to dictionary automaton, charset converter, tagset data etc.
* All of these can be changed by setters, changing Morfeusz behavior (different dictionary, charset, and other options).
*/
|
|
36
37
|
class Environment {
public:
|
|
38
39
40
41
42
43
44
|
/**
* Creates default environment with given initial charset, processor type (analyzer/generator) and default dictionary data ptr.
*
* @param charset
* @param morfeuszProcessor
* @param fileStartPtr
*/
|
|
45
|
Environment(
|
|
46
47
|
MorfeuszCharset charset,
MorfeuszProcessorType morfeuszProcessor,
|
|
48
49
|
const unsigned char* fileStartPtr);
|
|
50
51
52
53
54
|
/**
* Sets charset for this environment.
*
* @param charset
*/
|
|
55
|
void setCharset(MorfeuszCharset charset);
|
|
56
|
|
|
57
58
59
60
61
|
/**
* Sets case sensitivity options.
*
* @param caseSensitive - if true, interpretations not matching case will be discarded.
*/
|
|
62
63
|
void setCaseSensitive(bool caseSensitive);
|
|
64
65
66
67
68
69
|
/**
* Gets charset converter that is currently used by this environment.
* Changed by setting charset.
*
* @return - reference to charset converter.
*/
|
|
70
71
|
const CharsetConverter& getCharsetConverter() const;
|
|
72
73
74
75
76
77
|
/**
* Returns case converter that is currently used by this environment.
* Changed by setting case sensitivity option.
*
* @return - reference to case converter.
*/
|
|
78
|
const CaseConverter& getCaseConverter() const;
|
|
79
|
|
|
80
81
82
83
84
|
/**
* Sets new tagset for this environment.
*
* @param tagset
*/
|
|
85
|
void setTagset(const Tagset& tagset);
|
|
86
87
88
89
90
91
|
/**
* Gets currently used tagset.
*
* @return
*/
|
|
92
|
const Tagset& getTagset() const;
|
|
93
|
|
|
94
95
96
97
98
|
/**
* Sets binary dictionary file used by this environment.
*
* @param filename - filename of the dictionary
*/
|
|
99
100
|
void setFSAFile(const std::string& filename);
|
|
101
102
103
104
105
106
|
/**
* Sets segmentation rules option.
*
* @param option
* @param value
*/
|
|
107
108
|
void setSegrulesOption(const std::string& option, const std::string& value);
|
|
109
110
111
112
113
|
/**
* Gets segmentation rules automaton.
*
* @return
*/
|
|
114
115
|
const SegrulesFSA& getCurrentSegrulesFSA() const;
|
|
116
117
118
119
120
|
/**
* Gets dictionary automaton.
*
* @return
*/
|
|
121
122
|
const FSAType& getFSA() const;
|
|
123
124
125
126
|
/**
* Returns decoder that converts interpretations to external format.
* @return
*/
|
|
127
|
const InterpretedChunksDecoder& getInterpretedChunksDecoder() const;
|
|
128
|
|
|
129
130
131
132
|
/**
* Gets processor type (info if this is analyzer or generator environment)
* @return
*/
|
|
133
134
|
MorfeuszProcessorType getProcessorType() const;
|
|
135
136
137
138
139
|
/**
* Return current case pattern helper
*
* @return
*/
|
|
140
141
|
const CasePatternHelper& getCasePatternHelper() const;
|
|
142
143
144
145
|
/**
* Return current qualifiers helper.
* @return
*/
|
|
146
147
|
const Qualifiers& getQualifiersHelper() const;
|
|
148
149
150
151
152
|
/**
* Returns true iff given codepoint denotes a separator char for ign handling.
* @param codepoint
* @return
*/
|
|
153
154
|
bool isSeparator(uint32_t codepoint) const;
|
|
155
156
157
158
|
virtual ~Environment();
private:
const CharsetConverter* currentCharsetConverter;
const CaseConverter caseConverter;
|
|
159
|
Tagset tagset;
|
|
160
|
Qualifiers qualifiers;
|
|
161
162
163
|
const unsigned char* fsaFileStartPtr;
const FSAType* fsa;
|
|
164
|
std::vector<uint32_t> separatorsList;
|
|
165
|
std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
|
|
166
|
SegrulesOptions currSegrulesOptions;
|
|
167
168
169
170
|
const SegrulesFSA* currSegrulesFSA;
bool isFromFile;
const InterpretedChunksDecoder* chunksDecoder;
|
|
171
|
MorfeuszProcessorType processorType;
|
|
172
|
CasePatternHelper* casePatternHelper;
|
|
173
174
175
176
|
const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
};
|
|
177
178
|
}
|
|
179
180
|
#endif /* ENVIRONMENT_HPP */
|