/*
 * File:   Environment.hpp
 * Author: mlenart
 *
 * Created on 22 January 2014, 12:08
 */

#ifndef ENVIRONMENT_HPP
#define	ENVIRONMENT_HPP

#include <stdint.h>

#include <map>
#include <string>
#include <vector>

#include "case/CaseConverter.hpp"
#include "charset/CharsetConverter.hpp"
#include "fsa/fsa.hpp"
#include "segrules/segrules.hpp"
#include "const.hpp"
#include "Tagset.hpp"
#include "InterpsGroup.hpp"
#include "case/CasePatternHelper.hpp"
#include "Qualifiers.hpp"
#include "deserialization/InterpsGroupsReader.hpp"

namespace morfeusz {

class InterpretedChunksDecoder;
class CasePatternHelper;
Michał Lenart authored
28
struct InterpsGroup;
Michał Lenart authored
29
typedef FSA<InterpsGroupsReader> FSAType;
Michał Lenart authored
30
Michał Lenart authored
31
32
33
34
35
/**
 * This class contains data required for morphological analysis/synthesis.
 * It contains references to dictionary automaton, charset converter, tagset data etc.
 * All of these can be changed by setters, changing Morfeusz behavior (different dictionary, charset, and other options).
 */
Michał Lenart authored
36
37
class Environment {
public:
Michał Lenart authored
38
39
40
41
42
43
44
    /**
     * Creates default environment with given initial charset, processor type (analyzer/generator) and default dictionary data ptr.
     * 
     * @param charset
     * @param morfeuszProcessor
     * @param fileStartPtr
     */
Michał Lenart authored
45
    Environment(
Michał Lenart authored
46
47
            MorfeuszCharset charset,
            MorfeuszProcessorType morfeuszProcessor,
Michał Lenart authored
48
49
            const unsigned char* fileStartPtr);
Michał Lenart authored
50
51
52
53
54
    /**
     * Sets charset for this environment.
     * 
     * @param charset
     */
Michał Lenart authored
55
    void setCharset(MorfeuszCharset charset);
Michał Lenart authored
56
Michał Lenart authored
57
58
59
60
61
    /**
     * Sets case sensitivity options.
     * 
     * @param caseSensitive - if true, interpretations not matching case will be discarded.
     */
Michał Lenart authored
62
63
    void setCaseSensitive(bool caseSensitive);
Michał Lenart authored
64
65
66
67
68
69
    /**
     * Gets charset converter that is currently used by this environment.
     * Changed by setting charset.
     * 
     * @return - reference to charset converter.
     */
Michał Lenart authored
70
71
    const CharsetConverter& getCharsetConverter() const;
Michał Lenart authored
72
73
74
75
76
77
    /**
     * Returns case converter that is currently used by this environment.
     * Changed by setting case sensitivity option.
     * 
     * @return - reference to case converter.
     */
Michał Lenart authored
78
    const CaseConverter& getCaseConverter() const;
Michał Lenart authored
79
Michał Lenart authored
80
81
82
83
84
    /**
     * Sets new tagset for this environment.
     * 
     * @param tagset
     */
Michał Lenart authored
85
    void setTagset(const Tagset& tagset);
Michał Lenart authored
86
87
88
89
90
91

    /**
     * Gets currently used tagset.
     * 
     * @return 
     */
Michał Lenart authored
92
    const Tagset& getTagset() const;
Michał Lenart authored
93
Michał Lenart authored
94
95
96
97
98
    /**
     * Sets binary dictionary file used by this environment.
     * 
     * @param filename - filename of the dictionary
     */
Michał Lenart authored
99
100
    void setFSAFile(const std::string& filename);
Michał Lenart authored
101
102
103
104
105
106
    /**
     * Sets segmentation rules option.
     * 
     * @param option
     * @param value
     */
Michał Lenart authored
107
108
    void setSegrulesOption(const std::string& option, const std::string& value);
Michał Lenart authored
109
110
111
112
113
    /**
     * Gets segmentation rules automaton.
     * 
     * @return 
     */
Michał Lenart authored
114
115
    const SegrulesFSA& getCurrentSegrulesFSA() const;
Michał Lenart authored
116
117
118
119
120
    /**
     * Gets dictionary automaton.
     * 
     * @return 
     */
Michał Lenart authored
121
122
    const FSAType& getFSA() const;
Michał Lenart authored
123
124
125
126
    /**
     * Returns decoder that converts interpretations to external format.
     * @return 
     */
Michał Lenart authored
127
    const InterpretedChunksDecoder& getInterpretedChunksDecoder() const;
Michał Lenart authored
128
Michał Lenart authored
129
130
131
132
    /**
     * Gets processor type (info if this is analyzer or generator environment)
     * @return 
     */
Michał Lenart authored
133
134
    MorfeuszProcessorType getProcessorType() const;
Michał Lenart authored
135
136
137
138
139
    /**
     * Return current case pattern helper
     * 
     * @return 
     */
Michał Lenart authored
140
141
    const CasePatternHelper& getCasePatternHelper() const;
Michał Lenart authored
142
143
144
145
    /**
     * Return current qualifiers helper.
     * @return 
     */
Michał Lenart authored
146
147
    const Qualifiers& getQualifiersHelper() const;
Michał Lenart authored
148
149
150
151
152
    /**
     * Returns true iff given codepoint denotes a separator char for ign handling.
     * @param codepoint
     * @return 
     */
Michał Lenart authored
153
154
    bool isSeparator(uint32_t codepoint) const;
Michał Lenart authored
155
156
157
158
    virtual ~Environment();
private:
    const CharsetConverter* currentCharsetConverter;
    const CaseConverter caseConverter;
Michał Lenart authored
159
    Tagset tagset;
Michał Lenart authored
160
    Qualifiers qualifiers;
Michał Lenart authored
161
162
163

    const unsigned char* fsaFileStartPtr;
    const FSAType* fsa;
Michał Lenart authored
164
    std::vector<uint32_t> separatorsList;
Michał Lenart authored
165
    std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
Michał Lenart authored
166
    SegrulesOptions currSegrulesOptions;
Michał Lenart authored
167
168
169
170
    const SegrulesFSA* currSegrulesFSA;
    bool isFromFile;

    const InterpretedChunksDecoder* chunksDecoder;
Michał Lenart authored
171
    MorfeuszProcessorType processorType;
Michał Lenart authored
172
    CasePatternHelper* casePatternHelper;
Michał Lenart authored
173
174
175
176

    const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
};
Michał Lenart authored
177
178
}

#endif	/* ENVIRONMENT_HPP */