morphanalyser.h
6.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#ifndef LIBMACA_MORPHANALYSER_H
#define LIBMACA_MORPHANALYSER_H
#include <libtoki/sentence.h>
#include <libmaca/util/confignode.h>
#include <libcorpus2/sentence.h>
#include <libcorpus2/token.h>
#include <libcorpus2/tagset.h>
#include <boost/function.hpp>
#include <loki/Factory.h>
#include <loki/Singleton.h>
namespace Maca {
/**
 * Base class for morphological analysers.
 *
 * A MorphAnalyser takes a Toki token and returns one or more Maca
 * (Corpus2) tokens, each of which contains one or more lexemes. An
 * analyser returns tags from a specified tagset.
 */
class MorphAnalyser
{
public:
    /// Constructor for a MorphAnalyser working within a given tagset
    explicit MorphAnalyser(const Corpus2::Tagset* tagset);

    /**
     * Config node constructor. Recognized keys are:
     * - tagset - the tagset name to load and use
     */
    explicit MorphAnalyser(const Config::Node& cfg);

    /// Cloning
    virtual MorphAnalyser* clone() const = 0;

    /// Destructor
    virtual ~MorphAnalyser();

    /// Convenience process_functional wrapper to process a Toki token
    /// and return a vector of Maca tokens.
    std::vector<Corpus2::Token*> process(const Toki::Token& t);

    /// Convenience process_functional wrapper to process a Toki sentence
    /// and return a Maca sentence of analysed tokens
    Corpus2::Sentence::Ptr process(const Toki::Sentence& s);

    /// Convenience process_functional wrapper to process a Toki token
    /// and insert the resulting Maca tokens into the given vector.
    void process(const Toki::Token &t, std::vector<Corpus2::Token*>& vec);

    /// Convenience process_functional wrapper to process a vector of Toki
    /// tokens and return a vector of Maca tokens. The Toki tokens are
    /// deleted.
    std::vector<Corpus2::Token*> process_dispose(
            const std::vector<Toki::Token*>& t);

    /// Convenience process_functional wrapper to process a vector of Toki
    /// tokens and insert the resulting Maca tokens into the given
    /// vector. The Toki tokens are deleted.
    void process_dispose(const std::vector<Toki::Token*>& t,
            std::vector<Corpus2::Token*>& v);

    /// Convenience process_functional wrapper to process a Toki sentence
    /// and return a Maca sentence of analysed tokens. The Toki tokens are
    /// deleted (presumably together with the Toki sentence itself, as in
    /// the overload below -- the implementation is not visible here).
    Corpus2::Sentence::Ptr process_dispose(Toki::Sentence* s);

    /// Convenience process_functional wrapper to process the tokens of a
    /// Toki sentence and insert the resulting Maca tokens into a Maca
    /// sentence. The Toki sentence and tokens are deleted.
    void process_dispose(Toki::Sentence* t, Corpus2::Sentence::Ptr v);

    /**
     * The main token analysis function to be implemented in derived
     * classes. Takes a Toki token and feeds the resulting Maca tokens
     * into the sink function object.
     *
     * The sink function can be called zero or more times, with zero
     * indicating that the token has not been recognized at all, and more
     * than one call meaning that the analyser decided to split the token
     * into smaller parts.
     *
     * Should return true if the sink was called at least once, and false
     * if no tokens were output.
     */
    virtual bool process_functional(const Toki::Token &t,
            boost::function<void (Corpus2::Token*)> sink) = 0;

    /**
     * Convenience function to call process_functional and then dispose of
     * the incoming Toki token.
     *
     * The token is deleted whether process_functional returns normally or
     * throws; in the latter case the exception is rethrown after the
     * token has been released, so the token is never leaked.
     *
     * @param t    token to analyse; it is dereferenced unconditionally,
     *             so it must not be null. Deleted before this function
     *             exits.
     * @param sink function object receiving the resulting Maca tokens
     * @return the value returned by process_functional
     */
    bool process_functional_dispose(const Toki::Token *t,
            boost::function<void (Corpus2::Token*)> sink) {
        bool rv;
        try {
            rv = process_functional(*t, sink);
        } catch (...) {
            // We were handed the token to dispose of, so release it on
            // the failure path as well before propagating the error.
            delete t;
            throw;
        }
        delete t;
        return rv;
    }

    /// Corpus2::Tagset accessor. Dereferences the stored tagset pointer,
    /// so a tagset must have been set before this is called.
    const Corpus2::Tagset& tagset() const {
        return *tagset_;
    }

    /// Corpus2::Tagset setter. Only the pointer is stored; no copy of the
    /// tagset is made.
    void set_tagset(const Corpus2::Tagset* tagset) {
        tagset_ = tagset;
    }

    /**
     * Factory interface for creating analysers from string identifiers
     *
     * Mostly a convenience function to avoid having client code refer
     * directly to the MorphAnalyserFactory instance.
     *
     * NOTE(review): class_id is passed by value; switching it to a const
     * reference would require changing the out-of-line definition in
     * step, so it is left as is.
     *
     * @param class_id the unique class identifier
     * @param props the configuration to pass to the analyser's constructor
     */
    static MorphAnalyser* create(const std::string class_id,
            const Config::Node& props);

    /**
     * Function to get a vector of available analyser type strings.
     */
    static std::vector<std::string> available_analyser_types();

    /**
     * Function to check how many types of analysers are registered.
     */
    static size_t available_analyser_count();

    /**
     * Load a plugin analyser module
     */
    static bool load_plugin(const std::string& name, bool quiet = false);

    /**
     * Convenience template for registering MorphAnalyser derived classes
     * under an explicitly given identifier.
     */
    template <typename T>
    static bool register_analyser(const std::string& class_id);

    /**
     * Convenience template for registering MorphAnalyser derived classes.
     * Assumes the identifier is at T::identifier
     */
    template <typename T>
    static bool register_analyser();

private:
    /// The tagset used by this analyser (stored as a plain pointer that
    /// this header never deletes)
    const Corpus2::Tagset* tagset_;
};
/**
 * Builds a Corpus2 token out of the given Toki token.
 *
 * NOTE(review): the result is a raw pointer; ownership presumably passes
 * to the caller -- confirm against the implementation.
 *
 * @param toki_token the Toki token to convert
 * @return pointer to the resulting Corpus2 token
 */
Corpus2::Token* create_from_toki(const Toki::Token& toki_token);
/**
* Declaration of the MorphAnalyser factory as a singleton Loki object
* factory. The factory instance can be accessed as
* MorphAnalyserFactory::Instance(). It is assumed that all derived classes
* have the same constructor signature, taking a single
* const Config::Node& argument.
*/
typedef Loki::SingletonHolder<
Loki::Factory<
MorphAnalyser, // The base class for objects created in the factory
std::string, // Identifier type
Loki::TL::MakeTypelist< const Config::Node& >::Result
// Types of the arguments passed to the analyser constructors
>,
Loki::CreateUsingNew, // default creation policy, spelled out only so that the lifetime policy below can be given
Loki::LongevityLifetime::DieAsSmallObjectChild // lifetime policy chosen per libloki docs
>
MorphAnalyserFactory;
/**
* Convenience typedef for the exception type the factory throws.
*
* The template arguments mirror the factory's identifier type
* (std::string) and base product type (MorphAnalyser).
* NOTE(review): presumably raised when an unregistered identifier is
* requested -- confirm against the Loki documentation.
*/
typedef Loki::DefaultFactoryError<
std::string, MorphAnalyser
>::Exception
MorphAnalyserFactoryException;
/**
 * Generic creation function for analysers: allocates a T with new,
 * constructing it from the given configuration node. This is the creator
 * that the register_analyser templates hand to the factory.
 *
 * @param props configuration node forwarded to T's constructor
 * @return pointer to the newly allocated T
 */
template <typename T>
T* analyser_creator(const Config::Node& props)
{
    T* const created = new T(props);
    return created;
}
/**
 * Registers analyser_creator<T> in the MorphAnalyserFactory singleton
 * under the given identifier.
 *
 * @param class_id identifier to register the analyser type under
 * @return the result reported by the factory's Register call
 */
template <typename T>
bool MorphAnalyser::register_analyser(const std::string& class_id)
{
    const bool registered = MorphAnalyserFactory::Instance().Register(
            class_id, analyser_creator<T>);
    return registered;
}
/**
 * Registers analyser_creator<T> in the MorphAnalyserFactory singleton,
 * using T::identifier as the identifier.
 *
 * @return the result reported by the factory's Register call
 */
template <typename T>
bool MorphAnalyser::register_analyser()
{
    // Same registration as the explicit-identifier overload, with the
    // identifier taken from the analyser type itself.
    return register_analyser<T>(T::identifier);
}
} /* end ns Maca */
#endif // LIBMACA_MORPHANALYSER_H