/*
 * File:   morfeusz2.h
 * Author: mlenart
 *
 * Created on 13 June 2014, 17:28
 */
#ifndef MORFEUSZ2_H
#define MORFEUSZ2_H
#include <cstddef>
#include <exception>
#include <list>
#include <set>
#include <string>
#include <vector>

/*
 * DLLIMPORT decorates every class declared in this header.
 * It expands to nothing unless __WIN32 is defined. On Windows it selects
 * dllexport when BUILDING_MORFEUSZ evaluates to non-zero (the library
 * itself is being compiled) and dllimport otherwise (client code).
 *
 * NOTE(review): only __WIN32 is tested. MinGW predefines it, while MSVC
 * predefines _WIN32 instead - confirm that MSVC clients are out of scope.
 */
#ifndef __WIN32
#define DLLIMPORT
#else
/* A Windows system.  Need to define DLLIMPORT. */
#if BUILDING_MORFEUSZ
#define DLLIMPORT __declspec (dllexport)
#else
#define DLLIMPORT __declspec (dllimport)
#endif
#endif

namespace morfeusz {

class DLLIMPORT MorphInterpretation;
class DLLIMPORT Morfeusz;
class DLLIMPORT ResultsIterator;
|
|
32
|
class DLLIMPORT IdResolver;
|
|
33
|
class DLLIMPORT MorfeuszException;
|
|
34
35
|
/**
 * Encodings accepted by Morfeusz::setCharset for input and output strings.
 * The numeric values are explicit and must stay as they are.
 */
enum Charset {
    UTF8 = 11,
    //    UTF16LE,
    //    UTF16BE,
    //    UTF32,
    ISO8859_2 = 12,
    CP1250 = 13,
    CP852 = 14
};

/**
 * Token numbering policy, selected with Morfeusz::setTokenNumbering.
 */
enum TokenNumbering {
    /**
     * Start from 0. Reset counter for every invocation of Morfeusz::analyse
     */
    SEPARATE_NUMBERING = 201,

    /**
     * Also start from 0. Reset counter for every invocation of Morfeusz::setTokenNumbering only
     */
    CONTINUOUS_NUMBERING = 202
};

/**
 * Case handling policy, selected with Morfeusz::setCaseHandling.
 */
enum CaseHandling {
    /**
     * Case-sensitive, but allows interpretations that do not match case
     * when there are no alternatives
     */
    CONDITIONALLY_CASE_SENSITIVE = 100,

    /**
     * Strictly case-sensitive, reject all interpretations that do not match case
     */
    STRICTLY_CASE_SENSITIVE = 101,

    /**
     * Case-insensitive - ignores case
     */
    IGNORE_CASE = 102
};

/**
 * Whitespace handling policy, selected with Morfeusz::setWhitespaceHandling.
 */
enum WhitespaceHandling {
    /**
     * Ignore whitespaces
     */
    SKIP_WHITESPACES = 301,

    /**
     * Append whitespaces to previous MorphInterpretation
     */
    APPEND_WHITESPACES = 302,

    /**
     * Whitespaces are separate MorphInterpretation objects
     */
    KEEP_WHITESPACES = 303
};

/**
 * Mode of operation passed to Morfeusz::createInstance:
 * analysis only, generation only, or both.
 */
enum MorfeuszUsage {
    ANALYSE_ONLY = 401,
    GENERATE_ONLY = 402,
    BOTH_ANALYSE_AND_GENERATE = 403
};

/**
* Performs morphological analysis (analyze methods) and syntesis (generate methods).
*
* It is NOT thread-safe
* but it is possible to use separate Morfeusz instance for each concurrent thread.
*/
|
|
103
|
class DLLIMPORT Morfeusz {
|
|
104
105
|
public:
|
|
106
107
108
109
|
/**
* Returns a string containing library version.
* @return
*/
|
|
110
|
static std::string getVersion();
|
|
111
112
113
114
115
116
|
/**
* Returns a string containing default dictionary name.
* @return
*/
static std::string getDefaultDictName();
|
|
117
|
|
|
118
119
120
121
|
/**
* Creates actual instance of Morfeusz class.
* The caller is responsible for destroying it.
*
|
|
122
123
|
* @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
* @return new instance of Morfeusz.
|
|
124
|
*/
|
|
125
126
127
|
static Morfeusz* createInstance(MorfeuszUsage usage);
/**
|
|
128
129
130
|
* Creates exact copy of Morfeusz object.
*
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
131
132
|
*/
virtual Morfeusz* clone() const = 0;
|
|
133
134
135
136
137
|
virtual ~Morfeusz();
/**
* Analyze given text and return the results as iterator.
|
|
138
139
|
* Use this method for analysis of big texts.
* Copies the text under the hood - use analyze(const char*) if you want to avoid this.
|
|
140
|
*
|
|
141
|
* @param text - text for morphological analysis.
|
|
142
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
143
144
|
* @return - iterator over morphological analysis results
*/
|
|
145
|
virtual ResultsIterator* analyse(const std::string& text) const = 0;
|
|
146
|
|
|
147
148
|
/**
* Analyze given text and return the results as iterator.
|
|
149
|
* It does not store results for whole text at once, so may be less memory-consuming for analysis of big texts
|
|
150
151
152
|
*
*
* @param text - text for morphological analysis. This pointer must not be deleted before returned ResultsIterator object.
|
|
153
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
154
155
|
* @return - iterator over morphological analysis results
*/
|
|
156
|
virtual ResultsIterator* analyse(const char* text) const = 0;
|
|
157
158
159
160
161
162
|
/**
* Perform morphological analysis on a given text and put results in a vector.
*
* @param text - text to be analyzed
* @param result - results vector
|
|
163
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
164
|
*/
|
|
165
|
virtual void analyse(const std::string& text, std::vector<MorphInterpretation>& result) const = 0;
|
|
166
167
168
169
170
171
|
/**
* Perform morphological synthesis on a given lemma and put results in a vector.
*
* @param lemma - lemma to be analyzed
* @param result - results vector
|
|
172
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
173
|
* @throws MorfeuszException - when lemma parameter contains whitespaces.
|
|
174
175
176
177
178
179
180
181
182
183
|
*/
virtual void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const = 0;
/**
* Perform morphological synthesis on a given lemma and put results in a vector.
* Limit results to interpretations with the specified tag.
*
* @param lemma - lemma to be analyzed
* @param tag - tag of result interpretations
* @param result - results vector
|
|
184
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
185
|
* @throws MorfeuszException - when lemma parameter contains whitespaces or tagId is outside tagset.
|
|
186
|
*/
|
|
187
|
virtual void generate(const std::string& lemma, int tagId, std::vector<MorphInterpretation>& result) const = 0;
|
|
188
189
190
191
192
|
/**
* Set encoding for input and output string objects.
*
* @param encoding
|
|
193
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
194
195
196
197
|
*/
virtual void setCharset(Charset encoding) = 0;
/**
|
|
198
|
* Select agglutination rules
|
|
199
200
|
*
* @param aggl
|
|
201
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
202
|
* @throws MorfeuszException - for invalid aggl parameter.
|
|
203
204
205
206
|
*/
virtual void setAggl(const std::string& aggl) = 0;
/**
|
|
207
|
* Select past tense segmentation
|
|
208
209
|
*
* @param praet
|
|
210
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
211
|
* @throws MorfeuszException - for invalid aggl praet parameter.
|
|
212
213
214
215
|
*/
virtual void setPraet(const std::string& praet) = 0;
/**
|
|
216
|
* Set case handling.
|
|
217
218
|
*
* @param caseSensitive
|
|
219
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
220
|
*/
|
|
221
|
virtual void setCaseHandling(CaseHandling caseHandling) = 0;
|
|
222
|
|
|
223
224
225
226
|
/**
* Set token numbering policy.
*
* @param numbering
|
|
227
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
228
229
|
*/
virtual void setTokenNumbering(TokenNumbering numbering) = 0;
|
|
230
|
|
|
231
232
233
234
|
/**
* Set whitespace handling.
*
* @param numbering
|
|
235
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
236
237
|
*/
virtual void setWhitespaceHandling(WhitespaceHandling whitespaceHandling) = 0;
|
|
238
239
240
241
242
243
244
|
/**
* Set debug option value.
*
* @param debug
*/
virtual void setDebug(bool debug) = 0;
|
|
245
|
|
|
246
|
/**
|
|
247
248
249
|
* Get reference to tagset currently being in use.
*
* @return currently used tagset
|
|
250
|
*/
|
|
251
|
virtual const IdResolver& getIdResolver() const = 0;
|
|
252
|
|
|
253
|
/**
|
|
254
255
|
* Set current dictionary to the one with provided name.
*
|
|
256
|
* This is NOT THREAD SAFE - no other thread may invoke setDictionary
|
|
257
258
259
|
* either within this instance, or any other in the same application.
*
* @param dictName dictionary name
|
|
260
|
* @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
|
|
261
262
|
* @throws MorfeuszException - when dictionary not found.
* @throws std::ios_base::failure - when IO error occurred when loading given dictionary.
|
|
263
|
*/
|
|
264
265
266
267
|
virtual void setDictionary(const std::string& dictName) = 0;
/**
* List of paths where current Morfeusz instance will look for dictionaries.
|
|
268
|
* Modifying it is NOT THREAD-SAFE.
|
|
269
270
271
|
*/
static std::list<std::string> dictionarySearchPaths;
|
|
272
|
/**
|
|
273
274
|
* Get available parameters for "setAggl" method.
* @return
|
|
275
|
*/
|
|
276
277
278
279
280
281
282
|
virtual const std::set<std::string>& getAvailableAgglOptions() const = 0;
/**
* Get available parameters for "setPraet" method.
* @return
*/
virtual const std::set<std::string>& getAvailablePraetOptions() const = 0;
|
|
283
|
|
|
284
285
286
287
288
|
protected:
/**
* Same as analyze(text) but copies the text under the hood.
* Useful for wrappers to other languages.
*/
|
|
289
|
virtual ResultsIterator* analyseWithCopy(const char* text) const = 0;
|
|
290
291
|
};
|
|
292
|
class DLLIMPORT ResultsIterator {
|
|
293
|
public:
|
|
294
295
296
297
|
/**
*
* @return true iff this iterator contains more elements.
*/
|
|
298
|
virtual bool hasNext() = 0;
|
|
299
300
301
302
303
304
|
/**
*
* @return the element, that will be returned in next next() invocation.
* @throws std::out_of_range when this iterator has already reached the end.
*/
|
|
305
|
virtual const MorphInterpretation& peek() = 0;
|
|
306
307
308
309
310
311
|
/**
*
* @return next analysis result.
* @throws std::out_of_range when this iterator has already reached the end.
*/
|
|
312
|
virtual MorphInterpretation next() = 0;
|
|
313
|
|
|
314
315
|
virtual ~ResultsIterator() {
}
|
|
316
|
};
|
|
317
318
|
/**
|
|
319
|
* Represents a tagset
|
|
320
|
*/
|
|
321
|
class DLLIMPORT IdResolver {
|
|
322
|
public:
|
|
323
|
|
|
324
325
326
327
328
|
/**
* Returns tag (denoted by its index).
*
* @param tagNum - tag index in the tagset.
* @return - the tag
|
|
329
|
* @throws std::out_of_range when invalid tagId is provided.
|
|
330
|
*/
|
|
331
|
virtual const std::string& getTag(const int tagId) const = 0;
|
|
332
|
|
|
333
334
335
336
337
|
/**
* Returns identifier for given tag.
* Throws MorfeuszException when none exists.
*
* @return identifier for given tag
|
|
338
|
* @throws MorfeuszException when invalid tag parameter is provided.
|
|
339
340
|
*/
virtual int getTagId(const std::string& tag) const = 0;
|
|
341
342
343
344
345
346
|
/**
* Returns named entity type (denoted by its index).
*
* @param nameNum - name index in the tagset.
* @return - the named entity type
|
|
347
|
* @throws std::out_of_range when invalid nameId is provided.
|
|
348
|
*/
|
|
349
|
virtual const std::string& getName(const int nameId) const = 0;
|
|
350
|
|
|
351
352
353
354
355
|
/**
* Returns identifier for given named entity.
* Throws MorfeuszException when none exists.
*
* @return identifier for given named entity
|
|
356
|
* @throws MorfeuszException when invalid name parameter is provided.
|
|
357
358
|
*/
virtual int getNameId(const std::string& name) const = 0;
|
|
359
|
|
|
360
361
362
363
364
365
366
|
/**
* Returns labels string for given labelsId.
*
* @param labelsId
* @return labels as string
* @throws std::out_of_range when invalid labelsId is provided.
*/
|
|
367
|
virtual const std::string& getLabelsAsString(int labelsId) const = 0;
|
|
368
|
|
|
369
370
371
372
373
374
|
/**
* Returns labels as set of strings for given labelsId.
* @param labelsId
* @return labels as set of strings
* @throws std::out_of_range when invalid labelsId is provided.
*/
|
|
375
|
virtual const std::set<std::string>& getLabels(int labelsId) const = 0;
|
|
376
|
|
|
377
378
379
380
381
382
383
|
/**
* Get labelsId for given labels as string.
*
* @param labelsStr
* @return labelsId
* @throws MorfeuszException when invalid tag is provided.
*/
|
|
384
|
virtual int getLabelsId(const std::string& labelsStr) const = 0;
|
|
385
|
|
|
386
|
/**
|
|
387
|
* Returns number of tags this tagset contains.
|
|
388
389
390
|
*
* @return
*/
|
|
391
|
virtual size_t getTagsCount() const = 0;
|
|
392
|
|
|
393
|
/**
|
|
394
|
* Returns number of named entity types this tagset contains.
|
|
395
396
397
|
*
* @return
*/
|
|
398
|
virtual size_t getNamesCount() const = 0;
|
|
399
|
|
|
400
401
402
|
/**
* Returns number of different labels combinations.
*/
|
|
403
|
virtual size_t getLabelsCount() const = 0;
|
|
404
|
|
|
405
|
virtual ~IdResolver() {
|
|
406
|
}
|
|
407
408
|
};
|
|
409
|
/**
|
|
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
|
The result of analysis is a directed acyclic graph with numbered
nodes representing positions in text (points _between_ segments)
and edges representing interpretations of segments that span from
one node to another. E.g.,
{0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
|
| {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
| |
__| ____| __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
/ \ / \ / \
* Ja * został*em *
0 1 2 3
Note that the word 'zostałem' got broken into 2 separate segments.
The structure below describes one edge of this DAG:
|
|
428
|
*/
|
|
429
|
struct DLLIMPORT MorphInterpretation {
|
|
430
431
432
|
/**
* Creates new instance with "ign" tag (meaning: "not found in the dictionary")
*/
|
|
433
|
static MorphInterpretation createIgn(
|
|
434
435
|
int startNode, int endNode,
const std::string& orth, const std::string& lemma);
|
|
436
|
|
|
437
438
439
|
/**
* Creates new instance with "sp" tag (meaning: "this is a sequence of whitespaces")
*/
|
|
440
|
static MorphInterpretation createWhitespace(int startNode, int endNode, const std::string& orth);
|
|
441
|
|
|
442
443
444
445
|
/**
*
* @return true iff this instance represents an unknown word.
*/
|
|
446
|
inline bool isIgn() const {
|
|
447
|
return tagId == 0;
|
|
448
|
}
|
|
449
|
|
|
450
451
452
453
|
/**
*
* @return true iff this instance represents a whitespace.
*/
|
|
454
|
inline bool isWhitespace() const {
|
|
455
|
return tagId == 1;
|
|
456
|
}
|
|
457
|
|
|
458
459
460
461
462
463
|
/**
* Get tag as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
464
465
466
467
|
inline const std::string& getTag(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getTag(this->tagId);
}
|
|
468
469
470
471
472
473
|
/**
* Get name as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
474
475
476
477
|
inline const std::string& getName(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getName(this->nameId);
}
|
|
478
479
480
481
482
483
|
/**
* Get labels as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
484
485
486
487
|
inline const std::string& getLabelsAsString(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getLabelsAsString(this->labelsId);
}
|
|
488
489
490
491
492
493
|
/**
* Get tag as set of strings.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
494
495
496
|
inline const std::set<std::string>& getLabels(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getLabels(this->labelsId);
}
|
|
497
|
|
|
498
499
500
501
|
int startNode;
int endNode;
std::string orth;
std::string lemma;
|
|
502
503
504
|
int tagId;
int nameId;
int labelsId;
|
|
505
|
};
|
|
506
|
|
|
507
|
/**
 * Exception type named by the @throws clauses of this header.
 * Derives from std::exception; what() yields the stored message.
 */
class DLLIMPORT MorfeuszException : public std::exception {
public:

    /**
     * @param what - message later returned by what()
     */
    MorfeuszException(const std::string& what) : msg(what) {
    }

    virtual ~MorfeuszException() throw () {
    }

    /**
     * @return the message given to the constructor, as a C string
     */
    virtual const char* what() const throw () {
        return this->msg.c_str();
    }

private:
    // Copy-constructed straight from the constructor argument; a detour
    // through c_str() would only add a needless length scan.
    const std::string msg;
};

/**
 * MorfeuszException subtype.
 * NOTE(review): judging by the name it reports malformed input files;
 * no use is visible in this header - confirm where it is thrown.
 */
class DLLIMPORT FileFormatException : public MorfeuszException {
public:

    /**
     * @param what - message forwarded to the MorfeuszException constructor
     */
    FileFormatException(const std::string& what) : MorfeuszException(what) {
    }
};
}
#endif /* MORFEUSZ2_H */