|
1
2
3
4
5
6
7
8
9
10
11
12
13
|
/*
* File: morfeusz2.h
* Author: mlenart
*
* Created on 13 czerwiec 2014, 17:28
*/
#ifndef MORFEUSZ2_H
#define MORFEUSZ2_H
#include <vector>
#include <string>
#include <list>
|
|
14
|
#include <set>
|
|
15
|
|
|
16
17
18
19
20
|
/*
 * DLLIMPORT decorates every class declared in this header.
 *
 * - When __WIN32 is not defined it expands to nothing.
 * - When __WIN32 is defined it expands to __declspec(dllexport) while the
 *   library itself is being compiled (BUILDING_MORFEUSZ evaluates to non-zero)
 *   and to __declspec(dllimport) otherwise, i.e. for client code.
 *
 * NOTE(review): __WIN32 is a MinGW/GCC spelling; MSVC predefines _WIN32 only,
 * so with MSVC this block presumably leaves DLLIMPORT empty - confirm that
 * this is intended before switching the test to _WIN32.
 */
#ifndef __WIN32
#define DLLIMPORT
#else
/* A Windows system. Need to define DLLIMPORT. */
#if BUILDING_MORFEUSZ
#define DLLIMPORT __declspec (dllexport)
#else
#define DLLIMPORT __declspec (dllimport)
#endif
#endif
|
|
26
|
|
|
27
28
|
namespace morfeusz {
|
|
29
30
31
|
class DLLIMPORT MorphInterpretation;
class DLLIMPORT Morfeusz;
class DLLIMPORT ResultsIterator;
|
|
32
|
class DLLIMPORT IdResolver;
|
|
33
|
class DLLIMPORT MorfeuszException;
|
|
34
35
|
/**
 * Character encodings accepted by Morfeusz::setCharset and reported by
 * Morfeusz::getCharset.
 *
 * The numeric values are explicit; keep them stable.
 */
enum Charset {
    UTF8 = 11,
    // Not enabled:
    //    UTF16LE,
    //    UTF16BE,
    //    UTF32,
    ISO8859_2 = 12,
    CP1250 = 13,
    CP852 = 14
};
|
|
44
|
|
|
45
46
47
48
|
/**
 * Token (graph node) numbering policy, see Morfeusz::setTokenNumbering.
 */
enum TokenNumbering {
    /**
     * Start from 0. Reset counter for every invocation of Morfeusz::analyse.
     */
    SEPARATE_NUMBERING = 201,

    /**
     * Also start from 0, but reset the counter only when
     * Morfeusz::setTokenNumbering is invoked.
     */
    CONTINUOUS_NUMBERING = 202
};
|
|
56
|
|
|
57
58
59
60
|
/**
 * Letter-case policy, see Morfeusz::setCaseHandling.
 */
enum CaseHandling {
    /**
     * Case-sensitive, but interpretations that do not match case are still
     * allowed when there are no alternatives.
     */
    CONDITIONALLY_CASE_SENSITIVE = 100,

    /**
     * Strictly case-sensitive, reject all interpretations that do not match case.
     */
    STRICTLY_CASE_SENSITIVE = 101,

    /**
     * Case-insensitive - ignores case.
     */
    IGNORE_CASE = 102
};
|
|
73
|
|
|
74
75
76
77
|
/**
 * Whitespace policy, see Morfeusz::setWhitespaceHandling.
 */
enum WhitespaceHandling {
    /**
     * Ignore whitespaces.
     */
    SKIP_WHITESPACES = 301,

    /**
     * Append whitespaces to the previous MorphInterpretation.
     */
    APPEND_WHITESPACES = 302,

    /**
     * Whitespaces are reported as separate MorphInterpretation objects.
     */
    KEEP_WHITESPACES = 303
};
|
|
90
91
|
/**
 * Usage mode passed to Morfeusz::createInstance
 * (its default there is BOTH_ANALYSE_AND_GENERATE).
 *
 * NOTE(review): judging by the names, the *_ONLY values presumably restrict
 * an instance to the analyse or the generate methods - confirm against the
 * implementation.
 */
enum MorfeuszUsage {
    ANALYSE_ONLY = 401,
    GENERATE_ONLY = 402,
    BOTH_ANALYSE_AND_GENERATE = 403
};
|
|
96
97
98
99
100
101
102
|
/**
* Performs morphological analysis (analyze methods) and syntesis (generate methods).
*
* It is NOT thread-safe
* but it is possible to use separate Morfeusz instance for each concurrent thread.
*/
|
|
103
|
class DLLIMPORT Morfeusz {
|
|
104
105
|
public:
|
|
106
107
108
109
|
/**
* Returns a string containing library version.
* @return
*/
|
|
110
|
static std::string getVersion();
|
|
111
112
113
114
115
116
|
/**
* Returns a string containing default dictionary name.
* @return
*/
static std::string getDefaultDictName();
|
|
117
|
|
|
118
119
120
121
|
/**
* Creates actual instance of Morfeusz class.
* The caller is responsible for destroying it.
*
|
|
122
123
|
* @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
* @return new instance of Morfeusz.
|
|
124
|
*/
|
|
125
126
127
128
129
130
131
132
133
134
|
static Morfeusz* createInstance(MorfeuszUsage usage=BOTH_ANALYSE_AND_GENERATE);
/**
* Creates actual instance of Morfeusz class with possibly non-default dictionary.
* The caller is responsible for destroying it.
*
* @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
* @return new instance of Morfeusz.
*/
static Morfeusz* createInstance(const std::string& dictName, MorfeuszUsage usage=BOTH_ANALYSE_AND_GENERATE);
|
|
135
136
|
/**
|
|
137
138
139
|
* Creates exact copy of Morfeusz object.
*
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
140
141
|
*/
virtual Morfeusz* clone() const = 0;
|
|
142
143
144
145
146
|
virtual ~Morfeusz();
/**
* Analyze given text and return the results as iterator.
|
|
147
148
|
* Use this method for analysis of big texts.
* Copies the text under the hood - use analyze(const char*) if you want to avoid this.
|
|
149
|
*
|
|
150
|
* @param text - text for morphological analysis.
|
|
151
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
152
153
|
* @return - iterator over morphological analysis results
*/
|
|
154
|
virtual ResultsIterator* analyse(const std::string& text) const = 0;
|
|
155
|
|
|
156
157
|
/**
* Analyze given text and return the results as iterator.
|
|
158
|
* It does not store results for whole text at once, so may be less memory-consuming for analysis of big texts
|
|
159
160
161
|
*
*
* @param text - text for morphological analysis. This pointer must not be deleted before returned ResultsIterator object.
|
|
162
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
163
164
|
* @return - iterator over morphological analysis results
*/
|
|
165
|
virtual ResultsIterator* analyse(const char* text) const = 0;
|
|
166
167
168
169
170
171
|
/**
* Perform morphological analysis on a given text and put results in a vector.
*
* @param text - text to be analyzed
* @param result - results vector
|
|
172
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
173
|
*/
|
|
174
|
virtual void analyse(const std::string& text, std::vector<MorphInterpretation>& result) const = 0;
|
|
175
176
177
178
179
180
|
/**
* Perform morphological synthesis on a given lemma and put results in a vector.
*
* @param lemma - lemma to be analyzed
* @param result - results vector
|
|
181
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
182
|
* @throws MorfeuszException - when lemma parameter contains whitespaces.
|
|
183
184
185
186
187
188
189
190
191
192
|
*/
virtual void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const = 0;
/**
* Perform morphological synthesis on a given lemma and put results in a vector.
* Limit results to interpretations with the specified tag.
*
* @param lemma - lemma to be analyzed
* @param tag - tag of result interpretations
* @param result - results vector
|
|
193
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
194
|
* @throws MorfeuszException - when lemma parameter contains whitespaces or tagId is outside tagset.
|
|
195
|
*/
|
|
196
|
virtual void generate(const std::string& lemma, int tagId, std::vector<MorphInterpretation>& result) const = 0;
|
|
197
198
199
200
201
|
/**
* Set encoding for input and output string objects.
*
* @param encoding
|
|
202
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
203
204
|
*/
virtual void setCharset(Charset encoding) = 0;
|
|
205
206
207
208
209
210
|
/**
* Get charset used for input and output string objects.
* @return
*/
virtual Charset getCharset() const = 0;
|
|
211
212
|
/**
|
|
213
|
* Select agglutination rules
|
|
214
215
|
*
* @param aggl
|
|
216
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
217
|
* @throws MorfeuszException - for invalid aggl parameter.
|
|
218
219
|
*/
virtual void setAggl(const std::string& aggl) = 0;
|
|
220
221
222
223
224
225
|
/**
* Get current agglutination rules option
* @return
*/
virtual std::string getAggl() const = 0;
|
|
226
227
|
/**
|
|
228
|
* Select past tense segmentation
|
|
229
230
|
*
* @param praet
|
|
231
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
232
|
* @throws MorfeuszException - for invalid aggl praet parameter.
|
|
233
234
|
*/
virtual void setPraet(const std::string& praet) = 0;
|
|
235
236
237
238
239
240
|
/**
* Get current past tense segmentation option
* @return
*/
virtual std::string getPraet() const = 0;
|
|
241
242
|
/**
|
|
243
|
* Set case handling.
|
|
244
245
|
*
* @param caseSensitive
|
|
246
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
247
|
*/
|
|
248
|
virtual void setCaseHandling(CaseHandling caseHandling) = 0;
|
|
249
250
251
252
253
254
|
/**
* Get case handling policy.
* @return
*/
virtual CaseHandling getCaseHandling() const = 0;
|
|
255
|
|
|
256
257
258
259
|
/**
* Set token numbering policy.
*
* @param numbering
|
|
260
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
261
262
|
*/
virtual void setTokenNumbering(TokenNumbering numbering) = 0;
|
|
263
264
265
266
267
268
|
/**
* Get token numbering policy.
* @return
*/
virtual TokenNumbering getTokenNumbering() const = 0;
|
|
269
|
|
|
270
271
272
273
|
/**
* Set whitespace handling.
*
* @param numbering
|
|
274
|
* @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
|
|
275
276
|
*/
virtual void setWhitespaceHandling(WhitespaceHandling whitespaceHandling) = 0;
|
|
277
278
279
280
281
282
|
/**
* Get whitespace handling.
* @return
*/
virtual WhitespaceHandling getWhitespaceHandling() const = 0;
|
|
283
284
285
286
287
288
289
|
/**
* Set debug option value.
*
* @param debug
*/
virtual void setDebug(bool debug) = 0;
|
|
290
|
|
|
291
|
/**
|
|
292
293
294
|
* Get reference to tagset currently being in use.
*
* @return currently used tagset
|
|
295
|
*/
|
|
296
|
virtual const IdResolver& getIdResolver() const = 0;
|
|
297
|
|
|
298
|
/**
|
|
299
300
|
* Set current dictionary to the one with provided name.
*
|
|
301
|
* This is NOT THREAD SAFE - no other thread may invoke setDictionary
|
|
302
303
304
|
* either within this instance, or any other in the same application.
*
* @param dictName dictionary name
|
|
305
|
* @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
|
|
306
307
|
* @throws MorfeuszException - when dictionary not found.
* @throws std::ios_base::failure - when IO error occurred when loading given dictionary.
|
|
308
|
*/
|
|
309
310
311
312
|
virtual void setDictionary(const std::string& dictName) = 0;
/**
* List of paths where current Morfeusz instance will look for dictionaries.
|
|
313
|
* Modifying it is NOT THREAD-SAFE.
|
|
314
|
*/
|
|
315
|
static std::list<std::string> dictionarySearchPaths;
|
|
316
|
|
|
317
|
/**
|
|
318
319
|
* Get available parameters for "setAggl" method.
* @return
|
|
320
|
*/
|
|
321
322
323
324
325
326
327
|
virtual const std::set<std::string>& getAvailableAgglOptions() const = 0;
/**
* Get available parameters for "setPraet" method.
* @return
*/
virtual const std::set<std::string>& getAvailablePraetOptions() const = 0;
|
|
328
|
|
|
329
330
331
332
333
|
protected:
/**
* Same as analyze(text) but copies the text under the hood.
* Useful for wrappers to other languages.
*/
|
|
334
|
virtual ResultsIterator* analyseWithCopy(const char* text) const = 0;
|
|
335
336
|
};
|
|
337
|
class DLLIMPORT ResultsIterator {
|
|
338
|
public:
|
|
339
340
341
342
|
/**
*
* @return true iff this iterator contains more elements.
*/
|
|
343
|
virtual bool hasNext() = 0;
|
|
344
345
346
347
348
349
|
/**
*
* @return the element, that will be returned in next next() invocation.
* @throws std::out_of_range when this iterator has already reached the end.
*/
|
|
350
|
virtual const MorphInterpretation& peek() = 0;
|
|
351
352
353
354
355
356
|
/**
*
* @return next analysis result.
* @throws std::out_of_range when this iterator has already reached the end.
*/
|
|
357
|
virtual MorphInterpretation next() = 0;
|
|
358
|
|
|
359
360
|
virtual ~ResultsIterator() {
}
|
|
361
|
};
|
|
362
363
|
/**
|
|
364
|
* Represents mappings for tags, names and labels.
|
|
365
|
*/
|
|
366
|
class DLLIMPORT IdResolver {
|
|
367
|
public:
|
|
368
369
370
371
372
373
374
|
/**
* Returns current TAGSET-ID (as specified in first line of tagset file)
*
* @return tagset id string
*/
virtual const std::string getTagsetId() const = 0;
|
|
375
|
|
|
376
377
378
379
380
|
/**
* Returns tag (denoted by its index).
*
* @param tagNum - tag index in the tagset.
* @return - the tag
|
|
381
|
* @throws std::out_of_range when invalid tagId is provided.
|
|
382
|
*/
|
|
383
|
virtual const std::string& getTag(const int tagId) const = 0;
|
|
384
|
|
|
385
386
387
388
389
|
/**
* Returns identifier for given tag.
* Throws MorfeuszException when none exists.
*
* @return identifier for given tag
|
|
390
|
* @throws MorfeuszException when invalid tag parameter is provided.
|
|
391
392
|
*/
virtual int getTagId(const std::string& tag) const = 0;
|
|
393
394
395
396
397
398
|
/**
* Returns named entity type (denoted by its index).
*
* @param nameNum - name index in the tagset.
* @return - the named entity type
|
|
399
|
* @throws std::out_of_range when invalid nameId is provided.
|
|
400
|
*/
|
|
401
|
virtual const std::string& getName(const int nameId) const = 0;
|
|
402
|
|
|
403
404
405
406
407
|
/**
* Returns identifier for given named entity.
* Throws MorfeuszException when none exists.
*
* @return identifier for given named entity
|
|
408
|
* @throws MorfeuszException when invalid name parameter is provided.
|
|
409
410
|
*/
virtual int getNameId(const std::string& name) const = 0;
|
|
411
|
|
|
412
413
414
415
416
417
418
|
/**
* Returns labels string for given labelsId.
*
* @param labelsId
* @return labels as string
* @throws std::out_of_range when invalid labelsId is provided.
*/
|
|
419
|
virtual const std::string& getLabelsAsString(int labelsId) const = 0;
|
|
420
|
|
|
421
422
423
424
425
426
|
/**
* Returns labels as set of strings for given labelsId.
* @param labelsId
* @return labels as set of strings
* @throws std::out_of_range when invalid labelsId is provided.
*/
|
|
427
|
virtual const std::set<std::string>& getLabels(int labelsId) const = 0;
|
|
428
|
|
|
429
430
431
432
433
434
435
|
/**
* Get labelsId for given labels as string.
*
* @param labelsStr
* @return labelsId
* @throws MorfeuszException when invalid tag is provided.
*/
|
|
436
|
virtual int getLabelsId(const std::string& labelsStr) const = 0;
|
|
437
|
|
|
438
|
/**
|
|
439
|
* Returns number of tags this tagset contains.
|
|
440
441
442
|
*
* @return
*/
|
|
443
|
virtual size_t getTagsCount() const = 0;
|
|
444
|
|
|
445
|
/**
|
|
446
|
* Returns number of named entity types this tagset contains.
|
|
447
448
449
|
*
* @return
*/
|
|
450
|
virtual size_t getNamesCount() const = 0;
|
|
451
|
|
|
452
453
454
|
/**
* Returns number of different labels combinations.
*/
|
|
455
|
virtual size_t getLabelsCount() const = 0;
|
|
456
|
|
|
457
|
virtual ~IdResolver() {
|
|
458
|
}
|
|
459
460
|
};
|
|
461
|
/**
|
|
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
|
The result of analysis is a directed acyclic graph with numbered
nodes representing positions in text (points _between_ segments)
and edges representing interpretations of segments that span from
one node to another. E.g.,
{0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
|
| {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
| |
__| ____| __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
/ \ / \ / \
* Ja * został*em *
0 1 2 3
Note that the word 'zostałem' got broken into 2 separate segments.
|
|
477
|
* One MorphInterpretation instance describes one edge of this DAG.
|
|
478
|
*/
|
|
479
|
struct DLLIMPORT MorphInterpretation {
|
|
480
481
482
483
|
MorphInterpretation()
: startNode(0), endNode(0), orth(), lemma(), tagId(0), nameId(0), labelsId(0) {}
|
|
484
485
486
|
/**
* Creates new instance with "ign" tag (meaning: "not found in the dictionary")
*/
|
|
487
|
static MorphInterpretation createIgn(
|
|
488
489
|
int startNode, int endNode,
const std::string& orth, const std::string& lemma);
|
|
490
|
|
|
491
492
493
|
/**
* Creates new instance with "sp" tag (meaning: "this is a sequence of whitespaces")
*/
|
|
494
|
static MorphInterpretation createWhitespace(int startNode, int endNode, const std::string& orth);
|
|
495
|
|
|
496
497
498
499
|
/**
*
* @return true iff this instance represents an unknown word.
*/
|
|
500
|
inline bool isIgn() const {
|
|
501
|
return tagId == 0;
|
|
502
|
}
|
|
503
|
|
|
504
505
506
507
|
/**
*
* @return true iff this instance represents a whitespace.
*/
|
|
508
|
inline bool isWhitespace() const {
|
|
509
|
return tagId == 1;
|
|
510
|
}
|
|
511
|
|
|
512
513
514
515
516
517
|
/**
* Get tag as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
518
519
520
521
|
inline const std::string& getTag(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getTag(this->tagId);
}
|
|
522
523
524
525
526
527
|
/**
* Get name as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
528
529
530
531
|
inline const std::string& getName(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getName(this->nameId);
}
|
|
532
533
534
535
536
537
|
/**
* Get labels as string.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
538
539
540
541
|
inline const std::string& getLabelsAsString(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getLabelsAsString(this->labelsId);
}
|
|
542
543
544
545
546
547
|
/**
* Get tag as set of strings.
*
* @param morfeusz Morfeusz instance this interpretation was created by.
* @return
*/
|
|
548
549
550
|
inline const std::set<std::string>& getLabels(const Morfeusz& morfeusz) const {
return morfeusz.getIdResolver().getLabels(this->labelsId);
}
|
|
551
|
|
|
552
553
554
555
|
int startNode;
int endNode;
std::string orth;
std::string lemma;
|
|
556
557
558
|
int tagId;
int nameId;
int labelsId;
|
|
559
|
};
|
|
560
|
|
|
561
|
/**
 * Exception type named in the @throws clauses of this API.
 * Stores its own copy of the message and exposes it through what().
 */
class DLLIMPORT MorfeuszException : public std::exception {
public:

    /**
     * @param what - message later returned by what().
     */
    MorfeuszException(const std::string& what) : msg(what) {
        // Copy the string directly; going through c_str() only adds an extra
        // length scan and what() yields the same text either way.
    }

    virtual ~MorfeuszException() throw () {
    }

    /**
     * @return the message given at construction; the pointer refers to
     *         storage held by this exception object.
     */
    virtual const char* what() const throw () {
        return this->msg.c_str();
    }

private:
    const std::string msg;
};
|
|
577
|
/**
 * Specialisation of MorfeuszException; adds no state or behaviour of its own.
 *
 * NOTE(review): judging by the name, presumably thrown for malformed
 * dictionary/data files - confirm against the throwing code.
 */
class DLLIMPORT FileFormatException : public MorfeuszException {
public:

    /**
     * @param what - message forwarded to MorfeuszException.
     */
    FileFormatException(const std::string& what) : MorfeuszException(what) {
    }
};
}
#endif /* MORFEUSZ2_H */
|