Blame view

morfeusz/morfeusz2.h 16.7 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
10
11
12
13
/* 
 * File:   morfeusz2.h
 * Author: mlenart
 *
 * Created on 13 czerwiec 2014, 17:28
 */

#ifndef MORFEUSZ2_H
#define	MORFEUSZ2_H

#include <cstddef>
#include <exception>
#include <list>
#include <set>
#include <string>
#include <vector>
Michał Lenart authored
15
Michał Lenart authored
16
17
18
19
20
/*
 * DLLIMPORT decorates every class declared in this header.
 * Outside Windows it expands to nothing. On Windows it expands to
 * __declspec(dllexport) when BUILDING_MORFEUSZ evaluates to non-zero
 * and to __declspec(dllimport) otherwise.
 *
 * _WIN32 is the macro predefined by Windows compilers; __WIN32 is an alias
 * that only some toolchains (e.g. MinGW) provide, so both are checked.
 */
#if !defined(_WIN32) && !defined(__WIN32)
#define DLLIMPORT
#else
/* A Windows system.  Need to define DLLIMPORT. */
#if BUILDING_MORFEUSZ
#define DLLIMPORT __declspec (dllexport)
#else
#define DLLIMPORT __declspec (dllimport)
#endif
#endif
Michał Lenart authored
26
Michał Lenart authored
27
28
namespace morfeusz {
Michał Lenart authored
29
30
31
    class DLLIMPORT MorphInterpretation;
    class DLLIMPORT Morfeusz;
    class DLLIMPORT ResultsIterator;
Michał Lenart authored
32
    class DLLIMPORT IdResolver;
Michał Lenart authored
33
    class DLLIMPORT MorfeuszException;
Michał Lenart authored
34
35

    /**
     * Character encodings that can be passed to Morfeusz::setCharset.
     * Every enumerator has an explicitly assigned numeric value.
     */
    enum Charset {
        UTF8 = 11,
        //    UTF16LE,
        //    UTF16BE,
        //    UTF32,
        ISO8859_2 = 12,
        CP1250 = 13,
        CP852 = 14
    };
Michał Lenart authored
44
Michał Lenart authored
45
46
47
48
    /**
     * Token numbering policies accepted by Morfeusz::setTokenNumbering.
     */
    enum TokenNumbering {
        /**
         * Start from 0. Reset counter for every invocation of Morfeusz::analyse
         */
        SEPARATE_NUMBERING = 201,

        /**
         * Also start from 0. Reset counter for every invocation of Morfeusz::setTokenNumbering only
         */
        CONTINUOUS_NUMBERING = 202
    };
Michał Lenart authored
56
Michał Lenart authored
57
58
59
60
    /**
     * Case handling policies accepted by Morfeusz::setCaseHandling.
     */
    enum CaseHandling {
        /**
         * Case-sensitive, but allows interpretations that do not match case
         * when there are no alternatives
         */
        CONDITIONALLY_CASE_SENSITIVE = 100,

        /**
         * Strictly case-sensitive, reject all interpretations that do not match case
         */
        STRICTLY_CASE_SENSITIVE = 101,

        /**
         * Case-insensitive - ignores case
         */
        IGNORE_CASE = 102
    };
Michał Lenart authored
73
Michał Lenart authored
74
75
76
77
    /**
     * Whitespace handling policies accepted by Morfeusz::setWhitespaceHandling.
     */
    enum WhitespaceHandling {
        /**
         * Ignore whitespaces
         */
        SKIP_WHITESPACES = 301,

        /**
         * Append whitespaces to previous MorphInterpretation
         */
        APPEND_WHITESPACES = 302,

        /**
         * Whitespaces are separate MorphInterpretation objects
         */
        KEEP_WHITESPACES = 303
    };
Michał Lenart authored
90
91

    /**
     * Usage mode passed to Morfeusz::createInstance.
     * Judging by the enumerator names it selects analysis, generation
     * (synthesis) or both - confirm against the implementation.
     */
    enum MorfeuszUsage {
        ANALYSE_ONLY = 401,
        GENERATE_ONLY = 402,
        BOTH_ANALYSE_AND_GENERATE = 403
    };
Michał Lenart authored
96
97
98
99
100
101
102

    /**
     * Performs morphological analysis (analyze methods) and syntesis (generate methods).
     * 
     * It is NOT thread-safe
     * but it is possible to use separate Morfeusz instance for each concurrent thread.
     */
Michał Lenart authored
103
    class DLLIMPORT Morfeusz {
Michał Lenart authored
104
105
    public:
Michał Lenart authored
106
107
108
109
        /**
         * Returns a string containing library version.
         * @return 
         */
Michał Lenart authored
110
        static std::string getVersion();
Michał Lenart authored
111
112
113
114
115
116

        /**
         * Returns a string containing default dictionary name.
         * @return 
         */
        static std::string getDefaultDictName();
Michał Lenart authored
117
Michał Lenart authored
118
119
120
121
        /**
         * Creates actual instance of Morfeusz class.
         * The caller is responsible for destroying it.
         * 
Michał Lenart authored
122
123
         * @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
         * @return new instance of Morfeusz.
Michał Lenart authored
124
         */
Michał Lenart authored
125
126
127
        static Morfeusz* createInstance(MorfeuszUsage usage);

        /**
Michał Lenart authored
128
129
130
         * Creates exact copy of Morfeusz object.
         * 
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
131
132
         */
        virtual Morfeusz* clone() const = 0;
Michał Lenart authored
133
134
135
136
137

        virtual ~Morfeusz();

        /**
         * Analyze given text and return the results as iterator.
Michał Lenart authored
138
139
         * Use this method for analysis of big texts.
         * Copies the text under the hood - use analyze(const char*) if you want to avoid this.
Michał Lenart authored
140
         * 
Michał Lenart authored
141
         * @param text - text for morphological analysis.
Michał Lenart authored
142
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
143
144
         * @return - iterator over morphological analysis results
         */
Michał Lenart authored
145
        virtual ResultsIterator* analyse(const std::string& text) const = 0;
Michał Lenart authored
146
Michał Lenart authored
147
148
        /**
         * Analyze given text and return the results as iterator.
Michał Lenart authored
149
         * It does not store results for whole text at once, so may be less memory-consuming for analysis of big texts
Michał Lenart authored
150
151
152
         * 
         * 
         * @param text - text for morphological analysis. This pointer must not be deleted before returned ResultsIterator object.
Michał Lenart authored
153
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
154
155
         * @return - iterator over morphological analysis results
         */
Michał Lenart authored
156
        virtual ResultsIterator* analyse(const char* text) const = 0;
Michał Lenart authored
157
158
159
160
161
162

        /**
         * Perform morphological analysis on a given text and put results in a vector.
         * 
         * @param text - text to be analyzed
         * @param result - results vector
Michał Lenart authored
163
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
164
         */
Michał Lenart authored
165
        virtual void analyse(const std::string& text, std::vector<MorphInterpretation>& result) const = 0;
Michał Lenart authored
166
167
168
169
170
171

        /**
         * Perform morphological synthesis on a given lemma and put results in a vector.
         * 
         * @param lemma - lemma to be analyzed
         * @param result - results vector
Michał Lenart authored
172
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
173
         * @throws MorfeuszException - when lemma parameter contains whitespaces.
Michał Lenart authored
174
175
176
177
178
179
180
181
182
183
         */
        virtual void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const = 0;

        /**
         * Perform morphological synthesis on a given lemma and put results in a vector.
         * Limit results to interpretations with the specified tag.
         * 
         * @param lemma - lemma to be analyzed
         * @param tag - tag of result interpretations
         * @param result - results vector
Michał Lenart authored
184
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
185
         * @throws MorfeuszException - when lemma parameter contains whitespaces or tagId is outside tagset.
Michał Lenart authored
186
         */
Michał Lenart authored
187
        virtual void generate(const std::string& lemma, int tagId, std::vector<MorphInterpretation>& result) const = 0;
Michał Lenart authored
188
189
190
191
192

        /**
         * Set encoding for input and output string objects.
         * 
         * @param encoding
Michał Lenart authored
193
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
194
195
196
197
         */
        virtual void setCharset(Charset encoding) = 0;

        /**
Michał Lenart authored
198
         * Select agglutination rules
Michał Lenart authored
199
200
         * 
         * @param aggl
Michał Lenart authored
201
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
202
         * @throws MorfeuszException - for invalid aggl parameter.
Michał Lenart authored
203
204
205
206
         */
        virtual void setAggl(const std::string& aggl) = 0;

        /**
Michał Lenart authored
207
         * Select past tense segmentation
Michał Lenart authored
208
209
         * 
         * @param praet
Michał Lenart authored
210
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
211
         * @throws MorfeuszException - for invalid aggl praet parameter.
Michał Lenart authored
212
213
214
215
         */
        virtual void setPraet(const std::string& praet) = 0;

        /**
Michał Lenart authored
216
         * Set case handling.
Michał Lenart authored
217
218
         * 
         * @param caseSensitive
Michał Lenart authored
219
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
220
         */
Michał Lenart authored
221
        virtual void setCaseHandling(CaseHandling caseHandling) = 0;
Michał Lenart authored
222
Michał Lenart authored
223
224
225
226
        /**
         * Set token numbering policy.
         * 
         * @param numbering
Michał Lenart authored
227
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
228
229
         */
        virtual void setTokenNumbering(TokenNumbering numbering) = 0;
Michał Lenart authored
230
Michał Lenart authored
231
232
233
234
        /**
         * Set whitespace handling.
         * 
         * @param numbering
Michał Lenart authored
235
         * @remarks NOT THREAD-SAFE (must have exclusive access to this instance. Does not affect other Morfeusz instances).
Michał Lenart authored
236
237
         */
        virtual void setWhitespaceHandling(WhitespaceHandling whitespaceHandling) = 0;
Michał Lenart authored
238
239
240
241
242
243
244

        /**
         * Set debug option value.
         * 
         * @param debug
         */
        virtual void setDebug(bool debug) = 0;
Michał Lenart authored
245
Michał Lenart authored
246
        /**
Michał Lenart authored
247
248
249
         * Get reference to tagset currently being in use.
         * 
         * @return currently used tagset
Michał Lenart authored
250
         */
Michał Lenart authored
251
        virtual const IdResolver& getIdResolver() const = 0;
Michał Lenart authored
252
Michał Lenart authored
253
        /**
Michał Lenart authored
254
255
         * Set current dictionary to the one with provided name.
         * 
Michał Lenart authored
256
         * This is NOT THREAD SAFE - no other thread may invoke setDictionary 
Michał Lenart authored
257
258
259
         * either within this instance, or any other in the same application.
         * 
         * @param dictName dictionary name
Michał Lenart authored
260
         * @remarks NOT THREAD-SAFE (affects ALL Morfeusz instances)
Michał Lenart authored
261
262
         * @throws MorfeuszException - when dictionary not found.
         * @throws std::ios_base::failure - when IO error occurred when loading given dictionary.
Michał Lenart authored
263
         */
Michał Lenart authored
264
265
266
267
        virtual void setDictionary(const std::string& dictName) = 0;

        /**
         * List of paths where current Morfeusz instance will look for dictionaries.
Michał Lenart authored
268
         * Modifying it is NOT THREAD-SAFE.
Michał Lenart authored
269
270
271
         */
        static std::list<std::string> dictionarySearchPaths;
Michał Lenart authored
272
        /**
Michał Lenart authored
273
274
         * Get available parameters for "setAggl" method.
         * @return 
Michał Lenart authored
275
         */
Michał Lenart authored
276
277
278
279
280
281
282
        virtual const std::set<std::string>& getAvailableAgglOptions() const = 0;

        /**
         * Get available parameters for "setPraet" method.
         * @return 
         */
        virtual const std::set<std::string>& getAvailablePraetOptions() const = 0;
Michał Lenart authored
283
Michał Lenart authored
284
285
286
287
288
    protected:
        /**
         * Same as analyze(text) but copies the text under the hood.
         * Useful for wrappers to other languages.
         */
Michał Lenart authored
289
        virtual ResultsIterator* analyseWithCopy(const char* text) const = 0;
Michał Lenart authored
290
291
    };
Michał Lenart authored
292
    class DLLIMPORT ResultsIterator {
Michał Lenart authored
293
    public:
Michał Lenart authored
294
295
296
297
        /**
         * 
         * @return true iff this iterator contains more elements.
         */
Michał Lenart authored
298
        virtual bool hasNext() = 0;
Michał Lenart authored
299
300
301
302
303
304

        /**
         * 
         * @return the element, that will be returned in next next() invocation.
         * @throws std::out_of_range when this iterator has already reached the end.
         */
Michał Lenart authored
305
        virtual const MorphInterpretation& peek() = 0;
Michał Lenart authored
306
307
308
309
310
311

        /**
         * 
         * @return next analysis result.
         * @throws std::out_of_range when this iterator has already reached the end.
         */
Michał Lenart authored
312
        virtual MorphInterpretation next() = 0;
Michał Lenart authored
313
Michał Lenart authored
314
315
        virtual ~ResultsIterator() {
        }
Michał Lenart authored
316
    };
Michał Lenart authored
317
318

    /**
Michał Lenart authored
319
     * Represents a tagset
Michał Lenart authored
320
     */
Michał Lenart authored
321
    class DLLIMPORT IdResolver {
Michał Lenart authored
322
    public:
Michał Lenart authored
323
Michał Lenart authored
324
325
326
327
328
        /**
         * Returns tag (denoted by its index).
         * 
         * @param tagNum - tag index in the tagset.
         * @return - the tag
Michał Lenart authored
329
         * @throws std::out_of_range when invalid tagId is provided.
Michał Lenart authored
330
         */
Michał Lenart authored
331
        virtual const std::string& getTag(const int tagId) const = 0;
Michał Lenart authored
332
Michał Lenart authored
333
334
335
336
337
        /**
         * Returns identifier for given tag.
         * Throws MorfeuszException when none exists.
         * 
         * @return identifier for given tag
Michał Lenart authored
338
         * @throws MorfeuszException when invalid tag parameter is provided.
Michał Lenart authored
339
340
         */
        virtual int getTagId(const std::string& tag) const = 0;
Michał Lenart authored
341
342
343
344
345
346

        /**
         * Returns named entity type (denoted by its index).
         * 
         * @param nameNum - name index in the tagset.
         * @return - the named entity type
Michał Lenart authored
347
         * @throws std::out_of_range when invalid nameId is provided.
Michał Lenart authored
348
         */
Michał Lenart authored
349
        virtual const std::string& getName(const int nameId) const = 0;
Michał Lenart authored
350
Michał Lenart authored
351
352
353
354
355
        /**
         * Returns identifier for given named entity.
         * Throws MorfeuszException when none exists.
         * 
         * @return identifier for given named entity
Michał Lenart authored
356
         * @throws MorfeuszException when invalid name parameter is provided.
Michał Lenart authored
357
358
         */
        virtual int getNameId(const std::string& name) const = 0;
Michał Lenart authored
359
Michał Lenart authored
360
361
362
363
364
365
366
        /**
         * Returns labels string for given labelsId.
         * 
         * @param labelsId
         * @return labels as string
         * @throws std::out_of_range when invalid labelsId is provided.
         */
Michał Lenart authored
367
        virtual const std::string& getLabelsAsString(int labelsId) const = 0;
Michał Lenart authored
368
Michał Lenart authored
369
370
371
372
373
374
        /**
         * Returns labels as set of strings for given labelsId.
         * @param labelsId
         * @return labels as set of strings
         * @throws std::out_of_range when invalid labelsId is provided.
         */
Michał Lenart authored
375
        virtual const std::set<std::string>& getLabels(int labelsId) const = 0;
Michał Lenart authored
376
Michał Lenart authored
377
378
379
380
381
382
383
        /**
         * Get labelsId for given labels as string.
         * 
         * @param labelsStr
         * @return labelsId
         * @throws MorfeuszException when invalid tag is provided.
         */
Michał Lenart authored
384
        virtual int getLabelsId(const std::string& labelsStr) const = 0;
Michał Lenart authored
385
Michał Lenart authored
386
        /**
Michał Lenart authored
387
         * Returns number of tags this tagset contains.
Michał Lenart authored
388
389
390
         * 
         * @return 
         */
Michał Lenart authored
391
        virtual size_t getTagsCount() const = 0;
Michał Lenart authored
392
Michał Lenart authored
393
        /**
Michał Lenart authored
394
         * Returns number of named entity types this tagset contains.
Michał Lenart authored
395
396
397
         * 
         * @return 
         */
Michał Lenart authored
398
        virtual size_t getNamesCount() const = 0;
Michał Lenart authored
399
Michał Lenart authored
400
401
402
        /**
         * Returns number of different labels combinations.
         */
Michał Lenart authored
403
        virtual size_t getLabelsCount() const = 0;
Michał Lenart authored
404
Michał Lenart authored
405
        virtual ~IdResolver() {
Michał Lenart authored
406
        }
Michał Lenart authored
407
408
    };
Michał Lenart authored
409
    /**
Michał Lenart authored
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
     The result of analysis is  a directed acyclic graph with numbered
     nodes representing positions  in text (points _between_ segments)
     and edges representing interpretations of segments that span from
     one node to another.  E.g.,

         {0,1,"ja","ja","ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri"}
         |
         |      {1,2,"został","zostać","praet:sg:m1.m2.m3:perf"}
         |      |
       __|  ____|   __{2,3,"em","być","aglt:sg:pri:imperf:wok"}
      /  \ /     \ / \
     * Ja * został*em *
     0    1       2   3

     Note that the word 'zostałem' got broken into 2 separate segments.

     The structure below describes one edge of this DAG:
Michał Lenart authored
428
     */
Michał Lenart authored
429
    struct DLLIMPORT MorphInterpretation {
Michał Lenart authored
430
431
432
        /**
         * Creates new instance with "ign" tag (meaning: "not found in the dictionary")
         */
Michał Lenart authored
433
        static MorphInterpretation createIgn(
Michał Lenart authored
434
435
                int startNode, int endNode,
                const std::string& orth, const std::string& lemma);
Michał Lenart authored
436
Michał Lenart authored
437
438
439
        /**
         * Creates new instance with "sp" tag (meaning: "this is a sequence of whitespaces")
         */
Michał Lenart authored
440
        static MorphInterpretation createWhitespace(int startNode, int endNode, const std::string& orth);
Michał Lenart authored
441
Michał Lenart authored
442
443
444
445
        /**
         * 
         * @return true iff this instance represents an unknown word.
         */
Michał Lenart authored
446
        inline bool isIgn() const {
Michał Lenart authored
447
            return tagId == 0;
Michał Lenart authored
448
        }
Michał Lenart authored
449
Michał Lenart authored
450
451
452
453
        /**
         * 
         * @return true iff this instance represents a whitespace.
         */
Michał Lenart authored
454
        inline bool isWhitespace() const {
Michał Lenart authored
455
            return tagId == 1;
Michał Lenart authored
456
        }
Michał Lenart authored
457
Michał Lenart authored
458
459
460
461
462
463
        /**
         * Get tag as string.
         * 
         * @param morfeusz Morfeusz instance this interpretation was created by.
         * @return 
         */
Michał Lenart authored
464
465
466
467
        inline const std::string& getTag(const Morfeusz& morfeusz) const {
            return morfeusz.getIdResolver().getTag(this->tagId);
        }
Michał Lenart authored
468
469
470
471
472
473
        /**
         * Get name as string.
         * 
         * @param morfeusz Morfeusz instance this interpretation was created by.
         * @return 
         */
Michał Lenart authored
474
475
476
477
        inline const std::string& getName(const Morfeusz& morfeusz) const {
            return morfeusz.getIdResolver().getName(this->nameId);
        }
Michał Lenart authored
478
479
480
481
482
483
        /**
         * Get labels as string.
         * 
         * @param morfeusz Morfeusz instance this interpretation was created by.
         * @return 
         */
Michał Lenart authored
484
485
486
487
        inline const std::string& getLabelsAsString(const Morfeusz& morfeusz) const {
            return morfeusz.getIdResolver().getLabelsAsString(this->labelsId);
        }
Michał Lenart authored
488
489
490
491
492
493
        /**
         * Get tag as set of strings.
         * 
         * @param morfeusz Morfeusz instance this interpretation was created by.
         * @return 
         */
Michał Lenart authored
494
495
496
        inline const std::set<std::string>& getLabels(const Morfeusz& morfeusz) const {
            return morfeusz.getIdResolver().getLabels(this->labelsId);
        }
Michał Lenart authored
497
Michał Lenart authored
498
499
500
501
        int startNode;
        int endNode;
        std::string orth;
        std::string lemma;
Michał Lenart authored
502
503
504
        int tagId;
        int nameId;
        int labelsId;
Michał Lenart authored
505
    };
Michał Lenart authored
506
Michał Lenart authored
507
    /**
     * Exception reported by the Morfeusz API (see the @throws notes in this header).
     * Stores a copy of the message it was constructed with.
     */
    class DLLIMPORT MorfeuszException : public std::exception {
    public:

        /**
         * @param what - message later returned by what()
         */
        MorfeuszException(const std::string& what) : msg(what) {
        }

        virtual ~MorfeuszException() throw () {
        }

        /**
         * @return the stored message as a C string; the pointer stays valid
         *         for as long as this exception object exists.
         */
        virtual const char* what() const throw () {
            return this->msg.c_str();
        }
    private:
        const std::string msg;
    };
Michał Lenart authored
523
    /**
     * MorfeuszException subclass that only forwards its message to the base class.
     * NOTE(review): presumably signals a malformed dictionary file - confirm against the code that throws it.
     */
    class DLLIMPORT FileFormatException : public MorfeuszException {
    public:

        /**
         * @param what - message later returned by what()
         */
        FileFormatException(const std::string& what) : MorfeuszException(what) {
        }
    };
}

#endif	/* MORFEUSZ2_H */