// sfstanalyser.h
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#ifndef LIBMACA_SFSTMORPHANALYSER_H
#define LIBMACA_SFSTMORPHANALYSER_H
#include <libmaca/morph/morphanalyser.h>
#include <libmaca/typedefs.h>
#include <boost/thread/mutex.hpp>
/// Forward declaration of an SFST type. Within this header the type is only
/// used as the pointee of a boost::shared_ptr member, so no SFST header is
/// included here.
class CompactTransducer;
namespace Maca {
/**
* A morphological analyser using a SFST compact transducer.
*
* The transducer file should be prepared by the fst-compiler-utf8 program
* from the SFST suite, with the -c option to create a compact transducer.
*
* The expected behaviour of the transducer is to return, when given an
* orth it recognizes, one or more lemma-tag strings. A lemma-tag string
* should contain the lemma, followed by a less-than sign (<), followed
* by the tag-string, followed by a greater-than sign (>). The tag string
* is composed of one or more tag identifiers in the colon-dot format;
* tag identifiers should be separated by a pipe character (|). An example
* lemma-tag string is "somelemma<pos1:foo:bar|pos2:foo:bar.baz>".
*
* Input for the fst-compiler tool can be prepared from an orth-lemma-tag
* tab-separated dictionary by the tab-to-sfst tool which should be bundled
* with the library.
*
* The class privately inherits boost::noncopyable, so instances cannot be
* copied or assigned; use clone() to obtain another analyser instead.
*
* Configuration class key: \b sfst
*/
class SfstAnalyser : public MorphAnalyser, private boost::noncopyable
{
public:
/**
* Constructor for a SFST analyser working with a tagset and using
* a transducer loaded from the given file.
*
* @param tagset   the tagset the analyser works with
* @param filename path of the transducer file to load
*/
SfstAnalyser(const Corpus2::Tagset* tagset, const std::string& filename);
/**
* Config node constructor. Recognized keys are:
* - file - the transducer file
* - lower-case - lowercase all input to the transducer
*
* @param cfg configuration node holding the keys listed above
*/
SfstAnalyser(const Config::Node& cfg);
/// Destructor
~SfstAnalyser();
/// Cloning. Returns a pointer to a new analyser; per the member comments
/// below, the transducer and its mutex may be shared with the clone.
/// NOTE(review): ownership of the returned pointer is not stated in this
/// header -- presumably the caller owns it; confirm against MorphAnalyser.
SfstAnalyser* clone() const;
/**
* MorphAnalyser override.
*
* @param t    the Toki token to process
* @param sink callback that accepts Corpus2::Token pointers
* @return a flag whose meaning is defined by the MorphAnalyser interface
*         (not visible in this header)
*/
bool process_functional(const Toki::Token& t,
boost::function<void (Corpus2::Token*)> sink);
/// Helper function for interfacing with the transducer output.
/// @param sfst_analysis an analysis string in the transducer's output form
/// @return the unescaped form of the given string
static std::string unescape_analysis(const std::string& sfst_analysis);
/// Class identifier (defined out of line; presumably the "sfst"
/// configuration class key mentioned in the class description)
static const char* identifier;
/// Registered flag (defined out of line)
static bool registered;
private:
/// Constructor for use in clone(); takes only the tagset and no
/// transducer file name
SfstAnalyser(const Corpus2::Tagset *tagset);
/// Convenience function for loading a transducer file
/// @param filename path of the transducer file to open
void open_transducer(const std::string& filename);
/// The transducer, potentially shared between clones
boost::shared_ptr<CompactTransducer> ct_;
/// Mutex for safe access in case there are clones; held through a shared
/// pointer so that it can be shared together with the transducer
boost::shared_ptr<boost::mutex> mutex_;
/// Force lower case flag (see the lower-case configuration key)
bool lcase_;
};
} /* end ns Maca */
#endif // LIBMACA_SFSTMORPHANALYSER_H