splitlayer.h
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#ifndef LIBMACA_CONV_SPLITLAYER_H
#define LIBMACA_CONV_SPLITLAYER_H
#include <libmaca/conv/layer.h>
#include <libmaca/conv/predicate.h>
#include <libmaca/util/confignode.h>
#include <deque>
#include <unicode/regex.h>
#include <boost/scoped_ptr.hpp>
#include <boost/shared_ptr.hpp>
namespace Maca {
namespace Conversion {
/**
* A layer to split tokens matching specific criteria.
*
* Some precondition predicates are first checked, then a regexp
* is matched on the token's orth. If the regexp matches, tokens will be
* created with orths corresponding to the capturing groups (in this case,
* the regexp should have exactly two capturing groups). The resulting
* tokens are referred to as token 1 and token 2.
*
* Token 1 retains the lexemes of the original token. Token 2 is created
* with a pre-set lexeme, with some attributes possibly copied from the
* original token's lexemes.
*
* Finally, some postcondition predicates are applied to output token 1.
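*
* A minimal usage sketch (not part of the original documentation; the
* regexp, predicate and attribute strings are hypothetical, while the
* member functions are those declared in this class below):
* @code
* TwoSplitLayer layer(tagset);
* layer.set_orth_regexp("(\\w+)(by)");  // exactly two capturing groups
* layer.add_precondition("qub");        // hypothetical precondition predicate string
* layer.add_t1_postcondition("nwok");   // hypothetical postcondition for token 1
* layer.append_copy_attrs_to_t2("vcl"); // attribute name string (format assumed)
* layer.set_t2_lexeme(t2_lexeme);       // a previously built Corpus2::Lexeme for token 2
* @endcode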
*/
class TwoSplitLayer : public OneTagsetLayer, boost::noncopyable
{
public:
/// Constructor for an empty TwoSplitLayer working within a tagset
TwoSplitLayer(const Corpus2::Tagset& tagset);
/**
* Config node constructor. Recognized keys are:
* - regexp - the regular expression used for orth splitting, should
* have exactly two capturing groups.
* - pre - precondition predicates, can appear multiple times
* - t1_post - postcondition predicates of token 1, can appear
* multiple times
* - copy_attrs_to_t2 - attributes to copy to token 2 from the original
* token's lexemes, can appear multiple times
* - t2_lemma - lemma of the token 2 lexeme
* - t2_tag - tag of the token 2 lexeme
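*
* A hypothetical configuration fragment (a sketch only; the section name
* and all values are made up, the keys are those listed above, and the
* INI-style layout is an assumption about the Config::Node source):
* @code
* [layer:two_split]
* regexp=(\w+)(by)
* pre=qub
* t1_post=nwok
* copy_attrs_to_t2=vcl
* t2_lemma=by
* t2_tag=qub
* @endcode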
*/
TwoSplitLayer(const Config::Node& cfg);
/// Destructor
~TwoSplitLayer();
TwoSplitLayer* clone() const;
void set_orth_regexp(const std::string& regexp_string);
void add_precondition(const TagPredicate& tp);
void add_precondition(const std::string& pred_string);
void add_t1_postcondition(const TagPredicate& tp);
void add_t1_postcondition(const std::string& pred_string);
void add_copy_attr_to_t2(Corpus2::idx_t a);
void append_copy_attrs_to_t2(const std::string& a);
void set_t2_lexeme(const Corpus2::Lexeme& lex);
Corpus2::Token* get_next_token();
protected:
void clone_helper(TwoSplitLayer* copy) const;
std::deque<Corpus2::Token*> queue_;
boost::scoped_ptr<RegexMatcher> orth_matcher_;
boost::shared_ptr<RegexPattern> orth_pattern_;
std::vector<TagPredicate> pre_;
std::vector<TagPredicate> t1_post_;
Corpus2::mask_t copy_attrs_to_t2_;
Corpus2::Lexeme t2_lexeme_;
};
/**
* Three-token variant of the split layer. The main difference from
* TwoSplitLayer is that the regexp should have three capturing groups;
* otherwise it behaves similarly. The resulting split tokens are
* referred to as tokens 1, 2 and 3.
*/
class ThreeSplitLayer : public TwoSplitLayer
{
public:
/// Constructor for an empty ThreeSplitLayer working within a tagset
ThreeSplitLayer(const Corpus2::Tagset& tagset);
/**
* Config node constructor. Recognized keys are:
* - copy_attrs_to_t3 - attributes to copy to token 3 from the original
* token's lexemes, can appear multiple times
* - t3_lemma - lemma of the token 3 lexeme
* - t3_tag - tag of the token 3 lexeme
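*
* A hypothetical fragment (values made up; the TwoSplitLayer keys are
* assumed to apply here as well, since this class derives from it):
* @code
* regexp=(\w+)(by)(m)
* copy_attrs_to_t3=vcl
* t3_lemma=m
* t3_tag=aglt
* @endcode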
*/
ThreeSplitLayer(const Config::Node& cfg);
ThreeSplitLayer* clone() const;
void add_copy_attr_to_t3(Corpus2::idx_t a);
void append_copy_attrs_to_t3(const std::string& a);
void set_t3_lexeme(const Corpus2::Lexeme& lex);
Corpus2::Token* get_next_token();
protected:
Corpus2::mask_t copy_attrs_to_t3_;
Corpus2::Lexeme t3_lexeme_;
};
} /* end ns Conversion */
} /* end ns Maca */
#endif // LIBMACA_CONV_SPLITLAYER_H