joinrule.cpp
3.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#include <libmaca/conv/joinrule.h>
#include <boost/foreach.hpp>
#include <libcorpus2/tagsetmanager.h>
#include <libmaca/conv/attributecopier.h>
#include <boost/algorithm/string.hpp>
namespace Maca {
namespace Conversion {
/**
 * Creates a rule bound to the given tagset.
 *
 * Only the address of @p tagset is stored, so the tagset object has to
 * remain valid for as long as this rule is used. The mask of attributes
 * to copy from the second token is initialised from 0.
 */
JoinRule::JoinRule(const Corpus2::Tagset& tagset)
    : tagset_(&tagset),
      copy_t2_attrs_(0)
{
}
/**
 * Builds a rule from a configuration node.
 *
 * The tagset is obtained by name from the value stored under the "tagset"
 * key. The node's children are then visited in order and the following
 * keys are recognised:
 *  - "t1_pos"  / "t2_pos":  POS name used for the first / second token
 *                           precondition,
 *  - "t1_orth" / "t2_orth": UTF-8 encoded orth used for the first / second
 *                           token precondition,
 *  - "post":                forwarded to add_postcondition(),
 *  - "copy_attr":           forwarded to append_copy_attrs().
 * Children with any other key are skipped. If a pos/orth key occurs more
 * than once, the last occurrence wins. Both token preconditions are set
 * once the loop is done, using empty values for keys that were absent.
 */
JoinRule::JoinRule(const Config::Node& cfg)
    : tagset_(&Corpus2::get_named_tagset(cfg.get<std::string>("tagset"))),
      copy_t2_attrs_(0)
{
    std::string first_pos;
    std::string second_pos;
    UnicodeString first_orth;
    UnicodeString second_orth;

    BOOST_FOREACH (const Config::Node::value_type& child, cfg) {
        const std::string& key = child.first;
        if (key == "t1_pos") {
            first_pos = child.second.data();
        } else if (key == "t2_pos") {
            second_pos = child.second.data();
        } else if (key == "t1_orth") {
            first_orth = UnicodeString::fromUTF8(child.second.data());
        } else if (key == "t2_orth") {
            second_orth = UnicodeString::fromUTF8(child.second.data());
        } else if (key == "post") {
            add_postcondition(child.second.data());
        } else if (key == "copy_attr") {
            append_copy_attrs(child.second.data());
        }
    }

    set_token1_preconditions(first_pos, first_orth);
    set_token2_preconditions(second_pos, second_orth);
}
/**
 * Replaces the precondition for the first token with a copy of @p pre.
 */
void JoinRule::set_token1_preconditions(const PosOrthPredicate& pre)
{
    pre1_ = pre;
}
void JoinRule::set_token1_preconditions(const std::string& pos,
const UnicodeString& orth)
{
Corpus2::mask_t p = tagset_->get_pos_mask(pos);
pre1_ = PosOrthPredicate(p, orth);
}
void JoinRule::set_token2_preconditions(const PosOrthPredicate &pre)
{
pre1_ = pre;
}
void JoinRule::set_token2_preconditions(const std::string& pos,
const UnicodeString& orth)
{
Corpus2::mask_t p = tagset_->get_pos_mask(pos);
pre2_ = PosOrthPredicate(p, orth);
}
/**
 * Overwrites the mask of attributes that try_join() passes to
 * copy_attributes() when copying from the second token to the first.
 */
void JoinRule::set_copy_attrs(Corpus2::mask_t mask)
{
    copy_t2_attrs_ = mask;
}
/**
 * Updates the copy-attributes mask from a string of attribute names.
 *
 * The work is delegated to append_attributes_mask(), which receives the
 * current mask, this rule's tagset and @p names.
 */
void JoinRule::append_copy_attrs(const std::string& names)
{
    append_attributes_mask(copy_t2_attrs_, tagset(), names);
}
/**
 * Appends a copy of @p tp to the list of postconditions that try_join()
 * applies to the joined token.
 */
void JoinRule::add_postcondition(const TagPredicate& tp)
{
    post_.push_back(tp);
}
void JoinRule::add_postcondition(const std::string& pred_string)
{
std::vector<std::string> srv;
boost::algorithm::split(srv, pred_string,
boost::is_any_of(std::string(": ")));
BOOST_FOREACH(const std::string& sr, srv) {
if (!sr.empty()) {
post_.push_back(TagPredicate(sr, *tagset_));
}
}
}
/**
 * Attempts to join two tokens according to this rule.
 *
 * The first precondition is checked against @p t1 and, only if it holds,
 * the second one against @p t2. When both hold:
 *  - the orth of @p t1 becomes its old orth followed by the orth of @p t2,
 *  - copy_attributes() is called with @p t2 as source, the configured
 *    mask, and @p t1 as target,
 *  - the postconditions are applied to @p t1,
 *  - @p t2 is deleted, so the caller must not use it afterwards.
 *
 * Both pointers are dereferenced unconditionally and must not be NULL.
 *
 * @return @p t1 if the tokens were joined; NULL otherwise, in which case
 *         @p t2 is not deleted
 */
Corpus2::Token* JoinRule::try_join(Corpus2::Token* t1, Corpus2::Token* t2) const
{
    if (!pre1_.check(*t1) || !pre2_.check(*t2)) {
        return NULL;
    }

    t1->set_orth(t1->orth() + t2->orth());
    copy_attributes(*t2, copy_t2_attrs_, *t1);
    apply_predicates(post_, *t1);

    // t2 has been merged into t1 and is destroyed here.
    delete t2;
    return t1;
}
} /* end ns Conversion */
} /* end ns Maca */