fold.cpp
3.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#include <libmaca/conv/fold.h>
#include <boost/foreach.hpp>
#include <libcorpus2/token.h>
namespace Maca { namespace Conversion {
size_t find_shortest(const std::vector<std::vector<Corpus2::Token *> >& v,
size_t& min_len_path)
{
size_t min_len = v[0].size();
min_len_path = 0;
for (size_t pi = 0; pi < v.size(); ++pi) {
if (v[pi].size() < min_len) {
min_len = v[pi].size();
min_len_path = pi;
}
}
return min_len;
}
bool try_fold_paths(const std::vector< std::vector<Corpus2::Token*> >& v,
boost::function<void (Corpus2::Token*)> sink)
{
size_t min_len = v[0].size();
size_t max_len = v[0].size();
for (size_t pi = 0; pi < v.size(); ++pi) {
if (v[pi].size() < min_len) {
min_len = v[pi].size();
} else if (v[pi].size() > max_len) {
max_len = v[pi].size();
}
}
// folding is only possible if the paths all have the same length
if (max_len != min_len) return false;
for (size_t ti = 0; ti < min_len; ++ti) {
Corpus2::Token* t = v[0][ti];
for (size_t pi = 1; pi < v.size(); ++pi) {
// if orths vary, merging is not possible
if (t->orth() != v[pi][ti]->orth()) return false;
}
}
// actual merging
for (size_t ti = 0; ti < min_len; ++ti) {
Corpus2::Token* t = v[0][ti];
for (size_t pi = 1; pi < v.size(); ++pi) {
BOOST_FOREACH(const Corpus2::Lexeme& lex, v[pi][ti]->lexemes()) {
t->add_lexeme(lex);
}
delete v[pi][ti];
}
t->remove_duplicate_lexemes();
sink(t);
}
return true;
}
std::vector<Corpus2::Token*> choose_path(
const std::vector< std::vector<Corpus2::Token*> >& v, size_t n)
{
assert(n < v.size());
for (size_t i = 0; i < v.size(); ++i) {
if (i != n) {
BOOST_FOREACH(Corpus2::Token* t, v[i]) {
delete t;
}
}
}
return v[n];
}
void choose_path(const std::vector< std::vector<Corpus2::Token*> >& v, size_t n,
boost::function<void (Corpus2::Token*)> sink)
{
assert(n < v.size());
for (size_t i = 0; i < v.size(); ++i) {
if (i != n) {
BOOST_FOREACH(Corpus2::Token* t, v[i]) {
delete t;
}
}
}
BOOST_FOREACH(Corpus2::Token* t, v[n]) {
sink(t);
}
}
std::vector<Corpus2::Token*> choose_shortest_path(
const std::vector< std::vector<Corpus2::Token*> >& v)
{
size_t min_len_path;
find_shortest(v, min_len_path);
return choose_path(v, min_len_path);
}
void choose_shortest_path(const std::vector< std::vector<Corpus2::Token*> >& v,
boost::function<void (Corpus2::Token*)> sink)
{
size_t min_len_path;
find_shortest(v, min_len_path);
choose_path(v, min_len_path, sink);
}
} /* end ns Conversion */
} /* end ns Maca */