guesser.cpp
3.53 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#include <libmaca/morph/guesser.h>
#include <boost/algorithm/string.hpp>
#include <Corpus/guesser_api.h>
#include <algorithm>
#include <morfeusz.h>
namespace Maca {
const char* GuesserAnalyser::identifier = "guesser";
bool GuesserAnalyser::registered =
MorphAnalyser::register_analyser<GuesserAnalyser>();
GuesserAnalyser::GuesserAnalyser(const Config::Node &cfg)
: MorphAnalyser(cfg)
{
setup_corpus1();
}
GuesserAnalyser::GuesserAnalyser(const Corpus2::Tagset *tagset)
: MorphAnalyser(tagset)
{
setup_corpus1();
}
void GuesserAnalyser::setup_corpus1()
{
// we operate in UTF8
SetCorpusEncoding(GUESSER_UTF8);
// if unexpected tags come out of Guesser (or actually
// Morfeusz+Guesser system), just warn instead of halting
AllowMorfErrors();
}
GuesserAnalyser::~GuesserAnalyser()
{
}
GuesserAnalyser* GuesserAnalyser::clone() const
{
GuesserAnalyser* copy = new GuesserAnalyser(*this);
return copy;
}
bool GuesserAnalyser::process_functional(const Toki::Token &t,
boost::function<void (Corpus2::Token *)> sink)
{
std::string orthu8 = t.orth_utf8();
// feed the orth through the guesser
const char* guesser_response = GuessForm(orthu8.c_str());
if (!guesser_response) {
throw MacaError("Guesser returned NULL");
}
std::string gs(guesser_response);
std::vector<std::string> lines;
boost::algorithm::split(lines, gs, boost::is_any_of("\n"));
PwrNlp::Whitespace::Enum wa = t.preceeding_whitespace();
bool sunk = false;
// NOTE: the Guesser API may also run Morfeusz.
// If the output is multi-token, it clearly comes from Morfeusz,
// so we immediately reject multi-line guesser responses.
// Also note that single-token output may be from the guesser,
// but it might also come from Morfeusz -- we can't check it via
// this simple API.
std::vector<std::string> nonempty_lines;
BOOST_FOREACH(const std::string& line, lines) {
if (!line.empty()) {
nonempty_lines.push_back(line);
}
}
if (nonempty_lines.size() == 1 ) {
const std::string line = nonempty_lines[0];
std::vector<std::string> olt; // orth-lemma-tag
boost::algorithm::split(olt, line, boost::is_any_of(" \t"));
if (((olt.size() - 1) % 2 != 0) || (olt.size() < 3)) {
throw MacaError("Unexpected orth-lemma-tag line returned by Guesser: " + line);
}
// UnicodeString the_orth = UnicodeString::fromUTF8(olt[0]);
// take orth from input unchanged
std::auto_ptr<Corpus2::Token> tt(new Corpus2::Token(t.orth(), wa));
wa = PwrNlp::Whitespace::None;
size_t lexeme_start_idx = 1;
while (lexeme_start_idx + 1 < olt.size()) {
if (olt[lexeme_start_idx + 1] != "ign") {
UnicodeString the_lemma = UnicodeString::fromUTF8(olt[lexeme_start_idx]);
Corpus2::Tag the_tag = tagset().parse_simple_tag(olt[lexeme_start_idx + 1]);
Corpus2::Lexeme lex(the_lemma, the_tag);
tt->add_lexeme(lex);
}
lexeme_start_idx += 2;
}
if (sunk || !tt->lexemes().empty()) {
sink(tt.release());
sunk = true;
}
}
return sunk;
}
} /* end ns Maca */