sentenceanalyser.cpp
2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libmaca project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and COPYING files for more details.
*/
#include <libmaca/util/sentenceanalyser.h>
#include <libmaca/util/settings.h>
#include <libtoki/tokenizer/layertokenizer.h>
#include <libmaca/morph/dispatchanalyser.h>
#include <boost/scoped_ptr.hpp>
#include <boost/make_shared.hpp>
#include <boost/algorithm/string.hpp>
namespace Maca {
/**
 * Build an analyser from an existing tokenizer and morphological analyser.
 *
 * @param tok tokenizer to store; the stored pointer is dereferenced right
 *            away to initialise sp_, so it must not be null
 * @param ma  morphological analyser to store; get_next_sentence() calls
 *            process() on it
 *
 * NOTE(review): sp_ is initialised from *tok_ (the member, not the
 * parameter), which assumes tok_ is declared before sp_ in the class
 * definition -- confirm against the header.
 */
SentenceAnalyser::SentenceAnalyser(
        const boost::shared_ptr<Toki::Tokenizer>& tok,
        const boost::shared_ptr<MorphAnalyser>& ma)
    : UnicodeSink(),
      tok_(tok),
      sp_(*tok_),
      ma_(ma)
{
}
/**
 * Build an analyser from a Maca configuration node.
 *
 * The tokenizer comes from Toki::LayerTokenizer::create_from_named_config(),
 * called with the value read from the "general.toki-config" key of cfg
 * ("" is passed as the second argument to cfg.get -- presumably the fallback
 * used when the key is absent; confirm). The morphological analyser is a new
 * DispatchAnalyser constructed from the same cfg.
 *
 * @param cfg Maca configuration node
 */
SentenceAnalyser::SentenceAnalyser(const Config::Node &cfg)
    : UnicodeSink(),
      tok_(Toki::LayerTokenizer::create_from_named_config(
              cfg.get("general.toki-config", ""))),
      sp_(*tok_),
      ma_(new DispatchAnalyser(cfg))
{
}
/**
 * Build an analyser from a Maca configuration node, with the tokenizer
 * configuration supplied explicitly.
 *
 * Unlike the single-argument constructor, the "general.toki-config" key of
 * cfg is not consulted here: the tokenizer is a new Toki::LayerTokenizer
 * built directly from toki_config_override.
 *
 * @param cfg                  Maca configuration node, used to construct the
 *                             DispatchAnalyser
 * @param toki_config_override Toki configuration node used to construct the
 *                             tokenizer
 */
SentenceAnalyser::SentenceAnalyser(
        const Config::Node &cfg,
        const Toki::Config::Node &toki_config_override)
    : UnicodeSink(),
      tok_(new Toki::LayerTokenizer(toki_config_override)),
      sp_(*tok_),
      ma_(new DispatchAnalyser(cfg))
{
}
/**
 * Factory: look up a Maca configuration by name and build an analyser
 * from it.
 *
 * @param config_name name handed to get_named_config()
 * @return shared pointer to a SentenceAnalyser constructed from the
 *         configuration that get_named_config() yields
 */
boost::shared_ptr<SentenceAnalyser>
SentenceAnalyser::create_from_named_config(
        const std::string& config_name)
{
    return boost::make_shared<SentenceAnalyser>(
            get_named_config(config_name));
}
/**
 * Factory: look up a Maca configuration and a Toki configuration by name
 * and build an analyser from the pair.
 *
 * @param config_name          name handed to get_named_config()
 * @param toki_config_override name handed to Toki::get_named_config(); the
 *                             resulting node is passed to the constructor as
 *                             the tokenizer configuration
 * @return shared pointer to a SentenceAnalyser constructed from both
 *         configuration nodes
 */
boost::shared_ptr<SentenceAnalyser>
SentenceAnalyser::create_from_named_config(
        const std::string& config_name,
        const std::string& toki_config_override)
{
    // The two lookups are kept as separate statements so that the Maca
    // configuration is always resolved before the Toki one.
    const Config::Node& maca_node = get_named_config(config_name);
    const Toki::Config::Node& toki_node =
            Toki::get_named_config(toki_config_override);

    return boost::make_shared<SentenceAnalyser>(maca_node, toki_node);
}
/**
 * List the entries that Path::Instance().list_files(".ini") reports.
 *
 * @return the reported entries joined into one string, separated by single
 *         spaces
 */
std::string SentenceAnalyser::available_configurations()
{
    const std::string separator(" ");
    return boost::algorithm::join(
            Path::Instance().list_files(".ini"), separator);
}
/**
 * Fetch the next sentence from sp_ and run it through the morphological
 * analyser.
 *
 * @return the result of ma_->process() on the next sentence, or a
 *         default-constructed (empty) Corpus2::Sentence::Ptr when sp_
 *         reports that nothing more is available
 */
Corpus2::Sentence::Ptr SentenceAnalyser::get_next_sentence()
{
    // Nothing left to read: signal the end with an empty pointer.
    if (!sp_.has_more()) {
        return Corpus2::Sentence::Ptr();
    }

    // The scoped_ptr takes ownership of the sentence returned by sp_ and
    // releases it when this function returns.
    boost::scoped_ptr<Toki::Sentence> toki_sentence(sp_.get_next_sentence());

    // Debug-build checks only: after has_more() returned true, the sentence
    // is expected to be non-null and non-empty.
    assert(toki_sentence);
    assert(!toki_sentence->empty());

    return ma_->process(*toki_sentence);
}
/*
 * Restart the tokenizer and attach it to the input source returned by
 * get_input_source().
 * NOTE(review): presumably a UnicodeSink hook invoked when the input source
 * changes -- confirm against the base class.
 */
void SentenceAnalyser::new_input_source()
{
/* restart() runs before the source is attached.
   NOTE(review): presumably this discards tokenizer state tied to the
   previous source -- confirm before reordering these two calls. */
tok_->restart();
tok_->set_input_source(get_input_source());
}
/**
 * Drain the analyser: hand every sentence produced by get_next_sentence()
 * to sink, stopping at the first empty pointer.
 *
 * @param sink callable invoked once per sentence with the sentence pointer
 * @return true if sink was invoked at least once, false otherwise
 */
bool SentenceAnalyser::process(sentence_sink_t sink)
{
    bool had_sentences = false;

    for (;;) {
        Corpus2::Sentence::Ptr sentence = get_next_sentence();
        if (!sentence) {
            // An empty pointer marks the end of the available input.
            break;
        }
        sink(sentence);
        had_sentences = true;
    }

    return had_sentences;
}
} /* end ns Maca */