/*
    Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
    Part of the libmaca project

    This program is free software; you can redistribute it and/or modify it
    under the terms of the GNU Lesser General Public License as published by
    the Free Software Foundation; either version 3 of the License, or (at your
    option) any later version.

    This program is distributed in the hope that it will be useful, but
    WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
    or FITNESS FOR A PARTICULAR PURPOSE.

    See the LICENSE.MACA, LICENSE.SFST, LICENSE.GUESSER, COPYING.LESSER and
    COPYING files for more details.
*/

#include <libcorpus2/io/writer.h>
#include <libmaca/io/text.h>
#include <libmaca/io/premorph.h>
#include <libmaca/morph/dispatchanalyser.h>
#include <libmaca/util/settings.h>
#include <libcorpus2/util/settings.h>
#include <libmaca/util/sentenceanalyser.h>
#include <libcorpus2/util/tokentimer.h>
// generated by CMake
#include <libmaca/version.h>

#include <libtoki/sentencesplitter.h>
#include <libtoki/tokenizer/layertokenizer.h>
#include <boost/foreach.hpp>
#include <libtoki/util/settings.h>

#include <boost/algorithm/string.hpp>
#include <boost/program_options.hpp>
#include <boost/make_shared.hpp>

#include <cassert>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <omp.h>

/**
 * Command-line entry point of the analyser.
 *
 * Parses the command line, optionally overrides the Maca / Toki configuration
 * search paths, loads any requested plugins and then, depending on the chosen
 * input format, analyses standard input and writes the result to standard
 * output or to the file given with --output-file.
 *
 * Exit codes:
 *   0 - success (also --script-help and --version)
 *   1 - --help was requested, or no analyser configuration was given
 *   2 - command-line parsing error
 *   4 - Maca::MacaError
 *   6 - Toki::TokiError
 *   8 - Corpus2::Corpus2Error
 */
int main(int argc, char** argv)
{
    std::string config, toki_config;
    std::string input_format, output_format;
    std::string output_filename;
    std::vector<std::string> plugins;
    std::string config_path, toki_config_path, initial_wa_override;
    // NOTE: --threads is accepted and stored here, but nothing in this
    // function reads the value afterwards.
    int threads = 0;
    bool quiet = false, progress = false, split_chunks = false;
    bool linewise = false;
    using boost::program_options::value;

    std::string writers = boost::algorithm::join(
            Corpus2::TokenWriter::available_writer_types_help(), " ");
    std::string writers_help = "Output format, any of: " + writers + "\n";

    boost::program_options::options_description desc("Allowed options");
    desc.add_options()
        ("config,c", value(&config),
         "Morphological analyser configuration file")
        ("config-path,C", value(&config_path)->default_value(""),
         "Override config search path")
        ("toki-config,t", value(&toki_config),
         "Tokenizer configuration file. "
         "Overrides config value, only used in some input modes.")
        ("toki-config-path", value(&toki_config_path)->default_value(""),
         "Override toki config search path")
        ("initial-wa-override", value(&initial_wa_override),
         "Initial whitespace (overrides toki config file)")
        ("input-format,i", value(&input_format)->default_value("text"),
         "Input format, any of: text premorph premorph-stream premorph-stream-nosent")
        ("output-format,o", value(&output_format)->default_value("plain"),
         writers_help.c_str())
        ("linewise,l", value(&linewise)->default_value(false)->zero_tokens(),
         "``Interactive'' line-wise input processing (newlines force sentence breaks)")
        ("output-file", value(&output_filename),
         "Output filename (do not write to stdout)")
        ("split,s", value(&split_chunks)->zero_tokens(),
         "Split output into chunks on many-newline tokens")
        ("plugin,P", value(&plugins),
         "Additional plugins to load")
        ("threads,T", value(&threads),
         "Threads to use")
        ("progress,p", value(&progress)->zero_tokens(),
         "Show progress info")
        ("quiet,q", value(&quiet)->zero_tokens(),
         "Suppress startup info when loading a tagset")
        ("help,h", "Show help")
        ("version", "print version string")
        ;

    // "script" is the option set actually parsed: it is "desc" plus the
    // --script-help switch, which is therefore not listed by --help.
    boost::program_options::options_description script("Script help");
    script.add_options()
        ("script-help", "Show help in a greppable format")
        ;
    script.add(desc);

    boost::program_options::variables_map vm;
    boost::program_options::positional_options_description p;
    // Positional arguments are treated as values of --config.
    p.add("config", -1);

    try {
        boost::program_options::store(
            boost::program_options::command_line_parser(argc, argv)
                .options(script).positional(p).run(), vm);
        // notify() is kept inside the try block so that option errors it
        // raises are reported the same way as parsing errors instead of
        // escaping main().
        boost::program_options::notify(vm);
    } catch (const boost::program_options::error& e) {
        std::cerr << e.what() << "\n";
        return 2;
    }

    if (!config_path.empty()) {
        Maca::Path::Instance().set_search_path(config_path);
    }
    if (!toki_config_path.empty()) {
        // The tokenizer search path comes from --toki-config-path, not from
        // the analyser's --config-path.
        Toki::Path::Instance().set_search_path(toki_config_path);
    }

    // Plugins are loaded before handling --help so that the analyser types
    // printed there are listed after the load_plugin() calls have run.
    BOOST_FOREACH(const std::string& s, plugins) {
        Maca::MorphAnalyser::load_plugin(s, false);
    }

    if (vm.count("help")) {
        std::cout << desc << "\n";
        std::cout << "Available analyser types: ";
        std::cout << boost::algorithm::join(
                Maca::MorphAnalyser::available_analyser_types(), " ") << "\n";
        std::cout << "Available configurations: ";
        std::cout << Maca::SentenceAnalyser::available_configurations() << "\n";
        return 1;
    }
    if (vm.count("script-help")) {
        std::cout << "INPUT ";
        std::cout << "text premorph premorph-stream premorph-stream-nosent";
        std::cout << "\n";
        std::cout << "OUTPUT ";
        std::cout << boost::algorithm::join(
                Corpus2::TokenWriter::available_writer_types(), " ");
        std::cout << "\n";
        std::cout << boost::algorithm::join(
                Corpus2::TokenWriter::available_writer_types_help(), "\n");
        std::cout << "\n";
        return 0;
    }
    if (vm.count("version")) {
        std::cout << "maca-analyse (MACA) " << LIBMACA_VERSION << "\n";
        return 0;
    }

    Toki::Path::Instance().set_verbose(!quiet);
    Maca::Path::Instance().set_verbose(!quiet);
    Corpus2::Path::Instance().set_verbose(!quiet);

    if (config.empty()) {
        std::cerr << "Usage: analyse -c CONFIG [OPTIONS]\n";
        std::cerr << "See analyse --help\n";
        return 1;
    }

    try {
        boost::shared_ptr<Maca::SentenceAnalyser> sa;
        if (toki_config.empty()) {
            sa = Maca::SentenceAnalyser::create_from_named_config(config);
        } else {
            sa = Maca::SentenceAnalyser::create_from_named_config(
                    config, toki_config);
        }

        if (!initial_wa_override.empty()) {
            PwrNlp::Whitespace::Enum wa = PwrNlp::Whitespace::from_string(
                    initial_wa_override);
            if (PwrNlp::Whitespace::is_valid(wa)) {
                Toki::Tokenizer& tok = sa->tokenizer();
                // Pointer form of dynamic_cast: a tokenizer that is not a
                // LayerTokenizer yields a null pointer here, whereas the
                // reference form would throw std::bad_cast, which none of
                // the handlers below catch.
                if (Toki::LayerTokenizer* layer_tok =
                        dynamic_cast<Toki::LayerTokenizer*>(&tok)) {
                    layer_tok->input_tokenizer().set_initial_whitespace(wa);
                } else {
                    std::cerr << "Initial whitespace override ignored: "
                        << "tokenizer is not a layer tokenizer\n";
                }
            } else {
                std::cerr << "Invalid initial whitespace: "
                    << initial_wa_override << "\n";
            }
        }

        // Stream modes: stdin is handed to a PremorphProcessor that writes
        // to stdout; --output-format and --output-file are not consulted.
        if (input_format == "premorph-stream"
                || input_format == "premorph-stream-nosent") {
            Maca::PremorphProcessor pp(std::cout, sa);
            pp.set_stats(progress);
            if (input_format == "premorph-stream-nosent") {
                // we don't want to mark sentences in output
                pp.set_mark_sentences(false);
            }
            pp.parse_stream(std::cin);
            return 0;
        }

        boost::shared_ptr<Corpus2::TokenWriter> writer;
        if (output_filename.empty()) {
            writer = Corpus2::TokenWriter::create_stream_writer(
                    output_format, std::cout, sa->tagset());
        } else {
            writer = Corpus2::TokenWriter::create_path_writer(
                    output_format, output_filename, sa->tagset());
        }

        Corpus2::TokenTimer& timer = Corpus2::global_timer();
        timer.register_signal_handler();

        boost::shared_ptr<Corpus2::TokenReader> tr;
        if (linewise) {
            // Each input line is fed to the analyser as a separate input
            // source; --input-format is not consulted in this mode.
            std::string line;
            while (std::getline(std::cin, line)) {
                sa->set_input_source(UnicodeString::fromUTF8(line));
                while (Corpus2::Sentence::Ptr sentence =
                        sa->get_next_sentence()) {
                    timer.count_sentence(*sentence);
                    writer->write_sentence(*sentence);
                    if (progress) {
                        timer.check_slice();
                    }
                }
            }
        } else {
            if (input_format == "premorph") {
                tr = boost::make_shared<Maca::PremorphReader>(
                        boost::ref(std::cin), sa);
                // whatever config says, premorph consists of chunks
                split_chunks = true;
            } else {
                // NOTE(review): meaning of the trailing "1" argument is
                // defined by Maca::TextReader - confirm against its header.
                tr = boost::make_shared<Maca::TextReader>(
                        boost::ref(std::cin), sa, 1);
            }

            if (split_chunks) {
                while (boost::shared_ptr<Corpus2::Chunk> chunk =
                        tr->get_next_chunk()) {
                    // Empty chunks are neither written nor counted.
                    if (!chunk->empty()) {
                        writer->write_chunk(*chunk);
                        timer.count_chunk(*chunk);
                        if (progress) {
                            timer.check_slice();
                        }
                    }
                }
            } else {
                while (Corpus2::Sentence::Ptr sentence =
                        tr->get_next_sentence()) {
                    assert(!sentence->empty());
                    timer.count_sentence(*sentence);
                    writer->write_sentence(*sentence);
                    if (progress) {
                        timer.check_slice();
                    }
                }
            }
        }

        if (progress) {
            timer.stats();
        }
    } catch (Maca::MacaError& e) {
        std::cerr << "Maca Error: " << e.info() << "\n";
        return 4;
    } catch (Toki::TokiError& e) {
        std::cerr << "Tokenizer Error: " << e.info() << "\n";
        return 6;
    } catch (Corpus2::Corpus2Error& e) {
        std::cerr << "Corpus2 Error: " << e.info() << "\n";
        return 8;
    }
    return 0;
}