Blame view

morfeusz/cli/cli.cpp 9.91 KB
Michał Lenart authored
1
2
3
4

#include <iostream>
#include <cstdlib>
#include "cli.hpp"
Michał Lenart authored
5
#include "../const.hpp"
Michał Lenart authored
6
7
8
9

using namespace std;
using namespace ez;
Michał Lenart authored
10
11
namespace morfeusz {
Michał Lenart authored
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
    /**
     * Writes the usage text produced by the option parser to a stream.
     *
     * @param opt parser that is asked (via getUsage) for the usage text
     * @param out stream the usage text is written to
     */
    static inline void printCLIUsage(ezOptionParser& opt, ostream& out) {
        string usageText;
        opt.getUsage(usageText);
        out << usageText;
    }

    /**
     * Builds the command line parser for the analyzer or generator binary,
     * parses the given arguments and validates them.
     *
     * Options registered for both processor types: help, input, aggl, praet,
     * charset and debug. The case-handling, token-numbering and
     * whitespace-handling options are registered only for ANALYZER.
     *
     * The function terminates the process instead of returning when:
     *  - an argument is not bound to any flag (message on cerr, exit status 1),
     *  - the help flag is set (usage on cout, exit status 0).
     *
     * @param argc number of entries in argv
     * @param argv command line arguments; argv[0] is used as the program name
     *             in the syntax and example texts
     * @param processorType selects the overview text and the analyzer-only options
     * @return pointer to a parser allocated with new; nothing in this function
     *         releases it, so the caller is responsible for deleting it
     */
    ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType processorType) {

        ezOptionParser& opt = *(new ezOptionParser());

        opt.overview = processorType == ANALYZER
                ? "Morfeusz analyzer"
                : "Morfeusz generator";
        opt.syntax = string(argv[0]) + " [OPTIONS]";
        opt.example = string(argv[0]) + " --aggl strict --praet split --input /path/to/file.fsa";
        //	opt.footer = "Morfeusz Copyright (C) 2014\n";

        opt.add(
                "", // Default.
                0, // Required?
                0, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "Display usage instructions.", // Help description.
                "-h", // Flag token.
                "-help", // Flag token.
                "--help", // Flag token.
                "--usage" // Flag token.
                );

        opt.add(
                "", // Default.
                0, // Required?
                1, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "file with analyzer finite state automaton and data, created with buildfsa.py script.", // Help description.
                "-i", // Flag token.
                "-input", // Flag token.
                "--input" // Flag token.
                );

        opt.add(
                "", // Default.
                0, // Required?
                1, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "aggl option.", // Help description.
                "-a", // Flag token.
                "-aggl", // Flag token.
                "--aggl" // Flag token.
                );

        opt.add(
                "", // Default.
                0, // Required?
                1, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "praet option.", // Help description.
                "-p", // Flag token.
                "-praet", // Flag token.
                "--praet" // Flag token.
                );

        opt.add(
                "", // Default.
                0, // Required?
                1, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "input/output charset", // Help description.
                "-c", // Flag token.
                "-charset", // Flag token.
                "--charset" // Flag token.
                );

        if (processorType == ANALYZER) {
            // NOTE: the multi-line help texts below use backslash line
            // continuation inside the string literal, so the leading
            // whitespace of each continued line is part of the help text.
            opt.add(
                    "", // Default.
                    0, // Required?
                    1, // Number of args expected.
                    0, // Delimiter if expecting multiple args.
                    "case handling strategy.\n \
                     WEAK - Case-sensitive but allows interpretations that do not match case but there is no alternative\n \
                     STRICT - strictly case-sensitive\n \
                     IGNORE - ignores case\n", // Help description.
                    "-case-handling", // Flag token.
                    "--case-handling" // Flag token.
                    );
            opt.add(
                    "", // Default.
                    0, // Required?
                    1, // Number of args expected.
                    0, // Delimiter if expecting multiple args.
                    "token numbering strategy.\n \
                     SEPARATE - Start from 0 and reset counter for every line\n \
                     CONTINUOUS - start from 0 and never reset counter", // Help description.
                    "-token-numbering", // Flag token.
                    "--token-numbering" // Flag token.
                    );
            opt.add(
                    "", // Default.
                    0, // Required?
                    1, // Number of args expected.
                    0, // Delimiter if expecting multiple args.
                    "whitespace handling strategy. \n \
                     SKIP - ignore whitespaces \n \
                     APPEND - append whitespaces to preceding segment\n \
                     KEEP - whitespaces are separate segments", // Help description.
                    "-whitespace-handling", // Flag token.
                    "--whitespace-handling" // Flag token.
                    );
        }

        opt.add(
                "", // Default.
                0, // Required?
                0, // Number of args expected.
                0, // Delimiter if expecting multiple args.
                "show some debug information.", // Help description.
                "-d", // Flag token.
                "-debug", // Flag token.
                "--debug" // Flag token.
                );

        opt.parse(argc, argv);

        // More than one leading argument is rejected; the first entry is
        // presumably the program name itself (TODO confirm against ezOptionParser).
        if (opt.firstArgs.size() > 1) {
            cerr << "Invalid argument (not bound to any flag): " << *opt.firstArgs[1] << endl;
            exit(1);
        }

        // Any trailing argument that was not consumed by a flag is an error.
        if (!opt.lastArgs.empty()) {
            cerr << "Invalid argument (not bound to any flag): " << *opt.lastArgs[0] << endl;
            exit(1);
        }

        if (opt.isSet("-h")) {
            printCLIUsage(opt, cout);
            exit(0);
        }
        return &opt;
    }
Michał Lenart authored
153
154
155
156
157
158
159
160
161
162
163
164
165
166

    /**
     * Maps a charset name given on the command line to a Charset value.
     *
     * Accepted names (exact, case-sensitive match): UTF8, ISO8859_2, CP1250, CP852.
     *
     * @param encodingStr charset name to translate
     * @return the Charset value matching encodingStr
     * @throws const char* ("Invalid encoding") when the name is not recognized;
     *         a diagnostic listing the accepted names is written to cerr first
     */
    static Charset getCharset(const string& encodingStr) {
        if (encodingStr == "UTF8")
            return UTF8;
        else if (encodingStr == "ISO8859_2")
            return ISO8859_2;
        else if (encodingStr == "CP1250")
            return CP1250;
        else if (encodingStr == "CP852")
            return CP852;
        else {
            // The list must mirror the names tested above.
            cerr << "Invalid encoding: '" << encodingStr << "'. Must be one of: UTF8, ISO8859_2, CP1250, CP852" << endl;
            throw "Invalid encoding";
        }
    }
Michał Lenart authored
168
169
170

    /**
     * Maps a token numbering name given on the command line to a
     * TokenNumbering value.
     *
     * Accepted names (exact, case-sensitive match): SEPARATE, CONTINUOUS.
     *
     * @param optionStr option value to translate
     * @return the TokenNumbering value matching optionStr
     * @throws const char* ("Invalid token numbering") when the value is not
     *         recognized; a diagnostic is written to cerr first
     */
    static TokenNumbering getTokenNumbering(const string& optionStr) {
        if (optionStr == "SEPARATE") {
            return SEPARATE_NUMBERING;
        }
        if (optionStr == "CONTINUOUS") {
            return CONTINUOUS_NUMBERING;
        }

        // No known name matched: report and signal the failure to the caller.
        cerr << "Invalid token numbering: '" << optionStr << "'. Must be one of: SEPARATE, CONTINUOUS" << endl;
        throw "Invalid token numbering";
    }
Michał Lenart authored
179
180
181

    /**
     * Maps a case handling name given on the command line to a CaseHandling
     * value.
     *
     * Accepted names (exact, case-sensitive match):
     *  - WEAK   -> CONDITIONALLY_CASE_SENSITIVE
     *  - STRICT -> STRICTLY_CASE_SENSITIVE
     *  - IGNORE -> IGNORE_CASE
     *
     * @param optionStr option value to translate
     * @return the CaseHandling value matching optionStr
     * @throws const char* ("Invalid case handling") when the value is not
     *         recognized; a diagnostic is written to cerr first
     */
    static CaseHandling getCaseHandling(const string& optionStr) {
        if (optionStr == "WEAK")
            return CONDITIONALLY_CASE_SENSITIVE;
        else if (optionStr == "STRICT")
            return STRICTLY_CASE_SENSITIVE;
        else if (optionStr == "IGNORE")
            return IGNORE_CASE;
        else {
            cerr << "Invalid case handling: '" << optionStr << "'. Must be one of: WEAK, STRICT, IGNORE" << endl;
            // Thrown text names this option (it used to say "token numbering").
            throw "Invalid case handling";
        }
    }
Michał Lenart authored
192
193
194

    /**
     * Maps a whitespace handling name given on the command line to a
     * WhitespaceHandling value.
     *
     * Accepted names (exact, case-sensitive match): SKIP, APPEND, KEEP.
     *
     * @param optionStr option value to translate
     * @return the WhitespaceHandling value matching optionStr
     * @throws const char* ("Invalid whitespace handling") when the value is
     *         not recognized; a diagnostic is written to cerr first
     */
    static WhitespaceHandling getWhitespaceHandling(const string& optionStr) {
        if (optionStr == "SKIP") {
            return SKIP_WHITESPACES;
        }
        if (optionStr == "APPEND") {
            return APPEND_WHITESPACES;
        }
        if (optionStr == "KEEP") {
            return KEEP_WHITESPACES;
        }

        // No known name matched: report and signal the failure to the caller.
        cerr << "Invalid whitespace handling: '" << optionStr << "'. Must be one of: SKIP, APPEND, KEEP" << endl;
        throw "Invalid whitespace handling";
    }
Michał Lenart authored
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219

    /**
     * Applies the parsed command line options to a Morfeusz instance.
     *
     * Each option is applied only when it is set, and every applied setting is
     * logged to cerr. The case-handling, token-numbering and
     * whitespace-handling options are considered only for ANALYZER.
     *
     * On Windows builds (_WIN32/_WIN64) the charset falls back to CP852 when
     * no charset option was given. An explicit -c value is no longer
     * overwritten by that fallback.
     *
     * @param opt parsed options
     * @param morfeusz instance to configure
     * @param processorType decides which dictionary setter is used for -i and
     *        whether the analyzer-only options are applied
     * @throws const char* propagated from the get* helpers when an option
     *         value is not recognized
     */
    void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz, MorfeuszProcessorType processorType) {
        if (opt.isSet("-i")) {
            string dictFile;
            opt.get("-i")->getString(dictFile);
            switch (processorType) {
                case ANALYZER:
                    morfeusz.setAnalyzerDictionary(dictFile);
                    break;
                case GENERATOR:
                    morfeusz.setGeneratorDictionary(dictFile);
                    break;
                default:
                    break;
            }
            cerr << "Using dictionary from " << dictFile << endl;
        }
        if (opt.isSet("-a")) {
            string aggl;
            opt.get("-a")->getString(aggl);
            cerr << "setting aggl option to " << aggl << endl;
            morfeusz.setAggl(aggl);
        }
        if (opt.isSet("-p")) {
            string praet;
            opt.get("-p")->getString(praet);
            cerr << "setting praet option to " << praet << endl;
            morfeusz.setPraet(praet);
        }
        if (opt.isSet("-d")) {
            cerr << "setting debug to TRUE" << endl;
            morfeusz.setDebug(true);
        }
        if (opt.isSet("-c")) {
            string charset;
            opt.get("-c")->getString(charset);
            cerr << "setting charset to " << charset << endl;
            morfeusz.setCharset(getCharset(charset));
        }

        if (processorType == ANALYZER) {
            if (opt.isSet("-case-handling")) {
                string caseHandling;
                opt.get("-case-handling")->getString(caseHandling);
                cerr << "setting case handling to " << caseHandling << endl;
                morfeusz.setCaseHandling(getCaseHandling(caseHandling));
            }

            if (opt.isSet("-token-numbering")) {
                string tokenNumbering;
                opt.get("-token-numbering")->getString(tokenNumbering);
                cerr << "setting token numbering to " << tokenNumbering << endl;
                morfeusz.setTokenNumbering(getTokenNumbering(tokenNumbering));
            }

            if (opt.isSet("-whitespace-handling")) {
                string whitespaceHandling;
                opt.get("-whitespace-handling")->getString(whitespaceHandling);
                cerr << "setting whitespace handling to " << whitespaceHandling << endl;
                morfeusz.setWhitespaceHandling(getWhitespaceHandling(whitespaceHandling));
            }
        }
#if defined(_WIN64) || defined(_WIN32)
        // CP852 is only a fallback: applying it unconditionally would silently
        // discard the charset the user just selected with -c.
        if (!opt.isSet("-c")) {
            morfeusz.setCharset(CP852);
        }
#endif
    }
Michał Lenart authored
272
273

}