|
1
2
3
4
|
#include <iostream>
#include <cstdlib>
#include "cli.hpp"
|
|
5
|
#include "../const.hpp"
|
|
6
7
8
9
|
using namespace std;
using namespace ez;
|
|
10
11
|
namespace morfeusz {
|
|
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
|
/**
 * Writes the usage text generated by the option parser to the given stream.
 *
 * @param opt parser whose registered options are described
 * @param out stream that receives the usage text
 */
static inline void printCLIUsage(ezOptionParser& opt, ostream& out) {
    string usageText;
    opt.getUsage(usageText);
    out << usageText;
}
/**
 * Builds the command-line option parser for the analyzer or the generator
 * and parses the given arguments with it.
 *
 * The process is terminated with exit(1) when an argument is not bound to any
 * flag, and with exit(0) right after printing the usage text when help was
 * requested.
 *
 * @param argc          number of entries in argv
 * @param argv          raw command-line arguments; argv[0] is used as the
 *                      program name in the syntax and example texts
 * @param processorType ANALYZER additionally registers the case-handling,
 *                      token-numbering and whitespace-handling options
 * @return parser allocated with new, holding the parsed options; it is not
 *         deleted by this function
 */
ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType processorType) {
    ezOptionParser* parser = new ezOptionParser();
    ezOptionParser& opt = *parser;

    const string programName(argv[0]);
    if (processorType == ANALYZER) {
        opt.overview = "Morfeusz analyzer";
    }
    else {
        opt.overview = "Morfeusz generator";
    }
    opt.syntax = programName + " [OPTIONS]";
    opt.example = programName + " --aggl strict --praet split --input /path/to/file.fsa";
    //    opt.footer = "Morfeusz Copyright (C) 2014\n";

    // Arguments of every add() call, in order:
    // default value, required?, number of args expected,
    // delimiter if expecting multiple args, help description, flag tokens...

    opt.add(
            "", 0, 0, 0,
            "Display usage instructions.",
            "-h", "-help", "--help", "--usage");

    opt.add(
            "", 0, 1, 0,
            "file with analyzer finite state automaton and data, created with buildfsa.py script.",
            "-i", "-input", "--input");

    opt.add(
            "", 0, 1, 0,
            "aggl option.",
            "-a", "-aggl", "--aggl");

    opt.add(
            "", 0, 1, 0,
            "praet option.",
            "-p", "-praet", "--praet");

    opt.add(
            "", 0, 1, 0,
            "input/output charset",
            "-c", "-charset", "--charset");

    if (processorType == ANALYZER) {
        // Multi-line help texts are spelled as adjacent string literals so that
        // the printed text does not depend on how this source file is indented.
        opt.add(
                "", 0, 1, 0,
                "case handling strategy.\n "
                "WEAK - Case-sensitive but allows interpretations that do not match case but there is no alternative\n "
                "STRICT - strictly case-sensitive\n "
                "IGNORE - ignores case\n",
                "-case-handling", "--case-handling");

        opt.add(
                "", 0, 1, 0,
                "token numbering strategy\n "
                "SEPARATE - Start from 0 and reset counter for every line\n "
                "CONTINUOUS - start from 0 and never reset counter",
                "-token-numbering", "--token-numbering");

        opt.add(
                "", 0, 1, 0,
                "whitespace handling strategy. \n "
                "SKIP - ignore whitespaces \n "
                "APPEND - append whitespaces to preceding segment\n "
                "KEEP - whitespaces are separate segments",
                "-whitespace-handling", "--whitespace-handling");
    }

    opt.add(
            "", 0, 0, 0,
            "show some debug information.",
            "-d", "-debug", "--debug");

    opt.parse(argc, argv);

    // More than one leading argument means something other than the first
    // entry appeared before any flag; report the second entry.
    if (opt.firstArgs.size() > 1) {
        cerr << "Invalid argument (not bound to any flag): " << *opt.firstArgs[1] << endl;
        exit(1);
    }

    // Any trailing argument that no flag consumed is rejected as well.
    if (!opt.lastArgs.empty()) {
        cerr << "Invalid argument (not bound to any flag): " << *opt.lastArgs[0] << endl;
        exit(1);
    }

    if (opt.isSet("-h")) {
        printCLIUsage(opt, cout);
        exit(0);
    }

    return parser;
}
|
|
153
154
155
156
157
158
159
160
161
162
163
164
165
166
|
/**
 * Translates a charset name given on the command line into a Charset value.
 *
 * The comparison is exact (case-sensitive).
 *
 * @param encodingStr one of "UTF8", "ISO8859_2", "CP1250", "CP852"
 * @return the Charset constant matching encodingStr
 * @throws const char* ("Invalid encoding") after reporting the problem on
 *         stderr when the name is not recognized
 */
static Charset getCharset(const string& encodingStr) {
    if (encodingStr == "UTF8") {
        return UTF8;
    }
    if (encodingStr == "ISO8859_2") {
        return ISO8859_2;
    }
    if (encodingStr == "CP1250") {
        return CP1250;
    }
    if (encodingStr == "CP852") {
        return CP852;
    }
    // The list below names exactly the values accepted above.
    cerr << "Invalid encoding: '" << encodingStr << "'. Must be one of: UTF8, ISO8859_2, CP1250, CP852" << endl;
    throw "Invalid encoding";
}
|
|
168
169
170
|
/**
 * Translates a token numbering name given on the command line into a
 * TokenNumbering value.
 *
 * @param optionStr "SEPARATE" or "CONTINUOUS" (exact, case-sensitive match)
 * @return SEPARATE_NUMBERING or CONTINUOUS_NUMBERING respectively
 * @throws const char* ("Invalid token numbering") after reporting the problem
 *         on stderr when the name is not recognized
 */
static TokenNumbering getTokenNumbering(const string& optionStr) {
    if (optionStr == "SEPARATE") {
        return SEPARATE_NUMBERING;
    }
    if (optionStr == "CONTINUOUS") {
        return CONTINUOUS_NUMBERING;
    }
    cerr << "Invalid token numbering: '" << optionStr << "'. Must be one of: SEPARATE, CONTINUOUS" << endl;
    throw "Invalid token numbering";
}
|
|
179
180
181
|
/**
 * Translates a case handling name given on the command line into a
 * CaseHandling value.
 *
 * @param optionStr "WEAK", "STRICT" or "IGNORE" (exact, case-sensitive match)
 * @return CONDITIONALLY_CASE_SENSITIVE, STRICTLY_CASE_SENSITIVE or IGNORE_CASE
 *         respectively
 * @throws const char* ("Invalid case handling") after reporting the problem
 *         on stderr when the name is not recognized
 */
static CaseHandling getCaseHandling(const string& optionStr) {
    if (optionStr == "WEAK") {
        return CONDITIONALLY_CASE_SENSITIVE;
    }
    if (optionStr == "STRICT") {
        return STRICTLY_CASE_SENSITIVE;
    }
    if (optionStr == "IGNORE") {
        return IGNORE_CASE;
    }
    cerr << "Invalid case handling: '" << optionStr << "'. Must be one of: WEAK, STRICT, IGNORE" << endl;
    // The thrown text names this option, matching the message printed above.
    throw "Invalid case handling";
}
|
|
192
193
194
|
/**
 * Translates a whitespace handling name given on the command line into a
 * WhitespaceHandling value.
 *
 * @param optionStr "SKIP", "APPEND" or "KEEP" (exact, case-sensitive match)
 * @return SKIP_WHITESPACES, APPEND_WHITESPACES or KEEP_WHITESPACES respectively
 * @throws const char* ("Invalid whitespace handling") after reporting the
 *         problem on stderr when the name is not recognized
 */
static WhitespaceHandling getWhitespaceHandling(const string& optionStr) {
    if (optionStr == "SKIP") {
        return SKIP_WHITESPACES;
    }
    if (optionStr == "APPEND") {
        return APPEND_WHITESPACES;
    }
    if (optionStr == "KEEP") {
        return KEEP_WHITESPACES;
    }
    cerr << "Invalid whitespace handling: '" << optionStr << "'. Must be one of: SKIP, APPEND, KEEP" << endl;
    throw "Invalid whitespace handling";
}
|
|
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
|
void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz, MorfeuszProcessorType processorType) {
if (opt.isSet("-i")) {
string dictFile;
opt.get("-i")->getString(dictFile);
switch (processorType) {
case ANALYZER:
morfeusz.setAnalyzerDictionary(dictFile);
break;
case GENERATOR:
morfeusz.setGeneratorDictionary(dictFile);
break;
default:
break;
}
|
|
220
|
cerr << "Using dictionary from " << dictFile << endl;
|
|
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
|
}
if (opt.isSet("-a")) {
string aggl;
opt.get("-a")->getString(aggl);
cerr << "setting aggl option to " << aggl << endl;
morfeusz.setAggl(aggl);
}
if (opt.isSet("-p")) {
string praet;
opt.get("-p")->getString(praet);
cerr << "setting praet option to " << praet << endl;
morfeusz.setPraet(praet);
}
if (opt.isSet("-d")) {
cerr << "setting debug to TRUE" << endl;
morfeusz.setDebug(true);
}
if (opt.isSet("-c")) {
string charset;
opt.get("-c")->getString(charset);
cerr << "setting charset to " << charset << endl;
morfeusz.setCharset(getCharset(charset));
}
if (processorType == ANALYZER) {
|
|
246
247
248
249
250
|
if (opt.isSet("-case-handling")) {
string caseHandling;
opt.get("-case-handling")->getString(caseHandling);
cerr << "setting case handling to " << caseHandling << endl;
morfeusz.setCaseHandling(getCaseHandling(caseHandling));
|
|
251
252
253
254
255
256
257
258
|
}
if (opt.isSet("-token-numbering")) {
string tokenNumbering;
opt.get("-token-numbering")->getString(tokenNumbering);
cerr << "setting token numbering to " << tokenNumbering << endl;
morfeusz.setTokenNumbering(getTokenNumbering(tokenNumbering));
}
|
|
259
260
261
262
263
264
265
|
if (opt.isSet("-whitespace-handling")) {
string whitespaceHandling;
opt.get("-whitespace-handling")->getString(whitespaceHandling);
cerr << "setting whitespace handling to " << whitespaceHandling << endl;
morfeusz.setWhitespaceHandling(getWhitespaceHandling(whitespaceHandling));
}
|
|
266
267
|
}
|
|
268
|
#if defined(_WIN64) || defined(_WIN32)
|
|
269
|
morfeusz.setCharset(CP852);
|
|
270
|
#endif
|
|
271
|
}
|
|
272
273
|
}
|