Commit c46767982cfc2f92a989cae084e4859d9f13854e
1 parent: 3c3e1cf2
naprawa obsługi nieutfowych kodowań
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@206 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 2 changed files with 37 additions and 5 deletions
morfeusz/Morfeusz.cpp
... | ... | @@ -77,14 +77,14 @@ static void feedStateIndirectly( |
77 | 77 | } |
78 | 78 | |
79 | 79 | static void feedState( |
80 | - const FSAType& fsa, | |
80 | + const Environment& env, | |
81 | 81 | StateType& state, |
82 | 82 | TextReader& reader) { |
83 | - if (reader.peek() == reader.normalizedPeek()) { | |
84 | - feedStateDirectly(fsa, state, reader.getCurrPtr(), reader.getNextPtr()); | |
83 | + if (reader.peek() == reader.normalizedPeek() && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { | |
84 | + feedStateDirectly(env.getFSA(), state, reader.getCurrPtr(), reader.getNextPtr()); | |
85 | 85 | } |
86 | 86 | else { |
87 | - feedStateIndirectly(fsa, state, reader.normalizedPeek()); | |
87 | + feedStateIndirectly(env.getFSA(), state, reader.normalizedPeek()); | |
88 | 88 | } |
89 | 89 | } |
90 | 90 | |
... | ... | @@ -190,7 +190,7 @@ void Morfeusz::doProcessOneWord( |
190 | 190 | reader.proceedToEnd(); |
191 | 191 | } |
192 | 192 | else { |
193 | - feedState(env.getFSA(), state, reader); | |
193 | + feedState(env, state, reader); | |
194 | 194 | reader.next(); |
195 | 195 | } |
196 | 196 | if (state.isAccepting()) { |
... | ... |
morfeusz/cli/cli.cpp
... | ... | @@ -69,6 +69,17 @@ ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType pr |
69 | 69 | "--praet" // Flag token. |
70 | 70 | ); |
71 | 71 | |
72 | + opt.add( | |
73 | + "", // Default. | |
74 | + 0, // Required? | |
75 | + 1, // Number of args expected. | |
76 | + 0, // Delimiter if expecting multiple args. | |
77 | + "input/output charset", // Help description. | |
78 | + "-c", // Flag token. | |
79 | + "-charset", // Flag token. | |
80 | + "--charset" // Flag token. | |
81 | + ); | |
82 | + | |
72 | 83 | if (processorType == ANALYZER) { |
73 | 84 | opt.add( |
74 | 85 | "", // Default. |
... | ... | @@ -112,6 +123,21 @@ ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType pr |
112 | 123 | return &opt; |
113 | 124 | } |
114 | 125 | |
126 | +static MorfeuszCharset getCharset(const string& encodingStr) { | |
127 | + if (encodingStr == "UTF8") | |
128 | + return UTF8; | |
129 | + else if (encodingStr == "ISO8859_2") | |
130 | + return ISO8859_2; | |
131 | + else if (encodingStr == "CP1250") | |
132 | + return CP1250; | |
133 | + else if (encodingStr == "CP852") | |
134 | + return CP852; | |
135 | + else { | |
136 | + cerr << "Invalid encoding: '" << encodingStr << "'. Must be one of: UTF8, ISO8859_2, WINDOWS1250" << endl; | |
137 | + throw "Invalid encoding"; | |
138 | + } | |
139 | +} | |
140 | + | |
115 | 141 | void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { |
116 | 142 | if (opt.isSet("-i")) { |
117 | 143 | string analyzerFile; |
... | ... | @@ -139,6 +165,12 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { |
139 | 165 | cerr << "setting case sensitive to FALSE" << endl; |
140 | 166 | morfeusz.setCaseSensitive(false); |
141 | 167 | } |
168 | + if (opt.isSet("-c")) { | |
169 | + string charset; | |
170 | + opt.get("-c")->getString(charset); | |
171 | + cerr << "setting charset to " << charset << endl; | |
172 | + morfeusz.setCharset(getCharset(charset)); | |
173 | + } | |
142 | 174 | #if defined(_WIN64) || defined(_WIN32) |
143 | 175 | morfeusz.setCharset(CP852); |
144 | 176 | #endif |
... | ... |