Commit c46767982cfc2f92a989cae084e4859d9f13854e

Authored by Michał Lenart
1 parent 3c3e1cf2

naprawa obsługi nieutfowych kodowań

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@206 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
morfeusz/Morfeusz.cpp
... ... @@ -77,14 +77,14 @@ static void feedStateIndirectly(
77 77 }
78 78  
79 79 static void feedState(
80   - const FSAType& fsa,
  80 + const Environment& env,
81 81 StateType& state,
82 82 TextReader& reader) {
83   - if (reader.peek() == reader.normalizedPeek()) {
84   - feedStateDirectly(fsa, state, reader.getCurrPtr(), reader.getNextPtr());
  83 + if (reader.peek() == reader.normalizedPeek() && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
  84 + feedStateDirectly(env.getFSA(), state, reader.getCurrPtr(), reader.getNextPtr());
85 85 }
86 86 else {
87   - feedStateIndirectly(fsa, state, reader.normalizedPeek());
  87 + feedStateIndirectly(env.getFSA(), state, reader.normalizedPeek());
88 88 }
89 89 }
90 90  
... ... @@ -190,7 +190,7 @@ void Morfeusz::doProcessOneWord(
190 190 reader.proceedToEnd();
191 191 }
192 192 else {
193   - feedState(env.getFSA(), state, reader);
  193 + feedState(env, state, reader);
194 194 reader.next();
195 195 }
196 196 if (state.isAccepting()) {
... ...
morfeusz/cli/cli.cpp
... ... @@ -69,6 +69,17 @@ ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType pr
69 69 "--praet" // Flag token.
70 70 );
71 71  
  72 + opt.add(
  73 + "", // Default.
  74 + 0, // Required?
  75 + 1, // Number of args expected.
  76 + 0, // Delimiter if expecting multiple args.
  77 + "input/output charset", // Help description.
  78 + "-c", // Flag token.
  79 + "-charset", // Flag token.
  80 + "--charset" // Flag token.
  81 + );
  82 +
72 83 if (processorType == ANALYZER) {
73 84 opt.add(
74 85 "", // Default.
... ... @@ -112,6 +123,21 @@ ezOptionParser* getOptions(int argc, const char** argv, MorfeuszProcessorType pr
112 123 return &opt;
113 124 }
114 125  
  126 +static MorfeuszCharset getCharset(const string& encodingStr) {
  127 + if (encodingStr == "UTF8")
  128 + return UTF8;
  129 + else if (encodingStr == "ISO8859_2")
  130 + return ISO8859_2;
  131 + else if (encodingStr == "CP1250")
  132 + return CP1250;
  133 + else if (encodingStr == "CP852")
  134 + return CP852;
  135 + else {
  136 + cerr << "Invalid encoding: '" << encodingStr << "'. Must be one of: UTF8, ISO8859_2, WINDOWS1250" << endl;
  137 + throw "Invalid encoding";
  138 + }
  139 +}
  140 +
115 141 void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
116 142 if (opt.isSet("-i")) {
117 143 string analyzerFile;
... ... @@ -139,6 +165,12 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
139 165 cerr << "setting case sensitive to FALSE" << endl;
140 166 morfeusz.setCaseSensitive(false);
141 167 }
  168 + if (opt.isSet("-c")) {
  169 + string charset;
  170 + opt.get("-c")->getString(charset);
  171 + cerr << "setting charset to " << charset << endl;
  172 + morfeusz.setCharset(getCharset(charset));
  173 + }
142 174 #if defined(_WIN64) || defined(_WIN32)
143 175 morfeusz.setCharset(CP852);
144 176 #endif
... ...