Commit 36a8a25293aa8e61cc03dca638932fc8169b7792

Authored by Michał Lenart
1 parent 53f1a33d

dodanie opcji "debug" dla analizatora i generatora

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@140 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -41,9 +41,9 @@ class FSA(object): @@ -41,9 +41,9 @@ class FSA(object):
41 self.n += 1 41 self.n += 1
42 42
43 # debug 43 # debug
44 - if self.n % 10000 == 0:  
45 - logging.info(word)  
46 - logging.info(str(self.register.getStatesNum())) 44 + if self.n % 100000 == 0:
  45 + logging.info(u'%d %s' % (self.n, word))
  46 +# logging.info(str(self.register.getStatesNum()))
47 # allWords.append(word) 47 # allWords.append(word)
48 for label in encodedWord: 48 for label in encodedWord:
49 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 49 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,6 @@ class RulesFSA(object): @@ -68,6 +68,6 @@ class RulesFSA(object):
68 res.extend(self.stateData2bytearray(state)) 68 res.extend(self.stateData2bytearray(state))
69 res.extend(self.transitionsData2bytearray(state)) 69 res.extend(self.transitionsData2bytearray(state))
70 70
71 - logging.info('Segmentation automaton size: %d bytes', len(res))  
72 - print list(res) 71 +# logging.info('Segmentation automaton size: %d bytes', len(res))
  72 +# print list(res)
73 return res 73 return res
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -5,6 +5,7 @@ Created on 17 lut 2014 @@ -5,6 +5,7 @@ Created on 17 lut 2014
5 ''' 5 '''
6 import re 6 import re
7 import logging 7 import logging
  8 +import sys
8 from morfeuszbuilder.utils import exceptions 9 from morfeuszbuilder.utils import exceptions
9 10
10 def _cutHomonymFromLemma(lemma): 11 def _cutHomonymFromLemma(lemma):
@@ -33,10 +34,12 @@ class Segtypes(object): @@ -33,10 +34,12 @@ class Segtypes(object):
33 self._readTags(segrulesConfigFile) 34 self._readTags(segrulesConfigFile)
34 self._indexSegnums() 35 self._indexSegnums()
35 36
36 - print self._lemmaTagnum2Segnum  
37 - print self._tagnum2Segnum  
38 -  
39 - print self.segnum2Segtype 37 +# print self._lemmaTagnum2Segnum
  38 +# print self._tagnum2Segnum
  39 + logging.info('segment number -> segment type')
  40 + logging.info('------------------------------')
  41 + logging.info(str(self.segnum2Segtype))
  42 + logging.info('------------------------------')
40 43
41 # self._debugSegnums() 44 # self._debugSegnums()
42 45
morfeusz/InterpretedChunk.hpp
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 13
14 struct InterpretedChunk { 14 struct InterpretedChunk {
15 const char* chunkStartPtr; 15 const char* chunkStartPtr;
  16 + const char* chunkEndPtr;
16 std::vector<uint32_t> originalCodepoints; 17 std::vector<uint32_t> originalCodepoints;
17 std::vector<uint32_t> lowercaseCodepoints; 18 std::vector<uint32_t> lowercaseCodepoints;
18 InterpsGroup interpsGroup; 19 InterpsGroup interpsGroup;
morfeusz/InterpsGroup.hpp
@@ -15,25 +15,9 @@ @@ -15,25 +15,9 @@
15 #include "Tagset.hpp" 15 #include "Tagset.hpp"
16 16
17 struct InterpsGroup { 17 struct InterpsGroup {
18 -//public:  
19 -//  
20 -// InterpsGroup() {  
21 -//  
22 -// }  
23 -//  
24 -// explicit InterpsGroup(const unsigned char type)  
25 -// : type(type) {  
26 -//  
27 -// }  
28 -//  
29 -// void addInterpretation(const EncodedInterpretation& interp) {  
30 -// interps.push_back(interp);  
31 -// }  
32 -  
33 unsigned char type; 18 unsigned char type;
34 uint16_t size; 19 uint16_t size;
35 const unsigned char* ptr; 20 const unsigned char* ptr;
36 -// std::vector<EncodedInterpretation> interps;  
37 }; 21 };
38 22
39 #endif /* GROUPEDINTERPRETATIONS_HPP */ 23 #endif /* GROUPEDINTERPRETATIONS_HPP */
morfeusz/Morfeusz.cpp
@@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() { @@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() {
28 MorfeuszOptions res; 28 MorfeuszOptions res;
29 res.caseSensitive = true; 29 res.caseSensitive = true;
30 res.encoding = UTF8; 30 res.encoding = UTF8;
  31 + res.debug = false;
31 return res; 32 return res;
32 } 33 }
33 34
@@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { @@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
102 to.chunkStartPtr = from.chunkStartPtr; 103 to.chunkStartPtr = from.chunkStartPtr;
103 } 104 }
104 105
  106 +static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
  107 + stringstream res;
  108 + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";
  109 + return res.str();
  110 +}
  111 +
  112 +static inline string debugAccum(vector<InterpretedChunk>& accum) {
  113 + stringstream res;
  114 + for (unsigned int i = 0; i < accum.size(); i++) {
  115 + res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
  116 +// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
  117 + }
  118 + return res.str();
  119 +}
  120 +
105 void Morfeusz::doProcessOneWord( 121 void Morfeusz::doProcessOneWord(
106 const Environment& env, 122 const Environment& env,
107 const char*& inputData, 123 const char*& inputData,
@@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord( @@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord(
109 SegrulesState segrulesState, 125 SegrulesState segrulesState,
110 vector<InterpretedChunk>& accum, 126 vector<InterpretedChunk>& accum,
111 InflexionGraph& graph) const { 127 InflexionGraph& graph) const {
  128 +// if (this->options.debug) {
  129 +// cerr << "----------" << endl;
  130 +// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  131 +// }
112 // cerr << "doAnalyzeOneWord " << inputData << endl; 132 // cerr << "doAnalyzeOneWord " << inputData << endl;
  133 + const char* inputStart = inputData;
113 const char* currInput = inputData; 134 const char* currInput = inputData;
114 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); 135 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
115 vector<uint32_t> originalCodepoints; 136 vector<uint32_t> originalCodepoints;
@@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord( @@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord(
139 vector<InterpsGroup> val(state.getValue()); 160 vector<InterpsGroup> val(state.getValue());
140 for (unsigned int i = 0; i < val.size(); i++) { 161 for (unsigned int i = 0; i < val.size(); i++) {
141 InterpsGroup& ig = val[i]; 162 InterpsGroup& ig = val[i];
  163 + if (this->options.debug) {
  164 + cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
  165 + }
142 // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; 166 // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
143 set<SegrulesState> newSegrulesStates; 167 set<SegrulesState> newSegrulesStates;
144 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); 168 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
  169 + if (this->options.debug && newSegrulesStates.empty()) {
  170 + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
  171 + }
145 // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; 172 // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
146 for ( 173 for (
147 set<SegrulesState>::iterator it = newSegrulesStates.begin(); 174 set<SegrulesState>::iterator it = newSegrulesStates.begin();
@@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord( @@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord(
149 ++it) { 176 ++it) {
150 SegrulesState newSegrulesState = *it; 177 SegrulesState newSegrulesState = *it;
151 InterpretedChunk ic = { 178 InterpretedChunk ic = {
152 - inputData, 179 + inputStart,
  180 + currInput,
153 originalCodepoints, 181 originalCodepoints,
154 normalizedCodepoints, 182 normalizedCodepoints,
155 ig, 183 ig,
@@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord( @@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord(
164 accum.push_back(ic); 192 accum.push_back(ic);
165 if (isEndOfWord(codepoint) 193 if (isEndOfWord(codepoint)
166 && newSegrulesState.accepting) { 194 && newSegrulesState.accepting) {
  195 + if (this->options.debug) {
  196 + cerr << "ACCEPTING " << debugAccum(accum) << endl;
  197 + }
167 graph.addPath(accum, newSegrulesState.weak); 198 graph.addPath(accum, newSegrulesState.weak);
168 } 199 }
169 else if (!isEndOfWord(codepoint)) { 200 else if (!isEndOfWord(codepoint)) {
@@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) { @@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) {
255 this->generatorEnv.setSegrulesOption("praet", praet); 286 this->generatorEnv.setSegrulesOption("praet", praet);
256 } 287 }
257 288
  289 +void Morfeusz::setDebug(bool debug) {
  290 + this->options.debug = debug;
  291 +}
  292 +
258 ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) { 293 ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) {
259 resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); 294 resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end());
260 } 295 }
morfeusz/Morfeusz.hpp
@@ -139,6 +139,13 @@ public: @@ -139,6 +139,13 @@ public:
139 * @param praet 139 * @param praet
140 */ 140 */
141 void setPraet(const std::string& praet); 141 void setPraet(const std::string& praet);
  142 +
  143 + /**
  144 + * Set debug option value.
  145 + *
  146 + * @param debug
  147 + */
  148 + void setDebug(bool debug);
142 149
143 friend class ResultsIterator; 150 friend class ResultsIterator;
144 private: 151 private:
morfeusz/MorfeuszOptions.hpp
@@ -13,6 +13,7 @@ @@ -13,6 +13,7 @@
13 struct MorfeuszOptions { 13 struct MorfeuszOptions {
14 bool caseSensitive; 14 bool caseSensitive;
15 MorfeuszCharset encoding; 15 MorfeuszCharset encoding;
  16 + bool debug;
16 }; 17 };
17 18
18 #endif /* MORFEUSZOPTIONS_HPP */ 19 #endif /* MORFEUSZOPTIONS_HPP */
morfeusz/cli/cli.cpp
@@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText) @@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText)
65 "-praet", // Flag token. 65 "-praet", // Flag token.
66 "--praet" // Flag token. 66 "--praet" // Flag token.
67 ); 67 );
  68 +
  69 + opt.add(
  70 + "", // Default.
  71 + 0, // Required?
  72 + 0, // Number of args expected.
  73 + 0, // Delimiter if expecting multiple args.
  74 + "debug option.", // Help description.
  75 + "-d", // Flag token.
  76 + "-debug", // Flag token.
  77 + "--debug" // Flag token.
  78 + );
68 79
69 opt.parse(argc, argv); 80 opt.parse(argc, argv);
70 81
@@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { @@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
105 cerr << "setting praet option to " << praet << endl; 116 cerr << "setting praet option to " << praet << endl;
106 morfeusz.setPraet(praet); 117 morfeusz.setPraet(praet);
107 } 118 }
  119 + if (opt.isSet("-d")) {
  120 + cerr << "setting debug to TRUE" << endl;
  121 + morfeusz.setDebug(true);
  122 + }
108 #ifdef _WIN32 123 #ifdef _WIN32
109 morfeusz.setCharset(CP852); 124 morfeusz.setCharset(CP852);
110 #endif 125 #endif