Commit 36a8a25293aa8e61cc03dca638932fc8169b7792
1 parent
53f1a33d
dodanie opcji "debug" dla analizatora i generatora
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@140 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
9 changed files
with
72 additions
and
26 deletions
fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -41,9 +41,9 @@ class FSA(object): | @@ -41,9 +41,9 @@ class FSA(object): | ||
41 | self.n += 1 | 41 | self.n += 1 |
42 | 42 | ||
43 | # debug | 43 | # debug |
44 | - if self.n % 10000 == 0: | ||
45 | - logging.info(word) | ||
46 | - logging.info(str(self.register.getStatesNum())) | 44 | + if self.n % 100000 == 0: |
45 | + logging.info(u'%d %s' % (self.n, word)) | ||
46 | +# logging.info(str(self.register.getStatesNum())) | ||
47 | # allWords.append(word) | 47 | # allWords.append(word) |
48 | for label in encodedWord: | 48 | for label in encodedWord: |
49 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 | 49 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,6 @@ class RulesFSA(object): | @@ -68,6 +68,6 @@ class RulesFSA(object): | ||
68 | res.extend(self.stateData2bytearray(state)) | 68 | res.extend(self.stateData2bytearray(state)) |
69 | res.extend(self.transitionsData2bytearray(state)) | 69 | res.extend(self.transitionsData2bytearray(state)) |
70 | 70 | ||
71 | - logging.info('Segmentation automaton size: %d bytes', len(res)) | ||
72 | - print list(res) | 71 | +# logging.info('Segmentation automaton size: %d bytes', len(res)) |
72 | +# print list(res) | ||
73 | return res | 73 | return res |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -5,6 +5,7 @@ Created on 17 lut 2014 | @@ -5,6 +5,7 @@ Created on 17 lut 2014 | ||
5 | ''' | 5 | ''' |
6 | import re | 6 | import re |
7 | import logging | 7 | import logging |
8 | +import sys | ||
8 | from morfeuszbuilder.utils import exceptions | 9 | from morfeuszbuilder.utils import exceptions |
9 | 10 | ||
10 | def _cutHomonymFromLemma(lemma): | 11 | def _cutHomonymFromLemma(lemma): |
@@ -33,10 +34,12 @@ class Segtypes(object): | @@ -33,10 +34,12 @@ class Segtypes(object): | ||
33 | self._readTags(segrulesConfigFile) | 34 | self._readTags(segrulesConfigFile) |
34 | self._indexSegnums() | 35 | self._indexSegnums() |
35 | 36 | ||
36 | - print self._lemmaTagnum2Segnum | ||
37 | - print self._tagnum2Segnum | ||
38 | - | ||
39 | - print self.segnum2Segtype | 37 | +# print self._lemmaTagnum2Segnum |
38 | +# print self._tagnum2Segnum | ||
39 | + logging.info('segment number -> segment type') | ||
40 | + logging.info('------------------------------') | ||
41 | + logging.info(str(self.segnum2Segtype)) | ||
42 | + logging.info('------------------------------') | ||
40 | 43 | ||
41 | # self._debugSegnums() | 44 | # self._debugSegnums() |
42 | 45 |
morfeusz/InterpretedChunk.hpp
@@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
13 | 13 | ||
14 | struct InterpretedChunk { | 14 | struct InterpretedChunk { |
15 | const char* chunkStartPtr; | 15 | const char* chunkStartPtr; |
16 | + const char* chunkEndPtr; | ||
16 | std::vector<uint32_t> originalCodepoints; | 17 | std::vector<uint32_t> originalCodepoints; |
17 | std::vector<uint32_t> lowercaseCodepoints; | 18 | std::vector<uint32_t> lowercaseCodepoints; |
18 | InterpsGroup interpsGroup; | 19 | InterpsGroup interpsGroup; |
morfeusz/InterpsGroup.hpp
@@ -15,25 +15,9 @@ | @@ -15,25 +15,9 @@ | ||
15 | #include "Tagset.hpp" | 15 | #include "Tagset.hpp" |
16 | 16 | ||
17 | struct InterpsGroup { | 17 | struct InterpsGroup { |
18 | -//public: | ||
19 | -// | ||
20 | -// InterpsGroup() { | ||
21 | -// | ||
22 | -// } | ||
23 | -// | ||
24 | -// explicit InterpsGroup(const unsigned char type) | ||
25 | -// : type(type) { | ||
26 | -// | ||
27 | -// } | ||
28 | -// | ||
29 | -// void addInterpretation(const EncodedInterpretation& interp) { | ||
30 | -// interps.push_back(interp); | ||
31 | -// } | ||
32 | - | ||
33 | unsigned char type; | 18 | unsigned char type; |
34 | uint16_t size; | 19 | uint16_t size; |
35 | const unsigned char* ptr; | 20 | const unsigned char* ptr; |
36 | -// std::vector<EncodedInterpretation> interps; | ||
37 | }; | 21 | }; |
38 | 22 | ||
39 | #endif /* GROUPEDINTERPRETATIONS_HPP */ | 23 | #endif /* GROUPEDINTERPRETATIONS_HPP */ |
morfeusz/Morfeusz.cpp
@@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() { | @@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() { | ||
28 | MorfeuszOptions res; | 28 | MorfeuszOptions res; |
29 | res.caseSensitive = true; | 29 | res.caseSensitive = true; |
30 | res.encoding = UTF8; | 30 | res.encoding = UTF8; |
31 | + res.debug = false; | ||
31 | return res; | 32 | return res; |
32 | } | 33 | } |
33 | 34 | ||
@@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | @@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { | ||
102 | to.chunkStartPtr = from.chunkStartPtr; | 103 | to.chunkStartPtr = from.chunkStartPtr; |
103 | } | 104 | } |
104 | 105 | ||
106 | +static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | ||
107 | + stringstream res; | ||
108 | + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | ||
109 | + return res.str(); | ||
110 | +} | ||
111 | + | ||
112 | +static inline string debugAccum(vector<InterpretedChunk>& accum) { | ||
113 | + stringstream res; | ||
114 | + for (unsigned int i = 0; i < accum.size(); i++) { | ||
115 | + res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); | ||
116 | +// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | ||
117 | + } | ||
118 | + return res.str(); | ||
119 | +} | ||
120 | + | ||
105 | void Morfeusz::doProcessOneWord( | 121 | void Morfeusz::doProcessOneWord( |
106 | const Environment& env, | 122 | const Environment& env, |
107 | const char*& inputData, | 123 | const char*& inputData, |
@@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord( | @@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord( | ||
109 | SegrulesState segrulesState, | 125 | SegrulesState segrulesState, |
110 | vector<InterpretedChunk>& accum, | 126 | vector<InterpretedChunk>& accum, |
111 | InflexionGraph& graph) const { | 127 | InflexionGraph& graph) const { |
128 | +// if (this->options.debug) { | ||
129 | +// cerr << "----------" << endl; | ||
130 | +// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | ||
131 | +// } | ||
112 | // cerr << "doAnalyzeOneWord " << inputData << endl; | 132 | // cerr << "doAnalyzeOneWord " << inputData << endl; |
133 | + const char* inputStart = inputData; | ||
113 | const char* currInput = inputData; | 134 | const char* currInput = inputData; |
114 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); | 135 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
115 | vector<uint32_t> originalCodepoints; | 136 | vector<uint32_t> originalCodepoints; |
@@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord( | @@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord( | ||
139 | vector<InterpsGroup> val(state.getValue()); | 160 | vector<InterpsGroup> val(state.getValue()); |
140 | for (unsigned int i = 0; i < val.size(); i++) { | 161 | for (unsigned int i = 0; i < val.size(); i++) { |
141 | InterpsGroup& ig = val[i]; | 162 | InterpsGroup& ig = val[i]; |
163 | + if (this->options.debug) { | ||
164 | + cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; | ||
165 | + } | ||
142 | // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | 166 | // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; |
143 | set<SegrulesState> newSegrulesStates; | 167 | set<SegrulesState> newSegrulesStates; |
144 | env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | 168 | env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); |
169 | + if (this->options.debug && newSegrulesStates.empty()) { | ||
170 | + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | ||
171 | + } | ||
145 | // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; | 172 | // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; |
146 | for ( | 173 | for ( |
147 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); | 174 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); |
@@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord( | @@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord( | ||
149 | ++it) { | 176 | ++it) { |
150 | SegrulesState newSegrulesState = *it; | 177 | SegrulesState newSegrulesState = *it; |
151 | InterpretedChunk ic = { | 178 | InterpretedChunk ic = { |
152 | - inputData, | 179 | + inputStart, |
180 | + currInput, | ||
153 | originalCodepoints, | 181 | originalCodepoints, |
154 | normalizedCodepoints, | 182 | normalizedCodepoints, |
155 | ig, | 183 | ig, |
@@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord( | @@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord( | ||
164 | accum.push_back(ic); | 192 | accum.push_back(ic); |
165 | if (isEndOfWord(codepoint) | 193 | if (isEndOfWord(codepoint) |
166 | && newSegrulesState.accepting) { | 194 | && newSegrulesState.accepting) { |
195 | + if (this->options.debug) { | ||
196 | + cerr << "ACCEPTING " << debugAccum(accum) << endl; | ||
197 | + } | ||
167 | graph.addPath(accum, newSegrulesState.weak); | 198 | graph.addPath(accum, newSegrulesState.weak); |
168 | } | 199 | } |
169 | else if (!isEndOfWord(codepoint)) { | 200 | else if (!isEndOfWord(codepoint)) { |
@@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) { | @@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) { | ||
255 | this->generatorEnv.setSegrulesOption("praet", praet); | 286 | this->generatorEnv.setSegrulesOption("praet", praet); |
256 | } | 287 | } |
257 | 288 | ||
289 | +void Morfeusz::setDebug(bool debug) { | ||
290 | + this->options.debug = debug; | ||
291 | +} | ||
292 | + | ||
258 | ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) { | 293 | ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) { |
259 | resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); | 294 | resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); |
260 | } | 295 | } |
morfeusz/Morfeusz.hpp
@@ -139,6 +139,13 @@ public: | @@ -139,6 +139,13 @@ public: | ||
139 | * @param praet | 139 | * @param praet |
140 | */ | 140 | */ |
141 | void setPraet(const std::string& praet); | 141 | void setPraet(const std::string& praet); |
142 | + | ||
143 | + /** | ||
144 | + * Set debug option value. | ||
145 | + * | ||
146 | + * @param debug true to enable diagnostic output on stderr | |
147 | + */ | ||
148 | + void setDebug(bool debug); | ||
142 | 149 | ||
143 | friend class ResultsIterator; | 150 | friend class ResultsIterator; |
144 | private: | 151 | private: |
morfeusz/MorfeuszOptions.hpp
@@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
13 | struct MorfeuszOptions { | 13 | struct MorfeuszOptions { |
14 | bool caseSensitive; | 14 | bool caseSensitive; |
15 | MorfeuszCharset encoding; | 15 | MorfeuszCharset encoding; |
16 | + bool debug; | ||
16 | }; | 17 | }; |
17 | 18 | ||
18 | #endif /* MORFEUSZOPTIONS_HPP */ | 19 | #endif /* MORFEUSZOPTIONS_HPP */ |
morfeusz/cli/cli.cpp
@@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText) | @@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText) | ||
65 | "-praet", // Flag token. | 65 | "-praet", // Flag token. |
66 | "--praet" // Flag token. | 66 | "--praet" // Flag token. |
67 | ); | 67 | ); |
68 | + | ||
69 | + opt.add( | ||
70 | + "", // Default. | ||
71 | + 0, // Required? | ||
72 | + 0, // Number of args expected. | ||
73 | + 0, // Delimiter if expecting multiple args. | ||
74 | "debug option.", // Help description. | |
75 | + "-d", // Flag token. | ||
76 | + "-debug", // Flag token. | ||
77 | + "--debug" // Flag token. | ||
78 | + ); | ||
68 | 79 | ||
69 | opt.parse(argc, argv); | 80 | opt.parse(argc, argv); |
70 | 81 | ||
@@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { | @@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { | ||
105 | cerr << "setting praet option to " << praet << endl; | 116 | cerr << "setting praet option to " << praet << endl; |
106 | morfeusz.setPraet(praet); | 117 | morfeusz.setPraet(praet); |
107 | } | 118 | } |
119 | + if (opt.isSet("-d")) { | ||
120 | + cerr << "setting debug to TRUE" << endl; | ||
121 | + morfeusz.setDebug(true); | ||
122 | + } | ||
108 | #ifdef _WIN32 | 123 | #ifdef _WIN32 |
109 | morfeusz.setCharset(CP852); | 124 | morfeusz.setCharset(CP852); |
110 | #endif | 125 | #endif |