Commit 36a8a25293aa8e61cc03dca638932fc8169b7792
1 parent
53f1a33d
dodanie opcji "debug" dla analizatora i generatora
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@140 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
9 changed files
with
72 additions
and
26 deletions
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -41,9 +41,9 @@ class FSA(object): |
41 | 41 | self.n += 1 |
42 | 42 | |
43 | 43 | # debug |
44 | - if self.n % 10000 == 0: | |
45 | - logging.info(word) | |
46 | - logging.info(str(self.register.getStatesNum())) | |
44 | + if self.n % 100000 == 0: | |
45 | + logging.info(u'%d %s' % (self.n, word)) | |
46 | +# logging.info(str(self.register.getStatesNum())) | |
47 | 47 | # allWords.append(word) |
48 | 48 | for label in encodedWord: |
49 | 49 | self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
... | ... | @@ -68,6 +68,6 @@ class RulesFSA(object): |
68 | 68 | res.extend(self.stateData2bytearray(state)) |
69 | 69 | res.extend(self.transitionsData2bytearray(state)) |
70 | 70 | |
71 | - logging.info('Segmentation automaton size: %d bytes', len(res)) | |
72 | - print list(res) | |
71 | +# logging.info('Segmentation automaton size: %d bytes', len(res)) | |
72 | +# print list(res) | |
73 | 73 | return res |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -5,6 +5,7 @@ Created on 17 lut 2014 |
5 | 5 | ''' |
6 | 6 | import re |
7 | 7 | import logging |
8 | +import sys | |
8 | 9 | from morfeuszbuilder.utils import exceptions |
9 | 10 | |
10 | 11 | def _cutHomonymFromLemma(lemma): |
... | ... | @@ -33,10 +34,12 @@ class Segtypes(object): |
33 | 34 | self._readTags(segrulesConfigFile) |
34 | 35 | self._indexSegnums() |
35 | 36 | |
36 | - print self._lemmaTagnum2Segnum | |
37 | - print self._tagnum2Segnum | |
38 | - | |
39 | - print self.segnum2Segtype | |
37 | +# print self._lemmaTagnum2Segnum | |
38 | +# print self._tagnum2Segnum | |
39 | + logging.info('segment number -> segment type') | |
40 | + logging.info('------------------------------') | |
41 | + logging.info(str(self.segnum2Segtype)) | |
42 | + logging.info('------------------------------') | |
40 | 43 | |
41 | 44 | # self._debugSegnums() |
42 | 45 | |
... | ... |
morfeusz/InterpretedChunk.hpp
morfeusz/InterpsGroup.hpp
... | ... | @@ -15,25 +15,9 @@ |
15 | 15 | #include "Tagset.hpp" |
16 | 16 | |
17 | 17 | struct InterpsGroup { |
18 | -//public: | |
19 | -// | |
20 | -// InterpsGroup() { | |
21 | -// | |
22 | -// } | |
23 | -// | |
24 | -// explicit InterpsGroup(const unsigned char type) | |
25 | -// : type(type) { | |
26 | -// | |
27 | -// } | |
28 | -// | |
29 | -// void addInterpretation(const EncodedInterpretation& interp) { | |
30 | -// interps.push_back(interp); | |
31 | -// } | |
32 | - | |
33 | 18 | unsigned char type; |
34 | 19 | uint16_t size; |
35 | 20 | const unsigned char* ptr; |
36 | -// std::vector<EncodedInterpretation> interps; | |
37 | 21 | }; |
38 | 22 | |
39 | 23 | #endif /* GROUPEDINTERPRETATIONS_HPP */ |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() { |
28 | 28 | MorfeuszOptions res; |
29 | 29 | res.caseSensitive = true; |
30 | 30 | res.encoding = UTF8; |
31 | + res.debug = false; | |
31 | 32 | return res; |
32 | 33 | } |
33 | 34 | |
... | ... | @@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
102 | 103 | to.chunkStartPtr = from.chunkStartPtr; |
103 | 104 | } |
104 | 105 | |
106 | +static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { | |
107 | + stringstream res; | |
108 | + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; | |
109 | + return res.str(); | |
110 | +} | |
111 | + | |
112 | +static inline string debugAccum(vector<InterpretedChunk>& accum) { | |
113 | + stringstream res; | |
114 | + for (unsigned int i = 0; i < accum.size(); i++) { | |
115 | + res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); | |
116 | +// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; | |
117 | + } | |
118 | + return res.str(); | |
119 | +} | |
120 | + | |
105 | 121 | void Morfeusz::doProcessOneWord( |
106 | 122 | const Environment& env, |
107 | 123 | const char*& inputData, |
... | ... | @@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord( |
109 | 125 | SegrulesState segrulesState, |
110 | 126 | vector<InterpretedChunk>& accum, |
111 | 127 | InflexionGraph& graph) const { |
128 | +// if (this->options.debug) { | |
129 | +// cerr << "----------" << endl; | |
130 | +// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; | |
131 | +// } | |
112 | 132 | // cerr << "doAnalyzeOneWord " << inputData << endl; |
133 | + const char* inputStart = inputData; | |
113 | 134 | const char* currInput = inputData; |
114 | 135 | uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd); |
115 | 136 | vector<uint32_t> originalCodepoints; |
... | ... | @@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord( |
139 | 160 | vector<InterpsGroup> val(state.getValue()); |
140 | 161 | for (unsigned int i = 0; i < val.size(); i++) { |
141 | 162 | InterpsGroup& ig = val[i]; |
163 | + if (this->options.debug) { | |
164 | + cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; | |
165 | + } | |
142 | 166 | // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; |
143 | 167 | set<SegrulesState> newSegrulesStates; |
144 | 168 | env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); |
169 | + if (this->options.debug && newSegrulesStates.empty()) { | |
170 | + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | |
171 | + } | |
145 | 172 | // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; |
146 | 173 | for ( |
147 | 174 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); |
... | ... | @@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord( |
149 | 176 | ++it) { |
150 | 177 | SegrulesState newSegrulesState = *it; |
151 | 178 | InterpretedChunk ic = { |
152 | - inputData, | |
179 | + inputStart, | |
180 | + currInput, | |
153 | 181 | originalCodepoints, |
154 | 182 | normalizedCodepoints, |
155 | 183 | ig, |
... | ... | @@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord( |
164 | 192 | accum.push_back(ic); |
165 | 193 | if (isEndOfWord(codepoint) |
166 | 194 | && newSegrulesState.accepting) { |
195 | + if (this->options.debug) { | |
196 | + cerr << "ACCEPTING " << debugAccum(accum) << endl; | |
197 | + } | |
167 | 198 | graph.addPath(accum, newSegrulesState.weak); |
168 | 199 | } |
169 | 200 | else if (!isEndOfWord(codepoint)) { |
... | ... | @@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) { |
255 | 286 | this->generatorEnv.setSegrulesOption("praet", praet); |
256 | 287 | } |
257 | 288 | |
289 | +void Morfeusz::setDebug(bool debug) { | |
290 | + this->options.debug = debug; | |
291 | +} | |
292 | + | |
258 | 293 | ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) { |
259 | 294 | resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); |
260 | 295 | } |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -139,6 +139,13 @@ public: |
139 | 139 | * @param praet |
140 | 140 | */ |
141 | 141 | void setPraet(const std::string& praet); |
142 | + | |
143 | + /** | |
144 | + * Set debug option value. | |
145 | + * | |
145 | + * @param debug true to enable debug output | |
147 | + */ | |
148 | + void setDebug(bool debug); | |
142 | 149 | |
143 | 150 | friend class ResultsIterator; |
144 | 151 | private: |
... | ... |
morfeusz/MorfeuszOptions.hpp
morfeusz/cli/cli.cpp
... | ... | @@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText) |
65 | 65 | "-praet", // Flag token. |
66 | 66 | "--praet" // Flag token. |
67 | 67 | ); |
68 | + | |
69 | + opt.add( | |
70 | + "", // Default. | |
71 | + 0, // Required? | |
72 | + 0, // Number of args expected. | |
73 | + 0, // Delimiter if expecting multiple args. | |
74 | + "praet option.", // Help description. | |
75 | + "-d", // Flag token. | |
76 | + "-debug", // Flag token. | |
77 | + "--debug" // Flag token. | |
78 | + ); | |
68 | 79 | |
69 | 80 | opt.parse(argc, argv); |
70 | 81 | |
... | ... | @@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { |
105 | 116 | cerr << "setting praet option to " << praet << endl; |
106 | 117 | morfeusz.setPraet(praet); |
107 | 118 | } |
119 | + if (opt.isSet("-d")) { | |
120 | + cerr << "setting debug to TRUE" << endl; | |
121 | + morfeusz.setDebug(true); | |
122 | + } | |
108 | 123 | #ifdef _WIN32 |
109 | 124 | morfeusz.setCharset(CP852); |
110 | 125 | #endif |
... | ... |