Commit 36a8a25293aa8e61cc03dca638932fc8169b7792

Authored by Michał Lenart
1 parent 53f1a33d

dodanie opcji "debug" dla analizatora i generatora

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@140 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/morfeuszbuilder/fsa/fsa.py
... ... @@ -41,9 +41,9 @@ class FSA(object):
41 41 self.n += 1
42 42  
43 43 # debug
44   - if self.n % 10000 == 0:
45   - logging.info(word)
46   - logging.info(str(self.register.getStatesNum()))
  44 + if self.n % 100000 == 0:
  45 + logging.info(u'%d %s' % (self.n, word))
  46 +# logging.info(str(self.register.getStatesNum()))
47 47 # allWords.append(word)
48 48 for label in encodedWord:
49 49 self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
... ...
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
... ... @@ -68,6 +68,6 @@ class RulesFSA(object):
68 68 res.extend(self.stateData2bytearray(state))
69 69 res.extend(self.transitionsData2bytearray(state))
70 70  
71   - logging.info('Segmentation automaton size: %d bytes', len(res))
72   - print list(res)
  71 +# logging.info('Segmentation automaton size: %d bytes', len(res))
  72 +# print list(res)
73 73 return res
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -5,6 +5,7 @@ Created on 17 lut 2014
5 5 '''
6 6 import re
7 7 import logging
  8 +import sys
8 9 from morfeuszbuilder.utils import exceptions
9 10  
10 11 def _cutHomonymFromLemma(lemma):
... ... @@ -33,10 +34,12 @@ class Segtypes(object):
33 34 self._readTags(segrulesConfigFile)
34 35 self._indexSegnums()
35 36  
36   - print self._lemmaTagnum2Segnum
37   - print self._tagnum2Segnum
38   -
39   - print self.segnum2Segtype
  37 +# print self._lemmaTagnum2Segnum
  38 +# print self._tagnum2Segnum
  39 + logging.info('segment number -> segment type')
  40 + logging.info('------------------------------')
  41 + logging.info(str(self.segnum2Segtype))
  42 + logging.info('------------------------------')
40 43  
41 44 # self._debugSegnums()
42 45  
... ...
morfeusz/InterpretedChunk.hpp
... ... @@ -13,6 +13,7 @@
13 13  
14 14 struct InterpretedChunk {
15 15 const char* chunkStartPtr;
  16 + const char* chunkEndPtr;
16 17 std::vector<uint32_t> originalCodepoints;
17 18 std::vector<uint32_t> lowercaseCodepoints;
18 19 InterpsGroup interpsGroup;
... ...
morfeusz/InterpsGroup.hpp
... ... @@ -15,25 +15,9 @@
15 15 #include "Tagset.hpp"
16 16  
17 17 struct InterpsGroup {
18   -//public:
19   -//
20   -// InterpsGroup() {
21   -//
22   -// }
23   -//
24   -// explicit InterpsGroup(const unsigned char type)
25   -// : type(type) {
26   -//
27   -// }
28   -//
29   -// void addInterpretation(const EncodedInterpretation& interp) {
30   -// interps.push_back(interp);
31   -// }
32   -
33 18 unsigned char type;
34 19 uint16_t size;
35 20 const unsigned char* ptr;
36   -// std::vector<EncodedInterpretation> interps;
37 21 };
38 22  
39 23 #endif /* GROUPEDINTERPRETATIONS_HPP */
... ...
morfeusz/Morfeusz.cpp
... ... @@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() {
28 28 MorfeuszOptions res;
29 29 res.caseSensitive = true;
30 30 res.encoding = UTF8;
  31 + res.debug = false;
31 32 return res;
32 33 }
33 34  
... ... @@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
102 103 to.chunkStartPtr = from.chunkStartPtr;
103 104 }
104 105  
  106 +static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
  107 + stringstream res;
  108 + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), ";
  109 + return res.str();
  110 +}
  111 +
  112 +static inline string debugAccum(vector<InterpretedChunk>& accum) {
  113 + stringstream res;
  114 + for (unsigned int i = 0; i < accum.size(); i++) {
  115 + res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
  116 +// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), ";
  117 + }
  118 + return res.str();
  119 +}
  120 +
105 121 void Morfeusz::doProcessOneWord(
106 122 const Environment& env,
107 123 const char*& inputData,
... ... @@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord(
109 125 SegrulesState segrulesState,
110 126 vector<InterpretedChunk>& accum,
111 127 InflexionGraph& graph) const {
  128 +// if (this->options.debug) {
  129 +// cerr << "----------" << endl;
  130 +// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
  131 +// }
112 132 // cerr << "doAnalyzeOneWord " << inputData << endl;
  133 + const char* inputStart = inputData;
113 134 const char* currInput = inputData;
114 135 uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
115 136 vector<uint32_t> originalCodepoints;
... ... @@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord(
139 160 vector<InterpsGroup> val(state.getValue());
140 161 for (unsigned int i = 0; i < val.size(); i++) {
141 162 InterpsGroup& ig = val[i];
  163 + if (this->options.debug) {
  164 + cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
  165 + }
142 166 // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
143 167 set<SegrulesState> newSegrulesStates;
144 168 env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
  169 + if (this->options.debug && newSegrulesStates.empty()) {
  170 + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
  171 + }
145 172 // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
146 173 for (
147 174 set<SegrulesState>::iterator it = newSegrulesStates.begin();
... ... @@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord(
149 176 ++it) {
150 177 SegrulesState newSegrulesState = *it;
151 178 InterpretedChunk ic = {
152   - inputData,
  179 + inputStart,
  180 + currInput,
153 181 originalCodepoints,
154 182 normalizedCodepoints,
155 183 ig,
... ... @@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord(
164 192 accum.push_back(ic);
165 193 if (isEndOfWord(codepoint)
166 194 && newSegrulesState.accepting) {
  195 + if (this->options.debug) {
  196 + cerr << "ACCEPTING " << debugAccum(accum) << endl;
  197 + }
167 198 graph.addPath(accum, newSegrulesState.weak);
168 199 }
169 200 else if (!isEndOfWord(codepoint)) {
... ... @@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) {
255 286 this->generatorEnv.setSegrulesOption("praet", praet);
256 287 }
257 288  
  289 +void Morfeusz::setDebug(bool debug) {
  290 + this->options.debug = debug;
  291 +}
  292 +
258 293 ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) {
259 294 resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end());
260 295 }
... ...
morfeusz/Morfeusz.hpp
... ... @@ -139,6 +139,13 @@ public:
139 139 * @param praet
140 140 */
141 141 void setPraet(const std::string& praet);
  142 +
  143 + /**
  144 + * Set debug option value.
  145 + *
  146 + * @param debug new value of the debug option
  147 + */
  148 + void setDebug(bool debug);
142 149  
143 150 friend class ResultsIterator;
144 151 private:
... ...
morfeusz/MorfeuszOptions.hpp
... ... @@ -13,6 +13,7 @@
13 13 struct MorfeuszOptions {
14 14 bool caseSensitive;
15 15 MorfeuszCharset encoding;
  16 + bool debug;
16 17 };
17 18  
18 19 #endif /* MORFEUSZOPTIONS_HPP */
... ...
morfeusz/cli/cli.cpp
... ... @@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText)
65 65 "-praet", // Flag token.
66 66 "--praet" // Flag token.
67 67 );
  68 +
  69 + opt.add(
  70 + "", // Default.
  71 + 0, // Required?
  72 + 0, // Number of args expected.
  73 + 0, // Delimiter if expecting multiple args.
  74 + "praet option.", // Help description.
  75 + "-d", // Flag token.
  76 + "-debug", // Flag token.
  77 + "--debug" // Flag token.
  78 + );
68 79  
69 80 opt.parse(argc, argv);
70 81  
... ... @@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
105 116 cerr << "setting praet option to " << praet << endl;
106 117 morfeusz.setPraet(praet);
107 118 }
  119 + if (opt.isSet("-d")) {
  120 + cerr << "setting debug to TRUE" << endl;
  121 + morfeusz.setDebug(true);
  122 + }
108 123 #ifdef _WIN32
109 124 morfeusz.setCharset(CP852);
110 125 #endif
... ...