From 36a8a25293aa8e61cc03dca638932fc8169b7792 Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Mon, 31 Mar 2014 13:00:21 +0000 Subject: [PATCH] dodanie opcji "debug" dla analizatora i generatora --- fsabuilder/morfeuszbuilder/fsa/fsa.py | 6 +++--- fsabuilder/morfeuszbuilder/segrules/rulesFSA.py | 4 ++-- fsabuilder/morfeuszbuilder/tagset/segtypes.py | 11 +++++++---- morfeusz/InterpretedChunk.hpp | 1 + morfeusz/InterpsGroup.hpp | 16 ---------------- morfeusz/Morfeusz.cpp | 37 ++++++++++++++++++++++++++++++++++++- morfeusz/Morfeusz.hpp | 7 +++++++ morfeusz/MorfeuszOptions.hpp | 1 + morfeusz/cli/cli.cpp | 15 +++++++++++++++ 9 files changed, 72 insertions(+), 26 deletions(-) diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py index 2a68af2..c0f234b 100644 --- a/fsabuilder/morfeuszbuilder/fsa/fsa.py +++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py @@ -41,9 +41,9 @@ class FSA(object): self.n += 1 # debug - if self.n % 10000 == 0: - logging.info(word) - logging.info(str(self.register.getStatesNum())) + if self.n % 100000 == 0: + logging.info(u'%d %s' % (self.n, word)) +# logging.info(str(self.register.getStatesNum())) # allWords.append(word) for label in encodedWord: self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py index 65d63e4..5da3482 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py @@ -68,6 +68,6 @@ class RulesFSA(object): res.extend(self.stateData2bytearray(state)) res.extend(self.transitionsData2bytearray(state)) - logging.info('Segmentation automaton size: %d bytes', len(res)) - print list(res) +# logging.info('Segmentation automaton size: %d bytes', len(res)) +# print list(res) return res diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py index 31208a6..61b73c7 100644 
--- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py +++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py @@ -5,6 +5,7 @@ Created on 17 lut 2014 ''' import re import logging +import sys from morfeuszbuilder.utils import exceptions def _cutHomonymFromLemma(lemma): @@ -33,10 +34,12 @@ class Segtypes(object): self._readTags(segrulesConfigFile) self._indexSegnums() - print self._lemmaTagnum2Segnum - print self._tagnum2Segnum - - print self.segnum2Segtype +# print self._lemmaTagnum2Segnum +# print self._tagnum2Segnum + logging.info('segment number -> segment type') + logging.info('------------------------------') + logging.info(str(self.segnum2Segtype)) + logging.info('------------------------------') # self._debugSegnums() diff --git a/morfeusz/InterpretedChunk.hpp b/morfeusz/InterpretedChunk.hpp index 6020906..76b51a1 100644 --- a/morfeusz/InterpretedChunk.hpp +++ b/morfeusz/InterpretedChunk.hpp @@ -13,6 +13,7 @@ struct InterpretedChunk { const char* chunkStartPtr; + const char* chunkEndPtr; std::vector<uint32_t> originalCodepoints; std::vector<uint32_t> lowercaseCodepoints; InterpsGroup interpsGroup; diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp index 2227525..98c55b0 100644 --- a/morfeusz/InterpsGroup.hpp +++ b/morfeusz/InterpsGroup.hpp @@ -15,25 +15,9 @@ #include "Tagset.hpp" struct InterpsGroup { -//public: -// -// InterpsGroup() { -// -// } -// -// explicit InterpsGroup(const unsigned char type) -// : type(type) { -// -// } -// -// void addInterpretation(const EncodedInterpretation& interp) { -// interps.push_back(interp); -// } - unsigned char type; uint16_t size; const unsigned char* ptr; -// std::vector<EncodedInterpretation> interps; }; #endif /* GROUPEDINTERPRETATIONS_HPP */ diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index 686a793..d326c17 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() { MorfeuszOptions res; res.caseSensitive = true; 
res.encoding = UTF8; + res.debug = false; return res; } @@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { to.chunkStartPtr = from.chunkStartPtr; } +static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { + stringstream res; + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; + return res.str(); +} + +static inline string debugAccum(vector<InterpretedChunk>& accum) { + stringstream res; + for (unsigned int i = 0; i < accum.size(); i++) { + res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); +// res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; + } + return res.str(); +} + void Morfeusz::doProcessOneWord( const Environment& env, const char*& inputData, @@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord( SegrulesState segrulesState, vector<InterpretedChunk>& accum, InflexionGraph& graph) const { +// if (this->options.debug) { +// cerr << "----------" << endl; +// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; +// } // cerr << "doAnalyzeOneWord " << inputData << endl; + const char* inputStart = inputData; const char* currInput = inputData; uint32_t codepoint = inputData == inputEnd ? 
0 : env.getCharsetConverter().next(currInput, inputEnd); vector<uint32_t> originalCodepoints; @@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord( vector<InterpsGroup> val(state.getValue()); for (unsigned int i = 0; i < val.size(); i++) { InterpsGroup& ig = val[i]; + if (this->options.debug) { + cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; + } // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; set<SegrulesState> newSegrulesStates; env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); + if (this->options.debug && newSegrulesStates.empty()) { + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; + } // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; for ( set<SegrulesState>::iterator it = newSegrulesStates.begin(); @@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord( ++it) { SegrulesState newSegrulesState = *it; InterpretedChunk ic = { - inputData, + inputStart, + currInput, originalCodepoints, normalizedCodepoints, ig, @@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord( accum.push_back(ic); if (isEndOfWord(codepoint) && newSegrulesState.accepting) { + if (this->options.debug) { + cerr << "ACCEPTING " << debugAccum(accum) << endl; + } graph.addPath(accum, newSegrulesState.weak); } else if (!isEndOfWord(codepoint)) { @@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) { this->generatorEnv.setSegrulesOption("praet", praet); } +void Morfeusz::setDebug(bool debug) { + this->options.debug = debug; +} + ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) { resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); } diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index dbfd9b8..a62e167 100644 --- a/morfeusz/Morfeusz.hpp +++ b/morfeusz/Morfeusz.hpp @@ -139,6 +139,13 @@ public: * @param praet */ void 
setPraet(const std::string& praet); + + /** + * Set debug option value. + * + * @param debug + */ + void setDebug(bool debug); friend class ResultsIterator; private: diff --git a/morfeusz/MorfeuszOptions.hpp b/morfeusz/MorfeuszOptions.hpp index 99daa5d..cf975a6 100644 --- a/morfeusz/MorfeuszOptions.hpp +++ b/morfeusz/MorfeuszOptions.hpp @@ -13,6 +13,7 @@ struct MorfeuszOptions { bool caseSensitive; MorfeuszCharset encoding; + bool debug; }; #endif /* MORFEUSZOPTIONS_HPP */ diff --git a/morfeusz/cli/cli.cpp b/morfeusz/cli/cli.cpp index f5c343a..c836ce1 100644 --- a/morfeusz/cli/cli.cpp +++ b/morfeusz/cli/cli.cpp @@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText) "-praet", // Flag token. "--praet" // Flag token. ); + + opt.add( + "", // Default. + 0, // Required? + 0, // Number of args expected. + 0, // Delimiter if expecting multiple args. + "praet option.", // Help description. + "-d", // Flag token. + "-debug", // Flag token. + "--debug" // Flag token. + ); opt.parse(argc, argv); @@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) { cerr << "setting praet option to " << praet << endl; morfeusz.setPraet(praet); } + if (opt.isSet("-d")) { + cerr << "setting debug to TRUE" << endl; + morfeusz.setDebug(true); + } #ifdef _WIN32 morfeusz.setCharset(CP852); #endif -- libgit2 0.22.2