diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py
index 2a68af2..c0f234b 100644
--- a/fsabuilder/morfeuszbuilder/fsa/fsa.py
+++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -41,9 +41,9 @@ class FSA(object):
self.n += 1
# debug
- if self.n % 10000 == 0:
- logging.info(word)
- logging.info(str(self.register.getStatesNum()))
+ if self.n % 100000 == 0:
+ logging.info(u'%d %s' % (self.n, word))
+# logging.info(str(self.register.getStatesNum()))
# allWords.append(word)
for label in encodedWord:
self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
index 65d63e4..5da3482 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,6 @@ class RulesFSA(object):
res.extend(self.stateData2bytearray(state))
res.extend(self.transitionsData2bytearray(state))
- logging.info('Segmentation automaton size: %d bytes', len(res))
- print list(res)
+# logging.info('Segmentation automaton size: %d bytes', len(res))
+# print list(res)
return res
diff --git a/fsabuilder/morfeuszbuilder/tagset/segtypes.py b/fsabuilder/morfeuszbuilder/tagset/segtypes.py
index 31208a6..61b73c7 100644
--- a/fsabuilder/morfeuszbuilder/tagset/segtypes.py
+++ b/fsabuilder/morfeuszbuilder/tagset/segtypes.py
@@ -5,6 +5,7 @@ Created on 17 lut 2014
'''
import re
import logging
+import sys
from morfeuszbuilder.utils import exceptions
def _cutHomonymFromLemma(lemma):
@@ -33,10 +34,12 @@ class Segtypes(object):
self._readTags(segrulesConfigFile)
self._indexSegnums()
- print self._lemmaTagnum2Segnum
- print self._tagnum2Segnum
-
- print self.segnum2Segtype
+# print self._lemmaTagnum2Segnum
+# print self._tagnum2Segnum
+ logging.info('segment number -> segment type')
+ logging.info('------------------------------')
+ logging.info(str(self.segnum2Segtype))
+ logging.info('------------------------------')
# self._debugSegnums()
diff --git a/morfeusz/InterpretedChunk.hpp b/morfeusz/InterpretedChunk.hpp
index 6020906..76b51a1 100644
--- a/morfeusz/InterpretedChunk.hpp
+++ b/morfeusz/InterpretedChunk.hpp
@@ -13,6 +13,7 @@
struct InterpretedChunk {
const char* chunkStartPtr;
+ const char* chunkEndPtr;
std::vector<uint32_t> originalCodepoints;
std::vector<uint32_t> lowercaseCodepoints;
InterpsGroup interpsGroup;
diff --git a/morfeusz/InterpsGroup.hpp b/morfeusz/InterpsGroup.hpp
index 2227525..98c55b0 100644
--- a/morfeusz/InterpsGroup.hpp
+++ b/morfeusz/InterpsGroup.hpp
@@ -15,25 +15,9 @@
#include "Tagset.hpp"
struct InterpsGroup {
-//public:
-//
-// InterpsGroup() {
-//
-// }
-//
-// explicit InterpsGroup(const unsigned char type)
-// : type(type) {
-//
-// }
-//
-// void addInterpretation(const EncodedInterpretation& interp) {
-// interps.push_back(interp);
-// }
-
unsigned char type;
uint16_t size;
const unsigned char* ptr;
-// std::vector<EncodedInterpretation> interps;
};
#endif /* GROUPEDINTERPRETATIONS_HPP */
diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp
index 686a793..d326c17 100644
--- a/morfeusz/Morfeusz.cpp
+++ b/morfeusz/Morfeusz.cpp
@@ -28,6 +28,7 @@ static MorfeuszOptions createDefaultOptions() {
MorfeuszOptions res;
res.caseSensitive = true;
res.encoding = UTF8;
+ res.debug = false;
return res;
}
@@ -102,6 +103,21 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
to.chunkStartPtr = from.chunkStartPtr;
}
+static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) {
+ stringstream out; // builds "(<type as a number>, <bytes in [startPtr, endPtr)>), " for the debug trace
+ out << "(" << static_cast<int>(type) << ", " << string(startPtr, endPtr) << "), ";
+ return out.str();
+}
+
+static inline string debugAccum(const vector<InterpretedChunk>& accum) { // Joins the debug rendering of every chunk; accum is only read.
+ stringstream res;
+ for (unsigned int i = 0; i < accum.size(); i++) {
+ // Each chunk is rendered from its segment type and its [chunkStartPtr, chunkEndPtr) byte range.
+ res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr);
+ }
+ return res.str();
+}
+
void Morfeusz::doProcessOneWord(
const Environment& env,
const char*& inputData,
@@ -109,7 +125,12 @@ void Morfeusz::doProcessOneWord(
SegrulesState segrulesState,
vector<InterpretedChunk>& accum,
InflexionGraph& graph) const {
+// if (this->options.debug) {
+// cerr << "----------" << endl;
+// cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
+// }
// cerr << "doAnalyzeOneWord " << inputData << endl;
+ const char* inputStart = inputData;
const char* currInput = inputData;
uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
vector<uint32_t> originalCodepoints;
@@ -139,9 +160,15 @@ void Morfeusz::doProcessOneWord(
vector<InterpsGroup> val(state.getValue());
for (unsigned int i = 0; i < val.size(); i++) {
InterpsGroup& ig = val[i];
+ if (this->options.debug) {
+ cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << string(inputStart, inputEnd) << "'" << endl; // bounded by inputEnd, so it does not depend on a NUL terminator
+ }
// cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl;
set<SegrulesState> newSegrulesStates;
env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
+ if (this->options.debug && newSegrulesStates.empty()) {
+ cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl;
+ }
// cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl;
for (
set<SegrulesState>::iterator it = newSegrulesStates.begin();
@@ -149,7 +176,8 @@ void Morfeusz::doProcessOneWord(
++it) {
SegrulesState newSegrulesState = *it;
InterpretedChunk ic = {
- inputData,
+ inputStart,
+ currInput,
originalCodepoints,
normalizedCodepoints,
ig,
@@ -164,6 +192,9 @@ void Morfeusz::doProcessOneWord(
accum.push_back(ic);
if (isEndOfWord(codepoint)
&& newSegrulesState.accepting) {
+ if (this->options.debug) {
+ cerr << "ACCEPTING " << debugAccum(accum) << endl;
+ }
graph.addPath(accum, newSegrulesState.weak);
}
else if (!isEndOfWord(codepoint)) {
@@ -255,6 +286,10 @@ void Morfeusz::setPraet(const std::string& praet) {
this->generatorEnv.setSegrulesOption("praet", praet);
}
+void Morfeusz::setDebug(bool debug) { // Stores the flag that doProcessOneWord checks before writing its trace to cerr.
+ this->options.debug = debug;
+}
+
ResultsIterator::ResultsIterator(const vector<MorphInterpretation>& res) {
resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end());
}
diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp
index dbfd9b8..a62e167 100644
--- a/morfeusz/Morfeusz.hpp
+++ b/morfeusz/Morfeusz.hpp
@@ -139,6 +139,13 @@ public:
* @param praet
*/
void setPraet(const std::string& praet);
+
+ /**
+ * Set debug option value.
+ *
+ * @param debug true to print debugging information to stderr
+ */
+ void setDebug(bool debug);
friend class ResultsIterator;
private:
diff --git a/morfeusz/MorfeuszOptions.hpp b/morfeusz/MorfeuszOptions.hpp
index 99daa5d..cf975a6 100644
--- a/morfeusz/MorfeuszOptions.hpp
+++ b/morfeusz/MorfeuszOptions.hpp
@@ -13,6 +13,7 @@
struct MorfeuszOptions {
bool caseSensitive;
MorfeuszCharset encoding;
+ bool debug;
};
#endif /* MORFEUSZOPTIONS_HPP */
diff --git a/morfeusz/cli/cli.cpp b/morfeusz/cli/cli.cpp
index f5c343a..c836ce1 100644
--- a/morfeusz/cli/cli.cpp
+++ b/morfeusz/cli/cli.cpp
@@ -65,6 +65,17 @@ ezOptionParser* getOptions(int argc, const char** argv, const string& titleText)
"-praet", // Flag token.
"--praet" // Flag token.
);
+
+ opt.add(
+ "", // Default.
+ 0, // Required?
+ 0, // Number of args expected.
+ 0, // Delimiter if expecting multiple args.
+ "Print debugging information to stderr.", // Help description.
+ "-d", // Flag token.
+ "-debug", // Flag token.
+ "--debug" // Flag token.
+ );
opt.parse(argc, argv);
@@ -105,6 +116,10 @@ void initializeMorfeusz(ezOptionParser& opt, Morfeusz& morfeusz) {
cerr << "setting praet option to " << praet << endl;
morfeusz.setPraet(praet);
}
+ if (opt.isSet("-d")) {
+ cerr << "setting debug to TRUE" << endl;
+ morfeusz.setDebug(true);
+ }
#ifdef _WIN32
morfeusz.setCharset(CP852);
#endif