diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 37263f9..5e400c1 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -38,6 +38,9 @@ set(SRC_FILES charset/conversion_tables.cpp cli/cli.cpp segrules/segrules.cpp + CasePatternHelper.cpp + decoder/InterpretedChunksDecoder4Analyzer.cpp + decoder/InterpretedChunksDecoder4Generator.cpp ) set(INCLUDE_FILES diff --git a/morfeusz/CasePatternHelper.hpp b/morfeusz/CasePatternHelper.hpp index fb730f8..6b9b0ea 100644 --- a/morfeusz/CasePatternHelper.hpp +++ b/morfeusz/CasePatternHelper.hpp @@ -12,6 +12,9 @@ #include "InterpsGroup.hpp" #include "CasePatternHelper.hpp" #include "compressionByteUtils.hpp" +#include "Environment.hpp" + +class Environment; class CasePatternHelper { public: @@ -39,64 +42,17 @@ public: } bool checkInterpsGroupOrthCasePatterns( - const std::vector<uint32_t>& lowercaseCodepoints, - const std::vector<uint32_t>& originalCodepoints, - const InterpsGroup& ig) const { - const unsigned char* currPtr = ig.ptr; - unsigned char compressionByte = *currPtr++; - if (!this->caseSensitive) { - return true; - } - else if (isOrthOnlyLower(compressionByte)) { - return true; - } - else if (isOrthOnlyTitle(compressionByte)) { - return lowercaseCodepoints[0] != originalCodepoints[0]; - } - else { - unsigned char casePatternsNum = *currPtr++; - if (casePatternsNum == 0) { - return true; - } - else { - for (unsigned int i = 0; i < casePatternsNum; i++) { - if (checkCasePattern( - lowercaseCodepoints, - originalCodepoints, - deserializeOneCasePattern(currPtr))) { - return true; - } - } - return false; - } - } - } + const Environment& env, + const char* orthStart, + const char* orthEnd, + const InterpsGroup& ig) const; - std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { - std::vector<bool> res; - uint8_t casePatternType = *ptr++; - uint8_t prefixLength; - uint8_t patternLength; - switch (casePatternType) { - case LEMMA_ONLY_LOWER: - break; - case 
LEMMA_UPPER_PREFIX: - prefixLength = *ptr++; - res.resize(prefixLength, true); - break; - case LEMMA_MIXED_CASE: - patternLength = *ptr++; - for (unsigned int i = 0; i < patternLength; i++) { - uint8_t idx = *ptr++; - res.resize(idx + 1, false); - res[idx] = true; - } - break; - } - return res; - } + static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr); private: bool caseSensitive; + + mutable vector<uint32_t> orthCodepoints; + mutable vector<uint32_t> normalizedCodepoints; static const uint8_t LEMMA_ONLY_LOWER = 0; static const uint8_t LEMMA_UPPER_PREFIX = 1; diff --git a/morfeusz/Environment.cpp b/morfeusz/Environment.cpp index 6a27192..3da6f9a 100644 --- a/morfeusz/Environment.cpp +++ b/morfeusz/Environment.cpp @@ -8,9 +8,11 @@ #include <vector> #include <algorithm> #include "Environment.hpp" -#include "InterpretedChunksDecoder.hpp" +#include "decoder/InterpretedChunksDecoder.hpp" #include "MorphDeserializer.hpp" #include "exceptions.hpp" +#include "decoder/InterpretedChunksDecoder4Analyzer.hpp" +#include "decoder/InterpretedChunksDecoder4Generator.hpp" //class InterpretedChunksDecoder4Analyzer; //class InterpretedChunksDecoder4Generator; @@ -53,7 +55,7 @@ processorType == ANALYZER ? 
(InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this) : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)), processorType(processorType), -casePatternHelper() { +casePatternHelper(new CasePatternHelper()) { } const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { @@ -78,6 +80,7 @@ Environment::~Environment() { delete this->fsaFileStartPtr; } delete this->chunksDecoder; + delete this->casePatternHelper; } void Environment::setCharset(MorfeuszCharset charset) { @@ -146,11 +149,11 @@ MorfeuszProcessorType Environment::getProcessorType() const { } void Environment::setCaseSensitive(bool caseSensitive) { - this->casePatternHelper.setCaseSensitive(caseSensitive); + this->casePatternHelper->setCaseSensitive(caseSensitive); } const CasePatternHelper& Environment::getCasePatternHelper() const { - return this->casePatternHelper; + return *this->casePatternHelper; } const Qualifiers& Environment::getQualifiersHelper() const { diff --git a/morfeusz/Environment.hpp b/morfeusz/Environment.hpp index f015486..4a189a2 100644 --- a/morfeusz/Environment.hpp +++ b/morfeusz/Environment.hpp @@ -11,6 +11,7 @@ #include <vector> class InterpretedChunksDecoder; +class CasePatternHelper; #include "charset/CaseConverter.hpp" #include "charset/CharsetConverter.hpp" @@ -79,7 +80,7 @@ private: const InterpretedChunksDecoder* chunksDecoder; MorfeuszProcessorType processorType; - CasePatternHelper casePatternHelper; + CasePatternHelper* casePatternHelper; const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const; }; diff --git a/morfeusz/InflexionGraph.cpp b/morfeusz/InflexionGraph.cpp index 6a66520..0650ef8 100644 --- a/morfeusz/InflexionGraph.cpp +++ b/morfeusz/InflexionGraph.cpp @@ -78,7 +78,7 @@ void InflexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool wea this->addMiddleEdge((unsigned int) this->graph.size(), e); } else { - Edge e = {chunk, (int) this->graph.size() + 1}; + Edge e 
= {chunk, (unsigned long) this->graph.size() + 1}; this->addMiddleEdge((unsigned int) this->graph.size(), e); } } @@ -117,7 +117,8 @@ static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const I for (unsigned int i = 0; i < edges.size(); i++) { const InflexionGraph::Edge& e1 = edges[i]; if (e1.chunk.textStartPtr == e.chunk.textStartPtr - && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints + && e1.chunk.textStartPtr == e.chunk.textStartPtr + && e1.chunk.textEndPtr == e.chunk.textEndPtr && e1.chunk.segmentType == e.chunk.segmentType && e1.nextNode == e.nextNode) { return true; diff --git a/morfeusz/InflexionGraph.hpp b/morfeusz/InflexionGraph.hpp index 863b996..ac8d526 100644 --- a/morfeusz/InflexionGraph.hpp +++ b/morfeusz/InflexionGraph.hpp @@ -22,7 +22,7 @@ public: struct Edge { InterpretedChunk chunk; - unsigned int nextNode; + unsigned long nextNode; }; void addPath(const std::vector<InterpretedChunk>& path, bool weak); diff --git a/morfeusz/InterpretedChunk.hpp b/morfeusz/InterpretedChunk.hpp index c3d1a01..15fc83c 100644 --- a/morfeusz/InterpretedChunk.hpp +++ b/morfeusz/InterpretedChunk.hpp @@ -15,8 +15,6 @@ struct InterpretedChunk { unsigned char segmentType; const char* textStartPtr; const char* textEndPtr; - std::vector<uint32_t> originalCodepoints; - std::vector<uint32_t> lowercaseCodepoints; const unsigned char* interpsGroupPtr; const unsigned char* interpsPtr; const unsigned char* interpsEndPtr; diff --git a/morfeusz/InterpretedChunksDecoder.hpp b/morfeusz/InterpretedChunksDecoder.hpp deleted file mode 100644 index 1c42044..0000000 --- a/morfeusz/InterpretedChunksDecoder.hpp +++ /dev/null @@ -1,265 +0,0 @@ -/* - * File: InterpsGroupDecoder.hpp - * Author: mlenart - * - * Created on November 22, 2013, 10:35 PM - */ - -#ifndef INTERPSGROUPDECODER_HPP -#define INTERPSGROUPDECODER_HPP - -#include <string> -#include <vector> -#include <utility> - -#include "charset/CharsetConverter.hpp" -#include "EncodedInterpretation.hpp" 
-#include "InterpretedChunk.hpp" -#include "EncodedInterpretation.hpp" -#include "charset/CaseConverter.hpp" -#include "Environment.hpp" -#include "MorphInterpretation.hpp" -#include "CasePatternHelper.hpp" -#include "deserializationUtils.hpp" -#include "compressionByteUtils.hpp" -#include "const.hpp" - -class InterpretedChunksDecoder { -public: - - InterpretedChunksDecoder(const Environment& env) - : env(env) { - } - - virtual ~InterpretedChunksDecoder() { - } - - virtual void decode( - unsigned int startNode, - unsigned int endNode, - const InterpretedChunk& interpretedChunk, - std::vector<MorphInterpretation>& out) const = 0; - -protected: - - const Environment& env; -}; - -class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { -public: - - InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { - } - - void decode( - unsigned int startNode, - unsigned int endNode, - const InterpretedChunk& interpretedChunk, - std::vector<MorphInterpretation>& out) const { - string orth; - string lemmaPrefix; - if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { - orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); - const unsigned char* currPtr = interpretedChunk.interpsPtr; - while (currPtr < interpretedChunk.interpsEndPtr) { - this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); - } - } - } - -protected: - - void decodeForm( - const vector<uint32_t>& orth, - const EncodedForm& lemma, - bool forPrefix, - string& res) const { - for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { - uint32_t cp = - (i < lemma.casePattern.size() && lemma.casePattern[i]) - ? 
env.getCaseConverter().toTitle(orth[i]) - : orth[i]; - env.getCharsetConverter().append(cp, res); - } - if (!forPrefix) { - const char* suffixPtr = lemma.suffixToAdd.c_str(); - const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); - while (suffixPtr != suffixEnd) { - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); - env.getCharsetConverter().append(cp, res); - } - } - } - - void deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { - encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) - ? getPrefixCutLength(compressionByte) - : readInt8(ptr); - encodedForm.suffixToCut = readInt8(ptr); - encodedForm.suffixToAdd = readString(ptr); - assert(encodedForm.casePattern.size() == 0); - if (isLemmaOnlyLower(compressionByte)) { - encodedForm.casePattern = std::vector<bool>(); - } else if (isLemmaOnlyTitle(compressionByte)) { - encodedForm.casePattern = std::vector<bool>(); - encodedForm.casePattern.push_back(true); - } else { - encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); - } - } - - EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { - EncodedInterpretation interp; - if (isOrthOnlyLower(compressionByte)) { - } else if (isOrthOnlyTitle(compressionByte)) { - interp.orthCasePattern.push_back(true); - } else { - interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); - } - deserializeEncodedForm(ptr, compressionByte, interp.value); - interp.tag = readInt16(ptr); - interp.nameClassifier = *ptr++; - interp.qualifiers = readInt16(ptr); - return interp; - } -private: - - pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { - vector<string> splitRes(split(lemma, ':')); - if (splitRes.size() == 2) { - return make_pair(splitRes[0], splitRes[1]); - } else { - return make_pair(lemma, ""); - } - } - - void 
decodeMorphInterpretation( - unsigned int startNode, unsigned int endNode, - const string& orth, - const string& lemmaPrefix, - const InterpretedChunk& chunk, - bool forPrefix, - const unsigned char*& ptr, - std::vector<MorphInterpretation>& out) const { - string lemma = lemmaPrefix; - EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); - this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma); - if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) { - // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); - out.push_back(MorphInterpretation( - startNode, endNode, - orth, lemma, - // "", - ei.tag, - ei.nameClassifier, - ei.qualifiers, - env)); - } - } - - bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; - orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); - const unsigned char* ptr = prefixChunk.interpsPtr; - std::vector<MorphInterpretation> mi; - // env.getCasePatternHelper().skipCasePattern(ptr); - this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); - if (!mi.empty()) { - lemmaPrefix += mi[0].getLemma(); - } else { - return false; - } - } - return true; - } -}; - -class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder { -public: - - InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) { - } - - void decode( - unsigned int startNode, - unsigned int endNode, - const InterpretedChunk& interpretedChunk, - std::vector<MorphInterpretation>& out) const { - string orthPrefix; - string lemma; - convertPrefixes(interpretedChunk, orthPrefix, lemma); - lemma += 
env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); - const unsigned char* currPtr = interpretedChunk.interpsPtr; - while (currPtr < interpretedChunk.interpsEndPtr) { - MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); - // cerr << mi.toString(false) << endl; - // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; - if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) { - out.push_back(mi); - } - } - } - -private: - - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const { - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; - lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); - const unsigned char* ptr = prefixChunk.interpsPtr; - MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); - orthPrefix += mi.getOrth(); - } - } - - MorphInterpretation decodeMorphInterpretation( - unsigned int startNode, unsigned int endNode, - const string& orthPrefix, - const string& lemma, - const InterpretedChunk& chunk, - const unsigned char*& ptr) const { - string orth = orthPrefix; - EncodedInterpretation ei = this->deserializeInterp(ptr); - this->decodeForm(chunk.originalCodepoints, ei.value, orth); - return MorphInterpretation( - startNode, endNode, - orth, lemma + HOMONYM_SEPARATOR + ei.homonymId, - // ei.homonymId, - ei.tag, - ei.nameClassifier, - ei.qualifiers, - env); - } - - void decodeForm( - const vector<uint32_t>& lemma, - const EncodedForm& orth, - string& res) const { - res += orth.prefixToAdd; - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { - env.getCharsetConverter().append(lemma[i], res); - } - const char* 
suffixPtr = orth.suffixToAdd.c_str(); - const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); - while (suffixPtr != suffixEnd) { - uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); - env.getCharsetConverter().append(cp, res); - } - } - - EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const { - EncodedInterpretation interp; - interp.homonymId = readString(ptr); - interp.value.prefixToAdd = readString(ptr); - interp.value.suffixToCut = readInt8(ptr); - interp.value.suffixToAdd = readString(ptr); - interp.tag = readInt16(ptr); - interp.nameClassifier = readInt8(ptr); - interp.qualifiers = readInt16(ptr); - return interp; - } -}; - -#endif /* INTERPSGROUPDECODER_HPP */ - diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index 0b5af71..174ec11 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -12,7 +12,7 @@ #include "data/default_fsa.hpp" #include "Morfeusz.hpp" #include "MorphDeserializer.hpp" -#include "InterpretedChunksDecoder.hpp" +#include "decoder/InterpretedChunksDecoder.hpp" #include "charset/CharsetConverter.hpp" #include "charset/charset_utils.hpp" #include "charset/CaseConverter.hpp" @@ -34,6 +34,51 @@ static MorfeuszOptions createDefaultOptions() { return res; } +static void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { + to.prefixChunks.insert( + to.prefixChunks.begin(), + from.prefixChunks.begin(), + from.prefixChunks.end()); + to.prefixChunks.push_back(from); + to.textStartPtr = from.textStartPtr; + from.orthWasShifted = true; +} + +static string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { + stringstream res; + res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; + return res.str(); +} + +static string debugAccum(vector<InterpretedChunk>& accum) { + stringstream res; + for (unsigned int i = 0; i < accum.size(); i++) { + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, 
accum[i].textEndPtr); + // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; + } + return res.str(); +} + +static void feedStateDirectly( + StateType& state, + const char* inputStart, + const char* inputEnd) { + const char* currInput = inputStart; + while (currInput != inputEnd && !state.isSink()) { + state.proceedToNext(*currInput++); + } +} + +static void feedState( + StateType& state, + int codepoint) { + std::string chars; + UTF8CharsetConverter::getInstance().append(codepoint, chars); + for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { + state.proceedToNext(chars[i]); + } +} + Morfeusz::Morfeusz() : analyzerEnv(DEFAULT_MORFEUSZ_CHARSET, ANALYZER, DEFAULT_FSA), generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA), @@ -97,11 +142,12 @@ void Morfeusz::processOneWord( if (!graph.empty()) { const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder(); int srcNode = startNodeNum; - for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) { - const vector<InflexionGraph::Edge>& edges = graph.getTheGraph()[i]; + const std::vector< std::vector<InflexionGraph::Edge> >& theGraph = graph.getTheGraph(); + for (unsigned int i = 0; i < theGraph.size(); i++) { + const vector<InflexionGraph::Edge>& edges = theGraph[i]; for (unsigned int j = 0; j < edges.size(); j++) { const InflexionGraph::Edge& e = edges[j]; - int targetNode = startNodeNum + e.nextNode; + unsigned long targetNode = startNodeNum + e.nextNode; interpretedChunksDecoder.decode(srcNode, targetNode, e.chunk, results); } srcNode++; @@ -118,56 +164,11 @@ void Morfeusz::processOneWord( inputStart = currInput; } -static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { - to.prefixChunks.insert( - to.prefixChunks.begin(), - from.prefixChunks.begin(), - from.prefixChunks.end()); - to.prefixChunks.push_back(from); - from.orthWasShifted = true; - 
to.textStartPtr = from.textStartPtr; -} - -static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { - stringstream res; - res << "(" << (int) type << ", " << string(startPtr, endPtr) << "), "; - return res.str(); -} - -static inline string debugAccum(vector<InterpretedChunk>& accum) { - stringstream res; - for (unsigned int i = 0; i < accum.size(); i++) { - res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); - // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; - } - return res.str(); -} - -static inline void feedStateDirectly( - StateType& state, - const char* inputStart, - const char* inputEnd) { - const char* currInput = inputStart; - while (currInput != inputEnd && !state.isSink()) { - state.proceedToNext(*currInput++); - } -} - -static inline void feedState( - StateType& state, - int codepoint) { - std::string chars; - UTF8CharsetConverter::getInstance().append(codepoint, chars); - for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) { - state.proceedToNext(chars[i]); - } -} - void Morfeusz::doProcessOneWord( const Environment& env, const char*& inputData, const char* inputEnd, - SegrulesState segrulesState) const { + const SegrulesState& segrulesState) const { if (this->options.debug) { cerr << "----------" << endl; cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl; @@ -178,11 +179,6 @@ void Morfeusz::doProcessOneWord( const char* currInput = inputData; uint32_t codepoint = inputData == inputEnd ? 
0 : env.getCharsetConverter().next(currInput, inputEnd); bool currCodepointIsWhitespace = isWhitespace(codepoint); - vector<uint32_t> originalCodepoints; - vector<uint32_t> normalizedCodepoints; - - originalCodepoints.reserve(16); - normalizedCodepoints.reserve(16); StateType state = env.getFSA().getInitialState(); @@ -190,8 +186,6 @@ void Morfeusz::doProcessOneWord( uint32_t normalizedCodepoint = env.getProcessorType() == ANALYZER ? env.getCaseConverter().toLower(codepoint) : codepoint; - originalCodepoints.push_back(codepoint); - normalizedCodepoints.push_back(normalizedCodepoint); if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) { feedStateDirectly(state, prevInput, currInput); } @@ -203,48 +197,37 @@ void Morfeusz::doProcessOneWord( currCodepointIsWhitespace = isWhitespace(codepoint); string homonymId; if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { - if (originalCodepoints.size() == 1) { - throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); - } homonymId = string(currInput + 1, inputEnd); - // cerr << "homonym " << homonymId << endl; prevInput = currInput; currInput = inputEnd; codepoint = 0x00; currCodepointIsWhitespace = true; } if (state.isAccepting()) { - vector<InterpsGroup> val(state.getValue()); - for (unsigned int i = 0; i < val.size(); i++) { - InterpsGroup& ig = val[i]; +// vector<InterpsGroup> val(state.getValue()); + for (unsigned int i = 0; i < state.getValue().size(); i++) { + const InterpsGroup& ig = state.getValue()[i]; if (this->options.debug) { cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; } - vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace); + const vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, 
currCodepointIsWhitespace); if (!newSegrulesStates.empty() - && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { - - for ( - vector<SegrulesState>::iterator it = newSegrulesStates.begin(); - it != newSegrulesStates.end(); - ++it) { - SegrulesState newSegrulesState = *it; + && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(env, inputStart, currInput, ig)) { + for (unsigned int i = 0; i < newSegrulesStates.size(); i++) { + const SegrulesState& newSegrulesState = newSegrulesStates[i]; const unsigned char* interpsPtr = getInterpretationsPtr(env, ig); const unsigned char* interpsEndPtr = ig.ptr + ig.size; - InterpretedChunk ic = { - ig.type, - inputStart, - currInput, - originalCodepoints, - normalizedCodepoints, - ig.ptr, - interpsPtr, - interpsEndPtr, - newSegrulesState.shiftOrthFromPrevious, - false, - vector<InterpretedChunk>(), - homonymId - }; + InterpretedChunk ic; + ic.segmentType = ig.type; + ic.textStartPtr = inputStart; + ic.textEndPtr = currInput; + ic.interpsGroupPtr = ig.ptr; + ic.interpsPtr = interpsPtr; + ic.interpsEndPtr = interpsEndPtr; + ic.shiftOrth = newSegrulesState.shiftOrthFromPrevious; + ic.orthWasShifted = false; + ic.requiredHomonymId = homonymId; + if (!accum.empty() && accum.back().shiftOrth) { doShiftOrth(accum.back(), ic); } @@ -266,7 +249,7 @@ void Morfeusz::doProcessOneWord( } } else if (this->options.debug) { - cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; +// cerr << !newSegrulesStates.empty() << env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig) << endl; cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; } } diff --git a/morfeusz/Morfeusz.hpp b/morfeusz/Morfeusz.hpp index 371eaf1..9941fd5 100644 --- a/morfeusz/Morfeusz.hpp +++ 
b/morfeusz/Morfeusz.hpp @@ -170,7 +170,7 @@ private: const Environment& env, const char*& inputData, const char* inputEnd, - SegrulesState segrulesState) const; + const SegrulesState& segrulesState) const; void handleIgnChunk( const Environment& env, diff --git a/morfeusz/decoder/InterpretedChunksDecoder.hpp b/morfeusz/decoder/InterpretedChunksDecoder.hpp new file mode 100644 index 0000000..9511258 --- /dev/null +++ b/morfeusz/decoder/InterpretedChunksDecoder.hpp @@ -0,0 +1,48 @@ +/* + * File: InterpsGroupDecoder.hpp + * Author: mlenart + * + * Created on November 22, 2013, 10:35 PM + */ + +#ifndef INTERPSGROUPDECODER_HPP +#define INTERPSGROUPDECODER_HPP + +#include <string> +#include <vector> +#include <utility> + +#include "charset/CharsetConverter.hpp" +#include "EncodedInterpretation.hpp" +#include "InterpretedChunk.hpp" +#include "EncodedInterpretation.hpp" +#include "charset/CaseConverter.hpp" +#include "Environment.hpp" +#include "MorphInterpretation.hpp" +#include "CasePatternHelper.hpp" +#include "deserializationUtils.hpp" +#include "compressionByteUtils.hpp" +#include "const.hpp" + +class InterpretedChunksDecoder { +public: + + InterpretedChunksDecoder(const Environment& env): env(env) { + } + + virtual ~InterpretedChunksDecoder() { + } + + virtual void decode( + unsigned int startNode, + unsigned int endNode, + const InterpretedChunk& interpretedChunk, + std::vector<MorphInterpretation>& out) const = 0; + +protected: + + const Environment& env; +}; + +#endif /* INTERPSGROUPDECODER_HPP */ + diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp b/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp new file mode 100644 index 0000000..074d7a5 --- /dev/null +++ b/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp @@ -0,0 +1,138 @@ +/* + * File: InterpretedChunksDecoder4Analyzer.cpp + * Author: mlenart + * + * Created on 15 maj 2014, 15:28 + */ + +#include "InterpretedChunksDecoder4Analyzer.hpp" +#include <string> + +using namespace std; + 
+InterpretedChunksDecoder4Analyzer::InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) { +} + +void InterpretedChunksDecoder4Analyzer::decode( + unsigned int startNode, + unsigned int endNode, + const InterpretedChunk& interpretedChunk, + std::vector<MorphInterpretation>& out) const { + string orth; + string lemmaPrefix; + if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { + // orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); + orth.insert(orth.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr); + const unsigned char* currPtr = interpretedChunk.interpsPtr; + while (currPtr < interpretedChunk.interpsEndPtr) { + this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out); + } + } +} + +void InterpretedChunksDecoder4Analyzer::decodeLemma( + const vector<uint32_t>& orth, + const EncodedForm& lemma, + bool forPrefix, + string& res) const { + for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) { + uint32_t cp = + (i < lemma.casePattern.size() && lemma.casePattern[i]) + ? env.getCaseConverter().toTitle(orth[i]) + : orth[i]; + env.getCharsetConverter().append(cp, res); + } + if (!forPrefix) { + const char* suffixPtr = lemma.suffixToAdd.c_str(); + const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); + while (suffixPtr != suffixEnd) { + uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd); + env.getCharsetConverter().append(cp, res); + } + } +} + +void InterpretedChunksDecoder4Analyzer::deserializeEncodedForm(const unsigned char*& ptr, unsigned char compressionByte, EncodedForm& encodedForm) const { + encodedForm.prefixToCut = hasCompressedPrefixCut(compressionByte) + ? 
getPrefixCutLength(compressionByte) + : readInt8(ptr); + encodedForm.suffixToCut = readInt8(ptr); + encodedForm.suffixToAdd = readString(ptr); + assert(encodedForm.casePattern.size() == 0); + if (isLemmaOnlyLower(compressionByte)) { +// encodedForm.casePattern = std::vector<bool>(); + } + else if (isLemmaOnlyTitle(compressionByte)) { +// encodedForm.casePattern = std::vector<bool>(); + encodedForm.casePattern.push_back(true); + } + else { + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); + } +} + +EncodedInterpretation InterpretedChunksDecoder4Analyzer::deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const { + EncodedInterpretation interp; + if (isOrthOnlyLower(compressionByte)) { + } + else if (isOrthOnlyTitle(compressionByte)) { + interp.orthCasePattern.push_back(true); + } + else { + interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr); + } + deserializeEncodedForm(ptr, compressionByte, interp.value); + interp.tag = readInt16(ptr); + interp.nameClassifier = *ptr++; + interp.qualifiers = readInt16(ptr); + return interp; +} + +void InterpretedChunksDecoder4Analyzer::decodeMorphInterpretation( + unsigned int startNode, unsigned int endNode, + const string& orth, + const string& lemmaPrefix, + const InterpretedChunk& chunk, + bool forPrefix, + const unsigned char*& ptr, + std::vector<MorphInterpretation>& out) const { + string lemma(lemmaPrefix); + orthCodepoints.clear(); + normalizedCodepoints.clear(); + const char* currPtr = chunk.textStartPtr; + while (currPtr != chunk.textEndPtr) { + uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr); + orthCodepoints.push_back(cp); + normalizedCodepoints.push_back(env.getCaseConverter().toLower(cp)); + } + EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr); + if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, orthCodepoints, ei.orthCasePattern)) { 
+ this->decodeLemma(normalizedCodepoints, ei.value, forPrefix, lemma); + // pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); + out.push_back(MorphInterpretation( + startNode, endNode, + orth, lemma, + // "", + ei.tag, + ei.nameClassifier, + ei.qualifiers, + env)); + } +} + +bool InterpretedChunksDecoder4Analyzer::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const { + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; + orth.insert(orth.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr); + const unsigned char* ptr = prefixChunk.interpsPtr; + std::vector<MorphInterpretation> mi; + this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi); + if (!mi.empty()) { + lemmaPrefix += mi[0].getLemma(); + } + else { + return false; + } + } + return true; +} diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp b/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp new file mode 100644 index 0000000..a8bf9b7 --- /dev/null +++ b/morfeusz/decoder/InterpretedChunksDecoder4Analyzer.hpp @@ -0,0 +1,52 @@ +/* + * File: InterpretedChunksDecoder4Analyzer.hpp + * Author: mlenart + * + * Created on 15 maj 2014, 15:28 + */ + +#ifndef INTERPRETEDCHUNKSDECODER4ANALYZER_HPP +#define INTERPRETEDCHUNKSDECODER4ANALYZER_HPP + +#include "InterpretedChunksDecoder.hpp" + +class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder { +public: + + InterpretedChunksDecoder4Analyzer(const Environment& env); + + void decode( + unsigned int startNode, + unsigned int endNode, + const InterpretedChunk& interpretedChunk, + std::vector<MorphInterpretation>& out) const; + +private: + + void decodeLemma( + const vector<uint32_t>& orth, + const EncodedForm& lemma, + bool forPrefix, + string& res) const; + + void deserializeEncodedForm(const unsigned char*& ptr, unsigned char 
compressionByte, EncodedForm& encodedForm) const;
+
+    // Reads one serialized interpretation at ptr and moves ptr past it.
+    EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const;
+
+    // Decodes one interpretation of chunk; appends it to out only if the
+    // chunk text passes the orth case pattern check.
+    void decodeMorphInterpretation(
+            unsigned int startNode, unsigned int endNode,
+            const string& orth,
+            const string& lemmaPrefix,
+            const InterpretedChunk& chunk,
+            bool forPrefix,
+            const unsigned char*& ptr,
+            std::vector<MorphInterpretation>& out) const;
+
+    // Accumulates the prefix chunks' text into orth and their lemmas into
+    // lemmaPrefix; returns false if any prefix cannot be decoded.
+    bool convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orth, std::string& lemmaPrefix) const;
+
+    // Scratch buffers cleared and refilled by decodeMorphInterpretation on
+    // every call; mutable so the const decoding methods can reuse them.
+    // NOTE(review): because const methods write to these, concurrent calls
+    // on the same instance would race - confirm instances are not shared.
+    mutable std::vector<uint32_t> orthCodepoints;
+    mutable std::vector<uint32_t> normalizedCodepoints;
+};
+
+#endif	/* INTERPRETEDCHUNKSDECODER4ANALYZER_HPP */
+
diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp b/morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp
new file mode 100644
index 0000000..02ababb
--- /dev/null
+++ b/morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp
@@ -0,0 +1,99 @@
+/*
+ * File:   InterpretedChunksDecoder4Generator.cpp
+ * Author: mlenart
+ *
+ * Created on 15 May 2014, 15:28
+ */
+
+#include "InterpretedChunksDecoder4Generator.hpp"
+#include <string>
+#include <vector>
+
+using namespace std;
+
+// Only forwards the environment to the base class.
+InterpretedChunksDecoder4Generator::InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
+}
+
+/**
+ * Decodes every interpretation stored for interpretedChunk and appends the
+ * accepted ones to out.
+ *
+ * The lemma is the concatenated text of the prefix chunks followed by the
+ * chunk's own text; the orthographic prefix comes from decoding the prefix
+ * chunks. Interpretations are read one after another until the cursor
+ * reaches interpsEndPtr.
+ *
+ * @param startNode, endNode forwarded unchanged to each MorphInterpretation
+ * @param interpretedChunk chunk (with its prefix chunks) to decode
+ * @param out receives the decoded interpretations
+ */
+void InterpretedChunksDecoder4Generator::decode(
+        unsigned int startNode,
+        unsigned int endNode,
+        const InterpretedChunk& interpretedChunk,
+        std::vector<MorphInterpretation>& out) const {
+    string orthPrefix;
+    string lemma;
+    convertPrefixes(interpretedChunk, orthPrefix, lemma);
+    //    lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
+    lemma.insert(lemma.end(), interpretedChunk.textStartPtr, interpretedChunk.textEndPtr);
+    const unsigned char* currPtr = interpretedChunk.interpsPtr;
+    // decodeMorphInterpretation takes currPtr by reference and advances it.
+    while (currPtr < interpretedChunk.interpsEndPtr) {
+        MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode,
orthPrefix, lemma, interpretedChunk, currPtr);
+        //        cerr << mi.toString(false) << endl;
+        //        cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
+        // Keep everything when no homonym is required; otherwise keep only
+        // the interpretations that carry the required homonym id.
+        if (interpretedChunk.requiredHomonymId.empty() || mi.hasHomonym(interpretedChunk.requiredHomonymId)) {
+            out.push_back(mi);
+        }
+    }
+}
+
+/**
+ * Walks the prefix chunks of interpretedChunk in order. For each one its
+ * text is appended to lemma and the orth of its first decoded
+ * interpretation is appended to orthPrefix.
+ */
+void InterpretedChunksDecoder4Generator::convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const {
+    for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
+        const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
+        lemma.insert(lemma.end(), prefixChunk.textStartPtr, prefixChunk.textEndPtr);
+        const unsigned char* ptr = prefixChunk.interpsPtr;
+        // NOTE(review): the accumulated orthPrefix is passed in, so
+        // mi.getOrth() already starts with it, and it is then appended to
+        // orthPrefix again. With two or more prefix chunks this looks like
+        // it duplicates the earlier prefixes - confirm whether string("")
+        // (or plain assignment) was intended.
+        MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr);
+        orthPrefix += mi.getOrth();
+    }
+}
+
+/**
+ * Reads one serialized interpretation at ptr and builds the corresponding
+ * MorphInterpretation.
+ *
+ * The orth is orthPrefix followed by the form produced by decodeForm from
+ * the chunk's codepoints. The lemma is passed through as given, with
+ * HOMONYM_SEPARATOR and the homonym id appended when the interpretation has
+ * a non-empty homonym id.
+ *
+ * @param ptr cursor into the serialized interpretations; advanced by
+ *        deserializeInterp
+ */
+MorphInterpretation InterpretedChunksDecoder4Generator::decodeMorphInterpretation(
+        unsigned int startNode, unsigned int endNode,
+        const string& orthPrefix,
+        const string& lemma,
+        const InterpretedChunk& chunk,
+        const unsigned char*& ptr) const {
+    string orth = orthPrefix;
+    EncodedInterpretation ei = this->deserializeInterp(ptr);
+    // Refill the member scratch buffer with the codepoints of the chunk text.
+    codepoints.clear();
+    const char* currPtr = chunk.textStartPtr;
+    while (currPtr != chunk.textEndPtr) {
+        uint32_t cp = env.getCharsetConverter().next(currPtr, chunk.textEndPtr);
+        codepoints.push_back(cp);
+    }
+    this->decodeForm(codepoints, ei.value, orth);
+    return MorphInterpretation(
+            startNode, endNode,
+            orth, ei.homonymId.empty() ?
lemma : (lemma + HOMONYM_SEPARATOR + ei.homonymId),
+            //            ei.homonymId,
+            ei.tag,
+            ei.nameClassifier,
+            ei.qualifiers,
+            env);
+}
+
+/**
+ * Appends to res the form described by orth, derived from the lemma
+ * codepoints: orth.prefixToAdd, then the lemma with its last
+ * orth.suffixToCut codepoints dropped, then orth.suffixToAdd.
+ *
+ * @param lemma codepoints the form is derived from
+ * @param orth encoded form (prefix to add, suffix length to cut, suffix to add)
+ * @param res output string; the decoded form is appended to it
+ */
+void InterpretedChunksDecoder4Generator::decodeForm(
+        const vector<uint32_t>& lemma,
+        const EncodedForm& orth,
+        string& res) const {
+    res += orth.prefixToAdd;
+    // lemma.size() - suffixToCut is evaluated in unsigned arithmetic, so a
+    // suffixToCut larger than the lemma would wrap to a huge bound and read
+    // far past the end of lemma. Clamp the kept length to zero instead.
+    const size_t suffixToCut = static_cast<size_t>(orth.suffixToCut);
+    const size_t keptLength = suffixToCut < lemma.size() ? lemma.size() - suffixToCut : 0;
+    for (size_t i = 0; i < keptLength; i++) {
+        env.getCharsetConverter().append(lemma[i], res);
+    }
+    // suffixToAdd is decoded with the UTF-8 converter and re-encoded with
+    // the environment's charset converter, codepoint by codepoint.
+    const char* suffixPtr = orth.suffixToAdd.c_str();
+    const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
+    while (suffixPtr != suffixEnd) {
+        uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
+        env.getCharsetConverter().append(cp, res);
+    }
+}
+
+/**
+ * Reads one serialized interpretation at ptr.
+ *
+ * Fields are consumed in this order: homonym id, prefix to add, suffix
+ * length to cut, suffix to add, tag, name classifier, qualifiers.
+ *
+ * @param ptr cursor into the serialized data; handed to the read* helpers,
+ *        which are assumed to advance it - TODO confirm
+ * @return the decoded interpretation
+ */
+EncodedInterpretation InterpretedChunksDecoder4Generator::deserializeInterp(const unsigned char*& ptr) const {
+    EncodedInterpretation interp;
+    interp.homonymId = readString(ptr);
+    interp.value.prefixToAdd = readString(ptr);
+    interp.value.suffixToCut = readInt8(ptr);
+    interp.value.suffixToAdd = readString(ptr);
+    interp.tag = readInt16(ptr);
+    interp.nameClassifier = readInt8(ptr);
+    interp.qualifiers = readInt16(ptr);
+    return interp;
+}
diff --git a/morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp b/morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp
new file mode 100644
index 0000000..f2a3b3f
--- /dev/null
+++ b/morfeusz/decoder/InterpretedChunksDecoder4Generator.hpp
@@ -0,0 +1,47 @@
+/*
+ * File:   InterpretedChunksDecoder4Generator.hpp
+ * Author: mlenart
+ *
+ * Created on 15 May 2014, 15:28
+ */
+
+#ifndef INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
+#define INTERPRETEDCHUNKSDECODER4GENERATOR_HPP
+
+#include "InterpretedChunksDecoder.hpp"
+
+/**
+ * InterpretedChunksDecoder subclass that turns interpreted chunks into
+ * MorphInterpretation objects in the generation direction: the chunk text
+ * is used as the lemma and the orthographic form is decoded from it.
+ */
+class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
+public:
+
+    InterpretedChunksDecoder4Generator(const Environment& env);
+
+    // Appends the interpretations decoded from interpretedChunk to out.
+    void decode(
+            unsigned int startNode,
+            unsigned int endNode,
+            const InterpretedChunk& interpretedChunk,
+            std::vector<MorphInterpretation>& out) const;
+
+private:
+
+    // Appends the prefix chunks' text to lemma and the orth of their first
+    // decoded interpretation to orthPrefix.
+    void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& orthPrefix, std::string& lemma) const;
+
+    // Reads one serialized interpretation at ptr (advancing it) and builds
+    // the MorphInterpretation whose orth is orthPrefix plus the decoded form.
+    MorphInterpretation decodeMorphInterpretation(
+            unsigned int startNode, unsigned int endNode,
+            const string& orthPrefix,
+            const string& lemma,
+            const InterpretedChunk& chunk,
+            const unsigned char*& ptr) const;
+
+    // Appends to res: orth.prefixToAdd, the lemma codepoints minus the last
+    // orth.suffixToCut ones, then orth.suffixToAdd.
+    void decodeForm(
+            const vector<uint32_t>& lemma,
+            const EncodedForm& orth,
+            string& res) const;
+
+    // Reads the fields of one serialized interpretation at ptr.
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const;
+
+    // Scratch buffer cleared and refilled by decodeMorphInterpretation on
+    // every call; mutable so the const decoding methods can reuse it.
+    mutable std::vector<uint32_t> codepoints;
+};
+
+
+#endif	/* INTERPRETEDCHUNKSDECODER4GENERATOR_HPP */
+
diff --git a/morfeusz/fsa/fsa.hpp b/morfeusz/fsa/fsa.hpp index a936f09..6693ba6 100644 --- a/morfeusz/fsa/fsa.hpp +++ b/morfeusz/fsa/fsa.hpp @@ -167,7 +167,7 @@ public: * Makes sense only for accepting states. * For non-accepting states is throws an exception. */ - T getValue() const; + const T& getValue() const; unsigned char getLastTransitionValue() const; diff --git a/morfeusz/fsa/state_impl.hpp b/morfeusz/fsa/state_impl.hpp index f4e81c5..68649b3 100644 --- a/morfeusz/fsa/state_impl.hpp +++ b/morfeusz/fsa/state_impl.hpp @@ -46,7 +46,7 @@ unsigned long State<T>::getOffset() const { } template <class T> -T State<T>::getValue() const { +const T& State<T>::getValue() const { assert(this->isAccepting()); return this->value; } diff --git a/morfeusz/morfeusz_analyzer.cpp b/morfeusz/morfeusz_analyzer.cpp index 03fb659..bca46c5 100644 --- a/morfeusz/morfeusz_analyzer.cpp +++ b/morfeusz/morfeusz_analyzer.cpp @@ -43,11 +43,20 @@ int main(int argc, const char** argv) { else if (prevStart != -1) { printf("; "); } - printf("%s", mi.toString(true).c_str()); -// printf("%d,%d,%s,%s,%s,%s", -// mi.getStartNode(), mi.getEndNode(), -// mi.getOrth().c_str(), lemmaToShow.c_str(), -// mi.getTag().c_str(), lemmaToShow.c_str()); +// printf("%s", mi.toString(true).c_str()); + printf("%d,%d,%s,%s,%s", + mi.getStartNode(), mi.getEndNode(),
mi.getOrth().c_str(), mi.getLemma().c_str(), + mi.getTag().c_str()); + if (!mi.getName().empty()) { + printf(",%s", mi.getName().c_str()); + } + if (!mi.getQualifiers().empty()) { + printf(",%s", mi.getQualifiers()[0].c_str()); + for (unsigned int i = 1; i < mi.getQualifiers().size(); i++) { + printf("|%s", mi.getQualifiers()[i].c_str()); + } + } prevStart = mi.getStartNode(); prevEnd = mi.getEndNode(); } diff --git a/morfeusz/segrules/SegrulesFSA.hpp b/morfeusz/segrules/SegrulesFSA.hpp index 7374d0f..2838ab3 100644 --- a/morfeusz/segrules/SegrulesFSA.hpp +++ b/morfeusz/segrules/SegrulesFSA.hpp @@ -34,12 +34,12 @@ public: std::vector<SegrulesState> proceedToNext( const unsigned char segnum, - const SegrulesState state, + const SegrulesState& state, bool atEndOfWord) const { std::vector<SegrulesState> res; const unsigned char* currPtr = ptr + state.offset + 1; const unsigned char transitionsNum = *currPtr++; - for (unsigned int i = 0; i < transitionsNum; i++) { + for (int i = 0; i < transitionsNum; i++) { if (*currPtr == segnum) { SegrulesState newState = this->transition2State(currPtr); if ((atEndOfWord && newState.accepting) diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index 5cd4235..a9c998f 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -130,6 +130,8 @@ </ccTool> </item> <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="build/morfeusz/morfeuszJAVA_wrap.cxx" ex="false" @@ -239,6 +241,7 @@ <pElem>build/morfeusz</pElem> </incDir> <preprocessorList> + <Elem>NDEBUG</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> @@ -283,7 +286,7 @@ <ccTool> <incDir> <pElem>morfeusz</pElem> - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> + <pElem>/usr/lib/jvm/default-java/include</pElem> </incDir> <preprocessorList> <Elem>NDEBUG</Elem> @@ -310,6 +313,19 @@ </undefinedList> </ccTool> </folder> + <item 
path="morfeusz/CasePatternHelper.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + <incDir> + <pElem>build</pElem> + <pElem>morfeusz</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + <preprocessorList> + <Elem>NDEBUG</Elem> + <Elem>libmorfeusz_EXPORTS</Elem> + </preprocessorList> + </ccTool> + </item> <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="1"> <incDir> @@ -387,40 +403,75 @@ </ccTool> </item> <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> + <ccTool flags="2"> <incDir> <pElem>build</pElem> <pElem>morfeusz</pElem> <pElem>build/morfeusz</pElem> </incDir> <preprocessorList> - <Elem>NDEBUG</Elem> <Elem>libmorfeusz_EXPORTS</Elem> </preprocessorList> </ccTool> </item> <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="2"> + </ccTool> </item> <item path="morfeusz/charset/CharsetConverter.cpp" ex="false" tool="1" flavor2="4"> - <ccTool flags="1"> - <preprocessorList> - <Elem>NDEBUG</Elem> - </preprocessorList> + <ccTool flags="2"> </ccTool> </item> <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="2"> + </ccTool> </item> <item path="morfeusz/charset/conversion_tables.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="2"> + </ccTool> </item> <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="2"> + <incDir> + <pElem>build</pElem> + <pElem>morfeusz</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + <preprocessorList> + <Elem>libmorfeusz_EXPORTS</Elem> + </preprocessorList> + </ccTool> + </item> + <item path="morfeusz/decoder/InterpretedChunksDecoder4Analyzer.cpp" + ex="false" + tool="1" + flavor2="4"> + <ccTool flags="1"> + <incDir> + <pElem>build</pElem> + <pElem>morfeusz</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + <preprocessorList> + 
<Elem>NDEBUG</Elem> + <Elem>libmorfeusz_EXPORTS</Elem> + </preprocessorList> + </ccTool> + </item> + <item path="morfeusz/decoder/InterpretedChunksDecoder4Generator.cpp" + ex="false" + tool="1" + flavor2="4"> <ccTool flags="1"> <incDir> <pElem>build</pElem> @@ -509,6 +560,8 @@ </ccTool> </item> <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> + <ccTool flags="1"> + </ccTool> </item> <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> <ccTool flags="0">
diff --git a/profile.sh b/profile.sh
new file mode 100755
index 0000000..8443833
--- /dev/null
+++ b/profile.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+#
+# Builds a Debug configuration of morfeusz in ./profbuild, linked against
+# libprofiler, and runs morfeusz_analyzer once with CPUPROFILE set so that a
+# CPU profile is written to /tmp/morfeusz.prof.
+# Run it from the source root (presumably the repository root): profbuild is
+# wiped and recreated here and the sources are referenced as ".." from inside it.
+
+# Stop at the first failing step. Without this a failed `cd`, `cmake` or
+# `make` would be ignored and the remaining commands would run in the wrong
+# directory or against a missing binary.
+set -euo pipefail
+
+rm -rf profbuild
+mkdir -p profbuild
+cd profbuild
+cmake -D INPUT_DICTIONARIES=../input/dodatki.tab,../input/PoliMorfSmall.tab -D CMAKE_BUILD_TYPE=Debug -D CMAKE_SHARED_LINKER_FLAGS="-lprofiler" -D CMAKE_EXE_LINKER_FLAGS="-lprofiler" ..
+make -j4
+rm -f /tmp/morfeusz.prof
+export LD_PRELOAD="/usr/lib/libprofiler.so"
+export CPUPROFILE="/tmp/morfeusz.prof"
+morfeusz/morfeusz_analyzer -i /tmp/dupadupa < /mnt/storage/morfeusz/sents10k > /dev/null
+# To inspect the recorded profile afterwards (paths relative to the source root):
+### pprof --gv profbuild/morfeusz/morfeusz_analyzer /tmp/morfeusz.prof