diff --git a/README b/README
index 6e08362..5f5a94f 100644
--- a/README
+++ b/README
@@ -5,7 +5,7 @@ Compilation - prerequisites
 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family).
 
-sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip
+sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing
 
 sudo pip install pyinstaller
 
 For cross compiling:
diff --git a/fsabuilder/CMakeLists.txt b/fsabuilder/CMakeLists.txt
index 0195673..919831b 100644
--- a/fsabuilder/CMakeLists.txt
+++ b/fsabuilder/CMakeLists.txt
@@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     list (APPEND PACKAGE_DEPENDS package-python-win-installer)
 
 #~ add_custom_target (buildfsa-exec ALL
-	#~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
-#~ )
+	#~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
+	#~ )
 #~ 
 #~ add_executable (morfeusz_builder IMPORTED)
 #~ add_dependencies (morfeusz_builder buildfsa-exec)
diff --git a/fsabuilder/buildanalyzer.sh b/fsabuilder/buildanalyzer.sh
index 26076e4..69667d1 100755
--- a/fsabuilder/buildanalyzer.sh
+++ b/fsabuilder/buildanalyzer.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
diff --git a/fsabuilder/buildgenerator.sh b/fsabuilder/buildgenerator.sh
index becc130..2f7f562 100755
--- a/fsabuilder/buildgenerator.sh
+++ b/fsabuilder/buildgenerator.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
-	--tagset-file=../input/polimorf.tagset \
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
+	--tagset-file=../input/sgjp-morfeusz.tagset \
 	--segments-file=../input/segmenty.dat \
 	--generator \
 	--serialization-method=V2 \
diff --git a/fsabuilder/morfeuszbuilder/fsa/common.py b/fsabuilder/morfeuszbuilder/fsa/common.py
index 28d0b17..10fadde 100644
--- a/fsabuilder/morfeuszbuilder/fsa/common.py
+++ b/fsabuilder/morfeuszbuilder/fsa/common.py
@@ -41,6 +41,11 @@ class EncodedForm4Generator(object):
         self.cutLength = bestEncodedForm.cutLength
         self.suffixToAdd = bestEncodedForm.suffixToAdd
         self.prefixToAdd = targetWord[:bestPrefixLength]
+
+#        if fromWord == 'BC':
+#            print self.cutLength
+#            print self.suffixToAdd
+#            print self.prefixToAdd, len(self.prefixToAdd)
 
 class Interpretation4Analyzer(object):
diff --git a/fsabuilder/morfeuszbuilder/fsa/encode.py b/fsabuilder/morfeuszbuilder/fsa/encode.py
index e41ab9a..8aecd71 100644
--- a/fsabuilder/morfeuszbuilder/fsa/encode.py
+++ b/fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -6,7 +6,7 @@ Created on Oct 23, 2013
 import logging
 import itertools
 
-from morfeuszbuilder.utils import serializationUtils
+from morfeuszbuilder.utils.serializationUtils import *
 
 class Encoder(object):
     '''
@@ -44,19 +44,6 @@ class Encoder(object):
         assert typenum >= 0 and typenum < 256
         return bytearray([typenum])
 
-    def _encodeEncodedForm(self, form, withCasePattern, withPrefix):
-        res = bytearray()
-        assert form.cutLength < 256 and form.cutLength >= 0
-        if withPrefix:
-            res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))
-            res.append(0)
-        res.append(form.cutLength)
-        res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))
-        res.append(0)
-        if withCasePattern:
-            res.extend(self._encodeCasePattern(form.casePattern))
-        return res
-
     def _encodeCasePattern(self, casePattern):
         res = bytearray()
         if True not in casePattern:
@@ -84,7 +71,7 @@ class Encoder(object):
                 n = len(self.qualifiersMap)
                 self.qualifiersMap[key] = n
             assert n < 500
-            res.extend(serializationUtils.htons(n))
+            res.extend(htons(n))
         return res
 
     def _hasUpperPrefix(self, casePattern):
@@ -102,11 +89,9 @@ class Encoder(object):
 
     def _encodeTagNum(self, tagnum):
         res = bytearray()
-#        logging.info((tagnum & 0xFF00) >> 8)
         assert tagnum < 65536 and tagnum >= 0
         res.append((tagnum & 0xFF00) >> 8)
         res.append(tagnum & 0x00FF)
-#        logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1])))
         return res
 
     def _encodeNameNum(self, namenum):
@@ -129,31 +114,37 @@ class Encoder(object):
             res.append(list(interp.orthCasePattern))
         return res
 
-    def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
         res = bytearray()
         res.extend(self._encodeTypeNum(typenum))
         encodedInterpsList = bytearray()
-        if withCasePattern:
+        if isAnalyzer:
             casePatterns = self._getOrthCasePatterns(interpsList)
             encodedInterpsList.append(len(casePatterns))
             for casePattern in casePatterns:
                 encodedInterpsList.extend(self._encodeCasePattern(casePattern))
         for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
-            if withHomonymId:
-                encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))
-                encodedInterpsList.append(0)
-            if withCasePattern:
+            if isAnalyzer:
                 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
-            encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
-            encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
-            encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
+            else:
+                serializeString(interp.homonymId, encodedInterpsList)
+            serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
+            encodedInterpsList.append(interp.encodedForm.cutLength)
+            serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
+            if isAnalyzer:
+                encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
+            encodedInterpsList.extend(htons(interp.tagnum))
+            encodedInterpsList.append(interp.namenum)
             encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
+
+            if interp.encodedForm.suffixToAdd == 'bc':
+                print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
 
-        res.extend(serializationUtils.htons(len(encodedInterpsList)))
+        res.extend(htons(len(encodedInterpsList)))
         res.extend(encodedInterpsList)
         return res
 
-    def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _doEncodeData(self, interpsList, isAnalyzer):
 
         assert type(interpsList) == frozenset
@@ -167,7 +158,7 @@ class Encoder(object):
         res.append(firstByte)
 
         for typenum, interpsList in segnum2Interps.iteritems():
-            res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId))
+            res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer))
         del interpsList
 
         return res
@@ -181,7 +172,7 @@ class MorphEncoder(Encoder):
         self.LEMMA_MIXED_CASE = 2
 
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False)
+        return self._doEncodeData(interpsList, isAnalyzer=True)
 
 class Encoder4Generator(Encoder):
@@ -189,4 +180,4 @@ class Encoder4Generator(Encoder):
         super(Encoder4Generator, self).__init__(False, encoding)
 
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True)
+        return self._doEncodeData(interpsList, isAnalyzer=False)
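
NOTE: with the isAnalyzer switch above, the analyzer and generator payloads written
by _encodeInterps4Type differ only in the two isAnalyzer branches. As a reading aid,
here is a minimal sketch of one generator-mode interpretation record, assuming the
helpers behave as defined in serializationUtils.py further down; the field values
are made up for illustration:

    # Sketch: byte layout of a single generator-mode interpretation record.
    def serializeString(string, out):
        out.extend(string.encode('utf8'))  # UTF-8 bytes of the string...
        out.append(0)                      # ...terminated by a NUL byte

    def htons(n):
        # big-endian (network order) uint16
        return bytearray([(n & 0xFF00) >> 8, n & 0x00FF])

    record = bytearray()
    serializeString(u'1', record)          # homonymId
    serializeString(u'naj', record)        # encodedForm.prefixToAdd
    record.append(2)                       # encodedForm.cutLength, one byte
    serializeString(u'szy', record)        # encodedForm.suffixToAdd
    record.extend(htons(123))              # tagnum
    record.append(0)                       # namenum, one byte
    record.extend(htons(1))                # qualifiers-set id

This is also the exact field order that the generator-side deserializeInterp in
InterpretedChunksDecoder.hpp reads back.
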
diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py
index 18bc957..f23a0bb 100644
--- a/fsabuilder/morfeuszbuilder/fsa/fsa.py
+++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -43,9 +43,6 @@ class FSA(object):
         # debug
         if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
             logging.info(u'%d %s' % (self.n, word))
-#        logging.info(str(self.register.getStatesNum()))
-#        logging.info(str(self.register.getStatesNum()))
-#        allWords.append(word)
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py
index ca46a6d..2252c60 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rules.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -56,6 +56,7 @@ class TagRule(SegmentRule):
 
     def __str__(self):
         res = self.segtype
+        res += '(' + str(self.segnum) + ')'
         if self.shiftOrth:
             res += '>'
         return res
@@ -70,8 +71,8 @@ class TagRule(SegmentRule):
 class UnaryRule(SegmentRule):
 
     def __init__(self, child, linenum):
+        super(UnaryRule, self).__init__(linenum)
         self.child = child
-        self.linenum = linenum
         assert not child.isSinkRule()
 
     def isShiftOrthRule(self):
@@ -80,8 +81,8 @@ class UnaryRule(SegmentRule):
 class ComplexRule(SegmentRule):
 
     def __init__(self, children, linenum):
+        super(ComplexRule, self).__init__(linenum)
         self.children = children
-        self.linenum = linenum
         assert not any(map(lambda c: c.isSinkRule(), children))
 
     def addToNFA(self, fsa):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
index 5da3482..1b86a0c 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,4 @@ class RulesFSA(object):
             res.extend(self.stateData2bytearray(state))
             res.extend(self.transitionsData2bytearray(state))
 
-#        logging.info('Segmentation automaton size: %d bytes', len(res))
-#        print list(res)
         return res
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
index 9abe88c..e73a6f9 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -49,6 +49,7 @@ class RulesManager(object):
             res.extend(self._serializeDFA(dfa))
         res.extend(self._serializeOptionsMap(self.defaultOptions))
         logging.info('segmentation rules size: %s bytes', len(res))
+#        logging.info([int(x) for x in res])
         return res
 
     def _serializeSeparatorsList(self):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
index 735407c..de06641 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -63,8 +63,8 @@ class RulesParser(object):
             nfa = rulesNFA.RulesNFA()
             if not firstNFA:
                 firstNFA = nfa
-            section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
-            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False)
+#            section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
+            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
             combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
             for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                 if rule.allowsEmptySequence():
@@ -72,8 +72,11 @@ class RulesParser(object):
                         filename,
                         rule.linenum,
                         'This rule allows empty segments sequence to be accepted')
-                rule.addToNFA(nfa)
-#                nfa.debug()
+                if self.rulesType == RulesParser.PARSE4GENERATOR:
+                    rule = rule.transformToGeneratorVersion()
+                if not rule.isSinkRule():
+                    rule.addToNFA(nfa)
+#                nfa.debug()
             try:
                 dfa = nfa.convertToDFA()
                 res.addDFA(key2Def, dfa)
@@ -146,10 +149,11 @@ class RulesParser(object):
         unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
         oneOfRule = delimitedList(unaryRule, delim='|')
         complexRule = unaryRule ^ oneOfRule
-        if self.rulesType == RulesParser.PARSE4ANALYZER:
-            concatRule = OneOrMore(complexRule)
-        else:
-            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
+        concatRule = OneOrMore(complexRule)
+#        if self.rulesType == RulesParser.PARSE4ANALYZER:
+#            concatRule = OneOrMore(complexRule)
+#        else:
+#            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
         rule << concatRule + Optional(CaselessLiteral('!weak'))
 
         tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
diff --git a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
index f8ffe0e..3a1cd54 100644
--- a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
+++ b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -22,3 +22,7 @@ def htonl(n):
     res.append((n & 0x0000FF00) >> 8)
     res.append(n & 0x000000FF)
     return res
+
+def serializeString(string, out):
+    out.extend(string.encode('utf8'))
+    out.append(0)
diff --git a/input/segmenty.dat b/input/segmenty.dat
index 67be1d4..1eb39f5 100644
--- a/input/segmenty.dat
+++ b/input/segmenty.dat
@@ -682,5 +682,3 @@ pref_dyw e-+:prefs
 # ;
 59
 
-
-[generator combinations]
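
NOTE: serializeString above is the Python counterpart of the C++ readString in
deserializationUtils.hpp: strings travel as NUL-terminated UTF-8. A throwaway
usage sketch:

    from morfeuszbuilder.utils.serializationUtils import serializeString

    buf = bytearray()
    serializeString(u'ab', buf)
    serializeString(u'', buf)   # an empty string still costs one NUL byte
    assert list(buf) == [97, 98, 0, 0]

One consequence of the NUL framing: a string containing U+0000 would be silently
truncated on the C++ side, which stops reading at the first zero byte.
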
diff --git a/morfeusz/CasePatternHelper.hpp b/morfeusz/CasePatternHelper.hpp
index fec391f..86467ee 100644
--- a/morfeusz/CasePatternHelper.hpp
+++ b/morfeusz/CasePatternHelper.hpp
@@ -62,15 +62,6 @@ public:
         }
     }
 
-    const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {
-        const unsigned char* currPtr = ig.ptr;
-        unsigned char casePatternsNum = *currPtr++;
-        for (unsigned int i = 0; i < casePatternsNum; i++) {
-            deserializeOneCasePattern(currPtr);
-        }
-        return currPtr;
-    }
-
     std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
         std::vector<bool> res;
         uint8_t casePatternType = *ptr;
@@ -103,26 +94,6 @@ public:
         }
         return res;
     }
-
-//    bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
-//        if (this->caseSensitive) {
-//            for (unsigned int i = 0; i < chunks.size(); i++) {
-//                const InterpretedChunk& ic = chunks[i];
-//                const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
-//                std::vector<bool> casePattern;
-//                deserializeCasePattern(casePatternPtr, casePattern);
-//                if (!checkCasePattern(ic, casePattern)) {
-//                    return false;
-//                }
-//            }
-//        }
-//        return true;
-//    }
-
-//    void skipCasePattern(const unsigned char*& ptr) const {
-//        vector<bool> _dupa;
-//        deserializeCasePattern(ptr, _dupa);
-//    }
 
 private:
     bool caseSensitive;
diff --git a/morfeusz/InterpretedChunksDecoder.hpp b/morfeusz/InterpretedChunksDecoder.hpp
index 6b6f185..d37885a 100644
--- a/morfeusz/InterpretedChunksDecoder.hpp
+++ b/morfeusz/InterpretedChunksDecoder.hpp
@@ -40,18 +40,6 @@ public:
 
 protected:
 
-    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
-        EncodedInterpretation interp;
-        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
-        deserializeEncodedForm(ptr, interp.value);
-        interp.tag = readInt16(ptr);
-        interp.nameClassifier = *ptr++;
-        interp.qualifiers = readInt16(ptr);
-        return interp;
-    }
-
-    virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0;
-
     const Environment& env;
 };
 
@@ -106,6 +94,16 @@ protected:
         assert(encodedForm.casePattern.size() == 0);
         encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
     }
+
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
+        deserializeEncodedForm(ptr, interp.value);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = *ptr++;
+        interp.qualifiers = readInt16(ptr);
+        return interp;
+    }
 
 private:
 
     pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
@@ -176,7 +174,7 @@ public:
         const unsigned char* currPtr = interpretedChunk.interpsPtr;
         while (currPtr < interpretedChunk.interpsEndPtr) {
             MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
-            //            cerr << mi.toString(false) << endl;
+//            cerr << mi.toString(false) << endl;
 //            cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
             if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) {
                 out.push_back(mi);
@@ -203,15 +201,12 @@ private:
             const InterpretedChunk& chunk,
             const unsigned char*& ptr) const {
         string orth = orthPrefix;
-        string homonymId = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
         EncodedInterpretation ei = this->deserializeInterp(ptr);
         this->decodeForm(chunk.originalCodepoints, ei.value, orth);
-        //        string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId);
         return MorphInterpretation(
                 startNode, endNode,
                 orth, lemma,
-                homonymId,
+                ei.homonymId,
                 ei.tag,
                 ei.nameClassifier,
                 ei.qualifiers,
@@ -233,14 +228,17 @@ private:
             env.getCharsetConverter().append(cp, res);
         }
     }
-
-    void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const {
-        encodedForm.prefixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
-        encodedForm.suffixToCut = *ptr;
-        ptr++;
-        encodedForm.suffixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
+
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.homonymId = readString(ptr);
+        interp.value.prefixToAdd = readString(ptr);
+        interp.value.suffixToCut = readInt8(ptr);
+        interp.value.suffixToAdd = readString(ptr);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = readInt8(ptr);
+        interp.qualifiers = readInt16(ptr);
+        return interp;
     }
 };
diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp
index 234ab90..b142d4f 100644
--- a/morfeusz/Morfeusz.cpp
+++ b/morfeusz/Morfeusz.cpp
@@ -18,6 +18,7 @@
 #include "charset/CaseConverter.hpp"
 #include "segrules/segrules.hpp"
 #include "const.hpp"
+#include "deserializationUtils.hpp"
 #include "charset/utf8.h"
 
 // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba
@@ -40,6 +41,20 @@ options(createDefaultOptions()) {
     generatorEnv.setCaseSensitive(false);
 }
 
+inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
+    if (env.getProcessorType() == ANALYZER) {
+        const unsigned char* currPtr = ig.ptr;
+        unsigned char casePatternsNum = *currPtr++;
+        for (unsigned int i = 0; i < casePatternsNum; i++) {
+            env.getCasePatternHelper().deserializeOneCasePattern(currPtr);
+        }
+        return currPtr;
+    }
+    else {
+        return ig.ptr;
+    }
+}
+
 void Morfeusz::setAnalyzerFile(const string& filename) {
     this->analyzerEnv.setFSAFile(filename);
 }
@@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord(
             it != newSegrulesStates.end();
             ++it) {
         SegrulesState newSegrulesState = *it;
-        const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig);
+        const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
         const unsigned char* interpsEndPtr = ig.ptr + ig.size;
         InterpretedChunk ic = {
             ig.type,
diff --git a/morfeusz/Qualifiers.cpp b/morfeusz/Qualifiers.cpp
index d9cf171..b76b3dd 100644
--- a/morfeusz/Qualifiers.cpp
+++ b/morfeusz/Qualifiers.cpp
@@ -20,7 +20,6 @@ qualifiers() {
     readTags(currPtr, _dupa);
     _dupa.clear();
     readTags(currPtr, _dupa);
-
     uint16_t allCombinationsSize = readInt16(currPtr);
     this->qualifiers.reserve(allCombinationsSize);
     for (unsigned int i = 0; i < allCombinationsSize; i++) {
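
NOTE: the generator-side deserializeInterp above consumes fields in exactly the
order the Python encoder emits them. A runnable Python rendering of that read
path, applied to hypothetical record bytes, may make the layout easier to check:

    # Parse one generator-mode interpretation record (bytes are made up).
    def read_string(buf, pos):
        end = buf.index(0, pos)             # strings are NUL-terminated UTF-8
        return buf[pos:end].decode('utf8'), end + 1

    def read_uint16(buf, pos):              # big-endian, as htons wrote it
        return (buf[pos] << 8) | buf[pos + 1], pos + 2

    record = bytearray([49, 0,              # homonymId '1'
                        110, 97, 106, 0,    # prefixToAdd 'naj'
                        2,                  # suffixToCut
                        115, 122, 121, 0,   # suffixToAdd 'szy'
                        0, 123,             # tag = 123
                        0,                  # nameClassifier
                        0, 1])              # qualifiers = 1
    pos = 0
    homonym_id, pos = read_string(record, pos)
    prefix, pos = read_string(record, pos)
    suffix_to_cut, pos = record[pos], pos + 1
    suffix, pos = read_string(record, pos)
    tag, pos = read_uint16(record, pos)
    name_classifier, pos = record[pos], pos + 1
    qualifiers, pos = read_uint16(record, pos)
    assert (homonym_id, prefix, suffix_to_cut, suffix) == (u'1', u'naj', 2, u'szy')
    assert (tag, name_classifier, qualifiers) == (123, 0, 1)
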
diff --git a/morfeusz/deserializationUtils.hpp b/morfeusz/deserializationUtils.hpp
index d993d9f..b198a99 100644
--- a/morfeusz/deserializationUtils.hpp
+++ b/morfeusz/deserializationUtils.hpp
@@ -11,14 +11,24 @@
 #include "endianness.hpp"
 #include <iostream>
 
+inline unsigned char readInt8(const unsigned char*& currPtr) {
+    return *currPtr++;
+}
+
 inline uint16_t readInt16(const unsigned char*& currPtr) {
-    uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr));
+    uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr));
     currPtr += 2;
     return res;
 }
 
+inline uint32_t readInt32(const unsigned char*& currPtr) {
+    uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
+    currPtr += 4;
+    return res;
+}
+
 inline std::string readString(const unsigned char*& currPtr) {
-    std::string res(reinterpret_cast<const char*>(currPtr));
+    std::string res((const char*) currPtr);
     currPtr += res.length();
     currPtr++;
     return res;
diff --git a/morfeusz/segrules/SegrulesFSA.hpp b/morfeusz/segrules/SegrulesFSA.hpp
index 70684b1..873a612 100644
--- a/morfeusz/segrules/SegrulesFSA.hpp
+++ b/morfeusz/segrules/SegrulesFSA.hpp
@@ -9,7 +9,8 @@
 #define SEGRULESFSA_HPP
 
 #include <set>
-#include "../endianness.hpp"
+#include <iostream>
+#include "../deserializationUtils.hpp"
 
 struct SegrulesState {
     uint16_t offset;
@@ -37,8 +38,7 @@ public:
         const unsigned char* currPtr = ptr + state.offset;
         currPtr++;
-        const unsigned char transitionsNum = *currPtr;
-        currPtr++;
+        const unsigned char transitionsNum = *currPtr++;
         for (unsigned int i = 0; i < transitionsNum; i++) {
             if (*currPtr == segnum) {
                 newStates.insert(newStates.begin(), this->transition2State(currPtr));
@@ -58,9 +58,8 @@ private:
         unsigned char WEAK_FLAG = 2;
         SegrulesState res;
         transitionPtr++;
-        res.shiftOrthFromPrevious = *transitionPtr;
-        transitionPtr++;
-        res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
+        res.shiftOrthFromPrevious = *transitionPtr++;
+        res.offset = readInt16(transitionPtr);
         res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
         res.weak = *(ptr + res.offset) & WEAK_FLAG;
         return res;
diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp
index 90aa9aa..7bd0619 100644
--- a/morfeusz/segrules/segrules.cpp
+++ b/morfeusz/segrules/segrules.cpp
@@ -2,25 +2,12 @@
 #include "segrules.hpp"
 #include "../fsa/fsa.hpp"
 #include "../fsa/const.hpp"
+#include "../deserializationUtils.hpp"
 
 using namespace std;
 
-static inline uint32_t deserializeUint32(const unsigned char*& ptr) {
-    uint32_t res = *reinterpret_cast<const uint32_t*>(ptr);
-    res = htonl(res);
-    ptr += 4;
-    return res;
-}
-
-static inline string deserializeString(const unsigned char*& ptr) {
-    string res(reinterpret_cast<const char*>(ptr));
-    ptr += res.length() + 1;
-    return res;
-}
-
 static inline void skipSeparatorsList(const unsigned char*& ptr) {
-    uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
-    ptr += 2;
+    uint16_t listSize = readInt16(ptr);
     ptr += 4 * listSize;
 }
 
@@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr
     const unsigned char* additionalDataPtr = ptr
             + FSA_DATA_OFFSET
             + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
-    const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
+    const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
     return res;
 }
 
@@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
     unsigned char optsNum = *ptr;
     ptr++;
     for (unsigned char i = 0; i < optsNum; i++) {
-        string key = deserializeString(ptr);
-        res[key] = deserializeString(ptr);
+        string key = readString(ptr);
+        res[key] = readString(ptr);
     }
     return res;
 }
 
 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
-    uint32_t fsaSize = deserializeUint32(ptr);
+    uint32_t fsaSize = readInt32(ptr);
 //    static SegrulesDeserializer deserializer;
     SegrulesFSA* res = new SegrulesFSA(ptr);
     ptr += fsaSize;
diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml
index 22cb4c4..cc0e348 100644
--- a/nbproject/configurations.xml
+++ b/nbproject/configurations.xml
@@ -105,7 +105,7 @@
           <buildCommandWorkingDir>build</buildCommandWorkingDir>
           <buildCommand>${MAKE} -f Makefile</buildCommand>
           <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
-          <executablePath>build/morfeusz/morfeusz_analyzer</executablePath>
+          <executablePath>build/morfeusz/morfeusz_generator</executablePath>
         </makeTool>
       </makefileType>
      <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
@@ -311,7 +311,7 @@
            <ccTool>
              <incDir>
                <pElem>morfeusz</pElem>
-                <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
+                <pElem>/usr/lib/jvm/default-java/include</pElem>
              </incDir>
              <preprocessorList>
                <Elem>libjmorfeusz_EXPORTS</Elem>
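
NOTE: readInt8/readInt16/readInt32 centralize what SegrulesFSA.hpp and segrules.cpp
previously did inline with reinterpret_cast plus htons/htonl: read a big-endian
integer and advance the cursor. (Strictly, ntohs/ntohl are the conventional names
on the read path, but they perform the same byte swap, so behaviour is identical.)
For reference, the same contract expressed in Python with the struct module, over
a hypothetical buffer:

    import struct

    buf = bytes(bytearray([0x01, 0x02, 0x00, 0x00, 0x00, 0x2a]))
    pos = 0
    (val16,) = struct.unpack_from('>H', buf, pos)   # big-endian uint16
    pos += 2
    (val32,) = struct.unpack_from('>I', buf, pos)   # big-endian uint32
    pos += 4
    assert (val16, val32) == (0x0102, 42)
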