nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

nowsza wersja generatora - teraz naprawdę jest lustrzanym odbiciem analizatora
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@166 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent d836a116
Showing 21 changed files with 117 additions and 139 deletions
README
fsabuilder/CMakeLists.txt
fsabuilder/buildanalyzer.sh
fsabuilder/buildgenerator.sh
fsabuilder/morfeuszbuilder/fsa/common.py
fsabuilder/morfeuszbuilder/fsa/encode.py
fsabuilder/morfeuszbuilder/fsa/fsa.py
fsabuilder/morfeuszbuilder/segrules/rules.py
fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
fsabuilder/morfeuszbuilder/segrules/rulesManager.py
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
fsabuilder/morfeuszbuilder/utils/serializationUtils.py
input/segmenty.dat
morfeusz/CasePatternHelper.hpp
morfeusz/InterpretedChunksDecoder.hpp
morfeusz/Morfeusz.cpp
morfeusz/Qualifiers.cpp
morfeusz/deserializationUtils.hpp
morfeusz/segrules/SegrulesFSA.hpp
morfeusz/segrules/segrules.cpp
@@ -5,7 +5,7 @@ Compilation - prerequisites
  
 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family).
  
-sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip
+sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing
 sudo pip install pyinstaller
  
 For cross compiling:
@@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     list (APPEND PACKAGE_DEPENDS package-python-win-installer)
  
     #~ add_custom_target (buildfsa-exec ALL
-    #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
-#~ )
+        #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
+    #~ )
 #~ 
 #~ add_executable (morfeusz_builder IMPORTED)
 #~ add_dependencies (morfeusz_builder buildfsa-exec)
 #!/bin/bash
  
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
 #!/bin/bash
  
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
-    --tagset-file=../input/polimorf.tagset \
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
+    --tagset-file=../input/sgjp-morfeusz.tagset \
     --segments-file=../input/segmenty.dat \
     --generator \
     --serialization-method=V2 \
@@ -41,6 +41,11 @@ class EncodedForm4Generator(object):
         self.cutLength = bestEncodedForm.cutLength
         self.suffixToAdd = bestEncodedForm.suffixToAdd
         self.prefixToAdd = targetWord[:bestPrefixLength]
+        
+#         if fromWord == 'BC':
+#             print self.cutLength
+#             print self.suffixToAdd
+#             print self.prefixToAdd, len(self.prefixToAdd)
  
 class Interpretation4Analyzer(object):
  
@@ -6,7 +6,7 @@ Created on Oct 23, 2013
  
 import logging
 import itertools
-from morfeuszbuilder.utils import serializationUtils
+from morfeuszbuilder.utils.serializationUtils import *
  
 class Encoder(object):
     '''
@@ -44,19 +44,6 @@ class Encoder(object):
         assert typenum >= 0 and typenum < 256
         return bytearray([typenum])
  
-    def _encodeEncodedForm(self, form, withCasePattern, withPrefix):
-        res = bytearray()
-        assert form.cutLength < 256 and form.cutLength >= 0
-        if withPrefix:
-            res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))
-            res.append(0)
-        res.append(form.cutLength)
-        res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))
-        res.append(0)
-        if withCasePattern:
-            res.extend(self._encodeCasePattern(form.casePattern))
-        return res
-    
     def _encodeCasePattern(self, casePattern):
         res = bytearray()
         if True not in casePattern:
@@ -84,7 +71,7 @@ class Encoder(object):
             n = len(self.qualifiersMap)
             self.qualifiersMap[key] = n
         assert n < 500
-        res.extend(serializationUtils.htons(n))
+        res.extend(htons(n))
         return res
  
     def _hasUpperPrefix(self, casePattern):
@@ -102,11 +89,9 @@ class Encoder(object):
  
     def _encodeTagNum(self, tagnum):
         res = bytearray()
-#         logging.info((tagnum & 0xFF00) >> 8)
         assert tagnum < 65536 and tagnum >= 0
         res.append((tagnum & 0xFF00) >> 8)
         res.append(tagnum & 0x00FF)
-#         logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1])))
         return res
  
     def _encodeNameNum(self, namenum):
@@ -129,31 +114,37 @@ class Encoder(object):
                 res.append(list(interp.orthCasePattern))
         return res
  
-    def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
         res = bytearray()
         res.extend(self._encodeTypeNum(typenum))
         encodedInterpsList = bytearray()
-        if withCasePattern:
+        if isAnalyzer:
             casePatterns = self._getOrthCasePatterns(interpsList)
             encodedInterpsList.append(len(casePatterns))
             for casePattern in casePatterns:
                 encodedInterpsList.extend(self._encodeCasePattern(casePattern))
         for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
-            if withHomonymId:
-                encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))
-                encodedInterpsList.append(0)
-            if withCasePattern:
+            if isAnalyzer:
                 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
-            encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
-            encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
-            encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
+            else:
+                serializeString(interp.homonymId, encodedInterpsList)
+                serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
+            encodedInterpsList.append(interp.encodedForm.cutLength)
+            serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
+            if isAnalyzer:
+                encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
+            encodedInterpsList.extend(htons(interp.tagnum))
+            encodedInterpsList.append(interp.namenum)
             encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
+            
+            if interp.encodedForm.suffixToAdd == 'bc':
+                print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
  
-        res.extend(serializationUtils.htons(len(encodedInterpsList)))
+        res.extend(htons(len(encodedInterpsList)))
         res.extend(encodedInterpsList)
         return res
  
-    def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _doEncodeData(self, interpsList, isAnalyzer):
  
         assert type(interpsList) == frozenset
  
@@ -167,7 +158,7 @@ class Encoder(object):
         res.append(firstByte)
  
         for typenum, interpsList in segnum2Interps.iteritems():
-            res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId))
+            res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer))
         del interpsList
  
         return res
@@ -181,7 +172,7 @@ class MorphEncoder(Encoder):
         self.LEMMA_MIXED_CASE = 2
  
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False)
+        return self._doEncodeData(interpsList, isAnalyzer=True)
  
 class Encoder4Generator(Encoder):
  
@@ -189,4 +180,4 @@ class Encoder4Generator(Encoder):
         super(Encoder4Generator, self).__init__(False, encoding)
  
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True)
+        return self._doEncodeData(interpsList, isAnalyzer=False)
@@ -43,9 +43,6 @@ class FSA(object):
         # debug
         if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
             logging.info(u'%d %s' % (self.n, word))
-#             logging.info(str(self.register.getStatesNum()))
-#             logging.info(str(self.register.getStatesNum()))
-    #             allWords.append(word)
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
  
@@ -56,6 +56,7 @@ class TagRule(SegmentRule):
  
     def __str__(self):
         res = self.segtype
+        res += '(' + str(self.segnum) + ')'
         if self.shiftOrth:
             res += '>'
         return res
@@ -70,8 +71,8 @@ class TagRule(SegmentRule):
 class UnaryRule(SegmentRule):
  
     def __init__(self, child, linenum):
+        super(UnaryRule, self).__init__(linenum)
         self.child = child
-        self.linenum = linenum
         assert not child.isSinkRule()
  
     def isShiftOrthRule(self):
@@ -80,8 +81,8 @@ class UnaryRule(SegmentRule):
 class ComplexRule(SegmentRule):
  
     def __init__(self, children, linenum):
+        super(ComplexRule, self).__init__(linenum)
         self.children = children
-        self.linenum = linenum
         assert not any(map(lambda c: c.isSinkRule(), children))
  
     def addToNFA(self, fsa):
@@ -68,6 +68,4 @@ class RulesFSA(object):
             res.extend(self.stateData2bytearray(state))
             res.extend(self.transitionsData2bytearray(state))
  
-#         logging.info('Segmentation automaton size: %d bytes', len(res))
-#         print list(res)
         return res
@@ -49,6 +49,7 @@ class RulesManager(object):
             res.extend(self._serializeDFA(dfa))
         res.extend(self._serializeOptionsMap(self.defaultOptions))
         logging.info('segmentation rules size: %s bytes', len(res))
+#         logging.info([int(x) for x in res])
         return res
  
     def _serializeSeparatorsList(self):
@@ -63,8 +63,8 @@ class RulesParser(object):
             nfa = rulesNFA.RulesNFA()
             if not firstNFA:
                 firstNFA = nfa
-            section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
-            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False)
+#             section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
+            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
             combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
             for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                 if rule.allowsEmptySequence():
@@ -72,8 +72,11 @@ class RulesParser(object):
                                                      filename, 
                                                      rule.linenum, 
                                                      'This rule allows empty segments sequence to be accepted')
-                rule.addToNFA(nfa)
-#                 nfa.debug()
+                if self.rulesType == RulesParser.PARSE4GENERATOR:
+                    rule = rule.transformToGeneratorVersion()
+                if not rule.isSinkRule():
+                    rule.addToNFA(nfa)
+#             nfa.debug()
             try:
                 dfa = nfa.convertToDFA()
                 res.addDFA(key2Def, dfa)
@@ -146,10 +149,11 @@ class RulesParser(object):
         unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
         oneOfRule = delimitedList(unaryRule, delim='|')
         complexRule = unaryRule ^ oneOfRule
-        if self.rulesType == RulesParser.PARSE4ANALYZER:
-            concatRule = OneOrMore(complexRule)
-        else:
-            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
+        concatRule = OneOrMore(complexRule)
+#         if self.rulesType == RulesParser.PARSE4ANALYZER:
+#             concatRule = OneOrMore(complexRule)
+#         else:
+#             concatRule = ZeroOrMore(shiftOrthRule) + tagRule
         rule << concatRule + Optional(CaselessLiteral('!weak'))
  
         tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
@@ -22,3 +22,7 @@ def htonl(n):
     res.append((n & 0x0000FF00) >> 8)
     res.append(n & 0x000000FF)
     return res
+
+def serializeString(string, out):
+    out.extend(string.encode('utf8'))
+    out.append(0)
@@ -682,5 +682,3 @@ pref_dyw	e-+:prefs
  
 # ;
 59
-
-[generator combinations]
@@ -62,15 +62,6 @@ public:
         }
     }
  
-    const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {
-        const unsigned char* currPtr = ig.ptr;
-        unsigned char casePatternsNum = *currPtr++;
-        for (unsigned int i = 0; i < casePatternsNum; i++) {
-            deserializeOneCasePattern(currPtr);
-        }
-        return currPtr;
-    }
-    
     std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
         std::vector<bool> res;
         uint8_t casePatternType = *ptr;
@@ -103,26 +94,6 @@ public:
         }
         return res;
     }
-
-//    bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
-//        if (this->caseSensitive) {
-//            for (unsigned int i = 0; i < chunks.size(); i++) {
-//                const InterpretedChunk& ic = chunks[i];
-//                const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
-//                std::vector<bool> casePattern;
-//                deserializeCasePattern(casePatternPtr, casePattern);
-//                if (!checkCasePattern(ic, casePattern)) {
-//                    return false;
-//                }
-//            }
-//        }
-//        return true;
-//    }
-
-//    void skipCasePattern(const unsigned char*& ptr) const {
-//        vector<bool> _dupa;
-//        deserializeCasePattern(ptr, _dupa);
-//    }
 private:
     bool caseSensitive;
  
@@ -40,18 +40,6 @@ public:
  
 protected:
  
-    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
-        EncodedInterpretation interp;
-        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
-        deserializeEncodedForm(ptr, interp.value);
-        interp.tag = readInt16(ptr);
-        interp.nameClassifier = *ptr++;
-        interp.qualifiers = readInt16(ptr);
-        return interp;
-    }
-
-    virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0;
-
     const Environment& env;
 };
  
@@ -106,6 +94,16 @@ protected:
         assert(encodedForm.casePattern.size() == 0);
         encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
     }
+    
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
+        deserializeEncodedForm(ptr, interp.value);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = *ptr++;
+        interp.qualifiers = readInt16(ptr);
+        return interp;
+    }
 private:
  
     pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
@@ -176,7 +174,7 @@ public:
         const unsigned char* currPtr = interpretedChunk.interpsPtr;
         while (currPtr < interpretedChunk.interpsEndPtr) {
             MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
-            //            cerr << mi.toString(false) << endl;
+//                        cerr << mi.toString(false) << endl;
             //            cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
             if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) {
                 out.push_back(mi);
@@ -203,15 +201,12 @@ private:
             const InterpretedChunk& chunk,
             const unsigned char*& ptr) const {
         string orth = orthPrefix;
-        string homonymId = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
         EncodedInterpretation ei = this->deserializeInterp(ptr);
         this->decodeForm(chunk.originalCodepoints, ei.value, orth);
-        //        string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId);
         return MorphInterpretation(
                 startNode, endNode,
                 orth, lemma,
-                homonymId,
+                ei.homonymId,
                 ei.tag,
                 ei.nameClassifier,
                 ei.qualifiers,
@@ -233,14 +228,17 @@ private:
             env.getCharsetConverter().append(cp, res);
         }
     }
-
-    void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const {
-        encodedForm.prefixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
-        encodedForm.suffixToCut = *ptr;
-        ptr++;
-        encodedForm.suffixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
+    
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.homonymId = readString(ptr);
+        interp.value.prefixToAdd = readString(ptr);
+        interp.value.suffixToCut = readInt8(ptr);
+        interp.value.suffixToAdd = readString(ptr);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = readInt8(ptr);
+        interp.qualifiers = readInt16(ptr);
+        return interp;
     }
 };
  
@@ -18,6 +18,7 @@
 #include "charset/CaseConverter.hpp"
 #include "segrules/segrules.hpp"
 #include "const.hpp"
+#include "deserializationUtils.hpp"
 #include "charset/utf8.h"
  
 // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba
@@ -40,6 +41,20 @@ options(createDefaultOptions()) {
     generatorEnv.setCaseSensitive(false);
 }
  
+inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
+    if (env.getProcessorType() == ANALYZER) {
+        const unsigned char* currPtr = ig.ptr;
+        unsigned char casePatternsNum = *currPtr++;
+        for (unsigned int i = 0; i < casePatternsNum; i++) {
+            env.getCasePatternHelper().deserializeOneCasePattern(currPtr);
+        }
+        return currPtr;
+    }
+    else {
+        return ig.ptr;
+    }
+}
+
 void Morfeusz::setAnalyzerFile(const string& filename) {
     this->analyzerEnv.setFSAFile(filename);
 }
@@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord(
                             it != newSegrulesStates.end();
                             ++it) {
                         SegrulesState newSegrulesState = *it;
-                        const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig);
+                        const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
                         const unsigned char* interpsEndPtr = ig.ptr + ig.size;
                         InterpretedChunk ic = {
                             ig.type,
@@ -20,7 +20,6 @@ qualifiers() {
     readTags(currPtr, _dupa);
     _dupa.clear();
     readTags(currPtr, _dupa);
-    
     uint16_t allCombinationsSize = readInt16(currPtr);
     this->qualifiers.reserve(allCombinationsSize);
     for (unsigned int i = 0; i < allCombinationsSize; i++) {
@@ -11,14 +11,24 @@
 #include "endianness.hpp"
 #include <iostream>
  
+inline unsigned char readInt8(const unsigned char*& currPtr) {
+    return *currPtr++;
+}
+
 inline uint16_t readInt16(const unsigned char*& currPtr) {
-    uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr));
+    uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr));
     currPtr += 2;
     return res;
 }
  
+inline uint32_t readInt32(const unsigned char*& currPtr) {
+    uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
+    currPtr += 4;
+    return res;
+}
+
 inline std::string readString(const unsigned char*& currPtr) {
-    std::string res(reinterpret_cast<const char*>(currPtr));
+    std::string res((const char*) currPtr);
     currPtr += res.length();
     currPtr++;
     return res;
@@ -9,7 +9,8 @@
 #define	SEGRULESFSA_HPP
  
 #include <set>
-#include "../endianness.hpp"
+#include <iostream>
+#include "../deserializationUtils.hpp"
  
 struct SegrulesState {
     uint16_t offset;
@@ -37,8 +38,7 @@ public:
  
         const unsigned char* currPtr = ptr + state.offset;
         currPtr++;
-        const unsigned char transitionsNum = *currPtr;
-        currPtr++;
+        const unsigned char transitionsNum = *currPtr++;
         for (unsigned int i = 0; i < transitionsNum; i++) {
             if (*currPtr == segnum) {
                 newStates.insert(newStates.begin(), this->transition2State(currPtr));
@@ -58,9 +58,8 @@ private:
         unsigned char WEAK_FLAG = 2;
         SegrulesState res;
         transitionPtr++;
-        res.shiftOrthFromPrevious = *transitionPtr;
-        transitionPtr++;
-        res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
+        res.shiftOrthFromPrevious = *transitionPtr++;
+        res.offset = readInt16(transitionPtr);
         res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
         res.weak = *(ptr + res.offset) & WEAK_FLAG;
         return res;
@@ -2,25 +2,12 @@
 #include "segrules.hpp"
 #include "../fsa/fsa.hpp"
 #include "../fsa/const.hpp"
+#include "../deserializationUtils.hpp"
  
 using namespace std;
  
-static inline uint32_t deserializeUint32(const unsigned char*& ptr) {
-    uint32_t res = *reinterpret_cast<const uint32_t*>(ptr);
-    res = htonl(res);
-    ptr += 4;
-    return res;
-}
-
-static inline string deserializeString(const unsigned char*& ptr) {
-    string res(reinterpret_cast<const char*>(ptr));
-    ptr += res.length() + 1;
-    return res;
-}
-
 static inline void skipSeparatorsList(const unsigned char*& ptr) {
-    uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
-    ptr += 2;
+    uint16_t listSize = readInt16(ptr);
     ptr += 4 * listSize;
 }
  
@@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr
     const unsigned char* additionalDataPtr = ptr 
         + FSA_DATA_OFFSET 
         + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
-    const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
+    const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
     return res;
 }
  
@@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
     unsigned char optsNum = *ptr;
     ptr++;
     for (unsigned char i = 0; i < optsNum; i++) {
-        string key = deserializeString(ptr);
-        res[key] = deserializeString(ptr);
+        string key = readString(ptr);
+        res[key] = readString(ptr);
     }
     return res;
 }
  
 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
-    uint32_t fsaSize = deserializeUint32(ptr);
+    uint32_t fsaSize = readInt32(ptr);
 //    static SegrulesDeserializer deserializer;
     SegrulesFSA* res = new SegrulesFSA(ptr);
     ptr += fsaSize;
@@ -105,7 +105,7 @@
           <buildCommandWorkingDir>build</buildCommandWorkingDir>
           <buildCommand>${MAKE} -f Makefile</buildCommand>
           <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
-          <executablePath>build/morfeusz/morfeusz_analyzer</executablePath>
+          <executablePath>build/morfeusz/morfeusz_generator</executablePath>
         </makeTool>
       </makefileType>
       <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
@@ -311,7 +311,7 @@
         <ccTool>
           <incDir>
             <pElem>morfeusz</pElem>
-            <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
+            <pElem>/usr/lib/jvm/default-java/include</pElem>
           </incDir>
           <preprocessorList>
             <Elem>libjmorfeusz_EXPORTS</Elem>