From aece7355ccc6dec55f688a43c5c8bcd911c7b9ca Mon Sep 17 00:00:00 2001
From: Michał Lenart <michall@ipipan.waw.pl>
Date: Thu, 17 Apr 2014 17:02:21 +0000
Subject: [PATCH] newer version of the generator - now it really is a mirror image of the analyzer

---
 README                                                 |  2 +-
 fsabuilder/CMakeLists.txt                              |  4 ++--
 fsabuilder/buildanalyzer.sh                            |  2 +-
 fsabuilder/buildgenerator.sh                           |  4 ++--
 fsabuilder/morfeuszbuilder/fsa/common.py               |  5 +++++
 fsabuilder/morfeuszbuilder/fsa/encode.py               | 53 ++++++++++++++++++++++-------------------------------
 fsabuilder/morfeuszbuilder/fsa/fsa.py                  |  3 ---
 fsabuilder/morfeuszbuilder/segrules/rules.py           |  5 +++--
 fsabuilder/morfeuszbuilder/segrules/rulesFSA.py        |  2 --
 fsabuilder/morfeuszbuilder/segrules/rulesManager.py    |  1 +
 fsabuilder/morfeuszbuilder/segrules/rulesParser.py     | 20 ++++++++++++--------
 fsabuilder/morfeuszbuilder/utils/serializationUtils.py |  4 ++++
 input/segmenty.dat                                     |  2 --
 morfeusz/CasePatternHelper.hpp                         | 29 -----------------------------
 morfeusz/InterpretedChunksDecoder.hpp                  | 48 +++++++++++++++++++++++-------------------------
 morfeusz/Morfeusz.cpp                                  | 17 ++++++++++++++++-
 morfeusz/Qualifiers.cpp                                |  1 -
 morfeusz/deserializationUtils.hpp                      | 14 ++++++++++++--
 morfeusz/segrules/SegrulesFSA.hpp                      | 11 +++++------
 morfeusz/segrules/segrules.cpp                         | 25 ++++++-------------------
 nbproject/configurations.xml                           |  4 ++--
 21 files changed, 117 insertions(+), 139 deletions(-)

diff --git a/README b/README
index 6e08362..5f5a94f 100644
--- a/README
+++ b/README
@@ -5,7 +5,7 @@ Compilation - prerequisites
 
 This tutorial assumes that build process is performed on Linux 64bit machine (preferably from Debian/Ubuntu family).
 
-sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip
+sudo apt-get install build-essential autotools-dev python python-setuptools python-stdeb python-pip python-all-dev python-pyparsing
 sudo pip install pyinstaller
 
 For cross compiling:
diff --git a/fsabuilder/CMakeLists.txt b/fsabuilder/CMakeLists.txt
index 0195673..919831b 100644
--- a/fsabuilder/CMakeLists.txt
+++ b/fsabuilder/CMakeLists.txt
@@ -55,8 +55,8 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     list (APPEND PACKAGE_DEPENDS package-python-win-installer)
     
     #~ add_custom_target (buildfsa-exec ALL
-    #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
-#~ )
+        #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder
+    #~ )
 #~ 
 #~ add_executable (morfeusz_builder IMPORTED)
 #~ add_dependencies (morfeusz_builder buildfsa-exec)
diff --git a/fsabuilder/buildanalyzer.sh b/fsabuilder/buildanalyzer.sh
index 26076e4..69667d1 100755
--- a/fsabuilder/buildanalyzer.sh
+++ b/fsabuilder/buildanalyzer.sh
@@ -1,3 +1,3 @@
 #!/bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab --tagset-file=../input/sgjp-morfeusz.tagset --segments-file=../input/segmenty.dat --analyzer --serialization-method=V2 --trim-supneg -o $1
diff --git a/fsabuilder/buildgenerator.sh b/fsabuilder/buildgenerator.sh
index becc130..2f7f562 100755
--- a/fsabuilder/buildgenerator.sh
+++ b/fsabuilder/buildgenerator.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
-python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \
-    --tagset-file=../input/polimorf.tagset \
+python morfeusz_builder --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
+    --tagset-file=../input/sgjp-morfeusz.tagset \
     --segments-file=../input/segmenty.dat \
     --generator \
     --serialization-method=V2 \
diff --git a/fsabuilder/morfeuszbuilder/fsa/common.py b/fsabuilder/morfeuszbuilder/fsa/common.py
index 28d0b17..10fadde 100644
--- a/fsabuilder/morfeuszbuilder/fsa/common.py
+++ b/fsabuilder/morfeuszbuilder/fsa/common.py
@@ -41,6 +41,11 @@ class EncodedForm4Generator(object):
         self.cutLength = bestEncodedForm.cutLength
         self.suffixToAdd = bestEncodedForm.suffixToAdd
         self.prefixToAdd = targetWord[:bestPrefixLength]
+        
+#         if fromWord == 'BC':
+#             print self.cutLength
+#             print self.suffixToAdd
+#             print self.prefixToAdd, len(self.prefixToAdd)
 
 class Interpretation4Analyzer(object):
     
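The three fields of EncodedForm4Generator describe how to rebuild the target form from the source word. A minimal sketch of the assumed semantics (apply_encoded_form is an illustrative helper, not part of the codebase): cutLength characters are dropped from the end of the source word, then prefixToAdd and suffixToAdd are glued around the remainder.

    # Illustrative only: assumed semantics of the EncodedForm4Generator fields.
    def apply_encoded_form(from_word, prefix_to_add, cut_length, suffix_to_add):
        stem = from_word[:len(from_word) - cut_length]
        return prefix_to_add + stem + suffix_to_add

    # e.g. apply_encoded_form('kot', '', 1, 'tem') -> 'kotem'
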
diff --git a/fsabuilder/morfeuszbuilder/fsa/encode.py b/fsabuilder/morfeuszbuilder/fsa/encode.py
index e41ab9a..8aecd71 100644
--- a/fsabuilder/morfeuszbuilder/fsa/encode.py
+++ b/fsabuilder/morfeuszbuilder/fsa/encode.py
@@ -6,7 +6,7 @@ Created on Oct 23, 2013
 
 import logging
 import itertools
-from morfeuszbuilder.utils import serializationUtils
+from morfeuszbuilder.utils.serializationUtils import *
 
 class Encoder(object):
     '''
@@ -44,19 +44,6 @@ class Encoder(object):
         assert typenum >= 0 and typenum < 256
         return bytearray([typenum])
     
-    def _encodeEncodedForm(self, form, withCasePattern, withPrefix):
-        res = bytearray()
-        assert form.cutLength < 256 and form.cutLength >= 0
-        if withPrefix:
-            res.extend(self.encodeWord(form.prefixToAdd, lowercase=False))
-            res.append(0)
-        res.append(form.cutLength)
-        res.extend(self.encodeWord(form.suffixToAdd, lowercase=False))
-        res.append(0)
-        if withCasePattern:
-            res.extend(self._encodeCasePattern(form.casePattern))
-        return res
-    
     def _encodeCasePattern(self, casePattern):
         res = bytearray()
         if True not in casePattern:
@@ -84,7 +71,7 @@ class Encoder(object):
             n = len(self.qualifiersMap)
             self.qualifiersMap[key] = n
         assert n < 500
-        res.extend(serializationUtils.htons(n))
+        res.extend(htons(n))
         return res
     
     def _hasUpperPrefix(self, casePattern):
@@ -102,11 +89,9 @@ class Encoder(object):
     
     def _encodeTagNum(self, tagnum):
         res = bytearray()
-#         logging.info((tagnum & 0xFF00) >> 8)
         assert tagnum < 65536 and tagnum >= 0
         res.append((tagnum & 0xFF00) >> 8)
         res.append(tagnum & 0x00FF)
-#         logging.info('%d %s %s' % (tagnum, hex(res[0]), hex(res[1])))
         return res
     
     def _encodeNameNum(self, namenum):
@@ -129,31 +114,37 @@ class Encoder(object):
                 res.append(list(interp.orthCasePattern))
         return res
     
-    def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _encodeInterps4Type(self, typenum, interpsList, isAnalyzer):
         res = bytearray()
         res.extend(self._encodeTypeNum(typenum))
         encodedInterpsList = bytearray()
-        if withCasePattern:
+        if isAnalyzer:
             casePatterns = self._getOrthCasePatterns(interpsList)
             encodedInterpsList.append(len(casePatterns))
             for casePattern in casePatterns:
                 encodedInterpsList.extend(self._encodeCasePattern(casePattern))
         for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
-            if withHomonymId:
-                encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False))
-                encodedInterpsList.append(0)
-            if withCasePattern:
+            if isAnalyzer:
                 encodedInterpsList.extend(self._encodeCasePattern(interp.orthCasePattern))
-            encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
-            encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
-            encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
+            else:
+                serializeString(interp.homonymId, encodedInterpsList)
+                serializeString(interp.encodedForm.prefixToAdd, encodedInterpsList)
+            encodedInterpsList.append(interp.encodedForm.cutLength)
+            serializeString(interp.encodedForm.suffixToAdd, encodedInterpsList)
+            if isAnalyzer:
+                encodedInterpsList.extend(self._encodeCasePattern(interp.encodedForm.casePattern))
+            encodedInterpsList.extend(htons(interp.tagnum))
+            encodedInterpsList.append(interp.namenum)
             encodedInterpsList.extend(self._encodeQualifiers(interp.qualifiers))
+            
+            if interp.encodedForm.suffixToAdd == 'bc':
+                print len(interpsList), interp.encodedForm.suffixToAdd, [int(x) for x in encodedInterpsList]
         
-        res.extend(serializationUtils.htons(len(encodedInterpsList)))
+        res.extend(htons(len(encodedInterpsList)))
         res.extend(encodedInterpsList)
         return res
     
-    def _doEncodeData(self, interpsList, withCasePattern, withPrefix, withHomonymId):
+    def _doEncodeData(self, interpsList, isAnalyzer):
         
         assert type(interpsList) == frozenset
         
@@ -167,7 +158,7 @@ class Encoder(object):
         res.append(firstByte)
         
         for typenum, interpsList in segnum2Interps.iteritems():
-            res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix, withHomonymId))
+            res.extend(self._encodeInterps4Type(typenum, interpsList, isAnalyzer))
         del interpsList
         
         return res
@@ -181,7 +172,7 @@ class MorphEncoder(Encoder):
         self.LEMMA_MIXED_CASE = 2
     
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False, withHomonymId=False)
+        return self._doEncodeData(interpsList, isAnalyzer=True)
 
 class Encoder4Generator(Encoder):
     
@@ -189,4 +180,4 @@ class Encoder4Generator(Encoder):
         super(Encoder4Generator, self).__init__(False, encoding)
     
     def encodeData(self, interpsList):
-        return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True, withHomonymId=True)
+        return self._doEncodeData(interpsList, isAnalyzer=False)
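With the withCasePattern/withPrefix/withHomonymId flags collapsed into a single isAnalyzer switch, one generator interpretation record is laid out as: 0-terminated homonym id, 0-terminated prefix, a cut-length byte, 0-terminated suffix, a big-endian 16-bit tag number, a name byte, and a big-endian 16-bit qualifiers id. A stand-alone sketch of a parser for that record (helper names are illustrative, not from the sources):

    # Illustrative parser for one generator interpretation record,
    # mirroring _encodeInterps4Type(..., isAnalyzer=False).
    import struct

    def read_cstring(data, pos):
        end = data.index(0, pos)          # strings are 0-terminated UTF-8
        return data[pos:end].decode('utf8'), end + 1

    def parse_generator_interp(data, pos):
        homonym_id, pos = read_cstring(data, pos)
        prefix_to_add, pos = read_cstring(data, pos)
        cut_length = data[pos]; pos += 1
        suffix_to_add, pos = read_cstring(data, pos)
        tagnum, = struct.unpack_from('>H', data, pos); pos += 2   # htons = big-endian
        namenum = data[pos]; pos += 1
        qualifiers, = struct.unpack_from('>H', data, pos); pos += 2
        return (homonym_id, prefix_to_add, cut_length, suffix_to_add,
                tagnum, namenum, qualifiers), pos

This is the same sequence of fields that the new C++ deserializeInterp in InterpretedChunksDecoder.hpp reads back.
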
diff --git a/fsabuilder/morfeuszbuilder/fsa/fsa.py b/fsabuilder/morfeuszbuilder/fsa/fsa.py
index 18bc957..f23a0bb 100644
--- a/fsabuilder/morfeuszbuilder/fsa/fsa.py
+++ b/fsabuilder/morfeuszbuilder/fsa/fsa.py
@@ -43,9 +43,6 @@ class FSA(object):
         # debug
         if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
             logging.info(u'%d %s' % (self.n, word))
-#             logging.info(str(self.register.getStatesNum()))
-#             logging.info(str(self.register.getStatesNum()))
-    #             allWords.append(word)
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
     
diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py
index ca46a6d..2252c60 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rules.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -56,6 +56,7 @@ class TagRule(SegmentRule):
     
     def __str__(self):
         res = self.segtype
+        res += '(' + str(self.segnum) + ')'
         if self.shiftOrth:
             res += '>'
         return res
@@ -70,8 +71,8 @@ class TagRule(SegmentRule):
 class UnaryRule(SegmentRule):
     
     def __init__(self, child, linenum):
+        super(UnaryRule, self).__init__(linenum)
         self.child = child
-        self.linenum = linenum
         assert not child.isSinkRule()
     
     def isShiftOrthRule(self):
@@ -80,8 +81,8 @@ class UnaryRule(SegmentRule):
 class ComplexRule(SegmentRule):
     
     def __init__(self, children, linenum):
+        super(ComplexRule, self).__init__(linenum)
         self.children = children
-        self.linenum = linenum
         assert not any(map(lambda c: c.isSinkRule(), children))
     
     def addToNFA(self, fsa):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
index 5da3482..1b86a0c 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesFSA.py
@@ -68,6 +68,4 @@ class RulesFSA(object):
             res.extend(self.stateData2bytearray(state))
             res.extend(self.transitionsData2bytearray(state))
         
-#         logging.info('Segmentation automaton size: %d bytes', len(res))
-#         print list(res)
         return res
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
index 9abe88c..e73a6f9 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesManager.py
@@ -49,6 +49,7 @@ class RulesManager(object):
             res.extend(self._serializeDFA(dfa))
         res.extend(self._serializeOptionsMap(self.defaultOptions))
         logging.info('segmentation rules size: %s bytes', len(res))
+#         logging.info([int(x) for x in res])
         return res
     
     def _serializeSeparatorsList(self):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
index 735407c..de06641 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -63,8 +63,8 @@ class RulesParser(object):
             nfa = rulesNFA.RulesNFA()
             if not firstNFA:
                 firstNFA = nfa
-            section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
-            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection(section, ignoreComments=False)
+#             section = 'combinations' if self.rulesType == RulesParser.PARSE4ANALYZER else 'generator combinations'
+            combinationEnumeratedLines = segtypesConfigFile.enumerateLinesInSection('combinations', ignoreComments=False)
             combinationEnumeratedLines = list(preprocessor.preprocess(combinationEnumeratedLines, defs, filename))
             for rule in self._doParse(combinationEnumeratedLines, segtypesHelper, filename):
                 if rule.allowsEmptySequence():
@@ -72,8 +72,11 @@ class RulesParser(object):
                                                      filename, 
                                                      rule.linenum, 
                                                      'This rule allows empty segments sequence to be accepted')
-                rule.addToNFA(nfa)
-#                 nfa.debug()
+                if self.rulesType == RulesParser.PARSE4GENERATOR:
+                    rule = rule.transformToGeneratorVersion()
+                if not rule.isSinkRule():
+                    rule.addToNFA(nfa)
+#             nfa.debug()
             try:
                 dfa = nfa.convertToDFA()
                 res.addDFA(key2Def, dfa)
@@ -146,10 +149,11 @@ class RulesParser(object):
         unaryRule = atomicRule ^ zeroOrMoreRule ^ oneOrMoreRule ^ optionalRule ^ quantRule1 ^ quantRule2 ^ quantRule3
         oneOfRule = delimitedList(unaryRule, delim='|')
         complexRule = unaryRule ^ oneOfRule
-        if self.rulesType == RulesParser.PARSE4ANALYZER:
-            concatRule = OneOrMore(complexRule)
-        else:
-            concatRule = ZeroOrMore(shiftOrthRule) + tagRule
+        concatRule = OneOrMore(complexRule)
+#         if self.rulesType == RulesParser.PARSE4ANALYZER:
+#             concatRule = OneOrMore(complexRule)
+#         else:
+#             concatRule = ZeroOrMore(shiftOrthRule) + tagRule
         rule << concatRule + Optional(CaselessLiteral('!weak'))
         
         tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
diff --git a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
index f8ffe0e..3a1cd54 100644
--- a/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
+++ b/fsabuilder/morfeuszbuilder/utils/serializationUtils.py
@@ -22,3 +22,7 @@ def htonl(n):
     res.append((n & 0x0000FF00) >> 8)
     res.append(n & 0x000000FF)
     return res
+
+def serializeString(string, out):
+    out.extend(string.encode('utf8'))
+    out.append(0)
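serializeString writes the UTF-8 bytes of the string followed by a single 0 byte, the framing that readString on the C++ side relies on. Usage sketch:

    # serializeString appends 0-terminated UTF-8; e.g. for u'miał':
    out = bytearray()
    serializeString(u'miał', out)
    assert out == bytearray(b'mia\xc5\x82\x00')   # 5 UTF-8 bytes + terminator
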
diff --git a/input/segmenty.dat b/input/segmenty.dat
index 67be1d4..1eb39f5 100644
--- a/input/segmenty.dat
+++ b/input/segmenty.dat
@@ -682,5 +682,3 @@ pref_dyw	e-+:prefs
 
 # ;
 59
-
-[generator combinations]
diff --git a/morfeusz/CasePatternHelper.hpp b/morfeusz/CasePatternHelper.hpp
index fec391f..86467ee 100644
--- a/morfeusz/CasePatternHelper.hpp
+++ b/morfeusz/CasePatternHelper.hpp
@@ -62,15 +62,6 @@ public:
         }
     }
     
-    const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const {
-        const unsigned char* currPtr = ig.ptr;
-        unsigned char casePatternsNum = *currPtr++;
-        for (unsigned int i = 0; i < casePatternsNum; i++) {
-            deserializeOneCasePattern(currPtr);
-        }
-        return currPtr;
-    }
-    
     std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
         std::vector<bool> res;
         uint8_t casePatternType = *ptr;
@@ -103,26 +94,6 @@ public:
         }
         return res;
     }
-
-//    bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const {
-//        if (this->caseSensitive) {
-//            for (unsigned int i = 0; i < chunks.size(); i++) {
-//                const InterpretedChunk& ic = chunks[i];
-//                const unsigned char* casePatternPtr = ic.interpsGroup.ptr;
-//                std::vector<bool> casePattern;
-//                deserializeCasePattern(casePatternPtr, casePattern);
-//                if (!checkCasePattern(ic, casePattern)) {
-//                    return false;
-//                }
-//            }
-//        }
-//        return true;
-//    }
-
-//    void skipCasePattern(const unsigned char*& ptr) const {
-//        vector<bool> _dupa;
-//        deserializeCasePattern(ptr, _dupa);
-//    }
 private:
     bool caseSensitive;
     
diff --git a/morfeusz/InterpretedChunksDecoder.hpp b/morfeusz/InterpretedChunksDecoder.hpp
index 6b6f185..d37885a 100644
--- a/morfeusz/InterpretedChunksDecoder.hpp
+++ b/morfeusz/InterpretedChunksDecoder.hpp
@@ -40,18 +40,6 @@ public:
 
 protected:
 
-    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
-        EncodedInterpretation interp;
-        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
-        deserializeEncodedForm(ptr, interp.value);
-        interp.tag = readInt16(ptr);
-        interp.nameClassifier = *ptr++;
-        interp.qualifiers = readInt16(ptr);
-        return interp;
-    }
-
-    virtual void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const = 0;
-
     const Environment& env;
 };
 
@@ -106,6 +94,16 @@ protected:
         assert(encodedForm.casePattern.size() == 0);
         encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
     }
+    
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
+        deserializeEncodedForm(ptr, interp.value);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = *ptr++;
+        interp.qualifiers = readInt16(ptr);
+        return interp;
+    }
 private:
 
     pair<string, string> getLemmaHomonymIdPair(const string& lemma) const {
@@ -176,7 +174,7 @@ public:
         const unsigned char* currPtr = interpretedChunk.interpsPtr;
         while (currPtr < interpretedChunk.interpsEndPtr) {
             MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr);
-            //            cerr << mi.toString(false) << endl;
+//                        cerr << mi.toString(false) << endl;
             //            cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl;
             if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) {
                 out.push_back(mi);
@@ -203,15 +201,12 @@ private:
             const InterpretedChunk& chunk,
             const unsigned char*& ptr) const {
         string orth = orthPrefix;
-        string homonymId = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
         EncodedInterpretation ei = this->deserializeInterp(ptr);
         this->decodeForm(chunk.originalCodepoints, ei.value, orth);
-        //        string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId);
         return MorphInterpretation(
                 startNode, endNode,
                 orth, lemma,
-                homonymId,
+                ei.homonymId,
                 ei.tag,
                 ei.nameClassifier,
                 ei.qualifiers,
@@ -233,14 +228,17 @@ private:
             env.getCharsetConverter().append(cp, res);
         }
     }
-
-    void deserializeEncodedForm(const unsigned char*& ptr, EncodedForm& encodedForm) const {
-        encodedForm.prefixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
-        encodedForm.suffixToCut = *ptr;
-        ptr++;
-        encodedForm.suffixToAdd = (const char*) ptr;
-        ptr += strlen((const char*) ptr) + 1;
+    
+    EncodedInterpretation deserializeInterp(const unsigned char*& ptr) const {
+        EncodedInterpretation interp;
+        interp.homonymId = readString(ptr);
+        interp.value.prefixToAdd = readString(ptr);
+        interp.value.suffixToCut = readInt8(ptr);
+        interp.value.suffixToAdd = readString(ptr);
+        interp.tag = readInt16(ptr);
+        interp.nameClassifier = readInt8(ptr);
+        interp.qualifiers = readInt16(ptr);
+        return interp;
     }
 };
 
diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp
index 234ab90..b142d4f 100644
--- a/morfeusz/Morfeusz.cpp
+++ b/morfeusz/Morfeusz.cpp
@@ -18,6 +18,7 @@
 #include "charset/CaseConverter.hpp"
 #include "segrules/segrules.hpp"
 #include "const.hpp"
+#include "deserializationUtils.hpp"
 #include "charset/utf8.h"
 
 // TODO - a copy constructor that works As-It-Should
@@ -40,6 +41,20 @@ options(createDefaultOptions()) {
     generatorEnv.setCaseSensitive(false);
 }
 
+inline const unsigned char* getInterpretationsPtr(const Environment& env, const InterpsGroup& ig) {
+    if (env.getProcessorType() == ANALYZER) {
+        const unsigned char* currPtr = ig.ptr;
+        unsigned char casePatternsNum = *currPtr++;
+        for (unsigned int i = 0; i < casePatternsNum; i++) {
+            env.getCasePatternHelper().deserializeOneCasePattern(currPtr);
+        }
+        return currPtr;
+    }
+    else {
+        return ig.ptr;
+    }
+}
+
 void Morfeusz::setAnalyzerFile(const string& filename) {
     this->analyzerEnv.setFSAFile(filename);
 }
@@ -183,7 +198,7 @@ void Morfeusz::doProcessOneWord(
                             it != newSegrulesStates.end();
                             ++it) {
                         SegrulesState newSegrulesState = *it;
-                        const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig);
+                        const unsigned char* interpsPtr = getInterpretationsPtr(env, ig);
                         const unsigned char* interpsEndPtr = ig.ptr + ig.size;
                         InterpretedChunk ic = {
                             ig.type,
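getInterpretationsPtr captures the one asymmetry left between the two binary formats: an analyzer interps group starts with a one-byte count of case patterns followed by the patterns themselves, while a generator group starts directly with interpretation records. A sketch of the same skip logic (skip_case_pattern is a placeholder for CasePatternHelper::deserializeOneCasePattern, whose wire format lives in CasePatternHelper.hpp):

    # Illustrative: find the first interpretation record in an interps group.
    def interpretations_offset(data, pos, is_analyzer):
        if not is_analyzer:
            return pos                    # generator: records start immediately
        n = data[pos]; pos += 1           # analyzer: one-byte case-pattern count
        for _ in range(n):
            pos = skip_case_pattern(data, pos)   # placeholder helper
        return pos
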
diff --git a/morfeusz/Qualifiers.cpp b/morfeusz/Qualifiers.cpp
index d9cf171..b76b3dd 100644
--- a/morfeusz/Qualifiers.cpp
+++ b/morfeusz/Qualifiers.cpp
@@ -20,7 +20,6 @@ qualifiers() {
     readTags(currPtr, _dupa);
     _dupa.clear();
     readTags(currPtr, _dupa);
-    
     uint16_t allCombinationsSize = readInt16(currPtr);
     this->qualifiers.reserve(allCombinationsSize);
     for (unsigned int i = 0; i < allCombinationsSize; i++) {
diff --git a/morfeusz/deserializationUtils.hpp b/morfeusz/deserializationUtils.hpp
index d993d9f..b198a99 100644
--- a/morfeusz/deserializationUtils.hpp
+++ b/morfeusz/deserializationUtils.hpp
@@ -11,14 +11,24 @@
 #include "endianness.hpp"
 #include <iostream>
 
+inline unsigned char readInt8(const unsigned char*& currPtr) {
+    return *currPtr++;
+}
+
 inline uint16_t readInt16(const unsigned char*& currPtr) {
-    uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr));
+    uint16_t res = htons(*reinterpret_cast<const uint16_t*> (currPtr));
     currPtr += 2;
     return res;
 }
 
+inline uint32_t readInt32(const unsigned char*& currPtr) {
+    uint32_t res = htonl(*reinterpret_cast<const uint32_t*> (currPtr));
+    currPtr += 4;
+    return res;
+}
+
 inline std::string readString(const unsigned char*& currPtr) {
-    std::string res(reinterpret_cast<const char*>(currPtr));
+    std::string res((const char*) currPtr);
     currPtr += res.length();
     currPtr++;
     return res;
diff --git a/morfeusz/segrules/SegrulesFSA.hpp b/morfeusz/segrules/SegrulesFSA.hpp
index 70684b1..873a612 100644
--- a/morfeusz/segrules/SegrulesFSA.hpp
+++ b/morfeusz/segrules/SegrulesFSA.hpp
@@ -9,7 +9,8 @@
 #define	SEGRULESFSA_HPP
 
 #include <set>
-#include "../endianness.hpp"
+#include <iostream>
+#include "../deserializationUtils.hpp"
 
 struct SegrulesState {
     uint16_t offset;
@@ -37,8 +38,7 @@ public:
         
         const unsigned char* currPtr = ptr + state.offset;
         currPtr++;
-        const unsigned char transitionsNum = *currPtr;
-        currPtr++;
+        const unsigned char transitionsNum = *currPtr++;
         for (unsigned int i = 0; i < transitionsNum; i++) {
             if (*currPtr == segnum) {
                 newStates.insert(newStates.begin(), this->transition2State(currPtr));
@@ -58,9 +58,8 @@ private:
         unsigned char WEAK_FLAG = 2;
         SegrulesState res;
         transitionPtr++;
-        res.shiftOrthFromPrevious = *transitionPtr;
-        transitionPtr++;
-        res.offset = htons(*reinterpret_cast<const uint16_t*>(transitionPtr));
+        res.shiftOrthFromPrevious = *transitionPtr++;
+        res.offset = readInt16(transitionPtr);
         res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
         res.weak = *(ptr + res.offset) & WEAK_FLAG;
         return res;
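transition2State and proceedToNext together imply the segmentation-automaton layout: each state starts with a flags byte (accepting/weak), then a one-byte transition count, then one 4-byte record per transition holding the segment number, the shiftOrth flag, and a big-endian 16-bit target offset. A sketch under that assumption (the 4-byte stride is inferred; the pointer-advancing code sits outside this hunk):

    # Illustrative: enumerate one state's transitions.
    import struct

    def transitions_of(data, state_offset):
        pos = state_offset + 1            # skip the flags byte
        count = data[pos]; pos += 1
        for _ in range(count):
            segnum = data[pos]
            shift_orth = bool(data[pos + 1])
            target, = struct.unpack_from('>H', data, pos + 2)
            yield segnum, shift_orth, target
            pos += 4                      # assumed transition record size
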
diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp
index 90aa9aa..7bd0619 100644
--- a/morfeusz/segrules/segrules.cpp
+++ b/morfeusz/segrules/segrules.cpp
@@ -2,25 +2,12 @@
 #include "segrules.hpp"
 #include "../fsa/fsa.hpp"
 #include "../fsa/const.hpp"
+#include "../deserializationUtils.hpp"
 
 using namespace std;
 
-static inline uint32_t deserializeUint32(const unsigned char*& ptr) {
-    uint32_t res = *reinterpret_cast<const uint32_t*>(ptr);
-    res = htonl(res);
-    ptr += 4;
-    return res;
-}
-
-static inline string deserializeString(const unsigned char*& ptr) {
-    string res(reinterpret_cast<const char*>(ptr));
-    ptr += res.length() + 1;
-    return res;
-}
-
 static inline void skipSeparatorsList(const unsigned char*& ptr) {
-    uint16_t listSize = ntohs(*reinterpret_cast<const uint16_t*>(ptr));
-    ptr += 2;
+    uint16_t listSize = readInt16(ptr);
     ptr += 4 * listSize;
 }
 
@@ -28,7 +15,7 @@ static inline const unsigned char* getSeparatorsListPtr(const unsigned char* ptr
     const unsigned char* additionalDataPtr = ptr 
         + FSA_DATA_OFFSET 
         + ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET));
-    const unsigned char* res = additionalDataPtr + deserializeUint32(additionalDataPtr) + 4;
+    const unsigned char* res = additionalDataPtr + readInt32(additionalDataPtr) + 4;
     return res;
 }
 
@@ -47,14 +34,14 @@ static inline SegrulesOptions deserializeOptions(const unsigned char*& ptr) {
     unsigned char optsNum = *ptr;
     ptr++;
     for (unsigned char i = 0; i < optsNum; i++) {
-        string key = deserializeString(ptr);
-        res[key] = deserializeString(ptr);
+        string key = readString(ptr);
+        res[key] = readString(ptr);
     }
     return res;
 }
 
 static inline SegrulesFSA* deserializeFSA(const unsigned char*& ptr) {
-    uint32_t fsaSize = deserializeUint32(ptr);
+    uint32_t fsaSize = readInt32(ptr);
 //    static SegrulesDeserializer deserializer;
     SegrulesFSA* res = new SegrulesFSA(ptr);
     ptr += fsaSize;
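deserializeOptions reads a one-byte entry count followed by alternating 0-terminated key and value strings, now via the shared readString helper. The equivalent parse in Python (read_cstring as sketched after the encode.py changes):

    # Illustrative: parse a segmentation-options map.
    def parse_options(data, pos):
        opts = {}
        n = data[pos]; pos += 1
        for _ in range(n):
            key, pos = read_cstring(data, pos)
            value, pos = read_cstring(data, pos)
            opts[key] = value
        return opts, pos
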
diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml
index 22cb4c4..cc0e348 100644
--- a/nbproject/configurations.xml
+++ b/nbproject/configurations.xml
@@ -105,7 +105,7 @@
           <buildCommandWorkingDir>build</buildCommandWorkingDir>
           <buildCommand>${MAKE} -f Makefile</buildCommand>
           <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
-          <executablePath>build/morfeusz/morfeusz_analyzer</executablePath>
+          <executablePath>build/morfeusz/morfeusz_generator</executablePath>
         </makeTool>
       </makefileType>
       <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
@@ -311,7 +311,7 @@
         <ccTool>
           <incDir>
             <pElem>morfeusz</pElem>
-            <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
+            <pElem>/usr/lib/jvm/default-java/include</pElem>
           </incDir>
           <preprocessorList>
             <Elem>libjmorfeusz_EXPORTS</Elem>
--
libgit2 0.22.2