Commit f1e52ff44027610390237bd862449c85c4a044cc

Authored by Michał Lenart
1 parent de0e960d

improved running time, rebuilt the analyzer so as not to duplicate code…

… in the generator, fixed recognition of the first segment in the flexion graph

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@114 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
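
The changes below introduce a new per-entry byte layout for the dictionary automaton values: one count byte giving the number of segment-type groups, then for each group a type byte, a 16-bit big-endian payload length (serializationUtils.htons), and the packed interpretations that the analyzer later walks pointer by pointer. The sketch below is a minimal reconstruction of that layout from _doEncodeData (encode.py) and MorphDeserializer::deserialize, shown for the analyzer variant (withCasePattern=True, withPrefix=False); the function names and the all-lowercase case-pattern shortcut are illustrative assumptions, not code from this commit.

import struct

LEMMA_ONLY_LOWER = 0  # case-pattern marker, mirrors MorphEncoder.LEMMA_ONLY_LOWER

def encode_interp(suffix_to_cut, suffix_to_add, tagnum, namenum):
    # <suffixToCut:1> <suffixToAdd bytes + NUL> <casePatternType:1> <tagnum:2 BE> <namenum:1>
    data = bytearray([suffix_to_cut])
    data += suffix_to_add.encode('utf-8') + b'\x00'
    data.append(LEMMA_ONLY_LOWER)               # all-lowercase lemma: no pattern bytes follow
    data += struct.pack('>H', tagnum)           # same byte order as serializationUtils.htons
    data.append(namenum)
    return bytes(data)

def encode_entry(groups):
    # groups: dict mapping segment-type number -> list of already packed interpretations
    assert 0 < len(groups) < 256                # same bounds as the firstByte asserts
    out = bytearray([len(groups)])              # first byte: number of segment-type groups
    for typenum, interps in sorted(groups.items()):
        payload = b''.join(interps)
        out.append(typenum)                     # one byte, read back as ig.type = *currPtr
        out += struct.pack('>H', len(payload))  # group size, read back with ntohs
        out += payload                          # skipped in one step on the C++ side: currPtr += ig.size
    return bytes(out)

# Example: one group of segment type 0 holding a single interpretation.
entry = encode_entry({0: [encode_interp(1, 'a', 42, 0)]})

On the C++ side, MorphDeserializer::deserialize now only records each group's type, size and a pointer into this buffer, and InterpretedChunksDecoder4Analyzer decodes the interpretations lazily while iterating over the group, which avoids eagerly building EncodedInterpretation vectors during deserialization.
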
CMakeLists.txt
... ... @@ -5,7 +5,7 @@ project (Morfeusz)
5 5 set (Morfeusz_VERSION_MAJOR 2)
6 6 set (Morfeusz_VERSION_MINOR 0)
7 7 set (Morfeusz_VERSION_PATCH 0)
8   -set (CMAKE_BUILD_TYPE "Debug")
  8 +set (CMAKE_BUILD_TYPE "Release")
9 9  
10 10 enable_testing()
11 11  
... ... @@ -47,7 +47,7 @@ endif ()
47 47  
48 48 # SEGMENT_RULES_FILE
49 49 if ("${SEGMENT_RULES_FILE}" STREQUAL "")
50   - set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty1.dat")
  50 + set (SEGMENT_RULES_FILE "${PROJECT_SOURCE_DIR}/input/segmenty.dat")
51 51 endif ()
52 52  
53 53 message ("Will use ${INPUT_DICTIONARIES} as default dictionary input, ${INPUT_TAGSET} as tagset and ${SEGMENT_RULES_FILE} as segmentation rules")
... ...
fsabuilder/morfeuszbuilder/fsa/common.py
... ... @@ -40,24 +40,24 @@ class EncodedFormWithPrefix(object):
40 40 self.suffixToAdd = bestEncodedForm.suffixToAdd
41 41 self.prefixToAdd = targetWord[:bestPrefixLength]
42 42  
43   -class Interpretation(object):
  43 +class Interpretation4Analyzer(object):
44 44  
45 45 def __init__(self, orth, base, tagnum, namenum, typenum):
46   - self.lemma = EncodedForm(orth, base)
  46 + self.encodedForm = EncodedForm(orth, base)
47 47 self.tagnum = tagnum
48 48 self.namenum = namenum
49 49 self.typenum = typenum
50 50  
51 51 def getSortKey(self):
52 52 return (
53   - self.lemma.cutLength,
54   - tuple(self.lemma.suffixToAdd),
55   - tuple(self.lemma.casePattern),
  53 + self.encodedForm.cutLength,
  54 + tuple(self.encodedForm.suffixToAdd),
  55 + tuple(self.encodedForm.casePattern),
56 56 self.tagnum,
57 57 self.namenum)
58 58  
59 59 def __eq__(self, other):
60   - if isinstance(other, Interpretation):
  60 + if isinstance(other, Interpretation4Analyzer):
61 61 return self.getSortKey() == other.getSortKey()
62 62 else:
63 63 return False
... ... @@ -68,8 +68,8 @@ class Interpretation(object):
68 68 class Interpretation4Generator(object):
69 69  
70 70 def __init__(self, orth, base, tagnum, namenum, typenum):
71   - self.lemma = base
72   - self.orth = EncodedFormWithPrefix(base, orth)
  71 + self.encodedForm = base
  72 + self.encodedForm = EncodedFormWithPrefix(base, orth)
73 73 self.tagnum = tagnum
74 74 self.namenum = namenum
75 75 self.typenum = typenum
... ... @@ -77,9 +77,9 @@ class Interpretation4Generator(object):
77 77 def getSortKey(self):
78 78 return (
79 79 self.tagnum,
80   - self.orth.cutLength,
81   - tuple(self.orth.suffixToAdd),
82   -# tuple(self.lemma.casePattern),
  80 + self.encodedForm.cutLength,
  81 + tuple(self.encodedForm.suffixToAdd),
  82 +# tuple(self.encodedForm.casePattern),
83 83 self.namenum)
84 84  
85 85 def __eq__(self, other):
... ... @@ -92,7 +92,7 @@ class Interpretation4Generator(object):
92 92 return hash(self.getSortKey())
93 93  
94 94 def __unicode__(self):
95   - return u'<%s,(%d %s),%d,%d>' % (self.lemma.decode('utf8'), self.orth.cutLength, self.orth.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
  95 + return u'<%s,(%d %s),%d,%d>' % (self.encodedForm.decode('utf8'), self.encodedForm.cutLength, self.encodedForm.suffixToAdd.decode('utf8'), self.tagnum, self.namenum)
96 96  
97 97 def __repr__(self):
98 98 return unicode(self)
... ...
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... ... @@ -4,7 +4,7 @@ Created on Oct 23, 2013
4 4 @author: mlenart
5 5 '''
6 6 import logging
7   -from common import Interpretation
  7 +from common import Interpretation4Analyzer
8 8 from morfeuszbuilder.fsa.common import Interpretation4Generator
9 9  
10 10 def _mergeEntries(inputLines):
... ... @@ -74,7 +74,7 @@ class PolimorfConverter4Analyzer(object):
74 74 tagnum = int(tagnum)
75 75 namenum = int(namenum)
76 76 typenum = int(typenum)
77   - yield (orth, Interpretation(orth, base, tagnum, namenum, typenum))
  77 + yield (orth, Interpretation4Analyzer(orth, base, tagnum, namenum, typenum))
78 78  
79 79 def convert(self, inputLines):
80 80 return _mergeEntries(self._reallyParseLines(self._sortLines(self._partiallyParseLines(inputLines))))
... ...
fsabuilder/morfeuszbuilder/fsa/encode.py
... ... @@ -5,6 +5,7 @@ Created on Oct 23, 2013
5 5 '''
6 6  
7 7 import logging
  8 +from morfeuszbuilder.utils import serializationUtils
8 9  
9 10 class Encoder(object):
10 11 '''
... ... @@ -96,6 +97,54 @@ class Encoder(object):
96 97 def _encodeNameNum(self, namenum):
97 98 assert namenum < 256 and namenum >= 0
98 99 return bytearray([namenum])
  100 +
  101 + def _groupInterpsByType(self, interpsList):
  102 + res = {}
  103 + for interp in interpsList:
  104 + res.setdefault(interp.typenum, [])
  105 + res[interp.typenum].append(interp)
  106 + return res
  107 +
  108 + def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix):
  109 + res = bytearray()
  110 + res.extend(self._encodeTypeNum(typenum))
  111 +
  112 + encodedInterpsList = bytearray()
  113 + for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  114 + encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  115 + encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  116 + encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  117 +
  118 + res.extend(serializationUtils.htons(len(encodedInterpsList)))
  119 + res.extend(encodedInterpsList)
  120 + return res
  121 +
  122 + def _doEncodeData(self, interpsList, withCasePattern, withPrefix):
  123 +
  124 + assert type(interpsList) == frozenset
  125 +
  126 + segnum2Interps = self._groupInterpsByType(interpsList)
  127 +
  128 +
  129 + res = bytearray()
  130 + firstByte = len(segnum2Interps)
  131 + assert firstByte < 256
  132 + assert firstByte > 0
  133 + res.append(firstByte)
  134 +
  135 + for typenum, interpsList in segnum2Interps.iteritems():
  136 + res.extend(self._encodeInterps4Type(typenum, interpsList, withCasePattern, withPrefix))
  137 +
  138 +
  139 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  140 +# encodedInterpsList.extend(self._encodeTypeNum(interp.typenum))
  141 +# encodedInterpsList.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=withCasePattern, withPrefix=withPrefix))
  142 +# encodedInterpsList.extend(self._encodeTagNum(interp.tagnum))
  143 +# encodedInterpsList.extend(self._encodeNameNum(interp.namenum))
  144 + del interpsList
  145 +# res.extend(serializationUtils.htons(len(encodedInterpsList)))
  146 +# res.extend(encodedInterpsList)
  147 + return res
99 148  
100 149 class MorphEncoder(Encoder):
101 150  
... ... @@ -106,19 +155,20 @@ class MorphEncoder(Encoder):
106 155 self.LEMMA_MIXED_CASE = 2
107 156  
108 157 def encodeData(self, interpsList):
109   - res = bytearray()
110   - firstByte = len(interpsList)
111   - assert firstByte < 256
112   - assert firstByte > 0
113   - res.append(firstByte)
114   - assert type(interpsList) == frozenset
115   - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
116   - res.extend(self._encodeTypeNum(interp.typenum))
117   - res.extend(self._encodeEncodedForm(interp.lemma, withCasePattern=True, withPrefix=False))
118   - res.extend(self._encodeTagNum(interp.tagnum))
119   - res.extend(self._encodeNameNum(interp.namenum))
120   - del interpsList
121   - return res
  158 + return self._doEncodeData(interpsList, withCasePattern=True, withPrefix=False)
  159 +# res = bytearray()
  160 +# firstByte = len(interpsList)
  161 +# assert firstByte < 256
  162 +# assert firstByte > 0
  163 +# res.append(firstByte)
  164 +# assert type(interpsList) == frozenset
  165 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  166 +# res.extend(self._encodeTypeNum(interp.typenum))
  167 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=True, withPrefix=False))
  168 +# res.extend(self._encodeTagNum(interp.tagnum))
  169 +# res.extend(self._encodeNameNum(interp.namenum))
  170 +# del interpsList
  171 +# return res
122 172  
123 173 class Encoder4Generator(Encoder):
124 174  
... ... @@ -126,18 +176,19 @@ class Encoder4Generator(Encoder):
126 176 super(Encoder4Generator, self).__init__(encoding)
127 177  
128 178 def encodeData(self, interpsList):
129   - res = bytearray()
130   - firstByte = len(interpsList)
131   - assert firstByte < 256
132   - assert firstByte > 0
133   - res.append(firstByte)
134   - assert type(interpsList) == frozenset
135   - for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
136   - res.extend(self._encodeTypeNum(interp.typenum))
137   - res.extend(self._encodeEncodedForm(interp.orth, withCasePattern=False, withPrefix=True))
138   - res.extend(self._encodeTagNum(interp.tagnum))
139   - res.extend(self._encodeNameNum(interp.namenum))
140   - return res
  179 + return self._doEncodeData(interpsList, withCasePattern=False, withPrefix=True)
  180 +# res = bytearray()
  181 +# firstByte = len(interpsList)
  182 +# assert firstByte < 256
  183 +# assert firstByte > 0
  184 +# res.append(firstByte)
  185 +# assert type(interpsList) == frozenset
  186 +# for interp in sorted(interpsList, key=lambda i: i.getSortKey()):
  187 +# res.extend(self._encodeTypeNum(interp.typenum))
  188 +# res.extend(self._encodeEncodedForm(interp.encodedForm, withCasePattern=False, withPrefix=True))
  189 +# res.extend(self._encodeTagNum(interp.tagnum))
  190 +# res.extend(self._encodeNameNum(interp.namenum))
  191 +# return res
141 192 #
142 193 # def decodeData(self, data):
143 194 #
... ...
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... ... @@ -106,7 +106,7 @@ class Segtypes(object):
106 106 lineNum,
107 107 re.match(r'[a-z_]+', segtype))
108 108 self._validate(
109   - u'Pattern must contain lemma and part-of-speech fields',
  109 + u'Pattern must contain encodedForm and part-of-speech fields',
110 110 lineNum,
111 111 re.match(r'.+\:[a-z_]+', pattern, re.U))
112 112  
... ... @@ -146,13 +146,13 @@ class Segtypes(object):
146 146  
147 147 # index lexemes
148 148 for p in self.patternsList:
149   - if p.lemma:
  149 + if p.encodedForm:
150 150 for tag in self.tagset.getAllTags():
151 151 tagnum = self.tagset.getTagnum4Tag(tag)
152   - if not (p.lemma, tagnum) in self._lemmaTagnum2Segnum:
153   - segnum = p.tryToMatch(p.lemma, tag)
  152 + if not (p.encodedForm, tagnum) in self._lemmaTagnum2Segnum:
  153 + segnum = p.tryToMatch(p.encodedForm, tag)
154 154 if segnum != -1:
155   - self._lemmaTagnum2Segnum[(p.lemma, tagnum)] = segnum
  155 + self._lemmaTagnum2Segnum[(p.encodedForm, tagnum)] = segnum
156 156 # logging.info('indexing segment type numbers - done')
157 157 # self._debugSegnums()
158 158  
... ... @@ -171,7 +171,7 @@ class Segtypes(object):
171 171 class SegtypePattern(object):
172 172  
173 173 def __init__(self, lemma, pattern, segnum):
174   - self.lemma = lemma
  174 + self.encodedForm = lemma
175 175 self.pattern = pattern
176 176 self.segnum = segnum
177 177  
... ... @@ -181,7 +181,7 @@ class SegtypePattern(object):
181 181 patterns2Match = []
182 182 patterns2Match.append(self.pattern.replace('%', '.*'))
183 183 patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*'))
184   - if (self.lemma is None or self.lemma == lemma) \
  184 + if (self.encodedForm is None or self.encodedForm == lemma) \
185 185 and any([re.match(p, tag) for p in patterns2Match]):
186 186 return self.segnum
187 187 else:
... ...
input/dodatki.tab
  1 +0 0 dig
  2 +1 1 dig
  3 +2 2 dig
  4 +3 3 dig
  5 +4 4 dig
  6 +5 5 dig
  7 +6 6 dig
  8 +7 7 dig
  9 +8 8 dig
  10 +9 9 dig
1 11 ń on ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep
2 12 by by qub
3 13 naj naj
... ...
input/segmenty.dat
... ... @@ -142,6 +142,7 @@ samodz dywiz adj
142 142 # Stopień najwyższy:
143 143 # np. „naj·zieleńszy”, „naj·mądrzej”
144 144 moze_interp( naj> adj_sup )
  145 +moze_interp( nie> naj> adj_sup )
145 146 # Cząstka li przy osobowych formach czasownika oddzielona dywizem: znasz-li ten kraj
146 147 moze_interp( praet_sg dywiz li)
147 148 moze_interp( praet_pl dywiz li)
... ...
input/segmenty1.dat
... ... @@ -52,11 +52,14 @@ naj naj
52 52 nie nie
53 53 prefs prefs
54 54 prefv prefv
  55 +prefa prefa
55 56 dig dig
56 57 adja adja
57 58 adj adj:%:pos
58 59 adj_sup adj:%:sup
59 60 adj_sup adv:sup
  61 +adj_com adj:%:com
  62 +adj_com adj:%:com
60 63 negat ger:%:neg
61 64 negat pact:%:neg
62 65 negat ppas:%:neg
... ... @@ -69,6 +72,22 @@ interp interp
69 72 aglsg aglt:sg:%
70 73 aglpl aglt:pl:%
71 74 samodz %
  75 +praet_fin praet:%
  76 +praet_fin fin:%
  77 +li li:qub:%
  78 +nomina subst:%
  79 +nomina ger:%
  80 +nomina depr:%
  81 +adjectiva adj:%
  82 +adjectiva adv:%
  83 +adjectiva ppas:%
  84 +adjectiva pact:%
  85 +verba_imperf praet:%:imperf
  86 +verba_imperf fin:%:imperf
  87 +verba_imperf inf:imperf
  88 +verba_imperf imps:imperf
  89 +verba_imperf impt:imperf
  90 +
72 91  
73 92 [lexemes]
74 93 z_aglt aby:comp
... ...
morfeusz/EncodedInterpretation.hpp
... ... @@ -28,7 +28,6 @@ struct EncodedForm {
28 28 */
29 29 struct EncodedInterpretation {
30 30 EncodedForm value;
31   - unsigned char type;
32 31 int tag;
33 32 int nameClassifier;
34 33 };
... ...
morfeusz/Environment.cpp
... ... @@ -13,10 +13,12 @@
13 13 //class InterpretedChunksDecoder4Analyzer;
14 14 //class InterpretedChunksDecoder4Generator;
15 15  
16   -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() {
17   - static Deserializer < vector < InterpsGroup > > *deserializer
  16 +static Deserializer<vector<InterpsGroup> >& initializeDeserializer(MorfeuszProcessorType processorType) {
  17 + static Deserializer < vector < InterpsGroup > > *analyzerDeserializer
18 18 = new MorphDeserializer();
19   - return deserializer;
  19 + static Deserializer < vector < InterpsGroup > > *generatorDeserializer
  20 + = new MorphDeserializer();
  21 + return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer);
20 22 }
21 23  
22 24 static SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map) {
... ... @@ -48,14 +50,15 @@ Environment::Environment(
48 50 caseConverter(),
49 51 tagset(fsaFileStartPtr),
50 52 fsaFileStartPtr(fsaFileStartPtr),
51   - fsa(FSAType::getFSA(fsaFileStartPtr, *initializeDeserializer())),
  53 + fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
52 54 segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
53 55 currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap)),
54 56 isFromFile(false),
55 57 chunksDecoder(
56 58 processorType == ANALYZER
57 59 ? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
58   - : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this))
  60 + : (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
  61 + processorType(processorType)
59 62 {
60 63 }
61 64  
... ... @@ -110,7 +113,7 @@ void Environment::setFSAFile(const std::string& filename) {
110 113 delete this->fsaFileStartPtr;
111 114 }
112 115 this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
113   - this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, *initializeDeserializer());
  116 + this->fsa = FSA< vector<InterpsGroup> > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
114 117 this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
115 118 this->isFromFile = true;
116 119 }
... ...
morfeusz/Environment.hpp
... ... @@ -64,6 +64,7 @@ private:
64 64 bool isFromFile;
65 65  
66 66 const InterpretedChunksDecoder* chunksDecoder;
  67 + MorfeuszProcessorType processorType;
67 68  
68 69 const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
69 70 };
... ...
morfeusz/FlexionGraph.cpp
... ... @@ -15,7 +15,6 @@ void FlexionGraph::addStartEdge(const Edge& e) {
15 15 this->graph.push_back(vector<Edge>());
16 16 this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr);
17 17 }
18   -// cerr << string(e.chunk.chunkStartPtr) << endl;
19 18 assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr);
20 19 this->graph[0].push_back(e);
21 20 }
... ... @@ -30,22 +29,43 @@ void FlexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) {
30 29 this->graph[startNode].push_back(e);
31 30 }
32 31  
  32 +static inline bool chunkIsAtFront(
  33 + const InterpretedChunk& chunk,
  34 + const std::vector<InterpretedChunk>& path) {
  35 + unsigned int i;
  36 + for (i = 0; i < path.size() - 1 && path[i].orthWasShifted; i++) {
  37 + }
  38 + assert(!path[i].orthWasShifted);
  39 + return &chunk == &(path[i]);
  40 +}
  41 +
  42 +static inline bool chunkIsAtBack(
  43 + const InterpretedChunk& chunk,
  44 + const std::vector<InterpretedChunk>& path) {
  45 + return &chunk == &(path.back());
  46 +}
  47 +
  48 +static inline bool chunkIsTheOnlyOne(
  49 + const InterpretedChunk& chunk,
  50 + const std::vector<InterpretedChunk>& path) {
  51 + return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
  52 +}
  53 +
33 54 void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
34 55 // debugPath(path);
35 56 // debugGraph(this->graph);
36 57 for (unsigned int i = 0; i < path.size(); i++) {
37 58 const InterpretedChunk& chunk = path[i];
38 59 if (!chunk.orthWasShifted) {
39   - if (&chunk == &(path.front())
40   - && &chunk == &(path.back())) {
  60 + if (chunkIsTheOnlyOne(chunk, path)) {
41 61 Edge e = {chunk, UINT_MAX};
42 62 this->addStartEdge(e);
43 63 }
44   - else if (&chunk == &(path.front())) {
  64 + else if (chunkIsAtFront(chunk, path)) {
45 65 Edge e = {chunk, this->graph.empty() ? 1 : (unsigned int) this->graph.size()};
46 66 this->addStartEdge(e);
47 67 }
48   - else if (&chunk == &(path.back())) {
  68 + else if (chunkIsAtBack(chunk, path)) {
49 69 Edge e = {chunk, UINT_MAX};
50 70 this->addMiddleEdge((unsigned int) this->graph.size(), e);
51 71 }
... ...
morfeusz/InterpretedChunksDecoder.hpp
... ... @@ -18,6 +18,10 @@
18 18 #include "charset/CaseConverter.hpp"
19 19 #include "Environment.hpp"
20 20  
  21 +const uint8_t LEMMA_ONLY_LOWER = 0;
  22 +const uint8_t LEMMA_UPPER_PREFIX = 1;
  23 +const uint8_t LEMMA_MIXED_CASE = 2;
  24 +
21 25 class InterpretedChunksDecoder {
22 26 public:
23 27  
... ... @@ -30,22 +34,12 @@ public:
30 34 unsigned int endNode,
31 35 const InterpretedChunk& interpretedChunk,
32 36 std::vector<MorphInterpretation>& out) const = 0;
33   -
34   - virtual ~InterpretedChunksDecoder() {}
35 37  
36   -protected:
37   -
38   - void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
39   - for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
40   - const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
41   - originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
42   - decodeForm(
43   - prefixChunk.lowercaseCodepoints,
44   - prefixChunk.interpsGroup.interps[0].value,
45   - decodedForm);
46   - }
  38 + virtual ~InterpretedChunksDecoder() {
47 39 }
48   -
  40 +
  41 +protected:
  42 +
49 43 virtual void decodeForm(
50 44 const std::vector<uint32_t>& orth,
51 45 const EncodedForm& form,
... ... @@ -55,9 +49,10 @@ protected:
55 49 };
56 50  
57 51 class InterpretedChunksDecoder4Analyzer : public InterpretedChunksDecoder {
58   -
59 52 public:
60   - InterpretedChunksDecoder4Analyzer(const Environment& env): InterpretedChunksDecoder(env) {}
  53 +
  54 + InterpretedChunksDecoder4Analyzer(const Environment& env) : InterpretedChunksDecoder(env) {
  55 + }
61 56  
62 57 void decode(
63 58 unsigned int startNode,
... ... @@ -65,22 +60,12 @@ public:
65 60 const InterpretedChunk& interpretedChunk,
66 61 std::vector<MorphInterpretation>& out) const {
67 62 string orth;
68   - string lemma;
69   - convertPrefixes(interpretedChunk, orth, lemma);
  63 + string lemmaPrefix;
  64 + convertPrefixes(interpretedChunk, orth, lemmaPrefix);
70 65 orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
71   - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
72   - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
73   - decodeForm(
74   - interpretedChunk.lowercaseCodepoints,
75   - ei.value,
76   - lemma);
77   - out.push_back(MorphInterpretation(
78   - startNode, endNode,
79   - orth, lemma,
80   - ei.tag,
81   - ei.nameClassifier,
82   - env.getTagset(),
83   - env.getCharsetConverter()));
  66 + const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr;
  67 + while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) {
  68 + out.push_back(this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr));
84 69 }
85 70 }
86 71  
... ... @@ -104,36 +89,116 @@ protected:
104 89 env.getCharsetConverter().append(cp, res);
105 90 }
106 91 }
  92 +
  93 +private:
  94 +
  95 + void convertPrefixes(const InterpretedChunk& interpretedChunk, std::string& originalForm, std::string& decodedForm) const {
  96 + for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) {
  97 + const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i];
  98 + originalForm += env.getCharsetConverter().toString(prefixChunk.originalCodepoints);
  99 + const unsigned char* ptr = prefixChunk.interpsGroup.ptr;
  100 + MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, originalForm, string(""), prefixChunk, ptr);
  101 + decodedForm += mi.getLemma();
  102 + }
  103 + }
  104 +
  105 + MorphInterpretation decodeMorphInterpretation(
  106 + unsigned int startNode, unsigned int endNode,
  107 + const string& orth,
  108 + const string& lemmaPrefix,
  109 + const InterpretedChunk& chunk,
  110 + const unsigned char*& ptr) const {
  111 + string lemma = lemmaPrefix;
  112 + EncodedInterpretation ei = this->decodeInterp(ptr);
  113 + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
  114 + return MorphInterpretation(
  115 + startNode, endNode,
  116 + orth, lemma,
  117 + ei.tag,
  118 + ei.nameClassifier,
  119 + env.getTagset(),
  120 + env.getCharsetConverter());
  121 + }
  122 +
  123 + void decodeLemma(const unsigned char*& ptr, EncodedForm& lemma) const {
  124 + lemma.suffixToCut = *ptr;
  125 + ptr++;
  126 + lemma.suffixToAdd = (const char*) ptr;
  127 + ptr += strlen((const char*) ptr) + 1;
  128 + assert(lemma.casePattern.size() == 0);
  129 + // lemma.casePattern.resize(MAX_WORD_SIZE, false);
  130 + uint8_t casePatternType = *ptr;
  131 + ptr++;
  132 + uint8_t prefixLength;
  133 + uint8_t patternLength;
  134 + switch (casePatternType) {
  135 + case LEMMA_ONLY_LOWER:
  136 + break;
  137 + case LEMMA_UPPER_PREFIX:
  138 + prefixLength = *ptr;
  139 + ptr++;
  140 + for (unsigned int i = 0; i < prefixLength; i++) {
  141 + // lemma.casePattern[i] = true;
  142 + lemma.casePattern.push_back(true);
  143 + }
  144 + // lemma.casePattern.resize(prefixLength, true);
  145 + break;
  146 + case LEMMA_MIXED_CASE:
  147 + patternLength = *ptr;
  148 + ptr++;
  149 + for (unsigned int i = 0; i < patternLength; i++) {
  150 + uint8_t idx = *ptr;
  151 + ptr++;
  152 + // lemma.casePattern[idx] = true;
  153 + lemma.casePattern.resize(idx + 1, false);
  154 + lemma.casePattern[idx] = true;
  155 + }
  156 + break;
  157 + }
  158 + }
  159 +
  160 + EncodedInterpretation decodeInterp(const unsigned char*& ptr) const {
  161 + EncodedInterpretation interp;
  162 + decodeLemma(ptr, interp.value);
  163 + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr)));
  164 + ptr += 2;
  165 + interp.nameClassifier = *ptr;
  166 + ptr++;
  167 + return interp;
  168 + }
107 169 };
108 170  
109 171 class InterpretedChunksDecoder4Generator : public InterpretedChunksDecoder {
110   -
111 172 public:
112   - InterpretedChunksDecoder4Generator(const Environment& env): InterpretedChunksDecoder(env) {}
  173 +
  174 + InterpretedChunksDecoder4Generator(const Environment& env) : InterpretedChunksDecoder(env) {
  175 + }
113 176  
114 177 void decode(
115 178 unsigned int startNode,
116 179 unsigned int endNode,
117 180 const InterpretedChunk& interpretedChunk,
118 181 std::vector<MorphInterpretation>& out) const {
119   - string orth;
120   - string lemma;
121   - convertPrefixes(interpretedChunk, lemma, orth);
122   - lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
123   - for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
124   - const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
125   - decodeForm(
126   - interpretedChunk.originalCodepoints,
127   - ei.value,
128   - orth);
129   - out.push_back(MorphInterpretation(
130   - startNode, endNode,
131   - orth, lemma,
132   - ei.tag,
133   - ei.nameClassifier,
134   - env.getTagset(),
135   - env.getCharsetConverter()));
136   - }
  182 + // string orth;
  183 + // string lemma;
  184 + // convertPrefixes(interpretedChunk, lemma, orth);
  185 + // size_t orthLength = orth.length();
  186 + // lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
  187 + // for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) {
  188 + // const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i];
  189 + // decodeForm(
  190 + // interpretedChunk.originalCodepoints,
  191 + // ei.value,
  192 + // orth);
  193 + // out.push_back(MorphInterpretation(
  194 + // startNode, endNode,
  195 + // orth, lemma,
  196 + // ei.tag,
  197 + // ei.nameClassifier,
  198 + // env.getTagset(),
  199 + // env.getCharsetConverter()));
  200 + // orth.erase(orthLength);
  201 + // }
137 202 }
138 203  
139 204 private:
... ...
morfeusz/InterpsGroup.hpp
... ... @@ -14,24 +14,26 @@
14 14 #include "MorphInterpretation.hpp"
15 15 #include "Tagset.hpp"
16 16  
17   -class InterpsGroup {
18   -public:
19   -
20   - InterpsGroup() {
21   -
22   - }
23   -
24   - explicit InterpsGroup(const unsigned char type)
25   - : type(type) {
26   -
27   - }
28   -
29   - void addInterpretation(const EncodedInterpretation& interp) {
30   - interps.push_back(interp);
31   - }
  17 +struct InterpsGroup {
  18 +//public:
  19 +//
  20 +// InterpsGroup() {
  21 +//
  22 +// }
  23 +//
  24 +// explicit InterpsGroup(const unsigned char type)
  25 +// : type(type) {
  26 +//
  27 +// }
  28 +//
  29 +// void addInterpretation(const EncodedInterpretation& interp) {
  30 +// interps.push_back(interp);
  31 +// }
32 32  
33 33 unsigned char type;
34   - std::vector<EncodedInterpretation> interps;
  34 + uint16_t size;
  35 + const unsigned char* ptr;
  36 +// std::vector<EncodedInterpretation> interps;
35 37 };
36 38  
37 39 #endif /* GROUPEDINTERPRETATIONS_HPP */
... ...
morfeusz/Morfeusz.cpp
... ... @@ -82,7 +82,9 @@ void Morfeusz::processOneWord(
82 82 FlexionGraph graph;
83 83 const char* currInput = inputStart;
84 84 const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
  85 +
85 86 doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
  87 +
86 88 if (!graph.empty()) {
87 89 const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
88 90 int srcNode = startNodeNum;
... ... @@ -110,6 +112,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) {
110 112 from.prefixChunks.end());
111 113 to.prefixChunks.push_back(from);
112 114 from.orthWasShifted = true;
  115 + to.chunkStartPtr = from.chunkStartPtr;
113 116 }
114 117  
115 118 void Morfeusz::doProcessOneWord(
... ... @@ -119,28 +122,21 @@ void Morfeusz::doProcessOneWord(
119 122 SegrulesState segrulesState,
120 123 vector<InterpretedChunk>& accum,
121 124 FlexionGraph& graph) const {
122   - cerr << "doAnalyzeOneWord " << inputData << endl;
123   - bool endOfProcessing = inputData == inputEnd;
  125 +// cerr << "doAnalyzeOneWord " << inputData << endl;
124 126 const char* currInput = inputData;
125   - uint32_t codepoint = endOfProcessing ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
126   - // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter));
  127 + uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
127 128 vector<uint32_t> originalCodepoints;
128 129 vector<uint32_t> lowercaseCodepoints;
129 130  
130 131 StateType state = env.getFSA().getInitialState();
131 132  
132   - while (!endOfProcessing) {
133   - if (isEndOfWord(codepoint)) {
134   - endOfProcessing = true;
135   - }
136   - cerr << "not end of word '" << string(currInput) << "'" << endl;
  133 + while (!isEndOfWord(codepoint)) {
137 134 uint32_t lowerCP = env.getCaseConverter().toLower(codepoint);
138 135 originalCodepoints.push_back(codepoint);
139 136 lowercaseCodepoints.push_back(lowerCP);
140 137 feedState(state, lowerCP, UTF8CharsetConverter());
141 138 codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
142 139 if (state.isAccepting()) {
143   - cerr << "accepting" << endl;
144 140 vector<InterpsGroup> val(state.getValue());
145 141 for (unsigned int i = 0; i < val.size(); i++) {
146 142 InterpsGroup& ig = val[i];
... ... @@ -151,6 +147,9 @@ void Morfeusz::doProcessOneWord(
151 147 it != newSegrulesStates.end();
152 148 ++it) {
153 149 SegrulesState newSegrulesState = *it;
  150 +// if (newSegrulesState.shiftOrthFromPrevious) {
  151 +//
  152 +// }
154 153 InterpretedChunk ic = {
155 154 inputData,
156 155 originalCodepoints,
... ... @@ -165,7 +164,6 @@ void Morfeusz::doProcessOneWord(
165 164 }
166 165 accum.push_back(ic);
167 166 if (isEndOfWord(codepoint)) {
168   - cerr << "end of word inside " << currInput <<endl;
169 167 if (newSegrulesState.accepting)
170 168 graph.addPath(accum);
171 169 }
... ... @@ -177,8 +175,8 @@ void Morfeusz::doProcessOneWord(
177 175 }
178 176 }
179 177 }
  178 + codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
180 179 }
181   - cerr << "end of word " << currInput << endl;
182 180 inputData = currInput;
183 181 }
184 182  
... ...
morfeusz/MorphDeserializer.cpp
... ... @@ -23,74 +23,93 @@ MorphDeserializer::MorphDeserializer() {
23 23 MorphDeserializer::~MorphDeserializer() {
24 24 }
25 25  
26   -static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
27   - // XXX uważać na poprawność danych
28   - lemma.suffixToCut = *ptr;
29   - ptr++;
30   - lemma.suffixToAdd = (const char*) ptr;
31   - ptr += strlen((const char*) ptr) + 1;
32   - assert(lemma.casePattern.size() == 0);
33   -// lemma.casePattern.resize(MAX_WORD_SIZE, false);
34   - uint8_t casePatternType = *ptr;
35   - ptr++;
36   - uint8_t prefixLength;
37   - uint8_t patternLength;
38   - switch (casePatternType) {
39   - case LEMMA_ONLY_LOWER:
40   - break;
41   - case LEMMA_UPPER_PREFIX:
42   - prefixLength = *ptr;
43   - ptr++;
44   - for (unsigned int i = 0; i < prefixLength; i++) {
45   -// lemma.casePattern[i] = true;
46   - lemma.casePattern.push_back(true);
47   - }
48   -// lemma.casePattern.resize(prefixLength, true);
49   - break;
50   - case LEMMA_MIXED_CASE:
51   - patternLength = *ptr;
52   - ptr++;
53   - for (unsigned int i = 0; i < patternLength; i++) {
54   - uint8_t idx = *ptr;
55   - ptr++;
  26 +//static void deserializeLemma(const unsigned char*& ptr, EncodedForm& lemma) {
  27 +// // XXX uważać na poprawność danych
  28 +// lemma.suffixToCut = *ptr;
  29 +// ptr++;
  30 +// lemma.suffixToAdd = (const char*) ptr;
  31 +// ptr += strlen((const char*) ptr) + 1;
  32 +// assert(lemma.casePattern.size() == 0);
  33 +//// lemma.casePattern.resize(MAX_WORD_SIZE, false);
  34 +// uint8_t casePatternType = *ptr;
  35 +// ptr++;
  36 +// uint8_t prefixLength;
  37 +// uint8_t patternLength;
  38 +// switch (casePatternType) {
  39 +// case LEMMA_ONLY_LOWER:
  40 +// break;
  41 +// case LEMMA_UPPER_PREFIX:
  42 +// prefixLength = *ptr;
  43 +// ptr++;
  44 +// for (unsigned int i = 0; i < prefixLength; i++) {
  45 +//// lemma.casePattern[i] = true;
  46 +// lemma.casePattern.push_back(true);
  47 +// }
  48 +//// lemma.casePattern.resize(prefixLength, true);
  49 +// break;
  50 +// case LEMMA_MIXED_CASE:
  51 +// patternLength = *ptr;
  52 +// ptr++;
  53 +// for (unsigned int i = 0; i < patternLength; i++) {
  54 +// uint8_t idx = *ptr;
  55 +// ptr++;
  56 +//// lemma.casePattern[idx] = true;
  57 +// lemma.casePattern.resize(idx + 1, false);
56 58 // lemma.casePattern[idx] = true;
57   - lemma.casePattern.resize(idx + 1, false);
58   - lemma.casePattern[idx] = true;
59   - }
60   - break;
61   - }
62   -}
63   -
64   -static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
65   - interp.type = *ptr;
66   - ptr++;
67   - deserializeLemma(ptr, interp.value);
68   - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
69   - ptr += 2;
70   - interp.nameClassifier = *ptr;
71   - ptr++;
72   -}
  59 +// }
  60 +// break;
  61 +// }
  62 +//}
  63 +//
  64 +//static void deserializeInterp(const unsigned char*& ptr, EncodedInterpretation& interp) {
  65 +// interp.type = *ptr;
  66 +// ptr++;
  67 +// deserializeLemma(ptr, interp.value);
  68 +// interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  69 +// ptr += 2;
  70 +// interp.nameClassifier = *ptr;
  71 +// ptr++;
  72 +//}
73 73  
74 74 long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
75 75 const unsigned char* currPtr = ptr;
76   - uint8_t interpsNum = *ptr;
77   - interps.clear();
78   - interps.reserve(interpsNum);
  76 + uint8_t interpTypesNum = *currPtr;
79 77 currPtr++;
80   - // FIXME - to jest do poprawy
81   - map<int, InterpsGroup> results;
82   - for (unsigned int i = 0; i < interpsNum; ++i) {
83   - EncodedInterpretation interp;
84   - deserializeInterp(currPtr, interp);
85   - if (results.count(interp.type) == 0) {
86   - results[interp.type] = InterpsGroup(interp.type);
87   - }
88   - results[interp.type].addInterpretation(interp);
89   -// interps.push_back(interp);
90   - }
91   - map<int, InterpsGroup>::iterator it;
92   - for (it = results.begin(); it != results.end(); ++it) {
93   - interps.push_back((*it).second);
  78 + interps.clear();
  79 + interps.reserve(interpTypesNum);
  80 + for (unsigned int i = 0; i < interpTypesNum; i++) {
  81 + InterpsGroup ig;
  82 + ig.type = *currPtr;
  83 + currPtr++;
  84 + ig.size = ntohs(*(reinterpret_cast<const uint16_t*>(currPtr)));
  85 + currPtr += 2;
  86 + ig.ptr = currPtr;
  87 + currPtr += ig.size;
  88 + interps.push_back(ig);
94 89 }
95 90 return currPtr - ptr;
96 91 }
  92 +
  93 +//long MorphDeserializer::deserialize(const unsigned char* ptr, vector<InterpsGroup>& interps) const {
  94 +// const unsigned char* currPtr = ptr;
  95 +// uint8_t interpsNum = *ptr;
  96 +// interps.clear();
  97 +// interps.reserve(interpsNum);
  98 +// currPtr++;
  99 +// // FIXME - to jest do poprawy
  100 +// map<int, InterpsGroup> results;
  101 +// for (unsigned int i = 0; i < interpsNum; ++i) {
  102 +// EncodedInterpretation interp;
  103 +// deserializeInterp(currPtr, interp);
  104 +// if (results.count(interp.type) == 0) {
  105 +// results[interp.type] = InterpsGroup(interp.type);
  106 +// }
  107 +// results[interp.type].addInterpretation(interp);
  108 +//// interps.push_back(interp);
  109 +// }
  110 +// map<int, InterpsGroup>::iterator it;
  111 +// for (it = results.begin(); it != results.end(); ++it) {
  112 +// interps.push_back((*it).second);
  113 +// }
  114 +// return currPtr - ptr;
  115 +//}
... ...
nbproject/configurations.xml
... ... @@ -106,14 +106,20 @@
106 106 </makeTool>
107 107 </makefileType>
108 108 <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4">
  109 + <ccTool flags="1">
  110 + </ccTool>
109 111 </item>
110 112 <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
  113 + <ccTool flags="1">
  114 + </ccTool>
111 115 </item>
112 116 <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4">
113 117 </item>
114 118 <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
115 119 </item>
116 120 <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
  121 + <ccTool flags="1">
  122 + </ccTool>
117 123 </item>
118 124 <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
119 125 ex="false"
... ... @@ -169,7 +175,7 @@
169 175 <item path="build1/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
170 176 </item>
171 177 <item path="default_fsa.cpp" ex="false" tool="1" flavor2="4">
172   - <ccTool flags="1">
  178 + <ccTool>
173 179 <incDir>
174 180 <pElem>morfeusz</pElem>
175 181 <pElem>morfeusz/build/morfeusz</pElem>
... ... @@ -180,7 +186,7 @@
180 186 </ccTool>
181 187 </item>
182 188 <item path="default_synth_fsa.cpp" ex="false" tool="1" flavor2="4">
183   - <ccTool flags="1">
  189 + <ccTool>
184 190 <incDir>
185 191 <pElem>morfeusz</pElem>
186 192 <pElem>morfeusz/build/morfeusz</pElem>
... ... @@ -273,7 +279,7 @@
273 279 <ccTool>
274 280 <incDir>
275 281 <pElem>morfeusz</pElem>
276   - <pElem>/usr/lib/jvm/default-java/include</pElem>
  282 + <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
277 283 </incDir>
278 284 <preprocessorList>
279 285 <Elem>libjmorfeusz_EXPORTS</Elem>
... ... @@ -408,18 +414,26 @@
408 414 </ccTool>
409 415 </item>
410 416 <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4">
  417 + <ccTool flags="1">
  418 + </ccTool>
411 419 </item>
412 420 <item path="morfeusz/charset/CharsetConverter.cpp"
413 421 ex="false"
414 422 tool="1"
415 423 flavor2="4">
  424 + <ccTool flags="1">
  425 + </ccTool>
416 426 </item>
417 427 <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
  428 + <ccTool flags="1">
  429 + </ccTool>
418 430 </item>
419 431 <item path="morfeusz/charset/conversion_tables.cpp"
420 432 ex="false"
421 433 tool="1"
422 434 flavor2="4">
  435 + <ccTool flags="1">
  436 + </ccTool>
423 437 </item>
424 438 <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
425 439 <ccTool flags="1">
... ... @@ -508,8 +522,12 @@
508 522 ex="false"
509 523 tool="1"
510 524 flavor2="4">
  525 + <ccTool flags="1">
  526 + </ccTool>
511 527 </item>
512 528 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
  529 + <ccTool flags="1">
  530 + </ccTool>
513 531 </item>
514 532 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
515 533 <ccTool flags="0">
... ...