Commit 7dc1167ee08974171266fee7e2d05b773da02153
1 parent
aaf322b3
rozprawienie się z wielkością liter, poprawa jakości pakietu debianowego, zmiana nazwy skryptu do budowania automatu na morfeusz_builder git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@153 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
20 changed files
with
150 additions
and
142 deletions
CMakeLists.txt
... | ... | @@ -5,7 +5,7 @@ project (Morfeusz) |
5 | 5 | set (Morfeusz_VERSION_MAJOR 2) |
6 | 6 | set (Morfeusz_VERSION_MINOR 0) |
7 | 7 | set (Morfeusz_VERSION_PATCH 0) |
8 | -set (CMAKE_BUILD_TYPE "Debug") | |
8 | +#~ set (CMAKE_BUILD_TYPE "Release") | |
9 | 9 | |
10 | 10 | enable_testing() |
11 | 11 | |
... | ... | @@ -30,13 +30,13 @@ set (PROJECT_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morf |
30 | 30 | ### USER DEFINED VARIABLES |
31 | 31 | |
32 | 32 | # INPUT_DICTIONARY_CPP |
33 | -set (INPUT_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_fsa.cpp") | |
34 | -set (INPUT_SYNTH_DICTIONARY_CPP "${PROJECT_SOURCE_DIR}/../default_synth_fsa.cpp") | |
33 | +set (INPUT_DICTIONARY_CPP "default_fsa.cpp") | |
34 | +set (INPUT_SYNTH_DICTIONARY_CPP "default_synth_fsa.cpp") | |
35 | 35 | if ("${INPUT_DICTIONARIES}" STREQUAL "") |
36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
40 | 40 | endif () |
41 | 41 | endif () |
42 | 42 | |
... | ... | @@ -98,10 +98,10 @@ set (CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) |
98 | 98 | |
99 | 99 | |
100 | 100 | # the RPATH to be used when installing, but only if it's not a system directory |
101 | -list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) | |
102 | -if ("${isSystemDir}" STREQUAL "-1") | |
103 | - SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") | |
104 | -endif () | |
101 | +#~ list (FIND CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES "${CMAKE_INSTALL_PREFIX}/lib" isSystemDir) | |
102 | +#~ if ("${isSystemDir}" STREQUAL "-1") | |
103 | + #~ SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib") | |
104 | +#~ endif () | |
105 | 105 | |
106 | 106 | ########## Configure CPack ########## |
107 | 107 | |
... | ... | @@ -110,14 +110,16 @@ set (CPACK_PACKAGE_FILE_NAME "morfeusz2-${PROJECT_VERSION}-${CMAKE_SYSTEM_NAME}- |
110 | 110 | set (CPACK_PACKAGE_VENDOR "Michał Lenart") |
111 | 111 | set (CPACK_PACKAGE_CONTACT "michal.lenart@ipipan.waw.pl") |
112 | 112 | set (CPACK_PACKAGE_DESCRIPTION_SUMMARY "Morphological analyzer for the Polish language.") |
113 | +set (CPACK_PACKAGE_DESCRIPTION "This is the second version of Morfeusz. Among numerous improvements it has better segmentation and case-sensitive lemmas handling. It also makes it possible to perform morphological synthesis and to use your own dictionary") | |
114 | +set (CPACK_RESOURCE_FILE_LICENSE "${PROJECT_SOURCE_DIR}/License.txt") | |
113 | 115 | set (CPACK_OUTPUT_FILE_PREFIX "${TARGET_DIR}") |
114 | 116 | |
115 | 117 | if (${CMAKE_SYSTEM_NAME} MATCHES "Linux") |
116 | 118 | set (CPACK_GENERATOR "DEB" "TGZ") |
117 | 119 | #debian |
118 | 120 | set (CPACK_DEBIAN_PACKAGE_NAME "morfeusz2") |
119 | - set (CPACK_DEBIAN_PACKAGE_MAINTAINER "${CPACK_PACKAGE_CONTACT}") | |
120 | - set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6)") | |
121 | + set (CPACK_DEBIAN_PACKAGE_MAINTAINER "Michał Lenart <${CPACK_PACKAGE_CONTACT}>") | |
122 | + set (CPACK_DEBIAN_PACKAGE_DEPENDS "libstdc++6 (>= 4.6), libc6") | |
121 | 123 | set (CPACK_DEBIAN_PACKAGE_ARCHITECTURE "${ARCHITECTURE}") |
122 | 124 | elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
123 | 125 | if (${ARCHITECTURE} MATCHES "amd64") |
... | ... | @@ -149,15 +151,15 @@ add_subdirectory (fsabuilder) |
149 | 151 | ########## add tests ########## |
150 | 152 | |
151 | 153 | macro (test_build_and_recognize fname method) |
152 | - add_test (TestBuild-${method}-${fname} python fsabuilder/buildfsa.py --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method}) | |
153 | - add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/buildfsa.py --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
154 | + add_test (TestBuild-${method}-${fname} python fsabuilder/morfeusz_builder --analyzer --input-files testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --segments-file=testfiles/segmenty.dat --serialization-method=${method}) | |
155 | + add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/morfeusz_builder --generator --input-files testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
154 | 156 | add_test (TestRecognize-${method}-${fname} morfeusz/test_recognize_dict /tmp/test-${method}-${fname}.fsa testfiles/${fname}) |
155 | 157 | # add_test (TestNOTRecognize-${method}-${fname} fsa/test_not_recognize /tmp/test-${method}-${fname}.fsa testfiles/out_of_dict) |
156 | 158 | # add_test (TestSpeed-${method}-${fname} fsa/test_speed /tmp/test-${method}-${fname}.fsa testfiles/speed_test_data) |
157 | 159 | endmacro () |
158 | 160 | |
159 | 161 | macro (test_result_equals inputFilename requiredOutputFilename encoding) |
160 | - # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/buildfsa.py -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE) | |
162 | + # add_test (TestBuild4ResultEquals-${dictFilename}-${requiredOutputFilename} python fsabuilder/fsa/morfeusz_builder -i ${dictFilename} -o /tmp/test.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=SIMPLE) | |
161 | 163 | add_test (TestResultEquals-${inputFilename}-${requiredOutputFilename} morfeusz/test_result_equals ${inputFilename} ${requiredOutputFilename} ${encoding}) |
162 | 164 | endmacro () |
163 | 165 | |
... | ... |
License.txt
0 → 100644
1 | +This is the Morfeusz license file. | |
... | ... |
fsabuilder/.settings/org.eclipse.core.resources.prefs
fsabuilder/CMakeLists.txt
... | ... | @@ -16,15 +16,6 @@ add_custom_command (OUTPUT ${SETUP_PY} |
16 | 16 | |
17 | 17 | add_custom_target (builder-setup DEPENDS ${SETUP_PY}) |
18 | 18 | |
19 | -#~ add_custom_target (buildfsa-exec ALL | |
20 | - #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/buildfsa.py | |
21 | -#~ ) | |
22 | -#~ | |
23 | -#~ add_executable (buildfsa IMPORTED) | |
24 | -#~ add_dependencies (buildfsa buildfsa-exec) | |
25 | -#~ set_property (TARGET buildfsa PROPERTY IMPORTED_LOCATION "${DIST_PATH}") | |
26 | -#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/buildfsa" DESTINATION bin) | |
27 | - | |
28 | 19 | if (${UNIX}) |
29 | 20 | add_custom_target (install-builder |
30 | 21 | COMMAND python ${SETUP_PY} install --home=${CMAKE_INSTALL_PREFIX} |
... | ... | @@ -62,6 +53,15 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
62 | 53 | DEPENDS builder-setup |
63 | 54 | ) |
64 | 55 | list (APPEND PACKAGE_DEPENDS package-python-win-installer) |
56 | + | |
57 | + #~ add_custom_target (buildfsa-exec ALL | |
58 | + #~ COMMAND pyinstaller --noconfirm --onefile --console --strip --distpath="${DIST_PATH}" --clean fsa/morfeusz_builder | |
59 | +#~ ) | |
60 | +#~ | |
61 | +#~ add_executable (morfeusz_builder IMPORTED) | |
62 | +#~ add_dependencies (morfeusz_builder buildfsa-exec) | |
63 | +#~ set_property (TARGET morfeusz_builder PROPERTY IMPORTED_LOCATION "${DIST_PATH}") | |
64 | +#~ install (PROGRAMS "${CMAKE_CURRENT_BINARY_DIR}/morfeusz_builder" DESTINATION bin) | |
65 | 65 | endif () |
66 | 66 | |
67 | 67 | add_custom_target(package-builder DEPENDS ${PACKAGE_DEPENDS}) |
... | ... |
fsabuilder/buildanalyzer.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
3 | +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | 4 | --tagset-file=../input/polimorf.tagset \ |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --analyzer \ |
... | ... |
fsabuilder/buildgenerator.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | -python buildfsa.py --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
3 | +python morfeusz_builder --input-files=../input/sgjp-hom.tab,../input/dodatki.tab \ | |
4 | 4 | --tagset-file=../input/polimorf.tagset \ |
5 | 5 | --segments-file=../input/segmenty.dat \ |
6 | 6 | --generator \ |
... | ... |
fsabuilder/buildfsa.py renamed to fsabuilder/morfeusz_builder
... | ... | @@ -12,21 +12,11 @@ import codecs |
12 | 12 | from morfeuszbuilder.fsa import encode |
13 | 13 | from morfeuszbuilder.fsa import convertinput |
14 | 14 | from morfeuszbuilder.fsa.fsa import FSA |
15 | -from morfeuszbuilder.fsa.serializer import Serializer | |
15 | +from morfeuszbuilder.fsa.serializer import Serializer, SerializationMethod | |
16 | 16 | from morfeuszbuilder.tagset.tagset import Tagset |
17 | 17 | from morfeuszbuilder.segrules import rulesParser |
18 | 18 | from optparse import OptionParser |
19 | 19 | |
20 | -# class InputFormat(): | |
21 | -# ENCODED = 'ENCODED' | |
22 | -# POLIMORF = 'POLIMORF' | |
23 | -# PLAIN = 'PLAIN' | |
24 | - | |
25 | -class SerializationMethod(): | |
26 | - SIMPLE = 'SIMPLE' | |
27 | - V1 = 'V1' | |
28 | - V2 = 'V2' | |
29 | - | |
30 | 20 | def _checkOption(opt, parser, msg): |
31 | 21 | if opt is None: |
32 | 22 | print >> sys.stderr, msg |
... | ... | @@ -46,6 +36,8 @@ def _checkOpen(filename, mode): |
46 | 36 | try: |
47 | 37 | with open(filename, mode) as _: |
48 | 38 | pass |
39 | + if 'w' in mode: | |
40 | + os.remove(filename) | |
49 | 41 | except IOError as ex: |
50 | 42 | print >> sys.stderr, str(ex) |
51 | 43 | exit(1) |
... | ... | @@ -238,7 +230,6 @@ def main(opts): |
238 | 230 | fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager, opts.trimSupneg) |
239 | 231 | else: |
240 | 232 | fsa, qualifiersMap = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager) |
241 | - print qualifiersMap | |
242 | 233 | if opts.trainFile: |
243 | 234 | logging.info('training with '+opts.trainFile+' ...') |
244 | 235 | fsa.train(_readTrainData(opts.trainFile)) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -21,6 +21,7 @@ class EncodedForm(object): |
21 | 21 | self.cutLength = len(fromWord) - len(root) |
22 | 22 | self.suffixToAdd = targetWord[len(root):] |
23 | 23 | self.casePattern = [c == c.upper() and c != c.lower() for c in root] |
24 | +# print fromWord.encode('utf8'), targetWord.encode('utf8'), self.casePattern | |
24 | 25 | |
25 | 26 | class EncodedForm4Generator(object): |
26 | 27 | |
... | ... | @@ -54,7 +55,7 @@ class Interpretation4Analyzer(object): |
54 | 55 | return ( |
55 | 56 | self.encodedForm.cutLength, |
56 | 57 | tuple(self.encodedForm.suffixToAdd), |
57 | - tuple(self.encodedForm.casePattern), | |
58 | + tuple(self.encodedForm.casePattern), | |
58 | 59 | self.tagnum, |
59 | 60 | self.namenum) |
60 | 61 | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -120,16 +120,13 @@ class Encoder(object): |
120 | 120 | res[interp.typenum].append(interp) |
121 | 121 | return res |
122 | 122 | |
123 | - def _getMostLiberalCasePattern(self, interpsList): | |
124 | - res = None | |
123 | + def _getCasePatterns(self, interpsList): | |
124 | + res = [] | |
125 | 125 | for interp in interpsList: |
126 | - if res is None: | |
127 | - res = list(interp.encodedForm.casePattern) | |
126 | + if not True in interp.encodedForm.casePattern: | |
127 | + return [] | |
128 | 128 | else: |
129 | - while len(interp.encodedForm.casePattern) > len(res): | |
130 | - res.append(False) | |
131 | - for idx, (case1, case2) in enumerate(itertools.izip_longest(res, interp.encodedForm.casePattern, fillvalue=False)): | |
132 | - res[idx] = case1 and case2 | |
129 | + res.append(list(interp.encodedForm.casePattern)) | |
133 | 130 | return res |
134 | 131 | |
135 | 132 | def _encodeInterps4Type(self, typenum, interpsList, withCasePattern, withPrefix, withHomonymId): |
... | ... | @@ -137,7 +134,10 @@ class Encoder(object): |
137 | 134 | res.extend(self._encodeTypeNum(typenum)) |
138 | 135 | encodedInterpsList = bytearray() |
139 | 136 | if withCasePattern: |
140 | - encodedInterpsList.extend(self._encodeCasePattern(self._getMostLiberalCasePattern(interpsList))) | |
137 | + casePatterns = self._getCasePatterns(interpsList) | |
138 | + encodedInterpsList.append(len(casePatterns)) | |
139 | + for casePattern in casePatterns: | |
140 | + encodedInterpsList.extend(self._encodeCasePattern(casePattern)) | |
141 | 141 | for interp in sorted(interpsList, key=lambda i: i.getSortKey()): |
142 | 142 | if withHomonymId: |
143 | 143 | encodedInterpsList.extend(self.encodeWord(interp.homonymId, lowercase=False)) |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -8,6 +8,11 @@ import logging |
8 | 8 | from state import State |
9 | 9 | from morfeuszbuilder.utils.serializationUtils import * |
10 | 10 | |
11 | +class SerializationMethod(object): | |
12 | + SIMPLE = 'SIMPLE' | |
13 | + V1 = 'V1' | |
14 | + V2 = 'V2' | |
15 | + | |
11 | 16 | class Serializer(object): |
12 | 17 | |
13 | 18 | MAGIC_NUMBER = 0x8fc2bc1b |
... | ... | @@ -20,7 +25,6 @@ class Serializer(object): |
20 | 25 | |
21 | 26 | @staticmethod |
22 | 27 | def getSerializer(serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData): |
23 | - from buildfsa import SerializationMethod | |
24 | 28 | res = { |
25 | 29 | SerializationMethod.SIMPLE: SimpleSerializer, |
26 | 30 | SerializationMethod.V1: VLengthSerializer1, |
... | ... | @@ -37,7 +41,7 @@ class Serializer(object): |
37 | 41 | |
38 | 42 | # get the Morfeusz file format version that is being encoded |
39 | 43 | def getVersion(self): |
40 | - return 14 | |
44 | + return 15 | |
41 | 45 | |
42 | 46 | def serialize2CppFile(self, fname, isGenerator, headerFilename="data/default_fsa.hpp"): |
43 | 47 | res = [] |
... | ... |
fsabuilder/setup.py.in
... | ... | @@ -8,5 +8,5 @@ if __name__ == '__main__': |
8 | 8 | description='Finite state automata builder for Morfeusz.', |
9 | 9 | version='${MORFEUSZBUILDER_VERSION}', |
10 | 10 | packages=['morfeuszbuilder', 'morfeuszbuilder.fsa', 'morfeuszbuilder.tagset', 'morfeuszbuilder.segrules', 'morfeuszbuilder.utils'], |
11 | - scripts = ['buildfsa.py'], | |
11 | + scripts = ['morfeusz_builder'], | |
12 | 12 | requires=['pyparsing']) |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -3,13 +3,13 @@ |
3 | 3 | ########## generate default dictionary data ################# |
4 | 4 | add_custom_command ( |
5 | 5 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
6 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | |
6 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 --trim-supneg | |
7 | 7 | DEPENDS "${INPUT_DICTIONARY}" |
8 | 8 | COMMENT "Building default dictionary C++ file" |
9 | 9 | ) |
10 | 10 | add_custom_command ( |
11 | 11 | OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" |
12 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | |
12 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V2 | |
13 | 13 | DEPENDS "${INPUT_DICTIONARY}" |
14 | 14 | COMMENT "Building default dictionary C++ file" |
15 | 15 | ) |
... | ... |
morfeusz/CasePatternHelper.hpp
... | ... | @@ -9,7 +9,7 @@ |
9 | 9 | #define CASEPATTERNHELPER_HPP |
10 | 10 | |
11 | 11 | #include <vector> |
12 | -#include "InterpretedChunk.hpp" | |
12 | +#include "InterpsGroup.hpp" | |
13 | 13 | |
14 | 14 | const uint8_t LEMMA_ONLY_LOWER = 0; |
15 | 15 | const uint8_t LEMMA_UPPER_PREFIX = 1; |
... | ... | @@ -39,28 +39,40 @@ public: |
39 | 39 | } |
40 | 40 | return true; |
41 | 41 | } |
42 | - | |
43 | -// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | |
44 | -// if (this->caseSensitive) { | |
45 | -// for (unsigned int i = 0; i < chunks.size(); i++) { | |
46 | -// const InterpretedChunk& ic = chunks[i]; | |
47 | -// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | |
48 | -// std::vector<bool> casePattern; | |
49 | -// deserializeCasePattern(casePatternPtr, casePattern); | |
50 | -// if (!checkCasePattern(ic, casePattern)) { | |
51 | -// return false; | |
52 | -// } | |
53 | -// } | |
54 | -// } | |
55 | -// return true; | |
56 | -// } | |
57 | - | |
58 | - void skipCasePattern(const unsigned char*& ptr) const { | |
59 | - vector<bool> _dupa; | |
60 | - deserializeCasePattern(ptr, _dupa); | |
42 | + | |
43 | + bool checkInterpsGroupCasePatterns( | |
44 | + const std::vector<uint32_t>& lowercaseCodepoints, | |
45 | + const std::vector<uint32_t>& originalCodepoints, | |
46 | + const InterpsGroup& ig) const { | |
47 | + const unsigned char* currPtr = ig.ptr; | |
48 | + unsigned char casePatternsNum = *currPtr++; | |
49 | + if (casePatternsNum == 0) { | |
50 | + return true; | |
51 | + } | |
52 | + else { | |
53 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | |
54 | + if (checkCasePattern( | |
55 | + lowercaseCodepoints, | |
56 | + originalCodepoints, | |
57 | + deserializeOneCasePattern(currPtr))) { | |
58 | + return true; | |
59 | + } | |
60 | + } | |
61 | + return false; | |
62 | + } | |
61 | 63 | } |
62 | - | |
63 | - void deserializeCasePattern(const unsigned char*& ptr, std::vector<bool>& res) const { | |
64 | + | |
65 | + const unsigned char* getInterpretationsPtr(const InterpsGroup& ig) const { | |
66 | + const unsigned char* currPtr = ig.ptr; | |
67 | + unsigned char casePatternsNum = *currPtr++; | |
68 | + for (unsigned int i = 0; i < casePatternsNum; i++) { | |
69 | + deserializeOneCasePattern(currPtr); | |
70 | + } | |
71 | + return currPtr; | |
72 | + } | |
73 | + | |
74 | + std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const { | |
75 | + std::vector<bool> res; | |
64 | 76 | uint8_t casePatternType = *ptr; |
65 | 77 | ptr++; |
66 | 78 | uint8_t prefixLength; |
... | ... | @@ -89,9 +101,31 @@ public: |
89 | 101 | } |
90 | 102 | break; |
91 | 103 | } |
104 | + return res; | |
92 | 105 | } |
106 | + | |
107 | +// bool checkCasePattern(const std::vector<InterpretedChunk>& chunks) const { | |
108 | +// if (this->caseSensitive) { | |
109 | +// for (unsigned int i = 0; i < chunks.size(); i++) { | |
110 | +// const InterpretedChunk& ic = chunks[i]; | |
111 | +// const unsigned char* casePatternPtr = ic.interpsGroup.ptr; | |
112 | +// std::vector<bool> casePattern; | |
113 | +// deserializeCasePattern(casePatternPtr, casePattern); | |
114 | +// if (!checkCasePattern(ic, casePattern)) { | |
115 | +// return false; | |
116 | +// } | |
117 | +// } | |
118 | +// } | |
119 | +// return true; | |
120 | +// } | |
121 | + | |
122 | +// void skipCasePattern(const unsigned char*& ptr) const { | |
123 | +// vector<bool> _dupa; | |
124 | +// deserializeCasePattern(ptr, _dupa); | |
125 | +// } | |
93 | 126 | private: |
94 | 127 | bool caseSensitive; |
128 | + | |
95 | 129 | }; |
96 | 130 | |
97 | 131 | #endif /* CASEPATTERNHELPER_HPP */ |
... | ... |
morfeusz/EncodedGeneratorInterpretation.hpp deleted
1 | -/* | |
2 | - * File: EncodedGeneratorInterpretation.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 20 styczeń 2014, 17:15 | |
6 | - */ | |
7 | - | |
8 | -#ifndef ENCODEDGENERATORINTERPRETATION_HPP | |
9 | -#define ENCODEDGENERATORINTERPRETATION_HPP | |
10 | - | |
11 | -/* | |
12 | - * Orth in a compressed format (as in an automaton) | |
13 | - */ | |
14 | -struct EncodedOrth { | |
15 | - int suffixToCut; | |
16 | - std::string suffixToAdd; | |
17 | - std::string prefixToAdd; | |
18 | -}; | |
19 | - | |
20 | -/* | |
21 | - * Internal representation of an interpretation - with orth encoded | |
22 | - */ | |
23 | -struct EncodedGeneratorInterpretation { | |
24 | - EncodedOrth orth; | |
25 | - int tag; | |
26 | - int nameClassifier; | |
27 | -}; | |
28 | - | |
29 | -#endif /* ENCODEDGENERATORINTERPRETATION_HPP */ | |
30 | - |
morfeusz/InflexionGraph.cpp
... | ... | @@ -13,9 +13,9 @@ void InflexionGraph::addStartEdge(const Edge& e) { |
13 | 13 | if (this->graph.empty()) { |
14 | 14 | assert(this->node2ChunkStartPtr.empty()); |
15 | 15 | this->graph.push_back(vector<Edge>()); |
16 | - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); | |
16 | + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr); | |
17 | 17 | } |
18 | - assert(this->node2ChunkStartPtr[0] == e.chunk.chunkStartPtr); | |
18 | + assert(this->node2ChunkStartPtr[0] == e.chunk.textStartPtr); | |
19 | 19 | this->graph[0].push_back(e); |
20 | 20 | } |
21 | 21 | |
... | ... | @@ -24,7 +24,7 @@ void InflexionGraph::addMiddleEdge(unsigned int startNode, const Edge& e) { |
24 | 24 | assert(startNode == this->graph.size()); |
25 | 25 | if (startNode == this->graph.size()) { |
26 | 26 | this->graph.push_back(vector<Edge>()); |
27 | - this->node2ChunkStartPtr.push_back(e.chunk.chunkStartPtr); | |
27 | + this->node2ChunkStartPtr.push_back(e.chunk.textStartPtr); | |
28 | 28 | } |
29 | 29 | this->graph[startNode].push_back(e); |
30 | 30 | } |
... | ... | @@ -98,7 +98,7 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) { |
98 | 98 | vector<Edge>& edges = this->graph.at(node); |
99 | 99 | for (unsigned int i = 0; i < edges.size(); i++) { |
100 | 100 | Edge& e = edges[i]; |
101 | - InflexionGraph::PathElement pathElem(e.chunk.chunkStartPtr, e.chunk.interpsGroup.type); | |
101 | + InflexionGraph::PathElement pathElem(e.chunk.textStartPtr, e.chunk.segmentType); | |
102 | 102 | if (e.nextNode != this->graph.size()) { |
103 | 103 | set<Path> possiblePaths = this->getPossiblePaths(e.nextNode); |
104 | 104 | vector<Path> nextPaths(possiblePaths.begin(), possiblePaths.end()); |
... | ... | @@ -116,9 +116,9 @@ set<InflexionGraph::Path> InflexionGraph::getPossiblePaths(unsigned int node) { |
116 | 116 | static bool containsEqualEdge(const vector<InflexionGraph::Edge>& edges, const InflexionGraph::Edge& e) { |
117 | 117 | for (unsigned int i = 0; i < edges.size(); i++) { |
118 | 118 | const InflexionGraph::Edge& e1 = edges[i]; |
119 | - if (e1.chunk.chunkStartPtr == e.chunk.chunkStartPtr | |
119 | + if (e1.chunk.textStartPtr == e.chunk.textStartPtr | |
120 | 120 | && e1.chunk.lowercaseCodepoints == e.chunk.lowercaseCodepoints |
121 | - && e1.chunk.interpsGroup.type == e.chunk.interpsGroup.type | |
121 | + && e1.chunk.segmentType == e.chunk.segmentType | |
122 | 122 | && e1.nextNode == e.nextNode) { |
123 | 123 | return true; |
124 | 124 | } |
... | ... |
morfeusz/InterpretedChunk.hpp
... | ... | @@ -12,11 +12,13 @@ |
12 | 12 | #include "InterpsGroup.hpp" |
13 | 13 | |
14 | 14 | struct InterpretedChunk { |
15 | - const char* chunkStartPtr; | |
16 | - const char* chunkEndPtr; | |
15 | + unsigned char segmentType; | |
16 | + const char* textStartPtr; | |
17 | + const char* textEndPtr; | |
17 | 18 | std::vector<uint32_t> originalCodepoints; |
18 | 19 | std::vector<uint32_t> lowercaseCodepoints; |
19 | - InterpsGroup interpsGroup; | |
20 | + const unsigned char* interpsPtr; | |
21 | + const unsigned char* interpsEndPtr; | |
20 | 22 | bool shiftOrth; |
21 | 23 | bool orthWasShifted; |
22 | 24 | std::vector<InterpretedChunk> prefixChunks; |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -69,9 +69,8 @@ public: |
69 | 69 | string lemmaPrefix; |
70 | 70 | if (convertPrefixes(interpretedChunk, orth, lemmaPrefix)) { |
71 | 71 | orth += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
72 | - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
73 | - env.getCasePatternHelper().skipCasePattern(currPtr); | |
74 | - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
72 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
73 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
75 | 74 | this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out); |
76 | 75 | } |
77 | 76 | } |
... | ... | @@ -104,7 +103,7 @@ protected: |
104 | 103 | encodedForm.suffixToAdd = (const char*) ptr; |
105 | 104 | ptr += strlen((const char*) ptr) + 1; |
106 | 105 | assert(encodedForm.casePattern.size() == 0); |
107 | - env.getCasePatternHelper().deserializeCasePattern(ptr, encodedForm.casePattern); | |
106 | + encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr); | |
108 | 107 | } |
109 | 108 | private: |
110 | 109 | |
... | ... | @@ -126,8 +125,8 @@ private: |
126 | 125 | std::vector<MorphInterpretation>& out) const { |
127 | 126 | string lemma = lemmaPrefix; |
128 | 127 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
128 | + this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
129 | 129 | if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.value.casePattern)) { |
130 | - this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); | |
131 | 130 | pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); |
132 | 131 | out.push_back(MorphInterpretation( |
133 | 132 | startNode, endNode, |
... | ... | @@ -144,9 +143,9 @@ private: |
144 | 143 | for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { |
145 | 144 | const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; |
146 | 145 | orth += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); |
147 | - const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
146 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
148 | 147 | std::vector<MorphInterpretation> mi; |
149 | - env.getCasePatternHelper().skipCasePattern(ptr); | |
148 | +// env.getCasePatternHelper().skipCasePattern(ptr); | |
150 | 149 | this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi); |
151 | 150 | if (!mi.empty()) { |
152 | 151 | lemmaPrefix += mi[0].getLemma(); |
... | ... | @@ -173,8 +172,8 @@ public: |
173 | 172 | string lemma; |
174 | 173 | convertPrefixes(interpretedChunk, orthPrefix, lemma); |
175 | 174 | lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
176 | - const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; | |
177 | - while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { | |
175 | + const unsigned char* currPtr = interpretedChunk.interpsPtr; | |
176 | + while (currPtr < interpretedChunk.interpsEndPtr) { | |
178 | 177 | MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); |
179 | 178 | // cerr << mi.toString(false) << endl; |
180 | 179 | // cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; |
... | ... | @@ -190,7 +189,7 @@ private: |
190 | 189 | for (unsigned int i = 0; i < interpretedChunk.prefixChunks.size(); i++) { |
191 | 190 | const InterpretedChunk& prefixChunk = interpretedChunk.prefixChunks[i]; |
192 | 191 | lemma += env.getCharsetConverter().toString(prefixChunk.originalCodepoints); |
193 | - const unsigned char* ptr = prefixChunk.interpsGroup.ptr; | |
192 | + const unsigned char* ptr = prefixChunk.interpsPtr; | |
194 | 193 | MorphInterpretation mi = this->decodeMorphInterpretation(0, 0, orthPrefix, string(""), prefixChunk, ptr); |
195 | 194 | orthPrefix += mi.getOrth(); |
196 | 195 | } |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -108,7 +108,7 @@ static inline void doShiftOrth(InterpretedChunk& from, InterpretedChunk& to) { |
108 | 108 | from.prefixChunks.end()); |
109 | 109 | to.prefixChunks.push_back(from); |
110 | 110 | from.orthWasShifted = true; |
111 | - to.chunkStartPtr = from.chunkStartPtr; | |
111 | + to.textStartPtr = from.textStartPtr; | |
112 | 112 | } |
113 | 113 | |
114 | 114 | static inline string debugInterpsGroup(unsigned char type, const char* startPtr, const char* endPtr) { |
... | ... | @@ -120,7 +120,7 @@ static inline string debugInterpsGroup(unsigned char type, const char* startPtr, |
120 | 120 | static inline string debugAccum(vector<InterpretedChunk>& accum) { |
121 | 121 | stringstream res; |
122 | 122 | for (unsigned int i = 0; i < accum.size(); i++) { |
123 | - res << debugInterpsGroup(accum[i].interpsGroup.type, accum[i].chunkStartPtr, accum[i].chunkEndPtr); | |
123 | + res << debugInterpsGroup(accum[i].segmentType, accum[i].textStartPtr, accum[i].textEndPtr); | |
124 | 124 | // res << "(" << (int) accum[i].interpsGroup.type << ", " << string(accum[i].chunkStartPtr, accum[i].chunkStartPtr) << "), "; |
125 | 125 | } |
126 | 126 | return res.str(); |
... | ... | @@ -168,33 +168,37 @@ void Morfeusz::doProcessOneWord( |
168 | 168 | vector<InterpsGroup> val(state.getValue()); |
169 | 169 | for (unsigned int i = 0; i < val.size(); i++) { |
170 | 170 | InterpsGroup& ig = val[i]; |
171 | - vector<bool> casePattern; | |
172 | -// env.getCasePatternHelper().skipCasePattern(ig.ptr); | |
173 | - const unsigned char* casePatternPtr = ig.ptr; | |
174 | - env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern); | |
171 | + // vector<bool> casePattern; | |
172 | + // env.getCasePatternHelper().skipCasePattern(ig.ptr); | |
173 | + // const unsigned char* casePatternPtr = ig.ptr; | |
174 | + // env.getCasePatternHelper().deserializeCasePattern(casePatternPtr, casePattern); | |
175 | 175 | if (this->options.debug) { |
176 | 176 | cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl; |
177 | 177 | } |
178 | - if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) { | |
179 | -// if (true) { | |
180 | - // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
181 | - set<SegrulesState> newSegrulesStates; | |
182 | - env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
183 | - if (this->options.debug && newSegrulesStates.empty()) { | |
184 | - cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | |
185 | - } | |
186 | - // cerr << "newSegrulesStates.size() " << newSegrulesStates.size() << endl; | |
178 | + // if (env.getCasePatternHelper().checkCasePattern(normalizedCodepoints, originalCodepoints, casePattern)) { | |
179 | + // cerr << "accept at '" << currInput << "' type=" << (int) ig.type << endl; | |
180 | + set<SegrulesState> newSegrulesStates; | |
181 | + env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates); | |
182 | + if (this->options.debug && newSegrulesStates.empty()) { | |
183 | + cerr << "NOT ACCEPTING " << debugAccum(accum) << debugInterpsGroup(ig.type, inputStart, currInput) << endl; | |
184 | + } | |
185 | + if (!newSegrulesStates.empty() && env.getCasePatternHelper().checkInterpsGroupCasePatterns(normalizedCodepoints, originalCodepoints, ig)) { | |
186 | + | |
187 | 187 | for ( |
188 | 188 | set<SegrulesState>::iterator it = newSegrulesStates.begin(); |
189 | 189 | it != newSegrulesStates.end(); |
190 | 190 | ++it) { |
191 | 191 | SegrulesState newSegrulesState = *it; |
192 | + const unsigned char* interpsPtr = env.getCasePatternHelper().getInterpretationsPtr(ig); | |
193 | + const unsigned char* interpsEndPtr = ig.ptr + ig.size; | |
192 | 194 | InterpretedChunk ic = { |
195 | + ig.type, | |
193 | 196 | inputStart, |
194 | 197 | currInput, |
195 | 198 | originalCodepoints, |
196 | 199 | normalizedCodepoints, |
197 | - ig, | |
200 | + interpsPtr, | |
201 | + interpsEndPtr, | |
198 | 202 | newSegrulesState.shiftOrthFromPrevious, |
199 | 203 | false, |
200 | 204 | vector<InterpretedChunk>(), |
... | ... |
morfeusz/fsa/const.cpp
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | #include "const.hpp" |
3 | 3 | |
4 | 4 | extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; |
5 | -extern const uint8_t VERSION_NUM = 14; | |
5 | +extern const uint8_t VERSION_NUM = 15; | |
6 | 6 | |
7 | 7 | extern const unsigned int VERSION_NUM_OFFSET = 4; |
8 | 8 | extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; |
... | ... |
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -66,13 +66,13 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
66 | 66 | |
67 | 67 | uint32_t magicNumber = ntohl(*((const uint32_t*) ptr)); |
68 | 68 | if (magicNumber != MAGIC_NUMBER) { |
69 | - throw FSAException("Invalid magic number"); | |
69 | + throw FSAException("Invalid file format"); | |
70 | 70 | } |
71 | 71 | |
72 | 72 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); |
73 | 73 | if (versionNum != VERSION_NUM) { |
74 | 74 | std::ostringstream oss; |
75 | - oss << "Invalid version number: " << versionNum << ", should be: " << VERSION_NUM; | |
75 | + oss << "Invalid file format version number: " << (int) versionNum << ", should be: " << (int) VERSION_NUM; | |
76 | 76 | throw FSAException(oss.str()); |
77 | 77 | } |
78 | 78 | |
... | ... |