dalsze próby optymalizacji, poprawa działania operatora ">" w segmentacji (przen…

…osi orth a nie lemat) git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@180 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

dalsze próby optymalizacji, poprawa działania operatora ">" w segmentacji (przen…
…osi orth a nie lemat) git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@180 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent c9317018
Showing 13 changed files with 152 additions and 133 deletions
CMakeLists.txt
fsabuilder/morfeuszbuilder/fsa/fsa.py
morfeusz/CasePatternHelper.hpp
morfeusz/Environment.cpp
morfeusz/Environment.hpp
morfeusz/InterpretedChunksDecoder.hpp
morfeusz/Morfeusz.cpp
morfeusz/Morfeusz.hpp
morfeusz/charset/CharsetConverter.cpp
morfeusz/charset/CharsetConverter.hpp
morfeusz/charset/charset_utils.hpp
morfeusz/segrules/SegrulesFSA.hpp
nbproject/configurations.xml
@@ -5,6 +5,7 @@ project (Morfeusz)
 set (Morfeusz_VERSION_MAJOR 2)
 set (Morfeusz_VERSION_MINOR 0)
 set (Morfeusz_VERSION_PATCH 0)
+set (CMAKE_BUILD_TYPE Release)
  
 enable_testing()
  
@@ -41,7 +41,7 @@ class FSA(object):
         self.n += 1
  
         # debug
-        if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0 or word.startswith('naj'):
+        if self.n < 10 or (self.n < 10000 and self.n % 1000 == 0) or self.n % 10000 == 0:
             logging.info(u'%d %s' % (self.n, word))
         for label in encodedWord:
             self.label2Freq[label] = self.label2Freq.get(label, 0) + 1
@@ -74,29 +74,20 @@ public:
  
     std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr) const {
         std::vector<bool> res;
-        uint8_t casePatternType = *ptr;
-        ptr++;
+        uint8_t casePatternType = *ptr++;
         uint8_t prefixLength;
         uint8_t patternLength;
         switch (casePatternType) {
             case LEMMA_ONLY_LOWER:
                 break;
             case LEMMA_UPPER_PREFIX:
-                prefixLength = *ptr;
-                ptr++;
-                for (unsigned int i = 0; i < prefixLength; i++) {
-                    //                lemma.casePattern[i] = true;
-                    res.push_back(true);
-                }
-                //            lemma.casePattern.resize(prefixLength, true);
+                prefixLength = *ptr++;
+                res.resize(prefixLength, true);
                 break;
             case LEMMA_MIXED_CASE:
-                patternLength = *ptr;
-                ptr++;
+                patternLength = *ptr++;
                 for (unsigned int i = 0; i < patternLength; i++) {
-                    uint8_t idx = *ptr;
-                    ptr++;
-                    //                lemma.casePattern[idx] = true;
+                    uint8_t idx = *ptr++;
                     res.resize(idx + 1, false);
                     res[idx] = true;
                 }
@@ -38,10 +38,6 @@ Environment::Environment(
         MorfeuszProcessorType processorType,
         const unsigned char* fsaFileStartPtr)
 : currentCharsetConverter(getCharsetConverter(charset)),
-utf8CharsetConverter(),
-isoCharsetConverter(),
-cp1250CharsetConverter(),
-cp852CharsetConverter(),
 caseConverter(),
 tagset(fsaFileStartPtr),
 qualifiers(fsaFileStartPtr),
@@ -63,13 +59,13 @@ casePatternHelper() {
 const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const {
     switch (charset) {
         case UTF8:
-            return &this->utf8CharsetConverter;
+            return &UTF8CharsetConverter::getInstance();
         case ISO8859_2:
-            return &this->isoCharsetConverter;
+            return &ISO8859_2_CharsetConverter::getInstance();
         case CP1250:
-            return &this->cp1250CharsetConverter;
+            return &Windows_1250_CharsetConverter::getInstance();
         case CP852:
-            return &this->cp852CharsetConverter;
+            return &CP852_CharsetConverter::getInstance();
         default:
             throw MorfeuszException("invalid charset");
     }
@@ -65,10 +65,6 @@ public:
     virtual ~Environment();
 private:
     const CharsetConverter* currentCharsetConverter;
-    const UTF8CharsetConverter utf8CharsetConverter;
-    const ISO8859_2_CharsetConverter isoCharsetConverter;
-    const Windows_1250_CharsetConverter cp1250CharsetConverter;
-    const CP852_CharsetConverter cp852CharsetConverter;
     const CaseConverter caseConverter;
     Tagset tagset;
     Qualifiers qualifiers;
@@ -62,7 +62,7 @@ public:
             orth += this->env.getCharsetConverter().toString(interpretedChunk.originalCodepoints);
             const unsigned char* currPtr = interpretedChunk.interpsPtr;
             while (currPtr < interpretedChunk.interpsEndPtr) {
-                this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, currPtr, out);
+                this->decodeMorphInterpretation(startNode, endNode, orth, lemmaPrefix, interpretedChunk, false, currPtr, out);
             }
         }
     }
@@ -72,6 +72,7 @@ protected:
     void decodeForm(
             const vector<uint32_t>& orth,
             const EncodedForm& lemma,
+            bool forPrefix,
             string& res) const {
         for (unsigned int i = lemma.prefixToCut; i < orth.size() - lemma.suffixToCut; i++) {
             uint32_t cp =
@@ -80,11 +81,13 @@ protected:
                     : orth[i];
             env.getCharsetConverter().append(cp, res);
         }
-        const char* suffixPtr = lemma.suffixToAdd.c_str();
-        const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
-        while (suffixPtr != suffixEnd) {
-            uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
-            env.getCharsetConverter().append(cp, res);
+        if (!forPrefix) {
+            const char* suffixPtr = lemma.suffixToAdd.c_str();
+            const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length();
+            while (suffixPtr != suffixEnd) {
+                uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
+                env.getCharsetConverter().append(cp, res);
+            }
         }
     }
  
@@ -97,12 +100,10 @@ protected:
         assert(encodedForm.casePattern.size() == 0);
         if (isLemmaOnlyLower(compressionByte)) {
             encodedForm.casePattern = std::vector<bool>();
-        }
-        else if (isLemmaOnlyTitle(compressionByte)) {
+        } else if (isLemmaOnlyTitle(compressionByte)) {
             encodedForm.casePattern = std::vector<bool>();
             encodedForm.casePattern.push_back(true);
-        }
-        else {
+        } else {
             encodedForm.casePattern = env.getCasePatternHelper().deserializeOneCasePattern(ptr);
         }
     }
@@ -110,11 +111,9 @@ protected:
     EncodedInterpretation deserializeEncodedInterp(const unsigned char*& ptr, unsigned char compressionByte) const {
         EncodedInterpretation interp;
         if (isOrthOnlyLower(compressionByte)) {
-        }
-        else if (isOrthOnlyTitle(compressionByte)) {
+        } else if (isOrthOnlyTitle(compressionByte)) {
             interp.orthCasePattern.push_back(true);
-        }
-        else {
+        } else {
             interp.orthCasePattern = this->env.getCasePatternHelper().deserializeOneCasePattern(ptr);
         }
         deserializeEncodedForm(ptr, compressionByte, interp.value);
@@ -129,8 +128,7 @@ private:
         vector<string> splitRes(split(lemma, ':'));
         if (splitRes.size() == 2) {
             return make_pair(splitRes[0], splitRes[1]);
-        }
-        else {
+        } else {
             return make_pair(lemma, "");
         }
     }
@@ -140,17 +138,18 @@ private:
             const string& orth,
             const string& lemmaPrefix,
             const InterpretedChunk& chunk,
+            bool forPrefix,
             const unsigned char*& ptr,
             std::vector<MorphInterpretation>& out) const {
         string lemma = lemmaPrefix;
         EncodedInterpretation ei = this->deserializeEncodedInterp(ptr, *chunk.interpsGroupPtr);
-        this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma);
+        this->decodeForm(chunk.lowercaseCodepoints, ei.value, forPrefix, lemma);
         if (env.getCasePatternHelper().checkCasePattern(chunk.lowercaseCodepoints, chunk.originalCodepoints, ei.orthCasePattern)) {
             //            pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma);
             out.push_back(MorphInterpretation(
                     startNode, endNode,
                     orth, lemma,
-//                    "",
+                    //                    "",
                     ei.tag,
                     ei.nameClassifier,
                     ei.qualifiers,
@@ -165,11 +164,10 @@ private:
             const unsigned char* ptr = prefixChunk.interpsPtr;
             std::vector<MorphInterpretation> mi;
             //            env.getCasePatternHelper().skipCasePattern(ptr);
-            this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, ptr, mi);
+            this->decodeMorphInterpretation(0, 0, orth, string(""), prefixChunk, true, ptr, mi);
             if (!mi.empty()) {
                 lemmaPrefix += mi[0].getLemma();
-            }
-            else {
+            } else {
                 return false;
             }
         }
@@ -227,7 +225,7 @@ private:
         return MorphInterpretation(
                 startNode, endNode,
                 orth, lemma + HOMONYM_SEPARATOR + ei.homonymId,
-//                ei.homonymId,
+                //                ei.homonymId,
                 ei.tag,
                 ei.nameClassifier,
                 ei.qualifiers,
@@ -245,7 +243,7 @@ private:
         const char* suffixPtr = orth.suffixToAdd.c_str();
         const char* suffixEnd = suffixPtr + orth.suffixToAdd.length();
         while (suffixPtr != suffixEnd) {
-            uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd);
+            uint32_t cp = UTF8CharsetConverter::getInstance().next(suffixPtr, suffixEnd);
             env.getCharsetConverter().append(cp, res);
         }
     }
@@ -39,7 +39,7 @@ Morfeusz::Morfeusz()
 generatorEnv(DEFAULT_MORFEUSZ_CHARSET, GENERATOR, DEFAULT_SYNTH_FSA),
 options(createDefaultOptions()),
 accum(),
-graph(){
+graph() {
     analyzerEnv.setCaseSensitive(options.caseSensitive);
     generatorEnv.setCaseSensitive(false);
 }
@@ -85,14 +85,14 @@ void Morfeusz::processOneWord(
             && isWhitespace(env.getCharsetConverter().peek(inputStart, inputEnd))) {
         env.getCharsetConverter().next(inputStart, inputEnd);
     }
-    
+
     accum.clear();
     graph.clear();
-    
+
     const char* currInput = inputStart;
     const SegrulesFSA& segrulesFSA = env.getCurrentSegrulesFSA();
  
-    doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState, accum, graph);
+    doProcessOneWord(env, currInput, inputEnd, segrulesFSA.initialState);
  
     if (!graph.empty()) {
         const InterpretedChunksDecoder& interpretedChunksDecoder = env.getInterpretedChunksDecoder();
@@ -111,7 +111,6 @@ void Morfeusz::processOneWord(
             && env.getProcessorType() == ANALYZER
             && !insideIgnHandler) {
         this->handleIgnChunk(env, inputStart, currInput, startNodeNum, results);
-        //        this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
     }
     else if (inputStart != inputEnd) {
         this->appendIgnotiumToResults(env, string(inputStart, currInput), startNodeNum, results);
@@ -144,25 +143,44 @@ static inline string debugAccum(vector<InterpretedChunk>& accum) {
     return res.str();
 }
  
+static inline void feedStateDirectly(
+        StateType& state,
+        const char* inputStart,
+        const char* inputEnd) {
+    const char* currInput = inputStart;
+    while (currInput != inputEnd && !state.isSink()) {
+        state.proceedToNext(*currInput++);
+    }
+}
+
+static inline void feedState(
+        StateType& state,
+        int codepoint) {
+    std::string chars;
+    UTF8CharsetConverter::getInstance().append(codepoint, chars);
+    for (unsigned int i = 0; i < chars.length() && !state.isSink(); i++) {
+        state.proceedToNext(chars[i]);
+    }
+}
+
 void Morfeusz::doProcessOneWord(
         const Environment& env,
         const char*& inputData,
         const char* inputEnd,
-        SegrulesState segrulesState,
-        vector<InterpretedChunk>& accum,
-        InflexionGraph& graph) const {
+        SegrulesState segrulesState) const {
     if (this->options.debug) {
         cerr << "----------" << endl;
         cerr << "PROCESS: '" << inputData << "', already recognized: " << debugAccum(accum) << endl;
     }
     //    cerr << "doAnalyzeOneWord " << inputData << endl;
     const char* inputStart = inputData;
+    const char* prevInput = inputData;
     const char* currInput = inputData;
     uint32_t codepoint = inputData == inputEnd ? 0 : env.getCharsetConverter().next(currInput, inputEnd);
     bool currCodepointIsWhitespace = isWhitespace(codepoint);
     vector<uint32_t> originalCodepoints;
     vector<uint32_t> normalizedCodepoints;
-    
+
     originalCodepoints.reserve(16);
     normalizedCodepoints.reserve(16);
  
@@ -174,7 +192,13 @@ void Morfeusz::doProcessOneWord(
                 : codepoint;
         originalCodepoints.push_back(codepoint);
         normalizedCodepoints.push_back(normalizedCodepoint);
-        feedState(state, normalizedCodepoint, UTF8CharsetConverter());
+        if (codepoint == normalizedCodepoint && &env.getCharsetConverter() == &UTF8CharsetConverter::getInstance()) {
+            feedStateDirectly(state, prevInput, currInput);
+        }
+        else {
+            feedState(state, normalizedCodepoint);
+        }
+
         codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd);
         currCodepointIsWhitespace = isWhitespace(codepoint);
         string homonymId;
@@ -184,6 +208,7 @@ void Morfeusz::doProcessOneWord(
             }
             homonymId = string(currInput + 1, inputEnd);
             //            cerr << "homonym " << homonymId << endl;
+            prevInput = currInput;
             currInput = inputEnd;
             codepoint = 0x00;
             currCodepointIsWhitespace = true;
@@ -195,9 +220,8 @@ void Morfeusz::doProcessOneWord(
                 if (this->options.debug) {
                     cerr << "recognized: " << debugInterpsGroup(ig.type, inputStart, currInput) << " at: '" << inputStart << "'" << endl;
                 }
-                vector<SegrulesState> newSegrulesStates;
-                env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, newSegrulesStates);
-                if (!newSegrulesStates.empty() 
+                vector<SegrulesState> newSegrulesStates = env.getCurrentSegrulesFSA().proceedToNext(ig.type, segrulesState, currCodepointIsWhitespace);
+                if (!newSegrulesStates.empty()
                         && env.getCasePatternHelper().checkInterpsGroupOrthCasePatterns(normalizedCodepoints, originalCodepoints, ig)) {
  
                     for (
@@ -225,17 +249,18 @@ void Morfeusz::doProcessOneWord(
                             doShiftOrth(accum.back(), ic);
                         }
                         accum.push_back(ic);
-                        if (currCodepointIsWhitespace
-                                && newSegrulesState.accepting) {
+                        if (currCodepointIsWhitespace) {
+                            assert(newSegrulesState.accepting);
                             if (this->options.debug) {
                                 cerr << "ACCEPTING " << debugAccum(accum) << endl;
                             }
                             graph.addPath(accum, newSegrulesState.weak);
                         }
-                        else if (!currCodepointIsWhitespace) {
+                        else {
+                            assert(!newSegrulesState.sink);
                             //                        cerr << "will process " << currInput << endl;
                             const char* newCurrInput = currInput;
-                            doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph);
+                            doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState);
                         }
                         accum.pop_back();
                     }
@@ -246,6 +271,7 @@ void Morfeusz::doProcessOneWord(
                 }
             }
         }
+        prevInput = currInput;
         codepoint = currInput == inputEnd || currCodepointIsWhitespace ? 0x00 : env.getCharsetConverter().next(currInput, inputEnd);
     }
     inputData = currInput;
@@ -170,9 +170,7 @@ private:
             const Environment& env,
             const char*& inputData,
             const char* inputEnd,
-            SegrulesState segrulesState,
-            std::vector<InterpretedChunk>& accum,
-            InflexionGraph& graph) const;
+            SegrulesState segrulesState) const;
  
     void handleIgnChunk(
         const Environment& env,
@@ -36,6 +36,13 @@ static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char
     }
 }
  
+const UTF8CharsetConverter& UTF8CharsetConverter::getInstance() {
+    static UTF8CharsetConverter instance;
+    return instance;
+}
+
+UTF8CharsetConverter::UTF8CharsetConverter() {}
+
 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
     uint32_t cp = 0;
     utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
@@ -86,14 +93,29 @@ void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
     }
 }
  
+const ISO8859_2_CharsetConverter& ISO8859_2_CharsetConverter::getInstance() {
+    static ISO8859_2_CharsetConverter instance;
+    return instance;
+}
+
 ISO8859_2_CharsetConverter::ISO8859_2_CharsetConverter()
 : OneByteCharsetConverter(ISO_8859_2_TO_CODEPOINT) {
 }
  
+const Windows_1250_CharsetConverter& Windows_1250_CharsetConverter::getInstance() {
+    static Windows_1250_CharsetConverter instance;
+    return instance;
+}
+
 Windows_1250_CharsetConverter::Windows_1250_CharsetConverter()
 : OneByteCharsetConverter(WINDOWS_1250_TO_CODEPOINT) {
 }
  
+const CP852_CharsetConverter& CP852_CharsetConverter::getInstance() {
+    static CP852_CharsetConverter instance;
+    return instance;
+}
+
 CP852_CharsetConverter::CP852_CharsetConverter()
 : OneByteCharsetConverter(CP852_TO_CODEPOINT) {
 }
@@ -21,17 +21,23 @@ public:
     virtual std::string fromUTF8(const std::string& input) const;
  
     std::string toString(const std::vector<uint32_t>& codepoints) const;
-    
+
     virtual ~CharsetConverter();
 private:
 };
  
 class UTF8CharsetConverter : public CharsetConverter {
 public:
+
+    static const UTF8CharsetConverter& getInstance();
+
     uint32_t next(const char*& it, const char* end) const;
     void append(uint32_t cp, std::string& result) const;
     std::string fromUTF8(const std::string& input) const;
 private:
+    UTF8CharsetConverter();
+    UTF8CharsetConverter(const UTF8CharsetConverter&); // do not implement
+    void operator=(const UTF8CharsetConverter&); // do not implement
 };
  
 /*
@@ -49,20 +55,29 @@ private:
  
 class ISO8859_2_CharsetConverter : public OneByteCharsetConverter {
 public:
-    ISO8859_2_CharsetConverter();
+    static const ISO8859_2_CharsetConverter& getInstance();
 private:
+    ISO8859_2_CharsetConverter();
+    ISO8859_2_CharsetConverter(const ISO8859_2_CharsetConverter&); // do not implement
+    void operator=(const ISO8859_2_CharsetConverter&); // do not implement
 };
  
 class Windows_1250_CharsetConverter : public OneByteCharsetConverter {
 public:
-    Windows_1250_CharsetConverter();
+    static const Windows_1250_CharsetConverter& getInstance();
 private:
+    Windows_1250_CharsetConverter();
+    Windows_1250_CharsetConverter(const Windows_1250_CharsetConverter&); // do not implement
+    void operator=(const Windows_1250_CharsetConverter&); // do not implement
 };
  
 class CP852_CharsetConverter : public OneByteCharsetConverter {
 public:
-    CP852_CharsetConverter();
+    static const CP852_CharsetConverter& getInstance();
 private:
+    CP852_CharsetConverter();
+    CP852_CharsetConverter(const CP852_CharsetConverter&); // do not implement
+    void operator=(const CP852_CharsetConverter&); // do not implement
 };
  
 #endif	/* ENCODINGCONVERTER_HPP */
@@ -53,17 +53,5 @@ inline bool isWhitespace(uint32_t codepoint) {
     return std::binary_search(whitespaces.begin(), whitespaces.end(), codepoint);
 }
  
-template <class StateClass>
-void feedState(
-        StateClass& state,
-        int codepoint,
-        const CharsetConverter& charsetConverter) {
-    std::string chars;
-    charsetConverter.append(codepoint, chars);
-    for (unsigned int i = 0; i < chars.length(); i++) {
-        state.proceedToNext(chars[i]);
-    }
-}
-
 #endif	/* CHARSET_UTILS_HPP */
  
@@ -17,42 +17,48 @@ struct SegrulesState {
     bool accepting;
     bool weak;
     bool shiftOrthFromPrevious;
+    bool sink;
 };
  
-inline bool operator<(const SegrulesState& s1, const SegrulesState& s2)
-{
-  return s1.offset < s2.offset;
+inline bool operator<(const SegrulesState& s1, const SegrulesState& s2) {
+    return s1.offset < s2.offset;
 }
  
 class SegrulesFSA {
 public:
-    SegrulesFSA(const unsigned char* ptr): initialState(), ptr(ptr) {
-        SegrulesState state = {0, false, false, false};
+
+    SegrulesFSA(const unsigned char* ptr) : initialState(), ptr(ptr) {
+        SegrulesState state = {0, false, false, false, false};
         initialState = state;
     }
-    
-    void proceedToNext(
-        const unsigned char segnum,
-        const SegrulesState state,
-        std::vector<SegrulesState>& newStates) const {
-        
-        const unsigned char* currPtr = ptr + state.offset;
-        currPtr++;
+
+    std::vector<SegrulesState> proceedToNext(
+            const unsigned char segnum,
+            const SegrulesState state,
+            bool atEndOfWord) const {
+        std::vector<SegrulesState> res;
+        const unsigned char* currPtr = ptr + state.offset + 1;
         const unsigned char transitionsNum = *currPtr++;
         for (unsigned int i = 0; i < transitionsNum; i++) {
             if (*currPtr == segnum) {
-                newStates.push_back(this->transition2State(currPtr));
+                SegrulesState newState = this->transition2State(currPtr);
+                if ((atEndOfWord && newState.accepting) 
+                        || (!atEndOfWord && !newState.sink)) {
+                    res.push_back(newState);
+                }
             }
             currPtr += 4;
         }
+        return res;
     }
-    
-    virtual ~SegrulesFSA() {}
-    
+
+    virtual ~SegrulesFSA() {
+    }
+
     SegrulesState initialState;
 private:
     const unsigned char* ptr;
-    
+
     SegrulesState transition2State(const unsigned char* transitionPtr) const {
         unsigned char ACCEPTING_FLAG = 1;
         unsigned char WEAK_FLAG = 2;
@@ -62,6 +68,7 @@ private:
         res.offset = readInt16(transitionPtr);
         res.accepting = *(ptr + res.offset) & ACCEPTING_FLAG;
         res.weak = *(ptr + res.offset) & WEAK_FLAG;
+        res.sink = *(ptr + res.offset + 1) == 0;
         return res;
     }
 };
@@ -26,9 +26,6 @@
       <df name="cli">
         <in>cli.cpp</in>
       </df>
-      <df name="data">
-        <in>default_fsa.cpp</in>
-      </df>
       <df name="fsa">
         <in>const.cpp</in>
         <in>test_not_recognize.cpp</in>
@@ -133,8 +130,6 @@
         </ccTool>
       </item>
       <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4">
-        <ccTool flags="1">
-        </ccTool>
       </item>
       <item path="build/morfeusz/morfeuszJAVA_wrap.cxx"
             ex="false"
@@ -236,25 +231,6 @@
           </preprocessorList>
         </ccTool>
       </folder>
-      <folder path="0/data">
-        <ccTool>
-          <incDir>
-            <pElem>build</pElem>
-            <pElem>build/morfeusz</pElem>
-          </incDir>
-          <preprocessorList>
-            <Elem>NDEBUG</Elem>
-            <Elem>_OPTIMIZE__=1</Elem>
-            <Elem>__PIC__=2</Elem>
-            <Elem>__pic__=2</Elem>
-            <Elem>libmorfeusz_EXPORTS</Elem>
-          </preprocessorList>
-          <undefinedList>
-            <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem>
-            <Elem>__NO_INLINE__</Elem>
-          </undefinedList>
-        </ccTool>
-      </folder>
       <folder path="0/segrules">
         <ccTool>
           <incDir>
@@ -307,7 +283,7 @@
         <ccTool>
           <incDir>
             <pElem>morfeusz</pElem>
-            <pElem>/usr/lib/jvm/default-java/include</pElem>
+            <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem>
           </incDir>
           <preprocessorList>
             <Elem>NDEBUG</Elem>
@@ -373,13 +349,14 @@
         </ccTool>
       </item>
       <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="4">
-        <ccTool flags="2">
+        <ccTool flags="1">
           <incDir>
             <pElem>build</pElem>
             <pElem>morfeusz</pElem>
             <pElem>build/morfeusz</pElem>
           </incDir>
           <preprocessorList>
+            <Elem>NDEBUG</Elem>
             <Elem>libmorfeusz_EXPORTS</Elem>
           </preprocessorList>
         </ccTool>
@@ -410,13 +387,14 @@
         </ccTool>
       </item>
       <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
-        <ccTool flags="2">
+        <ccTool flags="1">
           <incDir>
             <pElem>build</pElem>
             <pElem>morfeusz</pElem>
             <pElem>build/morfeusz</pElem>
           </incDir>
           <preprocessorList>
+            <Elem>NDEBUG</Elem>
             <Elem>libmorfeusz_EXPORTS</Elem>
           </preprocessorList>
         </ccTool>
@@ -427,6 +405,11 @@
             ex="false"
             tool="1"
             flavor2="4">
+        <ccTool flags="1">
+          <preprocessorList>
+            <Elem>NDEBUG</Elem>
+          </preprocessorList>
+        </ccTool>
       </item>
       <item path="morfeusz/charset/caseconv.cpp" ex="false" tool="1" flavor2="4">
       </item>
@@ -436,8 +419,6 @@
             flavor2="4">
       </item>
       <item path="morfeusz/cli/cli.cpp" ex="false" tool="1" flavor2="4">
-        <ccTool flags="1">
-        </ccTool>
       </item>
       <item path="morfeusz/const.cpp" ex="false" tool="1" flavor2="4">
         <ccTool flags="1">