dodanie obsługi znacznika !weak

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

dodanie obsługi znacznika !weak
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Michał Lenart
1 parent a6443fde
Showing 15 changed files with 44 additions and 73 deletions
fsabuilder/buildanalyzer.sh
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
fsabuilder/morfeuszbuilder/segrules/rules.py
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
input/segmenty.dat
morfeusz/CMakeLists.txt
morfeusz/FlexionGraph.cpp
morfeusz/FlexionGraph.hpp
morfeusz/Morfeusz.cpp
morfeusz/segrules/SegrulesDeserializer.cpp
morfeusz/segrules/SegrulesDeserializer.hpp
morfeusz/segrules/segrules.cpp
morfeusz/segrules/segrules.hpp
nbproject/configurations.xml
 #!/bin/bash
 python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
-    --tagset-file=/tmp/polimorf-sgjp.tagset \
-    --segments-file=/tmp/segmenty.dat \
+    --tagset-file=../input/polimorf.tagset \
+    --segments-file=../input/segmenty.dat \
     --analyzer \
     --serialization-method=SIMPLE \
     --trim-supneg \
@@ -9,7 +9,7 @@ from pyparsing import *
 from morfeuszbuilder.utils import exceptions
 from pyparseString import pyparseString
-identifier = Word(alphas, bodyChars=alphanums+u'_>*+!')
+identifier = Word(alphas, bodyChars=alphanums+u'_>*+')
 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
@@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename):
         rule = Forward()
         defineInstance = Forward()
         localId = identifier.copy()
+        weakLiteral = CaselessLiteral('!weak')
-        rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')))
+        rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral)
         defineInstance << localId + Suppress('(') + rule + Suppress(')')
         rule.setParseAction(lambda s, l, t: ' '.join(t))
@@ -13,9 +13,12 @@ class SegmentRule(object):
     def __init__(self):
-        '''
-        Constructor
-        '''
+        
+        self.weak = False
+    
+    def setWeak(self, weak):
+        self.weak = weak
+        return self
     def addToNFA(self, fsa):
         raise NotImplementedError()
@@ -31,7 +34,7 @@ class TagRule(SegmentRule):
         self.shiftOrth = shiftOrth
     def addToNFA(self, fsa):
-        endState = RulesNFAState(final=True)
+        endState = RulesNFAState(final=True, weak=self.weak)
         self._doAddToNFA(fsa.initialState, endState)
     def _doAddToNFA(self, startState, endState):
@@ -51,7 +54,7 @@ class ComplexRule(SegmentRule):
         self.children = children
     def addToNFA(self, fsa):
-        endState = RulesNFAState(final=True)
+        endState = RulesNFAState(final=True, weak=self.weak)
         self._doAddToNFA(fsa.initialState, endState)
 class ConcatRule(ComplexRule):
@@ -68,11 +68,11 @@ class RulesNFA(object):
         return res
     def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
-        assert all(map(lambda state: state.weak, nfaStates)) \
-            or not any(map(lambda state: state.weak, nfaStates))
-        weak = all(map(lambda state: state.weak or not state.final, nfaStates))
+        assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \
+            or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates)))
+        weak = any(map(lambda state: state.weak and state.final, nfaStates))
         final = any(map(lambda state: state.final, nfaStates))
-        assert not weak or not final
+#         assert not weak or not final
         if final:
             # dfaState should be final
             # and contain info about weakness
@@ -95,7 +95,7 @@ class RulesParser(object):
             concatRule = OneOrMore(complexRule)
         else:
             concatRule = ZeroOrMore(shiftOrthRule) + tagRule
-        rule << concatRule
+        rule << concatRule + Optional(CaselessLiteral('!weak'))
         tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
         shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
@@ -104,5 +104,6 @@ class RulesParser(object):
         oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
         oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
         concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
+        rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
         parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
         return parsedRule
@@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj )
 # Stopień najwyższy:
 # np. „naj·zieleńszy”, „naj·mądrzej”
 moze_interp( naj> adj_sup )
+moze_interp( nie> naj> adj_sup ) !weak
 # Formy „zanegowane” gerundiów i imiesłowów:
 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
@@ -112,7 +113,7 @@ moze_interp(z_on_agl)
 moze_interp(z_on_agl on_agl)
 # Liczba zapisana jako ciąg cyfr:
-moze_interp( dig>* dig )
+moze_interp( dig>* dig ) !weak
 # Formacje prefiksalne
 #### trzeba wydzielić odpowiednie samodze!
@@ -36,7 +36,7 @@ set(SRC_FILES
     charset/caseconv.cpp
     charset/conversion_tables.cpp
     segrules/segrules.cpp
-    segrules/SegrulesDeserializer.cpp)
+)
 set(INCLUDE_FILES 
     const.hpp 
@@ -50,7 +50,8 @@ set(INCLUDE_FILES
     charset/CharsetConverter.hpp
     charset/CaseConverter.hpp
     charset/caseconv.hpp
-    charset/conversion_tables.hpp)
+    charset/conversion_tables.hpp
+)
 add_library (libmorfeusz SHARED ${SRC_FILES})
 set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE)
@@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne(
     return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
 }
-void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
+void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) {
     //    debugPath(path);
     //    debugGraph(this->graph);
+    if (weak && !this->empty() && !this->onlyWeakPaths) {
+        return;
+    }
+    else if (this->onlyWeakPaths && !weak) {
+        this->graph.clear();
+        this->node2ChunkStartPtr.clear();
+        this->onlyWeakPaths = false;
+    }
     for (unsigned int i = 0; i < path.size(); i++) {
         const InterpretedChunk& chunk = path[i];
         if (!chunk.orthWasShifted) {
@@ -15,13 +15,17 @@
 class FlexionGraph {
 public:
+    
+    FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) {
+        
+    }
     struct Edge {
         InterpretedChunk chunk;
         unsigned int nextNode;
     };
-    void addPath(const std::vector<InterpretedChunk>& path);
+    void addPath(const std::vector<InterpretedChunk>& path, bool weak);
     //    void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results);
@@ -64,6 +68,7 @@ private:
     std::vector< std::vector<Edge> > graph;
     std::vector< const char* > node2ChunkStartPtr;
+    bool onlyWeakPaths;
 };
 #endif	/* FLEXIONGRAPH_HPP */
@@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord(
                         doShiftOrth(accum.back(), ic);
                     }
                     accum.push_back(ic);
-                    if (isEndOfWord(codepoint) && newSegrulesState.accepting) {
-                        graph.addPath(accum);
+                    if (isEndOfWord(codepoint) 
+                            && newSegrulesState.accepting) {
+                        graph.addPath(accum, newSegrulesState.weak);
                     }
                     else if (!isEndOfWord(codepoint)) {
 //                        cerr << "will process " << currInput << endl;
-/* 
- * File:   SegrulesDeserializer.cpp
- * Author: mlenart
- * 
- * Created on 25 luty 2014, 16:16
- */
-
-#include "SegrulesDeserializer.hpp"
-
-SegrulesDeserializer::SegrulesDeserializer() {
-}
-
-long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const {
-    object = *ptr;
-    return 1;
-}
-
-SegrulesDeserializer::~SegrulesDeserializer() {
-}
-
-/* 
- * File:   SegrulesDeserializer.hpp
- * Author: mlenart
- *
- * Created on 25 luty 2014, 16:16
- */
-
-#ifndef SEGRULESDESERIALIZER_HPP
-#define	SEGRULESDESERIALIZER_HPP
-
-#include "../fsa/fsa.hpp"
-
-class SegrulesDeserializer: public Deserializer<unsigned char> {
-public:
-    SegrulesDeserializer();
-    long deserialize(const unsigned char* ptr, unsigned char& object) const;
-    virtual ~SegrulesDeserializer();
-private:
-
-};
-
-#endif	/* SEGRULESDESERIALIZER_HPP */
-
-#include "SegrulesDeserializer.hpp"
 #include "segrules.hpp"
 #include "../fsa/fsa.hpp"
 #include "../fsa/const.hpp"
@@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions;
 std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
 SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr);
-SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
-void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res);
+SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
+void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res);
 #endif	/* SEGRULES_HPP */
@@ -33,7 +33,6 @@
         <in>test_speed.cpp</in>
       </df>
       <df name="segrules">
-        <in>SegrulesDeserializer.cpp</in>
         <in>segrules.cpp</in>
       </df>
       <in>Environment.cpp</in>
@@ -496,11 +495,6 @@
           </preprocessorList>
         </ccTool>
       </item>
-      <item path="morfeusz/segrules/SegrulesDeserializer.cpp"
-            ex="false"
-            tool="1"
-            flavor2="4">
-      </item>
       <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
       </item>
       <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">