Commit dfe845014dcca9040b318b2c28f3409a9c45c3a2

Authored by Michał Lenart
1 parent a6443fde

dodanie obsługi znacznika !weak

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildanalyzer.sh
1 #!/bin/bash 1 #!/bin/bash
2 2
3 python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ 3 python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
4 - --tagset-file=/tmp/polimorf-sgjp.tagset \  
5 - --segments-file=/tmp/segmenty.dat \ 4 + --tagset-file=../input/polimorf.tagset \
  5 + --segments-file=../input/segmenty.dat \
6 --analyzer \ 6 --analyzer \
7 --serialization-method=SIMPLE \ 7 --serialization-method=SIMPLE \
8 --trim-supneg \ 8 --trim-supneg \
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -9,7 +9,7 @@ from pyparsing import * @@ -9,7 +9,7 @@ from pyparsing import *
9 from morfeuszbuilder.utils import exceptions 9 from morfeuszbuilder.utils import exceptions
10 from pyparseString import pyparseString 10 from pyparseString import pyparseString
11 11
12 -identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') 12 +identifier = Word(alphas, bodyChars=alphanums+u'_>*+')
13 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() 13 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
14 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() 14 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
15 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() 15 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
@@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename): @@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename):
61 rule = Forward() 61 rule = Forward()
62 defineInstance = Forward() 62 defineInstance = Forward()
63 localId = identifier.copy() 63 localId = identifier.copy()
  64 + weakLiteral = CaselessLiteral('!weak')
64 65
65 - rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')'))) 66 + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral)
66 defineInstance << localId + Suppress('(') + rule + Suppress(')') 67 defineInstance << localId + Suppress('(') + rule + Suppress(')')
67 68
68 rule.setParseAction(lambda s, l, t: ' '.join(t)) 69 rule.setParseAction(lambda s, l, t: ' '.join(t))
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -13,9 +13,12 @@ class SegmentRule(object): @@ -13,9 +13,12 @@ class SegmentRule(object):
13 13
14 14
15 def __init__(self): 15 def __init__(self):
16 - '''  
17 - Constructor  
18 - ''' 16 +
  17 + self.weak = False
  18 +
  19 + def setWeak(self, weak):
  20 + self.weak = weak
  21 + return self
19 22
20 def addToNFA(self, fsa): 23 def addToNFA(self, fsa):
21 raise NotImplementedError() 24 raise NotImplementedError()
@@ -31,7 +34,7 @@ class TagRule(SegmentRule): @@ -31,7 +34,7 @@ class TagRule(SegmentRule):
31 self.shiftOrth = shiftOrth 34 self.shiftOrth = shiftOrth
32 35
33 def addToNFA(self, fsa): 36 def addToNFA(self, fsa):
34 - endState = RulesNFAState(final=True) 37 + endState = RulesNFAState(final=True, weak=self.weak)
35 self._doAddToNFA(fsa.initialState, endState) 38 self._doAddToNFA(fsa.initialState, endState)
36 39
37 def _doAddToNFA(self, startState, endState): 40 def _doAddToNFA(self, startState, endState):
@@ -51,7 +54,7 @@ class ComplexRule(SegmentRule): @@ -51,7 +54,7 @@ class ComplexRule(SegmentRule):
51 self.children = children 54 self.children = children
52 55
53 def addToNFA(self, fsa): 56 def addToNFA(self, fsa):
54 - endState = RulesNFAState(final=True) 57 + endState = RulesNFAState(final=True, weak=self.weak)
55 self._doAddToNFA(fsa.initialState, endState) 58 self._doAddToNFA(fsa.initialState, endState)
56 59
57 class ConcatRule(ComplexRule): 60 class ConcatRule(ComplexRule):
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -68,11 +68,11 @@ class RulesNFA(object): @@ -68,11 +68,11 @@ class RulesNFA(object):
68 return res 68 return res
69 69
70 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): 70 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
71 - assert all(map(lambda state: state.weak, nfaStates)) \  
72 - or not any(map(lambda state: state.weak, nfaStates))  
73 - weak = all(map(lambda state: state.weak or not state.final, nfaStates)) 71 + assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \
  72 + or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates)))
  73 + weak = any(map(lambda state: state.weak and state.final, nfaStates))
74 final = any(map(lambda state: state.final, nfaStates)) 74 final = any(map(lambda state: state.final, nfaStates))
75 - assert not weak or not final 75 +# assert not weak or not final
76 if final: 76 if final:
77 # dfaState should be final 77 # dfaState should be final
78 # and contain info about weakness 78 # and contain info about weakness
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -95,7 +95,7 @@ class RulesParser(object): @@ -95,7 +95,7 @@ class RulesParser(object):
95 concatRule = OneOrMore(complexRule) 95 concatRule = OneOrMore(complexRule)
96 else: 96 else:
97 concatRule = ZeroOrMore(shiftOrthRule) + tagRule 97 concatRule = ZeroOrMore(shiftOrthRule) + tagRule
98 - rule << concatRule 98 + rule << concatRule + Optional(CaselessLiteral('!weak'))
99 99
100 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) 100 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
101 shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) 101 shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
@@ -104,5 +104,6 @@ class RulesParser(object): @@ -104,5 +104,6 @@ class RulesParser(object):
104 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) 104 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
105 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) 105 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
106 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) 106 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
  107 + rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
107 parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] 108 parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
108 return parsedRule 109 return parsedRule
input/segmenty.dat
@@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj ) @@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj )
101 # Stopień najwyższy: 101 # Stopień najwyższy:
102 # np. „naj·zieleńszy”, „naj·mądrzej” 102 # np. „naj·zieleńszy”, „naj·mądrzej”
103 moze_interp( naj> adj_sup ) 103 moze_interp( naj> adj_sup )
  104 +moze_interp( nie> naj> adj_sup ) !weak
104 105
105 # Formy „zanegowane” gerundiów i imiesłowów: 106 # Formy „zanegowane” gerundiów i imiesłowów:
106 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: 107 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
@@ -112,7 +113,7 @@ moze_interp(z_on_agl) @@ -112,7 +113,7 @@ moze_interp(z_on_agl)
112 moze_interp(z_on_agl on_agl) 113 moze_interp(z_on_agl on_agl)
113 114
114 # Liczba zapisana jako ciąg cyfr: 115 # Liczba zapisana jako ciąg cyfr:
115 -moze_interp( dig>* dig ) 116 +moze_interp( dig>* dig ) !weak
116 117
117 # Formacje prefiksalne 118 # Formacje prefiksalne
118 #### trzeba wydzielić odpowiednie samodze! 119 #### trzeba wydzielić odpowiednie samodze!
morfeusz/CMakeLists.txt
@@ -36,7 +36,7 @@ set(SRC_FILES @@ -36,7 +36,7 @@ set(SRC_FILES
36 charset/caseconv.cpp 36 charset/caseconv.cpp
37 charset/conversion_tables.cpp 37 charset/conversion_tables.cpp
38 segrules/segrules.cpp 38 segrules/segrules.cpp
39 - segrules/SegrulesDeserializer.cpp) 39 +)
40 40
41 set(INCLUDE_FILES 41 set(INCLUDE_FILES
42 const.hpp 42 const.hpp
@@ -50,7 +50,8 @@ set(INCLUDE_FILES @@ -50,7 +50,8 @@ set(INCLUDE_FILES
50 charset/CharsetConverter.hpp 50 charset/CharsetConverter.hpp
51 charset/CaseConverter.hpp 51 charset/CaseConverter.hpp
52 charset/caseconv.hpp 52 charset/caseconv.hpp
53 - charset/conversion_tables.hpp) 53 + charset/conversion_tables.hpp
  54 +)
54 55
55 add_library (libmorfeusz SHARED ${SRC_FILES}) 56 add_library (libmorfeusz SHARED ${SRC_FILES})
56 set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE) 57 set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE)
morfeusz/FlexionGraph.cpp
@@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne( @@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne(
51 return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); 51 return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
52 } 52 }
53 53
54 -void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { 54 +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) {
55 // debugPath(path); 55 // debugPath(path);
56 // debugGraph(this->graph); 56 // debugGraph(this->graph);
  57 + if (weak && !this->empty() && !this->onlyWeakPaths) {
  58 + return;
  59 + }
  60 + else if (this->onlyWeakPaths && !weak) {
  61 + this->graph.clear();
  62 + this->node2ChunkStartPtr.clear();
  63 + this->onlyWeakPaths = false;
  64 + }
57 for (unsigned int i = 0; i < path.size(); i++) { 65 for (unsigned int i = 0; i < path.size(); i++) {
58 const InterpretedChunk& chunk = path[i]; 66 const InterpretedChunk& chunk = path[i];
59 if (!chunk.orthWasShifted) { 67 if (!chunk.orthWasShifted) {
morfeusz/FlexionGraph.hpp
@@ -15,13 +15,17 @@ @@ -15,13 +15,17 @@
15 15
16 class FlexionGraph { 16 class FlexionGraph {
17 public: 17 public:
  18 +
  19 + FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) {
  20 +
  21 + }
18 22
19 struct Edge { 23 struct Edge {
20 InterpretedChunk chunk; 24 InterpretedChunk chunk;
21 unsigned int nextNode; 25 unsigned int nextNode;
22 }; 26 };
23 27
24 - void addPath(const std::vector<InterpretedChunk>& path); 28 + void addPath(const std::vector<InterpretedChunk>& path, bool weak);
25 29
26 // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); 30 // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results);
27 31
@@ -64,6 +68,7 @@ private: @@ -64,6 +68,7 @@ private:
64 68
65 std::vector< std::vector<Edge> > graph; 69 std::vector< std::vector<Edge> > graph;
66 std::vector< const char* > node2ChunkStartPtr; 70 std::vector< const char* > node2ChunkStartPtr;
  71 + bool onlyWeakPaths;
67 }; 72 };
68 73
69 #endif /* FLEXIONGRAPH_HPP */ 74 #endif /* FLEXIONGRAPH_HPP */
morfeusz/Morfeusz.cpp
@@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord( @@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord(
154 doShiftOrth(accum.back(), ic); 154 doShiftOrth(accum.back(), ic);
155 } 155 }
156 accum.push_back(ic); 156 accum.push_back(ic);
157 - if (isEndOfWord(codepoint) && newSegrulesState.accepting) {  
158 - graph.addPath(accum); 157 + if (isEndOfWord(codepoint)
  158 + && newSegrulesState.accepting) {
  159 + graph.addPath(accum, newSegrulesState.weak);
159 } 160 }
160 else if (!isEndOfWord(codepoint)) { 161 else if (!isEndOfWord(codepoint)) {
161 // cerr << "will process " << currInput << endl; 162 // cerr << "will process " << currInput << endl;
morfeusz/segrules/SegrulesDeserializer.cpp deleted
1 -/*  
2 - * File: SegrulesDeserializer.cpp  
3 - * Author: mlenart  
4 - *  
5 - * Created on 25 luty 2014, 16:16  
6 - */  
7 -  
8 -#include "SegrulesDeserializer.hpp"  
9 -  
10 -SegrulesDeserializer::SegrulesDeserializer() {  
11 -}  
12 -  
13 -long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const {  
14 - object = *ptr;  
15 - return 1;  
16 -}  
17 -  
18 -SegrulesDeserializer::~SegrulesDeserializer() {  
19 -}  
20 -  
morfeusz/segrules/SegrulesDeserializer.hpp deleted
1 -/*  
2 - * File: SegrulesDeserializer.hpp  
3 - * Author: mlenart  
4 - *  
5 - * Created on 25 luty 2014, 16:16  
6 - */  
7 -  
8 -#ifndef SEGRULESDESERIALIZER_HPP  
9 -#define SEGRULESDESERIALIZER_HPP  
10 -  
11 -#include "../fsa/fsa.hpp"  
12 -  
13 -class SegrulesDeserializer: public Deserializer<unsigned char> {  
14 -public:  
15 - SegrulesDeserializer();  
16 - long deserialize(const unsigned char* ptr, unsigned char& object) const;  
17 - virtual ~SegrulesDeserializer();  
18 -private:  
19 -  
20 -};  
21 -  
22 -#endif /* SEGRULESDESERIALIZER_HPP */  
23 -  
morfeusz/segrules/segrules.cpp
1 1
2 -#include "SegrulesDeserializer.hpp"  
3 #include "segrules.hpp" 2 #include "segrules.hpp"
4 #include "../fsa/fsa.hpp" 3 #include "../fsa/fsa.hpp"
5 #include "../fsa/const.hpp" 4 #include "../fsa/const.hpp"
morfeusz/segrules/segrules.hpp
@@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions; @@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions;
19 19
20 std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); 20 std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
21 SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr); 21 SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr);
22 -SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);  
23 -void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res); 22 +SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
  23 +void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res);
24 24
25 #endif /* SEGRULES_HPP */ 25 #endif /* SEGRULES_HPP */
26 26
nbproject/configurations.xml
@@ -33,7 +33,6 @@ @@ -33,7 +33,6 @@
33 <in>test_speed.cpp</in> 33 <in>test_speed.cpp</in>
34 </df> 34 </df>
35 <df name="segrules"> 35 <df name="segrules">
36 - <in>SegrulesDeserializer.cpp</in>  
37 <in>segrules.cpp</in> 36 <in>segrules.cpp</in>
38 </df> 37 </df>
39 <in>Environment.cpp</in> 38 <in>Environment.cpp</in>
@@ -496,11 +495,6 @@ @@ -496,11 +495,6 @@
496 </preprocessorList> 495 </preprocessorList>
497 </ccTool> 496 </ccTool>
498 </item> 497 </item>
499 - <item path="morfeusz/segrules/SegrulesDeserializer.cpp"  
500 - ex="false"  
501 - tool="1"  
502 - flavor2="4">  
503 - </item>  
504 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> 498 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
505 </item> 499 </item>
506 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> 500 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">