Commit dfe845014dcca9040b318b2c28f3409a9c45c3a2

Authored by Michał Lenart
1 parent a6443fde

dodanie obsługi znacznika !weak

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsabuilder/buildanalyzer.sh
1 1 #!/bin/bash
2 2  
3 3 python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
4   - --tagset-file=/tmp/polimorf-sgjp.tagset \
5   - --segments-file=/tmp/segmenty.dat \
  4 + --tagset-file=../input/polimorf.tagset \
  5 + --segments-file=../input/segmenty.dat \
6 6 --analyzer \
7 7 --serialization-method=SIMPLE \
8 8 --trim-supneg \
... ...
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... ... @@ -9,7 +9,7 @@ from pyparsing import *
9 9 from morfeuszbuilder.utils import exceptions
10 10 from pyparseString import pyparseString
11 11  
12   -identifier = Word(alphas, bodyChars=alphanums+u'_>*+!')
  12 +identifier = Word(alphas, bodyChars=alphanums+u'_>*+')
13 13 define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
14 14 ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
15 15 endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
... ... @@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename):
61 61 rule = Forward()
62 62 defineInstance = Forward()
63 63 localId = identifier.copy()
  64 + weakLiteral = CaselessLiteral('!weak')
64 65  
65   - rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')))
  66 + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral)
66 67 defineInstance << localId + Suppress('(') + rule + Suppress(')')
67 68  
68 69 rule.setParseAction(lambda s, l, t: ' '.join(t))
... ...
fsabuilder/morfeuszbuilder/segrules/rules.py
... ... @@ -13,9 +13,12 @@ class SegmentRule(object):
13 13  
14 14  
15 15 def __init__(self):
16   - '''
17   - Constructor
18   - '''
  16 +
  17 + self.weak = False
  18 +
  19 + def setWeak(self, weak):
  20 + self.weak = weak
  21 + return self
19 22  
20 23 def addToNFA(self, fsa):
21 24 raise NotImplementedError()
... ... @@ -31,7 +34,7 @@ class TagRule(SegmentRule):
31 34 self.shiftOrth = shiftOrth
32 35  
33 36 def addToNFA(self, fsa):
34   - endState = RulesNFAState(final=True)
  37 + endState = RulesNFAState(final=True, weak=self.weak)
35 38 self._doAddToNFA(fsa.initialState, endState)
36 39  
37 40 def _doAddToNFA(self, startState, endState):
... ... @@ -51,7 +54,7 @@ class ComplexRule(SegmentRule):
51 54 self.children = children
52 55  
53 56 def addToNFA(self, fsa):
54   - endState = RulesNFAState(final=True)
  57 + endState = RulesNFAState(final=True, weak=self.weak)
55 58 self._doAddToNFA(fsa.initialState, endState)
56 59  
57 60 class ConcatRule(ComplexRule):
... ...
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... ... @@ -68,11 +68,11 @@ class RulesNFA(object):
68 68 return res
69 69  
70 70 def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
71   - assert all(map(lambda state: state.weak, nfaStates)) \
72   - or not any(map(lambda state: state.weak, nfaStates))
73   - weak = all(map(lambda state: state.weak or not state.final, nfaStates))
  71 + assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \
  72 + or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates)))
  73 + weak = any(map(lambda state: state.weak and state.final, nfaStates))
74 74 final = any(map(lambda state: state.final, nfaStates))
75   - assert not weak or not final
  75 +# assert not weak or not final
76 76 if final:
77 77 # dfaState should be final
78 78 # and contain info about weakness
... ...
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... ... @@ -95,7 +95,7 @@ class RulesParser(object):
95 95 concatRule = OneOrMore(complexRule)
96 96 else:
97 97 concatRule = ZeroOrMore(shiftOrthRule) + tagRule
98   - rule << concatRule
  98 + rule << concatRule + Optional(CaselessLiteral('!weak'))
99 99  
100 100 tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
101 101 shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
... ... @@ -104,5 +104,6 @@ class RulesParser(object):
104 104 oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
105 105 oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
106 106 concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
  107 + rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
107 108 parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
108 109 return parsedRule
... ...
input/segmenty.dat
... ... @@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj )
101 101 # Stopień najwyższy:
102 102 # np. „naj·zieleńszy”, „naj·mądrzej”
103 103 moze_interp( naj> adj_sup )
  104 +moze_interp( nie> naj> adj_sup ) !weak
104 105  
105 106 # Formy „zanegowane” gerundiów i imiesłowów:
106 107 # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
... ... @@ -112,7 +113,7 @@ moze_interp(z_on_agl)
112 113 moze_interp(z_on_agl on_agl)
113 114  
114 115 # Liczba zapisana jako ciąg cyfr:
115   -moze_interp( dig>* dig )
  116 +moze_interp( dig>* dig ) !weak
116 117  
117 118 # Formacje prefiksalne
118 119 #### trzeba wydzielić odpowiednie samodze!
... ...
morfeusz/CMakeLists.txt
... ... @@ -36,7 +36,7 @@ set(SRC_FILES
36 36 charset/caseconv.cpp
37 37 charset/conversion_tables.cpp
38 38 segrules/segrules.cpp
39   - segrules/SegrulesDeserializer.cpp)
  39 +)
40 40  
41 41 set(INCLUDE_FILES
42 42 const.hpp
... ... @@ -50,7 +50,8 @@ set(INCLUDE_FILES
50 50 charset/CharsetConverter.hpp
51 51 charset/CaseConverter.hpp
52 52 charset/caseconv.hpp
53   - charset/conversion_tables.hpp)
  53 + charset/conversion_tables.hpp
  54 +)
54 55  
55 56 add_library (libmorfeusz SHARED ${SRC_FILES})
56 57 set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE)
... ...
morfeusz/FlexionGraph.cpp
... ... @@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne(
51 51 return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
52 52 }
53 53  
54   -void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
  54 +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) {
55 55 // debugPath(path);
56 56 // debugGraph(this->graph);
  57 + if (weak && !this->empty() && !this->onlyWeakPaths) {
  58 + return;
  59 + }
  60 + else if (this->onlyWeakPaths && !weak) {
  61 + this->graph.clear();
  62 + this->node2ChunkStartPtr.clear();
  63 + this->onlyWeakPaths = false;
  64 + }
57 65 for (unsigned int i = 0; i < path.size(); i++) {
58 66 const InterpretedChunk& chunk = path[i];
59 67 if (!chunk.orthWasShifted) {
... ...
morfeusz/FlexionGraph.hpp
... ... @@ -15,13 +15,17 @@
15 15  
16 16 class FlexionGraph {
17 17 public:
  18 +
  19 + FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) {
  20 +
  21 + }
18 22  
19 23 struct Edge {
20 24 InterpretedChunk chunk;
21 25 unsigned int nextNode;
22 26 };
23 27  
24   - void addPath(const std::vector<InterpretedChunk>& path);
  28 + void addPath(const std::vector<InterpretedChunk>& path, bool weak);
25 29  
26 30 // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results);
27 31  
... ... @@ -64,6 +68,7 @@ private:
64 68  
65 69 std::vector< std::vector<Edge> > graph;
66 70 std::vector< const char* > node2ChunkStartPtr;
  71 + bool onlyWeakPaths;
67 72 };
68 73  
69 74 #endif /* FLEXIONGRAPH_HPP */
... ...
morfeusz/Morfeusz.cpp
... ... @@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord(
154 154 doShiftOrth(accum.back(), ic);
155 155 }
156 156 accum.push_back(ic);
157   - if (isEndOfWord(codepoint) && newSegrulesState.accepting) {
158   - graph.addPath(accum);
  157 + if (isEndOfWord(codepoint)
  158 + && newSegrulesState.accepting) {
  159 + graph.addPath(accum, newSegrulesState.weak);
159 160 }
160 161 else if (!isEndOfWord(codepoint)) {
161 162 // cerr << "will process " << currInput << endl;
... ...
morfeusz/segrules/SegrulesDeserializer.cpp deleted
1   -/*
2   - * File: SegrulesDeserializer.cpp
3   - * Author: mlenart
4   - *
5   - * Created on 25 luty 2014, 16:16
6   - */
7   -
8   -#include "SegrulesDeserializer.hpp"
9   -
10   -SegrulesDeserializer::SegrulesDeserializer() {
11   -}
12   -
13   -long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const {
14   - object = *ptr;
15   - return 1;
16   -}
17   -
18   -SegrulesDeserializer::~SegrulesDeserializer() {
19   -}
20   -
morfeusz/segrules/SegrulesDeserializer.hpp deleted
1   -/*
2   - * File: SegrulesDeserializer.hpp
3   - * Author: mlenart
4   - *
5   - * Created on 25 luty 2014, 16:16
6   - */
7   -
8   -#ifndef SEGRULESDESERIALIZER_HPP
9   -#define SEGRULESDESERIALIZER_HPP
10   -
11   -#include "../fsa/fsa.hpp"
12   -
13   -class SegrulesDeserializer: public Deserializer<unsigned char> {
14   -public:
15   - SegrulesDeserializer();
16   - long deserialize(const unsigned char* ptr, unsigned char& object) const;
17   - virtual ~SegrulesDeserializer();
18   -private:
19   -
20   -};
21   -
22   -#endif /* SEGRULESDESERIALIZER_HPP */
23   -
morfeusz/segrules/segrules.cpp
1 1  
2   -#include "SegrulesDeserializer.hpp"
3 2 #include "segrules.hpp"
4 3 #include "../fsa/fsa.hpp"
5 4 #include "../fsa/const.hpp"
... ...
morfeusz/segrules/segrules.hpp
... ... @@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions;
19 19  
20 20 std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
21 21 SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr);
22   -SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
23   -void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res);
  22 +SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
  23 +void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res);
24 24  
25 25 #endif /* SEGRULES_HPP */
26 26  
... ...
nbproject/configurations.xml
... ... @@ -33,7 +33,6 @@
33 33 <in>test_speed.cpp</in>
34 34 </df>
35 35 <df name="segrules">
36   - <in>SegrulesDeserializer.cpp</in>
37 36 <in>segrules.cpp</in>
38 37 </df>
39 38 <in>Environment.cpp</in>
... ... @@ -496,11 +495,6 @@
496 495 </preprocessorList>
497 496 </ccTool>
498 497 </item>
499   - <item path="morfeusz/segrules/SegrulesDeserializer.cpp"
500   - ex="false"
501   - tool="1"
502   - flavor2="4">
503   - </item>
504 498 <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
505 499 </item>
506 500 <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
... ...