Commit dfe845014dcca9040b318b2c28f3409a9c45c3a2
1 parent
a6443fde
dodanie obsługi znacznika !weak
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
15 changed files
with
44 additions
and
73 deletions
fsabuilder/buildanalyzer.sh
1 | #!/bin/bash | 1 | #!/bin/bash |
2 | 2 | ||
3 | python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ | 3 | python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ |
4 | - --tagset-file=/tmp/polimorf-sgjp.tagset \ | ||
5 | - --segments-file=/tmp/segmenty.dat \ | 4 | + --tagset-file=../input/polimorf.tagset \ |
5 | + --segments-file=../input/segmenty.dat \ | ||
6 | --analyzer \ | 6 | --analyzer \ |
7 | --serialization-method=SIMPLE \ | 7 | --serialization-method=SIMPLE \ |
8 | --trim-supneg \ | 8 | --trim-supneg \ |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -9,7 +9,7 @@ from pyparsing import * | @@ -9,7 +9,7 @@ from pyparsing import * | ||
9 | from morfeuszbuilder.utils import exceptions | 9 | from morfeuszbuilder.utils import exceptions |
10 | from pyparseString import pyparseString | 10 | from pyparseString import pyparseString |
11 | 11 | ||
12 | -identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') | 12 | +identifier = Word(alphas, bodyChars=alphanums+u'_>*+') |
13 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() | 13 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() | 14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() | 15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
@@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename): | @@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename): | ||
61 | rule = Forward() | 61 | rule = Forward() |
62 | defineInstance = Forward() | 62 | defineInstance = Forward() |
63 | localId = identifier.copy() | 63 | localId = identifier.copy() |
64 | + weakLiteral = CaselessLiteral('!weak') | ||
64 | 65 | ||
65 | - rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')'))) | 66 | + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral) |
66 | defineInstance << localId + Suppress('(') + rule + Suppress(')') | 67 | defineInstance << localId + Suppress('(') + rule + Suppress(')') |
67 | 68 | ||
68 | rule.setParseAction(lambda s, l, t: ' '.join(t)) | 69 | rule.setParseAction(lambda s, l, t: ' '.join(t)) |
fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -13,9 +13,12 @@ class SegmentRule(object): | @@ -13,9 +13,12 @@ class SegmentRule(object): | ||
13 | 13 | ||
14 | 14 | ||
15 | def __init__(self): | 15 | def __init__(self): |
16 | - ''' | ||
17 | - Constructor | ||
18 | - ''' | 16 | + |
17 | + self.weak = False | ||
18 | + | ||
19 | + def setWeak(self, weak): | ||
20 | + self.weak = weak | ||
21 | + return self | ||
19 | 22 | ||
20 | def addToNFA(self, fsa): | 23 | def addToNFA(self, fsa): |
21 | raise NotImplementedError() | 24 | raise NotImplementedError() |
@@ -31,7 +34,7 @@ class TagRule(SegmentRule): | @@ -31,7 +34,7 @@ class TagRule(SegmentRule): | ||
31 | self.shiftOrth = shiftOrth | 34 | self.shiftOrth = shiftOrth |
32 | 35 | ||
33 | def addToNFA(self, fsa): | 36 | def addToNFA(self, fsa): |
34 | - endState = RulesNFAState(final=True) | 37 | + endState = RulesNFAState(final=True, weak=self.weak) |
35 | self._doAddToNFA(fsa.initialState, endState) | 38 | self._doAddToNFA(fsa.initialState, endState) |
36 | 39 | ||
37 | def _doAddToNFA(self, startState, endState): | 40 | def _doAddToNFA(self, startState, endState): |
@@ -51,7 +54,7 @@ class ComplexRule(SegmentRule): | @@ -51,7 +54,7 @@ class ComplexRule(SegmentRule): | ||
51 | self.children = children | 54 | self.children = children |
52 | 55 | ||
53 | def addToNFA(self, fsa): | 56 | def addToNFA(self, fsa): |
54 | - endState = RulesNFAState(final=True) | 57 | + endState = RulesNFAState(final=True, weak=self.weak) |
55 | self._doAddToNFA(fsa.initialState, endState) | 58 | self._doAddToNFA(fsa.initialState, endState) |
56 | 59 | ||
57 | class ConcatRule(ComplexRule): | 60 | class ConcatRule(ComplexRule): |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -68,11 +68,11 @@ class RulesNFA(object): | @@ -68,11 +68,11 @@ class RulesNFA(object): | ||
68 | return res | 68 | return res |
69 | 69 | ||
70 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): | 70 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
71 | - assert all(map(lambda state: state.weak, nfaStates)) \ | ||
72 | - or not any(map(lambda state: state.weak, nfaStates)) | ||
73 | - weak = all(map(lambda state: state.weak or not state.final, nfaStates)) | 71 | + assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \ |
72 | + or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) | ||
73 | + weak = any(map(lambda state: state.weak and state.final, nfaStates)) | ||
74 | final = any(map(lambda state: state.final, nfaStates)) | 74 | final = any(map(lambda state: state.final, nfaStates)) |
75 | - assert not weak or not final | 75 | +# assert not weak or not final |
76 | if final: | 76 | if final: |
77 | # dfaState should be final | 77 | # dfaState should be final |
78 | # and contain info about weakness | 78 | # and contain info about weakness |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -95,7 +95,7 @@ class RulesParser(object): | @@ -95,7 +95,7 @@ class RulesParser(object): | ||
95 | concatRule = OneOrMore(complexRule) | 95 | concatRule = OneOrMore(complexRule) |
96 | else: | 96 | else: |
97 | concatRule = ZeroOrMore(shiftOrthRule) + tagRule | 97 | concatRule = ZeroOrMore(shiftOrthRule) + tagRule |
98 | - rule << concatRule | 98 | + rule << concatRule + Optional(CaselessLiteral('!weak')) |
99 | 99 | ||
100 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) | 100 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
101 | shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) | 101 | shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) |
@@ -104,5 +104,6 @@ class RulesParser(object): | @@ -104,5 +104,6 @@ class RulesParser(object): | ||
104 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) | 104 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
105 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) | 105 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
106 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) | 106 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
107 | + rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2)) | ||
107 | parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] | 108 | parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] |
108 | return parsedRule | 109 | return parsedRule |
input/segmenty.dat
@@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj ) | @@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj ) | ||
101 | # Stopień najwyższy: | 101 | # Stopień najwyższy: |
102 | # np. „naj·zieleńszy”, „naj·mądrzej” | 102 | # np. „naj·zieleńszy”, „naj·mądrzej” |
103 | moze_interp( naj> adj_sup ) | 103 | moze_interp( naj> adj_sup ) |
104 | +moze_interp( nie> naj> adj_sup ) !weak | ||
104 | 105 | ||
105 | # Formy „zanegowane” gerundiów i imiesłowów: | 106 | # Formy „zanegowane” gerundiów i imiesłowów: |
106 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: | 107 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
@@ -112,7 +113,7 @@ moze_interp(z_on_agl) | @@ -112,7 +113,7 @@ moze_interp(z_on_agl) | ||
112 | moze_interp(z_on_agl on_agl) | 113 | moze_interp(z_on_agl on_agl) |
113 | 114 | ||
114 | # Liczba zapisana jako ciąg cyfr: | 115 | # Liczba zapisana jako ciąg cyfr: |
115 | -moze_interp( dig>* dig ) | 116 | +moze_interp( dig>* dig ) !weak |
116 | 117 | ||
117 | # Formacje prefiksalne | 118 | # Formacje prefiksalne |
118 | #### trzeba wydzielić odpowiednie samodze! | 119 | #### trzeba wydzielić odpowiednie samodze! |
morfeusz/CMakeLists.txt
@@ -36,7 +36,7 @@ set(SRC_FILES | @@ -36,7 +36,7 @@ set(SRC_FILES | ||
36 | charset/caseconv.cpp | 36 | charset/caseconv.cpp |
37 | charset/conversion_tables.cpp | 37 | charset/conversion_tables.cpp |
38 | segrules/segrules.cpp | 38 | segrules/segrules.cpp |
39 | - segrules/SegrulesDeserializer.cpp) | 39 | +) |
40 | 40 | ||
41 | set(INCLUDE_FILES | 41 | set(INCLUDE_FILES |
42 | const.hpp | 42 | const.hpp |
@@ -50,7 +50,8 @@ set(INCLUDE_FILES | @@ -50,7 +50,8 @@ set(INCLUDE_FILES | ||
50 | charset/CharsetConverter.hpp | 50 | charset/CharsetConverter.hpp |
51 | charset/CaseConverter.hpp | 51 | charset/CaseConverter.hpp |
52 | charset/caseconv.hpp | 52 | charset/caseconv.hpp |
53 | - charset/conversion_tables.hpp) | 53 | + charset/conversion_tables.hpp |
54 | +) | ||
54 | 55 | ||
55 | add_library (libmorfeusz SHARED ${SRC_FILES}) | 56 | add_library (libmorfeusz SHARED ${SRC_FILES}) |
56 | set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE) | 57 | set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE) |
morfeusz/FlexionGraph.cpp
@@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne( | @@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne( | ||
51 | return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); | 51 | return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); |
52 | } | 52 | } |
53 | 53 | ||
54 | -void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | 54 | +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) { |
55 | // debugPath(path); | 55 | // debugPath(path); |
56 | // debugGraph(this->graph); | 56 | // debugGraph(this->graph); |
57 | + if (weak && !this->empty() && !this->onlyWeakPaths) { | ||
58 | + return; | ||
59 | + } | ||
60 | + else if (this->onlyWeakPaths && !weak) { | ||
61 | + this->graph.clear(); | ||
62 | + this->node2ChunkStartPtr.clear(); | ||
63 | + this->onlyWeakPaths = false; | ||
64 | + } | ||
57 | for (unsigned int i = 0; i < path.size(); i++) { | 65 | for (unsigned int i = 0; i < path.size(); i++) { |
58 | const InterpretedChunk& chunk = path[i]; | 66 | const InterpretedChunk& chunk = path[i]; |
59 | if (!chunk.orthWasShifted) { | 67 | if (!chunk.orthWasShifted) { |
morfeusz/FlexionGraph.hpp
@@ -15,13 +15,17 @@ | @@ -15,13 +15,17 @@ | ||
15 | 15 | ||
16 | class FlexionGraph { | 16 | class FlexionGraph { |
17 | public: | 17 | public: |
18 | + | ||
19 | + FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) { | ||
20 | + | ||
21 | + } | ||
18 | 22 | ||
19 | struct Edge { | 23 | struct Edge { |
20 | InterpretedChunk chunk; | 24 | InterpretedChunk chunk; |
21 | unsigned int nextNode; | 25 | unsigned int nextNode; |
22 | }; | 26 | }; |
23 | 27 | ||
24 | - void addPath(const std::vector<InterpretedChunk>& path); | 28 | + void addPath(const std::vector<InterpretedChunk>& path, bool weak); |
25 | 29 | ||
26 | // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); | 30 | // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); |
27 | 31 | ||
@@ -64,6 +68,7 @@ private: | @@ -64,6 +68,7 @@ private: | ||
64 | 68 | ||
65 | std::vector< std::vector<Edge> > graph; | 69 | std::vector< std::vector<Edge> > graph; |
66 | std::vector< const char* > node2ChunkStartPtr; | 70 | std::vector< const char* > node2ChunkStartPtr; |
71 | + bool onlyWeakPaths; | ||
67 | }; | 72 | }; |
68 | 73 | ||
69 | #endif /* FLEXIONGRAPH_HPP */ | 74 | #endif /* FLEXIONGRAPH_HPP */ |
morfeusz/Morfeusz.cpp
@@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord( | @@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord( | ||
154 | doShiftOrth(accum.back(), ic); | 154 | doShiftOrth(accum.back(), ic); |
155 | } | 155 | } |
156 | accum.push_back(ic); | 156 | accum.push_back(ic); |
157 | - if (isEndOfWord(codepoint) && newSegrulesState.accepting) { | ||
158 | - graph.addPath(accum); | 157 | + if (isEndOfWord(codepoint) |
158 | + && newSegrulesState.accepting) { | ||
159 | + graph.addPath(accum, newSegrulesState.weak); | ||
159 | } | 160 | } |
160 | else if (!isEndOfWord(codepoint)) { | 161 | else if (!isEndOfWord(codepoint)) { |
161 | // cerr << "will process " << currInput << endl; | 162 | // cerr << "will process " << currInput << endl; |
morfeusz/segrules/SegrulesDeserializer.cpp deleted
1 | -/* | ||
2 | - * File: SegrulesDeserializer.cpp | ||
3 | - * Author: mlenart | ||
4 | - * | ||
5 | - * Created on 25 luty 2014, 16:16 | ||
6 | - */ | ||
7 | - | ||
8 | -#include "SegrulesDeserializer.hpp" | ||
9 | - | ||
10 | -SegrulesDeserializer::SegrulesDeserializer() { | ||
11 | -} | ||
12 | - | ||
13 | -long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const { | ||
14 | - object = *ptr; | ||
15 | - return 1; | ||
16 | -} | ||
17 | - | ||
18 | -SegrulesDeserializer::~SegrulesDeserializer() { | ||
19 | -} | ||
20 | - |
morfeusz/segrules/SegrulesDeserializer.hpp deleted
1 | -/* | ||
2 | - * File: SegrulesDeserializer.hpp | ||
3 | - * Author: mlenart | ||
4 | - * | ||
5 | - * Created on 25 luty 2014, 16:16 | ||
6 | - */ | ||
7 | - | ||
8 | -#ifndef SEGRULESDESERIALIZER_HPP | ||
9 | -#define SEGRULESDESERIALIZER_HPP | ||
10 | - | ||
11 | -#include "../fsa/fsa.hpp" | ||
12 | - | ||
13 | -class SegrulesDeserializer: public Deserializer<unsigned char> { | ||
14 | -public: | ||
15 | - SegrulesDeserializer(); | ||
16 | - long deserialize(const unsigned char* ptr, unsigned char& object) const; | ||
17 | - virtual ~SegrulesDeserializer(); | ||
18 | -private: | ||
19 | - | ||
20 | -}; | ||
21 | - | ||
22 | -#endif /* SEGRULESDESERIALIZER_HPP */ | ||
23 | - |
morfeusz/segrules/segrules.cpp
morfeusz/segrules/segrules.hpp
@@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions; | @@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions; | ||
19 | 19 | ||
20 | std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); | 20 | std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); |
21 | SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr); | 21 | SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr); |
22 | -SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); | ||
23 | -void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res); | 22 | +SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); |
23 | +void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res); | ||
24 | 24 | ||
25 | #endif /* SEGRULES_HPP */ | 25 | #endif /* SEGRULES_HPP */ |
26 | 26 |
nbproject/configurations.xml
@@ -33,7 +33,6 @@ | @@ -33,7 +33,6 @@ | ||
33 | <in>test_speed.cpp</in> | 33 | <in>test_speed.cpp</in> |
34 | </df> | 34 | </df> |
35 | <df name="segrules"> | 35 | <df name="segrules"> |
36 | - <in>SegrulesDeserializer.cpp</in> | ||
37 | <in>segrules.cpp</in> | 36 | <in>segrules.cpp</in> |
38 | </df> | 37 | </df> |
39 | <in>Environment.cpp</in> | 38 | <in>Environment.cpp</in> |
@@ -496,11 +495,6 @@ | @@ -496,11 +495,6 @@ | ||
496 | </preprocessorList> | 495 | </preprocessorList> |
497 | </ccTool> | 496 | </ccTool> |
498 | </item> | 497 | </item> |
499 | - <item path="morfeusz/segrules/SegrulesDeserializer.cpp" | ||
500 | - ex="false" | ||
501 | - tool="1" | ||
502 | - flavor2="4"> | ||
503 | - </item> | ||
504 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> | 498 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
505 | </item> | 499 | </item> |
506 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> | 500 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |