Commit dfe845014dcca9040b318b2c28f3409a9c45c3a2
1 parent: a6443fde
dodanie obsługi znacznika !weak
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@130 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 15 changed files with 44 additions and 73 deletions
fsabuilder/buildanalyzer.sh
1 | 1 | #!/bin/bash |
2 | 2 | |
3 | 3 | python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ |
4 | - --tagset-file=/tmp/polimorf-sgjp.tagset \ | |
5 | - --segments-file=/tmp/segmenty.dat \ | |
4 | + --tagset-file=../input/polimorf.tagset \ | |
5 | + --segments-file=../input/segmenty.dat \ | |
6 | 6 | --analyzer \ |
7 | 7 | --serialization-method=SIMPLE \ |
8 | 8 | --trim-supneg \ |
... | ... |
fsabuilder/morfeuszbuilder/segrules/preprocessor.py
... | ... | @@ -9,7 +9,7 @@ from pyparsing import * |
9 | 9 | from morfeuszbuilder.utils import exceptions |
10 | 10 | from pyparseString import pyparseString |
11 | 11 | |
12 | -identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') | |
12 | +identifier = Word(alphas, bodyChars=alphanums+u'_>*+') | |
13 | 13 | define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() |
14 | 14 | ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() |
15 | 15 | endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() |
... | ... | @@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename): |
61 | 61 | rule = Forward() |
62 | 62 | defineInstance = Forward() |
63 | 63 | localId = identifier.copy() |
64 | + weakLiteral = CaselessLiteral('!weak') | |
64 | 65 | |
65 | - rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')'))) | |
66 | + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral) | |
66 | 67 | defineInstance << localId + Suppress('(') + rule + Suppress(')') |
67 | 68 | |
68 | 69 | rule.setParseAction(lambda s, l, t: ' '.join(t)) |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rules.py
... | ... | @@ -13,9 +13,12 @@ class SegmentRule(object): |
13 | 13 | |
14 | 14 | |
15 | 15 | def __init__(self): |
16 | - ''' | |
17 | - Constructor | |
18 | - ''' | |
16 | + | |
17 | + self.weak = False | |
18 | + | |
19 | + def setWeak(self, weak): | |
20 | + self.weak = weak | |
21 | + return self | |
19 | 22 | |
20 | 23 | def addToNFA(self, fsa): |
21 | 24 | raise NotImplementedError() |
... | ... | @@ -31,7 +34,7 @@ class TagRule(SegmentRule): |
31 | 34 | self.shiftOrth = shiftOrth |
32 | 35 | |
33 | 36 | def addToNFA(self, fsa): |
34 | - endState = RulesNFAState(final=True) | |
37 | + endState = RulesNFAState(final=True, weak=self.weak) | |
35 | 38 | self._doAddToNFA(fsa.initialState, endState) |
36 | 39 | |
37 | 40 | def _doAddToNFA(self, startState, endState): |
... | ... | @@ -51,7 +54,7 @@ class ComplexRule(SegmentRule): |
51 | 54 | self.children = children |
52 | 55 | |
53 | 56 | def addToNFA(self, fsa): |
54 | - endState = RulesNFAState(final=True) | |
57 | + endState = RulesNFAState(final=True, weak=self.weak) | |
55 | 58 | self._doAddToNFA(fsa.initialState, endState) |
56 | 59 | |
57 | 60 | class ConcatRule(ComplexRule): |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
... | ... | @@ -68,11 +68,11 @@ class RulesNFA(object): |
68 | 68 | return res |
69 | 69 | |
70 | 70 | def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): |
71 | - assert all(map(lambda state: state.weak, nfaStates)) \ | |
72 | - or not any(map(lambda state: state.weak, nfaStates)) | |
73 | - weak = all(map(lambda state: state.weak or not state.final, nfaStates)) | |
71 | + assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \ | |
72 | + or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) | |
73 | + weak = any(map(lambda state: state.weak and state.final, nfaStates)) | |
74 | 74 | final = any(map(lambda state: state.final, nfaStates)) |
75 | - assert not weak or not final | |
75 | +# assert not weak or not final | |
76 | 76 | if final: |
77 | 77 | # dfaState should be final |
78 | 78 | # and contain info about weakness |
... | ... |
fsabuilder/morfeuszbuilder/segrules/rulesParser.py
... | ... | @@ -95,7 +95,7 @@ class RulesParser(object): |
95 | 95 | concatRule = OneOrMore(complexRule) |
96 | 96 | else: |
97 | 97 | concatRule = ZeroOrMore(shiftOrthRule) + tagRule |
98 | - rule << concatRule | |
98 | + rule << concatRule + Optional(CaselessLiteral('!weak')) | |
99 | 99 | |
100 | 100 | tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) |
101 | 101 | shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) |
... | ... | @@ -104,5 +104,6 @@ class RulesParser(object): |
104 | 104 | oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) |
105 | 105 | oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) |
106 | 106 | concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) |
107 | + rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2)) | |
107 | 108 | parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] |
108 | 109 | return parsedRule |
... | ... |
input/segmenty.dat
... | ... | @@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj ) |
101 | 101 | # Stopień najwyższy: |
102 | 102 | # np. „naj·zieleńszy”, „naj·mądrzej” |
103 | 103 | moze_interp( naj> adj_sup ) |
104 | +moze_interp( nie> naj> adj_sup ) !weak | |
104 | 105 | |
105 | 106 | # Formy „zanegowane” gerundiów i imiesłowów: |
106 | 107 | # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: |
... | ... | @@ -112,7 +113,7 @@ moze_interp(z_on_agl) |
112 | 113 | moze_interp(z_on_agl on_agl) |
113 | 114 | |
114 | 115 | # Liczba zapisana jako ciąg cyfr: |
115 | -moze_interp( dig>* dig ) | |
116 | +moze_interp( dig>* dig ) !weak | |
116 | 117 | |
117 | 118 | # Formacje prefiksalne |
118 | 119 | #### trzeba wydzielić odpowiednie samodze! |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -36,7 +36,7 @@ set(SRC_FILES |
36 | 36 | charset/caseconv.cpp |
37 | 37 | charset/conversion_tables.cpp |
38 | 38 | segrules/segrules.cpp |
39 | - segrules/SegrulesDeserializer.cpp) | |
39 | +) | |
40 | 40 | |
41 | 41 | set(INCLUDE_FILES |
42 | 42 | const.hpp |
... | ... | @@ -50,7 +50,8 @@ set(INCLUDE_FILES |
50 | 50 | charset/CharsetConverter.hpp |
51 | 51 | charset/CaseConverter.hpp |
52 | 52 | charset/caseconv.hpp |
53 | - charset/conversion_tables.hpp) | |
53 | + charset/conversion_tables.hpp | |
54 | +) | |
54 | 55 | |
55 | 56 | add_library (libmorfeusz SHARED ${SRC_FILES}) |
56 | 57 | set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE) |
... | ... |
morfeusz/FlexionGraph.cpp
... | ... | @@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne( |
51 | 51 | return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); |
52 | 52 | } |
53 | 53 | |
54 | -void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | |
54 | +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) { | |
55 | 55 | // debugPath(path); |
56 | 56 | // debugGraph(this->graph); |
57 | + if (weak && !this->empty() && !this->onlyWeakPaths) { | |
58 | + return; | |
59 | + } | |
60 | + else if (this->onlyWeakPaths && !weak) { | |
61 | + this->graph.clear(); | |
62 | + this->node2ChunkStartPtr.clear(); | |
63 | + this->onlyWeakPaths = false; | |
64 | + } | |
57 | 65 | for (unsigned int i = 0; i < path.size(); i++) { |
58 | 66 | const InterpretedChunk& chunk = path[i]; |
59 | 67 | if (!chunk.orthWasShifted) { |
... | ... |
morfeusz/FlexionGraph.hpp
... | ... | @@ -15,13 +15,17 @@ |
15 | 15 | |
16 | 16 | class FlexionGraph { |
17 | 17 | public: |
18 | + | |
19 | + FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) { | |
20 | + | |
21 | + } | |
18 | 22 | |
19 | 23 | struct Edge { |
20 | 24 | InterpretedChunk chunk; |
21 | 25 | unsigned int nextNode; |
22 | 26 | }; |
23 | 27 | |
24 | - void addPath(const std::vector<InterpretedChunk>& path); | |
28 | + void addPath(const std::vector<InterpretedChunk>& path, bool weak); | |
25 | 29 | |
26 | 30 | // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); |
27 | 31 | |
... | ... | @@ -64,6 +68,7 @@ private: |
64 | 68 | |
65 | 69 | std::vector< std::vector<Edge> > graph; |
66 | 70 | std::vector< const char* > node2ChunkStartPtr; |
71 | + bool onlyWeakPaths; | |
67 | 72 | }; |
68 | 73 | |
69 | 74 | #endif /* FLEXIONGRAPH_HPP */ |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord( |
154 | 154 | doShiftOrth(accum.back(), ic); |
155 | 155 | } |
156 | 156 | accum.push_back(ic); |
157 | - if (isEndOfWord(codepoint) && newSegrulesState.accepting) { | |
158 | - graph.addPath(accum); | |
157 | + if (isEndOfWord(codepoint) | |
158 | + && newSegrulesState.accepting) { | |
159 | + graph.addPath(accum, newSegrulesState.weak); | |
159 | 160 | } |
160 | 161 | else if (!isEndOfWord(codepoint)) { |
161 | 162 | // cerr << "will process " << currInput << endl; |
... | ... |
morfeusz/segrules/SegrulesDeserializer.cpp deleted
1 | -/* | |
2 | - * File: SegrulesDeserializer.cpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 25 luty 2014, 16:16 | |
6 | - */ | |
7 | - | |
8 | -#include "SegrulesDeserializer.hpp" | |
9 | - | |
10 | -SegrulesDeserializer::SegrulesDeserializer() { | |
11 | -} | |
12 | - | |
13 | -long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const { | |
14 | - object = *ptr; | |
15 | - return 1; | |
16 | -} | |
17 | - | |
18 | -SegrulesDeserializer::~SegrulesDeserializer() { | |
19 | -} | |
20 | - |
morfeusz/segrules/SegrulesDeserializer.hpp deleted
1 | -/* | |
2 | - * File: SegrulesDeserializer.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 25 luty 2014, 16:16 | |
6 | - */ | |
7 | - | |
8 | -#ifndef SEGRULESDESERIALIZER_HPP | |
9 | -#define SEGRULESDESERIALIZER_HPP | |
10 | - | |
11 | -#include "../fsa/fsa.hpp" | |
12 | - | |
13 | -class SegrulesDeserializer: public Deserializer<unsigned char> { | |
14 | -public: | |
15 | - SegrulesDeserializer(); | |
16 | - long deserialize(const unsigned char* ptr, unsigned char& object) const; | |
17 | - virtual ~SegrulesDeserializer(); | |
18 | -private: | |
19 | - | |
20 | -}; | |
21 | - | |
22 | -#endif /* SEGRULESDESERIALIZER_HPP */ | |
23 | - |
morfeusz/segrules/segrules.cpp
morfeusz/segrules/segrules.hpp
... | ... | @@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions; |
19 | 19 | |
20 | 20 | std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); |
21 | 21 | SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr); |
22 | -SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); | |
23 | -void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res); | |
22 | +SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); | |
23 | +void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res); | |
24 | 24 | |
25 | 25 | #endif /* SEGRULES_HPP */ |
26 | 26 | |
... | ... |
nbproject/configurations.xml
... | ... | @@ -33,7 +33,6 @@ |
33 | 33 | <in>test_speed.cpp</in> |
34 | 34 | </df> |
35 | 35 | <df name="segrules"> |
36 | - <in>SegrulesDeserializer.cpp</in> | |
37 | 36 | <in>segrules.cpp</in> |
38 | 37 | </df> |
39 | 38 | <in>Environment.cpp</in> |
... | ... | @@ -496,11 +495,6 @@ |
496 | 495 | </preprocessorList> |
497 | 496 | </ccTool> |
498 | 497 | </item> |
499 | - <item path="morfeusz/segrules/SegrulesDeserializer.cpp" | |
500 | - ex="false" | |
501 | - tool="1" | |
502 | - flavor2="4"> | |
503 | - </item> | |
504 | 498 | <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> |
505 | 499 | </item> |
506 | 500 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
... | ... |