diff --git a/fsabuilder/buildanalyzer.sh b/fsabuilder/buildanalyzer.sh index f38d861..56702e1 100755 --- a/fsabuilder/buildanalyzer.sh +++ b/fsabuilder/buildanalyzer.sh @@ -1,8 +1,8 @@ #!/bin/bash python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \ - --tagset-file=/tmp/polimorf-sgjp.tagset \ - --segments-file=/tmp/segmenty.dat \ + --tagset-file=../input/polimorf.tagset \ + --segments-file=../input/segmenty.dat \ --analyzer \ --serialization-method=SIMPLE \ --trim-supneg \ diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py index 8d5b1ed..1f36b09 100644 --- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py +++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py @@ -9,7 +9,7 @@ from pyparsing import * from morfeuszbuilder.utils import exceptions from pyparseString import pyparseString -identifier = Word(alphas, bodyChars=alphanums+u'_>*+!') +identifier = Word(alphas, bodyChars=alphanums+u'_>*+') define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd() ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd() endif = Keyword('#endif').suppress() + LineEnd() + StringEnd() @@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename): rule = Forward() defineInstance = Forward() localId = identifier.copy() + weakLiteral = CaselessLiteral('!weak') - rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')'))) + rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral) defineInstance << localId + Suppress('(') + rule + Suppress(')') rule.setParseAction(lambda s, l, t: ' '.join(t)) diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py index 08ff8ad..02f57a8 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rules.py +++ b/fsabuilder/morfeuszbuilder/segrules/rules.py @@ -13,9 +13,12 @@ class SegmentRule(object): def __init__(self): - ''' - Constructor - ''' + + self.weak = False + + def setWeak(self, weak): + self.weak = weak + return self def addToNFA(self, fsa): raise NotImplementedError() @@ -31,7 +34,7 @@ class TagRule(SegmentRule): self.shiftOrth = shiftOrth def addToNFA(self, fsa): - endState = RulesNFAState(final=True) + endState = RulesNFAState(final=True, weak=self.weak) self._doAddToNFA(fsa.initialState, endState) def _doAddToNFA(self, startState, endState): @@ -51,7 +54,7 @@ class ComplexRule(SegmentRule): self.children = children def addToNFA(self, fsa): - endState = RulesNFAState(final=True) + endState = RulesNFAState(final=True, weak=self.weak) self._doAddToNFA(fsa.initialState, endState) class ConcatRule(ComplexRule): diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py index 2fe36c1..dcafdb7 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py @@ -68,11 +68,11 @@ class RulesNFA(object): return res def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState): - assert all(map(lambda state: state.weak, nfaStates)) \ - or not any(map(lambda state: state.weak, nfaStates)) - weak = all(map(lambda state: state.weak or not state.final, nfaStates)) + assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \ + or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) + weak = any(map(lambda state: state.weak and state.final, nfaStates)) final = any(map(lambda state: state.final, nfaStates)) - assert not weak or not final +# assert not weak or not final if final: # dfaState should be final # and contain info about weakness diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py index 82bc9f6..a13cd33 100644 --- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py +++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py @@ -95,7 +95,7 @@ class RulesParser(object): concatRule = OneOrMore(complexRule) else: concatRule = ZeroOrMore(shiftOrthRule) + tagRule - rule << concatRule + rule << concatRule + Optional(CaselessLiteral('!weak')) tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper)) shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper)) @@ -104,5 +104,6 @@ class RulesParser(object): oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])])) oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks)) concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks)) + rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2)) parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0] return parsedRule diff --git a/input/segmenty.dat b/input/segmenty.dat index d0ff85c..9505803 100644 --- a/input/segmenty.dat +++ b/input/segmenty.dat @@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj ) # Stopień najwyższy: # np. „naj·zieleńszy”, „naj·mądrzej” moze_interp( naj> adj_sup ) +moze_interp( nie> naj> adj_sup ) !weak # Formy „zanegowane” gerundiów i imiesłowów: # np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: @@ -112,7 +113,7 @@ moze_interp(z_on_agl) moze_interp(z_on_agl on_agl) # Liczba zapisana jako ciąg cyfr: -moze_interp( dig>* dig ) +moze_interp( dig>* dig ) !weak # Formacje prefiksalne #### trzeba wydzielić odpowiednie samodze! diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index a56465f..29a52b9 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -36,7 +36,7 @@ set(SRC_FILES charset/caseconv.cpp charset/conversion_tables.cpp segrules/segrules.cpp - segrules/SegrulesDeserializer.cpp) +) set(INCLUDE_FILES const.hpp @@ -50,7 +50,8 @@ set(INCLUDE_FILES charset/CharsetConverter.hpp charset/CaseConverter.hpp charset/caseconv.hpp - charset/conversion_tables.hpp) + charset/conversion_tables.hpp +) add_library (libmorfeusz SHARED ${SRC_FILES}) set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE) diff --git a/morfeusz/FlexionGraph.cpp b/morfeusz/FlexionGraph.cpp index 6309035..790b3d7 100644 --- a/morfeusz/FlexionGraph.cpp +++ b/morfeusz/FlexionGraph.cpp @@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne( return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path); } -void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) { // debugPath(path); // debugGraph(this->graph); + if (weak && !this->empty() && !this->onlyWeakPaths) { + return; + } + else if (this->onlyWeakPaths && !weak) { + this->graph.clear(); + this->node2ChunkStartPtr.clear(); + this->onlyWeakPaths = false; + } for (unsigned int i = 0; i < path.size(); i++) { const InterpretedChunk& chunk = path[i]; if (!chunk.orthWasShifted) { diff --git a/morfeusz/FlexionGraph.hpp b/morfeusz/FlexionGraph.hpp index e394591..e83cadd 100644 --- a/morfeusz/FlexionGraph.hpp +++ b/morfeusz/FlexionGraph.hpp @@ -15,13 +15,17 @@ class FlexionGraph { public: + + FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) { + + } struct Edge { InterpretedChunk chunk; unsigned int nextNode; }; - void addPath(const std::vector<InterpretedChunk>& path); + void addPath(const std::vector<InterpretedChunk>& path, bool weak); // void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results); @@ -64,6 +68,7 @@ private: std::vector< std::vector<Edge> > graph; std::vector< const char* > node2ChunkStartPtr; + bool onlyWeakPaths; }; #endif /* FLEXIONGRAPH_HPP */ diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp index c4269b0..bd504e7 100644 --- a/morfeusz/Morfeusz.cpp +++ b/morfeusz/Morfeusz.cpp @@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord( doShiftOrth(accum.back(), ic); } accum.push_back(ic); - if (isEndOfWord(codepoint) && newSegrulesState.accepting) { - graph.addPath(accum); + if (isEndOfWord(codepoint) + && newSegrulesState.accepting) { + graph.addPath(accum, newSegrulesState.weak); } else if (!isEndOfWord(codepoint)) { // cerr << "will process " << currInput << endl; diff --git a/morfeusz/segrules/SegrulesDeserializer.cpp b/morfeusz/segrules/SegrulesDeserializer.cpp deleted file mode 100644 index cb28d2e..0000000 --- a/morfeusz/segrules/SegrulesDeserializer.cpp +++ /dev/null @@ -1,20 +0,0 @@ -/* - * File: SegrulesDeserializer.cpp - * Author: mlenart - * - * Created on 25 luty 2014, 16:16 - */ - -#include "SegrulesDeserializer.hpp" - -SegrulesDeserializer::SegrulesDeserializer() { -} - -long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const { - object = *ptr; - return 1; -} - -SegrulesDeserializer::~SegrulesDeserializer() { -} - diff --git a/morfeusz/segrules/SegrulesDeserializer.hpp b/morfeusz/segrules/SegrulesDeserializer.hpp deleted file mode 100644 index 12e6d06..0000000 --- a/morfeusz/segrules/SegrulesDeserializer.hpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * File: SegrulesDeserializer.hpp - * Author: mlenart - * - * Created on 25 luty 2014, 16:16 - */ - -#ifndef SEGRULESDESERIALIZER_HPP -#define SEGRULESDESERIALIZER_HPP - -#include "../fsa/fsa.hpp" - -class SegrulesDeserializer: public Deserializer<unsigned char> { -public: - SegrulesDeserializer(); - long deserialize(const unsigned char* ptr, unsigned char& object) const; - virtual ~SegrulesDeserializer(); -private: - -}; - -#endif /* SEGRULESDESERIALIZER_HPP */ - diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp index a171c32..47450b8 100644 --- a/morfeusz/segrules/segrules.cpp +++ b/morfeusz/segrules/segrules.cpp @@ -1,5 +1,4 @@ -#include "SegrulesDeserializer.hpp" #include "segrules.hpp" #include "../fsa/fsa.hpp" #include "../fsa/const.hpp" diff --git a/morfeusz/segrules/segrules.hpp b/morfeusz/segrules/segrules.hpp index 99d046f..44d616b 100644 --- a/morfeusz/segrules/segrules.hpp +++ b/morfeusz/segrules/segrules.hpp @@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions; std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr); SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr); -SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); -void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res); +SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr); +void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res); #endif /* SEGRULES_HPP */ diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index a4355c7..1ba3c4f 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -33,7 +33,6 @@ <in>test_speed.cpp</in> </df> <df name="segrules"> - <in>SegrulesDeserializer.cpp</in> <in>segrules.cpp</in> </df> <in>Environment.cpp</in> @@ -496,11 +495,6 @@ </preprocessorList> </ccTool> </item> - <item path="morfeusz/segrules/SegrulesDeserializer.cpp" - ex="false" - tool="1" - flavor2="4"> - </item> <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> </item> <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">