From dfe845014dcca9040b318b2c28f3409a9c45c3a2 Mon Sep 17 00:00:00 2001
From: Michał Lenart <michall@ipipan.waw.pl>
Date: Tue, 25 Mar 2014 14:30:07 +0000
Subject: [PATCH] dodanie obsługi znacznika !weak
---
fsabuilder/buildanalyzer.sh | 4 ++--
fsabuilder/morfeuszbuilder/segrules/preprocessor.py | 5 +++--
fsabuilder/morfeuszbuilder/segrules/rules.py | 13 ++++++++-----
fsabuilder/morfeuszbuilder/segrules/rulesNFA.py | 8 ++++----
fsabuilder/morfeuszbuilder/segrules/rulesParser.py | 3 ++-
input/segmenty.dat | 3 ++-
morfeusz/CMakeLists.txt | 5 +++--
morfeusz/FlexionGraph.cpp | 10 +++++++++-
morfeusz/FlexionGraph.hpp | 7 ++++++-
morfeusz/Morfeusz.cpp | 5 +++--
morfeusz/segrules/SegrulesDeserializer.cpp | 20 --------------------
morfeusz/segrules/SegrulesDeserializer.hpp | 23 -----------------------
morfeusz/segrules/segrules.cpp | 1 -
morfeusz/segrules/segrules.hpp | 4 ++--
nbproject/configurations.xml | 6 ------
15 files changed, 44 insertions(+), 73 deletions(-)
delete mode 100644 morfeusz/segrules/SegrulesDeserializer.cpp
delete mode 100644 morfeusz/segrules/SegrulesDeserializer.hpp
diff --git a/fsabuilder/buildanalyzer.sh b/fsabuilder/buildanalyzer.sh
index f38d861..56702e1 100755
--- a/fsabuilder/buildanalyzer.sh
+++ b/fsabuilder/buildanalyzer.sh
@@ -1,8 +1,8 @@
#!/bin/bash
python buildfsa.py --input-files=../input/PoliMorfSmall.tab,../input/dodatki.tab \
- --tagset-file=/tmp/polimorf-sgjp.tagset \
- --segments-file=/tmp/segmenty.dat \
+ --tagset-file=../input/polimorf.tagset \
+ --segments-file=../input/segmenty.dat \
--analyzer \
--serialization-method=SIMPLE \
--trim-supneg \
diff --git a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
index 8d5b1ed..1f36b09 100644
--- a/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
+++ b/fsabuilder/morfeuszbuilder/segrules/preprocessor.py
@@ -9,7 +9,7 @@ from pyparsing import *
from morfeuszbuilder.utils import exceptions
from pyparseString import pyparseString
-identifier = Word(alphas, bodyChars=alphanums+u'_>*+!')
+identifier = Word(alphas, bodyChars=alphanums+u'_>*+')
define = Keyword('#define').suppress() + identifier + Optional(Suppress('(') + identifier + Suppress(')')) + restOfLine + LineEnd() + StringEnd()
ifdef = Keyword('#ifdef').suppress() + identifier + LineEnd() + StringEnd()
endif = Keyword('#endif').suppress() + LineEnd() + StringEnd()
@@ -61,8 +61,9 @@ def _processLine(lineNum, line, defines, filename):
rule = Forward()
defineInstance = Forward()
localId = identifier.copy()
+ weakLiteral = CaselessLiteral('!weak')
- rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')))
+ rule << OneOrMore(defineInstance ^ localId ^ Word('*|+?>') ^ (Literal('(') + rule + Literal(')')) ^ weakLiteral)
defineInstance << localId + Suppress('(') + rule + Suppress(')')
rule.setParseAction(lambda s, l, t: ' '.join(t))
diff --git a/fsabuilder/morfeuszbuilder/segrules/rules.py b/fsabuilder/morfeuszbuilder/segrules/rules.py
index 08ff8ad..02f57a8 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rules.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rules.py
@@ -13,9 +13,12 @@ class SegmentRule(object):
def __init__(self):
- '''
- Constructor
- '''
+
+ self.weak = False
+
+ def setWeak(self, weak):
+ self.weak = weak
+ return self
def addToNFA(self, fsa):
raise NotImplementedError()
@@ -31,7 +34,7 @@ class TagRule(SegmentRule):
self.shiftOrth = shiftOrth
def addToNFA(self, fsa):
- endState = RulesNFAState(final=True)
+ endState = RulesNFAState(final=True, weak=self.weak)
self._doAddToNFA(fsa.initialState, endState)
def _doAddToNFA(self, startState, endState):
@@ -51,7 +54,7 @@ class ComplexRule(SegmentRule):
self.children = children
def addToNFA(self, fsa):
- endState = RulesNFAState(final=True)
+ endState = RulesNFAState(final=True, weak=self.weak)
self._doAddToNFA(fsa.initialState, endState)
class ConcatRule(ComplexRule):
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
index 2fe36c1..dcafdb7 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesNFA.py
@@ -68,11 +68,11 @@ class RulesNFA(object):
return res
def _doConvertState(self, dfaState, nfaStates, nfaSubset2DFAState):
- assert all(map(lambda state: state.weak, nfaStates)) \
- or not any(map(lambda state: state.weak, nfaStates))
- weak = all(map(lambda state: state.weak or not state.final, nfaStates))
+ assert all(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates))) \
+ or not any(map(lambda state: state.weak, filter(lambda state: state.final, nfaStates)))
+ weak = any(map(lambda state: state.weak and state.final, nfaStates))
final = any(map(lambda state: state.final, nfaStates))
- assert not weak or not final
+# assert not weak or not final
if final:
# dfaState should be final
# and contain info about weakness
diff --git a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
index 82bc9f6..a13cd33 100644
--- a/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
+++ b/fsabuilder/morfeuszbuilder/segrules/rulesParser.py
@@ -95,7 +95,7 @@ class RulesParser(object):
concatRule = OneOrMore(complexRule)
else:
concatRule = ZeroOrMore(shiftOrthRule) + tagRule
- rule << concatRule
+ rule << concatRule + Optional(CaselessLiteral('!weak'))
tagRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], False, lineNum, line, segtypesHelper))
shiftOrthRule.setParseAction(lambda string, loc, toks: self._createNewTagRule(toks[0], True, lineNum, line, segtypesHelper))
@@ -104,5 +104,6 @@ class RulesParser(object):
oneOrMoreRule.setParseAction(lambda string, loc, toks: rules.ConcatRule([toks[0], rules.ZeroOrMoreRule(toks[0])]))
oneOfRule.setParseAction(lambda string, loc, toks: rules.OrRule(toks))
concatRule.setParseAction(lambda string, loc, toks: toks[0] if len(toks) == 1 else rules.ConcatRule(toks))
+ rule.setParseAction(lambda string, loc, toks: toks[0].setWeak(len(toks) == 2))
parsedRule = pyparseString.pyparseString(rule, lineNum, line, filename)[0]
return parsedRule
diff --git a/input/segmenty.dat b/input/segmenty.dat
index d0ff85c..9505803 100644
--- a/input/segmenty.dat
+++ b/input/segmenty.dat
@@ -101,6 +101,7 @@ moze_interp( (adja dywiz)+ adj )
# Stopień najwyższy:
# np. „naj·zieleńszy”, „naj·mądrzej”
moze_interp( naj> adj_sup )
+moze_interp( nie> naj> adj_sup ) !weak
# Formy „zanegowane” gerundiów i imiesłowów:
# np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”:
@@ -112,7 +113,7 @@ moze_interp(z_on_agl)
moze_interp(z_on_agl on_agl)
# Liczba zapisana jako ciąg cyfr:
-moze_interp( dig>* dig )
+moze_interp( dig>* dig ) !weak
# Formacje prefiksalne
#### trzeba wydzielić odpowiednie samodze!
diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt
index a56465f..29a52b9 100644
--- a/morfeusz/CMakeLists.txt
+++ b/morfeusz/CMakeLists.txt
@@ -36,7 +36,7 @@ set(SRC_FILES
charset/caseconv.cpp
charset/conversion_tables.cpp
segrules/segrules.cpp
- segrules/SegrulesDeserializer.cpp)
+)
set(INCLUDE_FILES
const.hpp
@@ -50,7 +50,8 @@ set(INCLUDE_FILES
charset/CharsetConverter.hpp
charset/CaseConverter.hpp
charset/caseconv.hpp
- charset/conversion_tables.hpp)
+ charset/conversion_tables.hpp
+)
add_library (libmorfeusz SHARED ${SRC_FILES})
set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERATED TRUE)
diff --git a/morfeusz/FlexionGraph.cpp b/morfeusz/FlexionGraph.cpp
index 6309035..790b3d7 100644
--- a/morfeusz/FlexionGraph.cpp
+++ b/morfeusz/FlexionGraph.cpp
@@ -51,9 +51,17 @@ static inline bool chunkIsTheOnlyOne(
return chunkIsAtFront(chunk, path) && chunkIsAtBack(chunk, path);
}
-void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
+void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path, bool weak) {
// debugPath(path);
// debugGraph(this->graph);
+ if (weak && !this->empty() && !this->onlyWeakPaths) {
+ return;
+ }
+ else if (this->onlyWeakPaths && !weak) {
+ this->graph.clear();
+ this->node2ChunkStartPtr.clear();
+ this->onlyWeakPaths = false;
+ }
for (unsigned int i = 0; i < path.size(); i++) {
const InterpretedChunk& chunk = path[i];
if (!chunk.orthWasShifted) {
diff --git a/morfeusz/FlexionGraph.hpp b/morfeusz/FlexionGraph.hpp
index e394591..e83cadd 100644
--- a/morfeusz/FlexionGraph.hpp
+++ b/morfeusz/FlexionGraph.hpp
@@ -15,13 +15,17 @@
class FlexionGraph {
public:
+
+ FlexionGraph(): graph(), node2ChunkStartPtr(), onlyWeakPaths(true) {
+
+ }
struct Edge {
InterpretedChunk chunk;
unsigned int nextNode;
};
- void addPath(const std::vector<InterpretedChunk>& path);
+ void addPath(const std::vector<InterpretedChunk>& path, bool weak);
// void getResults(const Tagset& tagset, const CharsetConverter& charsetConverter, std::vector<MorphInterpretation>& results);
@@ -64,6 +68,7 @@ private:
std::vector< std::vector<Edge> > graph;
std::vector< const char* > node2ChunkStartPtr;
+ bool onlyWeakPaths;
};
#endif /* FLEXIONGRAPH_HPP */
diff --git a/morfeusz/Morfeusz.cpp b/morfeusz/Morfeusz.cpp
index c4269b0..bd504e7 100644
--- a/morfeusz/Morfeusz.cpp
+++ b/morfeusz/Morfeusz.cpp
@@ -154,8 +154,9 @@ void Morfeusz::doProcessOneWord(
doShiftOrth(accum.back(), ic);
}
accum.push_back(ic);
- if (isEndOfWord(codepoint) && newSegrulesState.accepting) {
- graph.addPath(accum);
+ if (isEndOfWord(codepoint)
+ && newSegrulesState.accepting) {
+ graph.addPath(accum, newSegrulesState.weak);
}
else if (!isEndOfWord(codepoint)) {
// cerr << "will process " << currInput << endl;
diff --git a/morfeusz/segrules/SegrulesDeserializer.cpp b/morfeusz/segrules/SegrulesDeserializer.cpp
deleted file mode 100644
index cb28d2e..0000000
--- a/morfeusz/segrules/SegrulesDeserializer.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * File: SegrulesDeserializer.cpp
- * Author: mlenart
- *
- * Created on 25 luty 2014, 16:16
- */
-
-#include "SegrulesDeserializer.hpp"
-
-SegrulesDeserializer::SegrulesDeserializer() {
-}
-
-long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const {
- object = *ptr;
- return 1;
-}
-
-SegrulesDeserializer::~SegrulesDeserializer() {
-}
-
diff --git a/morfeusz/segrules/SegrulesDeserializer.hpp b/morfeusz/segrules/SegrulesDeserializer.hpp
deleted file mode 100644
index 12e6d06..0000000
--- a/morfeusz/segrules/SegrulesDeserializer.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * File: SegrulesDeserializer.hpp
- * Author: mlenart
- *
- * Created on 25 luty 2014, 16:16
- */
-
-#ifndef SEGRULESDESERIALIZER_HPP
-#define SEGRULESDESERIALIZER_HPP
-
-#include "../fsa/fsa.hpp"
-
-class SegrulesDeserializer: public Deserializer<unsigned char> {
-public:
- SegrulesDeserializer();
- long deserialize(const unsigned char* ptr, unsigned char& object) const;
- virtual ~SegrulesDeserializer();
-private:
-
-};
-
-#endif /* SEGRULESDESERIALIZER_HPP */
-
diff --git a/morfeusz/segrules/segrules.cpp b/morfeusz/segrules/segrules.cpp
index a171c32..47450b8 100644
--- a/morfeusz/segrules/segrules.cpp
+++ b/morfeusz/segrules/segrules.cpp
@@ -1,5 +1,4 @@
-#include "SegrulesDeserializer.hpp"
#include "segrules.hpp"
#include "../fsa/fsa.hpp"
#include "../fsa/const.hpp"
diff --git a/morfeusz/segrules/segrules.hpp b/morfeusz/segrules/segrules.hpp
index 99d046f..44d616b 100644
--- a/morfeusz/segrules/segrules.hpp
+++ b/morfeusz/segrules/segrules.hpp
@@ -19,8 +19,8 @@ typedef std::map<std::string, std::string> SegrulesOptions;
std::map<SegrulesOptions, SegrulesFSA*> createSegrulesFSAsMap(const unsigned char* analyzerPtr);
SegrulesOptions getDefaultSegrulesOptions(const unsigned char* ptr);
-SegrulesFSA* getDefaultSegrulesFSA(const map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
-void debugMap(const map<SegrulesOptions, SegrulesFSA*>& res);
+SegrulesFSA* getDefaultSegrulesFSA(const std::map<SegrulesOptions, SegrulesFSA*>& map, const unsigned char* analyzerPtr);
+void debugMap(const std::map<SegrulesOptions, SegrulesFSA*>& res);
#endif /* SEGRULES_HPP */
diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml
index a4355c7..1ba3c4f 100644
--- a/nbproject/configurations.xml
+++ b/nbproject/configurations.xml
@@ -33,7 +33,6 @@
<in>test_speed.cpp</in>
</df>
<df name="segrules">
- <in>SegrulesDeserializer.cpp</in>
<in>segrules.cpp</in>
</df>
<in>Environment.cpp</in>
@@ -496,11 +495,6 @@
</preprocessorList>
</ccTool>
</item>
- <item path="morfeusz/segrules/SegrulesDeserializer.cpp"
- ex="false"
- tool="1"
- flavor2="4">
- </item>
<item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4">
</item>
<item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4">
--
libgit2 0.22.2