Commit b0a0d741eeaabf2041b25f9e29e69e75fabdf174
1 parent
38b37844
- dalsza praca nad zlepianiem segmentów (dodanie automatu, który ją obsługuje, j…
…eszcze trzeba tylko zacząć go właściwie używać) git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@92 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
19 changed files
with
381 additions
and
81 deletions
CMakeLists.txt
... | ... | @@ -43,6 +43,11 @@ if ("${INPUT_TAGSET}" STREQUAL "") |
43 | 43 | set (INPUT_TAGSET ${PROJECT_SOURCE_DIR}/input/polimorf.tagset) |
44 | 44 | endif () |
45 | 45 | |
46 | +# SEGMENT_RULES_FILE: segmentation rules file handed to buildfsa.py via --segments-file; defaults to input/segmenty.dat when unset or empty | |
47 | +if ("${SEGMENT_RULES_FILE}" STREQUAL "") | |
48 | + set (SEGMENT_RULES_FILE ${PROJECT_SOURCE_DIR}/input/segmenty.dat) | |
49 | +endif () | |
50 | + | |
46 | 51 | message ("Will use ${INPUT_DICTIONARY} as default input dictionary and ${INPUT_TAGSET} as tagset") |
47 | 52 | |
48 | 53 | # TARGET_DIR |
... | ... |
doc/fileFormat.odt
0 → 100644
No preview for this file type
input/segmenty.dat
0 → 100644
1 | +[options] | |
2 | +aggl=permissive strict isolated | |
3 | +praet=split composite | |
4 | + | |
5 | +[combinations] | |
6 | +#define wsz_interp (interp|kropka|dywiz)* | |
7 | + | |
8 | +#define moze_interp(segmenty) wsz_interp segmenty wsz_interp | |
9 | + | |
10 | +# Segmenty występujące samodzielnie: | |
11 | +# | |
12 | +# domyślny typ segmentu samodzielnego: | |
13 | +moze_interp(samodz) | |
14 | + | |
15 | +# segment samotny, który nie dopuszcza nawet znaku interpunkcyjnego po | |
16 | +# sobie | |
17 | +samotny | |
18 | + | |
19 | +# przeszlik pojedynczy w formie nieaglutynacyjnej, np. „gniótł”: | |
20 | +moze_interp(praet_sg_na) | |
21 | + | |
22 | +# przeszlik pojedynczy w formie niezróżnicowanej aglutynacyjnie, np. „czytał”: | |
23 | +moze_interp(praet_sg) | |
24 | + | |
25 | +# przeszlik mnogi, np. „czytali”: | |
26 | +moze_interp(praet_pl) | |
27 | + | |
28 | +# partykuła „by”: | |
29 | +moze_interp(by) | |
30 | + | |
31 | +# inne segmenty, które dopuszczają po sobie aglutynant, | |
32 | +# np. „powinna”, „czyżby”: | |
33 | +moze_interp(z_aglt) | |
34 | + | |
35 | +# forma przymiotnikowa (dopuszcza adja): | |
36 | +moze_interp(adj) | |
37 | + | |
38 | +# dywiz (jako samodzielny segment jest tylko błędnym użyciem w funkcji | |
39 | +# myślnika, ale trzeba to dopuścić): | |
40 | +dywiz | |
41 | + | |
42 | +#ifdef isolated | |
43 | +adja | |
44 | +#endif | |
45 | + | |
46 | + | |
47 | +# Połączenia z aglutynantami: | |
48 | +# | |
49 | +#ifdef split | |
50 | +# Czas przeszły: | |
51 | +# np. „gniotł·am” | |
52 | +moze_interp( praet_sg_agl aglsg ) | |
53 | +# np. „czytał·em” | |
54 | +moze_interp(praet_sg aglsg) | |
55 | +# np. „czytali·ście” | |
56 | +moze_interp(praet_pl aglpl) | |
57 | + | |
58 | +# Tryb warunkowy: | |
59 | +# np. „gniótł·by” | |
60 | +moze_interp(praet_sg_na by) | |
61 | +# np. „czytało·by” | |
62 | +moze_interp(praet_sg by) | |
63 | +# np. „gnietli·by” | |
64 | +moze_interp(praet_pl by) | |
65 | +# np. „gniótł·by·ś” | |
66 | +moze_interp(praet_sg_na by aglsg) | |
67 | +# np. „czytał·by·m” | |
68 | +moze_interp(praet_sg by aglsg) | |
69 | +# np. „gnietli·by·śmy” | |
70 | +moze_interp(praet_pl by aglpl) | |
71 | +#else | |
72 | +moze_interp(praetcond) | |
73 | +#endif | |
74 | +# np. „by·ś” | |
75 | +moze_interp(by aglsg) | |
76 | +# np. „by·ście” | |
77 | +moze_interp(by aglpl) | |
78 | + | |
79 | +# np. „gdyby·m” | |
80 | +moze_interp(z_aglt aglsg) | |
81 | +# np. „gdyby·ście” | |
82 | +moze_interp(z_aglt aglpl) | |
83 | + | |
84 | +# To jest dużo za dużo, ale tytułem eksperymentu: | |
85 | +#ifdef permissive | |
86 | +moze_interp(samodz aglsg) | |
87 | +moze_interp(samodz aglpl) | |
88 | +#endif | |
89 | + | |
90 | +# Złożone formy przymiotnikowe | |
91 | +# np. „biało·-·czerwony” | |
92 | +moze_interp( (adja dywiz)+ adj ) | |
93 | +# poniższe załatwione przez + powyżej: | |
94 | +# # np. „niebiesko·-·biało·-·czerwona” | |
95 | +# adja dywiz adja dywiz adj interp? | |
96 | +# # itd. (zatrzymujemy się pragmatycznie na 5 członach) | |
97 | +# adja dywiz adja dywiz adja dywiz adj interp? | |
98 | +# adja dywiz adja dywiz adja dywiz adja dywiz adj interp? | |
99 | + | |
100 | +# Stopień najwyższy: | |
101 | +# np. „naj·zieleńszy”, „naj·mądrzej” | |
102 | +moze_interp( naj> adj_sup ) | |
103 | + | |
104 | +# Formy „zanegowane” gerundiów i imiesłowów: | |
105 | +# np. „nie·czytanie”, „nie·przeczytany”, „nie·czytający”: | |
106 | +moze_interp( nie> negat ) | |
107 | + | |
108 | +# Przyimki akceptujące krótką formę „-ń” | |
109 | +moze_interp(z_on_agl) | |
110 | +# np. „do·ń” | |
111 | +moze_interp(z_on_agl on_agl) | |
112 | + | |
113 | +# Liczba zapisana jako ciąg cyfr: | |
114 | +moze_interp( dig>* dig ) | |
115 | + | |
116 | +# Formacje prefiksalne | |
117 | +#### trzeba wydzielić odpowiednie samodze! | |
118 | +# rzeczownikowe i przymiotnikowe | |
119 | +# np. „euro·sodoma”, „e-·papieros”, „euro·sodomski”, „bez·argumentowy” | |
120 | +moze_interp( prefs samodz ) | |
121 | +# czasownikowe np. „po·nakapywać” | |
122 | +moze_interp( prefv samodz ) | |
123 | + | |
124 | +# Apozycje z dywizem | |
125 | +# np. „kobieta-prezydent” | |
126 | +moze_interp( samodz dywiz samodz ) | |
127 | +# poniższe do sprawdzenia, najwyraźniej obecne w tekstach, skoro wprowadziliśmy: | |
128 | +# ? | |
129 | +adj dywiz adj | |
130 | +# ? | |
131 | +adj dywiz samodz | |
132 | +# ? | |
133 | +samodz dywiz adj | |
134 | + | |
135 | +[segment types] | |
136 | +naj | |
137 | +nie | |
138 | +prefs | |
139 | +prefv | |
140 | +dig | |
141 | +adja | |
142 | +adj | |
143 | +adj_sup | |
144 | +negat | |
145 | +on_agl | |
146 | +z_on_agl | |
147 | +samotny | |
148 | +interp | |
149 | +aglsg | |
150 | +aglpl | |
151 | +praetcond | |
152 | +praet_sg_agl | |
153 | +praet_sg_na | |
154 | +praet_sg | |
155 | +praet_pl | |
156 | +samodz | |
157 | + | |
158 | +[tags] | |
159 | +naj naj | |
160 | +nie nie | |
161 | +prefs prefs | |
162 | +prefv prefv | |
163 | +dig dig | |
164 | +adja adja | |
165 | +adj adj:%:pos | |
166 | +adj_sup adj:%:sup | |
167 | +adj_sup adv:sup | |
168 | +negat ger:%:neg | |
169 | +negat pact:%:neg | |
170 | +negat ppas:%:neg | |
171 | +on_agl ppron3:sg:gen.acc:m1.m2.m3:ter:nakc:praep | |
172 | +z_on_agl prep:% | |
173 | +samotny brev:pun | |
174 | +samotny brev:npun | |
175 | +samotny intrj | |
176 | +interp interp | |
177 | +aglsg aglt:sg:% | |
178 | +aglpl aglt:pl:% | |
179 | +praetcond cond:% | |
180 | +praetcond praet:%:pri:% | |
181 | +praetcond praet:%:sec:% | |
182 | +praetcond praet:%:ter:% | |
183 | +praet_sg_agl praet:sg:%:agl | |
184 | +praet_sg_na praet:sg:%:nagl | |
185 | +praet_sg praet:sg:% | |
186 | +praet_pl praet:pl:% | |
187 | +praet_sg winien:sg:% | |
188 | +praet_pl winien:pl:% | |
189 | +samodz % | |
190 | + | |
191 | +[lexemes] | |
192 | +z_aglt aby:comp | |
193 | +z_aglt bowiem:comp | |
194 | +by by:qub | |
195 | +z_aglt by:comp | |
196 | +z_aglt cóż:subst | |
197 | +z_aglt czemu:adv | |
198 | +z_aglt czyżby:qub | |
199 | +z_aglt choćby:comp | |
200 | +z_aglt chociażby:comp | |
201 | +z_aglt dlaczego:adv | |
202 | +z_aglt dopóki:comp | |
203 | +z_aglt dopóty:conj | |
204 | +z_aglt gdyby:comp | |
205 | +z_aglt gdzie:qub | |
206 | +z_aglt gdzie:adv | |
207 | +z_aglt jakby:comp | |
208 | +z_aglt jakoby:comp | |
209 | +z_aglt kiedy:adv | |
210 | +z_aglt kiedy:comp | |
211 | +z_aglt tylko:qub | |
212 | +z_aglt żeby:comp | |
213 | +dywiz -:interp | |
214 | +kropka .:interp | |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | ########## generate default dictionary data ################# |
3 | 3 | add_custom_command ( |
4 | 4 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
5 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer -i "${INPUT_DICTIONARY}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --cpp --serialization-method=SIMPLE | |
5 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer -i "${INPUT_DICTIONARY}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" "--segments-file=${SEGMENT_RULES_FILE}" --cpp --serialization-method=SIMPLE | |
6 | 6 | DEPENDS "${INPUT_DICTIONARY}" "${INPUT_TAGSET}" "${SEGMENT_RULES_FILE}" "${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py" # every file the generator reads, so editing any of them regenerates the C++ dictionary |
7 | 7 | COMMENT "Building default dictionary C++ file" |
8 | 8 | ) |
... | ... | @@ -35,7 +35,9 @@ set(SRC_FILES |
35 | 35 | charset/CharsetConverter.cpp |
36 | 36 | charset/CaseConverter.cpp |
37 | 37 | charset/caseconv.cpp |
38 | - charset/conversion_tables.cpp) | |
38 | + charset/conversion_tables.cpp | |
39 | + segrules/segrules.cpp | |
40 | + segrules/SegrulesDeserializer.cpp) | |
39 | 41 | |
40 | 42 | set(INCLUDE_FILES |
41 | 43 | const.hpp |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -17,6 +17,7 @@ |
17 | 17 | #include "charset/CharsetConverter.hpp" |
18 | 18 | #include "charset/charset_utils.hpp" |
19 | 19 | #include "charset/CaseConverter.hpp" |
20 | +#include "segrules/segrules.hpp" | |
20 | 21 | #include "const.hpp" |
21 | 22 | |
22 | 23 | // TODO - konstruktor kopiujący działający Tak-Jak-Trzeba |
... | ... | @@ -29,56 +30,6 @@ static Deserializer<vector<InterpsGroup> >* initializeAnalyzerDeserializer() { |
29 | 30 | return deserializer; |
30 | 31 | } |
31 | 32 | |
32 | -static FSA<vector<InterpsGroup > > *initializeAnalyzerFSA(const string& filename) { | |
33 | - cerr << "initialize FSA" << endl; | |
34 | - return FSA < vector < InterpsGroup > > ::getFSA(filename, *initializeAnalyzerDeserializer()); | |
35 | -} | |
36 | - | |
37 | -//static FSA<vector<MorphInterpretation > > *initializeSynthFSA(const string& filename, const SynthDeserializer& deserializer) { | |
38 | -// cerr << "initialize synth FSA" << endl; | |
39 | -// return FSA < vector < EncodedGeneratorInterpretation > > ::getFSA(filename, deserializer); | |
40 | -//} | |
41 | -// | |
42 | -//static CharsetConverter* getCharsetConverter(MorfeuszCharset charset) { | |
43 | -// cerr << "initialize charset converter for " << charset << endl; | |
44 | -// static CharsetConverter* utf8Converter = new UTF8CharsetConverter(); | |
45 | -//// static CharsetConverter* utf16LEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::UTF16CharsetConverter::LE); | |
46 | -//// static CharsetConverter* utf16BEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::Endianness::BE); | |
47 | -// static CharsetConverter* iso8859_2Converter = new ISO8859_2_CharsetConverter(); | |
48 | -// static CharsetConverter* windows1250Converter = new Windows_1250_CharsetConverter(); | |
49 | -// static CharsetConverter* cp852Converter = new CP852_CharsetConverter(); | |
50 | -// switch (charset) { | |
51 | -// case UTF8: | |
52 | -// return utf8Converter; | |
53 | -// case ISO8859_2: | |
54 | -// return iso8859_2Converter; | |
55 | -// case CP1250: | |
56 | -// return windows1250Converter; | |
57 | -// case CP852: | |
58 | -// return cp852Converter; | |
59 | -// default: | |
60 | -// throw MorfeuszException("invalid charset"); | |
61 | -// } | |
62 | -//} | |
63 | -// | |
64 | -//static Tagset* initializeTagset(const string& filename) { | |
65 | -// cerr << "initialize tagset" << endl; | |
66 | -// static Tagset* tagset = new Tagset(readFile<unsigned char>(filename.c_str())); | |
67 | -// return tagset; | |
68 | -//} | |
69 | -// | |
70 | -//static Tagset* initializeTagset(const unsigned char* data) { | |
71 | -// cerr << "initialize tagset" << endl; | |
72 | -// static Tagset* tagset = new Tagset(data); | |
73 | -// return tagset; | |
74 | -//} | |
75 | -// | |
76 | -//static CaseConverter* initializeCaseConverter() { | |
77 | -// cerr << "initialize case converter" << endl; | |
78 | -// static CaseConverter* cc = new CaseConverter(); | |
79 | -// return cc; | |
80 | -//} | |
81 | - | |
82 | 33 | static MorfeuszOptions createDefaultOptions() { |
83 | 34 | MorfeuszOptions res; |
84 | 35 | res.caseSensitive = true; |
... | ... | @@ -88,7 +39,9 @@ static MorfeuszOptions createDefaultOptions() { |
88 | 39 | |
89 | 40 | Morfeusz::Morfeusz() |
90 | 41 | : env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), |
91 | -analyzerFSA(FSAType::getFSA(DEFAULT_FSA, *initializeAnalyzerDeserializer())), | |
42 | +analyzerPtr(DEFAULT_FSA), | |
43 | +analyzerFSA(FSAType::getFSA(analyzerPtr, *initializeAnalyzerDeserializer())), | |
44 | +segrulesFSA(createSegrulesFSA(analyzerPtr)), | |
92 | 45 | isAnalyzerFSAFromFile(false), |
93 | 46 | generator(DEFAULT_SYNTH_FSA, env), |
94 | 47 | options(createDefaultOptions()) { |
... | ... | @@ -98,30 +51,35 @@ options(createDefaultOptions()) { |
98 | 51 | void Morfeusz::setAnalyzerFile(const string& filename) { |
99 | 52 | if (this->isAnalyzerFSAFromFile) { |
100 | 53 | delete this->analyzerFSA; |
54 | + delete this->segrulesFSA; | |
55 | + delete this->analyzerPtr; | |
101 | 56 | } |
102 | - this->analyzerFSA = initializeAnalyzerFSA(filename); | |
57 | + this->analyzerPtr = readFile<unsigned char>(filename.c_str()); | |
58 | + this->analyzerFSA = FSA< vector<InterpsGroup> > ::getFSA(analyzerPtr, *initializeAnalyzerDeserializer()); | |
59 | + this->segrulesFSA = createSegrulesFSA(analyzerPtr); | |
103 | 60 | this->isAnalyzerFSAFromFile = true; |
104 | 61 | } |
105 | 62 | |
106 | 63 | Morfeusz::~Morfeusz() { |
107 | 64 | if (this->isAnalyzerFSAFromFile) { |
108 | 65 | delete this->analyzerFSA; |
66 | + delete this->segrulesFSA; | |
67 | + delete this->analyzerPtr; | |
109 | 68 | } |
110 | 69 | } |
111 | 70 | |
112 | 71 | void Morfeusz::analyzeOneWord( |
113 | - const char*& inputData, | |
72 | + const char*& inputStart, | |
114 | 73 | const char* inputEnd, |
115 | 74 | int startNodeNum, |
116 | 75 | std::vector<MorphInterpretation>& results) const { |
117 | - while (inputData != inputEnd | |
118 | - && isEndOfWord(this->env.getCharsetConverter().peek(inputData, inputEnd))) { | |
119 | - this->env.getCharsetConverter().next(inputData, inputEnd); | |
76 | + while (inputStart != inputEnd | |
77 | + && isEndOfWord(this->env.getCharsetConverter().peek(inputStart, inputEnd))) { | |
78 | + this->env.getCharsetConverter().next(inputStart, inputEnd); | |
120 | 79 | } |
121 | - const char* wordStart = inputData; | |
122 | 80 | vector<InterpretedChunk> accum; |
123 | 81 | FlexionGraph graph; |
124 | - const char* currInput = inputData; | |
82 | + const char* currInput = inputStart; | |
125 | 83 | doAnalyzeOneWord(currInput, inputEnd, accum, graph); |
126 | 84 | if (!graph.empty()) { |
127 | 85 | InterpretedChunksDecoder interpretedChunksDecoder(env); |
... | ... | @@ -136,10 +94,11 @@ void Morfeusz::analyzeOneWord( |
136 | 94 | srcNode++; |
137 | 95 | } |
138 | 96 | // graph.getResults(*this->tagset, results); |
139 | - } else if (wordStart != currInput) { | |
140 | - this->appendIgnotiumToResults(string(wordStart, currInput), startNodeNum, results); | |
141 | 97 | } |
142 | - inputData = currInput; | |
98 | + else if (inputStart != inputEnd) { | |
99 | + this->appendIgnotiumToResults(string(inputStart, currInput), startNodeNum, results); | |
100 | + } | |
101 | + inputStart = currInput; | |
143 | 102 | } |
144 | 103 | |
145 | 104 | void Morfeusz::doAnalyzeOneWord( |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -68,7 +68,9 @@ private: |
68 | 68 | int startNodeNum, |
69 | 69 | std::vector<MorphInterpretation>& results) const; |
70 | 70 | Environment env; |
71 | + const unsigned char* analyzerPtr; | |
71 | 72 | FSAType* analyzerFSA; |
73 | + FSA<unsigned char>* segrulesFSA; | |
72 | 74 | bool isAnalyzerFSAFromFile; |
73 | 75 | Generator generator; |
74 | 76 | // const CharsetConverter* charsetConverter; |
... | ... |
morfeusz/Tagset.cpp
... | ... | @@ -30,8 +30,9 @@ static void readTags(const unsigned char*& currPtr, vector<string>& tags) { |
30 | 30 | } |
31 | 31 | } |
32 | 32 | |
33 | -Tagset::Tagset(const unsigned char* fsaData) { | |
34 | - const unsigned char* currPtr = fsaData + ADDITIONAL_DATA_OFFSET; | |
33 | +Tagset::Tagset(const unsigned char* ptr) { // reads the tag list and then the name list that follow the FSA payload in the dictionary image at ptr | |
34 | + uint32_t fsaSize = ntohl(*reinterpret_cast<const uint32_t*>(ptr + FSA_DATA_SIZE_OFFSET)); // decoding a stored value is network-to-host (ntohl), the same direction FSA::getFSA used for this offset | |
35 | + const unsigned char* currPtr = ptr + FSA_DATA_OFFSET + fsaSize + 4; // NOTE(review): the extra 4 bytes are presumably a size field preceding the tag data - confirm against the serializer | |
35 | 36 | readTags(currPtr, this->tags); |
36 | 37 | readTags(currPtr, this->names); |
37 | 38 | } |
... | ... |
morfeusz/fsa/const.cpp
... | ... | @@ -2,9 +2,9 @@ |
2 | 2 | #include "const.hpp" |
3 | 3 | |
4 | 4 | extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; |
5 | -extern const uint8_t VERSION_NUM = 9; | |
5 | +extern const uint8_t VERSION_NUM = 10; | |
6 | 6 | |
7 | 7 | extern const unsigned int VERSION_NUM_OFFSET = 4; |
8 | 8 | extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; |
9 | -extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET = 6; | |
10 | -extern const unsigned int ADDITIONAL_DATA_OFFSET = 10; | |
9 | +extern const unsigned int FSA_DATA_SIZE_OFFSET = 6; | |
10 | +extern const unsigned int FSA_DATA_OFFSET = 10; | |
... | ... |
morfeusz/fsa/const.hpp
... | ... | @@ -15,8 +15,8 @@ extern const uint8_t VERSION_NUM; |
15 | 15 | |
16 | 16 | extern const unsigned int VERSION_NUM_OFFSET; |
17 | 17 | extern const unsigned int IMPLEMENTATION_NUM_OFFSET; |
18 | -extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET; | |
19 | -extern const unsigned int ADDITIONAL_DATA_OFFSET; | |
18 | +extern const unsigned int FSA_DATA_SIZE_OFFSET; | |
19 | +extern const unsigned int FSA_DATA_OFFSET; | |
20 | 20 | |
21 | 21 | #endif /* CONST_HPP */ |
22 | 22 | |
... | ... |
morfeusz/fsa/fsa.hpp
... | ... | @@ -81,11 +81,12 @@ private: |
81 | 81 | template <class T> |
82 | 82 | class SimpleFSA : public FSA<T> { |
83 | 83 | public: |
84 | - SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); | |
84 | + SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer, bool isTransducer=false); | |
85 | 85 | virtual ~SimpleFSA(); |
86 | 86 | protected: |
87 | 87 | void proceedToNext(const char c, State<T>& state) const; |
88 | 88 | private: |
89 | + bool isTransducer; | |
89 | 90 | }; |
90 | 91 | |
91 | 92 | template <class T> |
... | ... | @@ -167,6 +168,10 @@ public: |
167 | 168 | * For non-accepting states is throws an exception. |
168 | 169 | */ |
169 | 170 | T getValue() const; |
171 | + | |
172 | + unsigned char getLastTransitionValue() const; | |
173 | + | |
174 | + void setLastTransitionValue(unsigned char val); | |
170 | 175 | |
171 | 176 | /** |
172 | 177 | * Get the size (in bytes) of this state's value. |
... | ... | @@ -191,6 +196,7 @@ private: |
191 | 196 | bool sink; |
192 | 197 | T value; |
193 | 198 | long valueSize; |
199 | + unsigned char lastTransitionValue; | |
194 | 200 | }; |
195 | 201 | |
196 | 202 | class FSAException : public std::exception { |
... | ... |
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -78,9 +78,9 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
78 | 78 | |
79 | 79 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
80 | 80 | |
81 | - uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); | |
81 | +// uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); | |
82 | 82 | |
83 | - const unsigned char* startPtr = ptr + ADDITIONAL_DATA_OFFSET + additionalDataSize; | |
83 | + const unsigned char* startPtr = ptr + FSA_DATA_OFFSET; | |
84 | 84 | switch (implementationNum) { |
85 | 85 | case 0: |
86 | 86 | return new SimpleFSA<T>(startPtr, deserializer); |
... | ... | @@ -88,6 +88,8 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
88 | 88 | return new CompressedFSA1<T>(startPtr, deserializer); |
89 | 89 | case 2: |
90 | 90 | return new CompressedFSA2<T>(startPtr, deserializer); |
91 | + case 128: | |
92 | + return new SimpleFSA<T>(startPtr, deserializer, true); | |
91 | 93 | default: |
92 | 94 | std::ostringstream oss; |
93 | 95 | oss << "Invalid implementation number: " << versionNum << ", should be: " << VERSION_NUM; |
... | ... |
morfeusz/fsa/simplefsa_impl.hpp
... | ... | @@ -22,8 +22,8 @@ struct StateData { |
22 | 22 | //#pragma pack(pop) /* restore original alignment from stack */ |
23 | 23 | |
24 | 24 | template <class T> |
25 | -SimpleFSA<T>::SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) | |
26 | -: FSA<T>(ptr, deserializer) { | |
25 | +SimpleFSA<T>::SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer, bool isTransducer) | |
26 | +: FSA<T>(ptr, deserializer), isTransducer(isTransducer) { | |
27 | 27 | } |
28 | 28 | |
29 | 29 | template <class T> |
... | ... | @@ -56,7 +56,8 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
56 | 56 | StateData stateData = decodeStateData(fromPointer); |
57 | 57 | const unsigned char* foundTransition = fromPointer + transitionsTableOffset; |
58 | 58 | bool found = false; |
59 | - for (unsigned int i = 0; i < stateData.transitionsNum; i++, foundTransition += 4) { | |
59 | + unsigned int increment = this->isTransducer ? 5 : 4; | |
60 | + for (unsigned int i = 0; i < stateData.transitionsNum; i++, foundTransition += increment) { | |
60 | 61 | if ((char) *foundTransition == c) { |
61 | 62 | found = true; |
62 | 63 | break; |
... | ... | @@ -76,6 +77,9 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
76 | 77 | } else { |
77 | 78 | state.setNext(offset); |
78 | 79 | } |
80 | + if (isTransducer) { | |
81 | + state.setLastTransitionValue(*(foundTransition + 4)); | |
82 | + } | |
79 | 83 | } |
80 | 84 | } |
81 | 85 | |
... | ... |
morfeusz/fsa/state_impl.hpp
... | ... | @@ -58,6 +58,16 @@ unsigned long State<T>::getValueSize() const { |
58 | 58 | } |
59 | 59 | |
60 | 60 | template <class T> |
61 | +unsigned char State<T>::getLastTransitionValue() const { | |
62 | + return this->lastTransitionValue; | |
63 | +} | |
64 | + | |
65 | +template <class T> | |
66 | +void State<T>::setLastTransitionValue(unsigned char val) { | |
67 | + this->lastTransitionValue = val; | |
68 | +} | |
69 | + | |
70 | +template <class T> | |
61 | 71 | State<T>::~State() { |
62 | 72 | |
63 | 73 | } |
... | ... |
morfeusz/fsa/test_recognize.cpp
... | ... | @@ -24,8 +24,7 @@ void doTest(const FSA<char*>& fsa, const char* fname) { |
24 | 24 | vector<string> splitVector(split(line, '\t')); |
25 | 25 | string key = splitVector[0]; |
26 | 26 | |
27 | - // cerr << "test " << key << endl; | |
28 | - | |
27 | + cerr << "test " << key << endl; | |
29 | 28 | char* value2; |
30 | 29 | validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); |
31 | 30 | } |
... | ... |
morfeusz/segrules/SegrulesDeserializer.cpp
0 → 100644
1 | +/* | |
2 | + * File: SegrulesDeserializer.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 25 luty 2014, 16:16 | |
6 | + */ | |
7 | + | |
8 | +#include "SegrulesDeserializer.hpp" | |
9 | + | |
10 | +SegrulesDeserializer::SegrulesDeserializer() { | |
11 | +} | |
12 | + | |
13 | +long SegrulesDeserializer::deserialize(const unsigned char* ptr, unsigned char& object) const { // copies the single byte at ptr into object | |
14 | + object = *ptr; | |
15 | + return 1; // the original fell off the end of a non-void function (undefined behaviour); NOTE(review): 1 assumes the Deserializer contract is "number of bytes consumed" - confirm | |
16 | +} | |
17 | +SegrulesDeserializer::~SegrulesDeserializer() { | |
18 | +} | |
19 | + | |
... | ... |
morfeusz/segrules/SegrulesDeserializer.hpp
0 → 100644
1 | +/* | |
2 | + * File: SegrulesDeserializer.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 25 luty 2014, 16:16 | |
6 | + */ | |
7 | + | |
8 | +#ifndef SEGRULESDESERIALIZER_HPP | |
9 | +#define SEGRULESDESERIALIZER_HPP | |
10 | + | |
11 | +#include "../fsa/fsa.hpp" | |
12 | + | |
13 | +class SegrulesDeserializer: public Deserializer<unsigned char> { | |
14 | +public: | |
15 | + SegrulesDeserializer(); | |
16 | + long deserialize(const unsigned char* ptr, unsigned char& object) const; | |
17 | + virtual ~SegrulesDeserializer(); | |
18 | +private: | |
19 | + | |
20 | +}; | |
21 | + | |
22 | +#endif /* SEGRULESDESERIALIZER_HPP */ | |
23 | + | |
... | ... |
morfeusz/segrules/segrules.cpp
0 → 100644
1 | + | |
2 | +#include "SegrulesDeserializer.hpp" | |
3 | +#include "segrules.hpp" | |
4 | +#include "../fsa/fsa.hpp" | |
5 | +// Allocates a transducer-mode SimpleFSA<unsigned char> over analyzerPtr using a shared static SegrulesDeserializer; the caller must delete the result. NOTE(review): analyzerPtr is passed unmodified, whereas FSA::getFSA first skips FSA_DATA_OFFSET header bytes - confirm the intended start offset. | |
6 | +FSA<unsigned char>* createSegrulesFSA(const unsigned char* analyzerPtr) { | |
7 | + static SegrulesDeserializer deserializer; | |
8 | + return new SimpleFSA<unsigned char>(analyzerPtr, deserializer, true); | |
9 | +} | |
... | ... |
morfeusz/segrules/segrules.hpp
0 → 100644
1 | +/* | |
2 | + * File: segrules.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 25 luty 2014, 16:35 | |
6 | + */ | |
7 | + | |
8 | +#ifndef SEGRULES_HPP | |
9 | +#define SEGRULES_HPP | |
10 | + | |
11 | +#include "../fsa/fsa.hpp" | |
12 | + | |
13 | +FSA<unsigned char>* createSegrulesFSA(const unsigned char* analyzerPtr); | |
14 | + | |
15 | +#endif /* SEGRULES_HPP */ | |
16 | + | |
... | ... |
nbproject/configurations.xml
... | ... | @@ -26,6 +26,10 @@ |
26 | 26 | <in>test_recognize.cpp</in> |
27 | 27 | <in>test_speed.cpp</in> |
28 | 28 | </df> |
29 | + <df name="segrules"> | |
30 | + <in>SegrulesDeserializer.cpp</in> | |
31 | + <in>segrules.cpp</in> | |
32 | + </df> | |
29 | 33 | <in>Environment.cpp</in> |
30 | 34 | <in>FlexionGraph.cpp</in> |
31 | 35 | <in>Generator.cpp</in> |
... | ... | @@ -35,7 +39,6 @@ |
35 | 39 | <in>MorphInterpretation.cpp</in> |
36 | 40 | <in>Tagset.cpp</in> |
37 | 41 | <in>const.cpp</in> |
38 | - <in>exceptions.hpp</in> | |
39 | 42 | <in>morfeusz_analyzer.cpp</in> |
40 | 43 | <in>morfeusz_generator.cpp</in> |
41 | 44 | <in>test_recognize_dict.cpp</in> |
... | ... | @@ -181,6 +184,23 @@ |
181 | 184 | </undefinedList> |
182 | 185 | </ccTool> |
183 | 186 | </folder> |
187 | + <folder path="0/segrules"> | |
188 | + <ccTool> | |
189 | + <incDir> | |
190 | + <pElem>build</pElem> | |
191 | + <pElem>morfeusz</pElem> | |
192 | + <pElem>build/morfeusz</pElem> | |
193 | + </incDir> | |
194 | + <preprocessorList> | |
195 | + <Elem>__PIC__=2</Elem> | |
196 | + <Elem>__pic__=2</Elem> | |
197 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
198 | + </preprocessorList> | |
199 | + <undefinedList> | |
200 | + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
201 | + </undefinedList> | |
202 | + </ccTool> | |
203 | + </folder> | |
184 | 204 | <folder path="build"> |
185 | 205 | <ccTool> |
186 | 206 | <incDir> |
... | ... | @@ -418,8 +438,6 @@ |
418 | 438 | <ccTool> |
419 | 439 | </ccTool> |
420 | 440 | </item> |
421 | - <item path="morfeusz/exceptions.hpp" ex="false" tool="3" flavor2="0"> | |
422 | - </item> | |
423 | 441 | <item path="morfeusz/fsa/const.cpp" ex="false" tool="1" flavor2="4"> |
424 | 442 | <ccTool> |
425 | 443 | <incDir> |
... | ... | @@ -479,6 +497,17 @@ |
479 | 497 | </incDir> |
480 | 498 | </ccTool> |
481 | 499 | </item> |
500 | + <item path="morfeusz/segrules/SegrulesDeserializer.cpp" | |
501 | + ex="false" | |
502 | + tool="1" | |
503 | + flavor2="4"> | |
504 | + <ccTool> | |
505 | + </ccTool> | |
506 | + </item> | |
507 | + <item path="morfeusz/segrules/segrules.cpp" ex="false" tool="1" flavor2="4"> | |
508 | + <ccTool> | |
509 | + </ccTool> | |
510 | + </item> | |
482 | 511 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
483 | 512 | <ccTool> |
484 | 513 | <incDir> |
... | ... |