Commit e2ef01be9498be975a0fdd7a7cb68024beaa6c22
1 parent
e05d60fb
- praca nad słownikiem z uwzględnieniem tagsetu
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@14 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
11 changed files
with
121 additions
and
27 deletions
fsa/CMakeLists.txt
... | ... | @@ -3,7 +3,9 @@ add_executable (test_speed test_speed.cpp) |
3 | 3 | add_executable (test_speed_profile test_speed.cpp) |
4 | 4 | add_executable (test_recognize test_recognize.cpp) |
5 | 5 | add_executable (test_not_recognize test_not_recognize.cpp) |
6 | -set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | |
6 | +add_executable (test_morph test_morph.cpp) | |
7 | +set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align" ) | |
7 | 8 | set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" ) |
8 | 9 | set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
9 | 10 | set_target_properties ( test_not_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
11 | +set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | |
... | ... |
fsa/cfsa1_impl.hpp
... | ... | @@ -54,11 +54,11 @@ void CompressedFSA1<T>::reallyDoProceed( |
54 | 54 | const unsigned char* statePtr, |
55 | 55 | State<T>& state) const { |
56 | 56 | // const unsigned char stateByte = *statePtr; |
57 | - StateData2* sd = (StateData2*) statePtr; | |
57 | + const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr); | |
58 | 58 | if (sd->accepting) { |
59 | 59 | // cerr << "ACCEPTING" << endl; |
60 | 60 | T object; |
61 | - int size = this->deserializer.deserialize(statePtr + 1, object); | |
61 | + long size = this->deserializer.deserialize(statePtr + 1, object); | |
62 | 62 | state.setNext(statePtr - this->initialStatePtr, object, size); |
63 | 63 | } |
64 | 64 | else { |
... | ... | @@ -163,11 +163,11 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { |
163 | 163 | if (shortLabel > 0) { |
164 | 164 | this->doProceedToNextByArray( |
165 | 165 | shortLabel, |
166 | - (uint32_t*) (fromPointer + transitionsTableOffset), | |
166 | + reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset), | |
167 | 167 | state); |
168 | 168 | } |
169 | 169 | else { |
170 | - reallyDoProceed((unsigned char*) fromPointer + transitionsTableOffset + 256, state); | |
170 | + reallyDoProceed(fromPointer + transitionsTableOffset + 256, state); | |
171 | 171 | proceedToNext(c, state); |
172 | 172 | } |
173 | 173 | } |
... | ... | @@ -175,7 +175,7 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { |
175 | 175 | this->doProceedToNextByList( |
176 | 176 | c, |
177 | 177 | shortLabel, |
178 | - (unsigned char*) (fromPointer + transitionsTableOffset), | |
178 | + fromPointer + transitionsTableOffset, | |
179 | 179 | sd->transitionsNum, |
180 | 180 | state); |
181 | 181 | } |
... | ... |
fsa/cfsa2_impl.hpp
... | ... | @@ -46,7 +46,7 @@ void CompressedFSA2<T>::reallyDoProceed( |
46 | 46 | State<T>& state) const { |
47 | 47 | if (accepting) { |
48 | 48 | T object; |
49 | - int size = this->deserializer.deserialize(statePtr + 1, object); | |
49 | + long size = this->deserializer.deserialize(statePtr + 1, object); | |
50 | 50 | state.setNext(statePtr - this->initialStatePtr, object, size); |
51 | 51 | } else { |
52 | 52 | state.setNext(statePtr - this->initialStatePtr); |
... | ... | @@ -81,7 +81,7 @@ void CompressedFSA2<T>::doProceedToNextByList( |
81 | 81 | const char c, |
82 | 82 | const unsigned char* ptr, |
83 | 83 | State<T>& state) const { |
84 | - register unsigned char* currPtr = const_cast<unsigned char*> (ptr); | |
84 | + unsigned char* currPtr = const_cast<unsigned char*> (ptr); | |
85 | 85 | while (true) { |
86 | 86 | // const_cast<Counter*>(&counter)->increment(1); |
87 | 87 | if ((char) *currPtr == c) { |
... | ... | @@ -141,7 +141,7 @@ void CompressedFSA2<T>::proceedToNext(const char c, State<T>& state) const { |
141 | 141 | } |
142 | 142 | this->doProceedToNextByList( |
143 | 143 | c, |
144 | - (unsigned char*) (fromPointer + transitionsTableOffset), | |
144 | + fromPointer + transitionsTableOffset, | |
145 | 145 | state); |
146 | 146 | } |
147 | 147 | |
... | ... |
fsa/fsa.hpp
... | ... | @@ -15,6 +15,7 @@ |
15 | 15 | #include <exception> |
16 | 16 | #include <string> |
17 | 17 | #include <vector> |
18 | +#include "interpretation.hpp" | |
18 | 19 | |
19 | 20 | template <class T> class State; |
20 | 21 | template <class T> class FSA; |
... | ... | @@ -29,7 +30,7 @@ public: |
29 | 30 | * Deserialize object from ptr. |
30 | 31 | * Returns number of bytes read or -1 on error. |
31 | 32 | */ |
32 | - virtual int deserialize(const unsigned char* ptr, T& object) const = 0; | |
33 | + virtual long deserialize(const unsigned char* ptr, T& object) const = 0; | |
33 | 34 | }; |
34 | 35 | |
35 | 36 | class StringDeserializer : public Deserializer<char*> { |
... | ... | @@ -42,13 +43,17 @@ public: |
42 | 43 | * Deserialize object from ptr. |
43 | 44 | * Returns number of bytes read or -1 on error. |
44 | 45 | */ |
45 | - int deserialize(const unsigned char* ptr, char*& text) const { | |
46 | + long deserialize(const unsigned char* ptr, char*& text) const { | |
46 | 47 | // text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); |
47 | 48 | // return strlen(text) + 1; |
48 | 49 | return 1; |
49 | 50 | } |
50 | 51 | }; |
51 | 52 | |
53 | +class MorphDeserializer: public Deserializer<std::vector<Interpretation>> { | |
54 | + long deserialize(const unsigned char* ptr, std::vector<Interpretation>& interp) const; | |
55 | +}; | |
56 | + | |
52 | 57 | class Counter { |
53 | 58 | public: |
54 | 59 | |
... | ... | @@ -217,18 +222,18 @@ public: |
217 | 222 | */ |
218 | 223 | unsigned int getValueSize() const; |
219 | 224 | |
220 | - unsigned int getOffset() const; | |
225 | + unsigned long getOffset() const; | |
221 | 226 | |
222 | - void setNext(const unsigned int offset); | |
223 | - void setNext(const unsigned int offset, const T& value, const unsigned int valueSize); | |
227 | + void setNext(const unsigned long offset); | |
228 | + void setNext(const unsigned long offset, const T& value, const unsigned int valueSize); | |
224 | 229 | void setNextAsSink(); |
225 | 230 | |
226 | - State(const FSA<T>& fsa); | |
231 | + explicit State(const FSA<T>& fsa); | |
227 | 232 | |
228 | 233 | virtual ~State(); |
229 | 234 | private: |
230 | 235 | const FSA<T>& fsa; |
231 | - unsigned int offset; | |
236 | + unsigned long offset; | |
232 | 237 | bool accepting; |
233 | 238 | bool sink; |
234 | 239 | T value; |
... | ... |
fsa/fsa_impl.hpp
... | ... | @@ -8,9 +8,11 @@ |
8 | 8 | #ifndef _SIMPLE_FSA_IMPL_HPP |
9 | 9 | #define _SIMPLE_FSA_IMPL_HPP |
10 | 10 | |
11 | +#include <cstring> | |
11 | 12 | #include <algorithm> |
12 | 13 | #include <utility> |
13 | 14 | #include <iostream> |
15 | +#include <vector> | |
14 | 16 | #include <netinet/in.h> |
15 | 17 | #include "fsa.hpp" |
16 | 18 | #include "utils.hpp" |
... | ... | @@ -84,4 +86,34 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
84 | 86 | } |
85 | 87 | } |
86 | 88 | |
89 | +static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) { | |
90 | + // XXX be careful about the correctness of the input data | |
91 | + lemma.suffixToCut = *ptr; | |
92 | + ptr++; | |
93 | + lemma.suffixToAdd = (const char*) ptr; | |
94 | + ptr += strlen((const char*) ptr) + 1; | |
95 | +} | |
96 | + | |
97 | +static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) { | |
98 | + deserializeLemma(ptr, interp.lemma); | |
99 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
100 | + ptr += 2; | |
101 | + interp.nameClassifier = *ptr; | |
102 | + ptr++; | |
103 | +} | |
104 | + | |
105 | +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const { | |
106 | + const unsigned char* currPtr = ptr; | |
107 | + uint8_t interpsNum = *ptr; | |
108 | + interps.clear(); | |
109 | + interps.reserve(interpsNum); | |
110 | + currPtr++; | |
111 | + for (unsigned int i = 0; i < interpsNum; i++) { | |
112 | + Interpretation interp; | |
113 | + deserializeInterp(currPtr, interp); | |
114 | + interps.push_back(interp); | |
115 | + } | |
116 | + return currPtr - ptr; | |
117 | +} | |
118 | + | |
87 | 119 | #endif /* _SIMPLE_FSA_IMPL_HPP */ |
... | ... |
fsa/interpretation.hpp
... | ... | @@ -14,15 +14,15 @@ |
14 | 14 | using namespace std; |
15 | 15 | |
16 | 16 | struct Lemma { |
17 | - int suffixToCut; | |
18 | - string suffixToAdd; | |
17 | + unsigned short suffixToCut; | |
18 | + const char* suffixToAdd; | |
19 | 19 | }; |
20 | 20 | |
21 | 21 | struct Interpretation { |
22 | 22 | Lemma lemma; |
23 | - list<int> tag; // e.g. subst:sg:nom:m1 | |
24 | - int nameClassifier; // e.g. "pospolita" (Polish: "common") | |
25 | - int qualifier; // e.g. "dawne" (Polish: "dated") or "potoczne" (Polish: "colloquial") | |
23 | + unsigned int tag; // e.g. subst:sg:nom:m1 | |
24 | + unsigned short nameClassifier; // e.g. "pospolita" (Polish: "common") | |
25 | + unsigned short qualifier; // e.g. "dawne" (Polish: "dated") or "potoczne" (Polish: "colloquial") | |
26 | 26 | }; |
27 | 27 | |
28 | 28 | #endif /* INTERPRETATION_HPP */ |
... | ... |
fsa/simplefsa_impl.hpp
... | ... | @@ -41,7 +41,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
41 | 41 | if (state.isAccepting()) { |
42 | 42 | transitionsTableOffset += state.getValueSize(); |
43 | 43 | } |
44 | - StateData stateData = *(StateData*) (fromPointer); | |
44 | + StateData stateData = *reinterpret_cast<const StateData*>(fromPointer); | |
45 | 45 | const unsigned char* foundTransition = fromPointer + transitionsTableOffset; |
46 | 46 | bool found = false; |
47 | 47 | for (int i = 0; i < stateData.transitionsNum; i++, foundTransition += 4) { |
... | ... | @@ -57,7 +57,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
57 | 57 | else { |
58 | 58 | unsigned int offset = decodeOffset(foundTransition + 1); |
59 | 59 | const unsigned char* nextStatePointer = this->initialStatePtr + offset; |
60 | - StateData* nextStateData = (StateData*) (nextStatePointer); | |
60 | + const StateData* nextStateData = reinterpret_cast<const StateData*>(nextStatePointer); | |
61 | 61 | if (nextStateData->accepting) { |
62 | 62 | T object; |
63 | 63 | int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); |
... | ... |
fsa/state_impl.hpp
... | ... | @@ -39,7 +39,7 @@ void State<T>::proceedToNext(const char c) { |
39 | 39 | } |
40 | 40 | |
41 | 41 | template <class T> |
42 | -unsigned int State<T>::getOffset() const { | |
42 | +unsigned long State<T>::getOffset() const { | |
43 | 43 | assert(!this->isSink()); |
44 | 44 | return this->offset; |
45 | 45 | } |
... | ... | @@ -62,14 +62,14 @@ State<T>::~State() { |
62 | 62 | } |
63 | 63 | |
64 | 64 | template <class T> |
65 | -void State<T>::setNext(const unsigned int offset) { | |
65 | +void State<T>::setNext(const unsigned long offset) { | |
66 | 66 | // assert(!this->isSink()); |
67 | 67 | this->offset = offset; |
68 | 68 | this->accepting = false; |
69 | 69 | } |
70 | 70 | |
71 | 71 | template <class T> |
72 | -void State<T>::setNext(const unsigned int offset, const T& value, const unsigned int valueSize) { | |
72 | +void State<T>::setNext(const unsigned long offset, const T& value, const unsigned int valueSize) { | |
73 | 73 | // assert(!this->isSink()); |
74 | 74 | this->offset = offset; |
75 | 75 | this->accepting = true; |
... | ... |
fsa/test_morph.cpp
0 → 100644
1 | +/* | |
2 | + * File: test_morph.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on November 8, 2013, 4:12 PM | |
6 | + */ | |
7 | + | |
8 | +#include <cstdlib> | |
9 | +#include <sstream> | |
10 | +#include <iostream> | |
11 | +#include "fsa.hpp" | |
12 | +#include "utils.hpp" | |
13 | + | |
14 | +using namespace std; | |
15 | + | |
16 | +void debug(const string& key, const vector<Interpretation> value) { | |
17 | + cerr << key << endl; | |
18 | + for (Interpretation i: value) { | |
19 | + cerr << "suffix to cut: " << i.lemma.suffixToCut << endl; | |
20 | + cerr << "suffix to add: " << i.lemma.suffixToAdd << endl; | |
21 | + cerr << "tag: " << i.tag << endl; | |
22 | + cerr << "name: " << i.nameClassifier << endl; | |
23 | + } | |
24 | + cerr << "==================" << endl; | |
25 | +} | |
26 | + | |
27 | +void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) { | |
28 | + ifstream ifs; | |
29 | + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | |
30 | + ifs.open(fname, ios::binary); | |
31 | + string line; | |
32 | + while (getline(ifs, line)) { | |
33 | + vector<string> splitVector(split(line, '\t')); | |
34 | + string key = splitVector[0]; | |
35 | + vector<Interpretation> value2; | |
36 | + fsa.tryToRecognize(key.c_str(), value2); | |
37 | + debug(key, value2); | |
38 | +// validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); | |
39 | + } | |
40 | + validate(ifs.eof(), "Failed to read the input file to the end"); | |
41 | +} | |
42 | + | |
43 | +int main(int argc, char** argv) { | |
44 | + validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); | |
45 | + const unsigned char* fsaData = readFile(argv[1]); | |
46 | + MorphDeserializer deserializer; | |
47 | + FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer); | |
48 | + doTest(*fsa, argv[2]); | |
49 | + // cout << argc << endl; | |
50 | + delete fsa; | |
51 | + return 0; | |
52 | +} | |
53 | + | |
... | ... |
fsa/test_recognize.cpp
... | ... | @@ -35,7 +35,6 @@ void doTest(const FSA<char*>& fsa, const char* fname) { |
35 | 35 | } |
36 | 36 | |
37 | 37 | int main(int argc, char** argv) { |
38 | - cerr << (int) ((unsigned char) - 123) << endl; | |
39 | 38 | validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); |
40 | 39 | const unsigned char* fsaData = readFile(argv[1]); |
41 | 40 | StringDeserializer deserializer; |
... | ... |
nbproject/configurations.xml
... | ... | @@ -6,6 +6,7 @@ |
6 | 6 | <in>cfsa2_impl.hpp</in> |
7 | 7 | <in>interpretation.hpp</in> |
8 | 8 | <in>simplefsa_impl.hpp</in> |
9 | + <in>test_morph.cpp</in> | |
9 | 10 | <in>test_not_recognize.cpp</in> |
10 | 11 | <in>test_recognize.cpp</in> |
11 | 12 | <in>test_speed.cpp</in> |
... | ... | @@ -71,6 +72,8 @@ |
71 | 72 | </item> |
72 | 73 | <item path="fsa/simplefsa_impl.hpp" ex="false" tool="3" flavor2="0"> |
73 | 74 | </item> |
75 | + <item path="fsa/test_morph.cpp" ex="false" tool="1" flavor2="0"> | |
76 | + </item> | |
74 | 77 | <item path="fsa/test_not_recognize.cpp" ex="false" tool="1" flavor2="8"> |
75 | 78 | <ccTool> |
76 | 79 | </ccTool> |
... | ... |