Commit 4bf99e6b94697f082572eb35c0dcdd60c3711761
1 parent
d42f73bc
- prawie działa rozpoznawanie informacji morfologicznej
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@16 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
22 changed files
with
498 additions
and
140 deletions
fsa/CMakeLists.txt
1 | 1 | |
2 | -add_executable (test_speed test_speed.cpp) | |
3 | -add_executable (test_speed_profile test_speed.cpp) | |
4 | -add_executable (test_recognize test_recognize.cpp) | |
5 | -add_executable (test_not_recognize test_not_recognize.cpp) | |
6 | -add_executable (test_morph test_morph.cpp) | |
2 | +add_executable (test_speed test_speed.cpp const.cpp) | |
3 | +add_executable (test_speed_profile test_speed.cpp const.cpp) | |
4 | +add_executable (test_recognize test_recognize.cpp const.cpp) | |
5 | +add_executable (test_not_recognize test_not_recognize.cpp const.cpp) | |
6 | + | |
7 | 7 | set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align" ) |
8 | 8 | set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" ) |
9 | 9 | set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
10 | 10 | set_target_properties ( test_not_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
11 | -set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | |
... | ... |
fsa/cfsa1_impl.hpp
... | ... | @@ -79,7 +79,7 @@ void CompressedFSA1<T>::doProceedToNextByList( |
79 | 79 | TransitionData2 td; |
80 | 80 | for (unsigned int i = 0; i < transitionsNum; i++) { |
81 | 81 | // const_cast<Counter*>(&counter)->increment(1); |
82 | - td = *((TransitionData2*) currPtr); | |
82 | + td = *(reinterpret_cast<const TransitionData2*>(currPtr)); | |
83 | 83 | if (td.shortLabel == shortLabel) { |
84 | 84 | if (shortLabel == 0) { |
85 | 85 | currPtr++; |
... | ... | @@ -107,7 +107,8 @@ void CompressedFSA1<T>::doProceedToNextByList( |
107 | 107 | if (!found) { |
108 | 108 | // cerr << "SINK for " << c << endl; |
109 | 109 | state.setNextAsSink(); |
110 | - } else { | |
110 | + } | |
111 | + else { | |
111 | 112 | currPtr++; |
112 | 113 | // cerr << "offset size " << td.offsetSize << endl; |
113 | 114 | // cerr << "offset " << offset << endl; |
... | ... | @@ -152,12 +153,12 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { |
152 | 153 | // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; |
153 | 154 | const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); |
154 | 155 | unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; |
155 | - unsigned int transitionsTableOffset = 1; | |
156 | + unsigned long transitionsTableOffset = 1; | |
156 | 157 | if (state.isAccepting()) { |
157 | 158 | transitionsTableOffset += state.getValueSize(); |
158 | 159 | // cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; |
159 | 160 | } |
160 | - StateData2* sd = (StateData2*) (fromPointer); | |
161 | + const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer); | |
161 | 162 | // cerr << "transitions num=" << sd->transitionsNum << endl; |
162 | 163 | if (sd->array) { |
163 | 164 | if (shortLabel > 0) { |
... | ... |
fsa/cfsa2_impl.hpp
... | ... | @@ -135,7 +135,7 @@ void CompressedFSA2<T>::proceedToNext(const char c, State<T>& state) const { |
135 | 135 | cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; |
136 | 136 | #endif |
137 | 137 | const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); |
138 | - unsigned int transitionsTableOffset = 0; | |
138 | + unsigned long transitionsTableOffset = 0; | |
139 | 139 | if (state.isAccepting()) { |
140 | 140 | transitionsTableOffset += state.getValueSize(); |
141 | 141 | } |
... | ... |
fsa/const.cpp
0 → 100644
1 | + | |
2 | +#include "const.hpp" | |
3 | + | |
// Definitions of the constants declared in const.hpp.
// `extern` stays on every definition so each constant has external linkage
// (a namespace-scope const object is otherwise local to its translation unit).

// Signature and format version of the serialized automaton
// (presumably compared against the file header by the loader -- confirm there).
extern const uint32_t MAGIC_NUMBER(0x8fc2bc1b);
extern const uint8_t VERSION_NUM(9);

// Byte offsets of the header fields, counted from the start of the data.
extern const unsigned int VERSION_NUM_OFFSET(4);
extern const unsigned int IMPLEMENTATION_NUM_OFFSET(5);
extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET(6);
extern const unsigned int ADDITIONAL_DATA_OFFSET(10);
... | ... |
fsa/const.hpp
0 → 100644
1 | +/* | |
2 | + * File: const.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 12 listopad 2013, 14:11 | |
6 | + */ | |
7 | + | |
#ifndef CONST_HPP
#define CONST_HPP

// uint8_t / uint32_t are used directly below, so include the header that
// actually defines them instead of relying on <netinet/in.h> pulling it in.
#include <stdint.h>
// Kept for backward compatibility: files including this header may rely on it
// for the byte-order helpers (ntohl/ntohs).
#include <netinet/in.h>

// Signature and format version of the serialized automaton.
extern const uint32_t MAGIC_NUMBER;
extern const uint8_t VERSION_NUM;

// Byte offsets of the header fields, counted from the start of the data.
extern const unsigned int VERSION_NUM_OFFSET;
extern const unsigned int IMPLEMENTATION_NUM_OFFSET;
extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET;
extern const unsigned int ADDITIONAL_DATA_OFFSET;

#endif	/* CONST_HPP */
22 | + | |
... | ... |
fsa/fsa.hpp
... | ... | @@ -9,13 +9,12 @@ |
9 | 9 | #define FSA_HPP |
10 | 10 | |
11 | 11 | //#include <iostream> |
12 | -//#include <cstring> | |
13 | -#include <typeinfo> | |
12 | +#include <cstring> | |
14 | 13 | #include <cassert> |
14 | +#include <typeinfo> | |
15 | 15 | #include <exception> |
16 | 16 | #include <string> |
17 | 17 | #include <vector> |
18 | -#include "interpretation.hpp" | |
19 | 18 | |
20 | 19 | template <class T> class State; |
21 | 20 | template <class T> class FSA; |
... | ... | @@ -44,16 +43,12 @@ public: |
44 | 43 | * Returns number of bytes read or -1 on error. |
45 | 44 | */ |
46 | 45 | long deserialize(const unsigned char* ptr, char*& text) const { |
47 | - // text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); | |
48 | - // return strlen(text) + 1; | |
49 | - return 1; | |
46 | + text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); | |
47 | + return strlen(text) + 1; | |
48 | +// return 1; | |
50 | 49 | } |
51 | 50 | }; |
52 | 51 | |
53 | -class MorphDeserializer: public Deserializer<std::vector<Interpretation>> { | |
54 | - long deserialize(const unsigned char* ptr, std::vector<Interpretation>& interp) const; | |
55 | -}; | |
56 | - | |
57 | 52 | class Counter { |
58 | 53 | public: |
59 | 54 | |
... | ... | @@ -88,8 +83,6 @@ public: |
88 | 83 | */ |
89 | 84 | static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); |
90 | 85 | |
91 | - static const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; | |
92 | - static const uint8_t VERSION_NUM = 8; | |
93 | 86 | protected: |
94 | 87 | |
95 | 88 | /** |
... | ... | @@ -105,10 +98,6 @@ protected: |
105 | 98 | const Deserializer<T>& deserializer; |
106 | 99 | friend class State<T>; |
107 | 100 | private: |
108 | - static int getMagicNumberOffset(); | |
109 | - static int getVersionNumOffset(); | |
110 | - static int getPopularCharsOffset(); | |
111 | - static int getInitialStateOffset(); | |
112 | 101 | // FSA(); |
113 | 102 | }; |
114 | 103 | |
... | ... | @@ -220,12 +209,12 @@ public: |
220 | 209 | * Makes sense only for accepting states. |
221 | 210 | * For non-accepting states is throws an exception. |
222 | 211 | */ |
223 | - unsigned int getValueSize() const; | |
212 | + unsigned long getValueSize() const; | |
224 | 213 | |
225 | 214 | unsigned long getOffset() const; |
226 | 215 | |
227 | 216 | void setNext(const unsigned long offset); |
228 | - void setNext(const unsigned long offset, const T& value, const unsigned int valueSize); | |
217 | + void setNext(const unsigned long offset, const T& value, const unsigned long valueSize); | |
229 | 218 | void setNextAsSink(); |
230 | 219 | |
231 | 220 | explicit State(const FSA<T>& fsa); |
... | ... | @@ -237,7 +226,7 @@ private: |
237 | 226 | bool accepting; |
238 | 227 | bool sink; |
239 | 228 | T value; |
240 | - int valueSize; | |
229 | + long valueSize; | |
241 | 230 | }; |
242 | 231 | |
243 | 232 | class FSAException : public std::exception { |
... | ... |
fsa/fsa_impl.hpp
... | ... | @@ -14,14 +14,11 @@ |
14 | 14 | #include <iostream> |
15 | 15 | #include <vector> |
16 | 16 | #include <netinet/in.h> |
17 | -#include "fsa.hpp" | |
18 | 17 | #include "utils.hpp" |
18 | +#include "const.hpp" | |
19 | 19 | |
20 | 20 | using namespace std; |
21 | - | |
22 | -static const unsigned int VERSION_NUM_OFFSET = 4; | |
23 | -static const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; | |
24 | -static const unsigned int FSA_OFFSET = 6; | |
21 | +//static const unsigned int FSA_OFFSET = 6; | |
25 | 22 | |
26 | 23 | template <class T> |
27 | 24 | bool FSA<T>::tryToRecognize(const char* input, T& value) const { |
... | ... | @@ -73,7 +70,9 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
73 | 70 | } |
74 | 71 | |
75 | 72 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
76 | - const unsigned char* startPtr = ptr + FSA_OFFSET; | |
73 | + | |
74 | + uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); | |
75 | + const unsigned char* startPtr = ptr + ADDITIONAL_DATA_OFFSET + additionalDataSize; | |
77 | 76 | switch (implementationNum) { |
78 | 77 | case 0: |
79 | 78 | return new SimpleFSA<T>(startPtr, deserializer); |
... | ... | @@ -86,34 +85,4 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
86 | 85 | } |
87 | 86 | } |
88 | 87 | |
89 | -static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) { | |
90 | - // XXX uważać na poprawność danych | |
91 | - lemma.suffixToCut = *ptr; | |
92 | - ptr++; | |
93 | - lemma.suffixToAdd = (const char*) ptr; | |
94 | - ptr += strlen((const char*) ptr) + 1; | |
95 | -} | |
96 | - | |
97 | -static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) { | |
98 | - deserializeLemma(ptr, interp.lemma); | |
99 | - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
100 | - ptr += 2; | |
101 | - interp.nameClassifier = *ptr; | |
102 | - ptr++; | |
103 | -} | |
104 | - | |
105 | -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const { | |
106 | - const unsigned char* currPtr = ptr; | |
107 | - uint8_t interpsNum = *ptr; | |
108 | - interps.clear(); | |
109 | - interps.reserve(interpsNum); | |
110 | - currPtr++; | |
111 | - for (unsigned int i = 0; i < interpsNum; i++) { | |
112 | - Interpretation interp; | |
113 | - deserializeInterp(currPtr, interp); | |
114 | - interps.push_back(interp); | |
115 | - } | |
116 | - return currPtr - ptr; | |
117 | -} | |
118 | - | |
119 | 88 | #endif /* _SIMPLE_FSA_IMPL_HPP */ |
... | ... |
fsa/interpretation.hpp deleted
1 | -/* | |
2 | - * File: interpretation.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on November 4, 2013, 3:11 PM | |
6 | - */ | |
7 | - | |
8 | -#ifndef INTERPRETATION_HPP | |
9 | -#define INTERPRETATION_HPP | |
10 | - | |
11 | -#include <string> | |
12 | -#include <list> | |
13 | - | |
14 | -using namespace std; | |
15 | - | |
16 | -struct Lemma { | |
17 | - unsigned short suffixToCut; | |
18 | - const char* suffixToAdd; | |
19 | -}; | |
20 | - | |
21 | -struct Interpretation { | |
22 | - Lemma lemma; | |
23 | - unsigned int tag; // np. subst:sg:nom:m1 | |
24 | - unsigned short nameClassifier; // np. "pospolita" | |
25 | - unsigned short qualifier; // np. "dawne" lub "potoczne" | |
26 | -}; | |
27 | - | |
28 | -#endif /* INTERPRETATION_HPP */ | |
29 | - |
fsa/simplefsa_impl.hpp
... | ... | @@ -37,7 +37,7 @@ static unsigned int decodeOffset(const unsigned char* ptr) { |
37 | 37 | template <class T> |
38 | 38 | void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
39 | 39 | const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); |
40 | - int transitionsTableOffset = sizeof (StateData); | |
40 | + long transitionsTableOffset = sizeof (StateData); | |
41 | 41 | if (state.isAccepting()) { |
42 | 42 | transitionsTableOffset += state.getValueSize(); |
43 | 43 | } |
... | ... | @@ -60,7 +60,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
60 | 60 | const StateData* nextStateData = reinterpret_cast<const StateData*>(nextStatePointer); |
61 | 61 | if (nextStateData->accepting) { |
62 | 62 | T object; |
63 | - int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); | |
63 | + long size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); | |
64 | 64 | state.setNext(offset, object, size); |
65 | 65 | } else { |
66 | 66 | state.setNext(offset); |
... | ... |
fsa/state_impl.hpp
... | ... | @@ -51,7 +51,7 @@ T State<T>::getValue() const { |
51 | 51 | } |
52 | 52 | |
53 | 53 | template <class T> |
54 | -unsigned int State<T>::getValueSize() const { | |
54 | +unsigned long State<T>::getValueSize() const { | |
55 | 55 | assert(this->isAccepting()); |
56 | 56 | return this->valueSize; |
57 | 57 | } |
... | ... | @@ -69,7 +69,7 @@ void State<T>::setNext(const unsigned long offset) { |
69 | 69 | } |
70 | 70 | |
71 | 71 | template <class T> |
72 | -void State<T>::setNext(const unsigned long offset, const T& value, const unsigned int valueSize) { | |
72 | +void State<T>::setNext(const unsigned long offset, const T& value, const unsigned long valueSize) { | |
73 | 73 | // assert(!this->isSink()); |
74 | 74 | this->offset = offset; |
75 | 75 | this->accepting = true; |
... | ... |
fsa/utils.hpp
... | ... | @@ -9,14 +9,15 @@ |
9 | 9 | #define UTILS_HPP |
10 | 10 | |
11 | 11 | #include <iostream> |
12 | +#include <fstream> | |
12 | 13 | #include <sstream> |
13 | 14 | #include <string> |
14 | 15 | #include <fstream> |
15 | 16 | #include <vector> |
16 | 17 | |
17 | -using namespace std; | |
18 | +//using namespace std; | |
18 | 19 | |
19 | -//#define DEBUG_BUILD | |
20 | +#define DEBUG_BUILD | |
20 | 21 | |
21 | 22 | #ifdef DEBUG_BUILD |
22 | 23 | # define DEBUG(x) do { std::cerr << x << std::endl; } while (0) |
... | ... | @@ -24,14 +25,14 @@ using namespace std; |
24 | 25 | # define DEBUG(x) |
25 | 26 | #endif |
26 | 27 | |
27 | -void validate(const bool cond, const std::string& msg) { | |
28 | +inline void validate(const bool cond, const std::string& msg) { | |
28 | 29 | if (!cond) { |
29 | 30 | std::cerr << msg << std::endl; |
30 | 31 | exit(1); |
31 | 32 | } |
32 | 33 | } |
33 | 34 | |
34 | -std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { | |
35 | +inline std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { | |
35 | 36 | std::stringstream ss(s); |
36 | 37 | std::string item; |
37 | 38 | while (std::getline(ss, item, delim)) { |
... | ... | @@ -41,25 +42,25 @@ std::vector<std::string> &split(const std::string &s, char delim, std::vector<st |
41 | 42 | } |
42 | 43 | |
43 | 44 | |
44 | -std::vector<std::string> split(const std::string &s, char delim) { | |
45 | +inline std::vector<std::string> split(const std::string &s, char delim) { | |
45 | 46 | std::vector<std::string> elems; |
46 | 47 | split(s, delim, elems); |
47 | 48 | return elems; |
48 | 49 | } |
49 | 50 | |
50 | -string &rtrim(string &s) { | |
51 | - s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end()); | |
52 | - return s; | |
53 | -} | |
51 | +//string &rtrim(string &s) { | |
52 | +// s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end()); | |
53 | +// return s; | |
54 | +//} | |
54 | 55 | |
55 | -unsigned char* readFile(const char* fname) { | |
56 | - ifstream ifs; | |
56 | +inline unsigned char* readFile(const char* fname) { | |
57 | + std::ifstream ifs; | |
57 | 58 | ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); |
58 | - ifs.open(fname, ios::in | ios::binary | ios::ate); | |
59 | + ifs.open(fname, std::ios::in | std::ios::binary | std::ios::ate); | |
59 | 60 | // if (ifs.is_open()) { |
60 | - int size = ifs.tellg(); | |
61 | + long size = ifs.tellg(); | |
61 | 62 | unsigned char* memblock = new unsigned char [size]; |
62 | - ifs.seekg(0, ios::beg); | |
63 | + ifs.seekg(0, std::ios::beg); | |
63 | 64 | ifs.read(reinterpret_cast<char*> (memblock), size); |
64 | 65 | ifs.close(); |
65 | 66 | return memblock; |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -6,7 +6,10 @@ |
6 | 6 | include_directories (${Morfeusz_SOURCE_DIR}/fsa) |
7 | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
8 | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | +add_executable (test_morph test_morph.cpp interpretations.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp) | |
9 | 10 | |
10 | 11 | # Link the executable to the Hello library. |
11 | 12 | target_link_libraries (morfeusz2_analyze morfeusz2) |
12 | 13 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) |
14 | + | |
15 | +set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | |
... | ... |
morfeusz/MorphDeserializer.cpp
0 → 100644
1 | +/* | |
2 | + * File: MorphDeserializer.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 12 listopad 2013, 15:31 | |
6 | + */ | |
7 | + | |
8 | +#include "MorphDeserializer.hpp" | |
9 | + | |
10 | +MorphDeserializer::MorphDeserializer() { | |
11 | +} | |
12 | + | |
13 | +MorphDeserializer::MorphDeserializer(const MorphDeserializer& orig) { | |
14 | +} | |
15 | + | |
16 | +MorphDeserializer::~MorphDeserializer() { | |
17 | +} | |
18 | + | |
19 | +static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) { | |
20 | + // XXX uważać na poprawność danych | |
21 | + lemma.suffixToCut = *ptr; | |
22 | + ptr++; | |
23 | + lemma.suffixToAdd = (const char*) ptr; | |
24 | + ptr += strlen((const char*) ptr) + 1; | |
25 | +} | |
26 | + | |
27 | +static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) { | |
28 | + deserializeLemma(ptr, interp.lemma); | |
29 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); | |
30 | + ptr += 2; | |
31 | + interp.nameClassifier = *ptr; | |
32 | + ptr++; | |
33 | +} | |
34 | + | |
35 | +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const { | |
36 | + const unsigned char* currPtr = ptr; | |
37 | + uint8_t interpsNum = *ptr; | |
38 | + interps.clear(); | |
39 | + interps.reserve(interpsNum); | |
40 | + currPtr++; | |
41 | + for (unsigned int i = 0; i < interpsNum; i++) { | |
42 | + Interpretation interp; | |
43 | + deserializeInterp(currPtr, interp); | |
44 | + interps.push_back(interp); | |
45 | + } | |
46 | + return currPtr - ptr; | |
47 | +} | |
48 | + | |
... | ... |
morfeusz/MorphDeserializer.hpp
0 → 100644
1 | +/* | |
2 | + * File: MorphDeserializer.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 12 listopad 2013, 15:31 | |
6 | + */ | |
7 | + | |
8 | +#ifndef MORPHDESERIALIZER_HPP | |
9 | +#define MORPHDESERIALIZER_HPP | |
10 | + | |
11 | +#include <vector> | |
12 | +#include "fsa.hpp" | |
13 | +#include "interpretations.hpp" | |
14 | + | |
15 | +class MorphDeserializer: public Deserializer<std::vector<Interpretation>> { | |
16 | +public: | |
17 | + MorphDeserializer(); | |
18 | + MorphDeserializer(const MorphDeserializer& orig); | |
19 | + virtual ~MorphDeserializer(); | |
20 | + long deserialize( | |
21 | + const unsigned char* ptr, | |
22 | + std::vector<Interpretation>& interps) const; | |
23 | +private: | |
24 | + | |
25 | +}; | |
26 | + | |
27 | +#endif /* MORPHDESERIALIZER_HPP */ | |
28 | + | |
... | ... |
morfeusz/Tagset.cpp
0 → 100644
1 | + | |
2 | +#include <string> | |
3 | +#include <netinet/in.h> | |
4 | +#include "Tagset.hpp" | |
5 | +#include "const.hpp" | |
6 | +#include "utils.hpp" | |
7 | + | |
8 | +using namespace std; | |
9 | + | |
/**
 * Reads a 16-bit unsigned integer stored in network (big-endian) byte order
 * and advances the cursor by two bytes.
 *
 * @param currPtr in/out cursor into the serialized data
 * @return the decoded value in host byte order
 */
static uint16_t readInt16(const unsigned char*& currPtr) {
    // Built byte by byte: the value may sit at an odd address (it can follow a
    // variable-length string), so it must not be read through a uint16_t*.
    const uint16_t res = static_cast<uint16_t>((currPtr[0] << 8) | currPtr[1]);
    currPtr += 2;
    return res;
}
18 | + | |
/**
 * Reads a NUL-terminated string and advances the cursor past its terminator.
 * The input is not validated: a missing terminator reads past the buffer.
 *
 * @param currPtr in/out cursor into the serialized data
 * @return a copy of the string, without the terminator
 */
static std::string readString(const unsigned char*& currPtr) {
    std::string res(reinterpret_cast<const char*>(currPtr));
    currPtr += res.length() + 1;  // +1 skips the terminating NUL
    return res;
}
26 | + | |
27 | +static void readTags(const unsigned char*& currPtr, vector<string>& tags) { | |
28 | + tags.clear(); | |
29 | + tags.resize(65536); | |
30 | + uint16_t tagsNum = readInt16(currPtr); | |
31 | + DEBUG("hi there"); | |
32 | + DEBUG("tagsNum="+to_string((int) tagsNum)); | |
33 | + for (unsigned int i = 0; i < tagsNum; i++) { | |
34 | + unsigned int tagNum = readInt16(currPtr); | |
35 | + tags[tagNum] = readString(currPtr); | |
36 | + } | |
37 | +} | |
38 | + | |
39 | +Tagset::Tagset(const unsigned char* fsaData) { | |
40 | + const unsigned char* currPtr = fsaData + ADDITIONAL_DATA_OFFSET; | |
41 | +// uint32_t tagsNum = ntohl(*reinterpret_cast<const uint32_t*>(currPtr)); | |
42 | +// uint32_t namesNum = ntohl(*reinterpret_cast<const uint32_t*>(fsaData + ADDITIONAL_DATA_OFFSET + 4)); | |
43 | +// const unsigned char* currPtr = fsaData + 8; | |
44 | + DEBUG("will read tags"); | |
45 | + readTags(currPtr, this->tags); | |
46 | + DEBUG("will read names"); | |
47 | + readTags(currPtr, this->names); | |
48 | +} | |
49 | + | |
50 | +const string& Tagset::getTag(const int tagNum) const { | |
51 | + return this->tags.at(tagNum); | |
52 | +} | |
53 | + | |
54 | +const string& Tagset::getName(const int nameNum) const { | |
55 | + return this->names.at(nameNum); | |
56 | +} | |
... | ... |
morfeusz/Tagset.hpp
0 → 100644
1 | +/* | |
2 | + * File: Tagset.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 12 listopad 2013, 14:09 | |
6 | + */ | |
7 | + | |
8 | +#ifndef TAGSET_HPP | |
9 | +#define TAGSET_HPP | |
10 | + | |
11 | +#include <string> | |
12 | +#include <vector> | |
13 | + | |
/**
 * Two lookup tables (tags and names), each mapping a number to a string.
 */
class Tagset {
public:
    /** Builds both tables from the data of a serialized automaton. */
    explicit Tagset(const unsigned char* fsaData);

    /** Returns the tag string registered under tagNum. */
    const std::string& getTag(const int tagNum) const;

    /** Returns the name string registered under nameNum. */
    const std::string& getName(const int nameNum) const;

private:
    std::vector<std::string> tags;   // indexed by tag number
    std::vector<std::string> names;  // indexed by name number
};
23 | + | |
24 | +#endif /* TAGSET_HPP */ | |
25 | + | |
... | ... |
morfeusz/interpretations.cpp
0 → 100644
1 | + | |
2 | +#include "interpretations.hpp" | |
3 | +#include "Tagset.hpp" | |
4 | + | |
5 | +using namespace std; | |
6 | + | |
7 | +Interpretation::Interpretation() | |
8 | +: lemma(), tag(), nameClassifier() { | |
9 | + | |
10 | +} | |
11 | + | |
12 | +Interpretation::Interpretation(const Lemma& lemma, const int tag, const int name) | |
13 | +: lemma(lemma), tag(tag), nameClassifier(name) { | |
14 | + | |
15 | +} | |
16 | + | |
17 | +StringInterpretation::StringInterpretation( | |
18 | + const string& lemma, | |
19 | + const string& tag, | |
20 | + const string& name) | |
21 | +: lemma(lemma), tag(tag), name(name) { | |
22 | + | |
23 | +} | |
24 | + | |
25 | +string StringInterpretation::toString() const { | |
26 | + std::stringstream ss; | |
27 | + ss << lemma << ":" << tag << ":" << name; | |
28 | + return ss.str(); | |
29 | +} | |
30 | + | |
31 | +string LemmaConverter::convertLemma( | |
32 | + const string& orth, | |
33 | + const Lemma& lemma) const { | |
34 | + string res(orth); | |
35 | + res.erase( | |
36 | + res.end() - lemma.suffixToCut, | |
37 | + res.end()); | |
38 | + res.append(lemma.suffixToAdd); | |
39 | + return res; | |
40 | +} | |
41 | + | |
42 | +InterpretationsConverter::InterpretationsConverter(const unsigned char* data) | |
43 | +: tagset(Tagset(data)) { | |
44 | + | |
45 | +} | |
46 | + | |
47 | +StringInterpretation InterpretationsConverter::convertInterpretation( | |
48 | + const string& orth, | |
49 | + const Interpretation& interp) const { | |
50 | + string lemma = this->lemmaConverter.convertLemma(orth, interp.lemma); | |
51 | + const string& tag = this->tagset.getTag(interp.tag); | |
52 | + const string& name = this->tagset.getName(interp.nameClassifier); | |
53 | + return StringInterpretation(lemma, tag, name); | |
54 | +} | |
55 | + | |
56 | + | |
... | ... |
morfeusz/interpretations.hpp
0 → 100644
1 | +/* | |
2 | + * File: interpretations.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on November 4, 2013, 3:11 PM | |
6 | + */ | |
7 | + | |
8 | +#ifndef INTERPRETATION_HPP | |
9 | +#define INTERPRETATION_HPP | |
10 | + | |
11 | +#include <string> | |
12 | +#include <sstream> | |
13 | +#include "Tagset.hpp" | |
14 | + | |
15 | +using namespace std; | |
16 | + | |
/**
 * Recipe for turning a word form into its lemma.
 */
struct Lemma {
    int suffixToCut;          // how many trailing chars to drop from the word form
    std::string suffixToAdd;  // text appended after the cut
};
21 | + | |
22 | +struct Interpretation { | |
23 | + Interpretation(); | |
24 | + Interpretation(const Lemma& lemma, const int tag, const int name); | |
25 | + Lemma lemma; | |
26 | + int tag; // np. subst:sg:nom:m1 | |
27 | + int nameClassifier; // np. "pospolita" | |
28 | +// int qualifier; // np. "dawne" lub "potoczne" | |
29 | +}; | |
30 | + | |
/**
 * One morphological interpretation in textual form.
 *
 * Every field owns its string. Holding tag and name by reference would leave
 * them dangling whenever the object is built from temporaries or outlives the
 * object the strings were borrowed from.
 */
struct StringInterpretation {
    StringInterpretation(const std::string& lemma, const std::string& tag, const std::string& name);
    const std::string lemma;
    const std::string tag;   // e.g. subst:sg:nom:m1
    const std::string name;  // e.g. "pospolita"
//    std::string qualifier;   // e.g. "dawne" or "potoczne"
    std::string toString() const;
};
39 | + | |
40 | +class LemmaConverter { | |
41 | +public: | |
42 | + std::string convertLemma(const std::string& orth, const Lemma& interp) const; | |
43 | +}; | |
44 | + | |
45 | + | |
46 | +class InterpretationsConverter { | |
47 | +public: | |
48 | + explicit InterpretationsConverter(const unsigned char* data); | |
49 | + StringInterpretation convertInterpretation( | |
50 | + const std::string& orth, | |
51 | + const Interpretation& interp) const; | |
52 | +private: | |
53 | + LemmaConverter lemmaConverter; | |
54 | + Tagset tagset; | |
55 | +}; | |
56 | + | |
57 | +#endif /* INTERPRETATION_HPP */ | |
58 | + | |
... | ... |
morfeusz/main.cpp
fsa/test_morph.cpp renamed to morfeusz/test_morph.cpp
... | ... | @@ -5,11 +5,13 @@ |
5 | 5 | * Created on November 8, 2013, 4:12 PM |
6 | 6 | */ |
7 | 7 | |
8 | -#include <cstdlib> | |
8 | +//#include <cstdlib> | |
9 | 9 | #include <sstream> |
10 | 10 | #include <iostream> |
11 | 11 | #include "fsa.hpp" |
12 | +#include "interpretations.hpp" | |
12 | 13 | #include "utils.hpp" |
14 | +#include "MorphDeserializer.hpp" | |
13 | 15 | |
14 | 16 | using namespace std; |
15 | 17 | |
... | ... | @@ -24,7 +26,14 @@ void debug(const string& key, const vector<Interpretation> value) { |
24 | 26 | cerr << "==================" << endl; |
25 | 27 | } |
26 | 28 | |
27 | -void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) { | |
29 | +void debug(const string& key, const StringInterpretation& value) { | |
30 | + cerr << key << '\t' << value.toString() << endl; | |
31 | +} | |
32 | + | |
33 | +void doTest( | |
34 | + const FSA<vector<Interpretation>>& fsa, | |
35 | + const InterpretationsConverter& interpsConverter, | |
36 | + const char* fname) { | |
28 | 37 | ifstream ifs; |
29 | 38 | // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); |
30 | 39 | ifs.open(fname, ios::binary); |
... | ... | @@ -32,22 +41,42 @@ void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) { |
32 | 41 | while (getline(ifs, line)) { |
33 | 42 | vector<string> splitVector(split(line, '\t')); |
34 | 43 | string key = splitVector[0]; |
44 | + string lemma = splitVector[1]; | |
45 | + string tag = splitVector[2]; | |
46 | + string name = splitVector[3]; | |
35 | 47 | vector<Interpretation> value2; |
36 | 48 | fsa.tryToRecognize(key.c_str(), value2); |
37 | - debug(key, value2); | |
49 | + vector<StringInterpretation> parsedValues; | |
50 | + bool found = false; | |
51 | + for (Interpretation interp: value2) { | |
52 | + StringInterpretation parsedValue = interpsConverter.convertInterpretation(key, interp); | |
53 | +// parsedValues.push_back(parsedValue); | |
54 | + debug(key, parsedValue); | |
55 | + if (lemma == parsedValue.lemma && tag == parsedValue.tag && name == parsedValue.name) { | |
56 | + found = true; | |
57 | + } | |
58 | + } | |
59 | + validate(found, "Failed to recognize " + key + " " + lemma + ":" + tag + ":" + name); | |
60 | +// debug(key, value2); | |
38 | 61 | // validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); |
39 | 62 | } |
40 | 63 | validate(ifs.eof(), "Failed to read the input file to the end"); |
41 | 64 | } |
42 | 65 | |
43 | 66 | int main(int argc, char** argv) { |
44 | - validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); | |
67 | + DEBUG("start test"); | |
68 | + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); | |
45 | 69 | const unsigned char* fsaData = readFile(argv[1]); |
46 | 70 | MorphDeserializer deserializer; |
71 | + DEBUG("will read FSA"); | |
47 | 72 | FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer); |
48 | - doTest(*fsa, argv[2]); | |
73 | + DEBUG("DONE read FSA"); | |
74 | + DEBUG("will read tagset"); | |
75 | + InterpretationsConverter converter(fsaData); | |
76 | + DEBUG("DONE read tagset"); | |
77 | + DEBUG("still alive"); | |
78 | + doTest(*fsa, converter, argv[2]); | |
49 | 79 | // cout << argc << endl; |
50 | 80 | delete fsa; |
51 | 81 | return 0; |
52 | 82 | } |
53 | - | |
... | ... |
nbproject/configurations.xml
1 | 1 | <?xml version="1.0" encoding="UTF-8"?> |
2 | 2 | <configurationDescriptor version="90"> |
3 | 3 | <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> |
4 | + <logicalFolder name="2.8.11.2" | |
5 | + displayName="2.8.11.2" | |
6 | + projectFiles="true" | |
7 | + root="build/CMakeFiles/2.8.11.2"> | |
8 | + <logicalFolder name="CompilerIdC" displayName="CompilerIdC" projectFiles="true"> | |
9 | + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c</itemPath> | |
10 | + </logicalFolder> | |
11 | + <logicalFolder name="CompilerIdCXX" | |
12 | + displayName="CompilerIdCXX" | |
13 | + projectFiles="true"> | |
14 | + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp</itemPath> | |
15 | + </logicalFolder> | |
16 | + </logicalFolder> | |
4 | 17 | <df root="fsa" name="0"> |
5 | - <in>cfsa1_impl.hpp</in> | |
6 | - <in>cfsa2_impl.hpp</in> | |
7 | - <in>interpretation.hpp</in> | |
8 | - <in>simplefsa_impl.hpp</in> | |
9 | - <in>test_morph.cpp</in> | |
10 | 18 | <in>test_not_recognize.cpp</in> |
11 | 19 | <in>test_recognize.cpp</in> |
12 | 20 | <in>test_speed.cpp</in> |
13 | 21 | </df> |
22 | + <logicalFolder name="Modules" | |
23 | + displayName="Modules" | |
24 | + projectFiles="true" | |
25 | + root="/usr/share/cmake-2.8/Modules"> | |
26 | + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c</itemPath> | |
27 | + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp</itemPath> | |
28 | + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCompilerABI.h</itemPath> | |
29 | + </logicalFolder> | |
14 | 30 | <df root="morfeusz" name="1"> |
31 | + <in>interpretations.cpp</in> | |
15 | 32 | <in>main.cpp</in> |
16 | 33 | <in>morfeusz.cpp</in> |
17 | 34 | </df> |
... | ... | @@ -22,7 +39,6 @@ |
22 | 39 | <itemPath>CMakeLists.txt</itemPath> |
23 | 40 | <itemPath>build/Makefile</itemPath> |
24 | 41 | </logicalFolder> |
25 | - <itemPath>cfsa1_impl.hpp</itemPath> | |
26 | 42 | </logicalFolder> |
27 | 43 | <sourceFolderFilter>^(nbproject)$</sourceFolderFilter> |
28 | 44 | <sourceRootList> |
... | ... | @@ -44,10 +60,41 @@ |
44 | 60 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
45 | 61 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
46 | 62 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
47 | - <executablePath>build/fsa/test_dict</executablePath> | |
63 | + <executablePath>build/fsa/test_speed</executablePath> | |
64 | + <cTool> | |
65 | + <incDir> | |
66 | + <pElem>build/CMakeFiles/CMakeTmp</pElem> | |
67 | + </incDir> | |
68 | + </cTool> | |
48 | 69 | </makeTool> |
49 | 70 | </makefileType> |
50 | - <item path="cfsa1_impl.hpp" ex="false" tool="3" flavor2="0"> | |
71 | + <item path="/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c" | |
72 | + ex="false" | |
73 | + tool="0" | |
74 | + flavor2="2"> | |
75 | + <cTool> | |
76 | + </cTool> | |
77 | + </item> | |
78 | + <item path="/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp" | |
79 | + ex="false" | |
80 | + tool="1" | |
81 | + flavor2="4"> | |
82 | + <ccTool> | |
83 | + </ccTool> | |
84 | + </item> | |
85 | + <item path="build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c" | |
86 | + ex="false" | |
87 | + tool="0" | |
88 | + flavor2="2"> | |
89 | + <cTool> | |
90 | + </cTool> | |
91 | + </item> | |
92 | + <item path="build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp" | |
93 | + ex="false" | |
94 | + tool="1" | |
95 | + flavor2="4"> | |
96 | + <ccTool> | |
97 | + </ccTool> | |
51 | 98 | </item> |
52 | 99 | <folder path="0"> |
53 | 100 | <ccTool> |
... | ... | @@ -56,23 +103,27 @@ |
56 | 103 | </incDir> |
57 | 104 | </ccTool> |
58 | 105 | </folder> |
59 | - <folder path="1"> | |
106 | + <folder path="2.8.11.2"> | |
107 | + <ccTool> | |
108 | + <incDir> | |
109 | + <pElem>build/CMakeFiles/CMakeTmp</pElem> | |
110 | + </incDir> | |
111 | + </ccTool> | |
112 | + </folder> | |
113 | + <folder path="Modules"> | |
114 | + <ccTool> | |
115 | + <incDir> | |
116 | + <pElem>build/CMakeFiles/CMakeTmp</pElem> | |
117 | + </incDir> | |
118 | + </ccTool> | |
119 | + </folder> | |
120 | + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> | |
60 | 121 | <ccTool> |
61 | 122 | <incDir> |
62 | 123 | <pElem>fsa</pElem> |
63 | 124 | <pElem>build/morfeusz</pElem> |
64 | 125 | </incDir> |
65 | 126 | </ccTool> |
66 | - </folder> | |
67 | - <item path="fsa/cfsa1_impl.hpp" ex="false" tool="3" flavor2="0"> | |
68 | - </item> | |
69 | - <item path="fsa/cfsa2_impl.hpp" ex="false" tool="3" flavor2="0"> | |
70 | - </item> | |
71 | - <item path="fsa/interpretation.hpp" ex="false" tool="3" flavor2="0"> | |
72 | - </item> | |
73 | - <item path="fsa/simplefsa_impl.hpp" ex="false" tool="3" flavor2="0"> | |
74 | - </item> | |
75 | - <item path="fsa/test_morph.cpp" ex="false" tool="1" flavor2="0"> | |
76 | 127 | </item> |
77 | 128 | <item path="fsa/test_not_recognize.cpp" ex="false" tool="1" flavor2="8"> |
78 | 129 | <ccTool> |
... | ... | @@ -86,12 +137,53 @@ |
86 | 137 | <ccTool> |
87 | 138 | </ccTool> |
88 | 139 | </item> |
140 | + <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> | |
141 | + <ccTool> | |
142 | + <incDir> | |
143 | + <pElem>fsa</pElem> | |
144 | + <pElem>build/morfeusz</pElem> | |
145 | + </incDir> | |
146 | + </ccTool> | |
147 | + </item> | |
148 | + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | |
149 | + <ccTool> | |
150 | + <incDir> | |
151 | + <pElem>fsa</pElem> | |
152 | + <pElem>build/morfeusz</pElem> | |
153 | + </incDir> | |
154 | + </ccTool> | |
155 | + </item> | |
156 | + <item path="morfeusz/interpretations.cpp" ex="false" tool="1" flavor2="8"> | |
157 | + <ccTool> | |
158 | + <incDir> | |
159 | + <pElem>fsa</pElem> | |
160 | + <pElem>build/morfeusz</pElem> | |
161 | + </incDir> | |
162 | + </ccTool> | |
163 | + </item> | |
89 | 164 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
90 | 165 | <ccTool> |
166 | + <incDir> | |
167 | + <pElem>fsa</pElem> | |
168 | + <pElem>build/morfeusz</pElem> | |
169 | + </incDir> | |
91 | 170 | </ccTool> |
92 | 171 | </item> |
93 | 172 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
94 | 173 | <ccTool> |
174 | + <incDir> | |
175 | + <pElem>build/CMakeFiles/CMakeTmp</pElem> | |
176 | + <pElem>fsa</pElem> | |
177 | + <pElem>build/morfeusz</pElem> | |
178 | + </incDir> | |
179 | + </ccTool> | |
180 | + </item> | |
181 | + <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> | |
182 | + <ccTool> | |
183 | + <incDir> | |
184 | + <pElem>fsa</pElem> | |
185 | + <pElem>build/morfeusz</pElem> | |
186 | + </incDir> | |
95 | 187 | </ccTool> |
96 | 188 | </item> |
97 | 189 | </conf> |
... | ... |
nbproject/project.xml
... | ... | @@ -4,9 +4,9 @@ |
4 | 4 | <configuration> |
5 | 5 | <data xmlns="http://www.netbeans.org/ns/make-project/1"> |
6 | 6 | <name>morfeusz</name> |
7 | - <c-extensions/> | |
7 | + <c-extensions>c</c-extensions> | |
8 | 8 | <cpp-extensions>cpp</cpp-extensions> |
9 | - <header-extensions>hpp</header-extensions> | |
9 | + <header-extensions>h,hpp</header-extensions> | |
10 | 10 | <sourceEncoding>UTF-8</sourceEncoding> |
11 | 11 | <make-dep-projects/> |
12 | 12 | <sourceRootList> |
... | ... |