Commit 4bf99e6b94697f082572eb35c0dcdd60c3711761

Authored by Michał Lenart
1 parent d42f73bc

- prawie działa rozpoznawanie informacji morfologicznej

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@16 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsa/CMakeLists.txt
1 1  
2   -add_executable (test_speed test_speed.cpp)
3   -add_executable (test_speed_profile test_speed.cpp)
4   -add_executable (test_recognize test_recognize.cpp)
5   -add_executable (test_not_recognize test_not_recognize.cpp)
6   -add_executable (test_morph test_morph.cpp)
  2 +add_executable (test_speed test_speed.cpp const.cpp)
  3 +add_executable (test_speed_profile test_speed.cpp const.cpp)
  4 +add_executable (test_recognize test_recognize.cpp const.cpp)
  5 +add_executable (test_not_recognize test_not_recognize.cpp const.cpp)
  6 +
7 7 set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align" )
8 8 set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" )
9 9 set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
10 10 set_target_properties ( test_not_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
11   -set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
... ...
fsa/cfsa1_impl.hpp
... ... @@ -79,7 +79,7 @@ void CompressedFSA1<T>::doProceedToNextByList(
79 79 TransitionData2 td;
80 80 for (unsigned int i = 0; i < transitionsNum; i++) {
81 81 // const_cast<Counter*>(&counter)->increment(1);
82   - td = *((TransitionData2*) currPtr);
  82 + td = *(reinterpret_cast<const TransitionData2*>(currPtr));
83 83 if (td.shortLabel == shortLabel) {
84 84 if (shortLabel == 0) {
85 85 currPtr++;
... ... @@ -107,7 +107,8 @@ void CompressedFSA1<T>::doProceedToNextByList(
107 107 if (!found) {
108 108 // cerr << "SINK for " << c << endl;
109 109 state.setNextAsSink();
110   - } else {
  110 + }
  111 + else {
111 112 currPtr++;
112 113 // cerr << "offset size " << td.offsetSize << endl;
113 114 // cerr << "offset " << offset << endl;
... ... @@ -152,12 +153,12 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
152 153 // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
153 154 const unsigned char* fromPointer = this->initialStatePtr + state.getOffset();
154 155 unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
155   - unsigned int transitionsTableOffset = 1;
  156 + unsigned long transitionsTableOffset = 1;
156 157 if (state.isAccepting()) {
157 158 transitionsTableOffset += state.getValueSize();
158 159 // cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
159 160 }
160   - StateData2* sd = (StateData2*) (fromPointer);
  161 + const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer);
161 162 // cerr << "transitions num=" << sd->transitionsNum << endl;
162 163 if (sd->array) {
163 164 if (shortLabel > 0) {
... ...
fsa/cfsa2_impl.hpp
... ... @@ -135,7 +135,7 @@ void CompressedFSA2<T>::proceedToNext(const char c, State<T>& state) const {
135 135 cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
136 136 #endif
137 137 const unsigned char* fromPointer = this->initialStatePtr + state.getOffset();
138   - unsigned int transitionsTableOffset = 0;
  138 + unsigned long transitionsTableOffset = 0;
139 139 if (state.isAccepting()) {
140 140 transitionsTableOffset += state.getValueSize();
141 141 }
... ...
fsa/const.cpp 0 → 100644
  1 +
  2 +#include "const.hpp"
  3 +
  4 +extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
  5 +extern const uint8_t VERSION_NUM = 9;
  6 +
  7 +extern const unsigned int VERSION_NUM_OFFSET = 4;
  8 +extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
  9 +extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET = 6;
  10 +extern const unsigned int ADDITIONAL_DATA_OFFSET = 10;
... ...
fsa/const.hpp 0 → 100644
  1 +/*
  2 + * File: const.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 listopad 2013, 14:11
  6 + */
  7 +
  8 +#ifndef CONST_HPP
  9 +#define CONST_HPP
  10 +
  11 +#include <netinet/in.h>
  12 +
  13 +extern const uint32_t MAGIC_NUMBER;
  14 +extern const uint8_t VERSION_NUM;
  15 +
  16 +extern const unsigned int VERSION_NUM_OFFSET;
  17 +extern const unsigned int IMPLEMENTATION_NUM_OFFSET;
  18 +extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET;
  19 +extern const unsigned int ADDITIONAL_DATA_OFFSET;
  20 +
  21 +#endif /* CONST_HPP */
  22 +
... ...
fsa/fsa.hpp
... ... @@ -9,13 +9,12 @@
9 9 #define FSA_HPP
10 10  
11 11 //#include <iostream>
12   -//#include <cstring>
13   -#include <typeinfo>
  12 +#include <cstring>
14 13 #include <cassert>
  14 +#include <typeinfo>
15 15 #include <exception>
16 16 #include <string>
17 17 #include <vector>
18   -#include "interpretation.hpp"
19 18  
20 19 template <class T> class State;
21 20 template <class T> class FSA;
... ... @@ -44,16 +43,12 @@ public:
44 43 * Returns number of bytes read or -1 on error.
45 44 */
46 45 long deserialize(const unsigned char* ptr, char*& text) const {
47   - // text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
48   - // return strlen(text) + 1;
49   - return 1;
  46 + text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
  47 + return strlen(text) + 1;
  48 +// return 1;
50 49 }
51 50 };
52 51  
53   -class MorphDeserializer: public Deserializer<std::vector<Interpretation>> {
54   - long deserialize(const unsigned char* ptr, std::vector<Interpretation>& interp) const;
55   -};
56   -
57 52 class Counter {
58 53 public:
59 54  
... ... @@ -88,8 +83,6 @@ public:
88 83 */
89 84 static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
90 85  
91   - static const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
92   - static const uint8_t VERSION_NUM = 8;
93 86 protected:
94 87  
95 88 /**
... ... @@ -105,10 +98,6 @@ protected:
105 98 const Deserializer<T>& deserializer;
106 99 friend class State<T>;
107 100 private:
108   - static int getMagicNumberOffset();
109   - static int getVersionNumOffset();
110   - static int getPopularCharsOffset();
111   - static int getInitialStateOffset();
112 101 // FSA();
113 102 };
114 103  
... ... @@ -220,12 +209,12 @@ public:
220 209 * Makes sense only for accepting states.
221 210 * For non-accepting states is throws an exception.
222 211 */
223   - unsigned int getValueSize() const;
  212 + unsigned long getValueSize() const;
224 213  
225 214 unsigned long getOffset() const;
226 215  
227 216 void setNext(const unsigned long offset);
228   - void setNext(const unsigned long offset, const T& value, const unsigned int valueSize);
  217 + void setNext(const unsigned long offset, const T& value, const unsigned long valueSize);
229 218 void setNextAsSink();
230 219  
231 220 explicit State(const FSA<T>& fsa);
... ... @@ -237,7 +226,7 @@ private:
237 226 bool accepting;
238 227 bool sink;
239 228 T value;
240   - int valueSize;
  229 + long valueSize;
241 230 };
242 231  
243 232 class FSAException : public std::exception {
... ...
fsa/fsa_impl.hpp
... ... @@ -14,14 +14,11 @@
14 14 #include <iostream>
15 15 #include <vector>
16 16 #include <netinet/in.h>
17   -#include "fsa.hpp"
18 17 #include "utils.hpp"
  18 +#include "const.hpp"
19 19  
20 20 using namespace std;
21   -
22   -static const unsigned int VERSION_NUM_OFFSET = 4;
23   -static const unsigned int IMPLEMENTATION_NUM_OFFSET = 5;
24   -static const unsigned int FSA_OFFSET = 6;
  21 +//static const unsigned int FSA_OFFSET = 6;
25 22  
26 23 template <class T>
27 24 bool FSA<T>::tryToRecognize(const char* input, T& value) const {
... ... @@ -73,7 +70,9 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
73 70 }
74 71  
75 72 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET);
76   - const unsigned char* startPtr = ptr + FSA_OFFSET;
  73 +
  74 + uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET)));
  75 + const unsigned char* startPtr = ptr + ADDITIONAL_DATA_OFFSET + additionalDataSize;
77 76 switch (implementationNum) {
78 77 case 0:
79 78 return new SimpleFSA<T>(startPtr, deserializer);
... ... @@ -86,34 +85,4 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
86 85 }
87 86 }
88 87  
89   -static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) {
90   - // XXX uważać na poprawność danych
91   - lemma.suffixToCut = *ptr;
92   - ptr++;
93   - lemma.suffixToAdd = (const char*) ptr;
94   - ptr += strlen((const char*) ptr) + 1;
95   -}
96   -
97   -static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) {
98   - deserializeLemma(ptr, interp.lemma);
99   - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
100   - ptr += 2;
101   - interp.nameClassifier = *ptr;
102   - ptr++;
103   -}
104   -
105   -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const {
106   - const unsigned char* currPtr = ptr;
107   - uint8_t interpsNum = *ptr;
108   - interps.clear();
109   - interps.reserve(interpsNum);
110   - currPtr++;
111   - for (unsigned int i = 0; i < interpsNum; i++) {
112   - Interpretation interp;
113   - deserializeInterp(currPtr, interp);
114   - interps.push_back(interp);
115   - }
116   - return currPtr - ptr;
117   -}
118   -
119 88 #endif /* _SIMPLE_FSA_IMPL_HPP */
... ...
fsa/interpretation.hpp deleted
1   -/*
2   - * File: interpretation.hpp
3   - * Author: mlenart
4   - *
5   - * Created on November 4, 2013, 3:11 PM
6   - */
7   -
8   -#ifndef INTERPRETATION_HPP
9   -#define INTERPRETATION_HPP
10   -
11   -#include <string>
12   -#include <list>
13   -
14   -using namespace std;
15   -
16   -struct Lemma {
17   - unsigned short suffixToCut;
18   - const char* suffixToAdd;
19   -};
20   -
21   -struct Interpretation {
22   - Lemma lemma;
23   - unsigned int tag; // np. subst:sg:nom:m1
24   - unsigned short nameClassifier; // np. "pospolita"
25   - unsigned short qualifier; // np. "dawne" lub "potoczne"
26   -};
27   -
28   -#endif /* INTERPRETATION_HPP */
29   -
fsa/simplefsa_impl.hpp
... ... @@ -37,7 +37,7 @@ static unsigned int decodeOffset(const unsigned char* ptr) {
37 37 template <class T>
38 38 void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
39 39 const unsigned char* fromPointer = this->initialStatePtr + state.getOffset();
40   - int transitionsTableOffset = sizeof (StateData);
  40 + long transitionsTableOffset = sizeof (StateData);
41 41 if (state.isAccepting()) {
42 42 transitionsTableOffset += state.getValueSize();
43 43 }
... ... @@ -60,7 +60,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
60 60 const StateData* nextStateData = reinterpret_cast<const StateData*>(nextStatePointer);
61 61 if (nextStateData->accepting) {
62 62 T object;
63   - int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
  63 + long size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
64 64 state.setNext(offset, object, size);
65 65 } else {
66 66 state.setNext(offset);
... ...
fsa/state_impl.hpp
... ... @@ -51,7 +51,7 @@ T State<T>::getValue() const {
51 51 }
52 52  
53 53 template <class T>
54   -unsigned int State<T>::getValueSize() const {
  54 +unsigned long State<T>::getValueSize() const {
55 55 assert(this->isAccepting());
56 56 return this->valueSize;
57 57 }
... ... @@ -69,7 +69,7 @@ void State<T>::setNext(const unsigned long offset) {
69 69 }
70 70  
71 71 template <class T>
72   -void State<T>::setNext(const unsigned long offset, const T& value, const unsigned int valueSize) {
  72 +void State<T>::setNext(const unsigned long offset, const T& value, const unsigned long valueSize) {
73 73 // assert(!this->isSink());
74 74 this->offset = offset;
75 75 this->accepting = true;
... ...
fsa/utils.hpp
... ... @@ -9,14 +9,15 @@
9 9 #define UTILS_HPP
10 10  
11 11 #include <iostream>
  12 +#include <fstream>
12 13 #include <sstream>
13 14 #include <string>
14 15 #include <fstream>
15 16 #include <vector>
16 17  
17   -using namespace std;
  18 +//using namespace std;
18 19  
19   -//#define DEBUG_BUILD
  20 +#define DEBUG_BUILD
20 21  
21 22 #ifdef DEBUG_BUILD
22 23 # define DEBUG(x) do { std::cerr << x << std::endl; } while (0)
... ... @@ -24,14 +25,14 @@ using namespace std;
24 25 # define DEBUG(x)
25 26 #endif
26 27  
27   -void validate(const bool cond, const std::string& msg) {
  28 +inline void validate(const bool cond, const std::string& msg) {
28 29 if (!cond) {
29 30 std::cerr << msg << std::endl;
30 31 exit(1);
31 32 }
32 33 }
33 34  
34   -std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
  35 +inline std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
35 36 std::stringstream ss(s);
36 37 std::string item;
37 38 while (std::getline(ss, item, delim)) {
... ... @@ -41,25 +42,25 @@ std::vector<std::string> &split(const std::string &s, char delim, std::vector<st
41 42 }
42 43  
43 44  
44   -std::vector<std::string> split(const std::string &s, char delim) {
  45 +inline std::vector<std::string> split(const std::string &s, char delim) {
45 46 std::vector<std::string> elems;
46 47 split(s, delim, elems);
47 48 return elems;
48 49 }
49 50  
50   -string &rtrim(string &s) {
51   - s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end());
52   - return s;
53   -}
  51 +//string &rtrim(string &s) {
  52 +// s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end());
  53 +// return s;
  54 +//}
54 55  
55   -unsigned char* readFile(const char* fname) {
56   - ifstream ifs;
  56 +inline unsigned char* readFile(const char* fname) {
  57 + std::ifstream ifs;
57 58 ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
58   - ifs.open(fname, ios::in | ios::binary | ios::ate);
  59 + ifs.open(fname, std::ios::in | std::ios::binary | std::ios::ate);
59 60 // if (ifs.is_open()) {
60   - int size = ifs.tellg();
  61 + long size = ifs.tellg();
61 62 unsigned char* memblock = new unsigned char [size];
62   - ifs.seekg(0, ios::beg);
  63 + ifs.seekg(0, std::ios::beg);
63 64 ifs.read(reinterpret_cast<char*> (memblock), size);
64 65 ifs.close();
65 66 return memblock;
... ...
morfeusz/CMakeLists.txt
... ... @@ -6,7 +6,10 @@
6 6 include_directories (${Morfeusz_SOURCE_DIR}/fsa)
7 7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 8 add_executable (morfeusz2_analyze main.cpp)
  9 +add_executable (test_morph test_morph.cpp interpretations.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp)
9 10  
10 11 # Link the executable to the Hello library.
11 12 target_link_libraries (morfeusz2_analyze morfeusz2)
12 13 set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" )
  14 +
  15 +set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
... ...
morfeusz/MorphDeserializer.cpp 0 → 100644
  1 +/*
  2 + * File: MorphDeserializer.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 listopad 2013, 15:31
  6 + */
  7 +
  8 +#include "MorphDeserializer.hpp"
  9 +
  10 +MorphDeserializer::MorphDeserializer() {
  11 +}
  12 +
  13 +MorphDeserializer::MorphDeserializer(const MorphDeserializer& orig) {
  14 +}
  15 +
  16 +MorphDeserializer::~MorphDeserializer() {
  17 +}
  18 +
  19 +static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) {
  20 + // XXX uważać na poprawność danych
  21 + lemma.suffixToCut = *ptr;
  22 + ptr++;
  23 + lemma.suffixToAdd = (const char*) ptr;
  24 + ptr += strlen((const char*) ptr) + 1;
  25 +}
  26 +
  27 +static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) {
  28 + deserializeLemma(ptr, interp.lemma);
  29 + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  30 + ptr += 2;
  31 + interp.nameClassifier = *ptr;
  32 + ptr++;
  33 +}
  34 +
  35 +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const {
  36 + const unsigned char* currPtr = ptr;
  37 + uint8_t interpsNum = *ptr;
  38 + interps.clear();
  39 + interps.reserve(interpsNum);
  40 + currPtr++;
  41 + for (unsigned int i = 0; i < interpsNum; i++) {
  42 + Interpretation interp;
  43 + deserializeInterp(currPtr, interp);
  44 + interps.push_back(interp);
  45 + }
  46 + return currPtr - ptr;
  47 +}
  48 +
... ...
morfeusz/MorphDeserializer.hpp 0 → 100644
  1 +/*
  2 + * File: MorphDeserializer.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 listopad 2013, 15:31
  6 + */
  7 +
  8 +#ifndef MORPHDESERIALIZER_HPP
  9 +#define MORPHDESERIALIZER_HPP
  10 +
  11 +#include <vector>
  12 +#include "fsa.hpp"
  13 +#include "interpretations.hpp"
  14 +
  15 +class MorphDeserializer: public Deserializer<std::vector<Interpretation>> {
  16 +public:
  17 + MorphDeserializer();
  18 + MorphDeserializer(const MorphDeserializer& orig);
  19 + virtual ~MorphDeserializer();
  20 + long deserialize(
  21 + const unsigned char* ptr,
  22 + std::vector<Interpretation>& interps) const;
  23 +private:
  24 +
  25 +};
  26 +
  27 +#endif /* MORPHDESERIALIZER_HPP */
  28 +
... ...
morfeusz/Tagset.cpp 0 → 100644
  1 +
  2 +#include <string>
  3 +#include <netinet/in.h>
  4 +#include "Tagset.hpp"
  5 +#include "const.hpp"
  6 +#include "utils.hpp"
  7 +
  8 +using namespace std;
  9 +
  10 +static uint16_t readInt16(const unsigned char*& currPtr) {
  11 + DEBUG("readInt16");
  12 + uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr));
  13 + DEBUG("still alive " + to_string(res));
  14 + currPtr += 2;
  15 + DEBUG("still alive after ptr add");
  16 + return res;
  17 +}
  18 +
  19 +static string readString(const unsigned char*& currPtr) {
  20 + DEBUG("readString");
  21 + string res(reinterpret_cast<const char*>(currPtr));
  22 + currPtr += res.length();
  23 + currPtr++;
  24 + return res;
  25 +}
  26 +
  27 +static void readTags(const unsigned char*& currPtr, vector<string>& tags) {
  28 + tags.clear();
  29 + tags.resize(65536);
  30 + uint16_t tagsNum = readInt16(currPtr);
  31 + DEBUG("hi there");
  32 + DEBUG("tagsNum="+to_string((int) tagsNum));
  33 + for (unsigned int i = 0; i < tagsNum; i++) {
  34 + unsigned int tagNum = readInt16(currPtr);
  35 + tags[tagNum] = readString(currPtr);
  36 + }
  37 +}
  38 +
  39 +Tagset::Tagset(const unsigned char* fsaData) {
  40 + const unsigned char* currPtr = fsaData + ADDITIONAL_DATA_OFFSET;
  41 +// uint32_t tagsNum = ntohl(*reinterpret_cast<const uint32_t*>(currPtr));
  42 +// uint32_t namesNum = ntohl(*reinterpret_cast<const uint32_t*>(fsaData + ADDITIONAL_DATA_OFFSET + 4));
  43 +// const unsigned char* currPtr = fsaData + 8;
  44 + DEBUG("will read tags");
  45 + readTags(currPtr, this->tags);
  46 + DEBUG("will read names");
  47 + readTags(currPtr, this->names);
  48 +}
  49 +
  50 +const string& Tagset::getTag(const int tagNum) const {
  51 + return this->tags.at(tagNum);
  52 +}
  53 +
  54 +const string& Tagset::getName(const int nameNum) const {
  55 + return this->names.at(nameNum);
  56 +}
... ...
morfeusz/Tagset.hpp 0 → 100644
  1 +/*
  2 + * File: tagset.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 12 listopad 2013, 14:09
  6 + */
  7 +
  8 +#ifndef TAGSET_HPP
  9 +#define TAGSET_HPP
  10 +
  11 +#include <string>
  12 +#include <vector>
  13 +
  14 +class Tagset {
  15 +public:
  16 + explicit Tagset(const unsigned char* fsaData);
  17 + const std::string& getTag(const int tagNum) const;
  18 + const std::string& getName(const int nameNum) const;
  19 +private:
  20 + std::vector<std::string> tags;
  21 + std::vector<std::string> names;
  22 +};
  23 +
  24 +#endif /* TAGSET_HPP */
  25 +
... ...
morfeusz/interpretations.cpp 0 → 100644
  1 +
  2 +#include "interpretations.hpp"
  3 +#include "Tagset.hpp"
  4 +
  5 +using namespace std;
  6 +
  7 +Interpretation::Interpretation()
  8 +: lemma(), tag(), nameClassifier() {
  9 +
  10 +}
  11 +
  12 +Interpretation::Interpretation(const Lemma& lemma, const int tag, const int name)
  13 +: lemma(lemma), tag(tag), nameClassifier(name) {
  14 +
  15 +}
  16 +
  17 +StringInterpretation::StringInterpretation(
  18 + const string& lemma,
  19 + const string& tag,
  20 + const string& name)
  21 +: lemma(lemma), tag(tag), name(name) {
  22 +
  23 +}
  24 +
  25 +string StringInterpretation::toString() const {
  26 + std::stringstream ss;
  27 + ss << lemma << ":" << tag << ":" << name;
  28 + return ss.str();
  29 +}
  30 +
  31 +string LemmaConverter::convertLemma(
  32 + const string& orth,
  33 + const Lemma& lemma) const {
  34 + string res(orth);
  35 + res.erase(
  36 + res.end() - lemma.suffixToCut,
  37 + res.end());
  38 + res.append(lemma.suffixToAdd);
  39 + return res;
  40 +}
  41 +
  42 +InterpretationsConverter::InterpretationsConverter(const unsigned char* data)
  43 +: tagset(Tagset(data)) {
  44 +
  45 +}
  46 +
  47 +StringInterpretation InterpretationsConverter::convertInterpretation(
  48 + const string& orth,
  49 + const Interpretation& interp) const {
  50 + string lemma = this->lemmaConverter.convertLemma(orth, interp.lemma);
  51 + const string& tag = this->tagset.getTag(interp.tag);
  52 + const string& name = this->tagset.getName(interp.nameClassifier);
  53 + return StringInterpretation(lemma, tag, name);
  54 +}
  55 +
  56 +
... ...
morfeusz/interpretations.hpp 0 → 100644
  1 +/*
  2 + * File: interpretation.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on November 4, 2013, 3:11 PM
  6 + */
  7 +
  8 +#ifndef INTERPRETATION_HPP
  9 +#define INTERPRETATION_HPP
  10 +
  11 +#include <string>
  12 +#include <sstream>
  13 +#include "Tagset.hpp"
  14 +
  15 +using namespace std;
  16 +
  17 +struct Lemma {
  18 + int suffixToCut;
  19 + string suffixToAdd;
  20 +};
  21 +
  22 +struct Interpretation {
  23 + Interpretation();
  24 + Interpretation(const Lemma& lemma, const int tag, const int name);
  25 + Lemma lemma;
  26 + int tag; // np. subst:sg:nom:m1
  27 + int nameClassifier; // np. "pospolita"
  28 +// int qualifier; // np. "dawne" lub "potoczne"
  29 +};
  30 +
  31 +struct StringInterpretation {
  32 + StringInterpretation(const std::string& lemma, const std::string& tag, const std::string& name);
  33 + const std::string lemma;
  34 + const std::string& tag; // np. subst:sg:nom:m1
  35 + const std::string& name; // np. "pospolita"
  36 +// std::string qualifier; // np. "dawne" lub "potoczne"
  37 + std::string toString() const;
  38 +};
  39 +
  40 +class LemmaConverter {
  41 +public:
  42 + std::string convertLemma(const std::string& orth, const Lemma& interp) const;
  43 +};
  44 +
  45 +
  46 +class InterpretationsConverter {
  47 +public:
  48 + explicit InterpretationsConverter(const unsigned char* data);
  49 + StringInterpretation convertInterpretation(
  50 + const std::string& orth,
  51 + const Interpretation& interp) const;
  52 +private:
  53 + LemmaConverter lemmaConverter;
  54 + Tagset tagset;
  55 +};
  56 +
  57 +#endif /* INTERPRETATION_HPP */
  58 +
... ...
morfeusz/main.cpp
... ... @@ -9,6 +9,7 @@
9 9 #include <iostream>
10 10 #include "fsa.hpp"
11 11 #include "default_fsa.hpp"
  12 +#include "Tagset.hpp"
12 13  
13 14 using namespace std;
14 15  
... ...
fsa/test_morph.cpp renamed to morfeusz/test_morph.cpp
... ... @@ -5,11 +5,13 @@
5 5 * Created on November 8, 2013, 4:12 PM
6 6 */
7 7  
8   -#include <cstdlib>
  8 +//#include <cstdlib>
9 9 #include <sstream>
10 10 #include <iostream>
11 11 #include "fsa.hpp"
  12 +#include "interpretations.hpp"
12 13 #include "utils.hpp"
  14 +#include "MorphDeserializer.hpp"
13 15  
14 16 using namespace std;
15 17  
... ... @@ -24,7 +26,14 @@ void debug(const string& key, const vector<Interpretation> value) {
24 26 cerr << "==================" << endl;
25 27 }
26 28  
27   -void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) {
  29 +void debug(const string& key, const StringInterpretation& value) {
  30 + cerr << key << '\t' << value.toString() << endl;
  31 +}
  32 +
  33 +void doTest(
  34 + const FSA<vector<Interpretation>>& fsa,
  35 + const InterpretationsConverter& interpsConverter,
  36 + const char* fname) {
28 37 ifstream ifs;
29 38 // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
30 39 ifs.open(fname, ios::binary);
... ... @@ -32,22 +41,42 @@ void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) {
32 41 while (getline(ifs, line)) {
33 42 vector<string> splitVector(split(line, '\t'));
34 43 string key = splitVector[0];
  44 + string lemma = splitVector[1];
  45 + string tag = splitVector[2];
  46 + string name = splitVector[3];
35 47 vector<Interpretation> value2;
36 48 fsa.tryToRecognize(key.c_str(), value2);
37   - debug(key, value2);
  49 + vector<StringInterpretation> parsedValues;
  50 + bool found = false;
  51 + for (Interpretation interp: value2) {
  52 + StringInterpretation parsedValue = interpsConverter.convertInterpretation(key, interp);
  53 +// parsedValues.push_back(parsedValue);
  54 + debug(key, parsedValue);
  55 + if (lemma == parsedValue.lemma && tag == parsedValue.tag && name == parsedValue.name) {
  56 + found = true;
  57 + }
  58 + }
  59 + validate(found, "Failed to recognize " + key + " " + lemma + ":" + tag + ":" + name);
  60 +// debug(key, value2);
38 61 // validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key);
39 62 }
40 63 validate(ifs.eof(), "Failed to read the input file to the end");
41 64 }
42 65  
43 66 int main(int argc, char** argv) {
44   - validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
  67 + DEBUG("start test");
  68 + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename.");
45 69 const unsigned char* fsaData = readFile(argv[1]);
46 70 MorphDeserializer deserializer;
  71 + DEBUG("will read FSA");
47 72 FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer);
48   - doTest(*fsa, argv[2]);
  73 + DEBUG("DONE read FSA");
  74 + DEBUG("will read tagset");
  75 + InterpretationsConverter converter(fsaData);
  76 + DEBUG("DONE read tagset");
  77 + DEBUG("still alive");
  78 + doTest(*fsa, converter, argv[2]);
49 79 // cout << argc << endl;
50 80 delete fsa;
51 81 return 0;
52 82 }
53   -
... ...
nbproject/configurations.xml
1 1 <?xml version="1.0" encoding="UTF-8"?>
2 2 <configurationDescriptor version="90">
3 3 <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
  4 + <logicalFolder name="2.8.11.2"
  5 + displayName="2.8.11.2"
  6 + projectFiles="true"
  7 + root="build/CMakeFiles/2.8.11.2">
  8 + <logicalFolder name="CompilerIdC" displayName="CompilerIdC" projectFiles="true">
  9 + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c</itemPath>
  10 + </logicalFolder>
  11 + <logicalFolder name="CompilerIdCXX"
  12 + displayName="CompilerIdCXX"
  13 + projectFiles="true">
  14 + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp</itemPath>
  15 + </logicalFolder>
  16 + </logicalFolder>
4 17 <df root="fsa" name="0">
5   - <in>cfsa1_impl.hpp</in>
6   - <in>cfsa2_impl.hpp</in>
7   - <in>interpretation.hpp</in>
8   - <in>simplefsa_impl.hpp</in>
9   - <in>test_morph.cpp</in>
10 18 <in>test_not_recognize.cpp</in>
11 19 <in>test_recognize.cpp</in>
12 20 <in>test_speed.cpp</in>
13 21 </df>
  22 + <logicalFolder name="Modules"
  23 + displayName="Modules"
  24 + projectFiles="true"
  25 + root="/usr/share/cmake-2.8/Modules">
  26 + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c</itemPath>
  27 + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp</itemPath>
  28 + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCompilerABI.h</itemPath>
  29 + </logicalFolder>
14 30 <df root="morfeusz" name="1">
  31 + <in>interpretations.cpp</in>
15 32 <in>main.cpp</in>
16 33 <in>morfeusz.cpp</in>
17 34 </df>
... ... @@ -22,7 +39,6 @@
22 39 <itemPath>CMakeLists.txt</itemPath>
23 40 <itemPath>build/Makefile</itemPath>
24 41 </logicalFolder>
25   - <itemPath>cfsa1_impl.hpp</itemPath>
26 42 </logicalFolder>
27 43 <sourceFolderFilter>^(nbproject)$</sourceFolderFilter>
28 44 <sourceRootList>
... ... @@ -44,10 +60,41 @@
44 60 <buildCommandWorkingDir>build</buildCommandWorkingDir>
45 61 <buildCommand>${MAKE} -f Makefile</buildCommand>
46 62 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
47   - <executablePath>build/fsa/test_dict</executablePath>
  63 + <executablePath>build/fsa/test_speed</executablePath>
  64 + <cTool>
  65 + <incDir>
  66 + <pElem>build/CMakeFiles/CMakeTmp</pElem>
  67 + </incDir>
  68 + </cTool>
48 69 </makeTool>
49 70 </makefileType>
50   - <item path="cfsa1_impl.hpp" ex="false" tool="3" flavor2="0">
  71 + <item path="/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c"
  72 + ex="false"
  73 + tool="0"
  74 + flavor2="2">
  75 + <cTool>
  76 + </cTool>
  77 + </item>
  78 + <item path="/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp"
  79 + ex="false"
  80 + tool="1"
  81 + flavor2="4">
  82 + <ccTool>
  83 + </ccTool>
  84 + </item>
  85 + <item path="build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c"
  86 + ex="false"
  87 + tool="0"
  88 + flavor2="2">
  89 + <cTool>
  90 + </cTool>
  91 + </item>
  92 + <item path="build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp"
  93 + ex="false"
  94 + tool="1"
  95 + flavor2="4">
  96 + <ccTool>
  97 + </ccTool>
51 98 </item>
52 99 <folder path="0">
53 100 <ccTool>
... ... @@ -56,23 +103,27 @@
56 103 </incDir>
57 104 </ccTool>
58 105 </folder>
59   - <folder path="1">
  106 + <folder path="2.8.11.2">
  107 + <ccTool>
  108 + <incDir>
  109 + <pElem>build/CMakeFiles/CMakeTmp</pElem>
  110 + </incDir>
  111 + </ccTool>
  112 + </folder>
  113 + <folder path="Modules">
  114 + <ccTool>
  115 + <incDir>
  116 + <pElem>build/CMakeFiles/CMakeTmp</pElem>
  117 + </incDir>
  118 + </ccTool>
  119 + </folder>
  120 + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8">
60 121 <ccTool>
61 122 <incDir>
62 123 <pElem>fsa</pElem>
63 124 <pElem>build/morfeusz</pElem>
64 125 </incDir>
65 126 </ccTool>
66   - </folder>
67   - <item path="fsa/cfsa1_impl.hpp" ex="false" tool="3" flavor2="0">
68   - </item>
69   - <item path="fsa/cfsa2_impl.hpp" ex="false" tool="3" flavor2="0">
70   - </item>
71   - <item path="fsa/interpretation.hpp" ex="false" tool="3" flavor2="0">
72   - </item>
73   - <item path="fsa/simplefsa_impl.hpp" ex="false" tool="3" flavor2="0">
74   - </item>
75   - <item path="fsa/test_morph.cpp" ex="false" tool="1" flavor2="0">
76 127 </item>
77 128 <item path="fsa/test_not_recognize.cpp" ex="false" tool="1" flavor2="8">
78 129 <ccTool>
... ... @@ -86,12 +137,53 @@
86 137 <ccTool>
87 138 </ccTool>
88 139 </item>
  140 + <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
  141 + <ccTool>
  142 + <incDir>
  143 + <pElem>fsa</pElem>
  144 + <pElem>build/morfeusz</pElem>
  145 + </incDir>
  146 + </ccTool>
  147 + </item>
  148 + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
  149 + <ccTool>
  150 + <incDir>
  151 + <pElem>fsa</pElem>
  152 + <pElem>build/morfeusz</pElem>
  153 + </incDir>
  154 + </ccTool>
  155 + </item>
  156 + <item path="morfeusz/interpretations.cpp" ex="false" tool="1" flavor2="8">
  157 + <ccTool>
  158 + <incDir>
  159 + <pElem>fsa</pElem>
  160 + <pElem>build/morfeusz</pElem>
  161 + </incDir>
  162 + </ccTool>
  163 + </item>
89 164 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
90 165 <ccTool>
  166 + <incDir>
  167 + <pElem>fsa</pElem>
  168 + <pElem>build/morfeusz</pElem>
  169 + </incDir>
91 170 </ccTool>
92 171 </item>
93 172 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
94 173 <ccTool>
  174 + <incDir>
  175 + <pElem>build/CMakeFiles/CMakeTmp</pElem>
  176 + <pElem>fsa</pElem>
  177 + <pElem>build/morfeusz</pElem>
  178 + </incDir>
  179 + </ccTool>
  180 + </item>
  181 + <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
  182 + <ccTool>
  183 + <incDir>
  184 + <pElem>fsa</pElem>
  185 + <pElem>build/morfeusz</pElem>
  186 + </incDir>
95 187 </ccTool>
96 188 </item>
97 189 </conf>
... ...
nbproject/project.xml
... ... @@ -4,9 +4,9 @@
4 4 <configuration>
5 5 <data xmlns="http://www.netbeans.org/ns/make-project/1">
6 6 <name>morfeusz</name>
7   - <c-extensions/>
  7 + <c-extensions>c</c-extensions>
8 8 <cpp-extensions>cpp</cpp-extensions>
9   - <header-extensions>hpp</header-extensions>
  9 + <header-extensions>h,hpp</header-extensions>
10 10 <sourceEncoding>UTF-8</sourceEncoding>
11 11 <make-dep-projects/>
12 12 <sourceRootList>
... ...