Commit e2ef01be9498be975a0fdd7a7cb68024beaa6c22

Authored by Michał Lenart
1 parent e05d60fb

- praca nad słownikiem z uwzględnieniem tagsetu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@14 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsa/CMakeLists.txt
... ... @@ -3,7 +3,9 @@ add_executable (test_speed test_speed.cpp)
3 3 add_executable (test_speed_profile test_speed.cpp)
4 4 add_executable (test_recognize test_recognize.cpp)
5 5 add_executable (test_not_recognize test_not_recognize.cpp)
6   -set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
  6 +add_executable (test_morph test_morph.cpp)
  7 +set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align" )
7 8 set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" )
8 9 set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
9 10 set_target_properties ( test_not_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
  11 +set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
... ...
fsa/cfsa1_impl.hpp
... ... @@ -54,11 +54,11 @@ void CompressedFSA1<T>::reallyDoProceed(
54 54 const unsigned char* statePtr,
55 55 State<T>& state) const {
56 56 // const unsigned char stateByte = *statePtr;
57   - StateData2* sd = (StateData2*) statePtr;
  57 + const StateData2* sd = reinterpret_cast<const StateData2*>(statePtr);
58 58 if (sd->accepting) {
59 59 // cerr << "ACCEPTING" << endl;
60 60 T object;
61   - int size = this->deserializer.deserialize(statePtr + 1, object);
  61 + long size = this->deserializer.deserialize(statePtr + 1, object);
62 62 state.setNext(statePtr - this->initialStatePtr, object, size);
63 63 }
64 64 else {
... ... @@ -163,11 +163,11 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
163 163 if (shortLabel > 0) {
164 164 this->doProceedToNextByArray(
165 165 shortLabel,
166   - (uint32_t*) (fromPointer + transitionsTableOffset),
  166 + reinterpret_cast<const uint32_t*>(fromPointer + transitionsTableOffset),
167 167 state);
168 168 }
169 169 else {
170   - reallyDoProceed((unsigned char*) fromPointer + transitionsTableOffset + 256, state);
  170 + reallyDoProceed(fromPointer + transitionsTableOffset + 256, state);
171 171 proceedToNext(c, state);
172 172 }
173 173 }
... ... @@ -175,7 +175,7 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
175 175 this->doProceedToNextByList(
176 176 c,
177 177 shortLabel,
178   - (unsigned char*) (fromPointer + transitionsTableOffset),
  178 + fromPointer + transitionsTableOffset,
179 179 sd->transitionsNum,
180 180 state);
181 181 }
... ...
fsa/cfsa2_impl.hpp
... ... @@ -46,7 +46,7 @@ void CompressedFSA2<T>::reallyDoProceed(
46 46 State<T>& state) const {
47 47 if (accepting) {
48 48 T object;
49   - int size = this->deserializer.deserialize(statePtr + 1, object);
  49 + long size = this->deserializer.deserialize(statePtr + 1, object);
50 50 state.setNext(statePtr - this->initialStatePtr, object, size);
51 51 } else {
52 52 state.setNext(statePtr - this->initialStatePtr);
... ... @@ -81,7 +81,7 @@ void CompressedFSA2<T>::doProceedToNextByList(
81 81 const char c,
82 82 const unsigned char* ptr,
83 83 State<T>& state) const {
84   - register unsigned char* currPtr = const_cast<unsigned char*> (ptr);
  84 + unsigned char* currPtr = const_cast<unsigned char*> (ptr);
85 85 while (true) {
86 86 // const_cast<Counter*>(&counter)->increment(1);
87 87 if ((char) *currPtr == c) {
... ... @@ -141,7 +141,7 @@ void CompressedFSA2<T>::proceedToNext(const char c, State<T>& state) const {
141 141 }
142 142 this->doProceedToNextByList(
143 143 c,
144   - (unsigned char*) (fromPointer + transitionsTableOffset),
  144 + fromPointer + transitionsTableOffset,
145 145 state);
146 146 }
147 147  
... ...
fsa/fsa.hpp
... ... @@ -15,6 +15,7 @@
15 15 #include <exception>
16 16 #include <string>
17 17 #include <vector>
  18 +#include "interpretation.hpp"
18 19  
19 20 template <class T> class State;
20 21 template <class T> class FSA;
... ... @@ -29,7 +30,7 @@ public:
29 30 * Deserialize object from ptr.
30 31 * Returns number of bytes read or -1 on error.
31 32 */
32   - virtual int deserialize(const unsigned char* ptr, T& object) const = 0;
  33 + virtual long deserialize(const unsigned char* ptr, T& object) const = 0;
33 34 };
34 35  
35 36 class StringDeserializer : public Deserializer<char*> {
... ... @@ -42,13 +43,17 @@ public:
42 43 * Deserialize object from ptr.
43 44 * Returns number of bytes read or -1 on error.
44 45 */
45   - int deserialize(const unsigned char* ptr, char*& text) const {
  46 + long deserialize(const unsigned char* ptr, char*& text) const {
46 47 // text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
47 48 // return strlen(text) + 1;
48 49 return 1;
49 50 }
50 51 };
51 52  
  53 +class MorphDeserializer: public Deserializer<std::vector<Interpretation>> {
  54 + long deserialize(const unsigned char* ptr, std::vector<Interpretation>& interp) const;
  55 +};
  56 +
52 57 class Counter {
53 58 public:
54 59  
... ... @@ -217,18 +222,18 @@ public:
217 222 */
218 223 unsigned int getValueSize() const;
219 224  
220   - unsigned int getOffset() const;
  225 + unsigned long getOffset() const;
221 226  
222   - void setNext(const unsigned int offset);
223   - void setNext(const unsigned int offset, const T& value, const unsigned int valueSize);
  227 + void setNext(const unsigned long offset);
  228 + void setNext(const unsigned long offset, const T& value, const unsigned int valueSize);
224 229 void setNextAsSink();
225 230  
226   - State(const FSA<T>& fsa);
  231 + explicit State(const FSA<T>& fsa);
227 232  
228 233 virtual ~State();
229 234 private:
230 235 const FSA<T>& fsa;
231   - unsigned int offset;
  236 + unsigned long offset;
232 237 bool accepting;
233 238 bool sink;
234 239 T value;
... ...
fsa/fsa_impl.hpp
... ... @@ -8,9 +8,11 @@
8 8 #ifndef _SIMPLE_FSA_IMPL_HPP
9 9 #define _SIMPLE_FSA_IMPL_HPP
10 10  
  11 +#include <cstring>
11 12 #include <algorithm>
12 13 #include <utility>
13 14 #include <iostream>
  15 +#include <vector>
14 16 #include <netinet/in.h>
15 17 #include "fsa.hpp"
16 18 #include "utils.hpp"
... ... @@ -84,4 +86,34 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
84 86 }
85 87 }
86 88  
  89 +static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) {
  90 + // XXX uważać na poprawność danych
  91 + lemma.suffixToCut = *ptr;
  92 + ptr++;
  93 + lemma.suffixToAdd = (const char*) ptr;
  94 + ptr += strlen((const char*) ptr) + 1;
  95 +}
  96 +
  97 +static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) {
  98 + deserializeLemma(ptr, interp.lemma);
  99 + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr)));
  100 + ptr += 2;
  101 + interp.nameClassifier = *ptr;
  102 + ptr++;
  103 +}
  104 +
  105 +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const {
  106 + const unsigned char* currPtr = ptr;
  107 + uint8_t interpsNum = *ptr;
  108 + interps.clear();
  109 + interps.reserve(interpsNum);
  110 + currPtr++;
  111 + for (unsigned int i = 0; i < interpsNum; i++) {
  112 + Interpretation interp;
  113 + deserializeInterp(currPtr, interp);
  114 + interps.push_back(interp);
  115 + }
  116 + return currPtr - ptr;
  117 +}
  118 +
87 119 #endif /* _SIMPLE_FSA_IMPL_HPP */
... ...
fsa/interpretation.hpp
... ... @@ -14,15 +14,15 @@
14 14 using namespace std;
15 15  
16 16 struct Lemma {
17   - int suffixToCut;
18   - string suffixToAdd;
  17 + unsigned short suffixToCut;
  18 + const char* suffixToAdd;
19 19 };
20 20  
21 21 struct Interpretation {
22 22 Lemma lemma;
23   - list<int> tag; // np. subst:sg:nom:m1
24   - int nameClassifier; // np. "pospolita"
25   - int qualifier; // np. "dawne" lub "potoczne"
  23 + unsigned int tag; // np. subst:sg:nom:m1
  24 + unsigned short nameClassifier; // np. "pospolita"
  25 + unsigned short qualifier; // np. "dawne" lub "potoczne"
26 26 };
27 27  
28 28 #endif /* INTERPRETATION_HPP */
... ...
fsa/simplefsa_impl.hpp
... ... @@ -41,7 +41,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
41 41 if (state.isAccepting()) {
42 42 transitionsTableOffset += state.getValueSize();
43 43 }
44   - StateData stateData = *(StateData*) (fromPointer);
  44 + StateData stateData = *reinterpret_cast<const StateData*>(fromPointer);
45 45 const unsigned char* foundTransition = fromPointer + transitionsTableOffset;
46 46 bool found = false;
47 47 for (int i = 0; i < stateData.transitionsNum; i++, foundTransition += 4) {
... ... @@ -57,7 +57,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
57 57 else {
58 58 unsigned int offset = decodeOffset(foundTransition + 1);
59 59 const unsigned char* nextStatePointer = this->initialStatePtr + offset;
60   - StateData* nextStateData = (StateData*) (nextStatePointer);
  60 + const StateData* nextStateData = reinterpret_cast<const StateData*>(nextStatePointer);
61 61 if (nextStateData->accepting) {
62 62 T object;
63 63 int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
... ...
fsa/state_impl.hpp
... ... @@ -39,7 +39,7 @@ void State<T>::proceedToNext(const char c) {
39 39 }
40 40  
41 41 template <class T>
42   -unsigned int State<T>::getOffset() const {
  42 +unsigned long State<T>::getOffset() const {
43 43 assert(!this->isSink());
44 44 return this->offset;
45 45 }
... ... @@ -62,14 +62,14 @@ State<T>::~State() {
62 62 }
63 63  
64 64 template <class T>
65   -void State<T>::setNext(const unsigned int offset) {
  65 +void State<T>::setNext(const unsigned long offset) {
66 66 // assert(!this->isSink());
67 67 this->offset = offset;
68 68 this->accepting = false;
69 69 }
70 70  
71 71 template <class T>
72   -void State<T>::setNext(const unsigned int offset, const T& value, const unsigned int valueSize) {
  72 +void State<T>::setNext(const unsigned long offset, const T& value, const unsigned int valueSize) {
73 73 // assert(!this->isSink());
74 74 this->offset = offset;
75 75 this->accepting = true;
... ...
fsa/test_morph.cpp 0 → 100644
  1 +/*
  2 + * File: test_morph.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on November 8, 2013, 4:12 PM
  6 + */
  7 +
  8 +#include <cstdlib>
  9 +#include <sstream>
  10 +#include <iostream>
  11 +#include "fsa.hpp"
  12 +#include "utils.hpp"
  13 +
  14 +using namespace std;
  15 +
  16 +void debug(const string& key, const vector<Interpretation> value) {
  17 + cerr << key << endl;
  18 + for (Interpretation i: value) {
  19 + cerr << "suffix to cut: " << i.lemma.suffixToCut << endl;
  20 + cerr << "suffix to add: " << i.lemma.suffixToAdd << endl;
  21 + cerr << "tag: " << i.tag << endl;
  22 + cerr << "name: " << i.nameClassifier << endl;
  23 + }
  24 + cerr << "==================" << endl;
  25 +}
  26 +
  27 +void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) {
  28 + ifstream ifs;
  29 + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  30 + ifs.open(fname, ios::binary);
  31 + string line;
  32 + while (getline(ifs, line)) {
  33 + vector<string> splitVector(split(line, '\t'));
  34 + string key = splitVector[0];
  35 + vector<Interpretation> value2;
  36 + fsa.tryToRecognize(key.c_str(), value2);
  37 + debug(key, value2);
  38 +// validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key);
  39 + }
  40 + validate(ifs.eof(), "Failed to read the input file to the end");
  41 +}
  42 +
  43 +int main(int argc, char** argv) {
  44 + validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
  45 + const unsigned char* fsaData = readFile(argv[1]);
  46 + MorphDeserializer deserializer;
  47 + FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer);
  48 + doTest(*fsa, argv[2]);
  49 + // cout << argc << endl;
  50 + delete fsa;
  51 + return 0;
  52 +}
  53 +
... ...
fsa/test_recognize.cpp
... ... @@ -35,7 +35,6 @@ void doTest(const FSA<char*>& fsa, const char* fname) {
35 35 }
36 36  
37 37 int main(int argc, char** argv) {
38   - cerr << (int) ((unsigned char) - 123) << endl;
39 38 validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
40 39 const unsigned char* fsaData = readFile(argv[1]);
41 40 StringDeserializer deserializer;
... ...
nbproject/configurations.xml
... ... @@ -6,6 +6,7 @@
6 6 <in>cfsa2_impl.hpp</in>
7 7 <in>interpretation.hpp</in>
8 8 <in>simplefsa_impl.hpp</in>
  9 + <in>test_morph.cpp</in>
9 10 <in>test_not_recognize.cpp</in>
10 11 <in>test_recognize.cpp</in>
11 12 <in>test_speed.cpp</in>
... ... @@ -71,6 +72,8 @@
71 72 </item>
72 73 <item path="fsa/simplefsa_impl.hpp" ex="false" tool="3" flavor2="0">
73 74 </item>
  75 + <item path="fsa/test_morph.cpp" ex="false" tool="1" flavor2="0">
  76 + </item>
74 77 <item path="fsa/test_not_recognize.cpp" ex="false" tool="1" flavor2="8">
75 78 <ccTool>
76 79 </ccTool>
... ...