Commit 0c8c369d5e9b7383af3f3369f822215e412d0d4b
1 parent
ce75f5c3
- zaimplementowanie wersji ze zmienną długością offsetów
- dodanie prostego testu

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@9 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 13 changed files with 100342 additions and 112 deletions
Too many changes to show.
To preserve performance only 12 of 13 files are displayed.
CMakeLists.txt
1 | -# The name of our project is "HELLO". CMakeLists files in this project can | |
2 | -# refer to the root source directory of the project as ${HELLO_SOURCE_DIR} and | |
3 | -# to the root binary directory of the project as ${HELLO_BINARY_DIR}. | |
4 | -cmake_minimum_required (VERSION 2.6) | |
1 | + | |
2 | +cmake_minimum_required (VERSION 2.8) | |
5 | 3 | project (Morfeusz) |
6 | 4 | |
7 | -# Recurse into the "Hello" and "Demo" subdirectories. This does not actually | |
8 | -# cause another cmake executable to run. The same process will walk through | |
9 | -# the project's entire directory structure. | |
5 | +enable_testing() | |
6 | + | |
10 | 7 | add_subdirectory (fsa) |
11 | 8 | add_subdirectory (morfeusz) |
12 | 9 | |
10 | +file(COPY fsabuilder testfiles DESTINATION .) | |
11 | + | |
12 | +add_test (TestBuildFSA python fsabuilder/fsa/buildfsa.py -i testfiles/test.txt -o testfiles/test.fsa -t SPELL --input-format=PLAIN --output-format=BINARY) | |
13 | +add_test (TestRecognize fsa/test_recognize testfiles/test.fsa testfiles/test.txt) | |
... | ... |
fsa/CMakeLists.txt
1 | 1 | |
2 | -add_executable (test_dict test_dict.cpp) | |
3 | 2 | add_executable (test_speed test_speed.cpp) |
4 | 3 | add_executable (test_speed_profile test_speed.cpp) |
5 | -set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" ) | |
4 | +add_executable (test_recognize test_recognize.cpp) | |
6 | 5 | set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
7 | -set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g -O2" ) | |
6 | +set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" ) | |
7 | +set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" ) | |
... | ... |
fsa/_fsa_impl.hpp
... | ... | @@ -63,39 +63,38 @@ SimpleFSA<T>::~SimpleFSA() { |
63 | 63 | // } |
64 | 64 | //} |
65 | 65 | |
66 | -static inline const TransitionData* findTransition(const TransitionData* start, const TransitionData* end, const char c) { | |
67 | - for (const TransitionData* td = start; td != end; td++) { | |
68 | - if (td->label == c) { | |
69 | - return td; | |
70 | - } | |
71 | - } | |
72 | - return end; | |
73 | -} | |
74 | - | |
75 | 66 | template <class T> |
76 | 67 | void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
77 | 68 | // if (c <= 'z' && 'a' <= c) |
78 | 69 | // cerr << "NEXT " << c << " from " << state.getOffset() << endl; |
79 | 70 | // else |
80 | 71 | // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; |
72 | +// cerr << "NEXT" << endl; | |
81 | 73 | const unsigned char* fromPointer = this->startPtr + state.getOffset(); |
82 | 74 | int transitionsTableOffset = sizeof (StateData); |
83 | 75 | if (state.isAccepting()) { |
84 | 76 | transitionsTableOffset += state.getValueSize(); |
85 | 77 | // cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; |
86 | 78 | } |
87 | - const StateData stateData = *reinterpret_cast<const StateData*> (fromPointer); | |
88 | - const TransitionData* transitionsStart = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset); | |
89 | - const TransitionData* transitionsEnd = transitionsStart + stateData.transitionsNum; | |
90 | - const TransitionData* foundTransition = findTransition(transitionsStart, transitionsEnd, c); | |
91 | - if (foundTransition == transitionsEnd || foundTransition->label != c) { | |
79 | + StateData stateData = *(StateData*) (fromPointer); | |
80 | + TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset); | |
81 | + bool found = false; | |
82 | + for (int i = 0; i < stateData.transitionsNum; i++, foundTransition++) { | |
83 | +// cerr << foundTransition->label << endl; | |
84 | + if (foundTransition->label == c) { | |
85 | + found = true; | |
86 | + break; | |
87 | + } | |
88 | + } | |
89 | +// const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1); | |
90 | + if (!found) { | |
92 | 91 | // cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl; |
93 | 92 | state.setNextAsSink(); |
94 | 93 | } |
95 | 94 | else { |
96 | 95 | // cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl; |
97 | 96 | const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset; |
98 | - const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer); | |
97 | + StateData* nextStateData = (StateData*) (nextStatePointer); | |
99 | 98 | if (nextStateData->accepting) { |
100 | 99 | // cerr << "ACCEPTING" << endl; |
101 | 100 | T object; |
... | ... | @@ -115,8 +114,12 @@ bool FSA<T>::tryToRecognize(const char* input, T& value) const { |
115 | 114 | currState.proceedToNext(input[i]); |
116 | 115 | i++; |
117 | 116 | } |
117 | + // input[i] == '\0' | |
118 | + currState.proceedToNext(0); | |
119 | + | |
118 | 120 | if (currState.isAccepting()) { |
119 | 121 | value = currState.getValue(); |
122 | + cerr << "RECOGNIZED " << input << endl; | |
120 | 123 | return true; |
121 | 124 | } else { |
122 | 125 | return false; |
... | ... |
fsa/_vfsa_impl.hpp
0 → 100644
1 | +/* | |
2 | + * File: _vfsa_impl.hpp | |
3 | + * Author: lennyn | |
4 | + * | |
5 | + * Created on October 29, 2013, 9:57 PM | |
6 | + */ | |
7 | + | |
8 | +#ifndef _VFSA_IMPL_HPP | |
9 | +#define _VFSA_IMPL_HPP | |
10 | + | |
11 | +#include <algorithm> | |
12 | +#include <utility> | |
13 | +#include <iostream> | |
14 | +#include <netinet/in.h> | |
15 | +#include "fsa.hpp" | |
16 | + | |
17 | +using namespace std; | |
18 | + | |
19 | +#pragma pack(push) /* push current alignment to stack */ | |
20 | +#pragma pack(1) /* set alignment to 1 byte boundary */ | |
21 | + | |
22 | +struct VTransitionData { | |
23 | + unsigned label : 5; | |
24 | + unsigned offsetSize : 2; | |
25 | + unsigned last : 1; | |
26 | +}; | |
27 | + | |
28 | +#pragma pack(pop) /* restore original alignment from stack */ | |
29 | + | |
30 | +template <class T> | |
31 | +int FSAImpl<T>::getMagicNumberOffset() { | |
32 | + return 0; | |
33 | +} | |
34 | + | |
35 | +template <class T> | |
36 | +int FSAImpl<T>::getVersionNumOffset() { | |
37 | + return getMagicNumberOffset() + sizeof (MAGIC_NUMBER); | |
38 | +} | |
39 | + | |
40 | +template <class T> | |
41 | +int FSAImpl<T>::getPopularCharsOffset() { | |
42 | + return getVersionNumOffset() + sizeof (VERSION_NUM); | |
43 | +} | |
44 | + | |
45 | +template <class T> | |
46 | +int FSAImpl<T>::getInitialStateOffset() { | |
47 | + return getPopularCharsOffset() + POPULAR_CHARS_NUM; | |
48 | +} | |
49 | + | |
50 | +template <class T> | |
51 | +vector<unsigned char> FSAImpl<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) { | |
52 | + vector<unsigned char> res(256, FSAImpl<bool>::POPULAR_CHARS_NUM); | |
53 | + const unsigned char* popularChars = ptr + getPopularCharsOffset(); | |
54 | + for (unsigned int i = 0; i < POPULAR_CHARS_NUM; i++) { | |
55 | + res[popularChars[i]] = i; | |
56 | + } | |
57 | + return res; | |
58 | +} | |
59 | + | |
60 | +template <class T> | |
61 | +FSAImpl<T>::FSAImpl(const unsigned char* ptr, const Deserializer<T>& deserializer) | |
62 | +: FSA<T>(ptr + getInitialStateOffset(), deserializer), | |
63 | +char2PopularCharIdx(initializeChar2PopularCharIdx(ptr)) { | |
64 | + uint32_t magicNumber = ntohl(*((uint32_t*) ptr + getMagicNumberOffset())); | |
65 | + if (magicNumber != MAGIC_NUMBER) { | |
66 | + throw FSAException("Invalid magic number"); | |
67 | + } | |
68 | + unsigned char versionNum = *(ptr + getVersionNumOffset()); | |
69 | + if (versionNum != VERSION_NUM) { | |
70 | + throw FSAException("Invalid version number"); | |
71 | + } | |
72 | + cerr << "initial state offset " << getInitialStateOffset() << endl; | |
73 | +} | |
74 | + | |
75 | +template <class T> | |
76 | +FSAImpl<T>::~FSAImpl() { | |
77 | + | |
78 | +} | |
79 | + | |
80 | +template <class T> | |
81 | +void FSAImpl<T>::proceedToNext(const char c, State<T>& state) const { | |
82 | + // if (c <= 'z' && 'a' <= c) | |
83 | + // cerr << "NEXT " << c << " from " << state.getOffset() << endl; | |
84 | + // else | |
85 | + // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; | |
86 | + const unsigned char* fromPointer = this->startPtr + state.getOffset(); | |
87 | + int transitionsTableOffset = 0; | |
88 | + if (state.isAccepting()) { | |
89 | + transitionsTableOffset += state.getValueSize(); | |
90 | + cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; | |
91 | + } | |
92 | + const unsigned char* currPtr = fromPointer + transitionsTableOffset; | |
93 | + bool found = false; | |
94 | + bool failed = false; | |
95 | + unsigned char shortLabel = char2PopularCharIdx[(unsigned char) c]; | |
96 | + cerr << "NEXT " << c << " " << (int) shortLabel << endl; | |
97 | + VTransitionData td; | |
98 | + while (!found && !failed) { | |
99 | + td = *((VTransitionData*) currPtr); | |
100 | + cerr << "transition at " << (currPtr - this->startPtr) << endl; | |
101 | + cerr << "short label: " << (int) td.label << endl; | |
102 | + if (td.label == shortLabel) { | |
103 | + if (td.label != POPULAR_CHARS_NUM) { | |
104 | + found = true; | |
105 | + currPtr++; | |
106 | + } | |
107 | + else { | |
108 | + currPtr++; | |
109 | + char realLabel = (char) *currPtr; | |
110 | + cerr << "full label: " << realLabel << endl; | |
111 | + if (realLabel != c) { | |
112 | + failed = td.last; | |
113 | + currPtr += td.offsetSize + 1; | |
114 | + } else { | |
115 | + found = true; | |
116 | + currPtr++; | |
117 | + } | |
118 | + } | |
119 | + } else if (td.last) { | |
120 | + cerr << "last" << endl; | |
121 | + failed = true; | |
122 | + } else { | |
123 | + if (td.label == POPULAR_CHARS_NUM) { | |
124 | + currPtr++; | |
125 | + } | |
126 | + currPtr += td.offsetSize + 1; | |
127 | + } | |
128 | + } | |
129 | + | |
130 | + if (found) { | |
131 | + // currPtr points at the offset | |
132 | + // or next state (iff offset==0) | |
133 | + int offsetFromHere = 0; | |
134 | + cerr << "offset size " << td.offsetSize << endl; | |
135 | + for (int i = 0; i < td.offsetSize; i++) { | |
136 | + offsetFromHere <<= 8; | |
137 | + cerr << "offset from here " << offsetFromHere << endl; | |
138 | + offsetFromHere += *currPtr; | |
139 | + if (i + 1 < td.offsetSize) | |
140 | + currPtr++; | |
141 | + cerr << "offset from here " << offsetFromHere << endl; | |
142 | + } | |
143 | + currPtr += offsetFromHere; | |
144 | + cerr << "offset " << currPtr - this->startPtr << endl; | |
145 | + bool accepting = c == '\0'; | |
146 | + if (accepting) { | |
147 | + T value; | |
148 | + int valueSize = this->deserializer.deserialize(currPtr, value); | |
149 | + currPtr += valueSize; | |
150 | + state.setNext(currPtr - this->startPtr, value, valueSize); | |
151 | + } else { | |
152 | + state.setNext(currPtr - this->startPtr); | |
153 | + } | |
154 | + } else { | |
155 | + state.setNextAsSink(); | |
156 | + } | |
157 | +} | |
158 | + | |
159 | +#endif /* _VFSA_IMPL_HPP */ | |
160 | + | |
... | ... |
fsa/fsa.hpp
... | ... | @@ -9,13 +9,17 @@ |
9 | 9 | #define FSA_HPP |
10 | 10 | |
11 | 11 | //#include <iostream> |
12 | -#include <cstring> | |
12 | +//#include <cstring> | |
13 | 13 | #include <typeinfo> |
14 | 14 | #include <cassert> |
15 | +#include <exception> | |
16 | +#include <string> | |
17 | +#include <vector> | |
15 | 18 | |
16 | 19 | template <class T> class State; |
17 | 20 | template <class T> class FSA; |
18 | 21 | template <class T> class Deserializer; |
22 | +class FSAException; | |
19 | 23 | |
20 | 24 | template <class T> |
21 | 25 | class Deserializer { |
... | ... | @@ -39,23 +43,36 @@ public: |
39 | 43 | * Returns number of bytes read or -1 on error. |
40 | 44 | */ |
41 | 45 | int deserialize(const unsigned char* ptr, char*& text) const { |
42 | - text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); | |
43 | - return strlen(text) + 1; | |
46 | + // text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); | |
47 | + // return strlen(text) + 1; | |
48 | + return 1; | |
44 | 49 | } |
45 | 50 | }; |
46 | 51 | |
52 | +class Counter { | |
53 | +public: | |
54 | + | |
55 | + Counter() : count(0) { | |
56 | + | |
57 | + } | |
58 | + | |
59 | + void increment(const int n) { | |
60 | + count += n; | |
61 | + } | |
62 | + long long count; | |
63 | +}; | |
64 | + | |
47 | 65 | /** |
48 | 66 | * Finite state automaton. |
49 | 67 | */ |
50 | 68 | template <class T> |
51 | 69 | class FSA { |
52 | 70 | public: |
53 | - | |
54 | 71 | /** |
55 | 72 | * Get this automaton's initial state. |
56 | 73 | */ |
57 | 74 | State<T> getInitialState() const; |
58 | - | |
75 | + | |
59 | 76 | bool tryToRecognize(const char* input, T& value) const; |
60 | 77 | |
61 | 78 | virtual ~FSA() { |
... | ... | @@ -81,10 +98,41 @@ class SimpleFSA : public FSA<T> { |
81 | 98 | public: |
82 | 99 | SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); |
83 | 100 | virtual ~SimpleFSA(); |
101 | + | |
102 | + long long transitionsCount() { | |
103 | + return counter.count; | |
104 | + } | |
84 | 105 | protected: |
85 | 106 | void proceedToNext(const char c, State<T>& state) const; |
86 | 107 | private: |
108 | + Counter counter; | |
109 | +}; | |
110 | + | |
111 | +template <class T> | |
112 | +class FSAImpl : public FSA<T> { | |
113 | +public: | |
114 | + FSAImpl(const unsigned char* ptr, const Deserializer<T>& deserializer); | |
115 | + virtual ~FSAImpl(); | |
87 | 116 | |
117 | + long long transitionsCount() { | |
118 | + return counter.count; | |
119 | + } | |
120 | + | |
121 | + static const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; | |
122 | + static const unsigned char VERSION_NUM = 1; | |
123 | + static const unsigned int POPULAR_CHARS_NUM = 31; | |
124 | + | |
125 | +protected: | |
126 | + void proceedToNext(const char c, State<T>& state) const; | |
127 | +private: | |
128 | + Counter counter; | |
129 | + const std::vector<unsigned char> char2PopularCharIdx; | |
130 | + | |
131 | + static int getMagicNumberOffset(); | |
132 | + static int getVersionNumOffset(); | |
133 | + static int getPopularCharsOffset(); | |
134 | + static int getInitialStateOffset(); | |
135 | + static std::vector<unsigned char> initializeChar2PopularCharIdx(const unsigned char* ptr); | |
88 | 136 | }; |
89 | 137 | |
90 | 138 | /** |
... | ... | @@ -122,15 +170,15 @@ public: |
122 | 170 | * For non-accepting states it throws an exception. |
123 | 171 | */ |
124 | 172 | unsigned int getValueSize() const; |
125 | - | |
173 | + | |
126 | 174 | unsigned int getOffset() const; |
127 | - | |
175 | + | |
128 | 176 | void setNext(const unsigned int offset); |
129 | 177 | void setNext(const unsigned int offset, const T& value, const unsigned int valueSize); |
130 | 178 | void setNextAsSink(); |
131 | - | |
179 | + | |
132 | 180 | State(const FSA<T>& fsa); |
133 | - | |
181 | + | |
134 | 182 | virtual ~State(); |
135 | 183 | private: |
136 | 184 | const FSA<T>& fsa; |
... | ... | @@ -141,7 +189,19 @@ private: |
141 | 189 | int valueSize; |
142 | 190 | }; |
143 | 191 | |
192 | +class FSAException : public std::exception { | |
193 | +public: | |
194 | + FSAException(const char* what): msg(what) {} | |
195 | + virtual ~FSAException() throw() {} | |
196 | + virtual const char* what() const throw () { | |
197 | + return this->msg.c_str(); | |
198 | + } | |
199 | +private: | |
200 | + const std::string msg; | |
201 | +}; | |
202 | + | |
144 | 203 | #include "_fsa_impl.hpp" |
204 | +#include "_vfsa_impl.hpp" | |
145 | 205 | #include "_state_impl.hpp" |
146 | 206 | |
147 | 207 | #endif /* FSA_HPP */ |
... | ... |
fsa/test_dict.cpp renamed to fsa/test_recognize.cpp
1 | 1 | /* |
2 | - * File: test.cpp | |
2 | + * File: test_recognize.cpp | |
3 | 3 | * Author: lennyn |
4 | 4 | * |
5 | - * Created on October 22, 2013, 2:11 PM | |
5 | + * Created on October 30, 2013, 5:26 PM | |
6 | 6 | */ |
7 | 7 | |
8 | 8 | #include <cstdlib> |
9 | -#include <iostream> | |
10 | -#include <fstream> | |
11 | -#include <string> | |
12 | 9 | #include <sstream> |
13 | -#include <algorithm> | |
14 | -#include <functional> | |
15 | -#include <cctype> | |
16 | -#include <locale> | |
17 | -#include <vector> | |
18 | 10 | #include "fsa.hpp" |
19 | 11 | #include "utils.hpp" |
20 | 12 | |
... | ... | @@ -41,51 +33,34 @@ static inline string &rtrim(string &s) { |
41 | 33 | return s; |
42 | 34 | } |
43 | 35 | |
44 | -void testFSA(const FSA<char*>& fsa, const char* fname) { | |
36 | +void doTest(const FSA<char*>& fsa, const char* fname) { | |
45 | 37 | ifstream ifs; |
46 | 38 | // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); |
47 | 39 | ifs.open(fname, ios::binary); |
48 | 40 | string line; |
49 | 41 | while (getline(ifs, line)) { |
50 | - vector<string> split1(split(line, '\t')); | |
51 | - string key = split1[0]; | |
52 | - key = "bijekcją"; | |
53 | - string value = split1[1]; | |
54 | 42 | |
55 | - for (unsigned int i = 0; i < key.length(); i++) { | |
56 | - cout << (int) key[i] << " "; | |
57 | - } | |
58 | - cout << endl; | |
43 | + vector<string> splitVector(split(line, '\t')); | |
44 | + string key = splitVector[0]; | |
45 | + | |
46 | + cerr << "test " << key << endl; | |
59 | 47 | |
60 | 48 | char* value2; |
61 | - if (fsa.tryToRecognize(key.c_str(), value2)) { | |
62 | - if (string(value) != string(value2)) { | |
63 | - cout << "BAD INTERP " << key << " " << value << " != " << value2 << endl; | |
64 | - } | |
65 | - else { | |
66 | - cout << "OK! " << key << " " << value << endl; | |
67 | - } | |
68 | - } | |
69 | - else { | |
70 | - cout << "MISS " << key << " " << value << " not recognized" << endl; | |
71 | - } | |
49 | + validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize "+key); | |
72 | 50 | } |
73 | - cout << ifs.good() << endl; | |
74 | - cout << ifs.fail() << endl; | |
75 | - cout << ifs.eof() << endl; | |
76 | - cout << "done" << endl; | |
51 | +// validate(ifs.good(), "Something wrong with the input file"); | |
52 | +// validate(!ifs.fail(), "Something wrong with the input file"); | |
53 | + validate(ifs.eof(), "Failed to read the input file to the end"); | |
77 | 54 | } |
78 | 55 | |
79 | -/* | |
80 | - * | |
81 | - */ | |
82 | 56 | int main(int argc, char** argv) { |
57 | + cerr << (int) ((unsigned char) -123) << endl; | |
83 | 58 | validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); |
84 | 59 | const unsigned char* fsaData = readFile(argv[1]); |
85 | 60 | StringDeserializer deserializer; |
86 | - SimpleFSA<char*> fsa(fsaData, deserializer); | |
87 | - testFSA(fsa, argv[2]); | |
88 | - cout << argc << endl; | |
61 | + FSAImpl<char*> fsa(fsaData, deserializer); | |
62 | + doTest(fsa, argv[2]); | |
63 | +// cout << argc << endl; | |
89 | 64 | return 0; |
90 | 65 | } |
91 | 66 | |
... | ... |
fsa/test_speed.cpp
... | ... | @@ -25,17 +25,23 @@ int main(int argc, char** argv) { |
25 | 25 | ifstream ifs; |
26 | 26 | // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); |
27 | 27 | ifs.open(argv[2], ios::binary); |
28 | - string line; | |
29 | - while (getline(ifs, line)) { | |
28 | + char line[65536]; | |
29 | + int recognized = 0; | |
30 | + int unrecognized = 0; | |
31 | + while (ifs.getline(line, 65536, '\n')) { | |
30 | 32 | char* val; |
31 | 33 | // cout << line << endl; |
32 | - if (fsa.tryToRecognize(line.c_str(), val)) { | |
33 | -// printf("%s: *OK*\n", line.c_str()); | |
34 | + if (fsa.tryToRecognize(line, val)) { | |
35 | +// printf("%s: *OK*\n", line); | |
36 | + recognized++; | |
34 | 37 | } |
35 | 38 | else { |
36 | -// printf("%s: NOT FOUND\n", line.c_str()); | |
39 | + unrecognized++; | |
40 | +// printf("%s: NOT FOUND\n", line); | |
37 | 41 | } |
38 | 42 | } |
43 | + cout << "recognized: " << recognized << endl; | |
44 | + cout << "unrecognized: " << unrecognized << endl; | |
45 | + cout << "total: " << (recognized + unrecognized) << endl; | |
39 | 46 | return 0; |
40 | 47 | } |
41 | - | |
... | ... |
fsa/utils.hpp
fsabuilder/fsa/buildfsa.py
... | ... | @@ -11,11 +11,11 @@ import codecs |
11 | 11 | import encode |
12 | 12 | import convertinput |
13 | 13 | from fsa import FSA |
14 | -from serializer import SimpleSerializer | |
14 | +from serializer import VLengthSerializer | |
15 | 15 | from visualizer import Visualizer |
16 | 16 | from optparse import OptionParser |
17 | 17 | |
18 | -logging.basicConfig(level=logging.DEBUG) | |
18 | +logging.basicConfig(level=logging.INFO) | |
19 | 19 | |
20 | 20 | class OutputFormat(): |
21 | 21 | BINARY = 'BINARY' |
... | ... | @@ -124,20 +124,23 @@ if __name__ == '__main__': |
124 | 124 | }[opts.inputFormat] |
125 | 125 | |
126 | 126 | logging.info('feeding FSA with data ...') |
127 | - fsa.feed(inputData, appendZero=True) | |
127 | + fsa.feed(inputData) | |
128 | 128 | if opts.trainFile: |
129 | 129 | logging.info('training with '+opts.trainFile+' ...') |
130 | 130 | fsa.train(readTrainData(opts.trainFile)) |
131 | 131 | logging.info('done training') |
132 | - serializer = SimpleSerializer(fsa) | |
132 | + serializer = VLengthSerializer(fsa) | |
133 | 133 | logging.info('states num: '+str(fsa.getStatesNum())) |
134 | + logging.info('transitions num: '+str(fsa.getTransitionsNum())) | |
134 | 135 | logging.info('accepting states num: '+str(len([s for s in fsa.initialState.dfs(set()) if s.isAccepting()]))) |
135 | 136 | logging.info('sink states num: '+str(len([s for s in fsa.initialState.dfs(set()) if len(s.transitionsMap.items()) == 0]))) |
136 | 137 | { |
137 | 138 | OutputFormat.CPP: serializer.serialize2CppFile, |
138 | 139 | OutputFormat.BINARY: serializer.serialize2BinaryFile |
139 | 140 | }[opts.outputFormat](opts.outputFile) |
140 | - | |
141 | + logging.info('size: '+str(fsa.initialState.reverseOffset)) | |
142 | +# for s in fsa.initialState.dfs(set()): | |
143 | +# logging.info(s.offset) | |
141 | 144 | if opts.visualize: |
142 | 145 | Visualizer().visualize(fsa) |
143 | 146 | |
... | ... |
fsabuilder/fsa/fsa.py
... | ... | @@ -59,6 +59,12 @@ class FSA(object): |
59 | 59 | |
60 | 60 | def getStatesNum(self): |
61 | 61 | return self.register.getStatesNum() |
62 | + | |
63 | + def getTransitionsNum(self): | |
64 | + res = 0 | |
65 | + for s in self.initialState.dfs(set()): | |
66 | + res += len(s.transitionsMap) | |
67 | + return res | |
62 | 68 | |
63 | 69 | def _addSorted(self, encodedWord, data): |
64 | 70 | assert self.encodedPrevWord < encodedWord |
... | ... |
fsabuilder/fsa/serializer.py
... | ... | @@ -46,10 +46,12 @@ class Serializer(object): |
46 | 46 | raise NotImplementedError('Not implemented') |
47 | 47 | |
48 | 48 | def fsa2bytearray(self): |
49 | + | |
49 | 50 | res = bytearray() |
50 | 51 | res.extend(self.serializePrologue()) |
51 | 52 | self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
52 | - for state in sorted(self.fsa.initialState.dfs(set()), key=state.offset): | |
53 | + logging.debug('SERIALIZE') | |
54 | + for state in sorted(self.fsa.initialState.dfs(set()), key=lambda s: s.offset): | |
53 | 55 | res.extend(self.state2bytearray(state)) |
54 | 56 | return res |
55 | 57 | |
... | ... | @@ -111,14 +113,17 @@ class VLengthSerializer(Serializer): |
111 | 113 | |
112 | 114 | def __init__(self, fsa): |
113 | 115 | super(VLengthSerializer, self).__init__(fsa) |
114 | - self.statesTable = list(reversed(fsa.dfs(set()))) | |
116 | + self.statesTable = list(reversed(list(fsa.initialState.dfs(set())))) | |
115 | 117 | self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)]) |
116 | 118 | |
117 | 119 | # labels sorted by popularity |
118 | - self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label))] | |
120 | + self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))] | |
121 | + remainingChars = [c for c in range(256) if not c in self.sortedLabels] | |
122 | + while len(self.sortedLabels) < 256: | |
123 | + self.sortedLabels.append(remainingChars.pop()) | |
119 | 124 | |
120 | 125 | # popular labels table |
121 | - self.label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:31]) | |
126 | + self.label2Index = dict([(label, self.sortedLabels.index(label)) for label in self.sortedLabels][:31]) | |
122 | 127 | |
123 | 128 | def serializePrologue(self): |
124 | 129 | res = bytearray() |
... | ... | @@ -133,7 +138,8 @@ class VLengthSerializer(Serializer): |
133 | 138 | res.append(VLengthSerializer.VERSION) |
134 | 139 | |
135 | 140 | # serialize popular labels |
136 | - for label, freq in self.sortedLabels[:31]: | |
141 | + logging.debug(self.sortedLabels) | |
142 | + for label in self.sortedLabels[:31]: | |
137 | 143 | res.append(label) |
138 | 144 | |
139 | 145 | return res |
... | ... | @@ -159,20 +165,23 @@ class VLengthSerializer(Serializer): |
159 | 165 | |
160 | 166 | def _transitionsData2bytearray(self, state): |
161 | 167 | res = bytearray() |
162 | - transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, _): (-next.freq, -self.label2Count[label])) | |
168 | + transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, nextState): self.sortedLabels.index(label)) | |
163 | 169 | thisIdx = self.state2Index[state] |
164 | - | |
170 | + logging.debug('state '+str(state.offset)) | |
165 | 171 | if len(transitions) == 0: |
166 | 172 | assert state.isAccepting() |
167 | 173 | return bytearray() |
168 | 174 | else: |
169 | - offsetToStateAfterThis = 0 | |
170 | 175 | stateAfterThis = self.statesTable[thisIdx + 1] |
171 | 176 | for reversedN, (label, nextState) in enumerate(reversed(transitions)): |
177 | + transitionBytes = bytearray() | |
172 | 178 | assert nextState.reverseOffset is not None |
179 | + assert stateAfterThis.reverseOffset is not None | |
180 | + logging.debug('next state reverse: '+str(nextState.reverseOffset)) | |
181 | + logging.debug('after state reverse: '+str(stateAfterThis.reverseOffset)) | |
173 | 182 | n = len(transitions) - reversedN |
174 | 183 | |
175 | - popularLabel = self.label2Index[label] < 31 | |
184 | + popularLabel = label in self.label2Index | |
176 | 185 | firstByte = self.label2Index[label] if popularLabel else 31 |
177 | 186 | |
178 | 187 | last = len(transitions) == n |
... | ... | @@ -185,26 +194,29 @@ class VLengthSerializer(Serializer): |
185 | 194 | offset = 0 |
186 | 195 | if not next: |
187 | 196 | offsetSize = 1 |
188 | - offset = (stateAfterThis.reverseOffset - nextState.reverseOffset) + offsetSize | |
197 | +# nextState.offset - stateAfterThis.offset | |
198 | + offset = (stateAfterThis.reverseOffset - nextState.reverseOffset) + offsetSize + len(res) | |
189 | 199 | if offset >= 256: |
190 | - offset += 1 | |
200 | +# offset += 1 | |
191 | 201 | offsetSize += 1 |
192 | 202 | if offset >= 256 * 256: |
193 | - offset += 1 | |
203 | +# offset += 1 | |
194 | 204 | offsetSize += 1 |
195 | 205 | assert offset < 256 * 256 * 256 #TODO - przerobic na jakis porzadny wyjatek |
196 | 206 | |
197 | 207 | firstByte |= (32 * offsetSize) |
198 | 208 | |
199 | - res.append(firstByte) | |
209 | + transitionBytes.append(firstByte) | |
200 | 210 | if not popularLabel: |
201 | - res.append(label) | |
211 | + transitionBytes.append(label) | |
202 | 212 | # serialize offset in big-endian order |
203 | 213 | if offsetSize == 3: |
204 | - res.append((offset & 0xFF0000) >> 16) | |
214 | + transitionBytes.append((offset & 0xFF0000) >> 16) | |
205 | 215 | if offsetSize >= 2: |
206 | - res.append((offset & 0x00FF00) >> 8) | |
216 | + transitionBytes.append((offset & 0x00FF00) >> 8) | |
207 | 217 | if offsetSize >= 1: |
208 | - res.append(offset & 0x0000FF) | |
209 | - | |
218 | + transitionBytes.append(offset & 0x0000FF) | |
219 | + for b in reversed(transitionBytes): | |
220 | + res.insert(0, b) | |
221 | + logging.debug('inserted transition at beginning '+chr(label)+' -> '+str(offset)) | |
210 | 222 | return res |
... | ... |
nbproject/configurations.xml
... | ... | @@ -2,9 +2,7 @@ |
2 | 2 | <configurationDescriptor version="90"> |
3 | 3 | <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> |
4 | 4 | <df root="fsa" name="0"> |
5 | - <in>test_dict.cpp</in> | |
6 | 5 | <in>test_speed.cpp</in> |
7 | - <in>utils.hpp</in> | |
8 | 6 | </df> |
9 | 7 | <df root="morfeusz" name="1"> |
10 | 8 | <in>main.cpp</in> |
... | ... | @@ -38,7 +36,7 @@ |
38 | 36 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
39 | 37 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
40 | 38 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
41 | - <executablePath></executablePath> | |
39 | + <executablePath>build/fsa/test_dict</executablePath> | |
42 | 40 | </makeTool> |
43 | 41 | </makefileType> |
44 | 42 | <folder path="0"> |
... | ... | @@ -56,19 +54,21 @@ |
56 | 54 | </incDir> |
57 | 55 | </ccTool> |
58 | 56 | </folder> |
59 | - <item path="fsa/test_dict.cpp" ex="false" tool="1" flavor2="8"> | |
60 | - <ccTool> | |
61 | - </ccTool> | |
57 | + <item path="fsa/newmain.cpp" ex="false" tool="1" flavor2="0"> | |
62 | 58 | </item> |
63 | - <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="0"> | |
59 | + <item path="fsa/test_recognize.cpp" ex="false" tool="1" flavor2="0"> | |
64 | 60 | </item> |
65 | - <item path="fsa/utils.hpp" ex="false" tool="3" flavor2="0"> | |
61 | + <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="8"> | |
62 | + <ccTool> | |
63 | + </ccTool> | |
66 | 64 | </item> |
67 | 65 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
68 | 66 | <ccTool> |
69 | 67 | </ccTool> |
70 | 68 | </item> |
71 | - <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="0"> | |
69 | + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> | |
70 | + <ccTool> | |
71 | + </ccTool> | |
72 | 72 | </item> |
73 | 73 | </conf> |
74 | 74 | </confs> |
... | ... |