Commit 0c8c369d5e9b7383af3f3369f822215e412d0d4b

Authored by Michał Lenart
1 parent ce75f5c3

- zaimplementowanie wersji ze zmienną długością offsetów

- dodanie prostego testu


git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@9 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

Too many changes to show.

To preserve performance only 12 of 13 files are displayed.

CMakeLists.txt
1   -# The name of our project is "HELLO". CMakeLists files in this project can
2   -# refer to the root source directory of the project as ${HELLO_SOURCE_DIR} and
3   -# to the root binary directory of the project as ${HELLO_BINARY_DIR}.
4   -cmake_minimum_required (VERSION 2.6)
  1 +
  2 +cmake_minimum_required (VERSION 2.8)
5 3 project (Morfeusz)
6 4  
7   -# Recurse into the "Hello" and "Demo" subdirectories. This does not actually
8   -# cause another cmake executable to run. The same process will walk through
9   -# the project's entire directory structure.
  5 +enable_testing()
  6 +
10 7 add_subdirectory (fsa)
11 8 add_subdirectory (morfeusz)
12 9  
  10 +file(COPY fsabuilder testfiles DESTINATION .)
  11 +
  12 +add_test (TestBuildFSA python fsabuilder/fsa/buildfsa.py -i testfiles/test.txt -o testfiles/test.fsa -t SPELL --input-format=PLAIN --output-format=BINARY)
  13 +add_test (TestRecognize fsa/test_recognize testfiles/test.fsa testfiles/test.txt)
... ...
fsa/CMakeLists.txt
1 1  
2   -add_executable (test_dict test_dict.cpp)
3 2 add_executable (test_speed test_speed.cpp)
4 3 add_executable (test_speed_profile test_speed.cpp)
5   -set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" )
  4 +add_executable (test_recognize test_recognize.cpp)
6 5 set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
7   -set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g -O2" )
  6 +set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" )
  7 +set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" )
... ...
fsa/_fsa_impl.hpp
... ... @@ -63,39 +63,38 @@ SimpleFSA<T>::~SimpleFSA() {
63 63 // }
64 64 //}
65 65  
66   -static inline const TransitionData* findTransition(const TransitionData* start, const TransitionData* end, const char c) {
67   - for (const TransitionData* td = start; td != end; td++) {
68   - if (td->label == c) {
69   - return td;
70   - }
71   - }
72   - return end;
73   -}
74   -
75 66 template <class T>
76 67 void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
77 68 // if (c <= 'z' && 'a' <= c)
78 69 // cerr << "NEXT " << c << " from " << state.getOffset() << endl;
79 70 // else
80 71 // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
  72 +// cerr << "NEXT" << endl;
81 73 const unsigned char* fromPointer = this->startPtr + state.getOffset();
82 74 int transitionsTableOffset = sizeof (StateData);
83 75 if (state.isAccepting()) {
84 76 transitionsTableOffset += state.getValueSize();
85 77 // cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
86 78 }
87   - const StateData stateData = *reinterpret_cast<const StateData*> (fromPointer);
88   - const TransitionData* transitionsStart = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset);
89   - const TransitionData* transitionsEnd = transitionsStart + stateData.transitionsNum;
90   - const TransitionData* foundTransition = findTransition(transitionsStart, transitionsEnd, c);
91   - if (foundTransition == transitionsEnd || foundTransition->label != c) {
  79 + StateData stateData = *(StateData*) (fromPointer);
  80 + TransitionData* foundTransition = (TransitionData*) (fromPointer + transitionsTableOffset);
  81 + bool found = false;
  82 + for (int i = 0; i < stateData.transitionsNum; i++, foundTransition++) {
  83 +// cerr << foundTransition->label << endl;
  84 + if (foundTransition->label == c) {
  85 + found = true;
  86 + break;
  87 + }
  88 + }
  89 +// const_cast<Counter*>(&counter)->increment(foundTransition - transitionsStart + 1);
  90 + if (!found) {
92 91 // cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl;
93 92 state.setNextAsSink();
94 93 }
95 94 else {
96 95 // cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl;
97 96 const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset;
98   - const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer);
  97 + StateData* nextStateData = (StateData*) (nextStatePointer);
99 98 if (nextStateData->accepting) {
100 99 // cerr << "ACCEPTING" << endl;
101 100 T object;
... ... @@ -115,8 +114,12 @@ bool FSA<T>::tryToRecognize(const char* input, T& value) const {
115 114 currState.proceedToNext(input[i]);
116 115 i++;
117 116 }
  117 + // input[i] == '\0'
  118 + currState.proceedToNext(0);
  119 +
118 120 if (currState.isAccepting()) {
119 121 value = currState.getValue();
  122 + cerr << "RECOGNIZED " << input << endl;
120 123 return true;
121 124 } else {
122 125 return false;
... ...
fsa/_vfsa_impl.hpp 0 → 100644
  1 +/*
  2 + * File: _vfsa_impl.hpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on October 29, 2013, 9:57 PM
  6 + */
  7 +
  8 +#ifndef _VFSA_IMPL_HPP
  9 +#define _VFSA_IMPL_HPP
  10 +
  11 +#include <algorithm>
  12 +#include <utility>
  13 +#include <iostream>
  14 +#include <netinet/in.h>
  15 +#include "fsa.hpp"
  16 +
  17 +using namespace std;
  18 +
  19 +#pragma pack(push) /* push current alignment to stack */
  20 +#pragma pack(1) /* set alignment to 1 byte boundary */
  21 +
  22 +struct VTransitionData {
  23 + unsigned label : 5;
  24 + unsigned offsetSize : 2;
  25 + unsigned last : 1;
  26 +};
  27 +
  28 +#pragma pack(pop) /* restore original alignment from stack */
  29 +
  30 +template <class T>
  31 +int FSAImpl<T>::getMagicNumberOffset() {
  32 + return 0;
  33 +}
  34 +
  35 +template <class T>
  36 +int FSAImpl<T>::getVersionNumOffset() {
  37 + return getMagicNumberOffset() + sizeof (MAGIC_NUMBER);
  38 +}
  39 +
  40 +template <class T>
  41 +int FSAImpl<T>::getPopularCharsOffset() {
  42 + return getVersionNumOffset() + sizeof (VERSION_NUM);
  43 +}
  44 +
  45 +template <class T>
  46 +int FSAImpl<T>::getInitialStateOffset() {
  47 + return getPopularCharsOffset() + POPULAR_CHARS_NUM;
  48 +}
  49 +
  50 +template <class T>
  51 +vector<unsigned char> FSAImpl<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
  52 + vector<unsigned char> res(256, FSAImpl<bool>::POPULAR_CHARS_NUM);
  53 + const unsigned char* popularChars = ptr + getPopularCharsOffset();
  54 + for (unsigned int i = 0; i < POPULAR_CHARS_NUM; i++) {
  55 + res[popularChars[i]] = i;
  56 + }
  57 + return res;
  58 +}
  59 +
  60 +template <class T>
  61 +FSAImpl<T>::FSAImpl(const unsigned char* ptr, const Deserializer<T>& deserializer)
  62 +: FSA<T>(ptr + getInitialStateOffset(), deserializer),
  63 +char2PopularCharIdx(initializeChar2PopularCharIdx(ptr)) {
  64 + uint32_t magicNumber = ntohl(*((uint32_t*) ptr + getMagicNumberOffset()));
  65 + if (magicNumber != MAGIC_NUMBER) {
  66 + throw FSAException("Invalid magic number");
  67 + }
  68 + unsigned char versionNum = *(ptr + getVersionNumOffset());
  69 + if (versionNum != VERSION_NUM) {
  70 + throw FSAException("Invalid version number");
  71 + }
  72 + cerr << "initial state offset " << getInitialStateOffset() << endl;
  73 +}
  74 +
  75 +template <class T>
  76 +FSAImpl<T>::~FSAImpl() {
  77 +
  78 +}
  79 +
  80 +template <class T>
  81 +void FSAImpl<T>::proceedToNext(const char c, State<T>& state) const {
  82 + // if (c <= 'z' && 'a' <= c)
  83 + // cerr << "NEXT " << c << " from " << state.getOffset() << endl;
  84 + // else
  85 + // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
  86 + const unsigned char* fromPointer = this->startPtr + state.getOffset();
  87 + int transitionsTableOffset = 0;
  88 + if (state.isAccepting()) {
  89 + transitionsTableOffset += state.getValueSize();
  90 + cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
  91 + }
  92 + const unsigned char* currPtr = fromPointer + transitionsTableOffset;
  93 + bool found = false;
  94 + bool failed = false;
  95 + unsigned char shortLabel = char2PopularCharIdx[(unsigned char) c];
  96 + cerr << "NEXT " << c << " " << (int) shortLabel << endl;
  97 + VTransitionData td;
  98 + while (!found && !failed) {
  99 + td = *((VTransitionData*) currPtr);
  100 + cerr << "transition at " << (currPtr - this->startPtr) << endl;
  101 + cerr << "short label: " << (int) td.label << endl;
  102 + if (td.label == shortLabel) {
  103 + if (td.label != POPULAR_CHARS_NUM) {
  104 + found = true;
  105 + currPtr++;
  106 + }
  107 + else {
  108 + currPtr++;
  109 + char realLabel = (char) *currPtr;
  110 + cerr << "full label: " << realLabel << endl;
  111 + if (realLabel != c) {
  112 + failed = td.last;
  113 + currPtr += td.offsetSize + 1;
  114 + } else {
  115 + found = true;
  116 + currPtr++;
  117 + }
  118 + }
  119 + } else if (td.last) {
  120 + cerr << "last" << endl;
  121 + failed = true;
  122 + } else {
  123 + if (td.label == POPULAR_CHARS_NUM) {
  124 + currPtr++;
  125 + }
  126 + currPtr += td.offsetSize + 1;
  127 + }
  128 + }
  129 +
  130 + if (found) {
  131 + // currPtr points at the offset
  132 + // or next state (iff offset==0)
  133 + int offsetFromHere = 0;
  134 + cerr << "offset size " << td.offsetSize << endl;
  135 + for (int i = 0; i < td.offsetSize; i++) {
  136 + offsetFromHere <<= 8;
  137 + cerr << "offset from here " << offsetFromHere << endl;
  138 + offsetFromHere += *currPtr;
  139 + if (i + 1 < td.offsetSize)
  140 + currPtr++;
  141 + cerr << "offset from here " << offsetFromHere << endl;
  142 + }
  143 + currPtr += offsetFromHere;
  144 + cerr << "offset " << currPtr - this->startPtr << endl;
  145 + bool accepting = c == '\0';
  146 + if (accepting) {
  147 + T value;
  148 + int valueSize = this->deserializer.deserialize(currPtr, value);
  149 + currPtr += valueSize;
  150 + state.setNext(currPtr - this->startPtr, value, valueSize);
  151 + } else {
  152 + state.setNext(currPtr - this->startPtr);
  153 + }
  154 + } else {
  155 + state.setNextAsSink();
  156 + }
  157 +}
  158 +
  159 +#endif /* _VFSA_IMPL_HPP */
  160 +
... ...
fsa/fsa.hpp
... ... @@ -9,13 +9,17 @@
9 9 #define FSA_HPP
10 10  
11 11 //#include <iostream>
12   -#include <cstring>
  12 +//#include <cstring>
13 13 #include <typeinfo>
14 14 #include <cassert>
  15 +#include <exception>
  16 +#include <string>
  17 +#include <vector>
15 18  
16 19 template <class T> class State;
17 20 template <class T> class FSA;
18 21 template <class T> class Deserializer;
  22 +class FSAException;
19 23  
20 24 template <class T>
21 25 class Deserializer {
... ... @@ -39,23 +43,36 @@ public:
39 43 * Returns number of bytes read or -1 on error.
40 44 */
41 45 int deserialize(const unsigned char* ptr, char*& text) const {
42   - text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
43   - return strlen(text) + 1;
  46 + // text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
  47 + // return strlen(text) + 1;
  48 + return 1;
44 49 }
45 50 };
46 51  
  52 +class Counter {
  53 +public:
  54 +
  55 + Counter() : count(0) {
  56 +
  57 + }
  58 +
  59 + void increment(const int n) {
  60 + count += n;
  61 + }
  62 + long long count;
  63 +};
  64 +
47 65 /**
48 66 * Finite state automaton.
49 67 */
50 68 template <class T>
51 69 class FSA {
52 70 public:
53   -
54 71 /**
55 72 * Get this automaton's initial state.
56 73 */
57 74 State<T> getInitialState() const;
58   -
  75 +
59 76 bool tryToRecognize(const char* input, T& value) const;
60 77  
61 78 virtual ~FSA() {
... ... @@ -81,10 +98,41 @@ class SimpleFSA : public FSA<T> {
81 98 public:
82 99 SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
83 100 virtual ~SimpleFSA();
  101 +
  102 + long long transitionsCount() {
  103 + return counter.count;
  104 + }
84 105 protected:
85 106 void proceedToNext(const char c, State<T>& state) const;
86 107 private:
  108 + Counter counter;
  109 +};
  110 +
  111 +template <class T>
  112 +class FSAImpl : public FSA<T> {
  113 +public:
  114 + FSAImpl(const unsigned char* ptr, const Deserializer<T>& deserializer);
  115 + virtual ~FSAImpl();
87 116  
  117 + long long transitionsCount() {
  118 + return counter.count;
  119 + }
  120 +
  121 + static const uint32_t MAGIC_NUMBER = 0x8fc2bc1b;
  122 + static const unsigned char VERSION_NUM = 1;
  123 + static const unsigned int POPULAR_CHARS_NUM = 31;
  124 +
  125 +protected:
  126 + void proceedToNext(const char c, State<T>& state) const;
  127 +private:
  128 + Counter counter;
  129 + const std::vector<unsigned char> char2PopularCharIdx;
  130 +
  131 + static int getMagicNumberOffset();
  132 + static int getVersionNumOffset();
  133 + static int getPopularCharsOffset();
  134 + static int getInitialStateOffset();
  135 + static std::vector<unsigned char> initializeChar2PopularCharIdx(const unsigned char* ptr);
88 136 };
89 137  
90 138 /**
... ... @@ -122,15 +170,15 @@ public:
122 170 * For non-accepting states is throws an exception.
123 171 */
124 172 unsigned int getValueSize() const;
125   -
  173 +
126 174 unsigned int getOffset() const;
127   -
  175 +
128 176 void setNext(const unsigned int offset);
129 177 void setNext(const unsigned int offset, const T& value, const unsigned int valueSize);
130 178 void setNextAsSink();
131   -
  179 +
132 180 State(const FSA<T>& fsa);
133   -
  181 +
134 182 virtual ~State();
135 183 private:
136 184 const FSA<T>& fsa;
... ... @@ -141,7 +189,19 @@ private:
141 189 int valueSize;
142 190 };
143 191  
  192 +class FSAException : public std::exception {
  193 +public:
  194 + FSAException(const char* what): msg(what) {}
  195 + virtual ~FSAException() throw() {}
  196 + virtual const char* what() const throw () {
  197 + return this->msg.c_str();
  198 + }
  199 +private:
  200 + const std::string msg;
  201 +};
  202 +
144 203 #include "_fsa_impl.hpp"
  204 +#include "_vfsa_impl.hpp"
145 205 #include "_state_impl.hpp"
146 206  
147 207 #endif /* FSA_HPP */
... ...
fsa/test_dict.cpp renamed to fsa/test_recognize.cpp
1 1 /*
2   - * File: test.cpp
  2 + * File: test_recognize.cpp
3 3 * Author: lennyn
4 4 *
5   - * Created on October 22, 2013, 2:11 PM
  5 + * Created on October 30, 2013, 5:26 PM
6 6 */
7 7  
8 8 #include <cstdlib>
9   -#include <iostream>
10   -#include <fstream>
11   -#include <string>
12 9 #include <sstream>
13   -#include <algorithm>
14   -#include <functional>
15   -#include <cctype>
16   -#include <locale>
17   -#include <vector>
18 10 #include "fsa.hpp"
19 11 #include "utils.hpp"
20 12  
... ... @@ -41,51 +33,34 @@ static inline string &rtrim(string &s) {
41 33 return s;
42 34 }
43 35  
44   -void testFSA(const FSA<char*>& fsa, const char* fname) {
  36 +void doTest(const FSA<char*>& fsa, const char* fname) {
45 37 ifstream ifs;
46 38 // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
47 39 ifs.open(fname, ios::binary);
48 40 string line;
49 41 while (getline(ifs, line)) {
50   - vector<string> split1(split(line, '\t'));
51   - string key = split1[0];
52   - key = "bijekcją";
53   - string value = split1[1];
54 42  
55   - for (unsigned int i = 0; i < key.length(); i++) {
56   - cout << (int) key[i] << " ";
57   - }
58   - cout << endl;
  43 + vector<string> splitVector(split(line, '\t'));
  44 + string key = splitVector[0];
  45 +
  46 + cerr << "test " << key << endl;
59 47  
60 48 char* value2;
61   - if (fsa.tryToRecognize(key.c_str(), value2)) {
62   - if (string(value) != string(value2)) {
63   - cout << "BAD INTERP " << key << " " << value << " != " << value2 << endl;
64   - }
65   - else {
66   - cout << "OK! " << key << " " << value << endl;
67   - }
68   - }
69   - else {
70   - cout << "MISS " << key << " " << value << " not recognized" << endl;
71   - }
  49 + validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize "+key);
72 50 }
73   - cout << ifs.good() << endl;
74   - cout << ifs.fail() << endl;
75   - cout << ifs.eof() << endl;
76   - cout << "done" << endl;
  51 +// validate(ifs.good(), "Something wrong with the input file");
  52 +// validate(!ifs.fail(), "Something wrong with the input file");
  53 + validate(ifs.eof(), "Failed to read the input file to the end");
77 54 }
78 55  
79   -/*
80   - *
81   - */
82 56 int main(int argc, char** argv) {
  57 + cerr << (int) ((unsigned char) -123) << endl;
83 58 validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
84 59 const unsigned char* fsaData = readFile(argv[1]);
85 60 StringDeserializer deserializer;
86   - SimpleFSA<char*> fsa(fsaData, deserializer);
87   - testFSA(fsa, argv[2]);
88   - cout << argc << endl;
  61 + FSAImpl<char*> fsa(fsaData, deserializer);
  62 + doTest(fsa, argv[2]);
  63 +// cout << argc << endl;
89 64 return 0;
90 65 }
91 66  
... ...
fsa/test_speed.cpp
... ... @@ -25,17 +25,23 @@ int main(int argc, char** argv) {
25 25 ifstream ifs;
26 26 // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
27 27 ifs.open(argv[2], ios::binary);
28   - string line;
29   - while (getline(ifs, line)) {
  28 + char line[65536];
  29 + int recognized = 0;
  30 + int unrecognized = 0;
  31 + while (ifs.getline(line, 65536, '\n')) {
30 32 char* val;
31 33 // cout << line << endl;
32   - if (fsa.tryToRecognize(line.c_str(), val)) {
33   -// printf("%s: *OK*\n", line.c_str());
  34 + if (fsa.tryToRecognize(line, val)) {
  35 +// printf("%s: *OK*\n", line);
  36 + recognized++;
34 37 }
35 38 else {
36   -// printf("%s: NOT FOUND\n", line.c_str());
  39 + unrecognized++;
  40 +// printf("%s: NOT FOUND\n", line);
37 41 }
38 42 }
  43 + cout << "recognized: " << recognized << endl;
  44 + cout << "unrecognized: " << unrecognized << endl;
  45 + cout << "total: " << (recognized + unrecognized) << endl;
39 46 return 0;
40 47 }
41   -
... ...
fsa/utils.hpp
... ... @@ -10,6 +10,10 @@
10 10  
11 11 #include <iostream>
12 12 #include <string>
  13 +#include <fstream>
  14 +#include <vector>
  15 +
  16 +using namespace std;
13 17  
14 18 void validate(const bool cond, const std::string& msg) {
15 19 if (!cond) {
... ...
fsabuilder/fsa/buildfsa.py
... ... @@ -11,11 +11,11 @@ import codecs
11 11 import encode
12 12 import convertinput
13 13 from fsa import FSA
14   -from serializer import SimpleSerializer
  14 +from serializer import VLengthSerializer
15 15 from visualizer import Visualizer
16 16 from optparse import OptionParser
17 17  
18   -logging.basicConfig(level=logging.DEBUG)
  18 +logging.basicConfig(level=logging.INFO)
19 19  
20 20 class OutputFormat():
21 21 BINARY = 'BINARY'
... ... @@ -124,20 +124,23 @@ if __name__ == '__main__':
124 124 }[opts.inputFormat]
125 125  
126 126 logging.info('feeding FSA with data ...')
127   - fsa.feed(inputData, appendZero=True)
  127 + fsa.feed(inputData)
128 128 if opts.trainFile:
129 129 logging.info('training with '+opts.trainFile+' ...')
130 130 fsa.train(readTrainData(opts.trainFile))
131 131 logging.info('done training')
132   - serializer = SimpleSerializer(fsa)
  132 + serializer = VLengthSerializer(fsa)
133 133 logging.info('states num: '+str(fsa.getStatesNum()))
  134 + logging.info('transitions num: '+str(fsa.getTransitionsNum()))
134 135 logging.info('accepting states num: '+str(len([s for s in fsa.initialState.dfs(set()) if s.isAccepting()])))
135 136 logging.info('sink states num: '+str(len([s for s in fsa.initialState.dfs(set()) if len(s.transitionsMap.items()) == 0])))
136 137 {
137 138 OutputFormat.CPP: serializer.serialize2CppFile,
138 139 OutputFormat.BINARY: serializer.serialize2BinaryFile
139 140 }[opts.outputFormat](opts.outputFile)
140   -
  141 + logging.info('size: '+str(fsa.initialState.reverseOffset))
  142 +# for s in fsa.initialState.dfs(set()):
  143 +# logging.info(s.offset)
141 144 if opts.visualize:
142 145 Visualizer().visualize(fsa)
143 146  
... ...
fsabuilder/fsa/fsa.py
... ... @@ -59,6 +59,12 @@ class FSA(object):
59 59  
60 60 def getStatesNum(self):
61 61 return self.register.getStatesNum()
  62 +
  63 + def getTransitionsNum(self):
  64 + res = 0
  65 + for s in self.initialState.dfs(set()):
  66 + res += len(s.transitionsMap)
  67 + return res
62 68  
63 69 def _addSorted(self, encodedWord, data):
64 70 assert self.encodedPrevWord < encodedWord
... ...
fsabuilder/fsa/serializer.py
... ... @@ -46,10 +46,12 @@ class Serializer(object):
46 46 raise NotImplementedError('Not implemented')
47 47  
48 48 def fsa2bytearray(self):
  49 +
49 50 res = bytearray()
50 51 res.extend(self.serializePrologue())
51 52 self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
52   - for state in sorted(self.fsa.initialState.dfs(set()), key=state.offset):
  53 + logging.debug('SERIALIZE')
  54 + for state in sorted(self.fsa.initialState.dfs(set()), key=lambda s: s.offset):
53 55 res.extend(self.state2bytearray(state))
54 56 return res
55 57  
... ... @@ -111,14 +113,17 @@ class VLengthSerializer(Serializer):
111 113  
112 114 def __init__(self, fsa):
113 115 super(VLengthSerializer, self).__init__(fsa)
114   - self.statesTable = list(reversed(fsa.dfs(set())))
  116 + self.statesTable = list(reversed(list(fsa.initialState.dfs(set()))))
115 117 self.state2Index = dict([(state, idx) for (idx, state) in enumerate(self.statesTable)])
116 118  
117 119 # labels sorted by popularity
118   - self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda label, freq: (-freq, label))]
  120 + self.sortedLabels = [label for (label, freq) in sorted(self.fsa.label2Freq.iteritems(), key=lambda (label, freq): (-freq, label))]
  121 + remainingChars = [c for c in range(256) if not c in self.sortedLabels]
  122 + while len(self.sortedLabels) < 256:
  123 + self.sortedLabels.append(remainingChars.pop())
119 124  
120 125 # popular labels table
121   - self.label2Index = dict([(label, sortedLabels.index(label)) for label in sortedLabels][:31])
  126 + self.label2Index = dict([(label, self.sortedLabels.index(label)) for label in self.sortedLabels][:31])
122 127  
123 128 def serializePrologue(self):
124 129 res = bytearray()
... ... @@ -133,7 +138,8 @@ class VLengthSerializer(Serializer):
133 138 res.append(VLengthSerializer.VERSION)
134 139  
135 140 # serialize popular labels
136   - for label, freq in self.sortedLabels[:31]:
  141 + logging.debug(self.sortedLabels)
  142 + for label in self.sortedLabels[:31]:
137 143 res.append(label)
138 144  
139 145 return res
... ... @@ -159,20 +165,23 @@ class VLengthSerializer(Serializer):
159 165  
160 166 def _transitionsData2bytearray(self, state):
161 167 res = bytearray()
162   - transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, _): (-next.freq, -self.label2Count[label]))
  168 + transitions = sorted(state.transitionsMap.iteritems(), key=lambda (label, nextState): self.sortedLabels.index(label))
163 169 thisIdx = self.state2Index[state]
164   -
  170 + logging.debug('state '+str(state.offset))
165 171 if len(transitions) == 0:
166 172 assert state.isAccepting()
167 173 return bytearray()
168 174 else:
169   - offsetToStateAfterThis = 0
170 175 stateAfterThis = self.statesTable[thisIdx + 1]
171 176 for reversedN, (label, nextState) in enumerate(reversed(transitions)):
  177 + transitionBytes = bytearray()
172 178 assert nextState.reverseOffset is not None
  179 + assert stateAfterThis.reverseOffset is not None
  180 + logging.debug('next state reverse: '+str(nextState.reverseOffset))
  181 + logging.debug('after state reverse: '+str(stateAfterThis.reverseOffset))
173 182 n = len(transitions) - reversedN
174 183  
175   - popularLabel = self.label2Index[label] < 31
  184 + popularLabel = label in self.label2Index
176 185 firstByte = self.label2Index[label] if popularLabel else 31
177 186  
178 187 last = len(transitions) == n
... ... @@ -185,26 +194,29 @@ class VLengthSerializer(Serializer):
185 194 offset = 0
186 195 if not next:
187 196 offsetSize = 1
188   - offset = (stateAfterThis.reverseOffset - nextState.reverseOffset) + offsetSize
  197 +# nextState.offset - stateAfterThis.offset
  198 + offset = (stateAfterThis.reverseOffset - nextState.reverseOffset) + offsetSize + len(res)
189 199 if offset >= 256:
190   - offset += 1
  200 +# offset += 1
191 201 offsetSize += 1
192 202 if offset >= 256 * 256:
193   - offset += 1
  203 +# offset += 1
194 204 offsetSize += 1
195 205 assert offset < 256 * 256 * 256 #TODO - przerobic na jakis porzadny wyjatek
196 206  
197 207 firstByte |= (32 * offsetSize)
198 208  
199   - res.append(firstByte)
  209 + transitionBytes.append(firstByte)
200 210 if not popularLabel:
201   - res.append(label)
  211 + transitionBytes.append(label)
202 212 # serialize offset in big-endian order
203 213 if offsetSize == 3:
204   - res.append((offset & 0xFF0000) >> 16)
  214 + transitionBytes.append((offset & 0xFF0000) >> 16)
205 215 if offsetSize >= 2:
206   - res.append((offset & 0x00FF00) >> 8)
  216 + transitionBytes.append((offset & 0x00FF00) >> 8)
207 217 if offsetSize >= 1:
208   - res.append(offset & 0x0000FF)
209   -
  218 + transitionBytes.append(offset & 0x0000FF)
  219 + for b in reversed(transitionBytes):
  220 + res.insert(0, b)
  221 + logging.debug('inserted transition at beginning '+chr(label)+' -> '+str(offset))
210 222 return res
... ...
nbproject/configurations.xml
... ... @@ -2,9 +2,7 @@
2 2 <configurationDescriptor version="90">
3 3 <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
4 4 <df root="fsa" name="0">
5   - <in>test_dict.cpp</in>
6 5 <in>test_speed.cpp</in>
7   - <in>utils.hpp</in>
8 6 </df>
9 7 <df root="morfeusz" name="1">
10 8 <in>main.cpp</in>
... ... @@ -38,7 +36,7 @@
38 36 <buildCommandWorkingDir>build</buildCommandWorkingDir>
39 37 <buildCommand>${MAKE} -f Makefile</buildCommand>
40 38 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
41   - <executablePath></executablePath>
  39 + <executablePath>build/fsa/test_dict</executablePath>
42 40 </makeTool>
43 41 </makefileType>
44 42 <folder path="0">
... ... @@ -56,19 +54,21 @@
56 54 </incDir>
57 55 </ccTool>
58 56 </folder>
59   - <item path="fsa/test_dict.cpp" ex="false" tool="1" flavor2="8">
60   - <ccTool>
61   - </ccTool>
  57 + <item path="fsa/newmain.cpp" ex="false" tool="1" flavor2="0">
62 58 </item>
63   - <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="0">
  59 + <item path="fsa/test_recognize.cpp" ex="false" tool="1" flavor2="0">
64 60 </item>
65   - <item path="fsa/utils.hpp" ex="false" tool="3" flavor2="0">
  61 + <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="8">
  62 + <ccTool>
  63 + </ccTool>
66 64 </item>
67 65 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
68 66 <ccTool>
69 67 </ccTool>
70 68 </item>
71   - <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="0">
  69 + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
  70 + <ccTool>
  71 + </ccTool>
72 72 </item>
73 73 </conf>
74 74 </confs>
... ...