Blame view

morfeusz/fsa/fsa.hpp 5.75 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
10
11
/* 
 * File:   fsa.hh
 * Author: mlenart
 *
 * Created on October 17, 2013, 2:00 PM
 */

#ifndef FSA_HPP
#define FSA_HPP

//#include <iostream>
Michał Lenart authored
12
#include <cstring>
Michał Lenart authored
13
#include <cassert>
Michał Lenart authored
14
#include <typeinfo>
Michał Lenart authored
15
16
17
#include <exception>
#include <string>
#include <vector>
Michał Lenart authored
18
#include <cstdint>
Michał Lenart authored
19
20
21
22

template <class T> class State;
template <class T> class FSA;
template <class T> class Deserializer;
Michał Lenart authored
23
class FSAException;
Michał Lenart authored
24
25
26
27
28
29
30
31
32

template <class T>
class Deserializer {
public:

    /**
     * Deserialize object from ptr.
     * Returns number of bytes read or -1 on error.
     */
Michał Lenart authored
33
    virtual long deserialize(const unsigned char* ptr, T& object) const = 0;
Michał Lenart authored
34
35
36
37
38
39
40
41
42
43
44
45
};

class StringDeserializer : public Deserializer<char*> {
public:

    StringDeserializer() {
    }

    /**
     * Deserialize object from ptr.
     * Returns number of bytes read or -1 on error.
     */
Michał Lenart authored
46
    long deserialize(const unsigned char* ptr, char*& text) const {
Michał Lenart authored
47
48
49
        text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
        return strlen(text) + 1;
//        return 1;
Michał Lenart authored
50
51
52
    }
};
Michał Lenart authored
53
54
55
56
57
58
59
60
61
62
63
64
65
class Counter {
public:

    Counter() : count(0) {

    }

    void increment(const int n) {
        count += n;
    }
    long long count;
};
Michał Lenart authored
66
67
68
69
70
71
72
73
74
/**
 * Finite state automaton.
 */
template <class T>
class FSA {
public:
    /**
     * Get this automaton's initial state.
     */
Michał Lenart authored
75
    State<T> getInitialState() const;
Michał Lenart authored
76
Michał Lenart authored
77
    bool tryToRecognize(const char* input, T& value) const;
Michał Lenart authored
78
79
80

    virtual ~FSA() {
    }
Michał Lenart authored
81
82
83
84
85
86

    /**
     * Create an FSA object from pointer
     */
    static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
Michał Lenart authored
87
88
89
90
91
    /**
     * Create an FSA object from file
     */
    static FSA<T>* getFSA(const std::string& filename, const Deserializer<T>& deserializer);
Michał Lenart authored
92
93
protected:
Michał Lenart authored
94
95
96
97
98
    /**
     * Create FSA object for givent initial state pointer and deserializer
     */
    FSA(const unsigned char* initialStatePtr, const Deserializer<T>& deserializer);
Michał Lenart authored
99
100
101
102
    /**
     * Proceed to next state
     */
    virtual void proceedToNext(const char c, State<T>& state) const = 0;
Michał Lenart authored
103
    const unsigned char* initialStatePtr;
Michał Lenart authored
104
105
106
107
108
109
110
111
112
113
114
    const Deserializer<T>& deserializer;
    friend class State<T>;
private:
    //    FSA();
};

template <class T>
class SimpleFSA : public FSA<T> {
public:
    SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
    virtual ~SimpleFSA();
Michał Lenart authored
115
116
117
118

    long long transitionsCount() {
        return counter.count;
    }
Michał Lenart authored
119
120
121
protected:
    void proceedToNext(const char c, State<T>& state) const;
private:
Michał Lenart authored
122
123
124
125
    Counter counter;
};

template <class T>
Michał Lenart authored
126
class CompressedFSA1 : public FSA<T> {
Michał Lenart authored
127
public:
Michał Lenart authored
128
129
    CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer);
    virtual ~CompressedFSA1();
Michał Lenart authored
130
Michał Lenart authored
131
132
133
134
135
136
137
    long long transitionsCount() {
        return counter.count;
    }
protected:
    void proceedToNext(const char c, State<T>& state) const;
private:
    Counter counter;
Michał Lenart authored
138
    const std::vector<unsigned char> label2ShortLabel;
Michał Lenart authored
139
Michał Lenart authored
140
    static std::vector<unsigned char> initializeChar2PopularCharIdx(const unsigned char* ptr);
Michał Lenart authored
141
Michał Lenart authored
142
143
144
    void doProceedToNextByList(
        const char c,
        const unsigned char shortLabel,
Michał Lenart authored
145
146
147
148
149
        const unsigned char* ptr,
        const unsigned int transitionsNum,
        State<T>& state) const;
    void reallyDoProceed(
        const unsigned char* statePtr,
Michał Lenart authored
150
151
        State<T>& state) const;
    void doProceedToNextByArray(
Michał Lenart authored
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
        const unsigned char shortLabel,
        const uint32_t* ptr,
        State<T>& state) const;
};


template <class T>
class CompressedFSA2 : public FSA<T> {
public:
    CompressedFSA2(const unsigned char* ptr, const Deserializer<T>& deserializer);
    virtual ~CompressedFSA2();

    long long transitionsCount() {
        return counter.count;
    }
protected:
    void proceedToNext(const char c, State<T>& state) const;
private:
    Counter counter;

    static std::vector<unsigned char> initializeChar2PopularCharIdx(const unsigned char* ptr);

    void doProceedToNextByList(
        const char c,
        const unsigned char* ptr, 
Michał Lenart authored
177
178
179
        State<T>& state) const;
    void reallyDoProceed(
        const unsigned char* statePtr,
Michał Lenart authored
180
        const bool accepting,
Michał Lenart authored
181
        State<T>& state) const;
Michał Lenart authored
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
};

/**
 * A state in an FSA.
 */
template <class T>
class State {
public:

    /**
     * Is this a "sink" state - non-accepting state without outgoing transitions
     */
    bool isSink() const;

    /**
     * Is this an accepting state
     */
    bool isAccepting() const;

    /**
     * Get next state proceeding a transition for given character.
     */
    void proceedToNext(const char c);

    /**
     * Get value of this state.
     * Makes sense only for accepting states.
     * For non-accepting states is throws an exception.
     */
    T getValue() const;

    /**
     * Get the size (in bytes) of this state's value.
     * Makes sense only for accepting states.
     * For non-accepting states is throws an exception.
     */
Michał Lenart authored
218
    unsigned long getValueSize() const;
Michał Lenart authored
219
Michał Lenart authored
220
    unsigned long getOffset() const;
Michał Lenart authored
221
Michał Lenart authored
222
    void setNext(const unsigned long offset);
Michał Lenart authored
223
    void setNext(const unsigned long offset, const T& value, const unsigned long valueSize);
Michał Lenart authored
224
    void setNextAsSink();
Michał Lenart authored
225
Michał Lenart authored
226
    explicit State(const FSA<T>& fsa);
Michał Lenart authored
227
Michał Lenart authored
228
229
230
    virtual ~State();
private:
    const FSA<T>& fsa;
Michał Lenart authored
231
    unsigned long offset;
Michał Lenart authored
232
233
234
    bool accepting;
    bool sink;
    T value;
Michał Lenart authored
235
    long valueSize;
Michał Lenart authored
236
237
};
Michał Lenart authored
238
239
240
class FSAException : public std::exception {
public:
    FSAException(const char* what): msg(what) {}
Michał Lenart authored
241
    FSAException(const std::string& what): msg(what.c_str()) {}
Michał Lenart authored
242
243
244
245
246
247
248
249
    virtual ~FSAException() throw() {}
    virtual const char* what() const throw () {
        return this->msg.c_str();
    }
private:
    const std::string msg;
};
Michał Lenart authored
250
251
252
253
254
#include "state_impl.hpp"
#include "fsa_impl.hpp"
#include "simplefsa_impl.hpp"
#include "cfsa1_impl.hpp"
#include "cfsa2_impl.hpp"
Michał Lenart authored
255
256
257
258

#endif	/* FSA_HPP */