Blame view

morfeusz/fsa/cfsa1_impl.hpp 5.1 KB
Michał Lenart authored
1
/* 
Michał Lenart authored
2
3
 * File:   cfsa1_impl.hpp
 * Author: mlenart
Michał Lenart authored
4
 *
Michał Lenart authored
5
 * Created on November 4, 2013, 1:08 PM
Michał Lenart authored
6
7
 */
Michał Lenart authored
8
9
10
11
#ifndef CFSA1_IMPL_HPP
#define	CFSA1_IMPL_HPP

#include <vector>
Michał Lenart authored
12
#include <climits>
Michał Lenart authored
13
14

#include "fsa.hpp"
Michał Lenart authored
15
#include "../deserialization/deserializationUtils.hpp"
Michał Lenart authored
16
Michał Lenart authored
17
18
namespace morfeusz {
Michał Lenart authored
19
static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
Michał Lenart authored
20
21
//static const unsigned char CFSA1_ARRAY_FLAG = 64;
static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 127;
Michał Lenart authored
22
23
24
25

static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;

static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
Michał Lenart authored
26
27

struct StateData2 {
Michał Lenart authored
28
29
    unsigned int transitionsNum;
    bool isAccepting;
Michał Lenart authored
30
31
32
};

struct TransitionData2 {
Michał Lenart authored
33
34
    unsigned int offsetSize;
    unsigned int shortLabel;
Michał Lenart authored
35
36
};
Michał Lenart authored
37
38
39
40
41
42
43
44
45
46
static inline StateData2 readStateData(const unsigned char*& ptr) {
    StateData2 res;
    unsigned char firstByte = readInt8(ptr);
    res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
    res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
    if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
        res.transitionsNum = readInt8(ptr);
    }
    return res;
}
Michał Lenart authored
47
Michał Lenart authored
48
49
50
51
52
53
54
static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
    TransitionData2 res;
    unsigned char firstByte = readInt8(ptr);
    res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
    res.shortLabel = firstByte >> 2;
    return res;
}
Michał Lenart authored
55
56

template <class T>
Michał Lenart authored
57
std::vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
Michał Lenart authored
58
59
    std::vector<unsigned char> res(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
    return res;
Michał Lenart authored
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
}

template <class T>
void CompressedFSA1<T>::initializeInitialTransitions() {
    hasTransition = std::vector<char>(256, false);
    initialTransitions = std::vector< State<T> >(256);
    char c = CHAR_MIN;
    while (true) {
        State<T> state;
        doProceedToNext(c, state, true);
        int currIdx = static_cast<const unsigned char>(c);
        initialTransitions[currIdx] = state;
        if (c == CHAR_MAX) {
            return;
        }
        else {
            c++;
        }
    }
Michał Lenart authored
79
80
81
}

template <class T>
Michał Lenart authored
82
CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
Michał Lenart authored
83
: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
Michał Lenart authored
84
85
86
87
label2ShortLabel(initializeChar2PopularCharIdx(ptr)),
hasTransition(),
initialTransitions() {
    initializeInitialTransitions();
Michał Lenart authored
88
89
90
}

template <class T>
Michał Lenart authored
91
CompressedFSA1<T>::~CompressedFSA1() {
Michał Lenart authored
92
93
94
95

}

template <class T>
Michał Lenart authored
96
void CompressedFSA1<T>::reallyDoProceed(
Michał Lenart authored
97
98
        const unsigned char* statePtr,
        State<T>& state) const {
Michał Lenart authored
99
100
101
    const unsigned char* currPtr = statePtr;
    const StateData2 sd = readStateData(currPtr);
    if (sd.isAccepting) {
Michał Lenart authored
102
103
104
        T value;
        long valueSize = this->deserializer.deserialize(currPtr, value);
        state.setNext(statePtr - this->initialStatePtr, value, valueSize);
Michał Lenart authored
105
106
    }
    else {
Michał Lenart authored
107
        state.setNext(statePtr - this->initialStatePtr);
Michał Lenart authored
108
109
110
111
    }
}

template <class T>
Michał Lenart authored
112
void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
Michał Lenart authored
113
114
115
116
117
118
119
120
121
122
123
    doProceedToNext(c, state, false);
}

template <class T>
void CompressedFSA1<T>::doProceedToNext(const char c, State<T>& state, bool initial) const {
    if (state.offset == 0 && !initial) {
        int idx = static_cast<const unsigned char>(c);
        State<T> newState = this->initialTransitions[idx];
        state = newState;
        return;
    }
Michał Lenart authored
124
125
126
127
128
129
    const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
    unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
    const StateData2 sd = readStateData(currPtr);
    if (state.isAccepting()) {
        currPtr += state.getValueSize();
    }
Michał Lenart authored
130
131
    bool found = false;
    TransitionData2 td;
Michał Lenart authored
132
    for (unsigned int i = 0; i < sd.transitionsNum; i++) {
Michał Lenart authored
133
        td = readTransitionFirstByte(currPtr);
Michał Lenart authored
134
135
        if (td.shortLabel == shortLabel) {
            if (shortLabel == 0) {
Michał Lenart authored
136
                char label = static_cast<char> (readInt8(currPtr));
Michał Lenart authored
137
138
139
                if (label == c) {
                    found = true;
                    break;
Michał Lenart authored
140
141
                }
                else {
Michał Lenart authored
142
                    currPtr += td.offsetSize;
Michał Lenart authored
143
                }
Michał Lenart authored
144
145
            } 
            else {
Michał Lenart authored
146
147
148
                found = true;
                break;
            }
Michał Lenart authored
149
150
        }
        else {
Michał Lenart authored
151
152
153
            if (td.shortLabel == 0) {
                currPtr++;
            }
Michał Lenart authored
154
            currPtr += td.offsetSize;
Michał Lenart authored
155
156
157
158
        }
    }
    if (!found) {
        state.setNextAsSink();
Michał Lenart authored
159
160
    } 
    else {
Michał Lenart authored
161
        uint32_t offset;
Michał Lenart authored
162
163
        switch (td.offsetSize) {
            case 0:
Michał Lenart authored
164
                offset = 0;
Michał Lenart authored
165
166
                break;
            case 1:
Michał Lenart authored
167
                offset = readInt8(currPtr);
Michał Lenart authored
168
169
                break;
            case 2:
Michał Lenart authored
170
                offset = readInt16(currPtr);
Michał Lenart authored
171
172
                break;
            case 3:
Michał Lenart authored
173
                offset = readInt24(currPtr);
Michał Lenart authored
174
                break;
Michał Lenart authored
175
            default:
Michał Lenart authored
176
                std::cerr << "Offset size = " << td.offsetSize << std::endl;
Michał Lenart authored
177
                throw FileFormatException("Invalid file format");
Michał Lenart authored
178
        }
Michał Lenart authored
179
        currPtr += offset;
Michał Lenart authored
180
181
182
183
        reallyDoProceed(currPtr, state);
    }
}
Michał Lenart authored
184
185
}
Michał Lenart authored
186
#endif	/* CFSA1_IMPL_HPP */
Michał Lenart authored
187