Blame view

morfeusz/fsa/cfsa1_impl.hpp 5.17 KB
Michał Lenart authored
1
/* 
Michał Lenart authored
2
3
 * File:   cfsa1_impl.hpp
 * Author: mlenart
Michał Lenart authored
4
 *
Michał Lenart authored
5
 * Created on November 4, 2013, 1:08 PM
Michał Lenart authored
6
7
 */
Michał Lenart authored
8
9
10
11
#ifndef CFSA1_IMPL_HPP
#define	CFSA1_IMPL_HPP

#include <vector>
Michał Lenart authored
12
#include <climits>
Michał Lenart authored
13
14

#include "fsa.hpp"
Michał Lenart authored
15
#include "../deserialization/deserializationUtils.hpp"
Michał Lenart authored
16
Michał Lenart authored
17
18
namespace morfeusz {
Michał Lenart authored
19
static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
Michał Lenart authored
20
21
//static const unsigned char CFSA1_ARRAY_FLAG = 64;
static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 127;
Michał Lenart authored
22
23
24
25

static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;

static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
Michał Lenart authored
26
27

struct StateData2 {
Michał Lenart authored
28
    unsigned int transitionsNum;
Michał Lenart authored
29
    //    bool isArray;
Michał Lenart authored
30
    bool isAccepting;
Michał Lenart authored
31
32
33
};

struct TransitionData2 {
Michał Lenart authored
34
35
    unsigned int offsetSize;
    unsigned int shortLabel;
Michał Lenart authored
36
37
};
Michał Lenart authored
38
39
40
static inline StateData2 readStateData(const unsigned char*& ptr) {
    StateData2 res;
    unsigned char firstByte = readInt8(ptr);
Michał Lenart authored
41
    //    res.isArray = firstByte & CFSA1_ARRAY_FLAG;
Michał Lenart authored
42
43
44
45
46
47
48
    res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
    res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
    if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
        res.transitionsNum = readInt8(ptr);
    }
    return res;
}
Michał Lenart authored
49
Michał Lenart authored
50
51
52
53
54
55
56
static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
    TransitionData2 res;
    unsigned char firstByte = readInt8(ptr);
    res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
    res.shortLabel = firstByte >> 2;
    return res;
}
Michał Lenart authored
57
58

template <class T>
Michał Lenart authored
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
std::vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
    return std::vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
}

template <class T>
void CompressedFSA1<T>::initializeInitialTransitions() {
    hasTransition = std::vector<char>(256, false);
    initialTransitions = std::vector< State<T> >(256);
    char c = CHAR_MIN;
    while (true) {
        State<T> state;
        doProceedToNext(c, state, true);
        int currIdx = static_cast<const unsigned char>(c);
        initialTransitions[currIdx] = state;
        if (c == CHAR_MAX) {
            return;
        }
        else {
            c++;
        }
    }
Michał Lenart authored
80
81
82
}

template <class T>
Michał Lenart authored
83
CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
Michał Lenart authored
84
: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
Michał Lenart authored
85
86
87
88
label2ShortLabel(initializeChar2PopularCharIdx(ptr)),
hasTransition(),
initialTransitions() {
    initializeInitialTransitions();
Michał Lenart authored
89
90
91
}

template <class T>
Michał Lenart authored
92
CompressedFSA1<T>::~CompressedFSA1() {
Michał Lenart authored
93
94
95
96

}

template <class T>
Michał Lenart authored
97
void CompressedFSA1<T>::reallyDoProceed(
Michał Lenart authored
98
99
        const unsigned char* statePtr,
        State<T>& state) const {
Michał Lenart authored
100
101
102
    const unsigned char* currPtr = statePtr;
    const StateData2 sd = readStateData(currPtr);
    if (sd.isAccepting) {
Michał Lenart authored
103
104
105
        T value;
        long valueSize = this->deserializer.deserialize(currPtr, value);
        state.setNext(statePtr - this->initialStatePtr, value, valueSize);
Michał Lenart authored
106
107
    }
    else {
Michał Lenart authored
108
        state.setNext(statePtr - this->initialStatePtr);
Michał Lenart authored
109
110
111
112
    }
}

template <class T>
Michał Lenart authored
113
void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
Michał Lenart authored
114
115
116
117
118
119
120
121
122
123
124
    doProceedToNext(c, state, false);
}

template <class T>
void CompressedFSA1<T>::doProceedToNext(const char c, State<T>& state, bool initial) const {
    if (state.offset == 0 && !initial) {
        int idx = static_cast<const unsigned char>(c);
        State<T> newState = this->initialTransitions[idx];
        state = newState;
        return;
    }
Michał Lenart authored
125
126
127
128
129
130
    const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
    unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
    const StateData2 sd = readStateData(currPtr);
    if (state.isAccepting()) {
        currPtr += state.getValueSize();
    }
Michał Lenart authored
131
132
    bool found = false;
    TransitionData2 td;
Michał Lenart authored
133
    for (unsigned int i = 0; i < sd.transitionsNum; i++) {
Michał Lenart authored
134
        td = readTransitionFirstByte(currPtr);
Michał Lenart authored
135
136
        if (td.shortLabel == shortLabel) {
            if (shortLabel == 0) {
Michał Lenart authored
137
                char label = static_cast<char> (readInt8(currPtr));
Michał Lenart authored
138
139
140
                if (label == c) {
                    found = true;
                    break;
Michał Lenart authored
141
142
                }
                else {
Michał Lenart authored
143
                    currPtr += td.offsetSize;
Michał Lenart authored
144
                }
Michał Lenart authored
145
146
            } 
            else {
Michał Lenart authored
147
148
149
                found = true;
                break;
            }
Michał Lenart authored
150
151
        }
        else {
Michał Lenart authored
152
153
154
            if (td.shortLabel == 0) {
                currPtr++;
            }
Michał Lenart authored
155
            currPtr += td.offsetSize;
Michał Lenart authored
156
157
158
159
        }
    }
    if (!found) {
        state.setNextAsSink();
Michał Lenart authored
160
161
    } 
    else {
Michał Lenart authored
162
        uint32_t offset;
Michał Lenart authored
163
164
        switch (td.offsetSize) {
            case 0:
Michał Lenart authored
165
                offset = 0;
Michał Lenart authored
166
167
                break;
            case 1:
Michał Lenart authored
168
                offset = readInt8(currPtr);
Michał Lenart authored
169
170
                break;
            case 2:
Michał Lenart authored
171
                offset = readInt16(currPtr);
Michał Lenart authored
172
173
                break;
            case 3:
Michał Lenart authored
174
                offset = readInt24(currPtr);
Michał Lenart authored
175
                break;
Michał Lenart authored
176
            default:
Michał Lenart authored
177
                std::cerr << "Offset size = " << td.offsetSize << std::endl;
Michał Lenart authored
178
                throw FileFormatException("Invalid file format");
Michał Lenart authored
179
        }
Michał Lenart authored
180
        currPtr += offset;
Michał Lenart authored
181
182
183
184
        reallyDoProceed(currPtr, state);
    }
}
Michał Lenart authored
185
186
}
Michał Lenart authored
187
#endif	/* CFSA1_IMPL_HPP */
Michał Lenart authored
188