|
1
|
/*
|
|
2
3
|
* File: cfsa1_impl.hpp
* Author: mlenart
|
|
4
|
*
|
|
5
|
* Created on November 4, 2013, 1:08 PM
|
|
6
7
|
*/
|
|
8
9
10
11
|
#ifndef CFSA1_IMPL_HPP
#define CFSA1_IMPL_HPP
#include <vector>
|
|
12
|
#include <climits>
|
|
13
14
|
#include "fsa.hpp"
|
|
15
|
#include "../deserialization/deserializationUtils.hpp"
|
|
16
|
|
|
17
18
|
namespace morfeusz {
|
|
19
|
static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
|
|
20
21
|
//static const unsigned char CFSA1_ARRAY_FLAG = 64;
static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 127;
|
|
22
23
24
25
|
static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;
static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
|
|
26
27
|
struct StateData2 {
|
|
28
|
unsigned int transitionsNum;
|
|
29
|
// bool isArray;
|
|
30
|
bool isAccepting;
|
|
31
32
33
|
};
struct TransitionData2 {
|
|
34
35
|
unsigned int offsetSize;
unsigned int shortLabel;
|
|
36
37
|
};
|
|
38
39
40
|
static inline StateData2 readStateData(const unsigned char*& ptr) {
StateData2 res;
unsigned char firstByte = readInt8(ptr);
|
|
41
|
// res.isArray = firstByte & CFSA1_ARRAY_FLAG;
|
|
42
43
44
45
46
47
48
|
res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
res.transitionsNum = readInt8(ptr);
}
return res;
}
|
|
49
|
|
|
50
51
52
53
54
55
56
|
static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
TransitionData2 res;
unsigned char firstByte = readInt8(ptr);
res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
res.shortLabel = firstByte >> 2;
return res;
}
|
|
57
58
|
template <class T>
|
|
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
|
std::vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
return std::vector<unsigned char>(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
}
template <class T>
void CompressedFSA1<T>::initializeInitialTransitions() {
hasTransition = std::vector<char>(256, false);
initialTransitions = std::vector< State<T> >(256);
char c = CHAR_MIN;
while (true) {
State<T> state;
doProceedToNext(c, state, true);
int currIdx = static_cast<const unsigned char>(c);
initialTransitions[currIdx] = state;
if (c == CHAR_MAX) {
return;
}
else {
c++;
}
}
|
|
80
81
82
|
}
template <class T>
|
|
83
|
CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
|
|
84
|
: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
|
|
85
86
87
88
|
label2ShortLabel(initializeChar2PopularCharIdx(ptr)),
hasTransition(),
initialTransitions() {
initializeInitialTransitions();
|
|
89
90
91
|
}
template <class T>
|
|
92
|
CompressedFSA1<T>::~CompressedFSA1() {
|
|
93
94
95
96
|
}
template <class T>
|
|
97
|
void CompressedFSA1<T>::reallyDoProceed(
|
|
98
99
|
const unsigned char* statePtr,
State<T>& state) const {
|
|
100
101
102
|
const unsigned char* currPtr = statePtr;
const StateData2 sd = readStateData(currPtr);
if (sd.isAccepting) {
|
|
103
104
105
|
T value;
long valueSize = this->deserializer.deserialize(currPtr, value);
state.setNext(statePtr - this->initialStatePtr, value, valueSize);
|
|
106
107
|
}
else {
|
|
108
|
state.setNext(statePtr - this->initialStatePtr);
|
|
109
110
111
112
|
}
}
template <class T>
|
|
113
|
void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
|
|
114
115
116
117
118
119
120
121
122
123
124
|
doProceedToNext(c, state, false);
}
template <class T>
void CompressedFSA1<T>::doProceedToNext(const char c, State<T>& state, bool initial) const {
if (state.offset == 0 && !initial) {
int idx = static_cast<const unsigned char>(c);
State<T> newState = this->initialTransitions[idx];
state = newState;
return;
}
|
|
125
126
127
128
129
130
|
const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
const StateData2 sd = readStateData(currPtr);
if (state.isAccepting()) {
currPtr += state.getValueSize();
}
|
|
131
132
|
bool found = false;
TransitionData2 td;
|
|
133
|
for (unsigned int i = 0; i < sd.transitionsNum; i++) {
|
|
134
|
td = readTransitionFirstByte(currPtr);
|
|
135
136
|
if (td.shortLabel == shortLabel) {
if (shortLabel == 0) {
|
|
137
|
char label = static_cast<char> (readInt8(currPtr));
|
|
138
139
140
|
if (label == c) {
found = true;
break;
|
|
141
142
|
}
else {
|
|
143
|
currPtr += td.offsetSize;
|
|
144
|
}
|
|
145
146
|
}
else {
|
|
147
148
149
|
found = true;
break;
}
|
|
150
151
|
}
else {
|
|
152
153
154
|
if (td.shortLabel == 0) {
currPtr++;
}
|
|
155
|
currPtr += td.offsetSize;
|
|
156
157
158
159
|
}
}
if (!found) {
state.setNextAsSink();
|
|
160
161
|
}
else {
|
|
162
|
uint32_t offset;
|
|
163
164
|
switch (td.offsetSize) {
case 0:
|
|
165
|
offset = 0;
|
|
166
167
|
break;
case 1:
|
|
168
|
offset = readInt8(currPtr);
|
|
169
170
|
break;
case 2:
|
|
171
|
offset = readInt16(currPtr);
|
|
172
173
|
break;
case 3:
|
|
174
|
offset = readInt24(currPtr);
|
|
175
|
break;
|
|
176
|
default:
|
|
177
|
std::cerr << "Offset size = " << td.offsetSize << std::endl;
|
|
178
|
throw FileFormatException("Invalid file format");
|
|
179
|
}
|
|
180
|
currPtr += offset;
|
|
181
182
183
184
|
reallyDoProceed(currPtr, state);
}
}
|
|
185
186
|
}
|
|
187
|
#endif /* CFSA1_IMPL_HPP */
|
|
188
|
|