|
1
|
/*
|
|
2
3
|
* File: cfsa1_impl.hpp
* Author: mlenart
|
|
4
|
*
|
|
5
|
* Created on November 4, 2013, 1:08 PM
|
|
6
7
|
*/
|
|
8
9
10
11
|
#ifndef CFSA1_IMPL_HPP
#define CFSA1_IMPL_HPP
#include <vector>
|
|
12
|
#include <climits>
|
|
13
14
|
#include "fsa.hpp"
|
|
15
|
#include "../deserialization/deserializationUtils.hpp"
|
|
16
|
|
|
17
18
|
namespace morfeusz {
|
|
19
|
static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
|
|
20
21
|
//static const unsigned char CFSA1_ARRAY_FLAG = 64;
static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 127;
|
|
22
23
24
25
|
static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;
static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;
|
|
26
27
|
struct StateData2 {
|
|
28
29
|
unsigned int transitionsNum;
bool isAccepting;
|
|
30
31
32
|
};
struct TransitionData2 {
|
|
33
34
|
unsigned int offsetSize;
unsigned int shortLabel;
|
|
35
36
|
};
|
|
37
38
39
40
41
42
43
44
45
46
|
static inline StateData2 readStateData(const unsigned char*& ptr) {
StateData2 res;
unsigned char firstByte = readInt8(ptr);
res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
res.transitionsNum = readInt8(ptr);
}
return res;
}
|
|
47
|
|
|
48
49
50
51
52
53
54
|
static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
TransitionData2 res;
unsigned char firstByte = readInt8(ptr);
res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
res.shortLabel = firstByte >> 2;
return res;
}
|
|
55
56
|
template <class T>
|
|
57
|
std::vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
|
|
58
59
|
std::vector<unsigned char> res(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
return res;
|
|
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
|
}
template <class T>
void CompressedFSA1<T>::initializeInitialTransitions() {
hasTransition = std::vector<char>(256, false);
initialTransitions = std::vector< State<T> >(256);
char c = CHAR_MIN;
while (true) {
State<T> state;
doProceedToNext(c, state, true);
int currIdx = static_cast<const unsigned char>(c);
initialTransitions[currIdx] = state;
if (c == CHAR_MAX) {
return;
}
else {
c++;
}
}
|
|
79
80
81
|
}
template <class T>
|
|
82
|
CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
|
|
83
|
: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
|
|
84
85
86
87
|
label2ShortLabel(initializeChar2PopularCharIdx(ptr)),
hasTransition(),
initialTransitions() {
initializeInitialTransitions();
|
|
88
89
90
|
}
template <class T>
|
|
91
|
CompressedFSA1<T>::~CompressedFSA1() {
|
|
92
93
94
95
|
}
template <class T>
|
|
96
|
void CompressedFSA1<T>::reallyDoProceed(
|
|
97
98
|
const unsigned char* statePtr,
State<T>& state) const {
|
|
99
100
101
|
const unsigned char* currPtr = statePtr;
const StateData2 sd = readStateData(currPtr);
if (sd.isAccepting) {
|
|
102
103
104
|
T value;
long valueSize = this->deserializer.deserialize(currPtr, value);
state.setNext(statePtr - this->initialStatePtr, value, valueSize);
|
|
105
106
|
}
else {
|
|
107
|
state.setNext(statePtr - this->initialStatePtr);
|
|
108
109
110
111
|
}
}
template <class T>
|
|
112
|
void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
|
|
113
114
115
116
117
118
119
120
121
122
123
|
doProceedToNext(c, state, false);
}
template <class T>
void CompressedFSA1<T>::doProceedToNext(const char c, State<T>& state, bool initial) const {
if (state.offset == 0 && !initial) {
int idx = static_cast<const unsigned char>(c);
State<T> newState = this->initialTransitions[idx];
state = newState;
return;
}
|
|
124
125
126
127
128
129
|
const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
const StateData2 sd = readStateData(currPtr);
if (state.isAccepting()) {
currPtr += state.getValueSize();
}
|
|
130
131
|
bool found = false;
TransitionData2 td;
|
|
132
|
for (unsigned int i = 0; i < sd.transitionsNum; i++) {
|
|
133
|
td = readTransitionFirstByte(currPtr);
|
|
134
135
|
if (td.shortLabel == shortLabel) {
if (shortLabel == 0) {
|
|
136
|
char label = static_cast<char> (readInt8(currPtr));
|
|
137
138
139
|
if (label == c) {
found = true;
break;
|
|
140
141
|
}
else {
|
|
142
|
currPtr += td.offsetSize;
|
|
143
|
}
|
|
144
145
|
}
else {
|
|
146
147
148
|
found = true;
break;
}
|
|
149
150
|
}
else {
|
|
151
152
153
|
if (td.shortLabel == 0) {
currPtr++;
}
|
|
154
|
currPtr += td.offsetSize;
|
|
155
156
157
158
|
}
}
if (!found) {
state.setNextAsSink();
|
|
159
160
|
}
else {
|
|
161
|
uint32_t offset;
|
|
162
163
|
switch (td.offsetSize) {
case 0:
|
|
164
|
offset = 0;
|
|
165
166
|
break;
case 1:
|
|
167
|
offset = readInt8(currPtr);
|
|
168
169
|
break;
case 2:
|
|
170
|
offset = readInt16(currPtr);
|
|
171
172
|
break;
case 3:
|
|
173
|
offset = readInt24(currPtr);
|
|
174
|
break;
|
|
175
|
default:
|
|
176
|
std::cerr << "Offset size = " << td.offsetSize << std::endl;
|
|
177
|
throw FileFormatException("Invalid file format");
|
|
178
|
}
|
|
179
|
currPtr += offset;
|
|
180
181
182
183
|
reallyDoProceed(currPtr, state);
}
}
|
|
184
185
|
}
|
|
186
|
#endif /* CFSA1_IMPL_HPP */
|
|
187
|
|