cfsa1_impl.hpp 5.1 KB
/* 
 * File:   cfsa1_impl.hpp
 * Author: mlenart
 *
 * Created on November 4, 2013, 1:08 PM
 */

#ifndef CFSA1_IMPL_HPP
#define	CFSA1_IMPL_HPP

#include <vector>
#include <climits>

#include "fsa.hpp"
#include "../deserialization/deserializationUtils.hpp"

namespace morfeusz {

static const unsigned char CFSA1_ACCEPTING_FLAG = 128;
//static const unsigned char CFSA1_ARRAY_FLAG = 64;
static const unsigned char CFSA1_TRANSITIONS_NUM_MASK = 127;

static const unsigned char CFSA1_OFFSET_SIZE_MASK = 3;

static const unsigned int CFSA1_INITIAL_ARRAY_STATE_OFFSET = 257;

struct StateData2 {
    unsigned int transitionsNum;
    bool isAccepting;
};

struct TransitionData2 {
    unsigned int offsetSize;
    unsigned int shortLabel;
};

static inline StateData2 readStateData(const unsigned char*& ptr) {
    StateData2 res;
    unsigned char firstByte = readInt8(ptr);
    res.isAccepting = firstByte & CFSA1_ACCEPTING_FLAG;
    res.transitionsNum = firstByte & CFSA1_TRANSITIONS_NUM_MASK;
    if (res.transitionsNum == CFSA1_TRANSITIONS_NUM_MASK) {
        res.transitionsNum = readInt8(ptr);
    }
    return res;
}

static inline TransitionData2 readTransitionFirstByte(const unsigned char*& ptr) {
    TransitionData2 res;
    unsigned char firstByte = readInt8(ptr);
    res.offsetSize = firstByte & CFSA1_OFFSET_SIZE_MASK;
    res.shortLabel = firstByte >> 2;
    return res;
}

template <class T>
std::vector<unsigned char> CompressedFSA1<T>::initializeChar2PopularCharIdx(const unsigned char* ptr) {
    std::vector<unsigned char> res(ptr, ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET);
    return res;
}

template <class T>
void CompressedFSA1<T>::initializeInitialTransitions() {
    hasTransition = std::vector<char>(256, false);
    initialTransitions = std::vector< State<T> >(256);
    char c = CHAR_MIN;
    while (true) {
        State<T> state;
        doProceedToNext(c, state, true);
        int currIdx = static_cast<const unsigned char>(c);
        initialTransitions[currIdx] = state;
        if (c == CHAR_MAX) {
            return;
        }
        else {
            c++;
        }
    }
}

template <class T>
CompressedFSA1<T>::CompressedFSA1(const unsigned char* ptr, const Deserializer<T>& deserializer)
: FSA<T>(ptr + CFSA1_INITIAL_ARRAY_STATE_OFFSET, deserializer),
label2ShortLabel(initializeChar2PopularCharIdx(ptr)),
hasTransition(),
initialTransitions() {
    initializeInitialTransitions();
}

template <class T>
CompressedFSA1<T>::~CompressedFSA1() {

}

template <class T>
void CompressedFSA1<T>::reallyDoProceed(
        const unsigned char* statePtr,
        State<T>& state) const {
    const unsigned char* currPtr = statePtr;
    const StateData2 sd = readStateData(currPtr);
    if (sd.isAccepting) {
        T value;
        long valueSize = this->deserializer.deserialize(currPtr, value);
        state.setNext(statePtr - this->initialStatePtr, value, valueSize);
    }
    else {
        state.setNext(statePtr - this->initialStatePtr);
    }
}

template <class T>
void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const {
    doProceedToNext(c, state, false);
}

template <class T>
void CompressedFSA1<T>::doProceedToNext(const char c, State<T>& state, bool initial) const {
    if (state.offset == 0 && !initial) {
        int idx = static_cast<const unsigned char>(c);
        State<T> newState = this->initialTransitions[idx];
        state = newState;
        return;
    }
    const unsigned char* currPtr = this->initialStatePtr + state.getOffset();
    unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c];
    const StateData2 sd = readStateData(currPtr);
    if (state.isAccepting()) {
        currPtr += state.getValueSize();
    }
    bool found = false;
    TransitionData2 td;
    for (unsigned int i = 0; i < sd.transitionsNum; i++) {
        td = readTransitionFirstByte(currPtr);
        if (td.shortLabel == shortLabel) {
            if (shortLabel == 0) {
                char label = static_cast<char> (readInt8(currPtr));
                if (label == c) {
                    found = true;
                    break;
                }
                else {
                    currPtr += td.offsetSize;
                }
            } 
            else {
                found = true;
                break;
            }
        }
        else {
            if (td.shortLabel == 0) {
                currPtr++;
            }
            currPtr += td.offsetSize;
        }
    }
    if (!found) {
        state.setNextAsSink();
    } 
    else {
        uint32_t offset;
        switch (td.offsetSize) {
            case 0:
                offset = 0;
                break;
            case 1:
                offset = readInt8(currPtr);
                break;
            case 2:
                offset = readInt16(currPtr);
                break;
            case 3:
                offset = readInt24(currPtr);
                break;
            default:
                std::cerr << "Offset size = " << td.offsetSize << std::endl;
                throw FileFormatException("Invalid file format");
        }
        currPtr += offset;
        reallyDoProceed(currPtr, state);
    }
}

}

#endif	/* CFSA1_IMPL_HPP */