Environment.cpp 5.59 KB
/* 
 * File:   Environment.cpp
 * Author: mlenart
 * 
 * Created on 22 styczeń 2014, 12:08
 */

#include <vector>
#include <algorithm>
#include "Environment.hpp"
#include "deserialization/MorphDeserializer.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Analyzer.hpp"
#include "deserialization/morphInterps/InterpretedChunksDecoder4Generator.hpp"

namespace morfeusz {

static Deserializer<InterpsGroupsReader>& initializeDeserializer(MorfeuszProcessorType processorType) {
    static Deserializer<InterpsGroupsReader> *analyzerDeserializer
            = new MorphDeserializer();
    static Deserializer<InterpsGroupsReader> *generatorDeserializer
            = new MorphDeserializer();
    return *(processorType == ANALYZER ? analyzerDeserializer : generatorDeserializer);
}

static void deleteSegrulesFSAs(std::map<SegrulesOptions, SegrulesFSA*>& fsasMap) {
    for (
            std::map<SegrulesOptions, SegrulesFSA*>::iterator it = fsasMap.begin();
            it != fsasMap.end();
            ++it) {
        delete it->second;
    }
    fsasMap.clear();
}

Environment::Environment(
        Charset charset,
        MorfeuszProcessorType processorType,
        const unsigned char* fsaFileStartPtr)
: currentCharsetConverter(getCharsetConverter(charset)),
caseConverter(),
tagset(fsaFileStartPtr, currentCharsetConverter),
fsaFileStartPtr(fsaFileStartPtr),
fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
separatorsList(getSeparatorsList(fsaFileStartPtr)),
segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
currSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)),
currSegrulesFSA(getDefaultSegrulesFSA(segrulesFSAsMap, fsaFileStartPtr)),
isFromFile(false),
chunksDecoder(
processorType == ANALYZER
? (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Analyzer(*this)
: (InterpretedChunksDecoder*) new InterpretedChunksDecoder4Generator(*this)),
processorType(processorType),
casePatternHelper(new CasePatternHelper()) {
}

const CharsetConverter* Environment::getCharsetConverter(Charset charset) const {
    switch (charset) {
        case UTF8:
            return &UTF8CharsetConverter::getInstance();
        case ISO8859_2:
            return &ISO8859_2_CharsetConverter::getInstance();
        case CP1250:
            return &Windows_1250_CharsetConverter::getInstance();
        case CP852:
            return &CP852_CharsetConverter::getInstance();
        default:
            throw MorfeuszException("invalid charset");
    }
}

Environment::~Environment() {
    delete this->fsa;
    if (this->isFromFile) {
        deleteSegrulesFSAs(this->segrulesFSAsMap);
        delete this->fsaFileStartPtr;
    }
    delete this->chunksDecoder;
    delete this->casePatternHelper;
}

void Environment::setCharset(Charset charset) {
    this->currentCharsetConverter = this->getCharsetConverter(charset);
    this->tagset.setCharsetConverter(currentCharsetConverter);
}

const CharsetConverter& Environment::getCharsetConverter() const {
    return *this->currentCharsetConverter;
}

const CaseConverter& Environment::getCaseConverter() const {
    return this->caseConverter;
}

void Environment::setTagset(IdResolverImpl& tagset) {
    this->tagset = tagset;
    this->tagset.setCharsetConverter(currentCharsetConverter);
}

const IdResolverImpl& Environment::getTagset() const {
    return this->tagset;
}

void Environment::setDictionaryFile(const std::string& filename) {
    if (this->isFromFile) {
        delete this->fsa;
        deleteSegrulesFSAs(this->segrulesFSAsMap);
        delete this->fsaFileStartPtr;
    }
    this->fsaFileStartPtr = readFile<unsigned char>(filename.c_str());
    this->fsa = FSA< InterpsGroupsReader > ::getFSA(fsaFileStartPtr, initializeDeserializer(this->processorType));
    this->separatorsList = getSeparatorsList(fsaFileStartPtr);
    this->segrulesFSAsMap = createSegrulesFSAsMap(this->fsaFileStartPtr);
    this->currSegrulesFSA = getDefaultSegrulesFSA(this->segrulesFSAsMap, this->fsaFileStartPtr);
    this->isFromFile = true;
    this->tagset = IdResolverImpl(fsaFileStartPtr, currentCharsetConverter);
}

const SegrulesFSA& Environment::getCurrentSegrulesFSA() const {
    return *(this->currSegrulesFSA);
}

const FSAType& Environment::getFSA() const {
    return *(this->fsa);
}

const InterpretedChunksDecoder& Environment::getInterpretedChunksDecoder() const {
    return *(this->chunksDecoder);
}

void Environment::setSegrulesOption(const std::string& option, const std::string& value) {
    if (this->currSegrulesOptions.find(option) == this->currSegrulesOptions.end()) {
        throw MorfeuszException("Invalid segmentation option '"+option+"'");
    }
    SegrulesOptions prevOptions = this->currSegrulesOptions;
    this->currSegrulesOptions[option] = value;
    if (this->segrulesFSAsMap.find(this->currSegrulesOptions) == this->segrulesFSAsMap.end()) {
        this->currSegrulesOptions = prevOptions;
        throw MorfeuszException("Invalid '"+option+"' option value: '"+value+"'");
    }
    this->currSegrulesFSA = this->segrulesFSAsMap.find(this->currSegrulesOptions)->second;
}

MorfeuszProcessorType Environment::getProcessorType() const {
    return this->processorType;
}

void Environment::setCaseSensitive(bool caseSensitive) {
    this->casePatternHelper->setCaseSensitive(caseSensitive);
}

const CasePatternHelper& Environment::getCasePatternHelper() const {
    return *this->casePatternHelper;
}

bool Environment::isSeparator(uint32_t codepoint) const {
    return binary_search(
            this->separatorsList.begin(), 
            this->separatorsList.end(), 
            codepoint);
}

}