CasePatternHelper.hpp 3.01 KB
/* 
 * File:   CasePatternHelper.hpp
 * Author: lennyn
 *
 * Created on April 4, 2014, 12:11 PM
 */

#ifndef CASEPATTERNHELPER_HPP
#define	CASEPATTERNHELPER_HPP

#include <vector>
#include "InterpsGroup.hpp"
#include "deserialization/morphInterps/compressionByteUtils.hpp"
#include "Environment.hpp"

namespace morfeusz {
    
class Environment;

/**
 * Utility class used for case-sensitive filtering of interpretations
 * (i.e. to filter out "berlin" and keep "Berlin").
 *
 * Case sensitivity is disabled by default; while it is disabled
 * checkCasePattern() accepts every word.
 */
class CasePatternHelper {
public:

    /**
     * Creates a helper that ignores letter case until
     * setCaseSensitive(true) is called.
     */
    CasePatternHelper() : caseSensitive(false) {

    }

    /**
     * Set if this case pattern helper cares about case-sensitivity
     * 
     * @param caseSensitive - true to enforce case patterns, false to accept any casing
     */
    void setCaseSensitive(bool caseSensitive) {
        this->caseSensitive = caseSensitive;
    }
    
    /**
     * Check if given word matches given case pattern.
     * 
     * A position marked true in the pattern must hold an uppercase character,
     * i.e. the original codepoint has to differ from its lowercased counterpart.
     * Positions marked false are not constrained.
     * 
     * @param lowercaseCodepoints - codepoints of checked word converted to lowercase
     * @param originalCodepoints - codepoints of checked word
     * @param casePattern - std::vector representing case pattern ( ie. [False, True] for "mBank")
     * @return - true iff word denoted by given codepoints matches given case pattern
     *           (always true when case sensitivity is switched off)
     */
    bool checkCasePattern(
            const std::vector<uint32_t>& lowercaseCodepoints,
            const std::vector<uint32_t>& originalCodepoints,
            const std::vector<bool>& casePattern) const {
        if (!this->caseSensitive) {
            return true;
        }

        typedef std::vector<bool>::size_type Index;

        // Only positions present in BOTH codepoint vectors can be compared safely.
        const Index comparableLength =
                lowercaseCodepoints.size() < originalCodepoints.size()
                ? lowercaseCodepoints.size()
                : originalCodepoints.size();

        for (Index i = 0; i < casePattern.size(); ++i) {
            if (!casePattern[i]) {
                continue;
            }
            // A required uppercase position that lies beyond the end of the word
            // cannot be satisfied; rejecting here also avoids reading out of bounds
            // when the pattern is longer than the word.
            if (i >= comparableLength
                    || lowercaseCodepoints[i] == originalCodepoints[i]) {
                return false;
            }
        }
        return true;
    }

    /**
     * Check if given word has a chance of matching any of case patterns in given interps group.
     * 
     * @param env - environment
     * @param orthStart - pointer to start of word
     * @param orthEnd - pointer to end of word
     * @param ig - interps group
     * @return - true iff word encoded from orthStart to orthEnd 
     *          matches at least one of the interp group's morph interpretation's case pattern.
     */
    bool checkInterpsGroupOrthCasePatterns(
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const InterpsGroup& ig) const;
    
    /**
     * Check the word against several case patterns.
     * 
     * NOTE(review): defined outside this header; presumably ptr points at
     * a serialized sequence of case patterns and the result is true when at
     * least one of them matches - confirm against the implementation.
     * 
     * @param env - environment
     * @param orthStart - pointer to start of word
     * @param orthEnd - pointer to end of word
     * @param ptr - pointer to the raw bytes to be examined
     * @return - result of the check (see note above)
     */
    bool checkMultipleCasePatterns(
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const unsigned char* ptr) const;

    /**
     * Deserializes case pattern encoded at given pointer.
     * 
     * @param ptr - pointer to the encoded pattern; taken by reference, so the
     *              implementation is able to move it (presumably past the
     *              bytes it consumed - confirm against the implementation)
     * @return - case pattern
     */
    static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr);
private:
    // When false, checkCasePattern() accepts every word.
    bool caseSensitive;
    
    // Mutable so that const member functions can write to them; presumably
    // reusable scratch buffers for decoded codepoints - confirm in the .cpp file.
    mutable std::vector<uint32_t> orthCodepoints;
    mutable std::vector<uint32_t> normalizedCodepoints;

    // Numeric tags for kinds of case patterns; presumably the type byte of the
    // serialized form read by deserializeOneCasePattern() - confirm in the .cpp file.
    static const uint8_t LEMMA_ONLY_LOWER = 0;
    static const uint8_t LEMMA_UPPER_PREFIX = 1;
    static const uint8_t LEMMA_MIXED_CASE = 2;
};

}

#endif	/* CASEPATTERNHELPER_HPP */