Blame view

morfeusz/case/CasePatternHelper.hpp 3.01 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
10
11
/* 
 * File:   CasePatternHelper.hpp
 * Author: lennyn
 *
 * Created on April 4, 2014, 12:11 PM
 */

#ifndef CASEPATTERNHELPER_HPP
#define	CASEPATTERNHELPER_HPP

#include <vector>
Michał Lenart authored
12
#include "InterpsGroup.hpp"
Michał Lenart authored
13
#include "deserialization/morphInterps/compressionByteUtils.hpp"
Michał Lenart authored
14
15
#include "Environment.hpp"
Michał Lenart authored
16
17
namespace morfeusz {
Michał Lenart authored
18
// Forward declaration: this header only refers to Environment through
// const references in member function declarations.
class Environment;
Michał Lenart authored
19
Michał Lenart authored
20
21
22
23
/**
 * Utility class used for case-sensitive filtering of interpretations
 * (ie. to filter out "berlin" and keep "Berlin")
 */
Michał Lenart authored
24
25
26
27
28
29
30
class CasePatternHelper {
public:

    CasePatternHelper() : caseSensitive(false) {

    }
Michał Lenart authored
31
32
33
34
35
    /**
     * Set if this case pattern helper cares about case-sensitivity
     * 
     * @param caseSensitive
     */
Michał Lenart authored
36
37
38
    void setCaseSensitive(bool caseSensitive) {
        this->caseSensitive = caseSensitive;
    }
Michał Lenart authored
39
40
41
42
43
44

    /**
     * Check if given word matches given case pattern
     * 
     * @param lowercaseCodepoints - codepoints of checked word converter to lowercase
     * @param originalCodepoints - codepoints of checked word
Michał Lenart authored
45
     * @param casePattern - std::vector representing case pattern ( ie. [False, True] for "mBank")
Michał Lenart authored
46
47
     * @return - true iff word denoted by given codepoints matches given case pattern
     */
Michał Lenart authored
48
    bool checkCasePattern(
Michał Lenart authored
49
50
51
            const std::vector<uint32_t>& lowercaseCodepoints,
            const std::vector<uint32_t>& originalCodepoints,
            const std::vector<bool>& casePattern) const {
Michał Lenart authored
52
53
        if (this->caseSensitive) {
            for (unsigned int i = 0; i < casePattern.size(); i++) {
Michał Lenart authored
54
                if (casePattern[i] && lowercaseCodepoints[i] == originalCodepoints[i]) {
Michał Lenart authored
55
56
57
58
59
60
                    return false;
                }
            }
        }
        return true;
    }
Michał Lenart authored
61
Michał Lenart authored
62
63
64
65
66
67
68
69
70
71
    /**
     * Check if given word has a chance of matching any of case patterns in given interps group.
     * 
     * @param env - environment
     * @param orthStart - pointer to start of word
     * @param orthEnd - pointer to end of word
     * @param ig - interps group
     * @return - true iff word encoded from orthStart to orthEnd 
     *          matches at least one of the interp group's morph interpretation's case pattern.
     */
Michał Lenart authored
72
    bool checkInterpsGroupOrthCasePatterns(
Michał Lenart authored
73
74
75
76
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const InterpsGroup& ig) const;
Michał Lenart authored
77
78
79
80
81
82

    bool checkMultipleCasePatterns(
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const unsigned char* ptr) const;
Michał Lenart authored
83
Michał Lenart authored
84
85
86
87
88
89
    /**
     * Deserializes case pattern encoded at given pointer.
     * 
     * @param ptr
     * @return - case pattern
     */
Michał Lenart authored
90
    static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr);
Michał Lenart authored
91
92
private:
    bool caseSensitive;
Michał Lenart authored
93
Michał Lenart authored
94
95
    mutable std::vector<uint32_t> orthCodepoints;
    mutable std::vector<uint32_t> normalizedCodepoints;
Michał Lenart authored
96
97
98
99

    static const uint8_t LEMMA_ONLY_LOWER = 0;
    static const uint8_t LEMMA_UPPER_PREFIX = 1;
    static const uint8_t LEMMA_MIXED_CASE = 2;
Michał Lenart authored
100
101
};
Michał Lenart authored
102
103
}
Michał Lenart authored
104
105
#endif	/* CASEPATTERNHELPER_HPP */