Blame view

morfeusz/case/CasePatternHelper.cpp 2.98 KB
Michał Lenart authored
1
2
3
4
5

#include "CasePatternHelper.hpp"

using namespace std;
Michał Lenart authored
6
7
namespace morfeusz {
Michał Lenart authored
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
    /**
     * Checks whether the orthographic form [orthStart, orthEnd) satisfies the
     * case requirements stored at the beginning of an interpretations group.
     *
     * The first byte under ig.ptr is consumed as a header ("compression byte")
     * that selects one of three paths:
     *  - helper is not case-sensitive, or isOrthOnlyLower(header) holds:
     *    the form is always accepted;
     *  - isOrthOnlyTitle(header) holds: the first codepoint of the form must be
     *    unchanged by the case converter's toTitle();
     *  - otherwise: the bytes following the header are handed over to
     *    checkMultipleCasePatterns().
     *
     * @param env       supplies the charset converter and the case converter
     * @param orthStart first byte of the encoded orthographic form
     * @param orthEnd   one past the last byte of the encoded orthographic form
     * @param ig        interpretations group; only ig.ptr is read here
     * @return true when the form is acceptable for this group
     */
    bool CasePatternHelper::checkInterpsGroupOrthCasePatterns(
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const InterpsGroup& ig) const {
        const unsigned char* cursor = ig.ptr;
        unsigned char header = *cursor++;

        // Case-insensitive mode accepts everything.
        if (!this->caseSensitive) {
            return true;
        }
        if (isOrthOnlyLower(header)) {
            return true;
        }
        if (isOrthOnlyTitle(header)) {
            // Only the first codepoint is inspected on this path.
            uint32_t firstCodepoint = env.getCharsetConverter().next(orthStart, orthEnd);
            return firstCodepoint == env.getCaseConverter().toTitle(firstCodepoint);
        }
        // General case: explicit case patterns follow the header byte.
        return checkMultipleCasePatterns(env, orthStart, orthEnd, cursor);
    }
Michał Lenart authored
26
27
28
29
30
31
32

    bool CasePatternHelper::checkMultipleCasePatterns(
            const Environment& env,
            const char* orthStart,
            const char* orthEnd,
            const unsigned char* ptr) const {
        unsigned char casePatternsNum = *ptr++;
Michał Lenart authored
33
34
35
36
        if (casePatternsNum == 0) {
            return true;
        }
        else {
Michał Lenart authored
37
            const char* currOrthPtr = orthStart;
Michał Lenart authored
38
39
            orthCodepoints.resize(0);
            normalizedCodepoints.resize(0);
Michał Lenart authored
40
41
42
43
44
45
46
47
48
49
            bool isDiff = false;
            while (currOrthPtr != orthEnd) {
                uint32_t codepoint = env.getCharsetConverter().next(currOrthPtr, orthEnd);
                uint32_t normalizedCodepoint = env.getCaseConverter().toLower(codepoint);
                orthCodepoints.push_back(codepoint);
                normalizedCodepoints.push_back(normalizedCodepoint);
                isDiff = isDiff || codepoint != normalizedCodepoint;
            }
            if (!isDiff) {
                return false;
Michał Lenart authored
50
            }
Michał Lenart authored
51
52
53
54
55
56
57
58
59
            else
                for (unsigned int i = 0; i < casePatternsNum; i++) {
                    if (isDiff && checkCasePattern(
                            normalizedCodepoints,
                            orthCodepoints,
                            deserializeOneCasePattern(ptr))) {
                        return true;
                    }
                }
Michał Lenart authored
60
61
62
63
            return false;
        }
    }
Michał Lenart authored
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
    /**
     * Decodes a single serialized case pattern and advances ptr past it.
     *
     * The first byte selects the pattern type:
     *  - LEMMA_ONLY_LOWER:   no further bytes are read; the result is empty;
     *  - LEMMA_UPPER_PREFIX: one byte N follows; the result is N `true` flags;
     *  - LEMMA_MIXED_CASE:   one byte K follows, then K index bytes; the result
     *                        has `true` at every listed index and `false`
     *                        elsewhere, and is as long as the largest index + 1.
     * Any other type byte yields an empty pattern with nothing further read.
     *
     * @param ptr in/out cursor into the serialized data; on return it points
     *            just past the bytes consumed for this pattern
     * @return the decoded pattern as a vector of per-position flags
     */
    std::vector<bool> CasePatternHelper::deserializeOneCasePattern(const unsigned char*& ptr) {
        std::vector<bool> res;
        const uint8_t casePatternType = *ptr++;
        switch (casePatternType) {
            case LEMMA_ONLY_LOWER:
                break;
            case LEMMA_UPPER_PREFIX: {
                const uint8_t prefixLength = *ptr++;
                res.resize(prefixLength, true);
                break;
            }
            case LEMMA_MIXED_CASE: {
                const uint8_t patternLength = *ptr++;
                for (unsigned int i = 0; i < patternLength; i++) {
                    const std::vector<bool>::size_type idx = *ptr++;
                    // Only ever grow: an unconditional resize(idx + 1) would
                    // shrink the vector and drop already-set flags whenever an
                    // index is smaller than one seen earlier.
                    if (idx >= res.size()) {
                        res.resize(idx + 1, false);
                    }
                    res[idx] = true;
                }
                break;
            }
            default:
                // Unknown pattern type: return an empty pattern.
                break;
        }
        return res;
    }
Michał Lenart authored
87
88

}