Blame view

morfeusz/case/CasePatternHelper.cpp 2.47 KB
Michał Lenart authored
1
2
3
4
5

#include "CasePatternHelper.hpp"

using namespace std;
Michał Lenart authored
6
7
namespace morfeusz {
Michał Lenart authored
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
/**
 * Decides whether the casing of an orthographic form is accepted by the case
 * data stored at the beginning of an interpretations group.
 *
 * The bytes at ig.ptr are read as follows: first a single "compression" byte;
 * then, unless that byte triggers one of the two shortcuts below, a one-byte
 * number of case patterns followed by that many serialized patterns (each one
 * consumed by deserializeOneCasePattern()).
 *
 * @param env       provides the charset converter used to decode the form and
 *                  the case converter used for toTitle() / toLower().
 * @param orthStart first byte of the encoded orthographic form.
 * @param orthEnd   one past the last byte of the encoded orthographic form.
 * @param ig        interpretations group; only ig.ptr is read here.
 * @return true when
 *           - this helper is not case sensitive, or
 *           - isOrthOnlyLower(compressionByte) holds, or
 *           - isOrthOnlyTitle(compressionByte) holds and the first code point
 *             of the form equals its own title-case mapping, or
 *           - the group lists zero case patterns, or
 *           - the form differs from its lower-cased version and at least one
 *             listed pattern is accepted by checkCasePattern();
 *         false otherwise.
 *
 * Side effect: when the pattern list is non-empty, the member buffers
 * orthCodepoints and normalizedCodepoints are overwritten with the decoded
 * form and its lower-cased counterpart.
 */
bool CasePatternHelper::checkInterpsGroupOrthCasePatterns(
        const Environment& env,
        const char* orthStart,
        const char* orthEnd,
        const InterpsGroup& ig) const {
    const unsigned char* currPtr = ig.ptr;
    unsigned char compressionByte = *currPtr++;

    if (!this->caseSensitive || isOrthOnlyLower(compressionByte)) {
        return true;
    }

    if (isOrthOnlyTitle(compressionByte)) {
        // Only the first code point matters for the title-only shortcut.
        uint32_t cp = env.getCharsetConverter().next(orthStart, orthEnd);
        return cp == env.getCaseConverter().toTitle(cp);
    }

    unsigned char casePatternsNum = *currPtr++;
    if (casePatternsNum == 0) {
        return true;
    }

    // The decoded form does not depend on the pattern being tested, so it is
    // decoded and lower-cased once, up front, instead of once per pattern.
    // NOTE(review): this assumes next()/toLower() have no side effects beyond
    // advancing the pointer passed to next() - confirm against the converters.
    const char* currOrthPtr = orthStart;
    orthCodepoints.clear();
    normalizedCodepoints.clear();
    bool isDiff = false;
    while (currOrthPtr != orthEnd) {
        uint32_t codepoint = env.getCharsetConverter().next(currOrthPtr, orthEnd);
        uint32_t normalizedCodepoint = env.getCaseConverter().toLower(codepoint);
        orthCodepoints.push_back(codepoint);
        normalizedCodepoints.push_back(normalizedCodepoint);
        isDiff = isDiff || codepoint != normalizedCodepoint;
    }

    // A form identical to its lower-cased version can never satisfy a
    // non-empty pattern list, so there is no need to deserialize any pattern.
    if (!isDiff) {
        return false;
    }

    for (unsigned int i = 0; i < casePatternsNum; i++) {
        // deserializeOneCasePattern() advances currPtr to the next pattern.
        if (checkCasePattern(
                normalizedCodepoints,
                orthCodepoints,
                deserializeOneCasePattern(currPtr))) {
            return true;
        }
    }
    return false;
}

/**
 * Reads one serialized case pattern and advances ptr past it.
 *
 * Layout consumed from ptr:
 *   - 1 byte: pattern type;
 *   - LEMMA_ONLY_LOWER:   nothing more;
 *   - LEMMA_UPPER_PREFIX: 1 byte prefix length;
 *   - LEMMA_MIXED_CASE:   1 byte count, followed by that many 1-byte indices.
 *
 * @param ptr in/out cursor into the serialized data; on return it points just
 *            after the bytes consumed for this pattern.
 * @return a flag vector: empty for LEMMA_ONLY_LOWER (and for any unrecognized
 *         type), prefixLength true flags for LEMMA_UPPER_PREFIX, and for
 *         LEMMA_MIXED_CASE a vector of length (largest index + 1) that is true
 *         exactly at the listed indices.
 */
std::vector<bool> CasePatternHelper::deserializeOneCasePattern(const unsigned char*& ptr) {
    std::vector<bool> res;
    const uint8_t casePatternType = *ptr++;
    switch (casePatternType) {
        case LEMMA_ONLY_LOWER:
            break;
        case LEMMA_UPPER_PREFIX: {
            const uint8_t prefixLength = *ptr++;
            res.resize(prefixLength, true);
            break;
        }
        case LEMMA_MIXED_CASE: {
            const uint8_t patternLength = *ptr++;
            for (unsigned int i = 0; i < patternLength; i++) {
                const std::vector<bool>::size_type idx = *ptr++;
                // Grow only: an unconditional resize would shrink the vector
                // (and drop flags already set) if an index is smaller than a
                // previously read one.
                if (idx >= res.size()) {
                    res.resize(idx + 1, false);
                }
                res[idx] = true;
            }
            break;
        }
        default:
            // Unrecognized type byte: no further bytes are consumed and an
            // empty pattern is returned.
            break;
    }
    return res;
}
Michał Lenart authored
75
76

}