|
1
2
3
4
5
|
#include "CasePatternHelper.hpp"
using namespace std;
|
|
6
7
|
namespace morfeusz {
|
|
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
/**
 * Decides whether the orthographic form [orthStart, orthEnd) satisfies the
 * case requirements stored at the beginning of an interpretations group.
 *
 * The first byte of ig.ptr is consumed as a header and classified with
 * isOrthOnlyLower() / isOrthOnlyTitle().
 *
 * @param env        supplies the charset converter and the case converter
 * @param orthStart  first byte of the encoded word
 * @param orthEnd    one past the last byte of the encoded word
 * @param ig         interpretations group whose data starts with the header byte
 * @return true when case sensitivity is switched off, when the header is
 *         classified as "only lower", when the header is classified as
 *         "only title" and the first codepoint is unchanged by toTitle(),
 *         or otherwise whatever checkMultipleCasePatterns() reports for the
 *         bytes that follow the header.
 */
bool CasePatternHelper::checkInterpsGroupOrthCasePatterns(
        const Environment& env,
        const char* orthStart,
        const char* orthEnd,
        const InterpsGroup& ig) const {
    // The header byte is always read first; the remaining bytes (if any)
    // are only interpreted by checkMultipleCasePatterns().
    const unsigned char* cursor = ig.ptr;
    const unsigned char header = *cursor++;

    if (!this->caseSensitive) {
        return true;
    }
    if (isOrthOnlyLower(header)) {
        return true;
    }
    if (!isOrthOnlyTitle(header)) {
        return checkMultipleCasePatterns(env, orthStart, orthEnd, cursor);
    }

    // Title-only: only the first codepoint of the word is inspected.
    // orthStart is a by-value copy, so advancing it here is local.
    const uint32_t firstCodepoint = env.getCharsetConverter().next(orthStart, orthEnd);
    return firstCodepoint == env.getCaseConverter().toTitle(firstCodepoint);
}
|
|
26
27
28
29
30
31
32
|
bool CasePatternHelper::checkMultipleCasePatterns(
const Environment& env,
const char* orthStart,
const char* orthEnd,
const unsigned char* ptr) const {
unsigned char casePatternsNum = *ptr++;
|
|
33
34
35
36
|
if (casePatternsNum == 0) {
return true;
}
else {
|
|
37
|
const char* currOrthPtr = orthStart;
|
|
38
39
|
orthCodepoints.resize(0);
normalizedCodepoints.resize(0);
|
|
40
41
42
43
44
45
46
47
48
49
|
bool isDiff = false;
while (currOrthPtr != orthEnd) {
uint32_t codepoint = env.getCharsetConverter().next(currOrthPtr, orthEnd);
uint32_t normalizedCodepoint = env.getCaseConverter().toLower(codepoint);
orthCodepoints.push_back(codepoint);
normalizedCodepoints.push_back(normalizedCodepoint);
isDiff = isDiff || codepoint != normalizedCodepoint;
}
if (!isDiff) {
return false;
|
|
50
|
}
|
|
51
52
53
54
55
56
57
58
59
|
else
for (unsigned int i = 0; i < casePatternsNum; i++) {
if (isDiff && checkCasePattern(
normalizedCodepoints,
orthCodepoints,
deserializeOneCasePattern(ptr))) {
return true;
}
}
|
|
60
61
62
63
|
return false;
}
}
|
|
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
|
/**
 * Reads one serialized case pattern and advances ptr past it.
 *
 * Layout read from ptr, selected by the first (type) byte:
 *  - LEMMA_ONLY_LOWER:   no further bytes; the result is empty.
 *  - LEMMA_UPPER_PREFIX: one length byte N; the result is N `true` flags.
 *  - LEMMA_MIXED_CASE:   one count byte K followed by K index bytes; the
 *                        result has `true` at every listed index and `false`
 *                        elsewhere, and is as long as the largest index + 1.
 *  - any other type:     only the type byte is consumed; the result is empty.
 *
 * @param ptr  in/out cursor into the serialized data; on return it points
 *             just past the bytes consumed for this pattern
 * @return the decoded pattern as a vector of per-position flags
 */
std::vector<bool> CasePatternHelper::deserializeOneCasePattern(const unsigned char*& ptr) {
    std::vector<bool> res;
    const uint8_t casePatternType = *ptr++;
    switch (casePatternType) {
        case LEMMA_ONLY_LOWER:
            break;
        case LEMMA_UPPER_PREFIX: {
            const uint8_t prefixLength = *ptr++;
            res.resize(prefixLength, true);
            break;
        }
        case LEMMA_MIXED_CASE: {
            const uint8_t patternLength = *ptr++;
            for (unsigned int i = 0; i < patternLength; i++) {
                const unsigned int idx = *ptr++;
                // Only ever grow: an unconditional resize(idx + 1) would
                // shrink the vector when an index smaller than a previous
                // one appears, silently dropping flags already set.
                if (idx >= res.size()) {
                    res.resize(idx + 1, false);
                }
                res[idx] = true;
            }
            break;
        }
        default:
            // Unknown pattern type: nothing more is consumed.
            break;
    }
    return res;
}
|
|
87
88
|
}
|