|
1
2
3
4
5
6
7
8
9
10
11
|
/*
* File: CasePatternHelper.hpp
* Author: lennyn
*
* Created on April 4, 2014, 12:11 PM
*/
#ifndef CASEPATTERNHELPER_HPP
#define CASEPATTERNHELPER_HPP
#include <vector>
|
|
12
|
#include "InterpsGroup.hpp"
|
|
13
|
#include "deserialization/morphInterps/compressionByteUtils.hpp"
|
|
14
15
|
#include "Environment.hpp"
|
|
16
17
|
namespace morfeusz {
|
|
18
|
class Environment;
|
|
19
|
|
|
20
21
22
23
|
/**
* Utility class used for case-sensitive filtering of interpretations
* (ie. to filter out "berlin" and keep "Berlin")
*/
|
|
24
25
26
27
28
29
30
|
class CasePatternHelper {
public:
CasePatternHelper() : caseSensitive(false) {
}
|
|
31
32
33
34
35
|
/**
* Set if this case pattern helper cares about case-sensitivity
*
* @param caseSensitive
*/
|
|
36
37
38
|
void setCaseSensitive(bool caseSensitive) {
this->caseSensitive = caseSensitive;
}
|
|
39
40
41
42
43
44
|
/**
* Check if given word matches given case pattern
*
* @param lowercaseCodepoints - codepoints of checked word converter to lowercase
* @param originalCodepoints - codepoints of checked word
|
|
45
|
* @param casePattern - std::vector representing case pattern ( ie. [False, True] for "mBank")
|
|
46
47
|
* @return - true iff word denoted by given codepoints matches given case pattern
*/
|
|
48
|
bool checkCasePattern(
|
|
49
50
51
|
const std::vector<uint32_t>& lowercaseCodepoints,
const std::vector<uint32_t>& originalCodepoints,
const std::vector<bool>& casePattern) const {
|
|
52
53
|
if (this->caseSensitive) {
for (unsigned int i = 0; i < casePattern.size(); i++) {
|
|
54
|
if (casePattern[i] && lowercaseCodepoints[i] == originalCodepoints[i]) {
|
|
55
56
57
58
59
60
|
return false;
}
}
}
return true;
}
|
|
61
|
|
|
62
63
64
65
66
67
68
69
70
71
|
/**
* Check if given word has a chance of matching any of case patterns in given interps group.
*
* @param env - environment
* @param orthStart - pointer to start of word
* @param orthEnd - pointer to end of word
* @param ig - interps group
* @return - true iff word encoded from orthStart to orthEnd
* matches at least one of the interp group's morph interpretation's case pattern.
*/
|
|
72
|
bool checkInterpsGroupOrthCasePatterns(
|
|
73
74
75
76
|
const Environment& env,
const char* orthStart,
const char* orthEnd,
const InterpsGroup& ig) const;
|
|
77
78
79
80
81
82
|
bool checkMultipleCasePatterns(
const Environment& env,
const char* orthStart,
const char* orthEnd,
const unsigned char* ptr) const;
|
|
83
|
|
|
84
85
86
87
88
89
|
/**
* Deserializes case pattern encoded at given pointer.
*
* @param ptr
* @return - case pattern
*/
|
|
90
|
static std::vector<bool> deserializeOneCasePattern(const unsigned char*& ptr);
|
|
91
92
|
private:
bool caseSensitive;
|
|
93
|
|
|
94
95
|
mutable std::vector<uint32_t> orthCodepoints;
mutable std::vector<uint32_t> normalizedCodepoints;
|
|
96
97
98
99
|
static const uint8_t LEMMA_ONLY_LOWER = 0;
static const uint8_t LEMMA_UPPER_PREFIX = 1;
static const uint8_t LEMMA_MIXED_CASE = 2;
|
|
100
101
|
};
|
|
102
103
|
}
|
|
104
105
|
#endif /* CASEPATTERNHELPER_HPP */
|