Blame view

morfeusz/charset/charset_utils.hpp 1.87 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
10
/* 
 * File:   charset_utils.hpp
 * Author: lennyn
 *
 * Created on November 15, 2013, 1:57 PM
 */

#ifndef CHARSET_UTILS_HPP
#define	CHARSET_UTILS_HPP
Michał Lenart authored
11
#include <string>
Michał Lenart authored
12
#include <vector>
Michał Lenart authored
13
#include <algorithm>
Michał Lenart authored
14
#include "CharsetConverter.hpp"
Michał Lenart authored
15
Michał Lenart authored
16
17
namespace morfeusz {
Michał Lenart authored
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
/**
 * Builds the lookup table of codepoints treated as whitespace.
 *
 * The returned vector is indexed by codepoint; a non-zero entry means the
 * codepoint is considered whitespace. The table spans U+0000 through U+3000
 * inclusive, so it holds 0x3001 entries.
 *
 * NOTE(review): U+202F NARROW NO-BREAK SPACE is not flagged here - confirm
 * whether that omission is intentional.
 *
 * @return vector of 0x3001 flags, one per codepoint in [U+0000, U+3000]
 */
static inline std::vector<char> initializeWhitespaces() {
    // The highest flagged codepoint is 0x3000, so the table needs 0x3000 + 1
    // slots; with only 0x3000 slots the last assignment writes past the end.
    const std::vector<char>::size_type tableSize = 0x3000 + 1;
    std::vector<char> res(tableSize, false);
    res[0x0000] = true; // NULL
    res[0x0009] = true; // CHARACTER TABULATION
    res[0x000A] = true; // LINE FEED (LF)
    res[0x000B] = true; // LINE TABULATION
    res[0x000C] = true; // FORM FEED (FF)
    res[0x000D] = true; // CARRIAGE RETURN (CR)
    res[0x001C] = true; // INFORMATION SEPARATOR FOUR
    res[0x001D] = true; // INFORMATION SEPARATOR THREE
    res[0x001E] = true; // INFORMATION SEPARATOR TWO
    res[0x001F] = true; // INFORMATION SEPARATOR ONE
    res[0x0020] = true; // SPACE
    res[0x0085] = true; // NEXT LINE (NEL)
    res[0x00A0] = true; // NON-BREAKING SPACE
    res[0x1680] = true; // OGHAM SPACE MARK
    res[0x180E] = true; // MONGOLIAN VOWEL SEPARATOR
    res[0x2000] = true; // EN QUAD
    res[0x2001] = true; // EM QUAD
    res[0x2002] = true; // EN SPACE
    res[0x2003] = true; // EM SPACE
    res[0x2004] = true; // THREE-PER-EM SPACE
    res[0x2005] = true; // FOUR-PER-EM SPACE
    res[0x2006] = true; // SIX-PER-EM SPACE
    res[0x2007] = true; // FIGURE SPACE
    res[0x2008] = true; // PUNCTUATION SPACE
    res[0x2009] = true; // THIN SPACE
    res[0x200A] = true; // HAIR SPACE
    res[0x2028] = true; // LINE SEPARATOR
    res[0x2029] = true; // PARAGRAPH SEPARATOR
    res[0x205F] = true; // MEDIUM MATHEMATICAL SPACE
    res[0x3000] = true; // IDEOGRAPHIC SPACE
    return res;
}
Michał Lenart authored
53
inline bool isWhitespace(uint32_t codepoint) {
Michał Lenart authored
54
55
    static std::vector<char> whitespaces(initializeWhitespaces());
    return codepoint < whitespaces.size() && whitespaces[codepoint];
Michał Lenart authored
56
}
Michał Lenart authored
57
Michał Lenart authored
58
59
}
Michał Lenart authored
60
61
#endif	/* CHARSET_UTILS_HPP */