Blame view

morfeusz/charset/utf8sqlite.hpp 3.81 KB
Michał Lenart authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
/*
 * File:   utf8sqlite.hpp
 * Author: mlenart
 *
 * Created on 24 czerwiec 2014, 15:19
 */

#ifndef UTF8SQLITE_HPP
#define	UTF8SQLITE_HPP

#include <inttypes.h>
#include <iostream>

namespace morfeusz {
    // UTF-8

    /* Helper for reading consecutive UTF-8 characters, borrowed from SQLite. */

    /*
     ** This table maps from the first byte of a UTF-8 character to the number
     ** of trailing bytes expected. A value '255' indicates that the table key
     ** is not a legal first byte for a UTF-8 character.
     **
     ** Note: the lead bytes 0xC0/0xC1 (overlong two-byte forms) and 0xF5..0xF7
     ** (values above U+10FFFF) are accepted by this table.
     */
    static const unsigned char xtra_utf8_bytes[256] = {
        /* 0xxxxxxx */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

        /* 10wwwwww */
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255,

        /* 110yyyyy */
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

        /* 1110zzzz */
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,

        /* 11110yyy */
        3, 3, 3, 3, 3, 3, 3, 3, 255, 255, 255, 255, 255, 255, 255, 255,
    };

    /*
     ** This table maps from the number of trailing bytes in a UTF-8 character
     ** to an integer constant that is effectively calculated for each character
     ** read by a naive implementation of a UTF-8 character reader. The code
     ** in the READ_UTF8 macro explains things best.
     **
     ** Each entry is the sum of the marker bits (lead-byte prefix and the 0x80
     ** prefix of every trailing byte) at the positions they end up in after
     ** the shift-and-add accumulation; subtracting it leaves only the payload.
     */
    static const int xtra_utf8_bits[4] = {
        0,
        12416, /* (0xC0 << 6) + (0x80) */
        925824, /* (0xE0 << 12) + (0x80 << 6) + (0x80) */
        63447168 /* (0xF0 << 18) + (0x80 << 12) + (0x80 << 6) + 0x80 */
    };

    /*
     ** Unchecked decoder: reads one character from zIn into c and advances zIn.
     ** It takes no end pointer, so it performs no bounds checking, and it does
     ** not verify that trailing bytes are continuation bytes. The switch cases
     ** fall through on purpose. Prefer readUTF8() below when the end of the
     ** buffer is known.
     */
#define READ_UTF8(zIn, c) { \
  int xtra;                                            \
  c = *(zIn)++;                                        \
  xtra = xtra_utf8_bytes[c];                           \
  switch( xtra ){                                      \
    case 255: c = (int)0xFFFD; break;                  \
    case 3: c = (c<<6) + *(zIn)++;                     \
    case 2: c = (c<<6) + *(zIn)++;                     \
    case 1: c = (c<<6) + *(zIn)++;                     \
    c -= xtra_utf8_bits[xtra];                         \
  }                                                    \
}

    /**
     * Decodes one UTF-8 encoded character and advances the read position.
     *
     * Precondition: zIn != zInEnd. The first byte is read unconditionally.
     *
     * @param zIn    current read position; advanced past every byte consumed
     * @param zInEnd one past the last readable byte; trailing bytes are never
     *               read at or beyond this position
     * @return the decoded code point, or U+FFFD (after printing a warning to
     *         std::cerr) when the sequence is malformed
     *
     * A sequence is treated as malformed when:
     *  - the first byte is not a legal lead byte according to xtra_utf8_bytes,
     *  - the input ends before all expected trailing bytes were read, or
     *  - an expected trailing byte is not of the form 10xxxxxx. The offending
     *    byte is left unconsumed so the next call can decode it on its own.
     *
     * A well-formed encoding of U+FFFD itself is returned without a warning.
     * Overlong forms, surrogates and values above U+10FFFF are not rejected.
     */
    inline uint32_t readUTF8(const unsigned char*& zIn, const unsigned char* zInEnd) {
        const uint32_t replacementChar = 0xFFFD;

        uint32_t cp = *zIn++;
        const int xtra = xtra_utf8_bytes[cp];

        bool malformed = (xtra == 255);
        if (!malformed) {
            for (int remaining = xtra; remaining > 0; --remaining) {
                // Stop on truncated input or on a byte that is not 10xxxxxx;
                // accumulating such a byte would produce a garbage value and
                // could make the subtraction below wrap around.
                if (zIn == zInEnd || (*zIn & 0xC0) != 0x80) {
                    malformed = true;
                    break;
                }
                cp = (cp << 6) + *zIn++;
            }
        }

        if (malformed) {
            std::cerr << "replacing invalid utf8 sequence with replacement character" << std::endl;
            return replacementChar;
        }

        // Strip the marker bits that were accumulated together with the payload.
        return cp - xtra_utf8_bits[xtra];
    }
}

    /* end of the code borrowed from SQLite */

#endif	/* UTF8SQLITE_HPP */