utf8.C
4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
/*******************************************************************/
/* */
/* File: utf8.C */
/* Author: Helmut Schmid */
/* Purpose: Conversion between integer symbol codes and UTF-8 */
/* Created: Mon Sep 5 17:49:16 2005 */
/* Modified: Thu Nov 27 15:24:42 2008 (schmid) */
/* */
/*******************************************************************/
#include "string.h"
#include "utf8.h"
/* Masks which keep only the N least significant bits of a byte */
const unsigned char get3LSbits = 0x07;  // 00000111
const unsigned char get4LSbits = 0x0F;  // 00001111
const unsigned char get5LSbits = 0x1F;  // 00011111
const unsigned char get6LSbits = 0x3F;  // 00111111

/* Bytes in which exactly the N most significant bits are set */
const unsigned char set1MSbits = 0x80;  // 10000000
const unsigned char set2MSbits = 0xC0;  // 11000000
const unsigned char set3MSbits = 0xE0;  // 11100000
const unsigned char set4MSbits = 0xF0;  // 11110000
/*******************************************************************/
/* */
/* int2utf8 */
/* */
/*******************************************************************/
/*
 * Encodes the integer code sym as a NUL-terminated UTF-8 byte sequence
 * of one to four bytes.
 *
 * The result is stored in a static buffer which is overwritten by the
 * next call, so the caller has to copy it if it is needed for longer.
 * Returns NULL if sym needs more than 21 bits and therefore does not
 * fit into four bytes.
 */
char *int2utf8( unsigned int sym )
{
  static unsigned char buffer[5];
  unsigned char lead_marker;  // length marker bits of the first byte
  int length;                 // number of bytes of the encoding

  if (sym < 0x80) {
    // 7 bits: the symbol is stored unchanged in a single byte
    buffer[0] = (unsigned char)sym;
    buffer[1] = 0;
    return (char*)buffer;
  }

  if (sym < 0x800) {          // 5+6 bits
    length = 2;
    lead_marker = set2MSbits;
  }
  else if (sym < 0x10000) {   // 4+6+6 bits
    length = 3;
    lead_marker = set3MSbits;
  }
  else if (sym < 0x200000) {  // 3+6+6+6 bits
    length = 4;
    lead_marker = set4MSbits;
  }
  else
    return NULL;

  buffer[length] = 0;

  // fill in the continuation bytes from last to first;
  // each of them carries the next 6 low-order bits
  for( int i=length-1; i>0; i-- ) {
    buffer[i] = (unsigned char)((sym & get6LSbits) | set1MSbits);
    sym >>= 6;
  }

  // the bits which are left over go into the first byte
  buffer[0] = (unsigned char)(sym | lead_marker);

  return (char*)buffer;
}
/*******************************************************************/
/* */
/* utf8toint */
/* */
/*******************************************************************/
unsigned int utf8toint( char **s )
{
int bytes_to_come;
unsigned int result=0;
unsigned char c=(unsigned char)**s;
if (c >= (unsigned char)set4MSbits) { // 1111xxxx
bytes_to_come = 3;
result = (result << 3) | (c & get3LSbits);
}
else if (c >= (unsigned char) set3MSbits) { // 1110xxxx
// start of a three-byte symbol
bytes_to_come = 2;
result = (result << 4) | (c & get4LSbits);
}
else if (c >= (unsigned char) set2MSbits) { // 1100xxxx
// start of a two-byte symbol
bytes_to_come = 1;
result = (result << 5) | (c & get5LSbits);
}
else if (c < (unsigned char) set1MSbits) { // 0100xxxx
// one-byte symbol
bytes_to_come = 0;
result = c;
}
else
return 0; // error
while (bytes_to_come > 0) {
bytes_to_come--;
(*s)++;
c = (unsigned char)**s;
if (c < (unsigned char) set2MSbits &&
c >= (unsigned char) set1MSbits) // 1000xxxx
{
result = (result << 6) | (c & get6LSbits);
}
else
return 0;
}
(*s)++;
return result;
}
/*******************************************************************/
/* */
/* utf8toint */
/* */
/*******************************************************************/
/*
 * Decodes a string which is expected to consist of exactly one UTF-8
 * encoded symbol and returns the integer code of the symbol.
 *
 * Returns 0 if the string is empty, if the symbol cannot be decoded,
 * or if further bytes follow after the first symbol.
 */
unsigned int utf8toint( char *s )
{
  // An empty string contains no symbol. It has to be handled here
  // because the decoder below consumes the terminating NUL byte, and
  // the check which follows would then read beyond the end of the string.
  if (*s == 0)
    return 0;

  unsigned int result = utf8toint( &s );
  if (*s == 0) // all bytes converted?
    return result;
  return 0;
}