CharsetConverter.cpp
3.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#include <vector>
#include <iterator>
#include <algorithm>
#include <inttypes.h>
#include <iostream>
#include "deserialization/endianness.hpp"
#include "utf8.h"
#include "CharsetConverter.hpp"
#include "conversion_tables.hpp"
using namespace std;
// Byte (0xF7) emitted by the one-byte converters for a code point that has no
// mapping in the target charset; it is also the filler for unmapped slots of
// the reverse (code point -> byte) lookup table.
const char DEFAULT_UNDEFINED_CHAR = static_cast<char>(0xF7);
string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
string res;
for (unsigned int i = 0; i < codepoints.size(); i++) {
this->append(codepoints[i], res);
}
return res;
}
// Out-of-line destructor with an empty body: nothing is released here.
CharsetConverter::~CharsetConverter() {}
//uint32_t CharsetConverter::peek(const char* it, const char* end) const {
// return this->next(it, end);
//}
// Advances `it` past the bytes of a malformed UTF-8 sequence, stopping at
// `end` or at the first position from which a valid code point can be decoded.
//
// it  - in/out cursor into the input; left at the first decodable position
//       (or at `end` when none remains).
// end - one past the last readable byte.
//
// validate_next() moves the iterator it is given forward when it succeeds
// (UTF8CharsetConverter::next relies on exactly that to consume a character).
// Probing with `it` itself would therefore also swallow the first valid
// character that follows the bad bytes, so the probe runs on a scratch copy
// and `it` is only ever stepped one byte at a time.
static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) {
    uint32_t ignoredCodepoint = 0;
    while (it != end) {
        const char* probe = it;
        if (utf8::internal::validate_next(probe, end, ignoredCodepoint) == utf8::internal::UTF8_OK) {
            break;
        }
        ++it;
    }
}
// Returns the shared UTF8CharsetConverter. The object is a function-local
// static, so it is constructed the first time this function is called and the
// same object is returned on every call.
const UTF8CharsetConverter& UTF8CharsetConverter::getInstance() {
    static UTF8CharsetConverter sharedConverter;
    return sharedConverter;
}
// Default constructor; it has an empty body and initializes nothing itself.
UTF8CharsetConverter::UTF8CharsetConverter() {
}
uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
// return utf8::unchecked::next(it);
uint32_t cp = 0;
utf8::internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
if (err_code == utf8::internal::UTF8_OK) {
return cp;
}
else {
cerr << "WARNING: Replacing invalid sequence with replacement char: 0xFFFD" << endl;
iterateThroughInvalidUtf8Sequence(it, end);
return 0xFFFD;
}
}
void UTF8CharsetConverter::append(uint32_t cp, string& result) const {
utf8::unchecked::append(cp, back_inserter(result));
}
static vector<char> reverseArray(const uint32_t* array) {
vector<char> res;
unsigned char c = 0;
do {
uint32_t codepoint = array[c];
res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR);
res[codepoint] = static_cast<char> (c);
c++;
}
while (c != 255);
return res;
}
// Creates a converter for a one-byte charset.
//
// table - byte -> code point table. The pointer itself is stored (no copy of
//         the table is made here) and the reverse code point -> byte lookup
//         is built from it once via reverseArray().
OneByteCharsetConverter::OneByteCharsetConverter(const uint32_t* table)
    : array(table),
      codepoint2Char(reverseArray(table)) {
}
// Reads one byte at `it`, moves `it` one byte forward and returns the code
// point the byte -> code point table assigns to that byte.
//
// TODO: range checking. `end` is not consulted, so `it` must point at a
// readable byte when this is called.
uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const {
    const unsigned char byteValue = static_cast<unsigned char>(*it);
    ++it;
    return this->array[byteValue];
}
// Appends to `result` the single byte that encodes code point `cp`.
// Code points beyond the end of the reverse lookup table are written as
// DEFAULT_UNDEFINED_CHAR.
void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
    const bool insideTable = cp < this->codepoint2Char.size();
    const char encoded = insideTable ? this->codepoint2Char[cp] : DEFAULT_UNDEFINED_CHAR;
    result.push_back(encoded);
}
// Returns the shared ISO8859_2_CharsetConverter. The object is a
// function-local static, constructed the first time this function is called.
const ISO8859_2_CharsetConverter& ISO8859_2_CharsetConverter::getInstance() {
    static ISO8859_2_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Configures the one-byte base converter with the ISO_8859_2_TO_CODEPOINT table.
ISO8859_2_CharsetConverter::ISO8859_2_CharsetConverter()
    : OneByteCharsetConverter(ISO_8859_2_TO_CODEPOINT) {}
// Returns the shared Windows_1250_CharsetConverter. The object is a
// function-local static, constructed the first time this function is called.
const Windows_1250_CharsetConverter& Windows_1250_CharsetConverter::getInstance() {
    static Windows_1250_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Configures the one-byte base converter with the WINDOWS_1250_TO_CODEPOINT table.
Windows_1250_CharsetConverter::Windows_1250_CharsetConverter()
    : OneByteCharsetConverter(WINDOWS_1250_TO_CODEPOINT) {}
// Returns the shared CP852_CharsetConverter. The object is a function-local
// static, constructed the first time this function is called.
const CP852_CharsetConverter& CP852_CharsetConverter::getInstance() {
    static CP852_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Configures the one-byte base converter with the CP852_TO_CODEPOINT table.
CP852_CharsetConverter::CP852_CharsetConverter()
    : OneByteCharsetConverter(CP852_TO_CODEPOINT) {}
string CharsetConverter::fromUTF8(const string& input) const {
string res;
const char* currInput = input.c_str();
const char* inputEnd = input.c_str() + input.length();
while (currInput != inputEnd) {
uint32_t cp = utf8::next(currInput, inputEnd);
this->append(cp, res);
}
return res;
}
string UTF8CharsetConverter::fromUTF8(const string& input) const {
return input;
}