CharsetConverter.cpp
3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#include <vector>
#include <iterator>
#include <algorithm>
#include <inttypes.h>
#include <iostream>
#include "deserialization/endianness.hpp"
#include "utf8.h"
#include "utf8sqlite.hpp"
#include "CharsetConverter.hpp"
#include "conversion_tables.hpp"
using namespace std;
namespace morfeusz {
// Byte used by the one-byte converters in this file for code points that have
// no entry in the reverse (code point -> byte) table: it is the fill value of
// that table and the fallback pushed by OneByteCharsetConverter::append().
const char DEFAULT_UNDEFINED_CHAR = static_cast<char> (0xF7);
// Encodes a sequence of code points into a string in this converter's charset.
//
// codepoints: code points to encode, in order.
// Returns the concatenation of what append() emits for each code point;
// an empty input yields an empty string.
string CharsetConverter::toString(const vector<uint32_t>& codepoints) const {
    string encoded;
    for (vector<uint32_t>::const_iterator cp = codepoints.begin();
            cp != codepoints.end();
            ++cp) {
        this->append(*cp, encoded);
    }
    return encoded;
}
// Out-of-line destructor definition; it performs no work of its own.
CharsetConverter::~CharsetConverter() {}
// Moves `it` forward one byte at a time for as long as
// utf8::internal::validate_next() reports anything other than UTF8_OK at the
// current position. Stops as soon as `it` reaches `end` or validation succeeds.
//
// it:  current read position; updated in place.
// end: one past the last readable byte.
//
// NOTE(review): validate_next() receives `it` itself, so on success it may
// leave `it` past the validated sequence - confirm against the utf8 library.
static inline void iterateThroughInvalidUtf8Sequence(const char*& it, const char* end) {
    uint32_t ignoredCodePoint;  // required by validate_next(); value is never read
    for (;;) {
        if (it == end) {
            return;
        }
        if (utf8::internal::validate_next(it, end, ignoredCodePoint) == utf8::internal::UTF8_OK) {
            return;
        }
        ++it;
    }
}
// Returns a reference to the single UTF8CharsetConverter held in a
// function-local static, which is constructed on the first call.
const UTF8CharsetConverter& UTF8CharsetConverter::getInstance() {
    static UTF8CharsetConverter sharedConverter;
    return sharedConverter;
}
// Default constructor; there is nothing to initialize here.
UTF8CharsetConverter::UTF8CharsetConverter() {
}
// Reads the next code point from UTF-8 input by delegating to readUTF8().
//
// it:  current read position, handed to readUTF8() by reference
//      (presumably advanced past the consumed bytes - confirm in readUTF8).
// end: one past the last readable byte.
// Returns whatever readUTF8() yields for the bytes at `it`.
uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
    // readUTF8() works on unsigned bytes, so both pointers are reinterpreted.
    return readUTF8(
            reinterpret_cast<const unsigned char*&> (it),
            reinterpret_cast<const unsigned char*> (end));
}
// Appends the UTF-8 encoding of `cp` to `result`.
//
// cp:     code point to encode; it goes through the utf8 library's
//         "unchecked" append, so no validation is requested here.
// result: string that receives the encoded bytes at its end.
void UTF8CharsetConverter::append(uint32_t cp, string& result) const {
    std::back_insert_iterator<string> sink = back_inserter(result);
    utf8::unchecked::append(cp, sink);
}
// Builds the reverse lookup table (code point -> byte) of a one-byte charset.
//
// array: byte -> code point table. Every index 0..255 is read, so it must hold
//        256 entries (OneByteCharsetConverter::next() already indexes it with
//        an arbitrary unsigned char value).
// Returns a vector indexed by code point. Its size is the largest code point
// in `array` plus one; slots for code points that no byte maps to hold
// DEFAULT_UNDEFINED_CHAR. If several bytes map to the same code point, the
// highest byte value wins because it is written last.
static vector<char> reverseArray(const uint32_t* array) {
    vector<char> res;
    // The counter is wider than a byte so that the loop can include 255.
    // The previous do/while over an unsigned char stopped as soon as the
    // counter reached 255, so byte 0xFF never got a reverse mapping and
    // could be decoded by next() but not encoded back by append().
    for (unsigned int byteValue = 0; byteValue <= 255; byteValue++) {
        const uint32_t codepoint = array[byteValue];
        // Grow the table on demand; new slots start out as "undefined".
        res.resize(max(static_cast<uint32_t> (res.size()), codepoint + 1), DEFAULT_UNDEFINED_CHAR);
        res[codepoint] = static_cast<char> (byteValue);
    }
    return res;
}
// Sets up a converter for a one-byte charset.
//
// array: byte -> code point table. It is stored in the `array` member and also
//        passed to reverseArray() to build the code point -> byte table.
//        NOTE(review): the table looks to be kept by pointer, not copied, so
//        it must outlive this object - confirm against the class declaration.
OneByteCharsetConverter::OneByteCharsetConverter(const uint32_t* array)
    : array(array)
    , codepoint2Char(reverseArray(array)) {}
// Decodes one byte: returns the code point stored in the byte -> code point
// table for the byte at `it`, then moves `it` forward by one.
//
// it:  current read position; must be dereferenceable.
// end: not consulted here.
// TODO - range checking (`it` is not compared against `end`).
uint32_t OneByteCharsetConverter::next(const char*& it, const char* end) const {
    // Index with the unsigned value so bytes >= 0x80 do not become negative.
    const unsigned char byteValue = static_cast<unsigned char> (*it);
    ++it;
    return this->array[byteValue];
}
// Appends to `result` the single byte that encodes `cp` in this charset.
//
// cp:     code point to encode.
// result: string that receives exactly one byte.
// Code points at or beyond the end of the reverse table produce
// DEFAULT_UNDEFINED_CHAR; code points inside the table produce whatever the
// table stores for them.
void OneByteCharsetConverter::append(uint32_t cp, std::string& result) const {
    const bool insideTable = cp < this->codepoint2Char.size();
    result.push_back(insideTable ? this->codepoint2Char[cp] : DEFAULT_UNDEFINED_CHAR);
}
// Returns a reference to the single ISO8859_2_CharsetConverter held in a
// function-local static, which is constructed on the first call.
const ISO8859_2_CharsetConverter& ISO8859_2_CharsetConverter::getInstance() {
    static ISO8859_2_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Initializes the OneByteCharsetConverter part with the
// ISO_8859_2_TO_CODEPOINT byte -> code point table.
ISO8859_2_CharsetConverter::ISO8859_2_CharsetConverter()
    : OneByteCharsetConverter(ISO_8859_2_TO_CODEPOINT) {}
// Returns a reference to the single Windows_1250_CharsetConverter held in a
// function-local static, which is constructed on the first call.
const Windows_1250_CharsetConverter& Windows_1250_CharsetConverter::getInstance() {
    static Windows_1250_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Initializes the OneByteCharsetConverter part with the
// WINDOWS_1250_TO_CODEPOINT byte -> code point table.
Windows_1250_CharsetConverter::Windows_1250_CharsetConverter()
    : OneByteCharsetConverter(WINDOWS_1250_TO_CODEPOINT) {}
// Returns a reference to the single CP852_CharsetConverter held in a
// function-local static, which is constructed on the first call.
const CP852_CharsetConverter& CP852_CharsetConverter::getInstance() {
    static CP852_CharsetConverter sharedConverter;
    return sharedConverter;
}
// Initializes the OneByteCharsetConverter part with the
// CP852_TO_CODEPOINT byte -> code point table.
CP852_CharsetConverter::CP852_CharsetConverter()
    : OneByteCharsetConverter(CP852_TO_CODEPOINT) {}
// Re-encodes a UTF-8 string into this converter's charset.
//
// input: text to convert; it is decoded one code point at a time with
//        utf8::next().
//        NOTE(review): utf8::next() is the utf8 library's checked decoder and
//        presumably reports malformed input by throwing - confirm.
// Returns the concatenation of what append() emits for each decoded code
// point; an empty input yields an empty string.
string CharsetConverter::fromUTF8(const string& input) const {
    const char* cursor = input.c_str();
    const char* limit = cursor + input.length();
    string converted;
    while (cursor != limit) {
        this->append(utf8::next(cursor, limit), converted);
    }
    return converted;
}
// Re-encodes a string from this converter's charset into UTF-8.
//
// input: text in this converter's charset; it is decoded one code point at a
//        time with next().
// Returns the UTF-8 bytes that UTF8CharsetConverter::append() emits for each
// decoded code point; an empty input yields an empty string.
string CharsetConverter::toUTF8(const string& input) const {
    string utf8Text;
    const char* limit = input.c_str() + input.length();
    for (const char* cursor = input.c_str(); cursor != limit;) {
        const uint32_t decoded = this->next(cursor, limit);
        UTF8CharsetConverter::getInstance().append(decoded, utf8Text);
    }
    return utf8Text;
}
// UTF-8 to UTF-8 needs no conversion: returns a copy of `input` without
// decoding or inspecting it.
string UTF8CharsetConverter::fromUTF8(const string& input) const {
    string passthrough(input);
    return passthrough;
}
// UTF-8 to UTF-8 needs no conversion: returns a copy of `input` without
// decoding or inspecting it.
string UTF8CharsetConverter::toUTF8(const string& input) const {
    string passthrough(input);
    return passthrough;
}
}