Commit f23aead2806888d01857b660722db3c3e372201e
1 parent
58aafafe
- dalsza praca nad klasą Morfeusz
- dodanie konwersji kodowań znaków (na razie tylko głupi szkielet, który obsługuje UTF-8) git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@19 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
18 changed files
with
1171 additions
and
48 deletions
fsa/fsa.hpp
... | ... | @@ -15,6 +15,7 @@ |
15 | 15 | #include <exception> |
16 | 16 | #include <string> |
17 | 17 | #include <vector> |
18 | +#include <netinet/in.h> | |
18 | 19 | |
19 | 20 | template <class T> class State; |
20 | 21 | template <class T> class FSA; |
... | ... | @@ -83,6 +84,11 @@ public: |
83 | 84 | */ |
84 | 85 | static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); |
85 | 86 | |
87 | + /** | |
88 | + * Create an FSA object from file | |
89 | + */ | |
90 | + static FSA<T>* getFSA(const std::string& filename, const Deserializer<T>& deserializer); | |
91 | + | |
86 | 92 | protected: |
87 | 93 | |
88 | 94 | /** |
... | ... |
fsa/fsa_impl.hpp
... | ... | @@ -13,6 +13,7 @@ |
13 | 13 | #include <utility> |
14 | 14 | #include <iostream> |
15 | 15 | #include <vector> |
16 | +#include <string> | |
16 | 17 | #include <netinet/in.h> |
17 | 18 | #include "utils.hpp" |
18 | 19 | #include "const.hpp" |
... | ... | @@ -55,6 +56,11 @@ State<T> FSA<T>::getInitialState() const { |
55 | 56 | } |
56 | 57 | |
57 | 58 | template <class T> |
59 | +FSA<T>* FSA<T>::getFSA(const std::string& filename, const Deserializer<T>& deserializer) { | |
60 | + return getFSA(readFile(filename.c_str()), deserializer); | |
61 | +} | |
62 | + | |
63 | +template <class T> | |
58 | 64 | FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) { |
59 | 65 | |
60 | 66 | uint32_t magicNumber = ntohl(*((uint32_t*) ptr)); |
... | ... | @@ -64,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial |
64 | 70 | |
65 | 71 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); |
66 | 72 | if (versionNum != VERSION_NUM) { |
67 | - throw FSAException(string("Invalid version number: ") + to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); | |
73 | + throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); | |
68 | 74 | } |
69 | 75 | |
70 | 76 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -7,9 +7,11 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) |
7 | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
8 | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) | |
10 | 11 | |
11 | 12 | # Link the executable to the Hello library. |
12 | 13 | target_link_libraries (morfeusz2_analyze morfeusz2) |
13 | 14 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) |
14 | 15 | |
15 | 16 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
17 | +set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | |
16 | 18 | \ No newline at end of file |
... | ... |
morfeusz/EncodedInterpretation.hpp
morfeusz/Morfeusz.cpp
1 | 1 | /* |
2 | 2 | * File: Morfeusz.cpp |
3 | - * Author: lennyn | |
3 | + * Author: mlenart | |
4 | 4 | * |
5 | 5 | * Created on November 13, 2013, 5:21 PM |
6 | 6 | */ |
7 | 7 | |
8 | +#include <string> | |
9 | +#include "utils.hpp" | |
8 | 10 | #include "Morfeusz.hpp" |
11 | +#include "MorphDeserializer.hpp" | |
12 | +#include "encoding/CharsetConverter.hpp" | |
9 | 13 | |
10 | -Morfeusz::Morfeusz() { | |
14 | +using namespace std; | |
15 | + | |
16 | +static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) { | |
17 | + static Deserializer<vector<EncodedInterpretation>>* deserializer | |
18 | + = new MorphDeserializer(); | |
19 | + return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); | |
20 | +} | |
21 | + | |
22 | +static CharsetConverter* initializeCharsetConverter() { | |
23 | + static CharsetConverter* converter = new UTF8CharsetConverter(); | |
24 | + return converter; | |
11 | 25 | } |
12 | 26 | |
13 | -Morfeusz::Morfeusz(const Morfeusz& orig) { | |
27 | +Morfeusz::Morfeusz(const string& filename) | |
28 | +: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { | |
29 | + | |
14 | 30 | } |
15 | 31 | |
32 | +//Morfeusz::Morfeusz(const Morfeusz& orig) { | |
33 | +//} | |
34 | + | |
16 | 35 | Morfeusz::~Morfeusz() { |
36 | + delete &this->fsa; | |
37 | +} | |
38 | + | |
39 | +AnalyzeResult Morfeusz::analyze(const std::string& text) { | |
40 | + const char* textStart = text.c_str(); | |
41 | + const char* textEnd = text.c_str() + text.length(); | |
42 | + AnalyzeResult res = { | |
43 | + ResultsIterator(textStart, textEnd, *this), | |
44 | + ResultsIterator(textEnd, textEnd, *this)}; | |
45 | + return res; | |
17 | 46 | } |
18 | 47 | |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -9,8 +9,11 @@ |
9 | 9 | #define MORFEUSZ_HPP |
10 | 10 | |
11 | 11 | #include <string> |
12 | +#include <vector> | |
13 | +#include "EncodedInterpretation.hpp" | |
14 | +#include "fsa.hpp" | |
12 | 15 | #include "MorphInterpretation.hpp" |
13 | -//#include "interpretations.hpp" | |
16 | +#include "encoding/CharsetConverter.hpp" | |
14 | 17 | |
15 | 18 | class Morfeusz; |
16 | 19 | class AnalyzeResult; |
... | ... | @@ -18,22 +21,21 @@ class ResultsIterator; |
18 | 21 | |
19 | 22 | class Morfeusz { |
20 | 23 | public: |
21 | - | |
24 | + explicit Morfeusz(const std::string& filename); | |
25 | + virtual ~Morfeusz(); | |
26 | +// Morfeusz(const Morfeusz& orig); | |
22 | 27 | AnalyzeResult analyze(const std::string& text); |
23 | 28 | |
24 | - Morfeusz(); | |
25 | - Morfeusz(const Morfeusz& orig); | |
26 | - virtual ~Morfeusz(); | |
29 | +// Morfeusz(); | |
27 | 30 | private: |
28 | - void processOneWord(const char*& inputData, int startNodeNum, vector<MorphInterpretation>& resInterps); | |
31 | + void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps); | |
32 | + const FSA<std::vector<EncodedInterpretation>>* fsa; | |
33 | + CharsetConverter* charsetConverter; | |
29 | 34 | }; |
30 | 35 | |
31 | 36 | class ResultsIterator { |
32 | 37 | public: |
33 | 38 | ResultsIterator( |
34 | - const std::string& text, | |
35 | - const Morfeusz& morfeusz); | |
36 | - ResultsIterator( | |
37 | 39 | const char* startOfInput, |
38 | 40 | const char* endOfInput, |
39 | 41 | const Morfeusz& morfeusz); |
... | ... |
morfeusz/MorphInterpretation.cpp
... | ... | @@ -42,6 +42,14 @@ MorphInterpretation::MorphInterpretation( |
42 | 42 | MorphInterpretation::~MorphInterpretation() { |
43 | 43 | } |
44 | 44 | |
45 | +int MorphInterpretation::getStartNode() const { | |
46 | + return this->startNode; | |
47 | +} | |
48 | + | |
49 | +int MorphInterpretation::getEndNode() const { | |
50 | + return this->endNode; | |
51 | +} | |
52 | + | |
45 | 53 | const std::string& MorphInterpretation::getOrth() const { |
46 | 54 | return this->orth; |
47 | 55 | } |
... | ... |
morfeusz/MorphInterpretation.hpp
... | ... | @@ -21,6 +21,8 @@ public: |
21 | 21 | const EncodedInterpretation& encodedInterp, |
22 | 22 | const Tagset& tagset); |
23 | 23 | virtual ~MorphInterpretation(); |
24 | + int getStartNode() const; | |
25 | + int getEndNode() const; | |
24 | 26 | const std::string& getOrth() const; |
25 | 27 | const std::string& getLemma() const; |
26 | 28 | int getTagnum() const; |
... | ... |
morfeusz/encoding/CharsetConverter.cpp
0 → 100644
1 | +/* | |
2 | + * File: CharsetConverter.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 14 listopad 2013, 17:28 | |
6 | + */ | |
7 | + | |
8 | +#include "utf8.h" | |
9 | +#include "CharsetConverter.hpp" | |
10 | + | |
11 | +uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { | |
12 | + return utf8::next(it, end); | |
13 | +} | |
14 | +const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { | |
15 | + return utf8::append(cp, result); | |
16 | +} | |
... | ... |
morfeusz/encoding/CharsetConverter.hpp
0 → 100644
1 | +/* | |
2 | + * File: CharsetConverter.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 14 listopad 2013, 17:28 | |
6 | + */ | |
7 | + | |
8 | +#ifndef ENCODINGCONVERTER_HPP | |
9 | +#define ENCODINGCONVERTER_HPP | |
10 | + | |
/**
 * Abstract interface for reading and writing code points in some character
 * encoding.
 */
class CharsetConverter {
public:
    // Converters are handled through CharsetConverter*, so destruction via a
    // base pointer must reach the derived destructor.
    virtual ~CharsetConverter() {}

    /** Reads one code point from [it, end); `it` is passed by reference. */
    virtual uint32_t next(const char*& it, const char* end) const = 0;

    /** Appends the encoding of `cp` at `result`; returns a position pointer. */
    virtual const char* append(uint32_t cp, const char* result) const = 0;
private:
};
17 | + | |
18 | +class UTF8CharsetConverter: public CharsetConverter { | |
19 | +public: | |
20 | + uint32_t next(const char*& it, const char* end) const; | |
21 | + const char* append(uint32_t cp, const char* result) const; | |
22 | +private: | |
23 | +}; | |
24 | + | |
25 | +class UTF16CharsetConverter: public CharsetConverter { | |
26 | +public: | |
27 | + uint32_t next(const char*& it, const char* end) const; | |
28 | + const char* append(uint32_t cp, const char* result) const; | |
29 | +private: | |
30 | +}; | |
31 | + | |
32 | +class UTF32CharsetConverter: public CharsetConverter { | |
33 | +public: | |
34 | + uint32_t next(const char*& it, const char* end) const; | |
35 | + const char* append(uint32_t cp, const char* result) const; | |
36 | +private: | |
37 | +}; | |
38 | + | |
39 | +class ISO8859_2_CharsetConverter: public CharsetConverter { | |
40 | +public: | |
41 | + uint32_t next(const char*& it, const char* end) const; | |
42 | + const char* append(uint32_t cp, const char* result) const; | |
43 | +private: | |
44 | +}; | |
45 | + | |
46 | +#endif /* ENCODINGCONVERTER_HPP */ | |
47 | + | |
... | ... |
morfeusz/encoding/utf8.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | |
2 | + | |
3 | +/* | |
4 | +Permission is hereby granted, free of charge, to any person or organization | |
5 | +obtaining a copy of the software and accompanying documentation covered by | |
6 | +this license (the "Software") to use, reproduce, display, distribute, | |
7 | +execute, and transmit the Software, and to prepare derivative works of the | |
8 | +Software, and to permit third-parties to whom the Software is furnished to | |
9 | +do so, all subject to the following: | |
10 | + | |
11 | +The copyright notices in the Software and this entire statement, including | |
12 | +the above license grant, this restriction and the following disclaimer, | |
13 | +must be included in all copies of the Software, in whole or in part, and | |
14 | +all derivative works of the Software, unless such copies or derivative | |
15 | +works are solely in the form of machine-executable object code generated by | |
16 | +a source language processor. | |
17 | + | |
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | |
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | |
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | |
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 | +DEALINGS IN THE SOFTWARE. | |
25 | +*/ | |
26 | + | |
27 | + | |
28 | +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
29 | +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
30 | + | |
31 | +#include "utf8/checked.h" | |
32 | +#include "utf8/unchecked.h" | |
33 | + | |
34 | +#endif // header guard | |
... | ... |
morfeusz/encoding/utf8/checked.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | |
2 | + | |
3 | +/* | |
4 | +Permission is hereby granted, free of charge, to any person or organization | |
5 | +obtaining a copy of the software and accompanying documentation covered by | |
6 | +this license (the "Software") to use, reproduce, display, distribute, | |
7 | +execute, and transmit the Software, and to prepare derivative works of the | |
8 | +Software, and to permit third-parties to whom the Software is furnished to | |
9 | +do so, all subject to the following: | |
10 | + | |
11 | +The copyright notices in the Software and this entire statement, including | |
12 | +the above license grant, this restriction and the following disclaimer, | |
13 | +must be included in all copies of the Software, in whole or in part, and | |
14 | +all derivative works of the Software, unless such copies or derivative | |
15 | +works are solely in the form of machine-executable object code generated by | |
16 | +a source language processor. | |
17 | + | |
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | |
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | |
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | |
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 | +DEALINGS IN THE SOFTWARE. | |
25 | +*/ | |
26 | + | |
27 | + | |
28 | +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
29 | +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
30 | + | |
31 | +#include "core.h" | |
32 | +#include <stdexcept> | |
33 | + | |
// Checked (exception-throwing) part of the UTF-8 library: exception types,
// encode/decode helpers, UTF-16/UTF-32 conversions and a bidirectional
// iterator adaptor. Validation itself is delegated to utf8::internal.
namespace utf8
{
    // Base for the exceptions that may be thrown from the library
    class exception : public ::std::exception {
    };

    // Exceptions that may be thrown from the library functions.

    // Thrown for a code point rejected by validation; carries that value.
    class invalid_code_point : public exception {
        uint32_t cp;
    public:
        invalid_code_point(uint32_t cp) : cp(cp) {}
        virtual const char* what() const throw() { return "Invalid code point"; }
        uint32_t code_point() const {return cp;}
    };

    // Thrown for malformed UTF-8 input; carries one octet of the input.
    class invalid_utf8 : public exception {
        uint8_t u8;
    public:
        invalid_utf8 (uint8_t u) : u8(u) {}
        virtual const char* what() const throw() { return "Invalid UTF-8"; }
        uint8_t utf8_octet() const {return u8;}
    };

    // Thrown for malformed UTF-16 input; carries the offending 16-bit unit.
    class invalid_utf16 : public exception {
        uint16_t u16;
    public:
        invalid_utf16 (uint16_t u) : u16(u) {}
        virtual const char* what() const throw() { return "Invalid UTF-16"; }
        uint16_t utf16_word() const {return u16;}
    };

    // Thrown when the input range ends before a complete sequence is read.
    class not_enough_room : public exception {
    public:
        virtual const char* what() const throw() { return "Not enough space"; }
    };

    /// The library API - functions intended to be called by the users

    // Writes the UTF-8 encoding of cp (one to four octets) through result and
    // returns the iterator past the last octet written.
    // Throws invalid_code_point when is_code_point_valid(cp) is false.
    template <typename octet_iterator>
    octet_iterator append(uint32_t cp, octet_iterator result)
    {
        if (!utf8::internal::is_code_point_valid(cp))
            throw invalid_code_point(cp);

        if (cp < 0x80)                        // one octet
            *(result++) = static_cast<uint8_t>(cp);
        else if (cp < 0x800) {                // two octets
            *(result++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
        }
        else if (cp < 0x10000) {              // three octets
            *(result++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
        }
        else {                                // four octets
            *(result++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
            *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
            *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
            *(result++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
        }
        return result;
    }

    // Copies [start, end) to out, substituting the encoding of `replacement`
    // for every sequence that validate_next() rejects. Returns the advanced
    // output iterator; throws not_enough_room on NOT_ENOUGH_ROOM.
    template <typename octet_iterator, typename output_iterator>
    output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
    {
        while (start != end) {
            octet_iterator sequence_start = start;
            internal::utf_error err_code = utf8::internal::validate_next(start, end);
            switch (err_code) {
                case internal::UTF8_OK :
                    // valid sequence: copy its octets unchanged
                    for (octet_iterator it = sequence_start; it != start; ++it)
                        *out++ = *it;
                    break;
                case internal::NOT_ENOUGH_ROOM:
                    throw not_enough_room();
                case internal::INVALID_LEAD:
                    out = utf8::append (replacement, out);
                    ++start;
                    break;
                case internal::INCOMPLETE_SEQUENCE:
                case internal::OVERLONG_SEQUENCE:
                case internal::INVALID_CODE_POINT:
                    out = utf8::append (replacement, out);
                    ++start;
                    // just one replacement mark for the sequence
                    while (start != end && utf8::internal::is_trail(*start))
                        ++start;
                    break;
            }
        }
        return out;
    }

    // Same as above, using the code point 0xfffd as the replacement marker.
    template <typename octet_iterator, typename output_iterator>
    inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
    {
        static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
        return utf8::replace_invalid(start, end, out, replacement_marker);
    }

    // Decodes one code point from [it, end) through validate_next() and
    // returns it; `it` is passed on by reference (presumably left past the
    // decoded sequence on success — validate_next is defined in core.h).
    // Throws not_enough_room, invalid_utf8 or invalid_code_point according to
    // the error code reported.
    template <typename octet_iterator>
    uint32_t next(octet_iterator& it, octet_iterator end)
    {
        uint32_t cp = 0;
        internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
        switch (err_code) {
            case internal::UTF8_OK :
                break;
            case internal::NOT_ENOUGH_ROOM :
                throw not_enough_room();
            case internal::INVALID_LEAD :
            case internal::INCOMPLETE_SEQUENCE :
            case internal::OVERLONG_SEQUENCE :
                throw invalid_utf8(*it);
            case internal::INVALID_CODE_POINT :
                throw invalid_code_point(cp);
        }
        return cp;
    }

    // Like next(), but works on a copy of `it`, so the caller's iterator is
    // left untouched.
    template <typename octet_iterator>
    uint32_t peek_next(octet_iterator it, octet_iterator end)
    {
        return utf8::next(it, end);
    }

    // Moves `it` back to the first non-trail octet before its current
    // position and returns the code point decoded from there up to the old
    // position. Throws not_enough_room when it == start and invalid_utf8 when
    // `start` is reached while still on a trail octet.
    template <typename octet_iterator>
    uint32_t prior(octet_iterator& it, octet_iterator start)
    {
        // can't do much if it == start
        if (it == start)
            throw not_enough_room();

        octet_iterator end = it;
        // Go back until we hit either a lead octet or start
        while (utf8::internal::is_trail(*(--it)))
            if (it == start)
                throw invalid_utf8(*it); // error - no lead byte in the sequence
        return utf8::peek_next(it, end);
    }

    /// Deprecated in versions that include "prior"
    // Same backwards scan as prior(), but the stop position is `pass_start`
    // and there is no initial it == start check.
    template <typename octet_iterator>
    uint32_t previous(octet_iterator& it, octet_iterator pass_start)
    {
        octet_iterator end = it;
        while (utf8::internal::is_trail(*(--it)))
            if (it == pass_start)
                throw invalid_utf8(*it); // error - no lead byte in the sequence
        octet_iterator temp = it;
        return utf8::next(temp, end);
    }

    // Calls next(it, end) n times; any exception from next() propagates.
    template <typename octet_iterator, typename distance_type>
    void advance (octet_iterator& it, distance_type n, octet_iterator end)
    {
        for (distance_type i = 0; i < n; ++i)
            utf8::next(it, end);
    }

    // Counts the code points in [first, last) by decoding them with next().
    template <typename octet_iterator>
    typename std::iterator_traits<octet_iterator>::difference_type
    distance (octet_iterator first, octet_iterator last)
    {
        typename std::iterator_traits<octet_iterator>::difference_type dist;
        for (dist = 0; first < last; ++dist)
            utf8::next(first, last);
        return dist;
    }

    // Converts the UTF-16 units in [start, end) to UTF-8 written through
    // result. Throws invalid_utf16 for a lead surrogate that is not followed
    // by a trail surrogate and for a lone trail surrogate.
    template <typename u16bit_iterator, typename octet_iterator>
    octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
    {
        while (start != end) {
            uint32_t cp = utf8::internal::mask16(*start++);
            // Take care of surrogate pairs first
            if (utf8::internal::is_lead_surrogate(cp)) {
                if (start != end) {
                    uint32_t trail_surrogate = utf8::internal::mask16(*start++);
                    if (utf8::internal::is_trail_surrogate(trail_surrogate))
                        cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
                    else
                        throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
                }
                else
                    throw invalid_utf16(static_cast<uint16_t>(cp));

            }
            // Lone trail surrogate
            else if (utf8::internal::is_trail_surrogate(cp))
                throw invalid_utf16(static_cast<uint16_t>(cp));

            result = utf8::append(cp, result);
        }
        return result;
    }

    // Converts the UTF-8 octets in [start, end) to UTF-16 units written
    // through result; code points above 0xffff become a surrogate pair.
    template <typename u16bit_iterator, typename octet_iterator>
    u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
    {
        while (start != end) {
            uint32_t cp = utf8::next(start, end);
            if (cp > 0xffff) { //make a surrogate pair
                *result++ = static_cast<uint16_t>((cp >> 10)   + internal::LEAD_OFFSET);
                *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
            }
            else
                *result++ = static_cast<uint16_t>(cp);
        }
        return result;
    }

    // Encodes every code point in [start, end) with append().
    template <typename octet_iterator, typename u32bit_iterator>
    octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
    {
        while (start != end)
            result = utf8::append(*(start++), result);

        return result;
    }

    // Decodes every code point in [start, end) with next() into result.
    template <typename octet_iterator, typename u32bit_iterator>
    u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
    {
        while (start != end)
            (*result++) = utf8::next(start, end);

        return result;
    }

    // The iterator class
    // Wraps an octet iterator together with the bounds of its range and
    // exposes the sequence as code points (dereference decodes via next()).
    template <typename octet_iterator>
    class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
      octet_iterator it;
      octet_iterator range_start;
      octet_iterator range_end;
      public:
      iterator () {}
      // Throws std::out_of_range when octet_it lies outside
      // [range_start, range_end].
      explicit iterator (const octet_iterator& octet_it,
                         const octet_iterator& range_start,
                         const octet_iterator& range_end) :
               it(octet_it), range_start(range_start), range_end(range_end)
      {
          if (it < range_start || it > range_end)
              throw std::out_of_range("Invalid utf-8 iterator position");
      }
      // the default "big three" are OK
      octet_iterator base () const { return it; }
      uint32_t operator * () const
      {
          // decode from a copy so dereferencing does not move the iterator
          octet_iterator temp = it;
          return utf8::next(temp, range_end);
      }
      // Comparing iterators created over different ranges throws
      // std::logic_error.
      bool operator == (const iterator& rhs) const
      {
          if (range_start != rhs.range_start || range_end != rhs.range_end)
              throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
          return (it == rhs.it);
      }
      bool operator != (const iterator& rhs) const
      {
          return !(operator == (rhs));
      }
      iterator& operator ++ ()
      {
          utf8::next(it, range_end);
          return *this;
      }
      iterator operator ++ (int)
      {
          iterator temp = *this;
          utf8::next(it, range_end);
          return temp;
      }
      iterator& operator -- ()
      {
          utf8::prior(it, range_start);
          return *this;
      }
      iterator operator -- (int)
      {
          iterator temp = *this;
          utf8::prior(it, range_start);
          return temp;
      }
    }; // class iterator

} // namespace utf8
324 | + | |
325 | +#endif //header guard | |
326 | + | |
327 | + | |
... | ... |
morfeusz/encoding/utf8/core.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | |
2 | + | |
3 | +/* | |
4 | +Permission is hereby granted, free of charge, to any person or organization | |
5 | +obtaining a copy of the software and accompanying documentation covered by | |
6 | +this license (the "Software") to use, reproduce, display, distribute, | |
7 | +execute, and transmit the Software, and to prepare derivative works of the | |
8 | +Software, and to permit third-parties to whom the Software is furnished to | |
9 | +do so, all subject to the following: | |
10 | + | |
11 | +The copyright notices in the Software and this entire statement, including | |
12 | +the above license grant, this restriction and the following disclaimer, | |
13 | +must be included in all copies of the Software, in whole or in part, and | |
14 | +all derivative works of the Software, unless such copies or derivative | |
15 | +works are solely in the form of machine-executable object code generated by | |
16 | +a source language processor. | |
17 | + | |
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | |
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | |
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | |
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 | +DEALINGS IN THE SOFTWARE. | |
25 | +*/ | |
26 | + | |
27 | + | |
28 | +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
29 | +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
30 | + | |
31 | +#include <iterator> | |
32 | +#include <cstdint> | |
33 | + | |
34 | +namespace utf8 | |
35 | +{ | |
36 | + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers | |
37 | + // You may need to change them to match your system. | |
38 | + // These typedefs have the same names as ones from cstdint, or boost/cstdint | |
39 | +// typedef unsigned char uint8_t; | |
40 | +// typedef unsigned short uint16_t; | |
41 | +// typedef unsigned int uint32_t; | |
42 | + | |
43 | +// Helper code - not intended to be directly called by the library users. May be changed at any time | |
44 | +namespace internal | |
45 | +{ | |
46 | + // Unicode constants | |
47 | + // Leading (high) surrogates: 0xd800 - 0xdbff | |
48 | + // Trailing (low) surrogates: 0xdc00 - 0xdfff | |
49 | + const uint16_t LEAD_SURROGATE_MIN = 0xd800u; | |
50 | + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu; | |
51 | + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u; | |
52 | + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu; | |
53 | + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10); | |
54 | + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN; | |
55 | + | |
56 | + // Maximum valid value for a Unicode code point | |
57 | + const uint32_t CODE_POINT_MAX = 0x0010ffffu; | |
58 | + | |
59 | + template<typename octet_type> | |
60 | + inline uint8_t mask8(octet_type oc) | |
61 | + { | |
62 | + return static_cast<uint8_t>(0xff & oc); | |
63 | + } | |
64 | + template<typename u16_type> | |
65 | + inline uint16_t mask16(u16_type oc) | |
66 | + { | |
67 | + return static_cast<uint16_t>(0xffff & oc); | |
68 | + } | |
69 | + template<typename octet_type> | |
70 | + inline bool is_trail(octet_type oc) | |
71 | + { | |
72 | + return ((utf8::internal::mask8(oc) >> 6) == 0x2); | |
73 | + } | |
74 | + | |
75 | + template <typename u16> | |
76 | + inline bool is_lead_surrogate(u16 cp) | |
77 | + { | |
78 | + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX); | |
79 | + } | |
80 | + | |
81 | + template <typename u16> | |
82 | + inline bool is_trail_surrogate(u16 cp) | |
83 | + { | |
84 | + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); | |
85 | + } | |
86 | + | |
87 | + template <typename u16> | |
88 | + inline bool is_surrogate(u16 cp) | |
89 | + { | |
90 | + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX); | |
91 | + } | |
92 | + | |
93 | + template <typename u32> | |
94 | + inline bool is_code_point_valid(u32 cp) | |
95 | + { | |
96 | + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp)); | |
97 | + } | |
98 | + | |
99 | + template <typename octet_iterator> | |
100 | + inline typename std::iterator_traits<octet_iterator>::difference_type | |
101 | + sequence_length(octet_iterator lead_it) | |
102 | + { | |
103 | + uint8_t lead = utf8::internal::mask8(*lead_it); | |
104 | + if (lead < 0x80) | |
105 | + return 1; | |
106 | + else if ((lead >> 5) == 0x6) | |
107 | + return 2; | |
108 | + else if ((lead >> 4) == 0xe) | |
109 | + return 3; | |
110 | + else if ((lead >> 3) == 0x1e) | |
111 | + return 4; | |
112 | + else | |
113 | + return 0; | |
114 | + } | |
115 | + | |
116 | + template <typename octet_difference_type> | |
117 | + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length) | |
118 | + { | |
119 | + if (cp < 0x80) { | |
120 | + if (length != 1) | |
121 | + return true; | |
122 | + } | |
123 | + else if (cp < 0x800) { | |
124 | + if (length != 2) | |
125 | + return true; | |
126 | + } | |
127 | + else if (cp < 0x10000) { | |
128 | + if (length != 3) | |
129 | + return true; | |
130 | + } | |
131 | + | |
132 | + return false; | |
133 | + } | |
134 | + | |
135 | + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT}; | |
136 | + | |
137 | + /// Helper for get_sequence_x | |
138 | + template <typename octet_iterator> | |
139 | + utf_error increase_safely(octet_iterator& it, octet_iterator end) | |
140 | + { | |
141 | + if (++it == end) | |
142 | + return NOT_ENOUGH_ROOM; | |
143 | + | |
144 | + if (!utf8::internal::is_trail(*it)) | |
145 | + return INCOMPLETE_SEQUENCE; | |
146 | + | |
147 | + return UTF8_OK; | |
148 | + } | |
149 | + | |
150 | + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;} | |
151 | + | |
152 | + /// get_sequence_x functions decode utf-8 sequences of the length x | |
153 | + template <typename octet_iterator> | |
154 | + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point) | |
155 | + { | |
156 | + if (it == end) | |
157 | + return NOT_ENOUGH_ROOM; | |
158 | + | |
159 | + code_point = utf8::internal::mask8(*it); | |
160 | + | |
161 | + return UTF8_OK; | |
162 | + } | |
163 | + | |
164 | + template <typename octet_iterator> | |
165 | + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point) | |
166 | + { | |
167 | + if (it == end) | |
168 | + return NOT_ENOUGH_ROOM; | |
169 | + | |
170 | + code_point = utf8::internal::mask8(*it); | |
171 | + | |
172 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
173 | + | |
174 | + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f); | |
175 | + | |
176 | + return UTF8_OK; | |
177 | + } | |
178 | + | |
179 | + template <typename octet_iterator> | |
180 | + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point) | |
181 | + { | |
182 | + if (it == end) | |
183 | + return NOT_ENOUGH_ROOM; | |
184 | + | |
185 | + code_point = utf8::internal::mask8(*it); | |
186 | + | |
187 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
188 | + | |
189 | + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | |
190 | + | |
191 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
192 | + | |
193 | + code_point += (*it) & 0x3f; | |
194 | + | |
195 | + return UTF8_OK; | |
196 | + } | |
197 | + | |
198 | + template <typename octet_iterator> | |
199 | + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point) | |
200 | + { | |
201 | + if (it == end) | |
202 | + return NOT_ENOUGH_ROOM; | |
203 | + | |
204 | + code_point = utf8::internal::mask8(*it); | |
205 | + | |
206 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
207 | + | |
208 | + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | |
209 | + | |
210 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
211 | + | |
212 | + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff; | |
213 | + | |
214 | + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end) | |
215 | + | |
216 | + code_point += (*it) & 0x3f; | |
217 | + | |
218 | + return UTF8_OK; | |
219 | + } | |
220 | + | |
221 | + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR | |
222 | + | |
223 | + template <typename octet_iterator> | |
224 | + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point) | |
225 | + { | |
226 | + // Save the original value of it so we can go back in case of failure | |
227 | + // Of course, it does not make much sense with i.e. stream iterators | |
228 | + octet_iterator original_it = it; | |
229 | + | |
230 | + uint32_t cp = 0; | |
231 | + // Determine the sequence length based on the lead octet | |
232 | + typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type; | |
233 | + const octet_difference_type length = utf8::internal::sequence_length(it); | |
234 | + | |
235 | + // Get trail octets and calculate the code point | |
236 | + utf_error err = UTF8_OK; | |
237 | + switch (length) { | |
238 | + case 0: | |
239 | + return INVALID_LEAD; | |
240 | + case 1: | |
241 | + err = utf8::internal::get_sequence_1(it, end, cp); | |
242 | + break; | |
243 | + case 2: | |
244 | + err = utf8::internal::get_sequence_2(it, end, cp); | |
245 | + break; | |
246 | + case 3: | |
247 | + err = utf8::internal::get_sequence_3(it, end, cp); | |
248 | + break; | |
249 | + case 4: | |
250 | + err = utf8::internal::get_sequence_4(it, end, cp); | |
251 | + break; | |
252 | + } | |
253 | + | |
254 | + if (err == UTF8_OK) { | |
255 | + // Decoding succeeded. Now, security checks... | |
256 | + if (utf8::internal::is_code_point_valid(cp)) { | |
257 | + if (!utf8::internal::is_overlong_sequence(cp, length)){ | |
258 | + // Passed! Return here. | |
259 | + code_point = cp; | |
260 | + ++it; | |
261 | + return UTF8_OK; | |
262 | + } | |
263 | + else | |
264 | + err = OVERLONG_SEQUENCE; | |
265 | + } | |
266 | + else | |
267 | + err = INVALID_CODE_POINT; | |
268 | + } | |
269 | + | |
270 | + // Failure branch - restore the original value of the iterator | |
271 | + it = original_it; | |
272 | + return err; | |
273 | + } | |
274 | + | |
275 | + template <typename octet_iterator> | |
276 | + inline utf_error validate_next(octet_iterator& it, octet_iterator end) { | |
277 | + uint32_t ignored; | |
278 | + return utf8::internal::validate_next(it, end, ignored); | |
279 | + } | |
280 | + | |
281 | +} // namespace internal | |
282 | + | |
283 | + /// The library API - functions intended to be called by the users | |
284 | + | |
285 | + // Byte order mark | |
286 | + const uint8_t bom[] = {0xef, 0xbb, 0xbf}; | |
287 | + | |
288 | + template <typename octet_iterator> | |
289 | + octet_iterator find_invalid(octet_iterator start, octet_iterator end) | |
290 | + { | |
291 | + octet_iterator result = start; | |
292 | + while (result != end) { | |
293 | + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end); | |
294 | + if (err_code != internal::UTF8_OK) | |
295 | + return result; | |
296 | + } | |
297 | + return result; | |
298 | + } | |
299 | + | |
300 | + template <typename octet_iterator> | |
301 | + inline bool is_valid(octet_iterator start, octet_iterator end) | |
302 | + { | |
303 | + return (utf8::find_invalid(start, end) == end); | |
304 | + } | |
305 | + | |
306 | + template <typename octet_iterator> | |
307 | + inline bool starts_with_bom (octet_iterator it, octet_iterator end) | |
308 | + { | |
309 | + return ( | |
310 | + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) && | |
311 | + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) && | |
312 | + ((it != end) && (utf8::internal::mask8(*it)) == bom[2]) | |
313 | + ); | |
314 | + } | |
315 | + | |
316 | + //Deprecated in release 2.3 | |
317 | + template <typename octet_iterator> | |
318 | + inline bool is_bom (octet_iterator it) | |
319 | + { | |
320 | + return ( | |
321 | + (utf8::internal::mask8(*it++)) == bom[0] && | |
322 | + (utf8::internal::mask8(*it++)) == bom[1] && | |
323 | + (utf8::internal::mask8(*it)) == bom[2] | |
324 | + ); | |
325 | + } | |
326 | +} // namespace utf8 | |
327 | + | |
328 | +#endif // header guard | |
329 | + | |
330 | + | |
... | ... |
morfeusz/encoding/utf8/unchecked.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | |
2 | + | |
3 | +/* | |
4 | +Permission is hereby granted, free of charge, to any person or organization | |
5 | +obtaining a copy of the software and accompanying documentation covered by | |
6 | +this license (the "Software") to use, reproduce, display, distribute, | |
7 | +execute, and transmit the Software, and to prepare derivative works of the | |
8 | +Software, and to permit third-parties to whom the Software is furnished to | |
9 | +do so, all subject to the following: | |
10 | + | |
11 | +The copyright notices in the Software and this entire statement, including | |
12 | +the above license grant, this restriction and the following disclaimer, | |
13 | +must be included in all copies of the Software, in whole or in part, and | |
14 | +all derivative works of the Software, unless such copies or derivative | |
15 | +works are solely in the form of machine-executable object code generated by | |
16 | +a source language processor. | |
17 | + | |
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | |
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | |
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | |
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | |
24 | +DEALINGS IN THE SOFTWARE. | |
25 | +*/ | |
26 | + | |
27 | + | |
28 | +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
29 | +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | |
30 | + | |
31 | +#include "core.h" | |
32 | + | |
33 | +namespace utf8 | |
34 | +{ | |
35 | + namespace unchecked | |
36 | + { | |
/// Encodes `cp` as UTF-8 and writes the octets through `result`.
///
/// No validation is done on `cp`. One octet is written below 0x80, two below
/// 0x800, three below 0x10000 and four for everything else.
///
/// @param cp      code point to encode
/// @param result  output iterator receiving the octets
/// @return the output iterator advanced past the last octet written
template <typename octet_iterator>
octet_iterator append(uint32_t cp, octet_iterator result)
{
    if (cp < 0x80) {                       // one octet, stored as is
        *(result++) = static_cast<uint8_t>(cp);
        return result;
    }

    // Multi-octet forms: a lead octet with a length marker, then trail
    // octets carrying six payload bits each.
    int trail_count = 3;                   // four octets
    uint32_t lead_marker = 0xf0;
    if (cp < 0x800) {                      // two octets
        trail_count = 1;
        lead_marker = 0xc0;
    }
    else if (cp < 0x10000) {               // three octets
        trail_count = 2;
        lead_marker = 0xe0;
    }

    *(result++) = static_cast<uint8_t>((cp >> (6 * trail_count)) | lead_marker);
    for (int shift = 6 * (trail_count - 1); shift >= 0; shift -= 6)
        *(result++) = static_cast<uint8_t>(((cp >> shift) & 0x3f) | 0x80);

    return result;
}
59 | + | |
60 | + template <typename octet_iterator> | |
61 | + uint32_t next(octet_iterator& it) | |
62 | + { | |
63 | + uint32_t cp = utf8::internal::mask8(*it); | |
64 | + typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); | |
65 | + switch (length) { | |
66 | + case 1: | |
67 | + break; | |
68 | + case 2: | |
69 | + it++; | |
70 | + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); | |
71 | + break; | |
72 | + case 3: | |
73 | + ++it; | |
74 | + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | |
75 | + ++it; | |
76 | + cp += (*it) & 0x3f; | |
77 | + break; | |
78 | + case 4: | |
79 | + ++it; | |
80 | + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | |
81 | + ++it; | |
82 | + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; | |
83 | + ++it; | |
84 | + cp += (*it) & 0x3f; | |
85 | + break; | |
86 | + } | |
87 | + ++it; | |
88 | + return cp; | |
89 | + } | |
90 | + | |
91 | + template <typename octet_iterator> | |
92 | + uint32_t peek_next(octet_iterator it) | |
93 | + { | |
94 | + return utf8::unchecked::next(it); | |
95 | + } | |
96 | + | |
97 | + template <typename octet_iterator> | |
98 | + uint32_t prior(octet_iterator& it) | |
99 | + { | |
100 | + while (utf8::internal::is_trail(*(--it))) ; | |
101 | + octet_iterator temp = it; | |
102 | + return utf8::unchecked::next(temp); | |
103 | + } | |
104 | + | |
105 | + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) | |
106 | + template <typename octet_iterator> | |
107 | + inline uint32_t previous(octet_iterator& it) | |
108 | + { | |
109 | + return utf8::unchecked::prior(it); | |
110 | + } | |
111 | + | |
112 | + template <typename octet_iterator, typename distance_type> | |
113 | + void advance (octet_iterator& it, distance_type n) | |
114 | + { | |
115 | + for (distance_type i = 0; i < n; ++i) | |
116 | + utf8::unchecked::next(it); | |
117 | + } | |
118 | + | |
119 | + template <typename octet_iterator> | |
120 | + typename std::iterator_traits<octet_iterator>::difference_type | |
121 | + distance (octet_iterator first, octet_iterator last) | |
122 | + { | |
123 | + typename std::iterator_traits<octet_iterator>::difference_type dist; | |
124 | + for (dist = 0; first < last; ++dist) | |
125 | + utf8::unchecked::next(first); | |
126 | + return dist; | |
127 | + } | |
128 | + | |
129 | + template <typename u16bit_iterator, typename octet_iterator> | |
130 | + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | |
131 | + { | |
132 | + while (start != end) { | |
133 | + uint32_t cp = utf8::internal::mask16(*start++); | |
134 | + // Take care of surrogate pairs first | |
135 | + if (utf8::internal::is_lead_surrogate(cp)) { | |
136 | + uint32_t trail_surrogate = utf8::internal::mask16(*start++); | |
137 | + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | |
138 | + } | |
139 | + result = utf8::unchecked::append(cp, result); | |
140 | + } | |
141 | + return result; | |
142 | + } | |
143 | + | |
144 | + template <typename u16bit_iterator, typename octet_iterator> | |
145 | + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | |
146 | + { | |
147 | + while (start < end) { | |
148 | + uint32_t cp = utf8::unchecked::next(start); | |
149 | + if (cp > 0xffff) { //make a surrogate pair | |
150 | + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); | |
151 | + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | |
152 | + } | |
153 | + else | |
154 | + *result++ = static_cast<uint16_t>(cp); | |
155 | + } | |
156 | + return result; | |
157 | + } | |
158 | + | |
159 | + template <typename octet_iterator, typename u32bit_iterator> | |
160 | + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | |
161 | + { | |
162 | + while (start != end) | |
163 | + result = utf8::unchecked::append(*(start++), result); | |
164 | + | |
165 | + return result; | |
166 | + } | |
167 | + | |
168 | + template <typename octet_iterator, typename u32bit_iterator> | |
169 | + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | |
170 | + { | |
171 | + while (start < end) | |
172 | + (*result++) = utf8::unchecked::next(start); | |
173 | + | |
174 | + return result; | |
175 | + } | |
176 | + | |
177 | + // The iterator class | |
178 | + template <typename octet_iterator> | |
179 | + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { | |
180 | + octet_iterator it; | |
181 | + public: | |
182 | + iterator () {} | |
183 | + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} | |
184 | + // the default "big three" are OK | |
185 | + octet_iterator base () const { return it; } | |
186 | + uint32_t operator * () const | |
187 | + { | |
188 | + octet_iterator temp = it; | |
189 | + return utf8::unchecked::next(temp); | |
190 | + } | |
191 | + bool operator == (const iterator& rhs) const | |
192 | + { | |
193 | + return (it == rhs.it); | |
194 | + } | |
195 | + bool operator != (const iterator& rhs) const | |
196 | + { | |
197 | + return !(operator == (rhs)); | |
198 | + } | |
199 | + iterator& operator ++ () | |
200 | + { | |
201 | + ::std::advance(it, utf8::internal::sequence_length(it)); | |
202 | + return *this; | |
203 | + } | |
204 | + iterator operator ++ (int) | |
205 | + { | |
206 | + iterator temp = *this; | |
207 | + ::std::advance(it, utf8::internal::sequence_length(it)); | |
208 | + return temp; | |
209 | + } | |
210 | + iterator& operator -- () | |
211 | + { | |
212 | + utf8::unchecked::prior(it); | |
213 | + return *this; | |
214 | + } | |
215 | + iterator operator -- (int) | |
216 | + { | |
217 | + iterator temp = *this; | |
218 | + utf8::unchecked::prior(it); | |
219 | + return temp; | |
220 | + } | |
221 | + }; // class iterator | |
222 | + | |
223 | + } // namespace utf8::unchecked | |
224 | +} // namespace utf8 | |
225 | + | |
226 | + | |
227 | +#endif // header guard | |
228 | + | |
... | ... |
morfeusz/test_morfeusz.cpp
0 → 100644
1 | +/* | |
2 | + * File: test_morfeusz.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 14 listopad 2013, 15:50 | |
6 | + */ | |
7 | + | |
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

#include "Morfeusz.hpp"
14 | + | |
15 | +using namespace std; | |
16 | + | |
17 | +void debug(const MorphInterpretation& interp) { | |
18 | + fprintf(stderr, | |
19 | + "%d %d %s %s %s %s\n", | |
20 | + interp.getStartNode(), interp.getEndNode(), | |
21 | + interp.getOrth(), interp.getLemma(), | |
22 | + interp.getTag(), interp.getName()); | |
23 | +} | |
24 | + | |
25 | +void doTest( | |
26 | + const Morfeusz& morfeusz, | |
27 | + const string& inputFilename) { | |
28 | + ifstream ifs; | |
29 | + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | |
30 | + ifs.open(inputFilename, ios::binary); | |
31 | + string line; | |
32 | + while (getline(ifs, line)) { | |
33 | + AnalyzeResult res(morfeusz.analyze(line)); | |
34 | + while (res.iterator != res.end) { | |
35 | + debug(*res); | |
36 | + res++; | |
37 | + } | |
38 | + } | |
39 | + validate(ifs.eof(), "Failed to read the input file to the end"); | |
40 | +} | |
41 | + | |
42 | +int main(int argc, char** argv) { | |
43 | + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and input filename."); | |
44 | + string fsaFilename = argv[1]; | |
45 | + string inputFilename = argv[2]; | |
46 | + Morfeusz morfeusz(fsaFilename); | |
47 | + doTest(morfeusz, inputFilename); | |
48 | + return 0; | |
49 | +} | |
50 | + | |
... | ... |
morfeusz/test_morph.cpp
... | ... | @@ -17,21 +17,6 @@ |
17 | 17 | |
18 | 18 | using namespace std; |
19 | 19 | |
20 | -void debug(const string& key, const vector<EncodedInterpretation> value) { | |
21 | - cerr << key << endl; | |
22 | - for (EncodedInterpretation i: value) { | |
23 | - cerr << "suffix to cut: " << i.lemma.suffixToCut << endl; | |
24 | - cerr << "suffix to add: " << i.lemma.suffixToAdd << endl; | |
25 | - cerr << "tag: " << i.tag << endl; | |
26 | - cerr << "name: " << i.nameClassifier << endl; | |
27 | - } | |
28 | - cerr << "==================" << endl; | |
29 | -} | |
30 | - | |
31 | -//void debug(const string& key, const TaggedInterpretation& value) { | |
32 | -// cerr << key << '\t' << value.toString() << endl; | |
33 | -//} | |
34 | - | |
35 | 20 | void doTest( |
36 | 21 | const FSA<vector<EncodedInterpretation>>& fsa, |
37 | 22 | const Tagset& tagset, |
... | ... |
nbproject/configurations.xml
... | ... | @@ -8,11 +8,13 @@ |
8 | 8 | <in>test_speed.cpp</in> |
9 | 9 | </df> |
10 | 10 | <df root="morfeusz" name="1"> |
11 | + <df name="encoding"> | |
12 | + <in>CharsetConverter.cpp</in> | |
13 | + <in>CharsetConverter.hpp</in> | |
14 | + </df> | |
11 | 15 | <in>Morfeusz.cpp</in> |
12 | - <in>Morfeusz.hpp</in> | |
13 | 16 | <in>MorphDeserializer.cpp</in> |
14 | 17 | <in>MorphInterpretation.cpp</in> |
15 | - <in>MorphInterpretation.hpp</in> | |
16 | 18 | <in>Tagset.cpp</in> |
17 | 19 | <in>main.cpp</in> |
18 | 20 | <in>morfeusz.cpp</in> |
... | ... | @@ -49,19 +51,11 @@ |
49 | 51 | <executablePath>build/fsa/test_dict</executablePath> |
50 | 52 | </makeTool> |
51 | 53 | </makefileType> |
52 | - <folder path="1"> | |
54 | + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> | |
53 | 55 | <ccTool> |
54 | 56 | <incDir> |
55 | 57 | <pElem>fsa</pElem> |
56 | - <pElem>build/morfeusz</pElem> | |
57 | - </incDir> | |
58 | - </ccTool> | |
59 | - </folder> | |
60 | - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> | |
61 | - <ccTool> | |
62 | - <incDir> | |
63 | - <pElem>fsa</pElem> | |
64 | - <pElem>build/morfeusz</pElem> | |
58 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
65 | 59 | </incDir> |
66 | 60 | </ccTool> |
67 | 61 | </item> |
... | ... | @@ -86,32 +80,86 @@ |
86 | 80 | </incDir> |
87 | 81 | </ccTool> |
88 | 82 | </item> |
89 | - <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="0"> | |
90 | - </item> | |
91 | - <item path="morfeusz/Morfeusz.hpp" ex="false" tool="3" flavor2="0"> | |
83 | + <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> | |
84 | + <ccTool> | |
85 | + <incDir> | |
86 | + <pElem>fsa</pElem> | |
87 | + <pElem>build/morfeusz</pElem> | |
88 | + </incDir> | |
89 | + </ccTool> | |
92 | 90 | </item> |
93 | 91 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> |
94 | 92 | <ccTool> |
93 | + <incDir> | |
94 | + <pElem>fsa</pElem> | |
95 | + <pElem>build/morfeusz</pElem> | |
96 | + </incDir> | |
95 | 97 | </ccTool> |
96 | 98 | </item> |
97 | - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="0"> | |
98 | - </item> | |
99 | - <item path="morfeusz/MorphInterpretation.hpp" ex="false" tool="3" flavor2="0"> | |
99 | + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> | |
100 | + <ccTool> | |
101 | + <incDir> | |
102 | + <pElem>morfeusz</pElem> | |
103 | + <pElem>/usr/include/c++/4.8/bits</pElem> | |
104 | + <pElem>/usr/include/c++/4.8/ext</pElem> | |
105 | + <pElem>/usr/include/c++/4.8</pElem> | |
106 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | |
107 | + <pElem>/usr/include/c++/4.8/debug</pElem> | |
108 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
109 | + <pElem>/usr/include/c++/4.8/backward</pElem> | |
110 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | |
111 | + <pElem>build/morfeusz</pElem> | |
112 | + </incDir> | |
113 | + </ccTool> | |
100 | 114 | </item> |
101 | - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | |
115 | + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> | |
102 | 116 | <ccTool> |
117 | + <incDir> | |
118 | + <pElem>morfeusz</pElem> | |
119 | + <pElem>/usr/include/c++/4.8/bits</pElem> | |
120 | + <pElem>/usr/include/c++/4.8/ext</pElem> | |
121 | + <pElem>/usr/include/c++/4.8</pElem> | |
122 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | |
123 | + <pElem>/usr/include/c++/4.8/debug</pElem> | |
124 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | |
125 | + <pElem>fsa</pElem> | |
126 | + <pElem>/usr/include/c++/4.8/backward</pElem> | |
127 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | |
128 | + <pElem>build/morfeusz</pElem> | |
129 | + </incDir> | |
103 | 130 | </ccTool> |
104 | 131 | </item> |
132 | + <item path="morfeusz/encoding/CharsetConverter.cpp" | |
133 | + ex="false" | |
134 | + tool="1" | |
135 | + flavor2="0"> | |
136 | + </item> | |
137 | + <item path="morfeusz/encoding/CharsetConverter.hpp" | |
138 | + ex="false" | |
139 | + tool="3" | |
140 | + flavor2="0"> | |
141 | + </item> | |
105 | 142 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
106 | 143 | <ccTool> |
144 | + <incDir> | |
145 | + <pElem>fsa</pElem> | |
146 | + <pElem>build/morfeusz</pElem> | |
147 | + </incDir> | |
107 | 148 | </ccTool> |
108 | 149 | </item> |
109 | 150 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
110 | 151 | <ccTool> |
152 | + <incDir> | |
153 | + <pElem>morfeusz</pElem> | |
154 | + </incDir> | |
111 | 155 | </ccTool> |
112 | 156 | </item> |
113 | 157 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> |
114 | 158 | <ccTool> |
159 | + <incDir> | |
160 | + <pElem>fsa</pElem> | |
161 | + <pElem>build/morfeusz</pElem> | |
162 | + </incDir> | |
115 | 163 | </ccTool> |
116 | 164 | </item> |
117 | 165 | </conf> |
... | ... |
nbproject/project.xml
... | ... | @@ -6,7 +6,7 @@ |
6 | 6 | <name>morfeusz</name> |
7 | 7 | <c-extensions/> |
8 | 8 | <cpp-extensions>cpp</cpp-extensions> |
9 | - <header-extensions>hpp</header-extensions> | |
9 | + <header-extensions>h,hpp</header-extensions> | |
10 | 10 | <sourceEncoding>UTF-8</sourceEncoding> |
11 | 11 | <make-dep-projects/> |
12 | 12 | <sourceRootList> |
... | ... |