Commit f23aead2806888d01857b660722db3c3e372201e

Authored by Michał Lenart
1 parent 58aafafe

- dalsza praca nad klasą Morfeusz

- dodanie konwersji kodowań znaków (na razie tylko głupi szkielet, który obsługuje UTF-8)

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@19 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsa/fsa.hpp
... ... @@ -15,6 +15,7 @@
15 15 #include <exception>
16 16 #include <string>
17 17 #include <vector>
  18 +#include <netinet/in.h>
18 19  
19 20 template <class T> class State;
20 21 template <class T> class FSA;
... ... @@ -83,6 +84,11 @@ public:
83 84 */
84 85 static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
85 86  
  87 + /**
  88 + * Create an FSA object from file
  89 + */
  90 + static FSA<T>* getFSA(const std::string& filename, const Deserializer<T>& deserializer);
  91 +
86 92 protected:
87 93  
88 94 /**
... ...
fsa/fsa_impl.hpp
... ... @@ -13,6 +13,7 @@
13 13 #include <utility>
14 14 #include <iostream>
15 15 #include <vector>
  16 +#include <string>
16 17 #include <netinet/in.h>
17 18 #include "utils.hpp"
18 19 #include "const.hpp"
... ... @@ -55,6 +56,11 @@ State<T> FSA<T>::getInitialState() const {
55 56 }
56 57  
57 58 template <class T>
  59 +FSA<T>* FSA<T>::getFSA(const std::string& filename, const Deserializer<T>& deserializer) {
  60 + return getFSA(readFile(filename.c_str()), deserializer);
  61 +}
  62 +
  63 +template <class T>
58 64 FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) {
59 65  
60 66 uint32_t magicNumber = ntohl(*((uint32_t*) ptr));
... ... @@ -64,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
64 70  
65 71 uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET);
66 72 if (versionNum != VERSION_NUM) {
67   - throw FSAException(string("Invalid version number: ") + to_string(versionNum) + ", should be: " + to_string(VERSION_NUM));
  73 + throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + to_string(VERSION_NUM));
68 74 }
69 75  
70 76 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET);
... ...
morfeusz/CMakeLists.txt
... ... @@ -7,9 +7,11 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa)
7 7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 8 add_executable (morfeusz2_analyze main.cpp)
9 9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp)
  10 +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp)
10 11  
11 12 # Link the morfeusz2_analyze executable to the morfeusz2 library.
12 13 target_link_libraries (morfeusz2_analyze morfeusz2)
13 14 set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" )
14 15  
15 16 set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
  17 +set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
16 18 \ No newline at end of file
... ...
morfeusz/EncodedInterpretation.hpp
... ... @@ -15,6 +15,9 @@
15 15  
16 16 using namespace std;
17 17  
  18 +/*
  19 + * Lemma in a compressed format (as in an automaton)
  20 + */
18 21 struct EncodedLemma {
19 22 int suffixToCut;
20 23 string suffixToAdd;
... ...
morfeusz/Morfeusz.cpp
1 1 /*
2 2 * File: Morfeusz.cpp
3   - * Author: lennyn
  3 + * Author: mlenart
4 4 *
5 5 * Created on November 13, 2013, 5:21 PM
6 6 */
7 7  
  8 +#include <string>
  9 +#include "utils.hpp"
8 10 #include "Morfeusz.hpp"
  11 +#include "MorphDeserializer.hpp"
  12 +#include "encoding/CharsetConverter.hpp"
9 13  
10   -Morfeusz::Morfeusz() {
  14 +using namespace std;
  15 +
  16 +static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) {
  17 + static Deserializer<vector<EncodedInterpretation>>* deserializer
  18 + = new MorphDeserializer();
  19 + return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer);
  20 +}
  21 +
  22 +static CharsetConverter* initializeCharsetConverter() {
  23 + static CharsetConverter* converter = new UTF8CharsetConverter();
  24 + return converter;
11 25 }
12 26  
13   -Morfeusz::Morfeusz(const Morfeusz& orig) {
  27 +Morfeusz::Morfeusz(const string& filename)
  28 +: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) {
  29 +
14 30 }
15 31  
  32 +//Morfeusz::Morfeusz(const Morfeusz& orig) {
  33 +//}
  34 +
16 35 Morfeusz::~Morfeusz() { // releases the automaton that the constructor stored in `fsa`
  36 + delete this->fsa; // delete the pointee, not `&this->fsa` (address of the member itself, never returned by new); assumes FSA::getFSA hands back a new-allocated object - TODO confirm
  37 +}
  38 +
  39 +AnalyzeResult Morfeusz::analyze(const std::string& text) {
  40 + const char* textStart = text.c_str();
  41 + const char* textEnd = text.c_str() + text.length();
  42 + AnalyzeResult res = {
  43 + ResultsIterator(textStart, textEnd, *this),
  44 + ResultsIterator(textEnd, textEnd, *this)};
  45 + return res;
17 46 }
18 47  
... ...
morfeusz/Morfeusz.hpp
... ... @@ -9,8 +9,11 @@
9 9 #define MORFEUSZ_HPP
10 10  
11 11 #include <string>
  12 +#include <vector>
  13 +#include "EncodedInterpretation.hpp"
  14 +#include "fsa.hpp"
12 15 #include "MorphInterpretation.hpp"
13   -//#include "interpretations.hpp"
  16 +#include "encoding/CharsetConverter.hpp"
14 17  
15 18 class Morfeusz;
16 19 class AnalyzeResult;
... ... @@ -18,22 +21,21 @@ class ResultsIterator;
18 21  
19 22 class Morfeusz {
20 23 public:
21   -
  24 + explicit Morfeusz(const std::string& filename);
  25 + virtual ~Morfeusz();
  26 +// Morfeusz(const Morfeusz& orig);
22 27 AnalyzeResult analyze(const std::string& text);
23 28  
24   - Morfeusz();
25   - Morfeusz(const Morfeusz& orig);
26   - virtual ~Morfeusz();
  29 +// Morfeusz();
27 30 private:
28   - void processOneWord(const char*& inputData, int startNodeNum, vector<MorphInterpretation>& resInterps);
  31 + void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps);
  32 + const FSA<std::vector<EncodedInterpretation>>* fsa;
  33 + CharsetConverter* charsetConverter;
29 34 };
30 35  
31 36 class ResultsIterator {
32 37 public:
33 38 ResultsIterator(
34   - const std::string& text,
35   - const Morfeusz& morfeusz);
36   - ResultsIterator(
37 39 const char* startOfInput,
38 40 const char* endOfInput,
39 41 const Morfeusz& morfeusz);
... ...
morfeusz/MorphInterpretation.cpp
... ... @@ -42,6 +42,14 @@ MorphInterpretation::MorphInterpretation(
42 42 MorphInterpretation::~MorphInterpretation() {
43 43 }
44 44  
  45 +int MorphInterpretation::getStartNode() const {
  46 + return this->startNode;
  47 +}
  48 +
  49 +int MorphInterpretation::getEndNode() const {
  50 + return this->endNode;
  51 +}
  52 +
45 53 const std::string& MorphInterpretation::getOrth() const {
46 54 return this->orth;
47 55 }
... ...
morfeusz/MorphInterpretation.hpp
... ... @@ -21,6 +21,8 @@ public:
21 21 const EncodedInterpretation& encodedInterp,
22 22 const Tagset& tagset);
23 23 virtual ~MorphInterpretation();
  24 + int getStartNode() const;
  25 + int getEndNode() const;
24 26 const std::string& getOrth() const;
25 27 const std::string& getLemma() const;
26 28 int getTagnum() const;
... ...
morfeusz/encoding/CharsetConverter.cpp 0 → 100644
  1 +/*
  2 + * File: CharsetConverter.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 14 listopad 2013, 17:28
  6 + */
  7 +
  8 +#include "utf8.h"
  9 +#include "CharsetConverter.hpp"
  10 +
  11 +uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
  12 + return utf8::next(it, end);
  13 +}
  14 +const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const {
  15 + return utf8::append(cp, result);
  16 +}
... ...
morfeusz/encoding/CharsetConverter.hpp 0 → 100644
  1 +/*
  2 + * File: CharsetConverter.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 14 listopad 2013, 17:28
  6 + */
  7 +
  8 +#ifndef ENCODINGCONVERTER_HPP
  9 +#define ENCODINGCONVERTER_HPP
  10 +
  11 +class CharsetConverter {
  12 +public:
  13 + virtual uint32_t next(const char*& it, const char* end) const = 0;
  14 + virtual const char* append(uint32_t cp, const char* result) const = 0;
  15 +private:
  16 +};
  17 +
  18 +class UTF8CharsetConverter: public CharsetConverter {
  19 +public:
  20 + uint32_t next(const char*& it, const char* end) const;
  21 + const char* append(uint32_t cp, const char* result) const;
  22 +private:
  23 +};
  24 +
  25 +class UTF16CharsetConverter: public CharsetConverter {
  26 +public:
  27 + uint32_t next(const char*& it, const char* end) const;
  28 + const char* append(uint32_t cp, const char* result) const;
  29 +private:
  30 +};
  31 +
  32 +class UTF32CharsetConverter: public CharsetConverter {
  33 +public:
  34 + uint32_t next(const char*& it, const char* end) const;
  35 + const char* append(uint32_t cp, const char* result) const;
  36 +private:
  37 +};
  38 +
  39 +class ISO8859_2_CharsetConverter: public CharsetConverter {
  40 +public:
  41 + uint32_t next(const char*& it, const char* end) const;
  42 + const char* append(uint32_t cp, const char* result) const;
  43 +private:
  44 +};
  45 +
  46 +#endif /* ENCODINGCONVERTER_HPP */
  47 +
... ...
morfeusz/encoding/utf8.h 0 → 100644
  1 +// Copyright 2006 Nemanja Trifunovic
  2 +
  3 +/*
  4 +Permission is hereby granted, free of charge, to any person or organization
  5 +obtaining a copy of the software and accompanying documentation covered by
  6 +this license (the "Software") to use, reproduce, display, distribute,
  7 +execute, and transmit the Software, and to prepare derivative works of the
  8 +Software, and to permit third-parties to whom the Software is furnished to
  9 +do so, all subject to the following:
  10 +
  11 +The copyright notices in the Software and this entire statement, including
  12 +the above license grant, this restriction and the following disclaimer,
  13 +must be included in all copies of the Software, in whole or in part, and
  14 +all derivative works of the Software, unless such copies or derivative
  15 +works are solely in the form of machine-executable object code generated by
  16 +a source language processor.
  17 +
  18 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  21 +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  22 +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  23 +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 +DEALINGS IN THE SOFTWARE.
  25 +*/
  26 +
  27 +
  28 +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
  29 +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731
  30 +
  31 +#include "utf8/checked.h"
  32 +#include "utf8/unchecked.h"
  33 +
  34 +#endif // header guard
... ...
morfeusz/encoding/utf8/checked.h 0 → 100644
  1 +// Copyright 2006 Nemanja Trifunovic
  2 +
  3 +/*
  4 +Permission is hereby granted, free of charge, to any person or organization
  5 +obtaining a copy of the software and accompanying documentation covered by
  6 +this license (the "Software") to use, reproduce, display, distribute,
  7 +execute, and transmit the Software, and to prepare derivative works of the
  8 +Software, and to permit third-parties to whom the Software is furnished to
  9 +do so, all subject to the following:
  10 +
  11 +The copyright notices in the Software and this entire statement, including
  12 +the above license grant, this restriction and the following disclaimer,
  13 +must be included in all copies of the Software, in whole or in part, and
  14 +all derivative works of the Software, unless such copies or derivative
  15 +works are solely in the form of machine-executable object code generated by
  16 +a source language processor.
  17 +
  18 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  21 +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  22 +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  23 +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 +DEALINGS IN THE SOFTWARE.
  25 +*/
  26 +
  27 +
  28 +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  29 +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  30 +
  31 +#include "core.h"
  32 +#include <stdexcept>
  33 +
  34 +namespace utf8
  35 +{
  36 + // Base for the exceptions that may be thrown from the library
  37 + class exception : public ::std::exception {
  38 + };
  39 +
  40 + // Exceptions that may be thrown from the library functions.
  41 + class invalid_code_point : public exception {
  42 + uint32_t cp;
  43 + public:
  44 + invalid_code_point(uint32_t cp) : cp(cp) {}
  45 + virtual const char* what() const throw() { return "Invalid code point"; }
  46 + uint32_t code_point() const {return cp;}
  47 + };
  48 +
  49 + class invalid_utf8 : public exception {
  50 + uint8_t u8;
  51 + public:
  52 + invalid_utf8 (uint8_t u) : u8(u) {}
  53 + virtual const char* what() const throw() { return "Invalid UTF-8"; }
  54 + uint8_t utf8_octet() const {return u8;}
  55 + };
  56 +
  57 + class invalid_utf16 : public exception {
  58 + uint16_t u16;
  59 + public:
  60 + invalid_utf16 (uint16_t u) : u16(u) {}
  61 + virtual const char* what() const throw() { return "Invalid UTF-16"; }
  62 + uint16_t utf16_word() const {return u16;}
  63 + };
  64 +
  65 + class not_enough_room : public exception {
  66 + public:
  67 + virtual const char* what() const throw() { return "Not enough space"; }
  68 + };
  69 +
  70 + /// The library API - functions intended to be called by the users
  71 +
  72 + template <typename octet_iterator>
  73 + octet_iterator append(uint32_t cp, octet_iterator result)
  74 + {
  75 + if (!utf8::internal::is_code_point_valid(cp))
  76 + throw invalid_code_point(cp);
  77 +
  78 + if (cp < 0x80) // one octet
  79 + *(result++) = static_cast<uint8_t>(cp);
  80 + else if (cp < 0x800) { // two octets
  81 + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
  82 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  83 + }
  84 + else if (cp < 0x10000) { // three octets
  85 + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
  86 + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
  87 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  88 + }
  89 + else { // four octets
  90 + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
  91 + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80);
  92 + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
  93 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  94 + }
  95 + return result;
  96 + }
  97 +
  98 + template <typename octet_iterator, typename output_iterator>
  99 + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement)
  100 + {
  101 + while (start != end) {
  102 + octet_iterator sequence_start = start;
  103 + internal::utf_error err_code = utf8::internal::validate_next(start, end);
  104 + switch (err_code) {
  105 + case internal::UTF8_OK :
  106 + for (octet_iterator it = sequence_start; it != start; ++it)
  107 + *out++ = *it;
  108 + break;
  109 + case internal::NOT_ENOUGH_ROOM:
  110 + throw not_enough_room();
  111 + case internal::INVALID_LEAD:
  112 + out = utf8::append (replacement, out);
  113 + ++start;
  114 + break;
  115 + case internal::INCOMPLETE_SEQUENCE:
  116 + case internal::OVERLONG_SEQUENCE:
  117 + case internal::INVALID_CODE_POINT:
  118 + out = utf8::append (replacement, out);
  119 + ++start;
  120 + // just one replacement mark for the sequence
  121 + while (start != end && utf8::internal::is_trail(*start))
  122 + ++start;
  123 + break;
  124 + }
  125 + }
  126 + return out;
  127 + }
  128 +
  129 + template <typename octet_iterator, typename output_iterator>
  130 + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out)
  131 + {
  132 + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd);
  133 + return utf8::replace_invalid(start, end, out, replacement_marker);
  134 + }
  135 +
  136 + template <typename octet_iterator>
  137 + uint32_t next(octet_iterator& it, octet_iterator end)
  138 + {
  139 + uint32_t cp = 0;
  140 + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp);
  141 + switch (err_code) {
  142 + case internal::UTF8_OK :
  143 + break;
  144 + case internal::NOT_ENOUGH_ROOM :
  145 + throw not_enough_room();
  146 + case internal::INVALID_LEAD :
  147 + case internal::INCOMPLETE_SEQUENCE :
  148 + case internal::OVERLONG_SEQUENCE :
  149 + throw invalid_utf8(*it);
  150 + case internal::INVALID_CODE_POINT :
  151 + throw invalid_code_point(cp);
  152 + }
  153 + return cp;
  154 + }
  155 +
  156 + template <typename octet_iterator>
  157 + uint32_t peek_next(octet_iterator it, octet_iterator end)
  158 + {
  159 + return utf8::next(it, end);
  160 + }
  161 +
  162 + template <typename octet_iterator>
  163 + uint32_t prior(octet_iterator& it, octet_iterator start)
  164 + {
  165 + // can't do much if it == start
  166 + if (it == start)
  167 + throw not_enough_room();
  168 +
  169 + octet_iterator end = it;
  170 + // Go back until we hit either a lead octet or start
  171 + while (utf8::internal::is_trail(*(--it)))
  172 + if (it == start)
  173 + throw invalid_utf8(*it); // error - no lead byte in the sequence
  174 + return utf8::peek_next(it, end);
  175 + }
  176 +
  177 + /// Deprecated in versions that include "prior"
  178 + template <typename octet_iterator>
  179 + uint32_t previous(octet_iterator& it, octet_iterator pass_start)
  180 + {
  181 + octet_iterator end = it;
  182 + while (utf8::internal::is_trail(*(--it)))
  183 + if (it == pass_start)
  184 + throw invalid_utf8(*it); // error - no lead byte in the sequence
  185 + octet_iterator temp = it;
  186 + return utf8::next(temp, end);
  187 + }
  188 +
  189 + template <typename octet_iterator, typename distance_type>
  190 + void advance (octet_iterator& it, distance_type n, octet_iterator end)
  191 + {
  192 + for (distance_type i = 0; i < n; ++i)
  193 + utf8::next(it, end);
  194 + }
  195 +
  196 + template <typename octet_iterator>
  197 + typename std::iterator_traits<octet_iterator>::difference_type
  198 + distance (octet_iterator first, octet_iterator last)
  199 + {
  200 + typename std::iterator_traits<octet_iterator>::difference_type dist;
  201 + for (dist = 0; first < last; ++dist)
  202 + utf8::next(first, last);
  203 + return dist;
  204 + }
  205 +
  206 + template <typename u16bit_iterator, typename octet_iterator>
  207 + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
  208 + {
  209 + while (start != end) {
  210 + uint32_t cp = utf8::internal::mask16(*start++);
  211 + // Take care of surrogate pairs first
  212 + if (utf8::internal::is_lead_surrogate(cp)) {
  213 + if (start != end) {
  214 + uint32_t trail_surrogate = utf8::internal::mask16(*start++);
  215 + if (utf8::internal::is_trail_surrogate(trail_surrogate))
  216 + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
  217 + else
  218 + throw invalid_utf16(static_cast<uint16_t>(trail_surrogate));
  219 + }
  220 + else
  221 + throw invalid_utf16(static_cast<uint16_t>(cp));
  222 +
  223 + }
  224 + // Lone trail surrogate
  225 + else if (utf8::internal::is_trail_surrogate(cp))
  226 + throw invalid_utf16(static_cast<uint16_t>(cp));
  227 +
  228 + result = utf8::append(cp, result);
  229 + }
  230 + return result;
  231 + }
  232 +
  233 + template <typename u16bit_iterator, typename octet_iterator>
  234 + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
  235 + {
  236 + while (start != end) {
  237 + uint32_t cp = utf8::next(start, end);
  238 + if (cp > 0xffff) { //make a surrogate pair
  239 + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
  240 + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
  241 + }
  242 + else
  243 + *result++ = static_cast<uint16_t>(cp);
  244 + }
  245 + return result;
  246 + }
  247 +
  248 + template <typename octet_iterator, typename u32bit_iterator>
  249 + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
  250 + {
  251 + while (start != end)
  252 + result = utf8::append(*(start++), result);
  253 +
  254 + return result;
  255 + }
  256 +
  257 + template <typename octet_iterator, typename u32bit_iterator>
  258 + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
  259 + {
  260 + while (start != end)
  261 + (*result++) = utf8::next(start, end);
  262 +
  263 + return result;
  264 + }
  265 +
  266 + // The iterator class
  267 + template <typename octet_iterator>
  268 + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
  269 + octet_iterator it;
  270 + octet_iterator range_start;
  271 + octet_iterator range_end;
  272 + public:
  273 + iterator () {}
  274 + explicit iterator (const octet_iterator& octet_it,
  275 + const octet_iterator& range_start,
  276 + const octet_iterator& range_end) :
  277 + it(octet_it), range_start(range_start), range_end(range_end)
  278 + {
  279 + if (it < range_start || it > range_end)
  280 + throw std::out_of_range("Invalid utf-8 iterator position");
  281 + }
  282 + // the default "big three" are OK
  283 + octet_iterator base () const { return it; }
  284 + uint32_t operator * () const
  285 + {
  286 + octet_iterator temp = it;
  287 + return utf8::next(temp, range_end);
  288 + }
  289 + bool operator == (const iterator& rhs) const
  290 + {
  291 + if (range_start != rhs.range_start || range_end != rhs.range_end)
  292 + throw std::logic_error("Comparing utf-8 iterators defined with different ranges");
  293 + return (it == rhs.it);
  294 + }
  295 + bool operator != (const iterator& rhs) const
  296 + {
  297 + return !(operator == (rhs));
  298 + }
  299 + iterator& operator ++ ()
  300 + {
  301 + utf8::next(it, range_end);
  302 + return *this;
  303 + }
  304 + iterator operator ++ (int)
  305 + {
  306 + iterator temp = *this;
  307 + utf8::next(it, range_end);
  308 + return temp;
  309 + }
  310 + iterator& operator -- ()
  311 + {
  312 + utf8::prior(it, range_start);
  313 + return *this;
  314 + }
  315 + iterator operator -- (int)
  316 + {
  317 + iterator temp = *this;
  318 + utf8::prior(it, range_start);
  319 + return temp;
  320 + }
  321 + }; // class iterator
  322 +
  323 +} // namespace utf8
  324 +
  325 +#endif //header guard
  326 +
  327 +
... ...
morfeusz/encoding/utf8/core.h 0 → 100644
  1 +// Copyright 2006 Nemanja Trifunovic
  2 +
  3 +/*
  4 +Permission is hereby granted, free of charge, to any person or organization
  5 +obtaining a copy of the software and accompanying documentation covered by
  6 +this license (the "Software") to use, reproduce, display, distribute,
  7 +execute, and transmit the Software, and to prepare derivative works of the
  8 +Software, and to permit third-parties to whom the Software is furnished to
  9 +do so, all subject to the following:
  10 +
  11 +The copyright notices in the Software and this entire statement, including
  12 +the above license grant, this restriction and the following disclaimer,
  13 +must be included in all copies of the Software, in whole or in part, and
  14 +all derivative works of the Software, unless such copies or derivative
  15 +works are solely in the form of machine-executable object code generated by
  16 +a source language processor.
  17 +
  18 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  21 +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  22 +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  23 +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 +DEALINGS IN THE SOFTWARE.
  25 +*/
  26 +
  27 +
  28 +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  29 +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  30 +
  31 +#include <iterator>
  32 +#include <cstdint>
  33 +
  34 +namespace utf8
  35 +{
  36 + // The typedefs for 8-bit, 16-bit and 32-bit unsigned integers
  37 + // You may need to change them to match your system.
  38 + // These typedefs have the same names as ones from cstdint, or boost/cstdint
  39 +// typedef unsigned char uint8_t;
  40 +// typedef unsigned short uint16_t;
  41 +// typedef unsigned int uint32_t;
  42 +
  43 +// Helper code - not intended to be directly called by the library users. May be changed at any time
  44 +namespace internal
  45 +{
  46 + // Unicode constants
  47 + // Leading (high) surrogates: 0xd800 - 0xdbff
  48 + // Trailing (low) surrogates: 0xdc00 - 0xdfff
  49 + const uint16_t LEAD_SURROGATE_MIN = 0xd800u;
  50 + const uint16_t LEAD_SURROGATE_MAX = 0xdbffu;
  51 + const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
  52 + const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
  53 + const uint16_t LEAD_OFFSET = LEAD_SURROGATE_MIN - (0x10000 >> 10);
  54 + const uint32_t SURROGATE_OFFSET = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;
  55 +
  56 + // Maximum valid value for a Unicode code point
  57 + const uint32_t CODE_POINT_MAX = 0x0010ffffu;
  58 +
  59 + template<typename octet_type>
  60 + inline uint8_t mask8(octet_type oc)
  61 + {
  62 + return static_cast<uint8_t>(0xff & oc);
  63 + }
  64 + template<typename u16_type>
  65 + inline uint16_t mask16(u16_type oc)
  66 + {
  67 + return static_cast<uint16_t>(0xffff & oc);
  68 + }
  69 + template<typename octet_type>
  70 + inline bool is_trail(octet_type oc)
  71 + {
  72 + return ((utf8::internal::mask8(oc) >> 6) == 0x2);
  73 + }
  74 +
  75 + template <typename u16>
  76 + inline bool is_lead_surrogate(u16 cp)
  77 + {
  78 + return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
  79 + }
  80 +
  81 + template <typename u16>
  82 + inline bool is_trail_surrogate(u16 cp)
  83 + {
  84 + return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
  85 + }
  86 +
  87 + template <typename u16>
  88 + inline bool is_surrogate(u16 cp)
  89 + {
  90 + return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
  91 + }
  92 +
  93 + template <typename u32>
  94 + inline bool is_code_point_valid(u32 cp)
  95 + {
  96 + return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
  97 + }
  98 +
  99 + template <typename octet_iterator>
  100 + inline typename std::iterator_traits<octet_iterator>::difference_type
  101 + sequence_length(octet_iterator lead_it)
  102 + {
  103 + uint8_t lead = utf8::internal::mask8(*lead_it);
  104 + if (lead < 0x80)
  105 + return 1;
  106 + else if ((lead >> 5) == 0x6)
  107 + return 2;
  108 + else if ((lead >> 4) == 0xe)
  109 + return 3;
  110 + else if ((lead >> 3) == 0x1e)
  111 + return 4;
  112 + else
  113 + return 0;
  114 + }
  115 +
  116 + template <typename octet_difference_type>
  117 + inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
  118 + {
  119 + if (cp < 0x80) {
  120 + if (length != 1)
  121 + return true;
  122 + }
  123 + else if (cp < 0x800) {
  124 + if (length != 2)
  125 + return true;
  126 + }
  127 + else if (cp < 0x10000) {
  128 + if (length != 3)
  129 + return true;
  130 + }
  131 +
  132 + return false;
  133 + }
  134 +
  135 + enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};
  136 +
  137 + /// Helper for get_sequence_x
  138 + template <typename octet_iterator>
  139 + utf_error increase_safely(octet_iterator& it, octet_iterator end)
  140 + {
  141 + if (++it == end)
  142 + return NOT_ENOUGH_ROOM;
  143 +
  144 + if (!utf8::internal::is_trail(*it))
  145 + return INCOMPLETE_SEQUENCE;
  146 +
  147 + return UTF8_OK;
  148 + }
  149 +
  150 + #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}
  151 +
  152 + /// get_sequence_x functions decode utf-8 sequences of the length x
  153 + template <typename octet_iterator>
  154 + utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
  155 + {
  156 + if (it == end)
  157 + return NOT_ENOUGH_ROOM;
  158 +
  159 + code_point = utf8::internal::mask8(*it);
  160 +
  161 + return UTF8_OK;
  162 + }
  163 +
  164 + template <typename octet_iterator>
  165 + utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
  166 + {
  167 + if (it == end)
  168 + return NOT_ENOUGH_ROOM;
  169 +
  170 + code_point = utf8::internal::mask8(*it);
  171 +
  172 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  173 +
  174 + code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);
  175 +
  176 + return UTF8_OK;
  177 + }
  178 +
  179 + template <typename octet_iterator>
  180 + utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
  181 + {
  182 + if (it == end)
  183 + return NOT_ENOUGH_ROOM;
  184 +
  185 + code_point = utf8::internal::mask8(*it);
  186 +
  187 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  188 +
  189 + code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
  190 +
  191 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  192 +
  193 + code_point += (*it) & 0x3f;
  194 +
  195 + return UTF8_OK;
  196 + }
  197 +
  198 + template <typename octet_iterator>
  199 + utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
  200 + {
  201 + if (it == end)
  202 + return NOT_ENOUGH_ROOM;
  203 +
  204 + code_point = utf8::internal::mask8(*it);
  205 +
  206 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  207 +
  208 + code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
  209 +
  210 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  211 +
  212 + code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;
  213 +
  214 + UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)
  215 +
  216 + code_point += (*it) & 0x3f;
  217 +
  218 + return UTF8_OK;
  219 + }
  220 +
  221 + #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR
  222 +
  223 + template <typename octet_iterator>
  224 + utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
  225 + {
  226 + // Save the original value of it so we can go back in case of failure
  227 + // Of course, it does not make much sense with i.e. stream iterators
  228 + octet_iterator original_it = it;
  229 +
  230 + uint32_t cp = 0;
  231 + // Determine the sequence length based on the lead octet
  232 + typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
  233 + const octet_difference_type length = utf8::internal::sequence_length(it);
  234 +
  235 + // Get trail octets and calculate the code point
  236 + utf_error err = UTF8_OK;
  237 + switch (length) {
  238 + case 0:
  239 + return INVALID_LEAD;
  240 + case 1:
  241 + err = utf8::internal::get_sequence_1(it, end, cp);
  242 + break;
  243 + case 2:
  244 + err = utf8::internal::get_sequence_2(it, end, cp);
  245 + break;
  246 + case 3:
  247 + err = utf8::internal::get_sequence_3(it, end, cp);
  248 + break;
  249 + case 4:
  250 + err = utf8::internal::get_sequence_4(it, end, cp);
  251 + break;
  252 + }
  253 +
  254 + if (err == UTF8_OK) {
  255 + // Decoding succeeded. Now, security checks...
  256 + if (utf8::internal::is_code_point_valid(cp)) {
  257 + if (!utf8::internal::is_overlong_sequence(cp, length)){
  258 + // Passed! Return here.
  259 + code_point = cp;
  260 + ++it;
  261 + return UTF8_OK;
  262 + }
  263 + else
  264 + err = OVERLONG_SEQUENCE;
  265 + }
  266 + else
  267 + err = INVALID_CODE_POINT;
  268 + }
  269 +
  270 + // Failure branch - restore the original value of the iterator
  271 + it = original_it;
  272 + return err;
  273 + }
  274 +
  275 + template <typename octet_iterator>
  276 + inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
  277 + uint32_t ignored;
  278 + return utf8::internal::validate_next(it, end, ignored);
  279 + }
  280 +
  281 +} // namespace internal
  282 +
  283 + /// The library API - functions intended to be called by the users
  284 +
  285 + // Byte order mark
  286 + const uint8_t bom[] = {0xef, 0xbb, 0xbf};
  287 +
  288 + template <typename octet_iterator>
  289 + octet_iterator find_invalid(octet_iterator start, octet_iterator end)
  290 + {
  291 + octet_iterator result = start;
  292 + while (result != end) {
  293 + utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
  294 + if (err_code != internal::UTF8_OK)
  295 + return result;
  296 + }
  297 + return result;
  298 + }
  299 +
  300 + template <typename octet_iterator>
  301 + inline bool is_valid(octet_iterator start, octet_iterator end)
  302 + {
  303 + return (utf8::find_invalid(start, end) == end);
  304 + }
  305 +
  306 + template <typename octet_iterator>
  307 + inline bool starts_with_bom (octet_iterator it, octet_iterator end)
  308 + {
  309 + return (
  310 + ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
  311 + ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
  312 + ((it != end) && (utf8::internal::mask8(*it)) == bom[2])
  313 + );
  314 + }
  315 +
  316 + //Deprecated in release 2.3
  317 + template <typename octet_iterator>
  318 + inline bool is_bom (octet_iterator it)
  319 + {
  320 + return (
  321 + (utf8::internal::mask8(*it++)) == bom[0] &&
  322 + (utf8::internal::mask8(*it++)) == bom[1] &&
  323 + (utf8::internal::mask8(*it)) == bom[2]
  324 + );
  325 + }
  326 +} // namespace utf8
  327 +
  328 +#endif // header guard
  329 +
  330 +
... ...
morfeusz/encoding/utf8/unchecked.h 0 → 100644
  1 +// Copyright 2006 Nemanja Trifunovic
  2 +
  3 +/*
  4 +Permission is hereby granted, free of charge, to any person or organization
  5 +obtaining a copy of the software and accompanying documentation covered by
  6 +this license (the "Software") to use, reproduce, display, distribute,
  7 +execute, and transmit the Software, and to prepare derivative works of the
  8 +Software, and to permit third-parties to whom the Software is furnished to
  9 +do so, all subject to the following:
  10 +
  11 +The copyright notices in the Software and this entire statement, including
  12 +the above license grant, this restriction and the following disclaimer,
  13 +must be included in all copies of the Software, in whole or in part, and
  14 +all derivative works of the Software, unless such copies or derivative
  15 +works are solely in the form of machine-executable object code generated by
  16 +a source language processor.
  17 +
  18 +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  19 +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  20 +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
  21 +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
  22 +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
  23 +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  24 +DEALINGS IN THE SOFTWARE.
  25 +*/
  26 +
  27 +
  28 +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  29 +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731
  30 +
  31 +#include "core.h"
  32 +
  33 +namespace utf8
  34 +{
  35 + namespace unchecked
  36 + {
  37 + template <typename octet_iterator>
  38 + octet_iterator append(uint32_t cp, octet_iterator result)
  39 + {
  40 + if (cp < 0x80) // one octet
  41 + *(result++) = static_cast<uint8_t>(cp);
  42 + else if (cp < 0x800) { // two octets
  43 + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0);
  44 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  45 + }
  46 + else if (cp < 0x10000) { // three octets
  47 + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0);
  48 + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
  49 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  50 + }
  51 + else { // four octets
  52 + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0);
  53 + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80);
  54 + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80);
  55 + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80);
  56 + }
  57 + return result;
  58 + }
  59 +
  60 + template <typename octet_iterator>
  61 + uint32_t next(octet_iterator& it)
  62 + {
  63 + uint32_t cp = utf8::internal::mask8(*it);
  64 + typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it);
  65 + switch (length) {
  66 + case 1:
  67 + break;
  68 + case 2:
  69 + it++;
  70 + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f);
  71 + break;
  72 + case 3:
  73 + ++it;
  74 + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);
  75 + ++it;
  76 + cp += (*it) & 0x3f;
  77 + break;
  78 + case 4:
  79 + ++it;
  80 + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);
  81 + ++it;
  82 + cp += (utf8::internal::mask8(*it) << 6) & 0xfff;
  83 + ++it;
  84 + cp += (*it) & 0x3f;
  85 + break;
  86 + }
  87 + ++it;
  88 + return cp;
  89 + }
  90 +
  91 + template <typename octet_iterator>
  92 + uint32_t peek_next(octet_iterator it)
  93 + {
  94 + return utf8::unchecked::next(it);
  95 + }
  96 +
  97 + template <typename octet_iterator>
  98 + uint32_t prior(octet_iterator& it)
  99 + {
  100 + while (utf8::internal::is_trail(*(--it))) ;
  101 + octet_iterator temp = it;
  102 + return utf8::unchecked::next(temp);
  103 + }
  104 +
  105 + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous)
  106 + template <typename octet_iterator>
  107 + inline uint32_t previous(octet_iterator& it)
  108 + {
  109 + return utf8::unchecked::prior(it);
  110 + }
  111 +
  112 + template <typename octet_iterator, typename distance_type>
  113 + void advance (octet_iterator& it, distance_type n)
  114 + {
  115 + for (distance_type i = 0; i < n; ++i)
  116 + utf8::unchecked::next(it);
  117 + }
  118 +
  119 + template <typename octet_iterator>
  120 + typename std::iterator_traits<octet_iterator>::difference_type
  121 + distance (octet_iterator first, octet_iterator last)
  122 + {
  123 + typename std::iterator_traits<octet_iterator>::difference_type dist;
  124 + for (dist = 0; first < last; ++dist)
  125 + utf8::unchecked::next(first);
  126 + return dist;
  127 + }
  128 +
  129 + template <typename u16bit_iterator, typename octet_iterator>
  130 + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result)
  131 + {
  132 + while (start != end) {
  133 + uint32_t cp = utf8::internal::mask16(*start++);
  134 + // Take care of surrogate pairs first
  135 + if (utf8::internal::is_lead_surrogate(cp)) {
  136 + uint32_t trail_surrogate = utf8::internal::mask16(*start++);
  137 + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET;
  138 + }
  139 + result = utf8::unchecked::append(cp, result);
  140 + }
  141 + return result;
  142 + }
  143 +
  144 + template <typename u16bit_iterator, typename octet_iterator>
  145 + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result)
  146 + {
  147 + while (start < end) {
  148 + uint32_t cp = utf8::unchecked::next(start);
  149 + if (cp > 0xffff) { //make a surrogate pair
  150 + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET);
  151 + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN);
  152 + }
  153 + else
  154 + *result++ = static_cast<uint16_t>(cp);
  155 + }
  156 + return result;
  157 + }
  158 +
  159 + template <typename octet_iterator, typename u32bit_iterator>
  160 + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result)
  161 + {
  162 + while (start != end)
  163 + result = utf8::unchecked::append(*(start++), result);
  164 +
  165 + return result;
  166 + }
  167 +
  168 + template <typename octet_iterator, typename u32bit_iterator>
  169 + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result)
  170 + {
  171 + while (start < end)
  172 + (*result++) = utf8::unchecked::next(start);
  173 +
  174 + return result;
  175 + }
  176 +
  177 + // The iterator class
  178 + template <typename octet_iterator>
  179 + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> {
  180 + octet_iterator it;
  181 + public:
  182 + iterator () {}
  183 + explicit iterator (const octet_iterator& octet_it): it(octet_it) {}
  184 + // the default "big three" are OK
  185 + octet_iterator base () const { return it; }
  186 + uint32_t operator * () const
  187 + {
  188 + octet_iterator temp = it;
  189 + return utf8::unchecked::next(temp);
  190 + }
  191 + bool operator == (const iterator& rhs) const
  192 + {
  193 + return (it == rhs.it);
  194 + }
  195 + bool operator != (const iterator& rhs) const
  196 + {
  197 + return !(operator == (rhs));
  198 + }
  199 + iterator& operator ++ ()
  200 + {
  201 + ::std::advance(it, utf8::internal::sequence_length(it));
  202 + return *this;
  203 + }
  204 + iterator operator ++ (int)
  205 + {
  206 + iterator temp = *this;
  207 + ::std::advance(it, utf8::internal::sequence_length(it));
  208 + return temp;
  209 + }
  210 + iterator& operator -- ()
  211 + {
  212 + utf8::unchecked::prior(it);
  213 + return *this;
  214 + }
  215 + iterator operator -- (int)
  216 + {
  217 + iterator temp = *this;
  218 + utf8::unchecked::prior(it);
  219 + return temp;
  220 + }
  221 + }; // class iterator
  222 +
  223 + } // namespace utf8::unchecked
  224 +} // namespace utf8
  225 +
  226 +
  227 +#endif // header guard
  228 +
... ...
morfeusz/test_morfeusz.cpp 0 → 100644
  1 +/*
  2 + * File: test_morfeusz.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 14 listopad 2013, 15:50
  6 + */
  7 +
  #include <cstdio>
  #include <cstdlib>
  #include <fstream>
  #include <iostream>
  #include <sstream>
  #include <string>

  #include "Morfeusz.hpp"
  14 +
  15 +using namespace std;
  16 +
  17 +void debug(const MorphInterpretation& interp) {
  18 + fprintf(stderr,
  19 + "%d %d %s %s %s %s\n",
  20 + interp.getStartNode(), interp.getEndNode(),
  21 + interp.getOrth(), interp.getLemma(),
  22 + interp.getTag(), interp.getName());
  23 +}
  24 +
  25 +void doTest(
  26 + const Morfeusz& morfeusz,
  27 + const string& inputFilename) {
  28 + ifstream ifs;
  29 + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  30 + ifs.open(inputFilename, ios::binary);
  31 + string line;
  32 + while (getline(ifs, line)) {
  33 + AnalyzeResult res(morfeusz.analyze(line));
  34 + while (res.iterator != res.end) {
  35 + debug(*res);
  36 + res++;
  37 + }
  38 + }
  39 + validate(ifs.eof(), "Failed to read the input file to the end");
  40 +}
  41 +
  42 +int main(int argc, char** argv) {
  43 + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and input filename.");
  44 + string fsaFilename = argv[1];
  45 + string inputFilename = argv[2];
  46 + Morfeusz morfeusz(fsaFilename);
  47 + doTest(morfeusz, inputFilename);
  48 + return 0;
  49 +}
  50 +
... ...
morfeusz/test_morph.cpp
... ... @@ -17,21 +17,6 @@
17 17  
18 18 using namespace std;
19 19  
20   -void debug(const string& key, const vector<EncodedInterpretation> value) {
21   - cerr << key << endl;
22   - for (EncodedInterpretation i: value) {
23   - cerr << "suffix to cut: " << i.lemma.suffixToCut << endl;
24   - cerr << "suffix to add: " << i.lemma.suffixToAdd << endl;
25   - cerr << "tag: " << i.tag << endl;
26   - cerr << "name: " << i.nameClassifier << endl;
27   - }
28   - cerr << "==================" << endl;
29   -}
30   -
31   -//void debug(const string& key, const TaggedInterpretation& value) {
32   -// cerr << key << '\t' << value.toString() << endl;
33   -//}
34   -
35 20 void doTest(
36 21 const FSA<vector<EncodedInterpretation>>& fsa,
37 22 const Tagset& tagset,
... ...
nbproject/configurations.xml
... ... @@ -8,11 +8,13 @@
8 8 <in>test_speed.cpp</in>
9 9 </df>
10 10 <df root="morfeusz" name="1">
  11 + <df name="encoding">
  12 + <in>CharsetConverter.cpp</in>
  13 + <in>CharsetConverter.hpp</in>
  14 + </df>
11 15 <in>Morfeusz.cpp</in>
12   - <in>Morfeusz.hpp</in>
13 16 <in>MorphDeserializer.cpp</in>
14 17 <in>MorphInterpretation.cpp</in>
15   - <in>MorphInterpretation.hpp</in>
16 18 <in>Tagset.cpp</in>
17 19 <in>main.cpp</in>
18 20 <in>morfeusz.cpp</in>
... ... @@ -49,19 +51,11 @@
49 51 <executablePath>build/fsa/test_dict</executablePath>
50 52 </makeTool>
51 53 </makefileType>
52   - <folder path="1">
  54 + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4">
53 55 <ccTool>
54 56 <incDir>
55 57 <pElem>fsa</pElem>
56   - <pElem>build/morfeusz</pElem>
57   - </incDir>
58   - </ccTool>
59   - </folder>
60   - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8">
61   - <ccTool>
62   - <incDir>
63   - <pElem>fsa</pElem>
64   - <pElem>build/morfeusz</pElem>
  58 + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
65 59 </incDir>
66 60 </ccTool>
67 61 </item>
... ... @@ -86,32 +80,86 @@
86 80 </incDir>
87 81 </ccTool>
88 82 </item>
89   - <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="0">
90   - </item>
91   - <item path="morfeusz/Morfeusz.hpp" ex="false" tool="3" flavor2="0">
  83 + <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8">
  84 + <ccTool>
  85 + <incDir>
  86 + <pElem>fsa</pElem>
  87 + <pElem>build/morfeusz</pElem>
  88 + </incDir>
  89 + </ccTool>
92 90 </item>
93 91 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
94 92 <ccTool>
  93 + <incDir>
  94 + <pElem>fsa</pElem>
  95 + <pElem>build/morfeusz</pElem>
  96 + </incDir>
95 97 </ccTool>
96 98 </item>
97   - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="0">
98   - </item>
99   - <item path="morfeusz/MorphInterpretation.hpp" ex="false" tool="3" flavor2="0">
  99 + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4">
  100 + <ccTool>
  101 + <incDir>
  102 + <pElem>morfeusz</pElem>
  103 + <pElem>/usr/include/c++/4.8/bits</pElem>
  104 + <pElem>/usr/include/c++/4.8/ext</pElem>
  105 + <pElem>/usr/include/c++/4.8</pElem>
  106 + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>
  107 + <pElem>/usr/include/c++/4.8/debug</pElem>
  108 + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
  109 + <pElem>/usr/include/c++/4.8/backward</pElem>
  110 + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>
  111 + <pElem>build/morfeusz</pElem>
  112 + </incDir>
  113 + </ccTool>
100 114 </item>
101   - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
  115 + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4">
102 116 <ccTool>
  117 + <incDir>
  118 + <pElem>morfeusz</pElem>
  119 + <pElem>/usr/include/c++/4.8/bits</pElem>
  120 + <pElem>/usr/include/c++/4.8/ext</pElem>
  121 + <pElem>/usr/include/c++/4.8</pElem>
  122 + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem>
  123 + <pElem>/usr/include/c++/4.8/debug</pElem>
  124 + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem>
  125 + <pElem>fsa</pElem>
  126 + <pElem>/usr/include/c++/4.8/backward</pElem>
  127 + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem>
  128 + <pElem>build/morfeusz</pElem>
  129 + </incDir>
103 130 </ccTool>
104 131 </item>
  132 + <item path="morfeusz/encoding/CharsetConverter.cpp"
  133 + ex="false"
  134 + tool="1"
  135 + flavor2="0">
  136 + </item>
  137 + <item path="morfeusz/encoding/CharsetConverter.hpp"
  138 + ex="false"
  139 + tool="3"
  140 + flavor2="0">
  141 + </item>
105 142 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
106 143 <ccTool>
  144 + <incDir>
  145 + <pElem>fsa</pElem>
  146 + <pElem>build/morfeusz</pElem>
  147 + </incDir>
107 148 </ccTool>
108 149 </item>
109 150 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
110 151 <ccTool>
  152 + <incDir>
  153 + <pElem>morfeusz</pElem>
  154 + </incDir>
111 155 </ccTool>
112 156 </item>
113 157 <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
114 158 <ccTool>
  159 + <incDir>
  160 + <pElem>fsa</pElem>
  161 + <pElem>build/morfeusz</pElem>
  162 + </incDir>
115 163 </ccTool>
116 164 </item>
117 165 </conf>
... ...
nbproject/project.xml
... ... @@ -6,7 +6,7 @@
6 6 <name>morfeusz</name>
7 7 <c-extensions/>
8 8 <cpp-extensions>cpp</cpp-extensions>
9   - <header-extensions>hpp</header-extensions>
  9 + <header-extensions>h,hpp</header-extensions>
10 10 <sourceEncoding>UTF-8</sourceEncoding>
11 11 <make-dep-projects/>
12 12 <sourceRootList>
... ...