Commit f23aead2806888d01857b660722db3c3e372201e
1 parent
58aafafe
- dalsza praca nad klasą Morfeusz
- dodanie konwersji kodowań znaków (na razie tylko głupi szkielet, który obsługuje UTF-8)

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@19 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 18 changed files with 1171 additions and 48 deletions
fsa/fsa.hpp
@@ -15,6 +15,7 @@ | @@ -15,6 +15,7 @@ | ||
15 | #include <exception> | 15 | #include <exception> |
16 | #include <string> | 16 | #include <string> |
17 | #include <vector> | 17 | #include <vector> |
18 | +#include <netinet/in.h> | ||
18 | 19 | ||
19 | template <class T> class State; | 20 | template <class T> class State; |
20 | template <class T> class FSA; | 21 | template <class T> class FSA; |
@@ -83,6 +84,11 @@ public: | @@ -83,6 +84,11 @@ public: | ||
83 | */ | 84 | */ |
84 | static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); | 85 | static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); |
85 | 86 | ||
87 | + /** | ||
88 | + * Create an FSA object from file | ||
89 | + */ | ||
90 | + static FSA<T>* getFSA(const std::string& filename, const Deserializer<T>& deserializer); | ||
91 | + | ||
86 | protected: | 92 | protected: |
87 | 93 | ||
88 | /** | 94 | /** |
fsa/fsa_impl.hpp
@@ -13,6 +13,7 @@ | @@ -13,6 +13,7 @@ | ||
13 | #include <utility> | 13 | #include <utility> |
14 | #include <iostream> | 14 | #include <iostream> |
15 | #include <vector> | 15 | #include <vector> |
16 | +#include <string> | ||
16 | #include <netinet/in.h> | 17 | #include <netinet/in.h> |
17 | #include "utils.hpp" | 18 | #include "utils.hpp" |
18 | #include "const.hpp" | 19 | #include "const.hpp" |
@@ -55,6 +56,11 @@ State<T> FSA<T>::getInitialState() const { | @@ -55,6 +56,11 @@ State<T> FSA<T>::getInitialState() const { | ||
55 | } | 56 | } |
56 | 57 | ||
57 | template <class T> | 58 | template <class T> |
59 | +FSA<T>* FSA<T>::getFSA(const std::string& filename, const Deserializer<T>& deserializer) { | ||
60 | + return getFSA(readFile(filename.c_str()), deserializer); | ||
61 | +} | ||
62 | + | ||
63 | +template <class T> | ||
58 | FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) { | 64 | FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) { |
59 | 65 | ||
60 | uint32_t magicNumber = ntohl(*((uint32_t*) ptr)); | 66 | uint32_t magicNumber = ntohl(*((uint32_t*) ptr)); |
@@ -64,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | @@ -64,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | ||
64 | 70 | ||
65 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); | 71 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); |
66 | if (versionNum != VERSION_NUM) { | 72 | if (versionNum != VERSION_NUM) { |
67 | - throw FSAException(string("Invalid version number: ") + to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); | 73 | + throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); |
68 | } | 74 | } |
69 | 75 | ||
70 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); | 76 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
morfeusz/CMakeLists.txt
@@ -7,9 +7,11 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) | @@ -7,9 +7,11 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) | ||
7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
8 | add_executable (morfeusz2_analyze main.cpp) | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) | 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp) | ||
10 | 11 | ||
11 | # Link the executable to the Hello library. | 12 | # Link the executable to the Hello library. |
12 | target_link_libraries (morfeusz2_analyze morfeusz2) | 13 | target_link_libraries (morfeusz2_analyze morfeusz2) |
13 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) | 14 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) |
14 | 15 | ||
15 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | 16 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
17 | +set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | ||
16 | \ No newline at end of file | 18 | \ No newline at end of file |
morfeusz/EncodedInterpretation.hpp
@@ -15,6 +15,9 @@ | @@ -15,6 +15,9 @@ | ||
15 | 15 | ||
16 | using namespace std; | 16 | using namespace std; |
17 | 17 | ||
18 | +/* | ||
19 | + * Lemma in a compressed format (as in an automaton) | ||
20 | + */ | ||
18 | struct EncodedLemma { | 21 | struct EncodedLemma { |
19 | int suffixToCut; | 22 | int suffixToCut; |
20 | string suffixToAdd; | 23 | string suffixToAdd; |
morfeusz/Morfeusz.cpp
1 | /* | 1 | /* |
2 | * File: Morfeusz.cpp | 2 | * File: Morfeusz.cpp |
3 | - * Author: lennyn | 3 | + * Author: mlenart |
4 | * | 4 | * |
5 | * Created on November 13, 2013, 5:21 PM | 5 | * Created on November 13, 2013, 5:21 PM |
6 | */ | 6 | */ |
7 | 7 | ||
8 | +#include <string> | ||
9 | +#include "utils.hpp" | ||
8 | #include "Morfeusz.hpp" | 10 | #include "Morfeusz.hpp" |
11 | +#include "MorphDeserializer.hpp" | ||
12 | +#include "encoding/CharsetConverter.hpp" | ||
9 | 13 | ||
10 | -Morfeusz::Morfeusz() { | 14 | +using namespace std; |
15 | + | ||
16 | +static FSA<vector<EncodedInterpretation>>* initializeFSA(const string& filename) { | ||
17 | + static Deserializer<vector<EncodedInterpretation>>* deserializer | ||
18 | + = new MorphDeserializer(); | ||
19 | + return FSA<vector<EncodedInterpretation>>::getFSA(filename, *deserializer); | ||
20 | +} | ||
21 | + | ||
22 | +static CharsetConverter* initializeCharsetConverter() { | ||
23 | + static CharsetConverter* converter = new UTF8CharsetConverter(); | ||
24 | + return converter; | ||
11 | } | 25 | } |
12 | 26 | ||
13 | -Morfeusz::Morfeusz(const Morfeusz& orig) { | 27 | +Morfeusz::Morfeusz(const string& filename) |
28 | +: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { | ||
29 | + | ||
14 | } | 30 | } |
15 | 31 | ||
32 | +//Morfeusz::Morfeusz(const Morfeusz& orig) { | ||
33 | +//} | ||
34 | + | ||
16 | Morfeusz::~Morfeusz() { | 35 | Morfeusz::~Morfeusz() { |
36 | + delete &this->fsa; | ||
37 | +} | ||
38 | + | ||
39 | +AnalyzeResult Morfeusz::analyze(const std::string& text) { | ||
40 | + const char* textStart = text.c_str(); | ||
41 | + const char* textEnd = text.c_str() + text.length(); | ||
42 | + AnalyzeResult res = { | ||
43 | + ResultsIterator(textStart, textEnd, *this), | ||
44 | + ResultsIterator(textEnd, textEnd, *this)}; | ||
45 | + return res; | ||
17 | } | 46 | } |
18 | 47 |
morfeusz/Morfeusz.hpp
@@ -9,8 +9,11 @@ | @@ -9,8 +9,11 @@ | ||
9 | #define MORFEUSZ_HPP | 9 | #define MORFEUSZ_HPP |
10 | 10 | ||
11 | #include <string> | 11 | #include <string> |
12 | +#include <vector> | ||
13 | +#include "EncodedInterpretation.hpp" | ||
14 | +#include "fsa.hpp" | ||
12 | #include "MorphInterpretation.hpp" | 15 | #include "MorphInterpretation.hpp" |
13 | -//#include "interpretations.hpp" | 16 | +#include "encoding/CharsetConverter.hpp" |
14 | 17 | ||
15 | class Morfeusz; | 18 | class Morfeusz; |
16 | class AnalyzeResult; | 19 | class AnalyzeResult; |
@@ -18,22 +21,21 @@ class ResultsIterator; | @@ -18,22 +21,21 @@ class ResultsIterator; | ||
18 | 21 | ||
19 | class Morfeusz { | 22 | class Morfeusz { |
20 | public: | 23 | public: |
21 | - | 24 | + explicit Morfeusz(const std::string& filename); |
25 | + virtual ~Morfeusz(); | ||
26 | +// Morfeusz(const Morfeusz& orig); | ||
22 | AnalyzeResult analyze(const std::string& text); | 27 | AnalyzeResult analyze(const std::string& text); |
23 | 28 | ||
24 | - Morfeusz(); | ||
25 | - Morfeusz(const Morfeusz& orig); | ||
26 | - virtual ~Morfeusz(); | 29 | +// Morfeusz(); |
27 | private: | 30 | private: |
28 | - void processOneWord(const char*& inputData, int startNodeNum, vector<MorphInterpretation>& resInterps); | 31 | + void processOneWord(const char*& inputData, int startNodeNum, std::vector<MorphInterpretation>& resInterps); |
32 | + const FSA<std::vector<EncodedInterpretation>>* fsa; | ||
33 | + CharsetConverter* charsetConverter; | ||
29 | }; | 34 | }; |
30 | 35 | ||
31 | class ResultsIterator { | 36 | class ResultsIterator { |
32 | public: | 37 | public: |
33 | ResultsIterator( | 38 | ResultsIterator( |
34 | - const std::string& text, | ||
35 | - const Morfeusz& morfeusz); | ||
36 | - ResultsIterator( | ||
37 | const char* startOfInput, | 39 | const char* startOfInput, |
38 | const char* endOfInput, | 40 | const char* endOfInput, |
39 | const Morfeusz& morfeusz); | 41 | const Morfeusz& morfeusz); |
morfeusz/MorphInterpretation.cpp
@@ -42,6 +42,14 @@ MorphInterpretation::MorphInterpretation( | @@ -42,6 +42,14 @@ MorphInterpretation::MorphInterpretation( | ||
42 | MorphInterpretation::~MorphInterpretation() { | 42 | MorphInterpretation::~MorphInterpretation() { |
43 | } | 43 | } |
44 | 44 | ||
45 | +int MorphInterpretation::getStartNode() const { | ||
46 | + return this->startNode; | ||
47 | +} | ||
48 | + | ||
49 | +int MorphInterpretation::getEndNode() const { | ||
50 | + return this->endNode; | ||
51 | +} | ||
52 | + | ||
45 | const std::string& MorphInterpretation::getOrth() const { | 53 | const std::string& MorphInterpretation::getOrth() const { |
46 | return this->orth; | 54 | return this->orth; |
47 | } | 55 | } |
morfeusz/MorphInterpretation.hpp
@@ -21,6 +21,8 @@ public: | @@ -21,6 +21,8 @@ public: | ||
21 | const EncodedInterpretation& encodedInterp, | 21 | const EncodedInterpretation& encodedInterp, |
22 | const Tagset& tagset); | 22 | const Tagset& tagset); |
23 | virtual ~MorphInterpretation(); | 23 | virtual ~MorphInterpretation(); |
24 | + int getStartNode() const; | ||
25 | + int getEndNode() const; | ||
24 | const std::string& getOrth() const; | 26 | const std::string& getOrth() const; |
25 | const std::string& getLemma() const; | 27 | const std::string& getLemma() const; |
26 | int getTagnum() const; | 28 | int getTagnum() const; |
morfeusz/encoding/CharsetConverter.cpp
0 → 100644
1 | +/* | ||
2 | + * File: EncodingConverter.cpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 14 listopad 2013, 17:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#include "utf8.h" | ||
9 | +#include "CharsetConverter.hpp" | ||
10 | + | ||
11 | +uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { | ||
12 | + return utf8::next(it, end); | ||
13 | +} | ||
14 | +const char* UTF8CharsetConverter::append(uint32_t cp, const char* result) const { | ||
15 | + return utf8::append(cp, result); | ||
16 | +} |
morfeusz/encoding/CharsetConverter.hpp
0 → 100644
1 | +/* | ||
2 | + * File: EncodingConverter.hpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 14 listopad 2013, 17:28 | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef ENCODINGCONVERTER_HPP | ||
9 | +#define ENCODINGCONVERTER_HPP | ||
10 | + | ||
11 | +class CharsetConverter { | ||
12 | +public: | ||
13 | + virtual uint32_t next(const char*& it, const char* end) const = 0; | ||
14 | + virtual const char* append(uint32_t cp, const char* result) const = 0; | ||
15 | +private: | ||
16 | +}; | ||
17 | + | ||
18 | +class UTF8CharsetConverter: public CharsetConverter { | ||
19 | +public: | ||
20 | + uint32_t next(const char*& it, const char* end) const; | ||
21 | + const char* append(uint32_t cp, const char* result) const; | ||
22 | +private: | ||
23 | +}; | ||
24 | + | ||
25 | +class UTF16CharsetConverter: public CharsetConverter { | ||
26 | +public: | ||
27 | + uint32_t next(const char*& it, const char* end) const; | ||
28 | + const char* append(uint32_t cp, const char* result) const; | ||
29 | +private: | ||
30 | +}; | ||
31 | + | ||
32 | +class UTF32CharsetConverter: public CharsetConverter { | ||
33 | +public: | ||
34 | + uint32_t next(const char*& it, const char* end) const; | ||
35 | + const char* append(uint32_t cp, const char* result) const; | ||
36 | +private: | ||
37 | +}; | ||
38 | + | ||
39 | +class ISO8859_2_CharsetConverter: public CharsetConverter { | ||
40 | +public: | ||
41 | + uint32_t next(const char*& it, const char* end) const; | ||
42 | + const char* append(uint32_t cp, const char* result) const; | ||
43 | +private: | ||
44 | +}; | ||
45 | + | ||
46 | +#endif /* ENCODINGCONVERTER_HPP */ | ||
47 | + |
morfeusz/encoding/utf8.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | ||
2 | + | ||
3 | +/* | ||
4 | +Permission is hereby granted, free of charge, to any person or organization | ||
5 | +obtaining a copy of the software and accompanying documentation covered by | ||
6 | +this license (the "Software") to use, reproduce, display, distribute, | ||
7 | +execute, and transmit the Software, and to prepare derivative works of the | ||
8 | +Software, and to permit third-parties to whom the Software is furnished to | ||
9 | +do so, all subject to the following: | ||
10 | + | ||
11 | +The copyright notices in the Software and this entire statement, including | ||
12 | +the above license grant, this restriction and the following disclaimer, | ||
13 | +must be included in all copies of the Software, in whole or in part, and | ||
14 | +all derivative works of the Software, unless such copies or derivative | ||
15 | +works are solely in the form of machine-executable object code generated by | ||
16 | +a source language processor. | ||
17 | + | ||
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
24 | +DEALINGS IN THE SOFTWARE. | ||
25 | +*/ | ||
26 | + | ||
27 | + | ||
28 | +#ifndef UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
29 | +#define UTF8_FOR_CPP_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
30 | + | ||
31 | +#include "utf8/checked.h" | ||
32 | +#include "utf8/unchecked.h" | ||
33 | + | ||
34 | +#endif // header guard |
morfeusz/encoding/utf8/checked.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | ||
2 | + | ||
3 | +/* | ||
4 | +Permission is hereby granted, free of charge, to any person or organization | ||
5 | +obtaining a copy of the software and accompanying documentation covered by | ||
6 | +this license (the "Software") to use, reproduce, display, distribute, | ||
7 | +execute, and transmit the Software, and to prepare derivative works of the | ||
8 | +Software, and to permit third-parties to whom the Software is furnished to | ||
9 | +do so, all subject to the following: | ||
10 | + | ||
11 | +The copyright notices in the Software and this entire statement, including | ||
12 | +the above license grant, this restriction and the following disclaimer, | ||
13 | +must be included in all copies of the Software, in whole or in part, and | ||
14 | +all derivative works of the Software, unless such copies or derivative | ||
15 | +works are solely in the form of machine-executable object code generated by | ||
16 | +a source language processor. | ||
17 | + | ||
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
24 | +DEALINGS IN THE SOFTWARE. | ||
25 | +*/ | ||
26 | + | ||
27 | + | ||
28 | +#ifndef UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
29 | +#define UTF8_FOR_CPP_CHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
30 | + | ||
31 | +#include "core.h" | ||
32 | +#include <stdexcept> | ||
33 | + | ||
34 | +namespace utf8 | ||
35 | +{ | ||
36 | + // Base for the exceptions that may be thrown from the library | ||
37 | + class exception : public ::std::exception { | ||
38 | + }; | ||
39 | + | ||
40 | + // Exceptions that may be thrown from the library functions. | ||
41 | + class invalid_code_point : public exception { | ||
42 | + uint32_t cp; | ||
43 | + public: | ||
44 | + invalid_code_point(uint32_t cp) : cp(cp) {} | ||
45 | + virtual const char* what() const throw() { return "Invalid code point"; } | ||
46 | + uint32_t code_point() const {return cp;} | ||
47 | + }; | ||
48 | + | ||
49 | + class invalid_utf8 : public exception { | ||
50 | + uint8_t u8; | ||
51 | + public: | ||
52 | + invalid_utf8 (uint8_t u) : u8(u) {} | ||
53 | + virtual const char* what() const throw() { return "Invalid UTF-8"; } | ||
54 | + uint8_t utf8_octet() const {return u8;} | ||
55 | + }; | ||
56 | + | ||
57 | + class invalid_utf16 : public exception { | ||
58 | + uint16_t u16; | ||
59 | + public: | ||
60 | + invalid_utf16 (uint16_t u) : u16(u) {} | ||
61 | + virtual const char* what() const throw() { return "Invalid UTF-16"; } | ||
62 | + uint16_t utf16_word() const {return u16;} | ||
63 | + }; | ||
64 | + | ||
65 | + class not_enough_room : public exception { | ||
66 | + public: | ||
67 | + virtual const char* what() const throw() { return "Not enough space"; } | ||
68 | + }; | ||
69 | + | ||
70 | + /// The library API - functions intended to be called by the users | ||
71 | + | ||
72 | + template <typename octet_iterator> | ||
73 | + octet_iterator append(uint32_t cp, octet_iterator result) | ||
74 | + { | ||
75 | + if (!utf8::internal::is_code_point_valid(cp)) | ||
76 | + throw invalid_code_point(cp); | ||
77 | + | ||
78 | + if (cp < 0x80) // one octet | ||
79 | + *(result++) = static_cast<uint8_t>(cp); | ||
80 | + else if (cp < 0x800) { // two octets | ||
81 | + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); | ||
82 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
83 | + } | ||
84 | + else if (cp < 0x10000) { // three octets | ||
85 | + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); | ||
86 | + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); | ||
87 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
88 | + } | ||
89 | + else { // four octets | ||
90 | + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); | ||
91 | + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f) | 0x80); | ||
92 | + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); | ||
93 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
94 | + } | ||
95 | + return result; | ||
96 | + } | ||
97 | + | ||
98 | + template <typename octet_iterator, typename output_iterator> | ||
99 | + output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out, uint32_t replacement) | ||
100 | + { | ||
101 | + while (start != end) { | ||
102 | + octet_iterator sequence_start = start; | ||
103 | + internal::utf_error err_code = utf8::internal::validate_next(start, end); | ||
104 | + switch (err_code) { | ||
105 | + case internal::UTF8_OK : | ||
106 | + for (octet_iterator it = sequence_start; it != start; ++it) | ||
107 | + *out++ = *it; | ||
108 | + break; | ||
109 | + case internal::NOT_ENOUGH_ROOM: | ||
110 | + throw not_enough_room(); | ||
111 | + case internal::INVALID_LEAD: | ||
112 | + out = utf8::append (replacement, out); | ||
113 | + ++start; | ||
114 | + break; | ||
115 | + case internal::INCOMPLETE_SEQUENCE: | ||
116 | + case internal::OVERLONG_SEQUENCE: | ||
117 | + case internal::INVALID_CODE_POINT: | ||
118 | + out = utf8::append (replacement, out); | ||
119 | + ++start; | ||
120 | + // just one replacement mark for the sequence | ||
121 | + while (start != end && utf8::internal::is_trail(*start)) | ||
122 | + ++start; | ||
123 | + break; | ||
124 | + } | ||
125 | + } | ||
126 | + return out; | ||
127 | + } | ||
128 | + | ||
129 | + template <typename octet_iterator, typename output_iterator> | ||
130 | + inline output_iterator replace_invalid(octet_iterator start, octet_iterator end, output_iterator out) | ||
131 | + { | ||
132 | + static const uint32_t replacement_marker = utf8::internal::mask16(0xfffd); | ||
133 | + return utf8::replace_invalid(start, end, out, replacement_marker); | ||
134 | + } | ||
135 | + | ||
136 | + template <typename octet_iterator> | ||
137 | + uint32_t next(octet_iterator& it, octet_iterator end) | ||
138 | + { | ||
139 | + uint32_t cp = 0; | ||
140 | + internal::utf_error err_code = utf8::internal::validate_next(it, end, cp); | ||
141 | + switch (err_code) { | ||
142 | + case internal::UTF8_OK : | ||
143 | + break; | ||
144 | + case internal::NOT_ENOUGH_ROOM : | ||
145 | + throw not_enough_room(); | ||
146 | + case internal::INVALID_LEAD : | ||
147 | + case internal::INCOMPLETE_SEQUENCE : | ||
148 | + case internal::OVERLONG_SEQUENCE : | ||
149 | + throw invalid_utf8(*it); | ||
150 | + case internal::INVALID_CODE_POINT : | ||
151 | + throw invalid_code_point(cp); | ||
152 | + } | ||
153 | + return cp; | ||
154 | + } | ||
155 | + | ||
156 | + template <typename octet_iterator> | ||
157 | + uint32_t peek_next(octet_iterator it, octet_iterator end) | ||
158 | + { | ||
159 | + return utf8::next(it, end); | ||
160 | + } | ||
161 | + | ||
162 | + template <typename octet_iterator> | ||
163 | + uint32_t prior(octet_iterator& it, octet_iterator start) | ||
164 | + { | ||
165 | + // can't do much if it == start | ||
166 | + if (it == start) | ||
167 | + throw not_enough_room(); | ||
168 | + | ||
169 | + octet_iterator end = it; | ||
170 | + // Go back until we hit either a lead octet or start | ||
171 | + while (utf8::internal::is_trail(*(--it))) | ||
172 | + if (it == start) | ||
173 | + throw invalid_utf8(*it); // error - no lead byte in the sequence | ||
174 | + return utf8::peek_next(it, end); | ||
175 | + } | ||
176 | + | ||
177 | + /// Deprecated in versions that include "prior" | ||
178 | + template <typename octet_iterator> | ||
179 | + uint32_t previous(octet_iterator& it, octet_iterator pass_start) | ||
180 | + { | ||
181 | + octet_iterator end = it; | ||
182 | + while (utf8::internal::is_trail(*(--it))) | ||
183 | + if (it == pass_start) | ||
184 | + throw invalid_utf8(*it); // error - no lead byte in the sequence | ||
185 | + octet_iterator temp = it; | ||
186 | + return utf8::next(temp, end); | ||
187 | + } | ||
188 | + | ||
189 | + template <typename octet_iterator, typename distance_type> | ||
190 | + void advance (octet_iterator& it, distance_type n, octet_iterator end) | ||
191 | + { | ||
192 | + for (distance_type i = 0; i < n; ++i) | ||
193 | + utf8::next(it, end); | ||
194 | + } | ||
195 | + | ||
196 | + template <typename octet_iterator> | ||
197 | + typename std::iterator_traits<octet_iterator>::difference_type | ||
198 | + distance (octet_iterator first, octet_iterator last) | ||
199 | + { | ||
200 | + typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
201 | + for (dist = 0; first < last; ++dist) | ||
202 | + utf8::next(first, last); | ||
203 | + return dist; | ||
204 | + } | ||
205 | + | ||
206 | + template <typename u16bit_iterator, typename octet_iterator> | ||
207 | + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
208 | + { | ||
209 | + while (start != end) { | ||
210 | + uint32_t cp = utf8::internal::mask16(*start++); | ||
211 | + // Take care of surrogate pairs first | ||
212 | + if (utf8::internal::is_lead_surrogate(cp)) { | ||
213 | + if (start != end) { | ||
214 | + uint32_t trail_surrogate = utf8::internal::mask16(*start++); | ||
215 | + if (utf8::internal::is_trail_surrogate(trail_surrogate)) | ||
216 | + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
217 | + else | ||
218 | + throw invalid_utf16(static_cast<uint16_t>(trail_surrogate)); | ||
219 | + } | ||
220 | + else | ||
221 | + throw invalid_utf16(static_cast<uint16_t>(cp)); | ||
222 | + | ||
223 | + } | ||
224 | + // Lone trail surrogate | ||
225 | + else if (utf8::internal::is_trail_surrogate(cp)) | ||
226 | + throw invalid_utf16(static_cast<uint16_t>(cp)); | ||
227 | + | ||
228 | + result = utf8::append(cp, result); | ||
229 | + } | ||
230 | + return result; | ||
231 | + } | ||
232 | + | ||
233 | + template <typename u16bit_iterator, typename octet_iterator> | ||
234 | + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
235 | + { | ||
236 | + while (start != end) { | ||
237 | + uint32_t cp = utf8::next(start, end); | ||
238 | + if (cp > 0xffff) { //make a surrogate pair | ||
239 | + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
240 | + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
241 | + } | ||
242 | + else | ||
243 | + *result++ = static_cast<uint16_t>(cp); | ||
244 | + } | ||
245 | + return result; | ||
246 | + } | ||
247 | + | ||
248 | + template <typename octet_iterator, typename u32bit_iterator> | ||
249 | + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
250 | + { | ||
251 | + while (start != end) | ||
252 | + result = utf8::append(*(start++), result); | ||
253 | + | ||
254 | + return result; | ||
255 | + } | ||
256 | + | ||
257 | + template <typename octet_iterator, typename u32bit_iterator> | ||
258 | + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
259 | + { | ||
260 | + while (start != end) | ||
261 | + (*result++) = utf8::next(start, end); | ||
262 | + | ||
263 | + return result; | ||
264 | + } | ||
265 | + | ||
266 | + // The iterator class | ||
267 | + template <typename octet_iterator> | ||
268 | + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { | ||
269 | + octet_iterator it; | ||
270 | + octet_iterator range_start; | ||
271 | + octet_iterator range_end; | ||
272 | + public: | ||
273 | + iterator () {} | ||
274 | + explicit iterator (const octet_iterator& octet_it, | ||
275 | + const octet_iterator& range_start, | ||
276 | + const octet_iterator& range_end) : | ||
277 | + it(octet_it), range_start(range_start), range_end(range_end) | ||
278 | + { | ||
279 | + if (it < range_start || it > range_end) | ||
280 | + throw std::out_of_range("Invalid utf-8 iterator position"); | ||
281 | + } | ||
282 | + // the default "big three" are OK | ||
283 | + octet_iterator base () const { return it; } | ||
284 | + uint32_t operator * () const | ||
285 | + { | ||
286 | + octet_iterator temp = it; | ||
287 | + return utf8::next(temp, range_end); | ||
288 | + } | ||
289 | + bool operator == (const iterator& rhs) const | ||
290 | + { | ||
291 | + if (range_start != rhs.range_start || range_end != rhs.range_end) | ||
292 | + throw std::logic_error("Comparing utf-8 iterators defined with different ranges"); | ||
293 | + return (it == rhs.it); | ||
294 | + } | ||
295 | + bool operator != (const iterator& rhs) const | ||
296 | + { | ||
297 | + return !(operator == (rhs)); | ||
298 | + } | ||
299 | + iterator& operator ++ () | ||
300 | + { | ||
301 | + utf8::next(it, range_end); | ||
302 | + return *this; | ||
303 | + } | ||
304 | + iterator operator ++ (int) | ||
305 | + { | ||
306 | + iterator temp = *this; | ||
307 | + utf8::next(it, range_end); | ||
308 | + return temp; | ||
309 | + } | ||
310 | + iterator& operator -- () | ||
311 | + { | ||
312 | + utf8::prior(it, range_start); | ||
313 | + return *this; | ||
314 | + } | ||
315 | + iterator operator -- (int) | ||
316 | + { | ||
317 | + iterator temp = *this; | ||
318 | + utf8::prior(it, range_start); | ||
319 | + return temp; | ||
320 | + } | ||
321 | + }; // class iterator | ||
322 | + | ||
323 | +} // namespace utf8 | ||
324 | + | ||
325 | +#endif //header guard | ||
326 | + | ||
327 | + |
morfeusz/encoding/utf8/core.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | ||
2 | + | ||
3 | +/* | ||
4 | +Permission is hereby granted, free of charge, to any person or organization | ||
5 | +obtaining a copy of the software and accompanying documentation covered by | ||
6 | +this license (the "Software") to use, reproduce, display, distribute, | ||
7 | +execute, and transmit the Software, and to prepare derivative works of the | ||
8 | +Software, and to permit third-parties to whom the Software is furnished to | ||
9 | +do so, all subject to the following: | ||
10 | + | ||
11 | +The copyright notices in the Software and this entire statement, including | ||
12 | +the above license grant, this restriction and the following disclaimer, | ||
13 | +must be included in all copies of the Software, in whole or in part, and | ||
14 | +all derivative works of the Software, unless such copies or derivative | ||
15 | +works are solely in the form of machine-executable object code generated by | ||
16 | +a source language processor. | ||
17 | + | ||
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
24 | +DEALINGS IN THE SOFTWARE. | ||
25 | +*/ | ||
26 | + | ||
27 | + | ||
28 | +#ifndef UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
29 | +#define UTF8_FOR_CPP_CORE_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
30 | + | ||
31 | +#include <iterator> | ||
32 | +#include <cstdint> | ||
33 | + | ||
namespace utf8
{
    // The fixed-width unsigned integer types (uint8_t, uint16_t, uint32_t) come
    // from <cstdint>. If your toolchain lacks that header, re-enable the
    // fallback typedefs below and adjust them to match your system.
//    typedef unsigned char   uint8_t;
//    typedef unsigned short  uint16_t;
//    typedef unsigned int    uint32_t;

// Helper code - not intended to be directly called by the library users. May be changed at any time
namespace internal
{
    // Unicode constants
    // Leading (high) surrogates: 0xd800 - 0xdbff
    // Trailing (low) surrogates: 0xdc00 - 0xdfff
    const uint16_t LEAD_SURROGATE_MIN  = 0xd800u;
    const uint16_t LEAD_SURROGATE_MAX  = 0xdbffu;
    const uint16_t TRAIL_SURROGATE_MIN = 0xdc00u;
    const uint16_t TRAIL_SURROGATE_MAX = 0xdfffu;
    // Offsets used when converting between UTF-16 surrogate pairs and code points.
    const uint16_t LEAD_OFFSET         = LEAD_SURROGATE_MIN - (0x10000 >> 10);
    const uint32_t SURROGATE_OFFSET    = 0x10000u - (LEAD_SURROGATE_MIN << 10) - TRAIL_SURROGATE_MIN;

    // Maximum valid value for a Unicode code point
    const uint32_t CODE_POINT_MAX      = 0x0010ffffu;

    /// Keeps only the low 8 bits of oc (so a signed char octet becomes 0..255).
    template<typename octet_type>
    inline uint8_t mask8(octet_type oc)
    {
        return static_cast<uint8_t>(0xff & oc);
    }

    /// Keeps only the low 16 bits of oc.
    template<typename u16_type>
    inline uint16_t mask16(u16_type oc)
    {
        return static_cast<uint16_t>(0xffff & oc);
    }

    /// True when oc has the bit pattern 10xxxxxx, i.e. is a UTF-8 trail octet.
    template<typename octet_type>
    inline bool is_trail(octet_type oc)
    {
        return ((utf8::internal::mask8(oc) >> 6) == 0x2);
    }

    /// True when cp lies in the leading (high) surrogate range.
    template <typename u16>
    inline bool is_lead_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= LEAD_SURROGATE_MAX);
    }

    /// True when cp lies in the trailing (low) surrogate range.
    template <typename u16>
    inline bool is_trail_surrogate(u16 cp)
    {
        return (cp >= TRAIL_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp is any surrogate (leading or trailing).
    template <typename u16>
    inline bool is_surrogate(u16 cp)
    {
        return (cp >= LEAD_SURROGATE_MIN && cp <= TRAIL_SURROGATE_MAX);
    }

    /// True when cp does not exceed CODE_POINT_MAX and is not a surrogate.
    template <typename u32>
    inline bool is_code_point_valid(u32 cp)
    {
        return (cp <= CODE_POINT_MAX && !utf8::internal::is_surrogate(cp));
    }

    /// Length (1-4) of the sequence announced by the lead octet *lead_it,
    /// or 0 when that octet cannot start a sequence.
    /// Dereferences lead_it, so the caller must make sure it is dereferenceable.
    template <typename octet_iterator>
    inline typename std::iterator_traits<octet_iterator>::difference_type
    sequence_length(octet_iterator lead_it)
    {
        uint8_t lead = utf8::internal::mask8(*lead_it);
        if (lead < 0x80)
            return 1;
        else if ((lead >> 5) == 0x6)
            return 2;
        else if ((lead >> 4) == 0xe)
            return 3;
        else if ((lead >> 3) == 0x1e)
            return 4;
        else
            return 0;
    }

    /// True when cp was decoded from more octets (length) than the range it
    /// falls into requires.
    template <typename octet_difference_type>
    inline bool is_overlong_sequence(uint32_t cp, octet_difference_type length)
    {
        if (cp < 0x80) {
            if (length != 1)
                return true;
        }
        else if (cp < 0x800) {
            if (length != 2)
                return true;
        }
        else if (cp < 0x10000) {
            if (length != 3)
                return true;
        }

        return false;
    }

    enum utf_error {UTF8_OK, NOT_ENOUGH_ROOM, INVALID_LEAD, INCOMPLETE_SEQUENCE, OVERLONG_SEQUENCE, INVALID_CODE_POINT};

    /// Helper for get_sequence_x: advances it by one octet and checks that the
    /// new position is inside the range and holds a trail octet.
    template <typename octet_iterator>
    utf_error increase_safely(octet_iterator& it, octet_iterator end)
    {
        if (++it == end)
            return NOT_ENOUGH_ROOM;

        if (!utf8::internal::is_trail(*it))
            return INCOMPLETE_SEQUENCE;

        return UTF8_OK;
    }

    #define UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(IT, END) {utf_error ret = increase_safely(IT, END); if (ret != UTF8_OK) return ret;}

    /// get_sequence_x functions decode utf-8 sequences of the length x.
    /// On success `it` is left on the LAST octet of the sequence (not past it).
    template <typename octet_iterator>
    utf_error get_sequence_1(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_2(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 6) & 0x7ff) + ((*it) & 0x3f);

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_3(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    template <typename octet_iterator>
    utf_error get_sequence_4(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        if (it == end)
            return NOT_ENOUGH_ROOM;

        code_point = utf8::internal::mask8(*it);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point = ((code_point << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff);

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (utf8::internal::mask8(*it) << 6) & 0xfff;

        UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR(it, end)

        code_point += (*it) & 0x3f;

        return UTF8_OK;
    }

    #undef UTF8_CPP_INCREASE_AND_RETURN_ON_ERROR

    /// Decodes and validates the sequence starting at `it`.
    /// On success stores the code point in code_point, moves `it` past the
    /// sequence and returns UTF8_OK. On failure returns the error, leaves
    /// code_point untouched and restores `it` to where it started.
    template <typename octet_iterator>
    utf_error validate_next(octet_iterator& it, octet_iterator end, uint32_t& code_point)
    {
        // An empty range must be rejected before sequence_length() reads the
        // lead octet; otherwise we would dereference the end iterator.
        if (it == end)
            return NOT_ENOUGH_ROOM;

        // Save the original value of it so we can go back in case of failure
        // Of course, it does not make much sense with i.e. stream iterators
        octet_iterator original_it = it;

        uint32_t cp = 0;
        // Determine the sequence length based on the lead octet
        typedef typename std::iterator_traits<octet_iterator>::difference_type octet_difference_type;
        const octet_difference_type length = utf8::internal::sequence_length(it);

        // Get trail octets and calculate the code point
        utf_error err = UTF8_OK;
        switch (length) {
            case 0:
                return INVALID_LEAD;
            case 1:
                err = utf8::internal::get_sequence_1(it, end, cp);
                break;
            case 2:
                err = utf8::internal::get_sequence_2(it, end, cp);
                break;
            case 3:
                err = utf8::internal::get_sequence_3(it, end, cp);
                break;
            case 4:
                err = utf8::internal::get_sequence_4(it, end, cp);
                break;
        }

        if (err == UTF8_OK) {
            // Decoding succeeded. Now, security checks...
            if (utf8::internal::is_code_point_valid(cp)) {
                if (!utf8::internal::is_overlong_sequence(cp, length)){
                    // Passed! Return here.
                    code_point = cp;
                    ++it;   // get_sequence_x stopped on the last octet; step past it
                    return UTF8_OK;
                }
                else
                    err = OVERLONG_SEQUENCE;
            }
            else
                err = INVALID_CODE_POINT;
        }

        // Failure branch - restore the original value of the iterator
        it = original_it;
        return err;
    }

    /// Overload for callers that only need the validation result.
    template <typename octet_iterator>
    inline utf_error validate_next(octet_iterator& it, octet_iterator end) {
        uint32_t ignored;
        return utf8::internal::validate_next(it, end, ignored);
    }

} // namespace internal

    /// The library API - functions intended to be called by the users

    // Byte order mark
    const uint8_t bom[] = {0xef, 0xbb, 0xbf};

    /// Returns an iterator to the first octet of the first invalid sequence in
    /// [start, end), or end when the whole range validates.
    template <typename octet_iterator>
    octet_iterator find_invalid(octet_iterator start, octet_iterator end)
    {
        octet_iterator result = start;
        while (result != end) {
            utf8::internal::utf_error err_code = utf8::internal::validate_next(result, end);
            if (err_code != internal::UTF8_OK)
                return result;
        }
        return result;
    }

    /// True when every sequence in [start, end) passes validation.
    template <typename octet_iterator>
    inline bool is_valid(octet_iterator start, octet_iterator end)
    {
        return (utf8::find_invalid(start, end) == end);
    }

    /// True when [it, end) begins with the three BOM octets; never reads past end.
    template <typename octet_iterator>
    inline bool starts_with_bom (octet_iterator it, octet_iterator end)
    {
        return (
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[0]) &&
            ((it != end) && (utf8::internal::mask8(*it++)) == bom[1]) &&
            ((it != end) && (utf8::internal::mask8(*it))   == bom[2])
           );
    }

    //Deprecated in release 2.3
    /// Like starts_with_bom but without an end bound: may read up to three
    /// octets starting at it, so the caller must guarantee they exist.
    template <typename octet_iterator>
    inline bool is_bom (octet_iterator it)
    {
        return (
            (utf8::internal::mask8(*it++)) == bom[0] &&
            (utf8::internal::mask8(*it++)) == bom[1] &&
            (utf8::internal::mask8(*it))   == bom[2]
           );
    }
} // namespace utf8
327 | + | ||
328 | +#endif // header guard | ||
329 | + | ||
330 | + |
morfeusz/encoding/utf8/unchecked.h
0 → 100644
1 | +// Copyright 2006 Nemanja Trifunovic | ||
2 | + | ||
3 | +/* | ||
4 | +Permission is hereby granted, free of charge, to any person or organization | ||
5 | +obtaining a copy of the software and accompanying documentation covered by | ||
6 | +this license (the "Software") to use, reproduce, display, distribute, | ||
7 | +execute, and transmit the Software, and to prepare derivative works of the | ||
8 | +Software, and to permit third-parties to whom the Software is furnished to | ||
9 | +do so, all subject to the following: | ||
10 | + | ||
11 | +The copyright notices in the Software and this entire statement, including | ||
12 | +the above license grant, this restriction and the following disclaimer, | ||
13 | +must be included in all copies of the Software, in whole or in part, and | ||
14 | +all derivative works of the Software, unless such copies or derivative | ||
15 | +works are solely in the form of machine-executable object code generated by | ||
16 | +a source language processor. | ||
17 | + | ||
18 | +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
19 | +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
20 | +FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT | ||
21 | +SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE | ||
22 | +FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, | ||
23 | +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER | ||
24 | +DEALINGS IN THE SOFTWARE. | ||
25 | +*/ | ||
26 | + | ||
27 | + | ||
28 | +#ifndef UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
29 | +#define UTF8_FOR_CPP_UNCHECKED_H_2675DCD0_9480_4c0c_B92A_CC14C027B731 | ||
30 | + | ||
#include <cstddef>
#include <iterator>

#include "core.h"
32 | + | ||
33 | +namespace utf8 | ||
34 | +{ | ||
35 | + namespace unchecked | ||
36 | + { | ||
37 | + template <typename octet_iterator> | ||
38 | + octet_iterator append(uint32_t cp, octet_iterator result) | ||
39 | + { | ||
40 | + if (cp < 0x80) // one octet | ||
41 | + *(result++) = static_cast<uint8_t>(cp); | ||
42 | + else if (cp < 0x800) { // two octets | ||
43 | + *(result++) = static_cast<uint8_t>((cp >> 6) | 0xc0); | ||
44 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
45 | + } | ||
46 | + else if (cp < 0x10000) { // three octets | ||
47 | + *(result++) = static_cast<uint8_t>((cp >> 12) | 0xe0); | ||
48 | + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); | ||
49 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
50 | + } | ||
51 | + else { // four octets | ||
52 | + *(result++) = static_cast<uint8_t>((cp >> 18) | 0xf0); | ||
53 | + *(result++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)| 0x80); | ||
54 | + *(result++) = static_cast<uint8_t>(((cp >> 6) & 0x3f) | 0x80); | ||
55 | + *(result++) = static_cast<uint8_t>((cp & 0x3f) | 0x80); | ||
56 | + } | ||
57 | + return result; | ||
58 | + } | ||
59 | + | ||
60 | + template <typename octet_iterator> | ||
61 | + uint32_t next(octet_iterator& it) | ||
62 | + { | ||
63 | + uint32_t cp = utf8::internal::mask8(*it); | ||
64 | + typename std::iterator_traits<octet_iterator>::difference_type length = utf8::internal::sequence_length(it); | ||
65 | + switch (length) { | ||
66 | + case 1: | ||
67 | + break; | ||
68 | + case 2: | ||
69 | + it++; | ||
70 | + cp = ((cp << 6) & 0x7ff) + ((*it) & 0x3f); | ||
71 | + break; | ||
72 | + case 3: | ||
73 | + ++it; | ||
74 | + cp = ((cp << 12) & 0xffff) + ((utf8::internal::mask8(*it) << 6) & 0xfff); | ||
75 | + ++it; | ||
76 | + cp += (*it) & 0x3f; | ||
77 | + break; | ||
78 | + case 4: | ||
79 | + ++it; | ||
80 | + cp = ((cp << 18) & 0x1fffff) + ((utf8::internal::mask8(*it) << 12) & 0x3ffff); | ||
81 | + ++it; | ||
82 | + cp += (utf8::internal::mask8(*it) << 6) & 0xfff; | ||
83 | + ++it; | ||
84 | + cp += (*it) & 0x3f; | ||
85 | + break; | ||
86 | + } | ||
87 | + ++it; | ||
88 | + return cp; | ||
89 | + } | ||
90 | + | ||
91 | + template <typename octet_iterator> | ||
92 | + uint32_t peek_next(octet_iterator it) | ||
93 | + { | ||
94 | + return utf8::unchecked::next(it); | ||
95 | + } | ||
96 | + | ||
97 | + template <typename octet_iterator> | ||
98 | + uint32_t prior(octet_iterator& it) | ||
99 | + { | ||
100 | + while (utf8::internal::is_trail(*(--it))) ; | ||
101 | + octet_iterator temp = it; | ||
102 | + return utf8::unchecked::next(temp); | ||
103 | + } | ||
104 | + | ||
105 | + // Deprecated in versions that include prior, but only for the sake of consistency (see utf8::previous) | ||
106 | + template <typename octet_iterator> | ||
107 | + inline uint32_t previous(octet_iterator& it) | ||
108 | + { | ||
109 | + return utf8::unchecked::prior(it); | ||
110 | + } | ||
111 | + | ||
112 | + template <typename octet_iterator, typename distance_type> | ||
113 | + void advance (octet_iterator& it, distance_type n) | ||
114 | + { | ||
115 | + for (distance_type i = 0; i < n; ++i) | ||
116 | + utf8::unchecked::next(it); | ||
117 | + } | ||
118 | + | ||
119 | + template <typename octet_iterator> | ||
120 | + typename std::iterator_traits<octet_iterator>::difference_type | ||
121 | + distance (octet_iterator first, octet_iterator last) | ||
122 | + { | ||
123 | + typename std::iterator_traits<octet_iterator>::difference_type dist; | ||
124 | + for (dist = 0; first < last; ++dist) | ||
125 | + utf8::unchecked::next(first); | ||
126 | + return dist; | ||
127 | + } | ||
128 | + | ||
129 | + template <typename u16bit_iterator, typename octet_iterator> | ||
130 | + octet_iterator utf16to8 (u16bit_iterator start, u16bit_iterator end, octet_iterator result) | ||
131 | + { | ||
132 | + while (start != end) { | ||
133 | + uint32_t cp = utf8::internal::mask16(*start++); | ||
134 | + // Take care of surrogate pairs first | ||
135 | + if (utf8::internal::is_lead_surrogate(cp)) { | ||
136 | + uint32_t trail_surrogate = utf8::internal::mask16(*start++); | ||
137 | + cp = (cp << 10) + trail_surrogate + internal::SURROGATE_OFFSET; | ||
138 | + } | ||
139 | + result = utf8::unchecked::append(cp, result); | ||
140 | + } | ||
141 | + return result; | ||
142 | + } | ||
143 | + | ||
144 | + template <typename u16bit_iterator, typename octet_iterator> | ||
145 | + u16bit_iterator utf8to16 (octet_iterator start, octet_iterator end, u16bit_iterator result) | ||
146 | + { | ||
147 | + while (start < end) { | ||
148 | + uint32_t cp = utf8::unchecked::next(start); | ||
149 | + if (cp > 0xffff) { //make a surrogate pair | ||
150 | + *result++ = static_cast<uint16_t>((cp >> 10) + internal::LEAD_OFFSET); | ||
151 | + *result++ = static_cast<uint16_t>((cp & 0x3ff) + internal::TRAIL_SURROGATE_MIN); | ||
152 | + } | ||
153 | + else | ||
154 | + *result++ = static_cast<uint16_t>(cp); | ||
155 | + } | ||
156 | + return result; | ||
157 | + } | ||
158 | + | ||
159 | + template <typename octet_iterator, typename u32bit_iterator> | ||
160 | + octet_iterator utf32to8 (u32bit_iterator start, u32bit_iterator end, octet_iterator result) | ||
161 | + { | ||
162 | + while (start != end) | ||
163 | + result = utf8::unchecked::append(*(start++), result); | ||
164 | + | ||
165 | + return result; | ||
166 | + } | ||
167 | + | ||
168 | + template <typename octet_iterator, typename u32bit_iterator> | ||
169 | + u32bit_iterator utf8to32 (octet_iterator start, octet_iterator end, u32bit_iterator result) | ||
170 | + { | ||
171 | + while (start < end) | ||
172 | + (*result++) = utf8::unchecked::next(start); | ||
173 | + | ||
174 | + return result; | ||
175 | + } | ||
176 | + | ||
177 | + // The iterator class | ||
178 | + template <typename octet_iterator> | ||
179 | + class iterator : public std::iterator <std::bidirectional_iterator_tag, uint32_t> { | ||
180 | + octet_iterator it; | ||
181 | + public: | ||
182 | + iterator () {} | ||
183 | + explicit iterator (const octet_iterator& octet_it): it(octet_it) {} | ||
184 | + // the default "big three" are OK | ||
185 | + octet_iterator base () const { return it; } | ||
186 | + uint32_t operator * () const | ||
187 | + { | ||
188 | + octet_iterator temp = it; | ||
189 | + return utf8::unchecked::next(temp); | ||
190 | + } | ||
191 | + bool operator == (const iterator& rhs) const | ||
192 | + { | ||
193 | + return (it == rhs.it); | ||
194 | + } | ||
195 | + bool operator != (const iterator& rhs) const | ||
196 | + { | ||
197 | + return !(operator == (rhs)); | ||
198 | + } | ||
199 | + iterator& operator ++ () | ||
200 | + { | ||
201 | + ::std::advance(it, utf8::internal::sequence_length(it)); | ||
202 | + return *this; | ||
203 | + } | ||
204 | + iterator operator ++ (int) | ||
205 | + { | ||
206 | + iterator temp = *this; | ||
207 | + ::std::advance(it, utf8::internal::sequence_length(it)); | ||
208 | + return temp; | ||
209 | + } | ||
210 | + iterator& operator -- () | ||
211 | + { | ||
212 | + utf8::unchecked::prior(it); | ||
213 | + return *this; | ||
214 | + } | ||
215 | + iterator operator -- (int) | ||
216 | + { | ||
217 | + iterator temp = *this; | ||
218 | + utf8::unchecked::prior(it); | ||
219 | + return temp; | ||
220 | + } | ||
221 | + }; // class iterator | ||
222 | + | ||
223 | + } // namespace utf8::unchecked | ||
224 | +} // namespace utf8 | ||
225 | + | ||
226 | + | ||
227 | +#endif // header guard | ||
228 | + |
morfeusz/test_morfeusz.cpp
0 → 100644
1 | +/* | ||
2 | + * File: test_morfeusz.cpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on 14 listopad 2013, 15:50 | ||
6 | + */ | ||
7 | + | ||
8 | +#include <cstdlib> | ||
9 | +#include <cstdio> | ||
10 | +#include <sstream> | ||
11 | +#include <iostream> | ||
12 | + | ||
13 | +#include "Morfeusz.hpp" | ||
14 | + | ||
15 | +using namespace std; | ||
16 | + | ||
17 | +void debug(const MorphInterpretation& interp) { | ||
18 | + fprintf(stderr, | ||
19 | + "%d %d %s %s %s %s\n", | ||
20 | + interp.getStartNode(), interp.getEndNode(), | ||
21 | + interp.getOrth(), interp.getLemma(), | ||
22 | + interp.getTag(), interp.getName()); | ||
23 | +} | ||
24 | + | ||
25 | +void doTest( | ||
26 | + const Morfeusz& morfeusz, | ||
27 | + const string& inputFilename) { | ||
28 | + ifstream ifs; | ||
29 | + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | ||
30 | + ifs.open(inputFilename, ios::binary); | ||
31 | + string line; | ||
32 | + while (getline(ifs, line)) { | ||
33 | + AnalyzeResult res(morfeusz.analyze(line)); | ||
34 | + while (res.iterator != res.end) { | ||
35 | + debug(*res); | ||
36 | + res++; | ||
37 | + } | ||
38 | + } | ||
39 | + validate(ifs.eof(), "Failed to read the input file to the end"); | ||
40 | +} | ||
41 | + | ||
42 | +int main(int argc, char** argv) { | ||
43 | + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and input filename."); | ||
44 | + string fsaFilename = argv[1]; | ||
45 | + string inputFilename = argv[2]; | ||
46 | + Morfeusz morfeusz(fsaFilename); | ||
47 | + doTest(morfeusz, inputFilename); | ||
48 | + return 0; | ||
49 | +} | ||
50 | + |
morfeusz/test_morph.cpp
@@ -17,21 +17,6 @@ | @@ -17,21 +17,6 @@ | ||
17 | 17 | ||
18 | using namespace std; | 18 | using namespace std; |
19 | 19 | ||
20 | -void debug(const string& key, const vector<EncodedInterpretation> value) { | ||
21 | - cerr << key << endl; | ||
22 | - for (EncodedInterpretation i: value) { | ||
23 | - cerr << "suffix to cut: " << i.lemma.suffixToCut << endl; | ||
24 | - cerr << "suffix to add: " << i.lemma.suffixToAdd << endl; | ||
25 | - cerr << "tag: " << i.tag << endl; | ||
26 | - cerr << "name: " << i.nameClassifier << endl; | ||
27 | - } | ||
28 | - cerr << "==================" << endl; | ||
29 | -} | ||
30 | - | ||
31 | -//void debug(const string& key, const TaggedInterpretation& value) { | ||
32 | -// cerr << key << '\t' << value.toString() << endl; | ||
33 | -//} | ||
34 | - | ||
35 | void doTest( | 20 | void doTest( |
36 | const FSA<vector<EncodedInterpretation>>& fsa, | 21 | const FSA<vector<EncodedInterpretation>>& fsa, |
37 | const Tagset& tagset, | 22 | const Tagset& tagset, |
nbproject/configurations.xml
@@ -8,11 +8,13 @@ | @@ -8,11 +8,13 @@ | ||
8 | <in>test_speed.cpp</in> | 8 | <in>test_speed.cpp</in> |
9 | </df> | 9 | </df> |
10 | <df root="morfeusz" name="1"> | 10 | <df root="morfeusz" name="1"> |
11 | + <df name="encoding"> | ||
12 | + <in>CharsetConverter.cpp</in> | ||
13 | + <in>CharsetConverter.hpp</in> | ||
14 | + </df> | ||
11 | <in>Morfeusz.cpp</in> | 15 | <in>Morfeusz.cpp</in> |
12 | - <in>Morfeusz.hpp</in> | ||
13 | <in>MorphDeserializer.cpp</in> | 16 | <in>MorphDeserializer.cpp</in> |
14 | <in>MorphInterpretation.cpp</in> | 17 | <in>MorphInterpretation.cpp</in> |
15 | - <in>MorphInterpretation.hpp</in> | ||
16 | <in>Tagset.cpp</in> | 18 | <in>Tagset.cpp</in> |
17 | <in>main.cpp</in> | 19 | <in>main.cpp</in> |
18 | <in>morfeusz.cpp</in> | 20 | <in>morfeusz.cpp</in> |
@@ -49,19 +51,11 @@ | @@ -49,19 +51,11 @@ | ||
49 | <executablePath>build/fsa/test_dict</executablePath> | 51 | <executablePath>build/fsa/test_dict</executablePath> |
50 | </makeTool> | 52 | </makeTool> |
51 | </makefileType> | 53 | </makefileType> |
52 | - <folder path="1"> | 54 | + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="4"> |
53 | <ccTool> | 55 | <ccTool> |
54 | <incDir> | 56 | <incDir> |
55 | <pElem>fsa</pElem> | 57 | <pElem>fsa</pElem> |
56 | - <pElem>build/morfeusz</pElem> | ||
57 | - </incDir> | ||
58 | - </ccTool> | ||
59 | - </folder> | ||
60 | - <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> | ||
61 | - <ccTool> | ||
62 | - <incDir> | ||
63 | - <pElem>fsa</pElem> | ||
64 | - <pElem>build/morfeusz</pElem> | 58 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> |
65 | </incDir> | 59 | </incDir> |
66 | </ccTool> | 60 | </ccTool> |
67 | </item> | 61 | </item> |
@@ -86,32 +80,86 @@ | @@ -86,32 +80,86 @@ | ||
86 | </incDir> | 80 | </incDir> |
87 | </ccTool> | 81 | </ccTool> |
88 | </item> | 82 | </item> |
89 | - <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="0"> | ||
90 | - </item> | ||
91 | - <item path="morfeusz/Morfeusz.hpp" ex="false" tool="3" flavor2="0"> | 83 | + <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> |
84 | + <ccTool> | ||
85 | + <incDir> | ||
86 | + <pElem>fsa</pElem> | ||
87 | + <pElem>build/morfeusz</pElem> | ||
88 | + </incDir> | ||
89 | + </ccTool> | ||
92 | </item> | 90 | </item> |
93 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> | 91 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> |
94 | <ccTool> | 92 | <ccTool> |
93 | + <incDir> | ||
94 | + <pElem>fsa</pElem> | ||
95 | + <pElem>build/morfeusz</pElem> | ||
96 | + </incDir> | ||
95 | </ccTool> | 97 | </ccTool> |
96 | </item> | 98 | </item> |
97 | - <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="0"> | ||
98 | - </item> | ||
99 | - <item path="morfeusz/MorphInterpretation.hpp" ex="false" tool="3" flavor2="0"> | 99 | + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="4"> |
100 | + <ccTool> | ||
101 | + <incDir> | ||
102 | + <pElem>morfeusz</pElem> | ||
103 | + <pElem>/usr/include/c++/4.8/bits</pElem> | ||
104 | + <pElem>/usr/include/c++/4.8/ext</pElem> | ||
105 | + <pElem>/usr/include/c++/4.8</pElem> | ||
106 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | ||
107 | + <pElem>/usr/include/c++/4.8/debug</pElem> | ||
108 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | ||
109 | + <pElem>/usr/include/c++/4.8/backward</pElem> | ||
110 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | ||
111 | + <pElem>build/morfeusz</pElem> | ||
112 | + </incDir> | ||
113 | + </ccTool> | ||
100 | </item> | 114 | </item> |
101 | - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | 115 | + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="4"> |
102 | <ccTool> | 116 | <ccTool> |
117 | + <incDir> | ||
118 | + <pElem>morfeusz</pElem> | ||
119 | + <pElem>/usr/include/c++/4.8/bits</pElem> | ||
120 | + <pElem>/usr/include/c++/4.8/ext</pElem> | ||
121 | + <pElem>/usr/include/c++/4.8</pElem> | ||
122 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8/bits</pElem> | ||
123 | + <pElem>/usr/include/c++/4.8/debug</pElem> | ||
124 | + <pElem>/usr/lib/gcc/x86_64-linux-gnu/4.8/include</pElem> | ||
125 | + <pElem>fsa</pElem> | ||
126 | + <pElem>/usr/include/c++/4.8/backward</pElem> | ||
127 | + <pElem>/usr/include/x86_64-linux-gnu/c++/4.8</pElem> | ||
128 | + <pElem>build/morfeusz</pElem> | ||
129 | + </incDir> | ||
103 | </ccTool> | 130 | </ccTool> |
104 | </item> | 131 | </item> |
132 | + <item path="morfeusz/encoding/CharsetConverter.cpp" | ||
133 | + ex="false" | ||
134 | + tool="1" | ||
135 | + flavor2="0"> | ||
136 | + </item> | ||
137 | + <item path="morfeusz/encoding/CharsetConverter.hpp" | ||
138 | + ex="false" | ||
139 | + tool="3" | ||
140 | + flavor2="0"> | ||
141 | + </item> | ||
105 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> | 142 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
106 | <ccTool> | 143 | <ccTool> |
144 | + <incDir> | ||
145 | + <pElem>fsa</pElem> | ||
146 | + <pElem>build/morfeusz</pElem> | ||
147 | + </incDir> | ||
107 | </ccTool> | 148 | </ccTool> |
108 | </item> | 149 | </item> |
109 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> | 150 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
110 | <ccTool> | 151 | <ccTool> |
152 | + <incDir> | ||
153 | + <pElem>morfeusz</pElem> | ||
154 | + </incDir> | ||
111 | </ccTool> | 155 | </ccTool> |
112 | </item> | 156 | </item> |
113 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> | 157 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> |
114 | <ccTool> | 158 | <ccTool> |
159 | + <incDir> | ||
160 | + <pElem>fsa</pElem> | ||
161 | + <pElem>build/morfeusz</pElem> | ||
162 | + </incDir> | ||
115 | </ccTool> | 163 | </ccTool> |
116 | </item> | 164 | </item> |
117 | </conf> | 165 | </conf> |
nbproject/project.xml
@@ -6,7 +6,7 @@ | @@ -6,7 +6,7 @@ | ||
6 | <name>morfeusz</name> | 6 | <name>morfeusz</name> |
7 | <c-extensions/> | 7 | <c-extensions/> |
8 | <cpp-extensions>cpp</cpp-extensions> | 8 | <cpp-extensions>cpp</cpp-extensions> |
9 | - <header-extensions>hpp</header-extensions> | 9 | + <header-extensions>h,hpp</header-extensions> |
10 | <sourceEncoding>UTF-8</sourceEncoding> | 10 | <sourceEncoding>UTF-8</sourceEncoding> |
11 | <make-dep-projects/> | 11 | <make-dep-projects/> |
12 | <sourceRootList> | 12 | <sourceRootList> |