Commit 40c791419d285c2c3513e60324680fc945315efd
1 parent
0133e003
- generator w zasadzie już działa
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@82 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
35 changed files
with
723 additions
and
357 deletions
CMakeLists.txt
... | ... | @@ -29,6 +29,7 @@ set (PROJECT_VERSION "${Morfeusz_VERSION_MAJOR}.${Morfeusz_VERSION_MINOR}.${Morf |
29 | 29 | |
30 | 30 | # INPUT_DICTIONARY_CPP |
31 | 31 | set (INPUT_DICTIONARY_CPP "${CMAKE_CURRENT_BINARY_DIR}/default_fsa.cpp") |
32 | +set (INPUT_SYNTH_DICTIONARY_CPP "${CMAKE_CURRENT_BINARY_DIR}/default_synth_fsa.cpp") | |
32 | 33 | if ("${INPUT_DICTIONARY}" STREQUAL "") |
33 | 34 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
34 | 35 | set (INPUT_DICTIONARY ${PROJECT_SOURCE_DIR}/input/empty.txt) |
... | ... | @@ -52,7 +53,10 @@ endif () |
52 | 53 | ### Compilation and linking flags |
53 | 54 | |
54 | 55 | if (${CMAKE_SYSTEM_NAME} MATCHES "Linux") |
55 | - set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98 -Wall -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align -O2") | |
56 | + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++98 -Wall -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align") | |
# Add -O2 only for Release builds.
# CMAKE_BUILD_TYPE is quoted because it is empty unless the user sets it;
# unquoted, the condition would expand to `if ( STREQUAL "Release")` and
# abort the configure step.
if ("${CMAKE_BUILD_TYPE}" STREQUAL "Release")
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
endif ()
56 | 60 | elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
57 | 61 | set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++0x -Wall -O2") |
58 | 62 | set (CMAKE_SHARED_LIBRARY_PREFIX "") |
... | ... | @@ -107,7 +111,8 @@ add_subdirectory (fsabuilder) |
107 | 111 | ########## add tests ########## |
108 | 112 | |
109 | 113 | macro (test_build_and_recognize fname method) |
110 | - add_test (TestBuild-${method}-${fname} python fsabuilder/buildfsa.py --analyzer -i testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --output-format=BINARY --serialization-method=${method}) | |
114 | + add_test (TestBuild-${method}-${fname} python fsabuilder/buildfsa.py --analyzer -i testfiles/${fname} -o /tmp/test-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
115 | + add_test (TestBuild4Synth-${method}-${fname} python fsabuilder/buildfsa.py --generator -i testfiles/${fname} -o /tmp/test-synth-${method}-${fname}.fsa --tagset-file=testfiles/polimorf.tagset --serialization-method=${method}) | |
111 | 116 | add_test (TestRecognize-${method}-${fname} morfeusz/test_recognize_dict /tmp/test-${method}-${fname}.fsa testfiles/${fname}) |
112 | 117 | # add_test (TestNOTRecognize-${method}-${fname} fsa/test_not_recognize /tmp/test-${method}-${fname}.fsa testfiles/out_of_dict) |
113 | 118 | # add_test (TestSpeed-${method}-${fname} fsa/test_speed /tmp/test-${method}-${fname}.fsa testfiles/speed_test_data) |
... | ... |
fsabuilder/buildfsa.py
... | ... | @@ -162,7 +162,7 @@ def buildGeneratorFromPoliMorf(inputFile, tagsetFile): |
162 | 162 | encoder = encode.Encoder4Generator() |
163 | 163 | tagset = common.Tagset(tagsetFile) |
164 | 164 | fsa = FSA(encoder, tagset) |
165 | - inputData = _readPolimorfInput4Analyzer(inputFile, tagset, encoder) | |
165 | + inputData = _readPolimorfInput4Generator(inputFile, tagset, encoder) | |
166 | 166 | for word, data in inputData: |
167 | 167 | fsa.addEntry(word, data) |
168 | 168 | fsa.close() |
... | ... | @@ -192,7 +192,7 @@ def main(opts): |
192 | 192 | }[opts.serializationMethod](fsa) |
193 | 193 | |
194 | 194 | if opts.cpp: |
195 | - serializer.serialize2CppFile(opts.outputFile) | |
195 | + serializer.serialize2CppFile(opts.outputFile, generator=opts.generator) | |
196 | 196 | else: |
197 | 197 | serializer.serialize2BinaryFile(opts.outputFile) |
198 | 198 | # { |
... | ... |
fsabuilder/morfeuszbuilder/fsa/common.py
... | ... | @@ -54,7 +54,6 @@ class Interpretation4Generator(object): |
54 | 54 | self.orth = EncodedForm(base, orth) |
55 | 55 | self.tagnum = tagnum |
56 | 56 | self.namenum = namenum |
57 | - logging.warn(self) | |
58 | 57 | |
59 | 58 | def getSortKey(self): |
60 | 59 | return ( |
... | ... | @@ -74,7 +73,10 @@ class Interpretation4Generator(object): |
74 | 73 | return hash(self.getSortKey()) |
75 | 74 | |
76 | 75 | def __unicode__(self): |
77 | - return u'%s %d %s %d %d' % (self.lemma, self.orth.cutLength, self.orth.suffixToAdd, self.tagnum, self.namenum) | |
76 | + return u'%s,(%d %s),%d,%d' % (self.lemma, self.orth.cutLength, self.orth.suffixToAdd, self.tagnum, self.namenum) | |
77 | + | |
78 | + def __repr__(self): | |
79 | + return unicode(self) | |
78 | 80 | |
79 | 81 | class Tagset(object): |
80 | 82 | |
... | ... | @@ -86,8 +88,8 @@ class Tagset(object): |
86 | 88 | self.tag2tagnum = {} |
87 | 89 | self.name2namenum = {} |
88 | 90 | self._doInit(filename, encoding) |
89 | - print self.tag2tagnum | |
90 | - print self.name2namenum | |
91 | +# print self.tag2tagnum | |
92 | +# print self.name2namenum | |
91 | 93 | |
92 | 94 | def _doInit(self, filename, encoding): |
93 | 95 | addingTo = None |
... | ... |
fsabuilder/morfeuszbuilder/fsa/convertinput.py
... | ... | @@ -480,10 +480,9 @@ class PolimorfConverter4Generator(object): |
480 | 480 | line = line.decode(self.inputEncoding).strip(u'\n') |
481 | 481 | if line: |
482 | 482 | # print line |
483 | - orth, base, tagnum, namenum, typenum = line.split(u' ') | |
483 | + orth, base, tagnum, namenum = line.split(u' ') | |
484 | 484 | tagnum = int(tagnum) |
485 | 485 | namenum = int(namenum) |
486 | - typenum = int(typenum) | |
487 | 486 | yield (base, Interpretation4Generator(orth, base, tagnum, namenum)) |
488 | 487 | |
489 | 488 | def convert(self, inputLines): |
... | ... |
fsabuilder/morfeuszbuilder/fsa/encode.py
... | ... | @@ -29,9 +29,6 @@ class Encoder(object): |
29 | 29 | |
30 | 30 | def decodeData(self, rawData): |
31 | 31 | return NotImplementedError() |
32 | -# print unicode(str(rawData), self.encoding)[:-1] | |
33 | -# print unicode(str(rawData), self.encoding)[:-1].split(u'|') | |
34 | -# return unicode(str(rawData), self.encoding)[:-1].split(u'|') | |
35 | 32 | |
36 | 33 | def decodeWord(self, rawWord): |
37 | 34 | return unicode(str(rawWord).strip('\x00'), self.encoding) |
... | ... | @@ -49,7 +46,8 @@ class Encoder(object): |
49 | 46 | res.append(form.cutLength) |
50 | 47 | res.extend(self.encodeWord(form.suffixToAdd, lowercase=False)) |
51 | 48 | res.append(0) |
52 | - res.extend(self._encodeCasePattern(form.casePattern)) | |
49 | + if withCasePattern: | |
50 | + res.extend(self._encodeCasePattern(form.casePattern)) | |
53 | 51 | return res |
54 | 52 | |
55 | 53 | def _encodeCasePattern(self, casePattern): |
... | ... | @@ -96,17 +94,6 @@ class Encoder(object): |
96 | 94 | assert namenum < 256 and namenum >= 0 |
97 | 95 | return bytearray([namenum]) |
98 | 96 | |
99 | -# class SimpleEncoder(Encoder): | |
100 | -# | |
101 | -# def __init__(self, encoding='utf8'): | |
102 | -# super(SimpleEncoder, self).__init__(encoding) | |
103 | -# | |
104 | -# def encodeData(self, data): | |
105 | -# return bytearray(data, encoding=self.encoding) + bytearray([0]) | |
106 | -# | |
107 | -# def decodeData(self, rawData): | |
108 | -# return unicode(str(rawData)[:-1], self.encoding) | |
109 | - | |
110 | 97 | class MorphEncoder(Encoder): |
111 | 98 | |
112 | 99 | def __init__(self, encoding='utf8'): |
... | ... | @@ -133,11 +120,10 @@ class MorphEncoder(Encoder): |
133 | 120 | class Encoder4Generator(Encoder): |
134 | 121 | |
135 | 122 | def __init__(self, encoding='utf8'): |
136 | - super(MorphEncoder, self).__init__(encoding) | |
123 | + super(Encoder4Generator, self).__init__(encoding) | |
137 | 124 | |
138 | 125 | def encodeData(self, interpsList): |
139 | 126 | res = bytearray() |
140 | -# print interpsList | |
141 | 127 | firstByte = len(interpsList) |
142 | 128 | assert firstByte < 256 |
143 | 129 | assert firstByte > 0 |
... | ... | @@ -148,3 +134,6 @@ class Encoder4Generator(Encoder): |
148 | 134 | res.extend(self._encodeTagNum(interp.tagnum)) |
149 | 135 | res.extend(self._encodeNameNum(interp.namenum)) |
150 | 136 | return res |
137 | +# | |
138 | +# def decodeData(self, data): | |
139 | +# | |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.py
... | ... | @@ -54,30 +54,6 @@ class FSA(object): |
54 | 54 | self.encodedPrevWord = None |
55 | 55 | self.closed = True |
56 | 56 | |
57 | -# def feed(self, input): | |
58 | -# | |
59 | -# # allWords = [] | |
60 | -# for n, (word, data) in enumerate(input, start=1): | |
61 | -# assert data is not None | |
62 | -# encodedWord = self.encodeWord(word) | |
63 | -# assert encodedWord > self.encodedPrevWord | |
64 | -# if encodedWord > self.encodedPrevWord: | |
65 | -# self._addSorted(encodedWord, self.encodeData(data)) | |
66 | -# self.encodedPrevWord = encodedWord | |
67 | -# # assert self.tryToRecognize(word) == data | |
68 | -# if n % 10000 == 0: | |
69 | -# logging.info(word) | |
70 | -# logging.info(str(self.register.getStatesNum())) | |
71 | -# # allWords.append(word) | |
72 | -# for label in encodedWord: | |
73 | -# self.label2Freq[label] = self.label2Freq.get(label, 0) + 1 | |
74 | -# | |
75 | -# self.initialState = self._replaceOrRegister(self.initialState, self.encodeWord(word)) | |
76 | -# self.encodedPrevWord = None | |
77 | - | |
78 | -# for w in allWords: | |
79 | -# self.tryToRecognize(w, True) | |
80 | - | |
81 | 57 | def train(self, trainData): |
82 | 58 | self.label2Freq = {} |
83 | 59 | for idx, word in enumerate(trainData): |
... | ... |
fsabuilder/morfeuszbuilder/fsa/fsa.pyc
No preview for this file type
fsabuilder/morfeuszbuilder/fsa/serializer.py
... | ... | @@ -22,14 +22,17 @@ class Serializer(object): |
22 | 22 | def getVersion(self): |
23 | 23 | return 9 |
24 | 24 | |
25 | - def serialize2CppFile(self, fname): | |
25 | + def serialize2CppFile(self, fname, generator): | |
26 | 26 | res = [] |
27 | 27 | # self.fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) |
28 | 28 | res.append('\n') |
29 | 29 | res.append('#include "%s"' % self.headerFilename) |
30 | 30 | res.append('\n') |
31 | 31 | res.append('\n') |
32 | - res.append('extern const unsigned char DEFAULT_FSA[] = {') | |
32 | + if generator: | |
33 | + res.append('extern const unsigned char DEFAULT_SYNTH_FSA[] = {') | |
34 | + else: | |
35 | + res.append('extern const unsigned char DEFAULT_FSA[] = {') | |
33 | 36 | res.append('\n') |
34 | 37 | for byte in self.fsa2bytearray(): |
35 | 38 | res.append(hex(byte)); |
... | ... |
fsabuilder/morfeuszbuilder/fsa/serializer.pyc
No preview for this file type
morfeusz/CMakeLists.txt
... | ... | @@ -2,7 +2,13 @@ |
2 | 2 | ########## generate default dictionary data ################# |
3 | 3 | add_custom_command ( |
4 | 4 | OUTPUT "${INPUT_DICTIONARY_CPP}" |
5 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer -i "${INPUT_DICTIONARY}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --output-format=CPP --serialization-method=SIMPLE | |
5 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --analyzer -i "${INPUT_DICTIONARY}" -o "${INPUT_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --cpp --serialization-method=SIMPLE | |
6 | + DEPENDS "${INPUT_DICTIONARY}" | |
7 | + COMMENT "Building default dictionary C++ file" | |
8 | +) | |
9 | +add_custom_command ( | |
10 | + OUTPUT "${INPUT_SYNTH_DICTIONARY_CPP}" | |
11 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/buildfsa.py --generator -i "${INPUT_DICTIONARY}" -o "${INPUT_SYNTH_DICTIONARY_CPP}" "--tagset-file=${INPUT_TAGSET}" --cpp --serialization-method=SIMPLE | |
6 | 12 | DEPENDS "${INPUT_DICTIONARY}" |
7 | 13 | COMMENT "Building default dictionary C++ file" |
8 | 14 | ) |
... | ... | @@ -14,10 +20,13 @@ include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ) |
14 | 20 | #### build ##### |
15 | 21 | |
16 | 22 | set(SRC_FILES |
17 | - const.cpp | |
23 | + const.cpp | |
18 | 24 | ${INPUT_DICTIONARY_CPP} |
25 | + ${INPUT_SYNTH_DICTIONARY_CPP} | |
26 | + Environment.cpp | |
19 | 27 | MorphDeserializer.cpp |
20 | 28 | GeneratorDeserializer.cpp |
29 | + Generator.cpp | |
21 | 30 | Tagset.cpp |
22 | 31 | fsa/const.cpp |
23 | 32 | MorphInterpretation.cpp |
... | ... | @@ -31,7 +40,9 @@ set(SRC_FILES |
31 | 40 | set(INCLUDE_FILES |
32 | 41 | const.hpp |
33 | 42 | data/default_fsa.hpp |
34 | - MorphDeserializer.hpp | |
43 | + MorphDeserializer.hpp | |
44 | + GeneratorDeserializer.hpp | |
45 | + Generator.hpp | |
35 | 46 | Tagset.hpp |
36 | 47 | fsa/const.hpp |
37 | 48 | MorphInterpretation.hpp |
... | ... | @@ -47,11 +58,13 @@ set_source_files_properties ( SOURCE "${INPUT_DICTIONARY_CPP}" PROPERTIES GENERA |
47 | 58 | # add_dependencies (libmorfeusz dupa) |
48 | 59 | set_target_properties (libmorfeusz PROPERTIES OUTPUT_NAME "morfeusz") |
49 | 60 | |
50 | -add_executable (morfeusz main.cpp) | |
61 | +add_executable (morfeusz_analyzer morfeusz_analyzer.cpp) | |
62 | +add_executable (morfeusz_generator morfeusz_generator.cpp) | |
51 | 63 | add_executable (test_result_equals test_result_equals.cpp) |
52 | 64 | add_executable (test_recognize_dict test_recognize_dict.cpp) |
53 | 65 | |
54 | -target_link_libraries (morfeusz libmorfeusz) | |
66 | +target_link_libraries (morfeusz_analyzer libmorfeusz) | |
67 | +target_link_libraries (morfeusz_generator libmorfeusz) | |
55 | 68 | target_link_libraries (test_result_equals libmorfeusz) |
56 | 69 | target_link_libraries (test_recognize_dict libmorfeusz) |
57 | 70 | |
... | ... | @@ -67,4 +80,4 @@ add_subdirectory (python) |
67 | 80 | |
68 | 81 | install (FILES ${INCLUDE_FILES} DESTINATION include/morfeusz) |
69 | 82 | install (TARGETS libmorfeusz DESTINATION ${TARGET_LIB_DIR}) |
70 | -install (TARGETS morfeusz DESTINATION bin) | |
83 | +install (TARGETS morfeusz_analyzer morfeusz_generator DESTINATION bin) | |
... | ... |
morfeusz/Environment.cpp
0 → 100644
1 | +/* | |
2 | + * File: Environment.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 22 styczeń 2014, 12:08 | |
6 | + */ | |
7 | + | |
8 | +#include "Environment.hpp" | |
9 | +#include "exceptions.hpp" | |
10 | + | |
/**
 * Creates an environment that stores its own copies of both tagsets and
 * selects the converter matching `charset`.
 *
 * The converter lookup throws MorfeuszException when `charset` is not one of
 * the values handled by getCharsetConverter(MorfeuszCharset).
 */
Environment::Environment(
        const Tagset& analyzerTagset,
        const Tagset& generatorTagset,
        MorfeuszCharset charset)
: currentCharsetConverter(getCharsetConverter(charset)),
analyzerTagset(analyzerTagset),
generatorTagset(generatorTagset) {
    // Nothing else to do: every member is set up in the initializer list
    // or default-constructed.
}
20 | + | |
21 | +const CharsetConverter* Environment::getCharsetConverter(MorfeuszCharset charset) const { | |
22 | + switch (charset) { | |
23 | + case UTF8: | |
24 | + return &this->utf8CharsetConverter; | |
25 | + case ISO8859_2: | |
26 | + return &this->isoCharsetConverter; | |
27 | + case CP1250: | |
28 | + return &this->cp1250CharsetConverter; | |
29 | + case CP852: | |
30 | + return &this->cp852CharsetConverter; | |
31 | + default: | |
32 | + throw MorfeuszException("invalid charset"); | |
33 | + } | |
34 | +} | |
35 | + | |
36 | +Environment::~Environment() { | |
37 | +} | |
38 | + | |
39 | +void Environment::setCharset(MorfeuszCharset charset) { | |
40 | + this->currentCharsetConverter = this->getCharsetConverter(charset); | |
41 | +} | |
42 | + | |
43 | +const CharsetConverter& Environment::getCharsetConverter() const { | |
44 | + return *this->currentCharsetConverter; | |
45 | +} | |
46 | + | |
47 | +void Environment::setAnalyzerTagset(const Tagset& tagset) { | |
48 | + this->analyzerTagset = tagset; | |
49 | +} | |
50 | + | |
51 | +const Tagset& Environment::getAnalyzerTagset() const { | |
52 | + return this->analyzerTagset; | |
53 | +} | |
54 | + | |
55 | +void Environment::setGeneratorTagset(const Tagset& tagset) { | |
56 | + this->generatorTagset = tagset; | |
57 | +} | |
58 | + | |
59 | +const Tagset& Environment::getGeneratorTagset() const { | |
60 | + return this->generatorTagset; | |
61 | +} | |
62 | + | |
63 | +const CaseConverter& Environment::getCaseConverter() const { | |
64 | + return this->caseConverter; | |
65 | +} | |
... | ... |
morfeusz/Environment.hpp
0 → 100644
1 | +/* | |
2 | + * File: Environment.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 22 styczeń 2014, 12:08 | |
6 | + */ | |
7 | + | |
8 | +#ifndef ENVIRONMENT_HPP | |
9 | +#define ENVIRONMENT_HPP | |
10 | + | |
11 | +#include "charset/CaseConverter.hpp" | |
12 | +#include "charset/CharsetConverter.hpp" | |
13 | +#include "const.hpp" | |
14 | +#include "Tagset.hpp" | |
15 | + | |
16 | + | |
/**
 * Bundles the settings shared by the processing classes: the currently
 * selected charset converter, the analyzer and generator tagsets, and a
 * case converter.
 */
class Environment {
public:
    // Copies both tagsets and selects the converter for `charset`.
    Environment(
            const Tagset& analyzerTagset,
            const Tagset& generatorTagset,
            MorfeuszCharset charset);
    // Selects the converter used for subsequent getCharsetConverter() calls.
    void setCharset(MorfeuszCharset charset);
    // Returns the currently selected converter.
    const CharsetConverter& getCharsetConverter() const;

    void setAnalyzerTagset(const Tagset& tagset);
    const Tagset& getAnalyzerTagset() const;

    void setGeneratorTagset(const Tagset& tagset);
    const Tagset& getGeneratorTagset() const;

    const CaseConverter& getCaseConverter() const;

    virtual ~Environment();
private:
    // Points at one of the four converter members declared below.
    const CharsetConverter* currentCharsetConverter;
    const UTF8CharsetConverter utf8CharsetConverter;
    const ISO8859_2_CharsetConverter isoCharsetConverter;
    const Windows_1250_CharsetConverter cp1250CharsetConverter;
    const CP852_CharsetConverter cp852CharsetConverter;
    // Tagsets are stored by value (copied from the constructor/setter arguments).
    Tagset analyzerTagset;
    Tagset generatorTagset;
    const CaseConverter caseConverter;

    // Maps a charset identifier to the address of the matching member above.
    const CharsetConverter* getCharsetConverter(MorfeuszCharset charset) const;
};
47 | + | |
48 | +#endif /* ENVIRONMENT_HPP */ | |
49 | + | |
... | ... |
morfeusz/Generator.cpp
0 → 100644
1 | +/* | |
2 | + * File: Generator.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 21 styczeń 2014, 14:38 | |
6 | + */ | |
7 | + | |
8 | +#include <string> | |
9 | +#include <iostream> | |
10 | +#include "charset/charset_utils.hpp" | |
11 | +#include "MorphInterpretation.hpp" | |
12 | +#include "Generator.hpp" | |
13 | +#include "Environment.hpp" | |
14 | + | |
15 | + | |
16 | +using namespace std; | |
17 | + | |
/**
 * Builds a generator over the serialized automaton starting at `ptr`.
 *
 * `env` is kept by reference, so it has to stay alive as long as this object.
 * The automaton is created with the member deserializer, which is therefore
 * initialized first.
 */
Generator::Generator(
        const unsigned char* ptr,
        const Environment& env)
: deserializer(env),
fsa(SynthFSAType::getFSA(ptr, deserializer)),
env(env) {
}
25 | + | |
// Destructor; releases nothing explicitly.
// NOTE(review): `fsa` comes from SynthFSAType::getFSA() and is not deleted
// here - confirm who owns that object.
Generator::~Generator() {
}
28 | + | |
29 | +std::string Generator::decodeOrth( | |
30 | + const EncodedOrth& orth, | |
31 | + const std::vector<uint32_t>& lemma) const { | |
32 | + string res; | |
33 | + for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | |
34 | + uint32_t cp = lemma[i]; | |
35 | + env.getCharsetConverter().append(cp, res); | |
36 | + } | |
37 | + const char* suffixPtr = orth.suffixToAdd.c_str(); | |
38 | + const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | |
39 | + while (suffixPtr != suffixEnd) { | |
40 | + uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
41 | + env.getCharsetConverter().append(cp, res); | |
42 | + } | |
43 | + return res; | |
44 | +} | |
45 | + | |
46 | +void Generator::decodeRes( | |
47 | + const std::vector<EncodedGeneratorInterpretation>& encodedRes, | |
48 | + const std::string& lemma, | |
49 | + const std::vector<uint32_t>& lemmaCodepoints, | |
50 | + std::vector<MorphInterpretation>& result) const { | |
51 | + | |
52 | + for (unsigned int i = 0; i < encodedRes.size(); i++) { | |
53 | + EncodedGeneratorInterpretation egi = encodedRes[i]; | |
54 | + string decodedOrth = this->decodeOrth(egi.orth, lemmaCodepoints); | |
55 | + MorphInterpretation mi( | |
56 | + 0, 0, | |
57 | + decodedOrth, lemma, | |
58 | + egi.tag, | |
59 | + egi.nameClassifier, | |
60 | + env.getAnalyzerTagset(), | |
61 | + env.getCharsetConverter()); | |
62 | + result.push_back(mi); | |
63 | + } | |
64 | +} | |
65 | + | |
66 | +void Generator::generate(const string& lemma, vector<MorphInterpretation>& result) const { | |
67 | + const char* currInput = lemma.c_str(); | |
68 | + const char* inputEnd = currInput + lemma.length(); | |
69 | + vector<uint32_t> codepoints; | |
70 | + SynthStateType state = this->fsa->getInitialState(); | |
71 | + while (currInput != inputEnd && !state.isSink()) { | |
72 | + uint32_t codepoint = this->env.getCharsetConverter().next(currInput, inputEnd); | |
73 | + feedState(state, codepoint, this->env.getCharsetConverter()); | |
74 | + codepoints.push_back(codepoint); | |
75 | + } | |
76 | + if (state.isAccepting()) { | |
77 | + vector<EncodedGeneratorInterpretation> encodedRes = state.getValue(); | |
78 | + decodeRes(encodedRes, lemma, codepoints, result); | |
79 | + } | |
80 | +} | |
... | ... |
morfeusz/Generator.hpp
0 → 100644
1 | +/* | |
2 | + * File: Generator.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 21 styczeń 2014, 14:38 | |
6 | + */ | |
7 | + | |
8 | +#ifndef GENERATOR_HPP | |
9 | +#define GENERATOR_HPP | |
10 | + | |
11 | +#include <string> | |
12 | +#include <vector> | |
13 | +#include "charset/CharsetConverter.hpp" | |
14 | +#include "MorphInterpretation.hpp" | |
15 | +#include "Tagset.hpp" | |
16 | +#include "GeneratorDeserializer.hpp" | |
17 | + | |
18 | +typedef FSA< std::vector<EncodedGeneratorInterpretation > > SynthFSAType; | |
19 | +typedef State< std::vector<EncodedGeneratorInterpretation > > SynthStateType; | |
20 | + | |
/**
 * Morphological generator: looks a lemma up in a serialized automaton and
 * returns the forms stored for it.
 */
class Generator {
public:
    // `ptr` is the start of the serialized automaton; `env` is kept by
    // reference and must outlive this object.
    Generator(
            const unsigned char* ptr,
            const Environment& env);
    // Appends the interpretations found for `lemma` to `result`.
    void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const;
    virtual ~Generator();
private:
    // Declaration order matters: the constructor builds `fsa` from `deserializer`.
    GeneratorDeserializer deserializer;
    const SynthFSAType* fsa;
    const Environment& env;

    // Rebuilds a form from the lemma's code points and an encoded cut/append pair.
    std::string decodeOrth(
            const EncodedOrth& orth,
            const std::vector<uint32_t>& lemmaCodepoints) const;

    // Turns the encoded automaton results into MorphInterpretation objects.
    void decodeRes(
            const std::vector<EncodedGeneratorInterpretation>& encodedRes,
            const std::string& lemma,
            const std::vector<uint32_t>& lemmaCodepoints,
            std::vector<MorphInterpretation>& result) const;
};
44 | + | |
45 | +#endif /* GENERATOR_HPP */ | |
46 | + | |
... | ... |
morfeusz/GeneratorDeserializer.cpp
... | ... | @@ -6,24 +6,47 @@ |
6 | 6 | */ |
7 | 7 | |
8 | 8 | #include "GeneratorDeserializer.hpp" |
9 | +#include "EncodedGeneratorInterpretation.hpp" | |
9 | 10 | |
10 | 11 | using namespace std; |
11 | 12 | |
12 | -GeneratorDeserializer::GeneratorDeserializer(const string& lemma) | |
13 | -: lemma(&lemma) { | |
14 | - | |
13 | +GeneratorDeserializer::GeneratorDeserializer(const Environment& env) | |
14 | +: env(env) { | |
15 | + | |
16 | +} | |
17 | + | |
18 | +void GeneratorDeserializer::deserializeOrth(const unsigned char*& ptr, EncodedOrth& orth) const { | |
19 | + // XXX uważać na poprawność danych | |
20 | + orth.suffixToCut = *ptr; | |
21 | + ptr++; | |
22 | + orth.suffixToAdd = (const char*) ptr; | |
23 | + ptr += strlen((const char*) ptr) + 1; | |
15 | 24 | } |
16 | 25 | |
17 | -void GeneratorDeserializer::setCurrentLemma(const string& lemma) { | |
18 | - this->lemma = &lemma; | |
26 | +void GeneratorDeserializer::deserializeInterp(const unsigned char*& ptr, EncodedGeneratorInterpretation& interp) const { | |
27 | + deserializeOrth(ptr, interp.orth); | |
28 | + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); | |
29 | + ptr += 2; | |
30 | + interp.nameClassifier = *ptr; | |
31 | + ptr++; | |
19 | 32 | } |
20 | 33 | |
21 | 34 | long GeneratorDeserializer::deserialize( |
22 | 35 | const unsigned char* ptr, |
23 | - std::vector<MorphInterpretation>& interps) const { | |
24 | - | |
36 | + std::vector<EncodedGeneratorInterpretation>& interps) const { | |
37 | + const unsigned char* currPtr = ptr; | |
38 | + uint8_t interpsNum = *ptr; | |
39 | + interps.clear(); | |
40 | + interps.reserve(interpsNum); | |
41 | + currPtr++; | |
42 | + for (unsigned int i = 0; i < interpsNum; ++i) { | |
43 | + EncodedGeneratorInterpretation interp; | |
44 | + this->deserializeInterp(currPtr, interp); | |
45 | + interps.push_back(interp); | |
46 | + } | |
47 | + return currPtr - ptr; | |
25 | 48 | } |
26 | 49 | |
27 | 50 | GeneratorDeserializer::~GeneratorDeserializer() { |
28 | - | |
51 | + | |
29 | 52 | } |
... | ... |
morfeusz/GeneratorDeserializer.hpp
... | ... | @@ -5,25 +5,29 @@ |
5 | 5 | * Created on 20 styczeń 2014, 17:14 |
6 | 6 | */ |
7 | 7 | |
8 | -#ifndef GENERATORDESERIALIZER_HPP | |
9 | -#define GENERATORDESERIALIZER_HPP | |
8 | +#ifndef SYNTHDESERIALIZER_HPP | |
9 | +#define SYNTHDESERIALIZER_HPP | |
10 | 10 | |
11 | 11 | #include <string> |
12 | 12 | #include <vector> |
13 | 13 | #include "fsa/fsa.hpp" |
14 | -#include "MorphInterpretation.hpp" | |
14 | +#include "Tagset.hpp" | |
15 | +#include "EncodedGeneratorInterpretation.hpp" | |
16 | +#include "Environment.hpp" | |
15 | 17 | |
16 | -class GeneratorDeserializer: public Deserializer< std::vector<MorphInterpretation> > { | |
18 | +class GeneratorDeserializer: public Deserializer< std::vector<EncodedGeneratorInterpretation> > { | |
17 | 19 | public: |
18 | - GeneratorDeserializer(const std::string& lemma); | |
19 | - void setCurrentLemma(const std::string& lemma); | |
20 | + explicit GeneratorDeserializer(const Environment& env); | |
20 | 21 | long deserialize( |
21 | 22 | const unsigned char* ptr, |
22 | - std::vector<MorphInterpretation>& interps) const; | |
23 | + std::vector<EncodedGeneratorInterpretation>& interps) const; | |
23 | 24 | virtual ~GeneratorDeserializer(); |
24 | 25 | private: |
25 | - const std::string* lemma; | |
26 | + const Environment& env; | |
27 | + | |
28 | + void deserializeInterp(const unsigned char*& ptr, EncodedGeneratorInterpretation& interp) const; | |
29 | + void deserializeOrth(const unsigned char*& ptr, EncodedOrth& orth) const; | |
26 | 30 | }; |
27 | 31 | |
28 | -#endif /* GENERATORDESERIALIZER_HPP */ | |
32 | +#endif /* SYNTHDESERIALIZER_HPP */ | |
29 | 33 | |
... | ... |
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -13,18 +13,13 @@ |
13 | 13 | #include "InterpretedChunk.hpp" |
14 | 14 | #include "EncodedInterpretation.hpp" |
15 | 15 | #include "charset/CaseConverter.hpp" |
16 | +#include "Environment.hpp" | |
16 | 17 | |
17 | 18 | class InterpretedChunksDecoder { |
18 | 19 | public: |
19 | 20 | |
20 | - InterpretedChunksDecoder( | |
21 | - const Tagset& tagset, | |
22 | - const CharsetConverter& charsetConverter, | |
23 | - const CaseConverter& caseConverter) | |
24 | - : tagset(tagset), | |
25 | - charsetConverter(charsetConverter), | |
26 | - utf8CharsetConverter(), | |
27 | - caseConverter(caseConverter) { | |
21 | + InterpretedChunksDecoder(const Environment& env) | |
22 | + : env(env) { | |
28 | 23 | |
29 | 24 | } |
30 | 25 | |
... | ... | @@ -34,7 +29,7 @@ public: |
34 | 29 | unsigned int endNode, |
35 | 30 | const InterpretedChunk& interpretedChunk, |
36 | 31 | OutputIterator out) { |
37 | - string orth = charsetConverter.toString(interpretedChunk.originalCodepoints); | |
32 | + string orth = env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); | |
38 | 33 | for (unsigned int i = 0; i < interpretedChunk.interpsGroup.interps.size(); i++) { |
39 | 34 | const EncodedInterpretation& ei = interpretedChunk.interpsGroup.interps[i]; |
40 | 35 | string lemma = convertLemma( |
... | ... | @@ -45,8 +40,8 @@ public: |
45 | 40 | orth, lemma, |
46 | 41 | ei.tag, |
47 | 42 | ei.nameClassifier, |
48 | - tagset, | |
49 | - charsetConverter); | |
43 | + env.getAnalyzerTagset(), | |
44 | + env.getCharsetConverter()); | |
50 | 45 | ++out; |
51 | 46 | } |
52 | 47 | return out; |
... | ... | @@ -61,28 +56,20 @@ private: |
61 | 56 | for (unsigned int i = 0; i < orth.size() - lemma.suffixToCut; i++) { |
62 | 57 | uint32_t cp = |
63 | 58 | (i < lemma.casePattern.size() && lemma.casePattern[i]) |
64 | - ? this->caseConverter.toTitle(orth[i]) | |
59 | + ? env.getCaseConverter().toTitle(orth[i]) | |
65 | 60 | : orth[i]; |
66 | - charsetConverter.append(cp, res); | |
61 | + env.getCharsetConverter().append(cp, res); | |
67 | 62 | } |
68 | 63 | const char* suffixPtr = lemma.suffixToAdd.c_str(); |
69 | 64 | const char* suffixEnd = suffixPtr + lemma.suffixToAdd.length(); |
70 | 65 | while (suffixPtr != suffixEnd) { |
71 | - uint32_t cp = utf8CharsetConverter.next(suffixPtr, suffixEnd); | |
72 | - charsetConverter.append(cp, res); | |
66 | + uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
67 | + env.getCharsetConverter().append(cp, res); | |
73 | 68 | } |
74 | - // string res(orth); | |
75 | - // res.erase( | |
76 | - // res.end() - lemma.suffixToCut, | |
77 | - // res.end()); | |
78 | - // res.append(lemma.suffixToAdd); | |
79 | 69 | return res; |
80 | 70 | } |
81 | 71 | |
82 | - const Tagset& tagset; | |
83 | - const CharsetConverter& charsetConverter; | |
84 | - const UTF8CharsetConverter utf8CharsetConverter; | |
85 | - const CaseConverter& caseConverter; | |
72 | + const Environment& env; | |
86 | 73 | }; |
87 | 74 | |
88 | 75 | #endif /* INTERPSGROUPDECODER_HPP */ |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -12,6 +12,7 @@ |
12 | 12 | #include "data/default_fsa.hpp" |
13 | 13 | #include "Morfeusz.hpp" |
14 | 14 | #include "MorphDeserializer.hpp" |
15 | +#include "GeneratorDeserializer.hpp" | |
15 | 16 | #include "InterpretedChunksDecoder.hpp" |
16 | 17 | #include "charset/CharsetConverter.hpp" |
17 | 18 | #include "charset/charset_utils.hpp" |
... | ... | @@ -22,56 +23,61 @@ |
22 | 23 | |
23 | 24 | using namespace std; |
24 | 25 | |
25 | -static Deserializer<vector<InterpsGroup> >* initializeDeserializer() { | |
26 | +static Deserializer<vector<InterpsGroup> >* initializeAnalyzerDeserializer() { | |
26 | 27 | static Deserializer < vector < InterpsGroup > > *deserializer |
27 | 28 | = new MorphDeserializer(); |
28 | 29 | return deserializer; |
29 | 30 | } |
30 | 31 | |
31 | -static FSA<vector<InterpsGroup > > *initializeFSA(const string& filename) { | |
32 | +static FSA<vector<InterpsGroup > > *initializeAnalyzerFSA(const string& filename) { | |
32 | 33 | cerr << "initialize FSA" << endl; |
33 | - return FSA < vector < InterpsGroup > > ::getFSA(filename, *initializeDeserializer()); | |
34 | -} | |
35 | - | |
36 | -static CharsetConverter* getCharsetConverter(MorfeuszCharset charset) { | |
37 | - cerr << "initialize charset converter for " << charset << endl; | |
38 | - static CharsetConverter* utf8Converter = new UTF8CharsetConverter(); | |
39 | -// static CharsetConverter* utf16LEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::UTF16CharsetConverter::LE); | |
40 | -// static CharsetConverter* utf16BEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::Endianness::BE); | |
41 | - static CharsetConverter* iso8859_2Converter = new ISO8859_2_CharsetConverter(); | |
42 | - static CharsetConverter* windows1250Converter = new Windows_1250_CharsetConverter(); | |
43 | - static CharsetConverter* cp852Converter = new CP852_CharsetConverter(); | |
44 | - switch (charset) { | |
45 | - case UTF8: | |
46 | - return utf8Converter; | |
47 | - case ISO8859_2: | |
48 | - return iso8859_2Converter; | |
49 | - case CP1250: | |
50 | - return windows1250Converter; | |
51 | - case CP852: | |
52 | - return cp852Converter; | |
53 | - default: | |
54 | - throw MorfeuszException("invalid charset"); | |
55 | - } | |
56 | -} | |
57 | - | |
58 | -static Tagset* initializeTagset(const string& filename) { | |
59 | - cerr << "initialize tagset" << endl; | |
60 | - static Tagset* tagset = new Tagset(readFile<unsigned char>(filename.c_str())); | |
61 | - return tagset; | |
62 | -} | |
63 | - | |
64 | -static Tagset* initializeTagset(const unsigned char* data) { | |
65 | - cerr << "initialize tagset" << endl; | |
66 | - static Tagset* tagset = new Tagset(data); | |
67 | - return tagset; | |
68 | -} | |
69 | - | |
70 | -static CaseConverter* initializeCaseConverter() { | |
71 | - cerr << "initialize case converter" << endl; | |
72 | - static CaseConverter* cc = new CaseConverter(); | |
73 | - return cc; | |
74 | -} | |
34 | + return FSA < vector < InterpsGroup > > ::getFSA(filename, *initializeAnalyzerDeserializer()); | |
35 | +} | |
36 | + | |
37 | +//static FSA<vector<MorphInterpretation > > *initializeSynthFSA(const string& filename, const SynthDeserializer& deserializer) { | |
38 | +// cerr << "initialize synth FSA" << endl; | |
39 | +// return FSA < vector < EncodedGeneratorInterpretation > > ::getFSA(filename, deserializer); | |
40 | +//} | |
41 | +// | |
42 | +//static CharsetConverter* getCharsetConverter(MorfeuszCharset charset) { | |
43 | +// cerr << "initialize charset converter for " << charset << endl; | |
44 | +// static CharsetConverter* utf8Converter = new UTF8CharsetConverter(); | |
45 | +//// static CharsetConverter* utf16LEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::UTF16CharsetConverter::LE); | |
46 | +//// static CharsetConverter* utf16BEConverter = new UTF16CharsetConverter(UTF16CharsetConverter::Endianness::BE); | |
47 | +// static CharsetConverter* iso8859_2Converter = new ISO8859_2_CharsetConverter(); | |
48 | +// static CharsetConverter* windows1250Converter = new Windows_1250_CharsetConverter(); | |
49 | +// static CharsetConverter* cp852Converter = new CP852_CharsetConverter(); | |
50 | +// switch (charset) { | |
51 | +// case UTF8: | |
52 | +// return utf8Converter; | |
53 | +// case ISO8859_2: | |
54 | +// return iso8859_2Converter; | |
55 | +// case CP1250: | |
56 | +// return windows1250Converter; | |
57 | +// case CP852: | |
58 | +// return cp852Converter; | |
59 | +// default: | |
60 | +// throw MorfeuszException("invalid charset"); | |
61 | +// } | |
62 | +//} | |
63 | +// | |
64 | +//static Tagset* initializeTagset(const string& filename) { | |
65 | +// cerr << "initialize tagset" << endl; | |
66 | +// static Tagset* tagset = new Tagset(readFile<unsigned char>(filename.c_str())); | |
67 | +// return tagset; | |
68 | +//} | |
69 | +// | |
70 | +//static Tagset* initializeTagset(const unsigned char* data) { | |
71 | +// cerr << "initialize tagset" << endl; | |
72 | +// static Tagset* tagset = new Tagset(data); | |
73 | +// return tagset; | |
74 | +//} | |
75 | +// | |
76 | +//static CaseConverter* initializeCaseConverter() { | |
77 | +// cerr << "initialize case converter" << endl; | |
78 | +// static CaseConverter* cc = new CaseConverter(); | |
79 | +// return cc; | |
80 | +//} | |
75 | 81 | |
76 | 82 | static MorfeuszOptions createDefaultOptions() { |
77 | 83 | MorfeuszOptions res; |
... | ... | @@ -81,44 +87,44 @@ static MorfeuszOptions createDefaultOptions() { |
81 | 87 | } |
82 | 88 | |
83 | 89 | Morfeusz::Morfeusz() |
84 | -: fsa(FSAType::getFSA(DEFAULT_FSA, *initializeDeserializer())), | |
85 | -charsetConverter(getCharsetConverter(DEFAULT_MORFEUSZ_CHARSET)), | |
86 | -tagset(initializeTagset(DEFAULT_FSA)), | |
87 | -caseConverter(initializeCaseConverter()), | |
90 | +: env(Tagset(DEFAULT_FSA), Tagset(DEFAULT_SYNTH_FSA), DEFAULT_MORFEUSZ_CHARSET), | |
91 | +analyzerFSA(FSAType::getFSA(DEFAULT_FSA, *initializeAnalyzerDeserializer())), | |
92 | +isAnalyzerFSAFromFile(false), | |
93 | +generator(DEFAULT_SYNTH_FSA, env), | |
88 | 94 | options(createDefaultOptions()) { |
89 | 95 | |
90 | 96 | } |
91 | 97 | |
92 | -Morfeusz::Morfeusz(const string& filename) | |
93 | -: fsa(initializeFSA(filename)), | |
94 | -charsetConverter(getCharsetConverter(DEFAULT_MORFEUSZ_CHARSET)), | |
95 | -tagset(initializeTagset(filename)), | |
96 | -caseConverter(initializeCaseConverter()), | |
97 | -options(createDefaultOptions()) { | |
98 | - | |
98 | +void Morfeusz::setAnalyzerFile(const string& filename) { | |
99 | + if (this->isAnalyzerFSAFromFile) { | |
100 | + delete this->analyzerFSA; | |
101 | + } | |
102 | + this->analyzerFSA = initializeAnalyzerFSA(filename); | |
103 | + this->isAnalyzerFSAFromFile = true; | |
99 | 104 | } |
100 | 105 | |
101 | 106 | Morfeusz::~Morfeusz() { |
102 | - // delete &this->fsa; | |
103 | - // delete &this->charsetConverter; | |
107 | + if (this->isAnalyzerFSAFromFile) { | |
108 | + delete this->analyzerFSA; | |
109 | + } | |
104 | 110 | } |
105 | 111 | |
106 | -void Morfeusz::processOneWord( | |
112 | +void Morfeusz::analyzeOneWord( | |
107 | 113 | const char*& inputData, |
108 | 114 | const char* inputEnd, |
109 | 115 | int startNodeNum, |
110 | 116 | std::vector<MorphInterpretation>& results) const { |
111 | 117 | while (inputData != inputEnd |
112 | - && isEndOfWord(this->charsetConverter->peek(inputData, inputEnd))) { | |
113 | - this->charsetConverter->next(inputData, inputEnd); | |
118 | + && isEndOfWord(this->env.getCharsetConverter().peek(inputData, inputEnd))) { | |
119 | + this->env.getCharsetConverter().next(inputData, inputEnd); | |
114 | 120 | } |
115 | 121 | const char* wordStart = inputData; |
116 | 122 | vector<InterpretedChunk> accum; |
117 | 123 | FlexionGraph graph; |
118 | 124 | const char* currInput = inputData; |
119 | - doProcessOneWord(currInput, inputEnd, accum, graph); | |
125 | + doAnalyzeOneWord(currInput, inputEnd, accum, graph); | |
120 | 126 | if (!graph.empty()) { |
121 | - InterpretedChunksDecoder interpretedChunksDecoder(*tagset, *charsetConverter, *caseConverter); | |
127 | + InterpretedChunksDecoder interpretedChunksDecoder(env); | |
122 | 128 | int srcNode = startNodeNum; |
123 | 129 | for (unsigned int i = 0; i < graph.getTheGraph().size(); i++) { |
124 | 130 | vector<FlexionGraph::Edge>& edges = graph.getTheGraph()[i]; |
... | ... | @@ -136,25 +142,25 @@ void Morfeusz::processOneWord( |
136 | 142 | inputData = currInput; |
137 | 143 | } |
138 | 144 | |
139 | -void Morfeusz::doProcessOneWord( | |
145 | +void Morfeusz::doAnalyzeOneWord( | |
140 | 146 | const char*& inputData, |
141 | 147 | const char* inputEnd, |
142 | 148 | vector<InterpretedChunk>& accum, |
143 | 149 | FlexionGraph& graph) const { |
144 | 150 | bool endOfWord = inputData == inputEnd; |
145 | 151 | const char* currInput = inputData; |
146 | - uint32_t codepoint = endOfWord ? 0 : this->charsetConverter->next(currInput, inputEnd); | |
152 | + uint32_t codepoint = endOfWord ? 0 : this->env.getCharsetConverter().next(currInput, inputEnd); | |
147 | 153 | // UnicodeChunk uchunk(*(this->charsetConverter), *(this->caseConverter)); |
148 | 154 | vector<uint32_t> originalCodepoints; |
149 | 155 | vector<uint32_t> lowercaseCodepoints; |
150 | 156 | |
151 | - StateType state = this->fsa->getInitialState(); | |
157 | + StateType state = this->analyzerFSA->getInitialState(); | |
152 | 158 | |
153 | 159 | while (!isEndOfWord(codepoint)) { |
154 | - uint32_t lowerCP = this->caseConverter->toLower(codepoint); | |
160 | + uint32_t lowerCP = this->env.getCaseConverter().toLower(codepoint); | |
155 | 161 | originalCodepoints.push_back(codepoint); |
156 | 162 | lowercaseCodepoints.push_back(lowerCP); |
157 | - this->feedState(state, lowerCP); | |
163 | + feedState(state, lowerCP, UTF8CharsetConverter()); | |
158 | 164 | if (state.isAccepting()) { |
159 | 165 | vector< InterpsGroup > val(state.getValue()); |
160 | 166 | for (unsigned int i = 0; i < val.size(); i++) { |
... | ... | @@ -162,13 +168,13 @@ void Morfeusz::doProcessOneWord( |
162 | 168 | InterpretedChunk ic = {inputData, originalCodepoints, lowercaseCodepoints, ig}; |
163 | 169 | accum.push_back(ic); |
164 | 170 | const char* newCurrInput = currInput; |
165 | - doProcessOneWord(newCurrInput, inputEnd, accum, graph); | |
171 | + doAnalyzeOneWord(newCurrInput, inputEnd, accum, graph); | |
166 | 172 | accum.pop_back(); |
167 | 173 | } |
168 | 174 | } |
169 | - codepoint = currInput == inputEnd ? 0 : this->charsetConverter->peek(currInput, inputEnd); | |
175 | + codepoint = currInput == inputEnd ? 0 : this->env.getCharsetConverter().peek(currInput, inputEnd); | |
170 | 176 | if (!isEndOfWord(codepoint)) { |
171 | - this->charsetConverter->next(currInput, inputEnd); | |
177 | + this->env.getCharsetConverter().next(currInput, inputEnd); | |
172 | 178 | } |
173 | 179 | } |
174 | 180 | if (state.isAccepting()) { |
... | ... | @@ -184,28 +190,20 @@ void Morfeusz::doProcessOneWord( |
184 | 190 | inputData = currInput; |
185 | 191 | } |
186 | 192 | |
187 | -void Morfeusz::feedState( | |
188 | - StateType& state, | |
189 | - int codepoint) const { | |
190 | - string chars; | |
191 | - this->utf8CharsetConverter.append(codepoint, chars); | |
192 | - for (unsigned int i = 0; i < chars.length(); i++) { | |
193 | - state.proceedToNext(chars[i]); | |
194 | - } | |
195 | -} | |
196 | - | |
197 | 193 | void Morfeusz::appendIgnotiumToResults( |
198 | 194 | const string& word, |
199 | 195 | int startNodeNum, |
200 | 196 | std::vector<MorphInterpretation>& results) const { |
201 | - MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, *this->tagset, *this->charsetConverter); | |
197 | + MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, env.getAnalyzerTagset(), env.getCharsetConverter()); | |
202 | 198 | results.push_back(interp); |
203 | 199 | } |
204 | 200 | |
205 | 201 | ResultsIterator Morfeusz::analyze(const string& text) const { |
206 | 202 | // const char* textStart = text.c_str(); |
207 | 203 | // const char* textEnd = text.c_str() + text.length(); |
208 | - return ResultsIterator(text, *this); | |
204 | + vector<MorphInterpretation> res; | |
205 | + this->analyze(text, res); | |
206 | + return ResultsIterator(res); | |
209 | 207 | } |
210 | 208 | |
211 | 209 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) const { |
... | ... | @@ -213,21 +211,28 @@ void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) |
213 | 211 | const char* inputEnd = input + text.length(); |
214 | 212 | while (input != inputEnd) { |
215 | 213 | int startNode = results.empty() ? 0 : results.back().getEndNode(); |
216 | - DEBUG("process " + string(input, inputEnd)); | |
217 | - this->processOneWord(input, inputEnd, startNode, results); | |
214 | + this->analyzeOneWord(input, inputEnd, startNode, results); | |
218 | 215 | } |
219 | 216 | } |
220 | 217 | |
221 | -void Morfeusz::setEncoding(MorfeuszCharset encoding) { | |
222 | - this->options.encoding = encoding; | |
223 | - this->charsetConverter = getCharsetConverter(encoding); | |
218 | +ResultsIterator Morfeusz::generate(const string& text) const { | |
219 | + // const char* textStart = text.c_str(); | |
220 | + // const char* textEnd = text.c_str() + text.length(); | |
221 | + vector<MorphInterpretation> res; | |
222 | + this->generate(text, res); | |
223 | + return ResultsIterator(res); | |
224 | +} | |
225 | + | |
226 | +void Morfeusz::generate(const string& text, vector<MorphInterpretation>& results) const { | |
227 | + this->generator.generate(text, results); | |
224 | 228 | } |
225 | 229 | |
226 | -ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) | |
227 | -: rawInput(text.c_str()), | |
228 | -morfeusz(morfeusz) { | |
229 | - vector<MorphInterpretation> res; | |
230 | - morfeusz.analyze(text, res); | |
230 | +void Morfeusz::setCharset(MorfeuszCharset charset) { | |
231 | + this->options.encoding = charset; | |
232 | + this->env.setCharset(charset); | |
233 | +} | |
234 | + | |
235 | +ResultsIterator::ResultsIterator(vector<MorphInterpretation>& res) { | |
231 | 236 | resultsBuffer.insert(resultsBuffer.begin(), res.begin(), res.end()); |
232 | 237 | } |
233 | 238 | |
... | ... |
morfeusz/Morfeusz.hpp
... | ... | @@ -21,71 +21,61 @@ |
21 | 21 | #include "FlexionGraph.hpp" |
22 | 22 | #include "MorfeuszOptions.hpp" |
23 | 23 | #include "const.hpp" |
24 | +#include "exceptions.hpp" | |
25 | +#include "Generator.hpp" | |
26 | +#include "Environment.hpp" | |
24 | 27 | |
25 | 28 | class Morfeusz; |
26 | 29 | class ResultsIterator; |
27 | 30 | |
28 | -typedef FSA<std::vector<InterpsGroup > > FSAType; | |
29 | -typedef State<std::vector<InterpsGroup > > StateType; | |
30 | - | |
31 | -class MorfeuszException : public std::exception { | |
32 | -public: | |
33 | - | |
34 | - MorfeuszException(const std::string& what) : msg(what.c_str()) { | |
35 | - } | |
36 | - | |
37 | - virtual ~MorfeuszException() throw () { | |
38 | - } | |
39 | - | |
40 | - virtual const char* what() const throw () { | |
41 | - return this->msg.c_str(); | |
42 | - } | |
43 | -private: | |
44 | - const std::string msg; | |
45 | -}; | |
31 | +typedef FSA< std::vector<InterpsGroup > > FSAType; | |
32 | +typedef State< std::vector<InterpsGroup > > StateType; | |
46 | 33 | |
47 | 34 | class Morfeusz { |
48 | 35 | public: |
49 | 36 | Morfeusz(); |
50 | - explicit Morfeusz(const std::string& filename); | |
37 | + // explicit Morfeusz(const std::string& filename); | |
38 | + void setAnalyzerFile(const std::string& filename); | |
39 | + void setSynthesizerFile(const std::string& filename); | |
51 | 40 | virtual ~Morfeusz(); |
52 | 41 | // Morfeusz(const Morfeusz& orig); |
53 | 42 | ResultsIterator analyze(const std::string& text) const; |
54 | 43 | void analyze(const std::string& text, std::vector<MorphInterpretation>& result) const; |
55 | 44 | |
56 | - void setEncoding(MorfeuszCharset encoding); | |
45 | + void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const; | |
46 | + ResultsIterator generate(const std::string& lemma) const; | |
47 | + | |
48 | + void setCharset(MorfeuszCharset encoding); | |
57 | 49 | |
58 | 50 | // Morfeusz(); |
59 | 51 | friend class ResultsIterator; |
60 | 52 | private: |
61 | 53 | |
62 | - void processOneWord( | |
54 | + void analyzeOneWord( | |
63 | 55 | const char*& inputData, |
64 | 56 | const char* inputEnd, |
65 | 57 | int startNodeNum, |
66 | 58 | std::vector<MorphInterpretation>& result) const; |
67 | 59 | |
68 | - void doProcessOneWord( | |
60 | + void doAnalyzeOneWord( | |
69 | 61 | const char*& inputData, |
70 | 62 | const char* inputEnd, |
71 | 63 | std::vector<InterpretedChunk>& accum, |
72 | 64 | FlexionGraph& graph) const; |
73 | 65 | |
74 | - void feedState( | |
75 | - StateType& state, | |
76 | - int codepoint) const; | |
77 | - | |
78 | 66 | void appendIgnotiumToResults( |
79 | 67 | const std::string& word, |
80 | 68 | int startNodeNum, |
81 | 69 | std::vector<MorphInterpretation>& results) const; |
82 | - | |
83 | - FSAType* fsa; | |
84 | - CharsetConverter* charsetConverter; | |
85 | - Tagset* tagset; | |
86 | - CaseConverter* caseConverter; | |
87 | - | |
88 | - UTF8CharsetConverter utf8CharsetConverter; | |
70 | + Environment env; | |
71 | + FSAType* analyzerFSA; | |
72 | + bool isAnalyzerFSAFromFile; | |
73 | + Generator generator; | |
74 | +// const CharsetConverter* charsetConverter; | |
75 | +// const Tagset* tagset; | |
76 | +// const CaseConverter* caseConverter; | |
77 | +// | |
78 | +// UTF8CharsetConverter utf8CharsetConverter; | |
89 | 79 | |
90 | 80 | MorfeuszOptions options; |
91 | 81 | }; |
... | ... | @@ -96,9 +86,8 @@ public: |
96 | 86 | bool hasNext(); |
97 | 87 | friend class Morfeusz; |
98 | 88 | private: |
99 | - ResultsIterator(const std::string& text, const Morfeusz& morfeusz); | |
89 | + ResultsIterator(vector<MorphInterpretation>& res); | |
100 | 90 | const char* rawInput; |
101 | - const Morfeusz& morfeusz; | |
102 | 91 | std::list<MorphInterpretation> resultsBuffer; |
103 | 92 | int startNode; |
104 | 93 | }; |
... | ... |
morfeusz/Tagset.cpp
... | ... | @@ -36,6 +36,11 @@ Tagset::Tagset(const unsigned char* fsaData) { |
36 | 36 | readTags(currPtr, this->names); |
37 | 37 | } |
38 | 38 | |
39 | +//Tagset::Tagset(const Tagset& tagset) | |
40 | +//: tags(tagset.tags), names(tagset.names) { | |
41 | +// | |
42 | +//} | |
43 | + | |
39 | 44 | const string Tagset::getTag(const int tagNum, const CharsetConverter& charsetConverter) const { |
40 | 45 | return charsetConverter.fromUTF8(this->tags.at(tagNum)); |
41 | 46 | } |
... | ... |
morfeusz/Tagset.hpp
... | ... | @@ -15,6 +15,7 @@ |
15 | 15 | class Tagset { |
16 | 16 | public: |
17 | 17 | explicit Tagset(const unsigned char* fsaData); |
18 | +// Tagset(const Tagset& tagset); | |
18 | 19 | const std::string getTag(const int tagNum, const CharsetConverter& charsetConverter) const; |
19 | 20 | const std::string getName(const int nameNum, const CharsetConverter& charsetConverter) const; |
20 | 21 | private: |
... | ... |
morfeusz/charset/charset_utils.hpp
... | ... | @@ -8,7 +8,9 @@ |
8 | 8 | #ifndef CHARSET_UTILS_HPP |
9 | 9 | #define CHARSET_UTILS_HPP |
10 | 10 | |
11 | +#include <string> | |
11 | 12 | #include <set> |
13 | +#include "CharsetConverter.hpp" | |
12 | 14 | |
13 | 15 | static inline std::set<int> initializeWhitespaces() { |
14 | 16 | std::set<int> res; |
... | ... | @@ -18,10 +20,22 @@ static inline std::set<int> initializeWhitespaces() { |
18 | 20 | return res; |
19 | 21 | } |
20 | 22 | |
21 | -bool isEndOfWord(int codepoint) { | |
23 | +inline bool isEndOfWord(int codepoint) { | |
22 | 24 | static std::set<int> whitespaces(initializeWhitespaces()); |
23 | 25 | return whitespaces.count(codepoint); |
24 | 26 | } |
25 | 27 | |
28 | +template <class StateClass> | |
29 | +void feedState( | |
30 | + StateClass& state, | |
31 | + int codepoint, | |
32 | + const CharsetConverter& charsetConverter) { | |
33 | + std::string chars; | |
34 | + charsetConverter.append(codepoint, chars); | |
35 | + for (unsigned int i = 0; i < chars.length(); i++) { | |
36 | + state.proceedToNext(chars[i]); | |
37 | + } | |
38 | +} | |
39 | + | |
26 | 40 | #endif /* CHARSET_UTILS_HPP */ |
27 | 41 | |
... | ... |
morfeusz/data/default_fsa.hpp
morfeusz/exceptions.hpp
0 → 100644
1 | +/* | |
2 | + * File: exceptions.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 22 styczeń 2014, 13:16 | |
6 | + */ | |
7 | + | |
8 | +#ifndef EXCEPTIONS_HPP | |
9 | +#define EXCEPTIONS_HPP | |
10 | + | |
11 | +class MorfeuszException : public std::exception { | |
12 | +public: | |
13 | + | |
14 | + MorfeuszException(const std::string& what) : msg(what.c_str()) { | |
15 | + } | |
16 | + | |
17 | + virtual ~MorfeuszException() throw () { | |
18 | + } | |
19 | + | |
20 | + virtual const char* what() const throw () { | |
21 | + return this->msg.c_str(); | |
22 | + } | |
23 | +private: | |
24 | + const std::string msg; | |
25 | +}; | |
26 | + | |
27 | +#endif /* EXCEPTIONS_HPP */ | |
28 | + | |
... | ... |
morfeusz/fsa/cfsa1_impl.hpp
... | ... | @@ -118,10 +118,10 @@ void CompressedFSA1<T>::doProceedToNextByList( |
118 | 118 | currPtr += *currPtr + 1; |
119 | 119 | break; |
120 | 120 | case 2: |
121 | - currPtr += ntohs(*((uint16_t*) currPtr)) + 2; | |
121 | + currPtr += ntohs(*((const uint16_t*) currPtr)) + 2; | |
122 | 122 | break; |
123 | 123 | case 3: |
124 | - currPtr += (((unsigned int) ntohs(*((uint16_t*) currPtr))) << 8) + currPtr[2] + 3; | |
124 | + currPtr += (((const unsigned int) ntohs(*((const uint16_t*) currPtr))) << 8) + currPtr[2] + 3; | |
125 | 125 | break; |
126 | 126 | } |
127 | 127 | // cerr << "FOUND " << c << " " << currPtr - this->startPtr << endl; |
... | ... |
morfeusz/fsa/fsa_impl.hpp
... | ... | @@ -64,7 +64,7 @@ FSA<T>* FSA<T>::getFSA(const std::string& filename, const Deserializer<T>& deser |
64 | 64 | template <class T> |
65 | 65 | FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) { |
66 | 66 | |
67 | - uint32_t magicNumber = ntohl(*((uint32_t*) ptr)); | |
67 | + uint32_t magicNumber = ntohl(*((const uint32_t*) ptr)); | |
68 | 68 | if (magicNumber != MAGIC_NUMBER) { |
69 | 69 | throw FSAException("Invalid magic number"); |
70 | 70 | } |
... | ... |
morfeusz/fsa/simplefsa_impl.hpp
morfeusz/morfeusz.i
... | ... | @@ -12,6 +12,7 @@ |
12 | 12 | %{ |
13 | 13 | #include "Morfeusz.hpp" |
14 | 14 | #include "MorphInterpretation.hpp" |
15 | +#include "exceptions.hpp" | |
15 | 16 | #include "const.hpp" |
16 | 17 | %} |
17 | 18 | |
... | ... | @@ -49,6 +50,7 @@ |
49 | 50 | %include "Morfeusz.hpp" |
50 | 51 | %include "MorphInterpretation.hpp" |
51 | 52 | %include "const.hpp" |
53 | +%include "exceptions.hpp" | |
52 | 54 | |
53 | 55 | // instantiate vector of interpretations |
54 | 56 | namespace std { |
... | ... |
morfeusz/main.cpp renamed to morfeusz/morfeusz_analyzer.cpp
... | ... | @@ -18,10 +18,10 @@ using namespace std; |
18 | 18 | int main(int argc, char** argv) { |
19 | 19 | Morfeusz morfeusz; |
20 | 20 | #ifdef _WIN32 |
21 | - morfeusz.setEncoding(CP852); | |
21 | + morfeusz.setCharset(CP852); | |
22 | 22 | #endif |
23 | 23 | #ifdef _WIN64 |
24 | - morfeusz.setEncoding(CP852); | |
24 | + morfeusz.setCharset(CP852); | |
25 | 25 | #endif |
26 | 26 | string line; |
27 | 27 | while (getline(cin, line)) { |
... | ... | @@ -51,5 +51,3 @@ int main(int argc, char** argv) { |
51 | 51 | printf("\n"); |
52 | 52 | return 0; |
53 | 53 | } |
54 | - | |
55 | - | |
... | ... |
morfeusz/morfeusz_generator.cpp
0 → 100644
1 | +/* | |
2 | + * File: morfeusz_generator.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 21 styczeń 2014, 12:02 | |
6 | + */ | |
7 | + | |
8 | +#include <cstdlib> | |
9 | +#include <iostream> | |
10 | +#include <vector> | |
11 | +#include "fsa/fsa.hpp" | |
12 | +#include "Tagset.hpp" | |
13 | +#include "Morfeusz.hpp" | |
14 | +#include "const.hpp" | |
15 | + | |
16 | +using namespace std; | |
17 | + | |
18 | +int main(int argc, char** argv) { | |
19 | + Morfeusz morfeusz; | |
20 | +#ifdef _WIN32 | |
21 | + morfeusz.setCharset(CP852); | |
22 | +#endif | |
23 | +#ifdef _WIN64 | |
24 | + morfeusz.setCharset(CP852); | |
25 | +#endif | |
26 | + string line; | |
27 | + while (getline(cin, line)) { | |
28 | + // printf("%s\n", line.c_str()); | |
29 | + vector<MorphInterpretation> res; | |
30 | + morfeusz.generate(line, res); | |
31 | + printf("["); | |
32 | + for (unsigned int i = 0; i < res.size(); i++) { | |
33 | + if (i > 0) { | |
34 | + printf("; "); | |
35 | + } | |
36 | + MorphInterpretation& mi = res[i]; | |
37 | + printf("%s,%s,%s,%s", | |
38 | + mi.getOrth().c_str(), mi.getLemma().c_str(), | |
39 | + mi.getTag().c_str(), mi.getName().c_str()); | |
40 | + } | |
41 | + printf("]\n"); | |
42 | + } | |
43 | + printf("\n"); | |
44 | + return 0; | |
45 | +} | |
... | ... |
morfeusz/test_recognize_dict.cpp
... | ... | @@ -16,53 +16,12 @@ |
16 | 16 | |
17 | 17 | using namespace std; |
18 | 18 | |
19 | -//void doTest( | |
20 | -// const FSA<vector<InterpsGroup >> &fsa, | |
21 | -// const Tagset& tagset, | |
22 | -// // const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, | |
23 | -// const char* fname) { | |
24 | -// ifstream ifs; | |
25 | -// // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | |
26 | -// ifs.open(fname, ios::binary); | |
27 | -// string line; | |
28 | -// while (getline(ifs, line)) { | |
29 | -// vector<string> splitVector(split(line, '\t')); | |
30 | -// string orth = splitVector[0]; | |
31 | -// string lemma = splitVector[1]; | |
32 | -// string tag = splitVector[2]; | |
33 | -// string name = splitVector[3]; | |
34 | -// vector<InterpsGroup> value2; | |
35 | -// fsa.tryToRecognize(orth.c_str(), value2); | |
36 | -// DEBUG("recognized " + to_string(value2.size())); | |
37 | -// // vector<TaggedInterpretation> parsedValues; | |
38 | -// bool found = false; | |
39 | -// | |
40 | -// for (InterpsGroup ig : value2) | |
41 | -// for (MorphInterpretation interp : ig.getRealInterps(orth, 0, 0, tagset)) { | |
42 | -// // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); | |
43 | -// // (0, 0, orth, encodedInterp, tagset); | |
44 | -// // parsedValues.push_back(parsedValue); | |
45 | -// // debug(orth, parsedValue); | |
46 | -// if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { | |
47 | -// DEBUG("RECOGNIZED"); | |
48 | -// found = true; | |
49 | -// } | |
50 | -// else { | |
51 | -// DEBUG("not matching " + interp.getLemma() + " " + interp.getTag() + " " + interp.getName()); | |
52 | -// } | |
53 | -// } | |
54 | -// validate(found, "Failed to recognize " + orth + " " + lemma + ":" + tag + ":" + name); | |
55 | -// // debug(key, value2); | |
56 | -// // validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); | |
57 | -// } | |
58 | -// validate(ifs.eof(), "Failed to read the input file to the end"); | |
59 | -//} | |
60 | - | |
61 | 19 | int main(int argc, char** argv) { |
62 | 20 | validate(argc == 3, "Must provide exactly 2 arguments - input FSA filename and dictionary filename."); |
63 | 21 | string fsaFilename = argv[1]; |
64 | 22 | string dictFilename = argv[2]; |
65 | - Morfeusz morfeusz(fsaFilename); | |
23 | + Morfeusz morfeusz; | |
24 | + morfeusz.setAnalyzerFile(fsaFilename); | |
66 | 25 | ifstream in; |
67 | 26 | in.open(dictFilename.c_str()); |
68 | 27 | string line; |
... | ... |
morfeusz/test_result_equals.cpp
... | ... | @@ -48,7 +48,7 @@ int main(int argc, char** argv) { |
48 | 48 | Morfeusz morfeusz; |
49 | 49 | if (argc == 4) { |
50 | 50 | MorfeuszCharset encoding = getEncoding(argv[3]); |
51 | - morfeusz.setEncoding(encoding); | |
51 | + morfeusz.setCharset(encoding); | |
52 | 52 | } |
53 | 53 | string line; |
54 | 54 | while (getline(in, line)) { |
... | ... |
morfeusz/test_synth_dict.cpp
0 → 100644
morfeusz/utils.hpp
... | ... | @@ -81,7 +81,8 @@ void appendMorfeuszResults(const std::vector<MorphInterpretation>& res, OutputSt |
81 | 81 | if (prevStart != -1 |
82 | 82 | && (prevStart != mi.getStartNode() || prevEnd != mi.getEndNode())) { |
83 | 83 | out << "]\n["; |
84 | - } else if (prevStart != -1) { | |
84 | + } | |
85 | + else if (prevStart != -1) { | |
85 | 86 | out << "; "; |
86 | 87 | } |
87 | 88 | out << mi.getStartNode() << "," |
... | ... |
nbproject/configurations.xml
1 | 1 | <?xml version="1.0" encoding="UTF-8"?> |
2 | 2 | <configurationDescriptor version="90"> |
3 | 3 | <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> |
4 | + <logicalFolder name="build" | |
5 | + displayName="build" | |
6 | + projectFiles="true" | |
7 | + root="build"> | |
8 | + <itemPath>build/default_fsa.cpp</itemPath> | |
9 | + <itemPath>build/default_synth_fsa.cpp</itemPath> | |
10 | + </logicalFolder> | |
4 | 11 | <logicalFolder name="f1" displayName="input" projectFiles="true"> |
5 | 12 | </logicalFolder> |
6 | 13 | <df root="morfeusz" name="0"> |
... | ... | @@ -19,21 +26,21 @@ |
19 | 26 | <in>test_recognize.cpp</in> |
20 | 27 | <in>test_speed.cpp</in> |
21 | 28 | </df> |
22 | - <df name="generator"> | |
23 | - <in>EncodedGeneratorInterpretation.hpp</in> | |
24 | - <in>GeneratorDeserializer.cpp</in> | |
25 | - <in>GeneratorDeserializer.hpp</in> | |
26 | - </df> | |
29 | + <in>Environment.cpp</in> | |
27 | 30 | <in>FlexionGraph.cpp</in> |
31 | + <in>Generator.cpp</in> | |
32 | + <in>GeneratorDeserializer.cpp</in> | |
28 | 33 | <in>Morfeusz.cpp</in> |
29 | 34 | <in>MorphDeserializer.cpp</in> |
30 | 35 | <in>MorphInterpretation.cpp</in> |
31 | 36 | <in>Tagset.cpp</in> |
32 | - <in>Toolchain-Linux-amd64.cmake</in> | |
33 | 37 | <in>const.cpp</in> |
34 | - <in>main.cpp</in> | |
38 | + <in>exceptions.hpp</in> | |
39 | + <in>morfeusz_analyzer.cpp</in> | |
40 | + <in>morfeusz_generator.cpp</in> | |
35 | 41 | <in>test_recognize_dict.cpp</in> |
36 | 42 | <in>test_result_equals.cpp</in> |
43 | + <in>test_synth_dict.cpp</in> | |
37 | 44 | </df> |
38 | 45 | <logicalFolder name="morfeusz" |
39 | 46 | displayName="morfeusz" |
... | ... | @@ -76,9 +83,17 @@ |
76 | 83 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
77 | 84 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
78 | 85 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
79 | - <executablePath>build/morfeusz/test_result_equals</executablePath> | |
86 | + <executablePath>build/morfeusz/morfeusz_generator</executablePath> | |
80 | 87 | </makeTool> |
81 | 88 | </makefileType> |
89 | + <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> | |
90 | + <ccTool> | |
91 | + </ccTool> | |
92 | + </item> | |
93 | + <item path="build/default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> | |
94 | + <ccTool> | |
95 | + </ccTool> | |
96 | + </item> | |
82 | 97 | <item path="build/morfeusz/java/swigJAVA.cpp" ex="false" tool="1" flavor2="4"> |
83 | 98 | <ccTool> |
84 | 99 | </ccTool> |
... | ... | @@ -94,8 +109,13 @@ |
94 | 109 | <pElem>build/morfeusz/java</pElem> |
95 | 110 | </incDir> |
96 | 111 | <preprocessorList> |
112 | + <Elem>NDEBUG</Elem> | |
113 | + <Elem>_OPTIMIZE__=1</Elem> | |
97 | 114 | <Elem>jmorfeusz_EXPORTS</Elem> |
98 | 115 | </preprocessorList> |
116 | + <undefinedList> | |
117 | + <Elem>__NO_INLINE__</Elem> | |
118 | + </undefinedList> | |
99 | 119 | </ccTool> |
100 | 120 | </item> |
101 | 121 | <item path="build/morfeusz/morfeuszPYTHON_wrap.cxx" |
... | ... | @@ -109,8 +129,13 @@ |
109 | 129 | <pElem>build/morfeusz/python</pElem> |
110 | 130 | </incDir> |
111 | 131 | <preprocessorList> |
132 | + <Elem>NDEBUG</Elem> | |
133 | + <Elem>_OPTIMIZE__=1</Elem> | |
112 | 134 | <Elem>_morfeusz_EXPORTS</Elem> |
113 | 135 | </preprocessorList> |
136 | + <undefinedList> | |
137 | + <Elem>__NO_INLINE__</Elem> | |
138 | + </undefinedList> | |
114 | 139 | </ccTool> |
115 | 140 | </item> |
116 | 141 | <item path="build/morfeusz/python/swigPYTHON.cpp" |
... | ... | @@ -124,18 +149,16 @@ |
124 | 149 | <ccTool> |
125 | 150 | <incDir> |
126 | 151 | <pElem>build</pElem> |
152 | + <pElem>morfeusz</pElem> | |
127 | 153 | <pElem>build/morfeusz</pElem> |
128 | 154 | </incDir> |
129 | 155 | <preprocessorList> |
130 | - <Elem>NDEBUG</Elem> | |
131 | - <Elem>_OPTIMIZE__=1</Elem> | |
132 | 156 | <Elem>__PIC__=2</Elem> |
133 | 157 | <Elem>__pic__=2</Elem> |
134 | 158 | <Elem>libmorfeusz_EXPORTS</Elem> |
135 | 159 | </preprocessorList> |
136 | 160 | <undefinedList> |
137 | 161 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
138 | - <Elem>__NO_INLINE__</Elem> | |
139 | 162 | </undefinedList> |
140 | 163 | </ccTool> |
141 | 164 | </folder> |
... | ... | @@ -158,28 +181,42 @@ |
158 | 181 | </undefinedList> |
159 | 182 | </ccTool> |
160 | 183 | </folder> |
184 | + <folder path="build"> | |
185 | + <ccTool> | |
186 | + <incDir> | |
187 | + <pElem>build</pElem> | |
188 | + <pElem>morfeusz</pElem> | |
189 | + <pElem>build/morfeusz</pElem> | |
190 | + </incDir> | |
191 | + <preprocessorList> | |
192 | + <Elem>__PIC__=2</Elem> | |
193 | + <Elem>__pic__=2</Elem> | |
194 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
195 | + </preprocessorList> | |
196 | + <undefinedList> | |
197 | + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
198 | + </undefinedList> | |
199 | + </ccTool> | |
200 | + </folder> | |
161 | 201 | <folder path="morfeusz"> |
162 | 202 | <ccTool> |
163 | 203 | <incDir> |
164 | 204 | <pElem>build</pElem> |
165 | 205 | </incDir> |
166 | 206 | <preprocessorList> |
167 | - <Elem>NDEBUG</Elem> | |
168 | - <Elem>_OPTIMIZE__=1</Elem> | |
169 | 207 | <Elem>__PIC__=2</Elem> |
170 | 208 | <Elem>__pic__=2</Elem> |
171 | 209 | </preprocessorList> |
172 | 210 | <undefinedList> |
173 | 211 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
174 | - <Elem>__NO_INLINE__</Elem> | |
175 | 212 | </undefinedList> |
176 | 213 | </ccTool> |
177 | 214 | </folder> |
178 | 215 | <folder path="morfeusz/java"> |
179 | 216 | <ccTool> |
180 | 217 | <incDir> |
181 | - <pElem>/usr/lib/jvm/default-java/include</pElem> | |
182 | 218 | <pElem>morfeusz</pElem> |
219 | + <pElem>/usr/lib/jvm/default-java/include</pElem> | |
183 | 220 | </incDir> |
184 | 221 | <preprocessorList> |
185 | 222 | <Elem>jmorfeusz_EXPORTS</Elem> |
... | ... | @@ -193,26 +230,80 @@ |
193 | 230 | <pElem>morfeusz</pElem> |
194 | 231 | </incDir> |
195 | 232 | <preprocessorList> |
233 | + <Elem>NDEBUG</Elem> | |
234 | + <Elem>_OPTIMIZE__=1</Elem> | |
196 | 235 | <Elem>pymorfeusz_EXPORTS</Elem> |
197 | 236 | </preprocessorList> |
237 | + <undefinedList> | |
238 | + <Elem>__NO_INLINE__</Elem> | |
239 | + </undefinedList> | |
198 | 240 | </ccTool> |
199 | 241 | </folder> |
242 | + <item path="morfeusz/Environment.cpp" ex="false" tool="1" flavor2="4"> | |
243 | + <ccTool> | |
244 | + <incDir> | |
245 | + <pElem>build</pElem> | |
246 | + <pElem>morfeusz</pElem> | |
247 | + <pElem>build/morfeusz</pElem> | |
248 | + </incDir> | |
249 | + <preprocessorList> | |
250 | + <Elem>__PIC__=2</Elem> | |
251 | + <Elem>__pic__=2</Elem> | |
252 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
253 | + </preprocessorList> | |
254 | + <undefinedList> | |
255 | + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
256 | + </undefinedList> | |
257 | + </ccTool> | |
258 | + </item> | |
200 | 259 | <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="4"> |
201 | 260 | <ccTool> |
202 | 261 | <incDir> |
203 | 262 | <pElem>build</pElem> |
263 | + <pElem>morfeusz</pElem> | |
264 | + <pElem>build/morfeusz</pElem> | |
265 | + </incDir> | |
266 | + <preprocessorList> | |
267 | + <Elem>__PIC__=2</Elem> | |
268 | + <Elem>__pic__=2</Elem> | |
269 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
270 | + </preprocessorList> | |
271 | + <undefinedList> | |
272 | + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
273 | + </undefinedList> | |
274 | + </ccTool> | |
275 | + </item> | |
276 | + <item path="morfeusz/Generator.cpp" ex="false" tool="1" flavor2="4"> | |
277 | + <ccTool> | |
278 | + <incDir> | |
279 | + <pElem>build</pElem> | |
280 | + <pElem>morfeusz</pElem> | |
281 | + <pElem>build/morfeusz</pElem> | |
282 | + </incDir> | |
283 | + <preprocessorList> | |
284 | + <Elem>__PIC__=2</Elem> | |
285 | + <Elem>__pic__=2</Elem> | |
286 | + <Elem>libmorfeusz_EXPORTS</Elem> | |
287 | + </preprocessorList> | |
288 | + <undefinedList> | |
289 | + <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> | |
290 | + </undefinedList> | |
291 | + </ccTool> | |
292 | + </item> | |
293 | + <item path="morfeusz/GeneratorDeserializer.cpp" ex="false" tool="1" flavor2="4"> | |
294 | + <ccTool> | |
295 | + <incDir> | |
296 | + <pElem>build</pElem> | |
297 | + <pElem>morfeusz</pElem> | |
204 | 298 | <pElem>build/morfeusz</pElem> |
205 | 299 | </incDir> |
206 | 300 | <preprocessorList> |
207 | - <Elem>NDEBUG</Elem> | |
208 | - <Elem>_OPTIMIZE__=1</Elem> | |
209 | 301 | <Elem>__PIC__=2</Elem> |
210 | 302 | <Elem>__pic__=2</Elem> |
211 | 303 | <Elem>libmorfeusz_EXPORTS</Elem> |
212 | 304 | </preprocessorList> |
213 | 305 | <undefinedList> |
214 | 306 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
215 | - <Elem>__NO_INLINE__</Elem> | |
216 | 307 | </undefinedList> |
217 | 308 | </ccTool> |
218 | 309 | </item> |
... | ... | @@ -220,18 +311,16 @@ |
220 | 311 | <ccTool> |
221 | 312 | <incDir> |
222 | 313 | <pElem>build</pElem> |
314 | + <pElem>morfeusz</pElem> | |
223 | 315 | <pElem>build/morfeusz</pElem> |
224 | 316 | </incDir> |
225 | 317 | <preprocessorList> |
226 | - <Elem>NDEBUG</Elem> | |
227 | - <Elem>_OPTIMIZE__=1</Elem> | |
228 | 318 | <Elem>__PIC__=2</Elem> |
229 | 319 | <Elem>__pic__=2</Elem> |
230 | 320 | <Elem>libmorfeusz_EXPORTS</Elem> |
231 | 321 | </preprocessorList> |
232 | 322 | <undefinedList> |
233 | 323 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
234 | - <Elem>__NO_INLINE__</Elem> | |
235 | 324 | </undefinedList> |
236 | 325 | </ccTool> |
237 | 326 | </item> |
... | ... | @@ -239,18 +328,16 @@ |
239 | 328 | <ccTool> |
240 | 329 | <incDir> |
241 | 330 | <pElem>build</pElem> |
331 | + <pElem>morfeusz</pElem> | |
242 | 332 | <pElem>build/morfeusz</pElem> |
243 | 333 | </incDir> |
244 | 334 | <preprocessorList> |
245 | - <Elem>NDEBUG</Elem> | |
246 | - <Elem>_OPTIMIZE__=1</Elem> | |
247 | 335 | <Elem>__PIC__=2</Elem> |
248 | 336 | <Elem>__pic__=2</Elem> |
249 | 337 | <Elem>libmorfeusz_EXPORTS</Elem> |
250 | 338 | </preprocessorList> |
251 | 339 | <undefinedList> |
252 | 340 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
253 | - <Elem>__NO_INLINE__</Elem> | |
254 | 341 | </undefinedList> |
255 | 342 | </ccTool> |
256 | 343 | </item> |
... | ... | @@ -258,18 +345,16 @@ |
258 | 345 | <ccTool> |
259 | 346 | <incDir> |
260 | 347 | <pElem>build</pElem> |
348 | + <pElem>morfeusz</pElem> | |
261 | 349 | <pElem>build/morfeusz</pElem> |
262 | 350 | </incDir> |
263 | 351 | <preprocessorList> |
264 | - <Elem>NDEBUG</Elem> | |
265 | - <Elem>_OPTIMIZE__=1</Elem> | |
266 | 352 | <Elem>__PIC__=2</Elem> |
267 | 353 | <Elem>__pic__=2</Elem> |
268 | 354 | <Elem>libmorfeusz_EXPORTS</Elem> |
269 | 355 | </preprocessorList> |
270 | 356 | <undefinedList> |
271 | 357 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
272 | - <Elem>__NO_INLINE__</Elem> | |
273 | 358 | </undefinedList> |
274 | 359 | </ccTool> |
275 | 360 | </item> |
... | ... | @@ -277,26 +362,19 @@ |
277 | 362 | <ccTool> |
278 | 363 | <incDir> |
279 | 364 | <pElem>build</pElem> |
365 | + <pElem>morfeusz</pElem> | |
280 | 366 | <pElem>build/morfeusz</pElem> |
281 | 367 | </incDir> |
282 | 368 | <preprocessorList> |
283 | - <Elem>NDEBUG</Elem> | |
284 | - <Elem>_OPTIMIZE__=1</Elem> | |
285 | 369 | <Elem>__PIC__=2</Elem> |
286 | 370 | <Elem>__pic__=2</Elem> |
287 | 371 | <Elem>libmorfeusz_EXPORTS</Elem> |
288 | 372 | </preprocessorList> |
289 | 373 | <undefinedList> |
290 | 374 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
291 | - <Elem>__NO_INLINE__</Elem> | |
292 | 375 | </undefinedList> |
293 | 376 | </ccTool> |
294 | 377 | </item> |
295 | - <item path="morfeusz/Toolchain-Linux-amd64.cmake" | |
296 | - ex="false" | |
297 | - tool="3" | |
298 | - flavor2="0"> | |
299 | - </item> | |
300 | 378 | <item path="morfeusz/charset/CaseConverter.cpp" ex="false" tool="1" flavor2="4"> |
301 | 379 | <ccTool> |
302 | 380 | </ccTool> |
... | ... | @@ -323,18 +401,16 @@ |
323 | 401 | <ccTool> |
324 | 402 | <incDir> |
325 | 403 | <pElem>build</pElem> |
404 | + <pElem>morfeusz</pElem> | |
326 | 405 | <pElem>build/morfeusz</pElem> |
327 | 406 | </incDir> |
328 | 407 | <preprocessorList> |
329 | - <Elem>NDEBUG</Elem> | |
330 | - <Elem>_OPTIMIZE__=1</Elem> | |
331 | 408 | <Elem>__PIC__=2</Elem> |
332 | 409 | <Elem>__pic__=2</Elem> |
333 | 410 | <Elem>libmorfeusz_EXPORTS</Elem> |
334 | 411 | </preprocessorList> |
335 | 412 | <undefinedList> |
336 | 413 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
337 | - <Elem>__NO_INLINE__</Elem> | |
338 | 414 | </undefinedList> |
339 | 415 | </ccTool> |
340 | 416 | </item> |
... | ... | @@ -342,22 +418,22 @@ |
342 | 418 | <ccTool> |
343 | 419 | </ccTool> |
344 | 420 | </item> |
421 | + <item path="morfeusz/exceptions.hpp" ex="false" tool="3" flavor2="0"> | |
422 | + </item> | |
345 | 423 | <item path="morfeusz/fsa/const.cpp" ex="false" tool="1" flavor2="4"> |
346 | 424 | <ccTool> |
347 | 425 | <incDir> |
348 | 426 | <pElem>build</pElem> |
427 | + <pElem>morfeusz</pElem> | |
349 | 428 | <pElem>build/morfeusz</pElem> |
350 | 429 | </incDir> |
351 | 430 | <preprocessorList> |
352 | - <Elem>NDEBUG</Elem> | |
353 | - <Elem>_OPTIMIZE__=1</Elem> | |
354 | 431 | <Elem>__PIC__=2</Elem> |
355 | 432 | <Elem>__pic__=2</Elem> |
356 | 433 | <Elem>libmorfeusz_EXPORTS</Elem> |
357 | 434 | </preprocessorList> |
358 | 435 | <undefinedList> |
359 | 436 | <Elem>__GCC_HAVE_DWARF2_CFI_ASM=1</Elem> |
360 | - <Elem>__NO_INLINE__</Elem> | |
361 | 437 | </undefinedList> |
362 | 438 | </ccTool> |
363 | 439 | </item> |
... | ... | @@ -385,64 +461,44 @@ |
385 | 461 | </incDir> |
386 | 462 | </ccTool> |
387 | 463 | </item> |
388 | - <item path="morfeusz/generator/EncodedGeneratorInterpretation.hpp" | |
389 | - ex="false" | |
390 | - tool="3" | |
391 | - flavor2="0"> | |
392 | - </item> | |
393 | - <item path="morfeusz/generator/GeneratorDeserializer.cpp" | |
394 | - ex="false" | |
395 | - tool="1" | |
396 | - flavor2="0"> | |
397 | - </item> | |
398 | - <item path="morfeusz/generator/GeneratorDeserializer.hpp" | |
399 | - ex="false" | |
400 | - tool="3" | |
401 | - flavor2="0"> | |
464 | + <item path="morfeusz/morfeusz_analyzer.cpp" ex="false" tool="1" flavor2="4"> | |
465 | + <ccTool> | |
466 | + <incDir> | |
467 | + <pElem>build</pElem> | |
468 | + <pElem>morfeusz</pElem> | |
469 | + <pElem>build/morfeusz</pElem> | |
470 | + </incDir> | |
471 | + </ccTool> | |
402 | 472 | </item> |
403 | - <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="4"> | |
473 | + <item path="morfeusz/morfeusz_generator.cpp" ex="false" tool="1" flavor2="4"> | |
404 | 474 | <ccTool> |
405 | 475 | <incDir> |
406 | 476 | <pElem>build</pElem> |
477 | + <pElem>morfeusz</pElem> | |
407 | 478 | <pElem>build/morfeusz</pElem> |
408 | 479 | </incDir> |
409 | - <preprocessorList> | |
410 | - <Elem>NDEBUG</Elem> | |
411 | - <Elem>_OPTIMIZE__=1</Elem> | |
412 | - </preprocessorList> | |
413 | - <undefinedList> | |
414 | - <Elem>__NO_INLINE__</Elem> | |
415 | - </undefinedList> | |
416 | 480 | </ccTool> |
417 | 481 | </item> |
418 | 482 | <item path="morfeusz/test_recognize_dict.cpp" ex="false" tool="1" flavor2="4"> |
419 | 483 | <ccTool> |
420 | 484 | <incDir> |
421 | 485 | <pElem>build</pElem> |
486 | + <pElem>morfeusz</pElem> | |
422 | 487 | <pElem>build/morfeusz</pElem> |
423 | 488 | </incDir> |
424 | - <preprocessorList> | |
425 | - <Elem>NDEBUG</Elem> | |
426 | - <Elem>_OPTIMIZE__=1</Elem> | |
427 | - </preprocessorList> | |
428 | - <undefinedList> | |
429 | - <Elem>__NO_INLINE__</Elem> | |
430 | - </undefinedList> | |
431 | 489 | </ccTool> |
432 | 490 | </item> |
433 | 491 | <item path="morfeusz/test_result_equals.cpp" ex="false" tool="1" flavor2="4"> |
434 | 492 | <ccTool> |
435 | 493 | <incDir> |
436 | 494 | <pElem>build</pElem> |
495 | + <pElem>morfeusz</pElem> | |
437 | 496 | <pElem>build/morfeusz</pElem> |
438 | 497 | </incDir> |
439 | - <preprocessorList> | |
440 | - <Elem>NDEBUG</Elem> | |
441 | - <Elem>_OPTIMIZE__=1</Elem> | |
442 | - </preprocessorList> | |
443 | - <undefinedList> | |
444 | - <Elem>__NO_INLINE__</Elem> | |
445 | - </undefinedList> | |
498 | + </ccTool> | |
499 | + </item> | |
500 | + <item path="morfeusz/test_synth_dict.cpp" ex="false" tool="1" flavor2="4"> | |
501 | + <ccTool> | |
446 | 502 | </ccTool> |
447 | 503 | </item> |
448 | 504 | </conf> |
... | ... |