Commit df3ada33fcf61cba25a0112bf0c9940779eb6458
1 parent
22f68665
ogarnięcie (w końcu!) kwestii homonimów w całości
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@128 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
17 changed files
with
97 additions
and
303 deletions
CMakeLists.txt
... | ... | @@ -36,7 +36,7 @@ if ("${INPUT_DICTIONARIES}" STREQUAL "") |
36 | 36 | if ("${EMPTY_INPUT_DICTIONARY}" STREQUAL "TRUE") |
37 | 37 | set (INPUT_DICTIONARIES ${PROJECT_SOURCE_DIR}/input/empty.txt) |
38 | 38 | else () |
39 | - set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/PoliMorfSmall.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
39 | + set (INPUT_DICTIONARIES "${PROJECT_SOURCE_DIR}/input/sgjp-hom.tab,${PROJECT_SOURCE_DIR}/input/dodatki.tab") | |
40 | 40 | endif () |
41 | 41 | endif () |
42 | 42 | |
... | ... | @@ -72,7 +72,9 @@ elseif (${CMAKE_SYSTEM_NAME} MATCHES "Windows") |
72 | 72 | set (CMAKE_SHARED_LINKER_FLAGS "-s -Os -static-libstdc++ -static-libgcc -Wl,--exclude-libs,libgcc_eh.a") |
73 | 73 | set (CMAKE_EXE_LINKER_FLAGS "-s -Os -static-libstdc++ -static-libgcc") |
74 | 74 | elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") |
75 | - set (MACOSX_RPATH TRUE) | |
75 | + #~ set (CMAKE_INSTALL_NAME_DIR @executable_path) | |
76 | + #~ set (CMAKE_BUILD_WITH_INSTALL_RPATH ON) | |
77 | + #~ set (MACOSX_RPATH TRUE) | |
76 | 78 | endif () |
77 | 79 | |
78 | 80 | ########## Setup RPATH ########## |
... | ... |
fsabuilder/morfeuszbuilder/tagset/segtypes.py
... | ... | @@ -179,7 +179,7 @@ class Segtypes(object): |
179 | 179 | class SegtypePattern(object): |
180 | 180 | |
181 | 181 | def __init__(self, lemma, pattern, segnum): |
182 | - self.lemma = lemma.split(':')[0] if len(lemma) > 1 else lemma | |
182 | + self.lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
183 | 183 | self.pattern = pattern |
184 | 184 | self.segnum = segnum |
185 | 185 | |
... | ... | @@ -190,7 +190,7 @@ class SegtypePattern(object): |
190 | 190 | patterns2Match.append(self.pattern.replace('%', '.*')) |
191 | 191 | patterns2Match.append(re.sub(r'\:\%$', '', self.pattern).replace('%', '.*')) |
192 | 192 | if lemma: |
193 | - lemma = lemma.split(':')[0] if len(lemma) > 1 else lemma | |
193 | + lemma = lemma.split(':')[0] if lemma and len(lemma) > 1 else lemma | |
194 | 194 | if (self.lemma is None or self.lemma == lemma) \ |
195 | 195 | and any([re.match(p, tag) for p in patterns2Match]): |
196 | 196 | return self.segnum |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -26,8 +26,6 @@ set(SRC_FILES |
26 | 26 | ${INPUT_SYNTH_DICTIONARY_CPP} |
27 | 27 | Environment.cpp |
28 | 28 | MorphDeserializer.cpp |
29 | - GeneratorDeserializer.cpp | |
30 | - Generator.cpp | |
31 | 29 | Tagset.cpp |
32 | 30 | fsa/const.cpp |
33 | 31 | MorphInterpretation.cpp |
... | ... | @@ -44,8 +42,6 @@ set(INCLUDE_FILES |
44 | 42 | const.hpp |
45 | 43 | data/default_fsa.hpp |
46 | 44 | MorphDeserializer.hpp |
47 | - GeneratorDeserializer.hpp | |
48 | - Generator.hpp | |
49 | 45 | Tagset.hpp |
50 | 46 | fsa/const.hpp |
51 | 47 | MorphInterpretation.hpp |
... | ... |
morfeusz/Generator.cpp deleted
1 | -/* | |
2 | - * File: Generator.cpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 21 styczeń 2014, 14:38 | |
6 | - */ | |
7 | - | |
8 | -#include <string> | |
9 | -#include <iostream> | |
10 | -#include "charset/charset_utils.hpp" | |
11 | -#include "MorphInterpretation.hpp" | |
12 | -#include "Generator.hpp" | |
13 | -#include "Environment.hpp" | |
14 | - | |
15 | - | |
16 | -using namespace std; | |
17 | - | |
18 | -Generator::Generator( | |
19 | - const unsigned char* ptr, | |
20 | - const Environment& env) | |
21 | -: deserializer(env), | |
22 | -fsa(SynthFSAType::getFSA(ptr, deserializer)), | |
23 | -env(env), | |
24 | -generatorPtr(ptr) { | |
25 | -} | |
26 | - | |
27 | -Generator::~Generator() { | |
28 | -} | |
29 | - | |
30 | -void Generator::setGeneratorPtr(const unsigned char* ptr) { | |
31 | - delete this->fsa; | |
32 | - this->generatorPtr = ptr; | |
33 | - this->fsa = SynthFSAType::getFSA(ptr, deserializer); | |
34 | -} | |
35 | - | |
36 | -void Generator::appendString(const string& str, string& res) const { | |
37 | - const char* suffixPtr = str.c_str(); | |
38 | - const char* suffixEnd = suffixPtr + str.length(); | |
39 | - while (suffixPtr != suffixEnd) { | |
40 | - uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
41 | - env.getCharsetConverter().append(cp, res); | |
42 | - } | |
43 | -} | |
44 | - | |
45 | -std::string Generator::decodeOrth( | |
46 | - const EncodedOrth& orth, | |
47 | - const std::vector<uint32_t>& lemma) const { | |
48 | - string res; | |
49 | - this->appendString(orth.prefixToAdd, res); | |
50 | - for (unsigned int i = 0; i < lemma.size() - orth.suffixToCut; i++) { | |
51 | - uint32_t cp = lemma[i]; | |
52 | - env.getCharsetConverter().append(cp, res); | |
53 | - } | |
54 | - this->appendString(orth.suffixToAdd, res); | |
55 | -// const char* suffixPtr = orth.suffixToAdd.c_str(); | |
56 | -// const char* suffixEnd = suffixPtr + orth.suffixToAdd.length(); | |
57 | -// while (suffixPtr != suffixEnd) { | |
58 | -// uint32_t cp = UTF8CharsetConverter().next(suffixPtr, suffixEnd); | |
59 | -// env.getCharsetConverter().append(cp, res); | |
60 | -// } | |
61 | - return res; | |
62 | -} | |
63 | - | |
64 | -void Generator::decodeRes( | |
65 | - const std::vector<EncodedGeneratorInterpretation>& encodedRes, | |
66 | - const std::string& lemma, | |
67 | - const std::vector<uint32_t>& lemmaCodepoints, | |
68 | - std::vector<MorphInterpretation>& result) const { | |
69 | - | |
70 | - for (unsigned int i = 0; i < encodedRes.size(); i++) { | |
71 | - EncodedGeneratorInterpretation egi = encodedRes[i]; | |
72 | - string decodedOrth = this->decodeOrth(egi.orth, lemmaCodepoints); | |
73 | - MorphInterpretation mi( | |
74 | - 0, 0, | |
75 | - decodedOrth, lemma, | |
76 | - egi.tag, | |
77 | - egi.nameClassifier, | |
78 | - env.getTagset(), | |
79 | - env.getCharsetConverter()); | |
80 | - result.push_back(mi); | |
81 | - } | |
82 | -} | |
83 | - | |
84 | -void Generator::generate(const string& lemma, vector<MorphInterpretation>& result) const { | |
85 | - const char* currInput = lemma.c_str(); | |
86 | - const char* inputEnd = currInput + lemma.length(); | |
87 | - vector<uint32_t> codepoints; | |
88 | - SynthStateType state = this->fsa->getInitialState(); | |
89 | - while (currInput != inputEnd && !state.isSink()) { | |
90 | - uint32_t codepoint = this->env.getCharsetConverter().next(currInput, inputEnd); | |
91 | - feedState(state, codepoint, this->env.getCharsetConverter()); | |
92 | - codepoints.push_back(codepoint); | |
93 | - } | |
94 | - if (state.isAccepting()) { | |
95 | - vector<EncodedGeneratorInterpretation> encodedRes = state.getValue(); | |
96 | - decodeRes(encodedRes, lemma, codepoints, result); | |
97 | - } | |
98 | -} |
morfeusz/Generator.hpp deleted
1 | -/* | |
2 | - * File: Generator.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 21 styczeń 2014, 14:38 | |
6 | - */ | |
7 | - | |
8 | -#ifndef GENERATOR_HPP | |
9 | -#define GENERATOR_HPP | |
10 | - | |
11 | -#include <string> | |
12 | -#include <vector> | |
13 | -#include "charset/CharsetConverter.hpp" | |
14 | -#include "MorphInterpretation.hpp" | |
15 | -#include "Tagset.hpp" | |
16 | -#include "GeneratorDeserializer.hpp" | |
17 | - | |
18 | -typedef FSA< std::vector<EncodedGeneratorInterpretation > > SynthFSAType; | |
19 | -typedef State< std::vector<EncodedGeneratorInterpretation > > SynthStateType; | |
20 | - | |
21 | -class Generator { | |
22 | -public: | |
23 | - Generator( | |
24 | - const unsigned char* ptr, | |
25 | - const Environment& env); | |
26 | - void generate(const std::string& lemma, std::vector<MorphInterpretation>& result) const; | |
27 | - void setGeneratorPtr(const unsigned char* ptr); | |
28 | - virtual ~Generator(); | |
29 | -private: | |
30 | -// Generator(const SynthDeserializer& deserializer); | |
31 | - GeneratorDeserializer deserializer; | |
32 | - const SynthFSAType* fsa; | |
33 | - const Environment& env; | |
34 | - const unsigned char* generatorPtr; | |
35 | - | |
36 | - std::string decodeOrth( | |
37 | - const EncodedOrth& orth, | |
38 | - const std::vector<uint32_t>& lemmaCodepoints) const; | |
39 | - | |
40 | - void decodeRes( | |
41 | - const std::vector<EncodedGeneratorInterpretation>& encodedRes, | |
42 | - const std::string& lemma, | |
43 | - const std::vector<uint32_t>& lemmaCodepoints, | |
44 | - std::vector<MorphInterpretation>& result) const; | |
45 | - | |
46 | - void appendString(const string& str, string& res) const; | |
47 | -}; | |
48 | - | |
49 | -#endif /* GENERATOR_HPP */ | |
50 | - |
morfeusz/GeneratorDeserializer.cpp deleted
1 | -/* | |
2 | - * File: GeneratorDeserializer.cpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 20 styczeń 2014, 17:14 | |
6 | - */ | |
7 | - | |
8 | -#include "GeneratorDeserializer.hpp" | |
9 | -#include "EncodedGeneratorInterpretation.hpp" | |
10 | - | |
11 | -using namespace std; | |
12 | - | |
13 | -GeneratorDeserializer::GeneratorDeserializer(const Environment& env) | |
14 | -: env(env) { | |
15 | - | |
16 | -} | |
17 | - | |
18 | -void GeneratorDeserializer::deserializeOrth(const unsigned char*& ptr, EncodedOrth& orth) const { | |
19 | - orth.prefixToAdd = (const char*) ptr; | |
20 | - ptr += strlen((const char*) ptr) + 1; | |
21 | - | |
22 | - orth.suffixToCut = *ptr; | |
23 | - ptr++; | |
24 | - | |
25 | - orth.suffixToAdd = (const char*) ptr; | |
26 | - ptr += strlen((const char*) ptr) + 1; | |
27 | -} | |
28 | - | |
29 | -void GeneratorDeserializer::deserializeInterp(const unsigned char*& ptr, EncodedGeneratorInterpretation& interp) const { | |
30 | - deserializeOrth(ptr, interp.orth); | |
31 | - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*> (ptr))); | |
32 | - ptr += 2; | |
33 | - interp.nameClassifier = *ptr; | |
34 | - ptr++; | |
35 | -} | |
36 | - | |
37 | -long GeneratorDeserializer::deserialize( | |
38 | - const unsigned char* ptr, | |
39 | - std::vector<EncodedGeneratorInterpretation>& interps) const { | |
40 | - const unsigned char* currPtr = ptr; | |
41 | - uint8_t interpsNum = *ptr; | |
42 | - interps.clear(); | |
43 | - interps.reserve(interpsNum); | |
44 | - currPtr++; | |
45 | - for (unsigned int i = 0; i < interpsNum; ++i) { | |
46 | - EncodedGeneratorInterpretation interp; | |
47 | - this->deserializeInterp(currPtr, interp); | |
48 | - interps.push_back(interp); | |
49 | - } | |
50 | - return currPtr - ptr; | |
51 | -} | |
52 | - | |
53 | -GeneratorDeserializer::~GeneratorDeserializer() { | |
54 | - | |
55 | -} |
morfeusz/GeneratorDeserializer.hpp deleted
1 | -/* | |
2 | - * File: GeneratorDeserializer.hpp | |
3 | - * Author: mlenart | |
4 | - * | |
5 | - * Created on 20 styczeń 2014, 17:14 | |
6 | - */ | |
7 | - | |
8 | -#ifndef SYNTHDESERIALIZER_HPP | |
9 | -#define SYNTHDESERIALIZER_HPP | |
10 | - | |
11 | -#include <string> | |
12 | -#include <vector> | |
13 | -#include "fsa/fsa.hpp" | |
14 | -#include "Tagset.hpp" | |
15 | -#include "EncodedGeneratorInterpretation.hpp" | |
16 | -#include "Environment.hpp" | |
17 | - | |
18 | -class GeneratorDeserializer: public Deserializer< std::vector<EncodedGeneratorInterpretation> > { | |
19 | -public: | |
20 | - explicit GeneratorDeserializer(const Environment& env); | |
21 | - long deserialize( | |
22 | - const unsigned char* ptr, | |
23 | - std::vector<EncodedGeneratorInterpretation>& interps) const; | |
24 | - virtual ~GeneratorDeserializer(); | |
25 | -private: | |
26 | - const Environment& env; | |
27 | - | |
28 | - void deserializeInterp(const unsigned char*& ptr, EncodedGeneratorInterpretation& interp) const; | |
29 | - void deserializeOrth(const unsigned char*& ptr, EncodedOrth& orth) const; | |
30 | -}; | |
31 | - | |
32 | -#endif /* SYNTHDESERIALIZER_HPP */ | |
33 | - |
morfeusz/InterpretedChunk.hpp
morfeusz/InterpretedChunksDecoder.hpp
... | ... | @@ -10,6 +10,7 @@ |
10 | 10 | |
11 | 11 | #include <string> |
12 | 12 | #include <vector> |
13 | +#include <utility> | |
13 | 14 | |
14 | 15 | #include "charset/CharsetConverter.hpp" |
15 | 16 | #include "EncodedInterpretation.hpp" |
... | ... | @@ -147,6 +148,16 @@ protected: |
147 | 148 | } |
148 | 149 | private: |
149 | 150 | |
151 | + pair<string, string> getLemmaHomonymIdPair(const string& lemma) const { | |
152 | + vector<string> splitRes(split(lemma, ':')); | |
153 | + if (splitRes.size() == 2) { | |
154 | + return make_pair(splitRes[0], splitRes[1]); | |
155 | + } | |
156 | + else { | |
157 | + return make_pair(lemma, ""); | |
158 | + } | |
159 | + } | |
160 | + | |
150 | 161 | MorphInterpretation decodeMorphInterpretation( |
151 | 162 | unsigned int startNode, unsigned int endNode, |
152 | 163 | const string& orth, |
... | ... | @@ -156,9 +167,11 @@ private: |
156 | 167 | string lemma = lemmaPrefix; |
157 | 168 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
158 | 169 | this->decodeForm(chunk.lowercaseCodepoints, ei.value, lemma); |
170 | + pair<string, string> lemmaHomonymId = getLemmaHomonymIdPair(lemma); | |
159 | 171 | return MorphInterpretation( |
160 | 172 | startNode, endNode, |
161 | - orth, lemma, | |
173 | + orth, lemmaHomonymId.first, | |
174 | + lemmaHomonymId.second, | |
162 | 175 | ei.tag, |
163 | 176 | ei.nameClassifier, |
164 | 177 | env.getTagset(), |
... | ... | @@ -193,7 +206,12 @@ public: |
193 | 206 | lemma += env.getCharsetConverter().toString(interpretedChunk.originalCodepoints); |
194 | 207 | const unsigned char* currPtr = interpretedChunk.interpsGroup.ptr; |
195 | 208 | while (currPtr - interpretedChunk.interpsGroup.ptr < interpretedChunk.interpsGroup.size) { |
196 | - out.push_back(this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr)); | |
209 | + MorphInterpretation mi = this->decodeMorphInterpretation(startNode, endNode, orthPrefix, lemma, interpretedChunk, currPtr); | |
210 | +// cerr << mi.toString(false) << endl; | |
211 | +// cerr << "required='" << interpretedChunk.requiredHomonymId << "' morphInterp='" << mi.getHomonymId() << "'" << endl; | |
212 | + if (interpretedChunk.requiredHomonymId.empty() || mi.getHomonymId() == interpretedChunk.requiredHomonymId) { | |
213 | + out.push_back(mi); | |
214 | + } | |
197 | 215 | } |
198 | 216 | } |
199 | 217 | |
... | ... | @@ -220,10 +238,11 @@ private: |
220 | 238 | ptr += strlen((const char*) ptr) + 1; |
221 | 239 | EncodedInterpretation ei = this->deserializeInterp(ptr); |
222 | 240 | this->decodeForm(chunk.originalCodepoints, ei.value, orth); |
223 | - string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId); | |
241 | +// string realLemma = homonymId.empty() ? lemma : (lemma + ":" + homonymId); | |
224 | 242 | return MorphInterpretation( |
225 | 243 | startNode, endNode, |
226 | - orth, realLemma, | |
244 | + orth, lemma, | |
245 | + homonymId, | |
227 | 246 | ei.tag, |
228 | 247 | ei.nameClassifier, |
229 | 248 | env.getTagset(), |
... | ... |
morfeusz/Morfeusz.cpp
... | ... | @@ -12,7 +12,6 @@ |
12 | 12 | #include "data/default_fsa.hpp" |
13 | 13 | #include "Morfeusz.hpp" |
14 | 14 | #include "MorphDeserializer.hpp" |
15 | -#include "GeneratorDeserializer.hpp" | |
16 | 15 | #include "InterpretedChunksDecoder.hpp" |
17 | 16 | #include "charset/CharsetConverter.hpp" |
18 | 17 | #include "charset/charset_utils.hpp" |
... | ... | @@ -118,6 +117,16 @@ void Morfeusz::doProcessOneWord( |
118 | 117 | normalizedCodepoints.push_back(normalizedCodepoint); |
119 | 118 | feedState(state, normalizedCodepoint, UTF8CharsetConverter()); |
120 | 119 | codepoint = currInput == inputEnd ? 0 : env.getCharsetConverter().peek(currInput, inputEnd); |
120 | + string homonymId; | |
121 | + if (env.getProcessorType() == GENERATOR && codepoint == 0x3A && currInput + 1 != inputEnd) { | |
122 | + if (originalCodepoints.size() == 1) { | |
123 | + throw MorfeuszException("Lemma of length > 1 cannot start with a colon"); | |
124 | + } | |
125 | + homonymId = string(currInput + 1, inputEnd); | |
126 | +// cerr << "homonym " << homonymId << endl; | |
127 | + currInput = inputEnd; | |
128 | + codepoint = 0x00; | |
129 | + } | |
121 | 130 | if (state.isAccepting()) { |
122 | 131 | vector<InterpsGroup> val(state.getValue()); |
123 | 132 | for (unsigned int i = 0; i < val.size(); i++) { |
... | ... | @@ -138,24 +147,18 @@ void Morfeusz::doProcessOneWord( |
138 | 147 | ig, |
139 | 148 | newSegrulesState.shiftOrthFromPrevious, |
140 | 149 | false, |
141 | - vector<InterpretedChunk>() | |
150 | + vector<InterpretedChunk>(), | |
151 | + homonymId | |
142 | 152 | }; |
143 | 153 | if (!accum.empty() && accum.back().shiftOrth) { |
144 | -// cerr << "shift orth from " << (int) accum.back().interpsGroup.type << " to " << (int) ig.type << endl; | |
145 | 154 | doShiftOrth(accum.back(), ic); |
146 | 155 | } |
147 | 156 | accum.push_back(ic); |
148 | - if (isEndOfWord(codepoint)) { | |
149 | -// cerr << "end of word" << endl; | |
150 | - if (newSegrulesState.accepting) { | |
151 | -// cerr << "accept " << (int) ig.type << endl; | |
152 | - graph.addPath(accum); | |
153 | - } | |
154 | - else { | |
155 | -// cerr << "not accept " << (int) ig.type << endl; | |
156 | - } | |
157 | + if (isEndOfWord(codepoint) && newSegrulesState.accepting) { | |
158 | + graph.addPath(accum); | |
157 | 159 | } |
158 | - else { | |
160 | + else if (!isEndOfWord(codepoint)) { | |
161 | +// cerr << "will process " << currInput << endl; | |
159 | 162 | const char* newCurrInput = currInput; |
160 | 163 | doProcessOneWord(env, newCurrInput, inputEnd, newSegrulesState, accum, graph); |
161 | 164 | } |
... | ... |
morfeusz/Morfeusz.hpp
morfeusz/MorphInterpretation.cpp
... | ... | @@ -6,6 +6,7 @@ |
6 | 6 | */ |
7 | 7 | |
8 | 8 | #include <string> |
9 | +#include <sstream> | |
9 | 10 | #include "MorphInterpretation.hpp" |
10 | 11 | #include "EncodedInterpretation.hpp" |
11 | 12 | |
... | ... | @@ -16,6 +17,7 @@ MorphInterpretation::MorphInterpretation( |
16 | 17 | int endNode, |
17 | 18 | const string& orth, |
18 | 19 | const string& lemma, |
20 | + const string& homonymId, | |
19 | 21 | int tagnum, |
20 | 22 | int namenum, |
21 | 23 | const Tagset& tagset, |
... | ... | @@ -24,6 +26,7 @@ MorphInterpretation::MorphInterpretation( |
24 | 26 | endNode(endNode), |
25 | 27 | orth(orth), |
26 | 28 | lemma(lemma), |
29 | + homonymId(homonymId), | |
27 | 30 | tagnum(tagnum), |
28 | 31 | namenum(namenum), |
29 | 32 | tag(tagset.getTag(tagnum, charsetConverter)), |
... | ... | @@ -40,6 +43,7 @@ MorphInterpretation::MorphInterpretation( |
40 | 43 | endNode(startNode + 1), |
41 | 44 | orth(orth), |
42 | 45 | lemma(orth), |
46 | + homonymId(""), | |
43 | 47 | tagnum(0), |
44 | 48 | namenum(0), |
45 | 49 | tag(tagset.getTag(0, charsetConverter)), |
... | ... | @@ -67,6 +71,10 @@ const std::string& MorphInterpretation::getLemma() const { |
67 | 71 | return this->lemma; |
68 | 72 | } |
69 | 73 | |
74 | +const std::string& MorphInterpretation::getHomonymId() const { | |
75 | + return this->homonymId; | |
76 | +} | |
77 | + | |
70 | 78 | int MorphInterpretation::getTagnum() const { |
71 | 79 | return this->tagnum; |
72 | 80 | } |
... | ... | @@ -83,3 +91,22 @@ const std::string& MorphInterpretation::getName() const { |
83 | 91 | return this->name; |
84 | 92 | } |
85 | 93 | |
94 | +std::string MorphInterpretation::toString(bool includeNodeNumbers) const { | |
95 | + std::stringstream res; | |
96 | + if (includeNodeNumbers) { | |
97 | + res << startNode << "," << endNode << ","; | |
98 | + } | |
99 | + res << orth << ","; | |
100 | + | |
101 | + res << lemma; | |
102 | + if (!this->homonymId.empty()) { | |
103 | + res << ":" << homonymId; | |
104 | + } | |
105 | + res << ","; | |
106 | + | |
107 | + res << tag; | |
108 | + if (!name.empty()) { | |
109 | + res << "," << name; | |
110 | + } | |
111 | + return res.str(); | |
112 | +} | |
... | ... |
morfeusz/MorphInterpretation.hpp
... | ... | @@ -20,6 +20,7 @@ public: |
20 | 20 | int endNode, |
21 | 21 | const std::string& orth, |
22 | 22 | const std::string& lemma, |
23 | + const std::string& homonymId, | |
23 | 24 | int tagnum, |
24 | 25 | int namenum, |
25 | 26 | const Tagset& tagset, |
... | ... | @@ -30,10 +31,13 @@ public: |
30 | 31 | int getEndNode() const; |
31 | 32 | const std::string& getOrth() const; |
32 | 33 | const std::string& getLemma() const; |
34 | + const std::string& getHomonymId() const; | |
33 | 35 | int getTagnum() const; |
34 | 36 | int getNamenum() const; |
35 | 37 | const std::string& getTag() const; |
36 | 38 | const std::string& getName() const; |
39 | + | |
40 | + std::string toString(bool includeNodeNumbers) const; | |
37 | 41 | private: |
38 | 42 | MorphInterpretation( |
39 | 43 | int startNode, |
... | ... | @@ -44,6 +48,7 @@ private: |
44 | 48 | int endNode; |
45 | 49 | std::string orth; |
46 | 50 | std::string lemma; |
51 | + std::string homonymId; | |
47 | 52 | int tagnum; |
48 | 53 | int namenum; |
49 | 54 | std::string tag; |
... | ... |
morfeusz/java/CMakeLists.txt
... | ... | @@ -31,20 +31,26 @@ endif () |
31 | 31 | |
32 | 32 | if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") |
33 | 33 | set (CMAKE_SHARED_LIBRARY_SUFFIX ".jnilib") |
34 | + #~ set (CMAKE_SHARED_LINKER_FLAGS "${-dylib") | |
34 | 35 | endif () |
35 | 36 | |
36 | 37 | set (CMAKE_JAVA_TARGET_VERSION ${JMORFEUSZ_VERSION}) |
37 | 38 | set (CMAKE_JAVA_TARGET_OUTPUT_NAME jmorfeusz) |
38 | 39 | |
40 | +#~ add_custom_target (jmorfeusz-repair-library | |
41 | + #~ COMMAND ${DARWIN64_ROOT}/x86_64-apple-darwin9/bin/x86_64-apple-darwin9-install_name_tool -change /home/mlenart/xxx/morfeusz/buildall/build-Darwin-amd64/morfeusz/libmorfeusz2.dylib morfeusz2 ${PROJECT_BINARY_DIR}/morfeusz/java/libjmorfeusz.jnilib | |
42 | + #~ DEPENDS libjmorfeusz) | |
43 | + | |
39 | 44 | # build jmorfeusz |
40 | 45 | file(GLOB_RECURSE JAVA_SOURCES ${JAVA_SRC_DIR} "*.java") |
41 | 46 | add_jar (jmorfeusz |
42 | 47 | SOURCES "${JAVA_SOURCES}" |
43 | 48 | DEPENDS libjmorfeusz) |
44 | 49 | |
45 | -add_custom_target(jmorfeusz-copy-readme | |
50 | +add_custom_target (jmorfeusz-copy-readme | |
46 | 51 | COMMAND ${CMAKE_COMMAND} -E copy "${CMAKE_CURRENT_SOURCE_DIR}/README" "${CMAKE_CURRENT_BINARY_DIR}/README") |
47 | 52 | |
53 | + | |
48 | 54 | add_custom_target(package-java |
49 | 55 | COMMAND mkdir -p "${TARGET_DIR}" && ${CMAKE_COMMAND} -E tar "cfvz" "${TARGET_DIR}/jmorfeusz-${JMORFEUSZ_VERSION}-${CMAKE_SYSTEM_NAME}-${ARCHITECTURE}.tar.gz" "${CMAKE_CURRENT_BINARY_DIR}/*.jar" "${CMAKE_CURRENT_BINARY_DIR}/*${CMAKE_SHARED_LIBRARY_SUFFIX}" "${CMAKE_CURRENT_BINARY_DIR}/README" |
50 | 56 | DEPENDS jmorfeusz jmorfeusz-copy-readme libjmorfeusz) |
... | ... |
morfeusz/morfeusz_analyzer.cpp
... | ... | @@ -126,6 +126,11 @@ int main(int argc, const char** argv) { |
126 | 126 | printf("["); |
127 | 127 | for (unsigned int i = 0; i < res.size(); i++) { |
128 | 128 | MorphInterpretation& mi = res[i]; |
129 | + string lemmaToShow = mi.getLemma().c_str(); | |
130 | + if (!mi.getHomonymId().empty()) { | |
131 | + lemmaToShow += ":"; | |
132 | + lemmaToShow += mi.getHomonymId(); | |
133 | + } | |
129 | 134 | if (prevStart != -1 |
130 | 135 | && (prevStart != mi.getStartNode() || prevEnd != mi.getEndNode())) { |
131 | 136 | printf("]\n["); |
... | ... | @@ -133,10 +138,11 @@ int main(int argc, const char** argv) { |
133 | 138 | else if (prevStart != -1) { |
134 | 139 | printf("; "); |
135 | 140 | } |
136 | - printf("%d,%d,%s,%s,%s,%s", | |
137 | - mi.getStartNode(), mi.getEndNode(), | |
138 | - mi.getOrth().c_str(), mi.getLemma().c_str(), | |
139 | - mi.getTag().c_str(), mi.getName().c_str()); | |
141 | + printf("%s", mi.toString(true).c_str()); | |
142 | +// printf("%d,%d,%s,%s,%s,%s", | |
143 | +// mi.getStartNode(), mi.getEndNode(), | |
144 | +// mi.getOrth().c_str(), lemmaToShow.c_str(), | |
145 | +// mi.getTag().c_str(), lemmaToShow.c_str()); | |
140 | 146 | prevStart = mi.getStartNode(); |
141 | 147 | prevEnd = mi.getEndNode(); |
142 | 148 | } |
... | ... |
morfeusz/morfeusz_generator.cpp
... | ... | @@ -38,9 +38,7 @@ int main(int argc, char** argv) { |
38 | 38 | printf("; "); |
39 | 39 | } |
40 | 40 | MorphInterpretation& mi = res[i]; |
41 | - printf("%s,%s,%s,%s", | |
42 | - mi.getOrth().c_str(), mi.getLemma().c_str(), | |
43 | - mi.getTag().c_str(), mi.getName().c_str()); | |
41 | + printf("%s", mi.toString(false).c_str()); | |
44 | 42 | } |
45 | 43 | printf("]\n"); |
46 | 44 | } |
... | ... |
nbproject/configurations.xml
... | ... | @@ -38,8 +38,6 @@ |
38 | 38 | </df> |
39 | 39 | <in>Environment.cpp</in> |
40 | 40 | <in>FlexionGraph.cpp</in> |
41 | - <in>Generator.cpp</in> | |
42 | - <in>GeneratorDeserializer.cpp</in> | |
43 | 41 | <in>Morfeusz.cpp</in> |
44 | 42 | <in>MorphDeserializer.cpp</in> |
45 | 43 | <in>MorphInterpretation.cpp</in> |
... | ... | @@ -106,12 +104,8 @@ |
106 | 104 | </makeTool> |
107 | 105 | </makefileType> |
108 | 106 | <item path="../default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
109 | - <ccTool flags="1"> | |
110 | - </ccTool> | |
111 | 107 | </item> |
112 | 108 | <item path="../default_synth_fsa.cpp" ex="false" tool="1" flavor2="4"> |
113 | - <ccTool flags="1"> | |
114 | - </ccTool> | |
115 | 109 | </item> |
116 | 110 | <item path="build/default_fsa.cpp" ex="false" tool="1" flavor2="4"> |
117 | 111 | </item> |
... | ... | @@ -281,7 +275,7 @@ |
281 | 275 | <ccTool> |
282 | 276 | <incDir> |
283 | 277 | <pElem>morfeusz</pElem> |
284 | - <pElem>/usr/lib/jvm/java-6-openjdk/include</pElem> | |
278 | + <pElem>/usr/lib/jvm/default-java/include</pElem> | |
285 | 279 | </incDir> |
286 | 280 | <preprocessorList> |
287 | 281 | <Elem>NDEBUG</Elem> |
... | ... | @@ -347,32 +341,6 @@ |
347 | 341 | </preprocessorList> |
348 | 342 | </ccTool> |
349 | 343 | </item> |
350 | - <item path="morfeusz/Generator.cpp" ex="false" tool="1" flavor2="4"> | |
351 | - <ccTool flags="1"> | |
352 | - <incDir> | |
353 | - <pElem>build</pElem> | |
354 | - <pElem>morfeusz</pElem> | |
355 | - <pElem>build/morfeusz</pElem> | |
356 | - </incDir> | |
357 | - <preprocessorList> | |
358 | - <Elem>NDEBUG</Elem> | |
359 | - <Elem>libmorfeusz_EXPORTS</Elem> | |
360 | - </preprocessorList> | |
361 | - </ccTool> | |
362 | - </item> | |
363 | - <item path="morfeusz/GeneratorDeserializer.cpp" ex="false" tool="1" flavor2="4"> | |
364 | - <ccTool flags="1"> | |
365 | - <incDir> | |
366 | - <pElem>build</pElem> | |
367 | - <pElem>morfeusz</pElem> | |
368 | - <pElem>build/morfeusz</pElem> | |
369 | - </incDir> | |
370 | - <preprocessorList> | |
371 | - <Elem>NDEBUG</Elem> | |
372 | - <Elem>libmorfeusz_EXPORTS</Elem> | |
373 | - </preprocessorList> | |
374 | - </ccTool> | |
375 | - </item> | |
376 | 344 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
377 | 345 | <ccTool flags="1"> |
378 | 346 | <incDir> |
... | ... |