Commit 58aafafe36f62bfa9e6b785ad28a1ea4c9042b24
1 parent: 612cbdc9
- trochę refaktoryzacji, zrobienie klasy MorphInterpretation będącej krawędzią w grafie fleksyjnym
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@18 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 10 changed files with 172 additions and 180 deletions
morfeusz/CMakeLists.txt
@@ -6,7 +6,7 @@ | @@ -6,7 +6,7 @@ | ||
6 | include_directories (${Morfeusz_SOURCE_DIR}/fsa) | 6 | include_directories (${Morfeusz_SOURCE_DIR}/fsa) |
7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
8 | add_executable (morfeusz2_analyze main.cpp) | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | -add_executable (test_morph test_morph.cpp interpretations.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp) | 9 | +add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | 10 | ||
11 | # Link the executable to the morfeusz2 library. | 11 | # Link the executable to the morfeusz2 library. |
12 | target_link_libraries (morfeusz2_analyze morfeusz2) | 12 | target_link_libraries (morfeusz2_analyze morfeusz2) |
morfeusz/EncodedInterpretation.hpp
0 → 100644
1 | +/* | ||
2 | + * File: EncodedInterpretation.hpp | |
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on November 4, 2013, 3:11 PM | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef INTERPRETATION_HPP | ||
9 | +#define INTERPRETATION_HPP | ||
10 | + | ||
11 | +#include <string> | ||
12 | +#include <sstream> | ||
13 | +#include <iterator> | ||
14 | +#include "Tagset.hpp" | ||
15 | + | ||
16 | +using namespace std; | ||
17 | + | ||
18 | +struct EncodedLemma { | ||
19 | + int suffixToCut; | ||
20 | + string suffixToAdd; | ||
21 | +}; | ||
22 | + | ||
23 | +/* | ||
24 | + * Internal representation of an interpretation - with lemma encoded | ||
25 | + */ | ||
26 | +struct EncodedInterpretation { | ||
27 | + EncodedLemma lemma; | ||
28 | + int tag; | ||
29 | + int nameClassifier; | ||
30 | +}; | ||
31 | + | ||
32 | +#endif /* INTERPRETATION_HPP */ |
morfeusz/Morfeusz.hpp
@@ -9,7 +9,8 @@ | @@ -9,7 +9,8 @@ | ||
9 | #define MORFEUSZ_HPP | 9 | #define MORFEUSZ_HPP |
10 | 10 | ||
11 | #include <string> | 11 | #include <string> |
12 | -#include "interpretations.hpp" | 12 | +#include "MorphInterpretation.hpp" |
13 | +//#include "interpretations.hpp" | ||
13 | 14 | ||
14 | class Morfeusz; | 15 | class Morfeusz; |
15 | class AnalyzeResult; | 16 | class AnalyzeResult; |
morfeusz/MorphDeserializer.hpp
@@ -10,7 +10,7 @@ | @@ -10,7 +10,7 @@ | ||
10 | 10 | ||
11 | #include <vector> | 11 | #include <vector> |
12 | #include "fsa.hpp" | 12 | #include "fsa.hpp" |
13 | -#include "interpretations.hpp" | 13 | +#include "EncodedInterpretation.hpp" |
14 | 14 | ||
15 | class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { | 15 | class MorphDeserializer: public Deserializer<std::vector<EncodedInterpretation>> { |
16 | public: | 16 | public: |
morfeusz/MorphInterpretation.cpp
0 → 100644
1 | +/* | ||
2 | + * File: MorphInterpretation.cpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on November 14, 2013, 11:47 AM | ||
6 | + */ | ||
7 | + | ||
8 | +#include <string> | ||
9 | +#include "MorphInterpretation.hpp" | ||
10 | +#include "EncodedInterpretation.hpp" | ||
11 | + | ||
12 | +using namespace std; | ||
13 | + | ||
14 | +static string convertLemma( | ||
15 | + const string& orth, | ||
16 | + const EncodedLemma& lemma) { | ||
17 | + string res(orth); | ||
18 | + res.erase( | ||
19 | + res.end() - lemma.suffixToCut, | ||
20 | + res.end()); | ||
21 | + res.append(lemma.suffixToAdd); | ||
22 | + return res; | ||
23 | +} | ||
24 | + | ||
25 | +MorphInterpretation::MorphInterpretation( | ||
26 | + int startNode, | ||
27 | + int endNode, | ||
28 | + const std::string& orth, | ||
29 | + const EncodedInterpretation& encodedInterp, | ||
30 | + const Tagset& tagset) | ||
31 | +: startNode(startNode), | ||
32 | + endNode(endNode), | ||
33 | + orth(orth), | ||
34 | + lemma(convertLemma(orth, encodedInterp.lemma)), | ||
35 | + tagnum(encodedInterp.tag), | ||
36 | + namenum(encodedInterp.nameClassifier), | ||
37 | + tag(tagset.getTag(encodedInterp.tag)), | ||
38 | + name(tagset.getName(encodedInterp.nameClassifier)) { | ||
39 | + | ||
40 | +} | ||
41 | + | ||
42 | +MorphInterpretation::~MorphInterpretation() { | ||
43 | +} | ||
44 | + | ||
45 | +const std::string& MorphInterpretation::getOrth() const { | ||
46 | + return this->orth; | ||
47 | +} | ||
48 | + | ||
49 | +const std::string& MorphInterpretation::getLemma() const { | ||
50 | + return this->lemma; | ||
51 | +} | ||
52 | + | ||
53 | +int MorphInterpretation::getTagnum() const { | ||
54 | + return this->tagnum; | ||
55 | +} | ||
56 | + | ||
57 | +int MorphInterpretation::getNamenum() const { | ||
58 | + return this->namenum; | ||
59 | +} | ||
60 | + | ||
61 | +const std::string& MorphInterpretation::getTag() const { | ||
62 | + return this->tag; | ||
63 | +} | ||
64 | + | ||
65 | +const std::string& MorphInterpretation::getName() const { | ||
66 | + return this->name; | ||
67 | +} | ||
68 | + |
morfeusz/MorphInterpretation.hpp
0 → 100644
1 | +/* | ||
2 | + * File: MorphInterpretation.hpp | ||
3 | + * Author: mlenart | ||
4 | + * | ||
5 | + * Created on November 14, 2013, 11:47 AM | ||
6 | + */ | ||
7 | + | ||
8 | +#ifndef MORPHINTERPRETATION_HPP | ||
9 | +#define MORPHINTERPRETATION_HPP | ||
10 | + | ||
11 | +#include <string> | ||
12 | +#include "Tagset.hpp" | ||
13 | +#include "EncodedInterpretation.hpp" | ||
14 | + | ||
15 | +class MorphInterpretation { | ||
16 | +public: | ||
17 | + MorphInterpretation( | ||
18 | + int startNode, | ||
19 | + int endNode, | ||
20 | + const std::string& orth, | ||
21 | + const EncodedInterpretation& encodedInterp, | ||
22 | + const Tagset& tagset); | ||
23 | + virtual ~MorphInterpretation(); | ||
24 | + const std::string& getOrth() const; | ||
25 | + const std::string& getLemma() const; | ||
26 | + int getTagnum() const; | ||
27 | + int getNamenum() const; | ||
28 | + const std::string& getTag() const; | ||
29 | + const std::string& getName() const; | ||
30 | +private: | ||
31 | + int startNode; | ||
32 | + int endNode; | ||
33 | + std::string orth; | ||
34 | + std::string lemma; | ||
35 | + int tagnum; | ||
36 | + int namenum; | ||
37 | + const std::string& tag; | ||
38 | + const std::string& name; | ||
39 | +}; | ||
40 | + | ||
41 | +#endif /* MORPHINTERPRETATION_HPP */ | ||
42 | + |
morfeusz/interpretations.cpp deleted
1 | - | ||
2 | -#include "interpretations.hpp" | ||
3 | -#include "Tagset.hpp" | ||
4 | - | ||
5 | -using namespace std; | ||
6 | - | ||
7 | -string TaggedInterpretation::toString() const { | ||
8 | - std::stringstream ss; | ||
9 | - ss << lemma << ":" << tag << ":" << name; | ||
10 | - return ss.str(); | ||
11 | -} | ||
12 | - | ||
13 | -template <class T> | ||
14 | -string InterpretationsDecoder<T>::convertLemma( | ||
15 | - const string& orth, | ||
16 | - const EncodedLemma& lemma) const { | ||
17 | - string res(orth); | ||
18 | - res.erase( | ||
19 | - res.end() - lemma.suffixToCut, | ||
20 | - res.end()); | ||
21 | - res.append(lemma.suffixToAdd); | ||
22 | - return res; | ||
23 | -} | ||
24 | - | ||
25 | -RawInterpretation RawInterpretationsDecoder::getInterpretation( | ||
26 | - const string& orth, | ||
27 | - const EncodedInterpretation& interp) const { | ||
28 | - string lemma = this->convertLemma(orth, interp.lemma); | ||
29 | - RawInterpretation res = {lemma, interp.tag, interp.nameClassifier}; | ||
30 | - return res; | ||
31 | -} | ||
32 | - | ||
33 | -TaggedInterpretationsDecoder::TaggedInterpretationsDecoder(const Tagset& tagset) | ||
34 | -: tagset(tagset) { | ||
35 | - | ||
36 | -} | ||
37 | - | ||
38 | -TaggedInterpretation TaggedInterpretationsDecoder::getInterpretation( | ||
39 | - const string& orth, | ||
40 | - const EncodedInterpretation& interp) const { | ||
41 | - string lemma = this->convertLemma(orth, interp.lemma); | ||
42 | - const string& tag = this->tagset.getTag(interp.tag); | ||
43 | - const string& name = this->tagset.getName(interp.nameClassifier); | ||
44 | - TaggedInterpretation res = {lemma, tag, name}; | ||
45 | - return res; | ||
46 | -} |
morfeusz/interpretations.hpp deleted
1 | -/* | ||
2 | - * File: interpretation.hpp | ||
3 | - * Author: mlenart | ||
4 | - * | ||
5 | - * Created on November 4, 2013, 3:11 PM | ||
6 | - */ | ||
7 | - | ||
8 | -#ifndef INTERPRETATION_HPP | ||
9 | -#define INTERPRETATION_HPP | ||
10 | - | ||
11 | -#include <string> | ||
12 | -#include <sstream> | ||
13 | -#include <iterator> | ||
14 | -#include "Tagset.hpp" | ||
15 | - | ||
16 | -using namespace std; | ||
17 | - | ||
18 | -struct EncodedLemma { | ||
19 | - int suffixToCut; | ||
20 | - string suffixToAdd; | ||
21 | -}; | ||
22 | - | ||
23 | -/* | ||
24 | - * Internal representation of an interpretation - with lemma encoded | ||
25 | - */ | ||
26 | -struct EncodedInterpretation { | ||
27 | - EncodedLemma lemma; | ||
28 | - int tag; | ||
29 | - int nameClassifier; | ||
30 | -}; | ||
31 | - | ||
32 | -class MorphInterpretation { | ||
33 | -public: | ||
34 | - MorphInterpretation( | ||
35 | - int startNode, | ||
36 | - int endNode, | ||
37 | - const std::string& orth, | ||
38 | - const EncodedInterpretation& encodedInterp); | ||
39 | - const std::string& getOrth() const; | ||
40 | - const std::string& getLemma() const; | ||
41 | - int getTagnum() const; | ||
42 | - int getNamenum() const; | ||
43 | - const std::string& getTag(const Tagset& tagset) const; | ||
44 | - const std::string& getName(const Tagset& tagset) const; | ||
45 | -private: | ||
46 | - int startNode; | ||
47 | - int endNode; | ||
48 | - std::string orth; | ||
49 | - std::string lemma; | ||
50 | - int tagnum; | ||
51 | - int namenum; | ||
52 | -}; | ||
53 | - | ||
54 | -// ALL BELOW IS DEPRECATED | ||
55 | - | ||
56 | -/* | ||
57 | - * Interpretation with tags as integers (need a Tagset object to decode them) | ||
58 | - */ | ||
59 | -struct RawInterpretation { | ||
60 | - string lemma; | ||
61 | - int tagnum; | ||
62 | - int namenum; | ||
63 | -}; | ||
64 | - | ||
65 | -/* | ||
66 | - * Interpretation with tags as strings (already processed with a Tagset object) | ||
67 | - */ | ||
68 | -struct TaggedInterpretation { | ||
69 | - std::string lemma; | ||
70 | - const std::string& tag; // np. subst:sg:nom:m1 | ||
71 | - const std::string& name; // np. "pospolita" | ||
72 | - std::string toString() const; | ||
73 | -}; | ||
74 | - | ||
75 | -template <class InterpType> | ||
76 | -class InterpretationsDecoder { | ||
77 | -public: | ||
78 | -// explicit InterpretationsDecoder(const Tagset& tagset); | ||
79 | - | ||
80 | - virtual InterpType getInterpretation( | ||
81 | - const std::string& orth, | ||
82 | - const EncodedInterpretation& interp) const = 0; | ||
83 | - | ||
84 | -protected: | ||
85 | - std::string convertLemma(const std::string& orth, const EncodedLemma& interp) const; | ||
86 | -}; | ||
87 | - | ||
88 | -class TaggedInterpretationsDecoder: public InterpretationsDecoder<TaggedInterpretation> { | ||
89 | -public: | ||
90 | - explicit TaggedInterpretationsDecoder(const Tagset& tagset); | ||
91 | - | ||
92 | - TaggedInterpretation getInterpretation( | ||
93 | - const std::string& orth, | ||
94 | - const EncodedInterpretation& interp) const; | ||
95 | -private: | ||
96 | - Tagset tagset; | ||
97 | -}; | ||
98 | - | ||
99 | -class RawInterpretationsDecoder: public InterpretationsDecoder<RawInterpretation> { | ||
100 | -public: | ||
101 | - RawInterpretationsDecoder(); | ||
102 | - | ||
103 | - RawInterpretation getInterpretation( | ||
104 | - const std::string& orth, | ||
105 | - const EncodedInterpretation& interp) const; | ||
106 | -}; | ||
107 | - | ||
108 | -#endif /* INTERPRETATION_HPP */ | ||
109 | - |
morfeusz/test_morph.cpp
@@ -9,10 +9,11 @@ | @@ -9,10 +9,11 @@ | ||
9 | #include <sstream> | 9 | #include <sstream> |
10 | #include <iostream> | 10 | #include <iostream> |
11 | #include "fsa.hpp" | 11 | #include "fsa.hpp" |
12 | -#include "interpretations.hpp" | 12 | +#include "EncodedInterpretation.hpp" |
13 | #include "utils.hpp" | 13 | #include "utils.hpp" |
14 | #include "MorphDeserializer.hpp" | 14 | #include "MorphDeserializer.hpp" |
15 | #include "Morfeusz.hpp" | 15 | #include "Morfeusz.hpp" |
16 | +#include "MorphInterpretation.hpp" | ||
16 | 17 | ||
17 | using namespace std; | 18 | using namespace std; |
18 | 19 | ||
@@ -27,13 +28,14 @@ void debug(const string& key, const vector<EncodedInterpretation> value) { | @@ -27,13 +28,14 @@ void debug(const string& key, const vector<EncodedInterpretation> value) { | ||
27 | cerr << "==================" << endl; | 28 | cerr << "==================" << endl; |
28 | } | 29 | } |
29 | 30 | ||
30 | -void debug(const string& key, const TaggedInterpretation& value) { | ||
31 | - cerr << key << '\t' << value.toString() << endl; | ||
32 | -} | 31 | +//void debug(const string& key, const TaggedInterpretation& value) { |
32 | +// cerr << key << '\t' << value.toString() << endl; | ||
33 | +//} | ||
33 | 34 | ||
34 | void doTest( | 35 | void doTest( |
35 | - const FSA<vector<EncodedInterpretation>>& fsa, | ||
36 | - const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, | 36 | + const FSA<vector<EncodedInterpretation>>& fsa, |
37 | + const Tagset& tagset, | ||
38 | +// const InterpretationsDecoder<TaggedInterpretation>& interpsConverter, | ||
37 | const char* fname) { | 39 | const char* fname) { |
38 | ifstream ifs; | 40 | ifstream ifs; |
39 | // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | 41 | // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); |
@@ -41,28 +43,29 @@ void doTest( | @@ -41,28 +43,29 @@ void doTest( | ||
41 | string line; | 43 | string line; |
42 | while (getline(ifs, line)) { | 44 | while (getline(ifs, line)) { |
43 | vector<string> splitVector(split(line, '\t')); | 45 | vector<string> splitVector(split(line, '\t')); |
44 | - string key = splitVector[0]; | 46 | + string orth = splitVector[0]; |
45 | string lemma = splitVector[1]; | 47 | string lemma = splitVector[1]; |
46 | string tag = splitVector[2]; | 48 | string tag = splitVector[2]; |
47 | string name = splitVector[3]; | 49 | string name = splitVector[3]; |
48 | vector<EncodedInterpretation> value2; | 50 | vector<EncodedInterpretation> value2; |
49 | - fsa.tryToRecognize(key.c_str(), value2); | 51 | + fsa.tryToRecognize(orth.c_str(), value2); |
50 | DEBUG("recognized "+to_string(value2.size())); | 52 | DEBUG("recognized "+to_string(value2.size())); |
51 | - vector<TaggedInterpretation> parsedValues; | 53 | +// vector<TaggedInterpretation> parsedValues; |
52 | bool found = false; | 54 | bool found = false; |
53 | - for (EncodedInterpretation interp: value2) { | ||
54 | - TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); | 55 | + for (EncodedInterpretation encodedInterp: value2) { |
56 | +// TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); | ||
57 | + MorphInterpretation interp(0, 0, orth, encodedInterp, tagset); | ||
55 | // parsedValues.push_back(parsedValue); | 58 | // parsedValues.push_back(parsedValue); |
56 | - debug(key, parsedValue); | ||
57 | - if (lemma == parsedValue.lemma && tag == parsedValue.tag && name == parsedValue.name) { | 59 | +// debug(orth, parsedValue); |
60 | + if (lemma == interp.getLemma() && tag == interp.getTag() && name == interp.getName()) { | ||
58 | DEBUG("RECOGNIZED"); | 61 | DEBUG("RECOGNIZED"); |
59 | found = true; | 62 | found = true; |
60 | } | 63 | } |
61 | else { | 64 | else { |
62 | - DEBUG("not matching "+parsedValue.lemma+ " " + parsedValue.tag + " " + parsedValue.name); | 65 | + DEBUG("not matching "+interp.getLemma()+ " " + interp.getTag() + " " + interp.getName()); |
63 | } | 66 | } |
64 | } | 67 | } |
65 | - validate(found, "Failed to recognize " + key + " " + lemma + ":" + tag + ":" + name); | 68 | + validate(found, "Failed to recognize " + orth + " " + lemma + ":" + tag + ":" + name); |
66 | // debug(key, value2); | 69 | // debug(key, value2); |
67 | // validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); | 70 | // validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); |
68 | } | 71 | } |
@@ -79,10 +82,10 @@ int main(int argc, char** argv) { | @@ -79,10 +82,10 @@ int main(int argc, char** argv) { | ||
79 | DEBUG("DONE read FSA"); | 82 | DEBUG("DONE read FSA"); |
80 | DEBUG("will read tagset"); | 83 | DEBUG("will read tagset"); |
81 | Tagset tagset(fsaData); | 84 | Tagset tagset(fsaData); |
82 | - TaggedInterpretationsDecoder interpsDecoder(tagset); | 85 | +// TaggedInterpretationsDecoder interpsDecoder(tagset); |
83 | DEBUG("DONE read tagset"); | 86 | DEBUG("DONE read tagset"); |
84 | DEBUG("still alive"); | 87 | DEBUG("still alive"); |
85 | - doTest(*fsa, interpsDecoder, argv[2]); | 88 | + doTest(*fsa, tagset, argv[2]); |
86 | delete fsa; | 89 | delete fsa; |
87 | return 0; | 90 | return 0; |
88 | } | 91 | } |
nbproject/configurations.xml
@@ -11,8 +11,9 @@ | @@ -11,8 +11,9 @@ | ||
11 | <in>Morfeusz.cpp</in> | 11 | <in>Morfeusz.cpp</in> |
12 | <in>Morfeusz.hpp</in> | 12 | <in>Morfeusz.hpp</in> |
13 | <in>MorphDeserializer.cpp</in> | 13 | <in>MorphDeserializer.cpp</in> |
14 | + <in>MorphInterpretation.cpp</in> | ||
15 | + <in>MorphInterpretation.hpp</in> | ||
14 | <in>Tagset.cpp</in> | 16 | <in>Tagset.cpp</in> |
15 | - <in>interpretations.cpp</in> | ||
16 | <in>main.cpp</in> | 17 | <in>main.cpp</in> |
17 | <in>morfeusz.cpp</in> | 18 | <in>morfeusz.cpp</in> |
18 | <in>test_morph.cpp</in> | 19 | <in>test_morph.cpp</in> |
@@ -93,11 +94,11 @@ | @@ -93,11 +94,11 @@ | ||
93 | <ccTool> | 94 | <ccTool> |
94 | </ccTool> | 95 | </ccTool> |
95 | </item> | 96 | </item> |
96 | - <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | ||
97 | - <ccTool> | ||
98 | - </ccTool> | 97 | + <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="0"> |
99 | </item> | 98 | </item> |
100 | - <item path="morfeusz/interpretations.cpp" ex="false" tool="1" flavor2="8"> | 99 | + <item path="morfeusz/MorphInterpretation.hpp" ex="false" tool="3" flavor2="0"> |
100 | + </item> | ||
101 | + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | ||
101 | <ccTool> | 102 | <ccTool> |
102 | </ccTool> | 103 | </ccTool> |
103 | </item> | 104 | </item> |