Commit e5220b907d711d35e0832ea2ca0734a2be1e0711
1 parent
3cc7bcb1
- podpięcie pierwszego malutkiego testu analizy morfologicznej jednego słowa
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@22 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
8 changed files
with
69 additions
and
55 deletions
fsa/fsa_impl.hpp
@@ -18,7 +18,7 @@ | @@ -18,7 +18,7 @@ | ||
18 | #include "utils.hpp" | 18 | #include "utils.hpp" |
19 | #include "const.hpp" | 19 | #include "const.hpp" |
20 | 20 | ||
21 | -using namespace std; | 21 | +//using namespace std; |
22 | //static const unsigned int FSA_OFFSET = 6; | 22 | //static const unsigned int FSA_OFFSET = 6; |
23 | 23 | ||
24 | template <class T> | 24 | template <class T> |
@@ -70,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | @@ -70,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | ||
70 | 70 | ||
71 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); | 71 | uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); |
72 | if (versionNum != VERSION_NUM) { | 72 | if (versionNum != VERSION_NUM) { |
73 | - throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); | 73 | + throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + std::to_string(VERSION_NUM)); |
74 | } | 74 | } |
75 | 75 | ||
76 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); | 76 | uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); |
@@ -85,7 +85,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | @@ -85,7 +85,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial | ||
85 | case 2: | 85 | case 2: |
86 | return new CompressedFSA2<T>(startPtr, deserializer); | 86 | return new CompressedFSA2<T>(startPtr, deserializer); |
87 | default: | 87 | default: |
88 | - throw FSAException(string("Invalid implementation number: ") + to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); | 88 | + throw FSAException(string("Invalid implementation number: ") + std::to_string(versionNum) + ", should be: " + std::to_string(VERSION_NUM)); |
89 | } | 89 | } |
90 | } | 90 | } |
91 | 91 |
morfeusz/CMakeLists.txt
@@ -8,10 +8,12 @@ add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | @@ -8,10 +8,12 @@ add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | ||
8 | add_executable (morfeusz2_analyze main.cpp) | 8 | add_executable (morfeusz2_analyze main.cpp) |
9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) | 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) | 10 | add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) |
11 | +add_executable (test_simple test_simple.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) | ||
11 | 12 | ||
12 | # Link the executable to the Hello library. | 13 | # Link the executable to the Hello library. |
13 | target_link_libraries (morfeusz2_analyze morfeusz2) | 14 | target_link_libraries (morfeusz2_analyze morfeusz2) |
14 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) | 15 | set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) |
15 | 16 | ||
16 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | 17 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
17 | -set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | ||
18 | \ No newline at end of file | 18 | \ No newline at end of file |
19 | +set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | ||
20 | +set_target_properties ( test_simple PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | ||
19 | \ No newline at end of file | 21 | \ No newline at end of file |
morfeusz/FlexionGraph.cpp
@@ -30,7 +30,7 @@ void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | @@ -30,7 +30,7 @@ void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | ||
30 | 30 | ||
31 | void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { | 31 | void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { |
32 | int endNode = graph.size(); | 32 | int endNode = graph.size(); |
33 | - for (int i = 0; i < graph.size(); i++) { | 33 | + for (unsigned int i = 0; i < graph.size(); i++) { |
34 | vector<Edge>& edges = graph[i]; | 34 | vector<Edge>& edges = graph[i]; |
35 | for (Edge& e: edges) { | 35 | for (Edge& e: edges) { |
36 | int realStartNode = i + this->startNode; | 36 | int realStartNode = i + this->startNode; |
morfeusz/InterpretedChunk.hpp
@@ -13,7 +13,7 @@ | @@ -13,7 +13,7 @@ | ||
13 | struct InterpretedChunk { | 13 | struct InterpretedChunk { |
14 | const char* chunk; | 14 | const char* chunk; |
15 | long chunkLength; | 15 | long chunkLength; |
16 | - InterpsGroup& interpsGroup; | 16 | + InterpsGroup interpsGroup; |
17 | }; | 17 | }; |
18 | 18 | ||
19 | #endif /* INTERPRETEDCHUNK_HPP */ | 19 | #endif /* INTERPRETEDCHUNK_HPP */ |
morfeusz/Morfeusz.cpp
@@ -6,6 +6,7 @@ | @@ -6,6 +6,7 @@ | ||
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <string> | 8 | #include <string> |
9 | +#include <iostream> | ||
9 | #include "fsa.hpp" | 10 | #include "fsa.hpp" |
10 | #include "utils.hpp" | 11 | #include "utils.hpp" |
11 | #include "Morfeusz.hpp" | 12 | #include "Morfeusz.hpp" |
@@ -18,18 +19,28 @@ | @@ -18,18 +19,28 @@ | ||
18 | using namespace std; | 19 | using namespace std; |
19 | 20 | ||
20 | static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { | 21 | static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { |
22 | + cerr << "initialize FSA" << endl; | ||
21 | static Deserializer < vector < InterpsGroup >> *deserializer | 23 | static Deserializer < vector < InterpsGroup >> *deserializer |
22 | = new MorphDeserializer(); | 24 | = new MorphDeserializer(); |
23 | return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); | 25 | return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); |
24 | } | 26 | } |
25 | 27 | ||
26 | static CharsetConverter* initializeCharsetConverter() { | 28 | static CharsetConverter* initializeCharsetConverter() { |
29 | + cerr << "initialize charset converter" << endl; | ||
27 | static CharsetConverter* converter = new UTF8CharsetConverter(); | 30 | static CharsetConverter* converter = new UTF8CharsetConverter(); |
28 | return converter; | 31 | return converter; |
29 | } | 32 | } |
30 | 33 | ||
34 | +static Tagset* initializeTagset(const string& filename) { | ||
35 | + cerr << "initialize tagset" << endl; | ||
36 | + static Tagset* tagset = new Tagset(readFile(filename.c_str())); | ||
37 | + return tagset; | ||
38 | +} | ||
39 | + | ||
31 | Morfeusz::Morfeusz(const string& filename) | 40 | Morfeusz::Morfeusz(const string& filename) |
32 | -: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { | 41 | +: fsa(initializeFSA(filename)), |
42 | + charsetConverter(initializeCharsetConverter()), | ||
43 | + tagset(initializeTagset(filename)) { | ||
33 | 44 | ||
34 | } | 45 | } |
35 | 46 | ||
@@ -47,7 +58,7 @@ void Morfeusz::processOneWord( | @@ -47,7 +58,7 @@ void Morfeusz::processOneWord( | ||
47 | FlexionGraph graph(startNodeNum); | 58 | FlexionGraph graph(startNodeNum); |
48 | const char* currInput = inputData; | 59 | const char* currInput = inputData; |
49 | doProcessOneWord(currInput, inputEnd, accum, graph); | 60 | doProcessOneWord(currInput, inputEnd, accum, graph); |
50 | - graph.appendToResults(this->tagset, results); | 61 | + graph.appendToResults(*this->tagset, results); |
51 | inputData = currInput; | 62 | inputData = currInput; |
52 | } | 63 | } |
53 | 64 | ||
@@ -88,12 +99,16 @@ void Morfeusz::feedState( | @@ -88,12 +99,16 @@ void Morfeusz::feedState( | ||
88 | } | 99 | } |
89 | } | 100 | } |
90 | 101 | ||
91 | -ResultsIterator Morfeusz::analyze(const std::string& text) { | 102 | +ResultsIterator Morfeusz::analyze(const string& text) { |
92 | // const char* textStart = text.c_str(); | 103 | // const char* textStart = text.c_str(); |
93 | // const char* textEnd = text.c_str() + text.length(); | 104 | // const char* textEnd = text.c_str() + text.length(); |
94 | return ResultsIterator(text, *this); | 105 | return ResultsIterator(text, *this); |
95 | } | 106 | } |
96 | 107 | ||
108 | +void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) { | ||
109 | + | ||
110 | +} | ||
111 | + | ||
97 | ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) | 112 | ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) |
98 | : rawInput(text.c_str()), | 113 | : rawInput(text.c_str()), |
99 | morfeusz(morfeusz) { | 114 | morfeusz(morfeusz) { |
morfeusz/Morfeusz.hpp
@@ -32,15 +32,17 @@ public: | @@ -32,15 +32,17 @@ public: | ||
32 | virtual ~Morfeusz(); | 32 | virtual ~Morfeusz(); |
33 | // Morfeusz(const Morfeusz& orig); | 33 | // Morfeusz(const Morfeusz& orig); |
34 | ResultsIterator analyze(const std::string& text); | 34 | ResultsIterator analyze(const std::string& text); |
35 | - | ||
36 | -// Morfeusz(); | ||
37 | - friend class ResultsIterator; | ||
38 | -private: | 35 | + void analyze(const std::string& text, std::vector<MorphInterpretation>& result); |
36 | + | ||
39 | void processOneWord( | 37 | void processOneWord( |
40 | const char*& inputData, | 38 | const char*& inputData, |
41 | const char* inputEnd, | 39 | const char* inputEnd, |
42 | const int startNodeNum, | 40 | const int startNodeNum, |
43 | std::vector<MorphInterpretation>& result) const; | 41 | std::vector<MorphInterpretation>& result) const; |
42 | + | ||
43 | +// Morfeusz(); | ||
44 | + friend class ResultsIterator; | ||
45 | +private: | ||
44 | 46 | ||
45 | void doProcessOneWord( | 47 | void doProcessOneWord( |
46 | const char*& inputData, | 48 | const char*& inputData, |
@@ -54,6 +56,7 @@ private: | @@ -54,6 +56,7 @@ private: | ||
54 | 56 | ||
55 | FSAType* fsa; | 57 | FSAType* fsa; |
56 | CharsetConverter* charsetConverter; | 58 | CharsetConverter* charsetConverter; |
59 | + Tagset* tagset; | ||
57 | }; | 60 | }; |
58 | 61 | ||
59 | class ResultsIterator { | 62 | class ResultsIterator { |
morfeusz/test_simple.cpp
0 → 100644
1 | +/* | ||
2 | + * File: test_simple.cpp | ||
3 | + * Author: lennyn | ||
4 | + * | ||
5 | + * Created on November 18, 2013, 10:30 PM | ||
6 | + */ | ||
7 | + | ||
8 | +#include <cstdlib> | ||
9 | + | ||
10 | +#include "Morfeusz.hpp" | ||
11 | +#include "MorphInterpretation.hpp" | ||
12 | + | ||
13 | +using namespace std; | ||
14 | + | ||
15 | +/* | ||
16 | + * | ||
17 | + */ | ||
18 | +int main(int argc, char** argv) { | ||
19 | + Morfeusz morfeusz(argv[1]); | ||
20 | + vector<MorphInterpretation> res; | ||
21 | + string word = "mijałem"; | ||
22 | + const char* ptr = word.c_str(); | ||
23 | + morfeusz.processOneWord(ptr, word.c_str() + word.size(), 0, res); | ||
24 | + for (MorphInterpretation& mi: res) { | ||
25 | + cerr << mi.getStartNode() << " " << mi.getEndNode() << " " << mi.getLemma() << " " << mi.getTag() << " " << mi.getName() << endl; | ||
26 | + } | ||
27 | + return 0; | ||
28 | +} | ||
29 | + |
nbproject/configurations.xml
@@ -11,12 +11,6 @@ | @@ -11,12 +11,6 @@ | ||
11 | <df name="charset"> | 11 | <df name="charset"> |
12 | <in>CharsetConverter.cpp</in> | 12 | <in>CharsetConverter.cpp</in> |
13 | </df> | 13 | </df> |
14 | - <df name="encoding"> | ||
15 | - <in>CharsetConverter.cpp</in> | ||
16 | - </df> | ||
17 | - <df name="flexion"> | ||
18 | - <in>FlexionGraph.cpp</in> | ||
19 | - </df> | ||
20 | <in>FlexionGraph.cpp</in> | 14 | <in>FlexionGraph.cpp</in> |
21 | <in>Morfeusz.cpp</in> | 15 | <in>Morfeusz.cpp</in> |
22 | <in>MorphDeserializer.cpp</in> | 16 | <in>MorphDeserializer.cpp</in> |
@@ -24,6 +18,7 @@ | @@ -24,6 +18,7 @@ | ||
24 | <in>Tagset.cpp</in> | 18 | <in>Tagset.cpp</in> |
25 | <in>main.cpp</in> | 19 | <in>main.cpp</in> |
26 | <in>morfeusz.cpp</in> | 20 | <in>morfeusz.cpp</in> |
21 | + <in>test_morfeusz.cpp</in> | ||
27 | <in>test_morph.cpp</in> | 22 | <in>test_morph.cpp</in> |
28 | </df> | 23 | </df> |
29 | <logicalFolder name="ExternalFiles" | 24 | <logicalFolder name="ExternalFiles" |
@@ -57,7 +52,7 @@ | @@ -57,7 +52,7 @@ | ||
57 | <executablePath>build/fsa/test_dict</executablePath> | 52 | <executablePath>build/fsa/test_dict</executablePath> |
58 | </makeTool> | 53 | </makeTool> |
59 | </makefileType> | 54 | </makefileType> |
60 | - <folder path="1/charset"> | 55 | + <folder path="1"> |
61 | <ccTool> | 56 | <ccTool> |
62 | <incDir> | 57 | <incDir> |
63 | <pElem>fsa</pElem> | 58 | <pElem>fsa</pElem> |
@@ -96,42 +91,22 @@ | @@ -96,42 +91,22 @@ | ||
96 | </item> | 91 | </item> |
97 | <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8"> | 92 | <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8"> |
98 | <ccTool> | 93 | <ccTool> |
99 | - <incDir> | ||
100 | - <pElem>fsa</pElem> | ||
101 | - <pElem>build/morfeusz</pElem> | ||
102 | - </incDir> | ||
103 | </ccTool> | 94 | </ccTool> |
104 | </item> | 95 | </item> |
105 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> | 96 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> |
106 | <ccTool> | 97 | <ccTool> |
107 | - <incDir> | ||
108 | - <pElem>fsa</pElem> | ||
109 | - <pElem>build/morfeusz</pElem> | ||
110 | - </incDir> | ||
111 | </ccTool> | 98 | </ccTool> |
112 | </item> | 99 | </item> |
113 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> | 100 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> |
114 | <ccTool> | 101 | <ccTool> |
115 | - <incDir> | ||
116 | - <pElem>fsa</pElem> | ||
117 | - <pElem>build/morfeusz</pElem> | ||
118 | - </incDir> | ||
119 | </ccTool> | 102 | </ccTool> |
120 | </item> | 103 | </item> |
121 | <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> | 104 | <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> |
122 | <ccTool> | 105 | <ccTool> |
123 | - <incDir> | ||
124 | - <pElem>fsa</pElem> | ||
125 | - <pElem>build/morfeusz</pElem> | ||
126 | - </incDir> | ||
127 | </ccTool> | 106 | </ccTool> |
128 | </item> | 107 | </item> |
129 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | 108 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> |
130 | <ccTool> | 109 | <ccTool> |
131 | - <incDir> | ||
132 | - <pElem>fsa</pElem> | ||
133 | - <pElem>build/morfeusz</pElem> | ||
134 | - </incDir> | ||
135 | </ccTool> | 110 | </ccTool> |
136 | </item> | 111 | </item> |
137 | <item path="morfeusz/charset/CharsetConverter.cpp" | 112 | <item path="morfeusz/charset/CharsetConverter.cpp" |
@@ -141,33 +116,23 @@ | @@ -141,33 +116,23 @@ | ||
141 | <ccTool> | 116 | <ccTool> |
142 | </ccTool> | 117 | </ccTool> |
143 | </item> | 118 | </item> |
144 | - <item path="morfeusz/encoding/CharsetConverter.cpp" | ||
145 | - ex="false" | ||
146 | - tool="1" | ||
147 | - flavor2="4"> | 119 | + <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
148 | <ccTool> | 120 | <ccTool> |
149 | </ccTool> | 121 | </ccTool> |
150 | </item> | 122 | </item> |
151 | - <item path="morfeusz/flexion/FlexionGraph.cpp" ex="false" tool="1" flavor2="4"> | 123 | + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
152 | <ccTool> | 124 | <ccTool> |
153 | </ccTool> | 125 | </ccTool> |
154 | </item> | 126 | </item> |
155 | - <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> | 127 | + <item path="morfeusz/test_morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
156 | <ccTool> | 128 | <ccTool> |
157 | - <incDir> | ||
158 | - <pElem>fsa</pElem> | ||
159 | - <pElem>build/morfeusz</pElem> | ||
160 | - </incDir> | ||
161 | </ccTool> | 129 | </ccTool> |
162 | </item> | 130 | </item> |
163 | - <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> | 131 | + <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> |
164 | <ccTool> | 132 | <ccTool> |
165 | - <incDir> | ||
166 | - <pElem>morfeusz</pElem> | ||
167 | - </incDir> | ||
168 | </ccTool> | 133 | </ccTool> |
169 | </item> | 134 | </item> |
170 | - <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> | 135 | + <item path="morfeusz/test_simple.cpp" ex="false" tool="1" flavor2="0"> |
171 | <ccTool> | 136 | <ccTool> |
172 | <incDir> | 137 | <incDir> |
173 | <pElem>fsa</pElem> | 138 | <pElem>fsa</pElem> |