Commit e5220b907d711d35e0832ea2ca0734a2be1e0711

Authored by Michał Lenart
1 parent 3cc7bcb1

- podpięcie pierwszego malutkiego testu analizy morfologicznej jednego słowa

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@22 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsa/fsa_impl.hpp
@@ -18,7 +18,7 @@ @@ -18,7 +18,7 @@
18 #include "utils.hpp" 18 #include "utils.hpp"
19 #include "const.hpp" 19 #include "const.hpp"
20 20
21 -using namespace std; 21 +//using namespace std;
22 //static const unsigned int FSA_OFFSET = 6; 22 //static const unsigned int FSA_OFFSET = 6;
23 23
24 template <class T> 24 template <class T>
@@ -70,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial @@ -70,7 +70,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
70 70
71 uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET); 71 uint8_t versionNum = *(ptr + VERSION_NUM_OFFSET);
72 if (versionNum != VERSION_NUM) { 72 if (versionNum != VERSION_NUM) {
73 - throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); 73 + throw FSAException(string("Invalid version number: ") + std::to_string(versionNum) + ", should be: " + std::to_string(VERSION_NUM));
74 } 74 }
75 75
76 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); 76 uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET);
@@ -85,7 +85,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial @@ -85,7 +85,7 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial
85 case 2: 85 case 2:
86 return new CompressedFSA2<T>(startPtr, deserializer); 86 return new CompressedFSA2<T>(startPtr, deserializer);
87 default: 87 default:
88 - throw FSAException(string("Invalid implementation number: ") + to_string(versionNum) + ", should be: " + to_string(VERSION_NUM)); 88 + throw FSAException(string("Invalid implementation number: ") + std::to_string(versionNum) + ", should be: " + std::to_string(VERSION_NUM));
89 } 89 }
90 } 90 }
91 91
morfeusz/CMakeLists.txt
@@ -8,10 +8,12 @@ add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) @@ -8,10 +8,12 @@ add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 add_executable (morfeusz2_analyze main.cpp) 8 add_executable (morfeusz2_analyze main.cpp)
9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) 9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp)
10 add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) 10 add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp)
  11 +add_executable (test_simple test_simple.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp)
11 12
12 # Link the executable to the Hello library. 13 # Link the executable to the Hello library.
13 target_link_libraries (morfeusz2_analyze morfeusz2) 14 target_link_libraries (morfeusz2_analyze morfeusz2)
14 set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) 15 set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" )
15 16
16 set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) 17 set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
17 -set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )  
18 \ No newline at end of file 18 \ No newline at end of file
  19 +set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
  20 +set_target_properties ( test_simple PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
19 \ No newline at end of file 21 \ No newline at end of file
morfeusz/FlexionGraph.cpp
@@ -30,7 +30,7 @@ void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { @@ -30,7 +30,7 @@ void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
30 30
31 void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { 31 void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) {
32 int endNode = graph.size(); 32 int endNode = graph.size();
33 - for (int i = 0; i < graph.size(); i++) { 33 + for (unsigned int i = 0; i < graph.size(); i++) {
34 vector<Edge>& edges = graph[i]; 34 vector<Edge>& edges = graph[i];
35 for (Edge& e: edges) { 35 for (Edge& e: edges) {
36 int realStartNode = i + this->startNode; 36 int realStartNode = i + this->startNode;
morfeusz/InterpretedChunk.hpp
@@ -13,7 +13,7 @@ @@ -13,7 +13,7 @@
13 struct InterpretedChunk { 13 struct InterpretedChunk {
14 const char* chunk; 14 const char* chunk;
15 long chunkLength; 15 long chunkLength;
16 - InterpsGroup& interpsGroup; 16 + InterpsGroup interpsGroup;
17 }; 17 };
18 18
19 #endif /* INTERPRETEDCHUNK_HPP */ 19 #endif /* INTERPRETEDCHUNK_HPP */
morfeusz/Morfeusz.cpp
@@ -6,6 +6,7 @@ @@ -6,6 +6,7 @@
6 */ 6 */
7 7
8 #include <string> 8 #include <string>
  9 +#include <iostream>
9 #include "fsa.hpp" 10 #include "fsa.hpp"
10 #include "utils.hpp" 11 #include "utils.hpp"
11 #include "Morfeusz.hpp" 12 #include "Morfeusz.hpp"
@@ -18,18 +19,28 @@ @@ -18,18 +19,28 @@
18 using namespace std; 19 using namespace std;
19 20
20 static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) { 21 static FSA<vector<InterpsGroup >> *initializeFSA(const string& filename) {
  22 + cerr << "initialize FSA" << endl;
21 static Deserializer < vector < InterpsGroup >> *deserializer 23 static Deserializer < vector < InterpsGroup >> *deserializer
22 = new MorphDeserializer(); 24 = new MorphDeserializer();
23 return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer); 25 return FSA < vector < InterpsGroup >> ::getFSA(filename, *deserializer);
24 } 26 }
25 27
26 static CharsetConverter* initializeCharsetConverter() { 28 static CharsetConverter* initializeCharsetConverter() {
  29 + cerr << "initialize charset converter" << endl;
27 static CharsetConverter* converter = new UTF8CharsetConverter(); 30 static CharsetConverter* converter = new UTF8CharsetConverter();
28 return converter; 31 return converter;
29 } 32 }
30 33
  34 +static Tagset* initializeTagset(const string& filename) {
  35 + cerr << "initialize tagset" << endl;
  36 + static Tagset* tagset = new Tagset(readFile(filename.c_str()));
  37 + return tagset;
  38 +}
  39 +
31 Morfeusz::Morfeusz(const string& filename) 40 Morfeusz::Morfeusz(const string& filename)
32 -: fsa(initializeFSA(filename)), charsetConverter(initializeCharsetConverter()) { 41 +: fsa(initializeFSA(filename)),
  42 + charsetConverter(initializeCharsetConverter()),
  43 + tagset(initializeTagset(filename)) {
33 44
34 } 45 }
35 46
@@ -47,7 +58,7 @@ void Morfeusz::processOneWord( @@ -47,7 +58,7 @@ void Morfeusz::processOneWord(
47 FlexionGraph graph(startNodeNum); 58 FlexionGraph graph(startNodeNum);
48 const char* currInput = inputData; 59 const char* currInput = inputData;
49 doProcessOneWord(currInput, inputEnd, accum, graph); 60 doProcessOneWord(currInput, inputEnd, accum, graph);
50 - graph.appendToResults(this->tagset, results); 61 + graph.appendToResults(*this->tagset, results);
51 inputData = currInput; 62 inputData = currInput;
52 } 63 }
53 64
@@ -88,12 +99,16 @@ void Morfeusz::feedState( @@ -88,12 +99,16 @@ void Morfeusz::feedState(
88 } 99 }
89 } 100 }
90 101
91 -ResultsIterator Morfeusz::analyze(const std::string& text) { 102 +ResultsIterator Morfeusz::analyze(const string& text) {
92 // const char* textStart = text.c_str(); 103 // const char* textStart = text.c_str();
93 // const char* textEnd = text.c_str() + text.length(); 104 // const char* textEnd = text.c_str() + text.length();
94 return ResultsIterator(text, *this); 105 return ResultsIterator(text, *this);
95 } 106 }
96 107
  108 +void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) {
  109 +
  110 +}
  111 +
97 ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) 112 ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz)
98 : rawInput(text.c_str()), 113 : rawInput(text.c_str()),
99 morfeusz(morfeusz) { 114 morfeusz(morfeusz) {
morfeusz/Morfeusz.hpp
@@ -32,15 +32,17 @@ public: @@ -32,15 +32,17 @@ public:
32 virtual ~Morfeusz(); 32 virtual ~Morfeusz();
33 // Morfeusz(const Morfeusz& orig); 33 // Morfeusz(const Morfeusz& orig);
34 ResultsIterator analyze(const std::string& text); 34 ResultsIterator analyze(const std::string& text);
35 -  
36 -// Morfeusz();  
37 - friend class ResultsIterator;  
38 -private: 35 + void analyze(const std::string& text, std::vector<MorphInterpretation>& result);
  36 +
39 void processOneWord( 37 void processOneWord(
40 const char*& inputData, 38 const char*& inputData,
41 const char* inputEnd, 39 const char* inputEnd,
42 const int startNodeNum, 40 const int startNodeNum,
43 std::vector<MorphInterpretation>& result) const; 41 std::vector<MorphInterpretation>& result) const;
  42 +
  43 +// Morfeusz();
  44 + friend class ResultsIterator;
  45 +private:
44 46
45 void doProcessOneWord( 47 void doProcessOneWord(
46 const char*& inputData, 48 const char*& inputData,
@@ -54,6 +56,7 @@ private: @@ -54,6 +56,7 @@ private:
54 56
55 FSAType* fsa; 57 FSAType* fsa;
56 CharsetConverter* charsetConverter; 58 CharsetConverter* charsetConverter;
  59 + Tagset* tagset;
57 }; 60 };
58 61
59 class ResultsIterator { 62 class ResultsIterator {
morfeusz/test_simple.cpp 0 → 100644
  1 +/*
  2 + * File: test_simple.cpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on November 18, 2013, 10:30 PM
  6 + */
  7 +
  8 +#include <cstdlib>
  9 +
  10 +#include "Morfeusz.hpp"
  11 +#include "MorphInterpretation.hpp"
  12 +
  13 +using namespace std;
  14 +
  15 +/*
  16 + *
  17 + */
  18 +int main(int argc, char** argv) {
  19 + Morfeusz morfeusz(argv[1]);
  20 + vector<MorphInterpretation> res;
  21 + string word = "mijałem";
  22 + const char* ptr = word.c_str();
  23 + morfeusz.processOneWord(ptr, word.c_str() + word.size(), 0, res);
  24 + for (MorphInterpretation& mi: res) {
  25 + cerr << mi.getStartNode() << " " << mi.getEndNode() << " " << mi.getLemma() << " " << mi.getTag() << " " << mi.getName() << endl;
  26 + }
  27 + return 0;
  28 +}
  29 +
nbproject/configurations.xml
@@ -11,12 +11,6 @@ @@ -11,12 +11,6 @@
11 <df name="charset"> 11 <df name="charset">
12 <in>CharsetConverter.cpp</in> 12 <in>CharsetConverter.cpp</in>
13 </df> 13 </df>
14 - <df name="encoding">  
15 - <in>CharsetConverter.cpp</in>  
16 - </df>  
17 - <df name="flexion">  
18 - <in>FlexionGraph.cpp</in>  
19 - </df>  
20 <in>FlexionGraph.cpp</in> 14 <in>FlexionGraph.cpp</in>
21 <in>Morfeusz.cpp</in> 15 <in>Morfeusz.cpp</in>
22 <in>MorphDeserializer.cpp</in> 16 <in>MorphDeserializer.cpp</in>
@@ -24,6 +18,7 @@ @@ -24,6 +18,7 @@
24 <in>Tagset.cpp</in> 18 <in>Tagset.cpp</in>
25 <in>main.cpp</in> 19 <in>main.cpp</in>
26 <in>morfeusz.cpp</in> 20 <in>morfeusz.cpp</in>
  21 + <in>test_morfeusz.cpp</in>
27 <in>test_morph.cpp</in> 22 <in>test_morph.cpp</in>
28 </df> 23 </df>
29 <logicalFolder name="ExternalFiles" 24 <logicalFolder name="ExternalFiles"
@@ -57,7 +52,7 @@ @@ -57,7 +52,7 @@
57 <executablePath>build/fsa/test_dict</executablePath> 52 <executablePath>build/fsa/test_dict</executablePath>
58 </makeTool> 53 </makeTool>
59 </makefileType> 54 </makefileType>
60 - <folder path="1/charset"> 55 + <folder path="1">
61 <ccTool> 56 <ccTool>
62 <incDir> 57 <incDir>
63 <pElem>fsa</pElem> 58 <pElem>fsa</pElem>
@@ -96,42 +91,22 @@ @@ -96,42 +91,22 @@
96 </item> 91 </item>
97 <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8"> 92 <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8">
98 <ccTool> 93 <ccTool>
99 - <incDir>  
100 - <pElem>fsa</pElem>  
101 - <pElem>build/morfeusz</pElem>  
102 - </incDir>  
103 </ccTool> 94 </ccTool>
104 </item> 95 </item>
105 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> 96 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8">
106 <ccTool> 97 <ccTool>
107 - <incDir>  
108 - <pElem>fsa</pElem>  
109 - <pElem>build/morfeusz</pElem>  
110 - </incDir>  
111 </ccTool> 98 </ccTool>
112 </item> 99 </item>
113 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> 100 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
114 <ccTool> 101 <ccTool>
115 - <incDir>  
116 - <pElem>fsa</pElem>  
117 - <pElem>build/morfeusz</pElem>  
118 - </incDir>  
119 </ccTool> 102 </ccTool>
120 </item> 103 </item>
121 <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> 104 <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8">
122 <ccTool> 105 <ccTool>
123 - <incDir>  
124 - <pElem>fsa</pElem>  
125 - <pElem>build/morfeusz</pElem>  
126 - </incDir>  
127 </ccTool> 106 </ccTool>
128 </item> 107 </item>
129 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> 108 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
130 <ccTool> 109 <ccTool>
131 - <incDir>  
132 - <pElem>fsa</pElem>  
133 - <pElem>build/morfeusz</pElem>  
134 - </incDir>  
135 </ccTool> 110 </ccTool>
136 </item> 111 </item>
137 <item path="morfeusz/charset/CharsetConverter.cpp" 112 <item path="morfeusz/charset/CharsetConverter.cpp"
@@ -141,33 +116,23 @@ @@ -141,33 +116,23 @@
141 <ccTool> 116 <ccTool>
142 </ccTool> 117 </ccTool>
143 </item> 118 </item>
144 - <item path="morfeusz/encoding/CharsetConverter.cpp"  
145 - ex="false"  
146 - tool="1"  
147 - flavor2="4"> 119 + <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
148 <ccTool> 120 <ccTool>
149 </ccTool> 121 </ccTool>
150 </item> 122 </item>
151 - <item path="morfeusz/flexion/FlexionGraph.cpp" ex="false" tool="1" flavor2="4"> 123 + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
152 <ccTool> 124 <ccTool>
153 </ccTool> 125 </ccTool>
154 </item> 126 </item>
155 - <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> 127 + <item path="morfeusz/test_morfeusz.cpp" ex="false" tool="1" flavor2="4">
156 <ccTool> 128 <ccTool>
157 - <incDir>  
158 - <pElem>fsa</pElem>  
159 - <pElem>build/morfeusz</pElem>  
160 - </incDir>  
161 </ccTool> 129 </ccTool>
162 </item> 130 </item>
163 - <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> 131 + <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
164 <ccTool> 132 <ccTool>
165 - <incDir>  
166 - <pElem>morfeusz</pElem>  
167 - </incDir>  
168 </ccTool> 133 </ccTool>
169 </item> 134 </item>
170 - <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> 135 + <item path="morfeusz/test_simple.cpp" ex="false" tool="1" flavor2="0">
171 <ccTool> 136 <ccTool>
172 <incDir> 137 <incDir>
173 <pElem>fsa</pElem> 138 <pElem>fsa</pElem>