Commit 5700d3a02747423a76b75148b338f19e6912b168
1 parent
e5220b90
- podstawa analizy tekstu już działa
- obsługa ign-ów w zasadzie też

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@23 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing 17 changed files with 223 additions and 65 deletions
CMakeLists.txt
morfeusz/CMakeLists.txt
@@ -4,15 +4,14 @@ | @@ -4,15 +4,14 @@ | ||
4 | # Make sure the linker can find the Hello library once it is built. | 4 | # Make sure the linker can find the Hello library once it is built. |
5 | #link_directories (${Morfeusz_BINARY_DIR}/Hello) | 5 | #link_directories (${Morfeusz_BINARY_DIR}/Hello) |
6 | include_directories (${Morfeusz_SOURCE_DIR}/fsa) | 6 | include_directories (${Morfeusz_SOURCE_DIR}/fsa) |
7 | -add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | ||
8 | -add_executable (morfeusz2_analyze main.cpp) | 7 | +# add_executable (morfeusz2_analyze main.cpp) |
9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) | 8 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
10 | add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) | 9 | add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) |
11 | add_executable (test_simple test_simple.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) | 10 | add_executable (test_simple test_simple.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) |
12 | 11 | ||
13 | # Link the executable to the Hello library. | 12 | # Link the executable to the Hello library. |
14 | -target_link_libraries (morfeusz2_analyze morfeusz2) | ||
15 | -set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) | 13 | +#target_link_libraries (morfeusz2_analyze morfeusz2) |
14 | +#set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) | ||
16 | 15 | ||
17 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | 16 | set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
18 | set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) | 17 | set_target_properties ( test_morfeusz PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) |
morfeusz/EncodedInterpretation.hpp
@@ -31,8 +31,6 @@ struct EncodedInterpretation { | @@ -31,8 +31,6 @@ struct EncodedInterpretation { | ||
31 | int type; | 31 | int type; |
32 | int tag; | 32 | int tag; |
33 | int nameClassifier; | 33 | int nameClassifier; |
34 | - int startNode; | ||
35 | - int endNode; | ||
36 | }; | 34 | }; |
37 | 35 | ||
38 | #endif /* INTERPRETATION_HPP */ | 36 | #endif /* INTERPRETATION_HPP */ |
morfeusz/FlexionGraph.cpp
1 | 1 | ||
2 | +#include <string> | ||
3 | +#include "utils.hpp" | ||
2 | #include "FlexionGraph.hpp" | 4 | #include "FlexionGraph.hpp" |
3 | 5 | ||
4 | FlexionGraph::FlexionGraph(int startNode) | 6 | FlexionGraph::FlexionGraph(int startNode) |
@@ -6,29 +8,56 @@ FlexionGraph::FlexionGraph(int startNode) | @@ -6,29 +8,56 @@ FlexionGraph::FlexionGraph(int startNode) | ||
6 | 8 | ||
7 | } | 9 | } |
8 | 10 | ||
11 | +static inline void debugPath(const std::vector<InterpretedChunk>& path) { | ||
12 | + for (const InterpretedChunk& chunk: path) { | ||
13 | + std::string text(chunk.chunk, chunk.chunkLength); | ||
14 | + DEBUG(text); | ||
15 | + DEBUG(chunk.chunkLength); | ||
16 | + } | ||
17 | +} | ||
18 | + | ||
19 | +void FlexionGraph::addStartEdge(const Edge& e) { | ||
20 | + if (this->graph.empty()) { | ||
21 | + this->graph.push_back(vector<Edge>()); | ||
22 | + } | ||
23 | + this->graph[0].push_back(e); | ||
24 | +} | ||
25 | + | ||
26 | +void FlexionGraph::addMiddleEdge(const Edge& e) { | ||
27 | + this->graph.push_back(vector<Edge>(1, e)); | ||
28 | +} | ||
29 | + | ||
9 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | 30 | void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { |
31 | +// debugPath(path); | ||
10 | for (const InterpretedChunk& chunk: path) { | 32 | for (const InterpretedChunk& chunk: path) { |
11 | - if (&chunk == &(path.back())) { | 33 | + if (&chunk == &(path.front()) |
34 | + && &chunk == &(path.back())) { | ||
12 | Edge e = { chunk, -1 }; | 35 | Edge e = { chunk, -1 }; |
13 | - vector<Edge> v; | ||
14 | - v.push_back(e); | ||
15 | - this->graph.push_back(v); | ||
16 | -// this->graph[node].push_back(e); | 36 | + this->addStartEdge(e); |
17 | } | 37 | } |
18 | else if (&chunk == &(path.front())) { | 38 | else if (&chunk == &(path.front())) { |
19 | - Edge e = { chunk, (int) this->graph.size() }; | ||
20 | - this->graph[0].push_back(e); | 39 | + Edge e = { chunk, (int) this->graph.size() + 1 }; |
40 | + this->addStartEdge(e); | ||
41 | + } | ||
42 | + else if (&chunk == &(path.back())) { | ||
43 | + Edge e = { chunk, -1 }; | ||
44 | + this->addMiddleEdge(e); | ||
21 | } | 45 | } |
22 | else { | 46 | else { |
23 | - Edge e = { chunk, (int) this->graph.size() }; | ||
24 | - vector<Edge> v; | ||
25 | - v.push_back(e); | ||
26 | - this->graph.push_back(v); | 47 | + Edge e = { chunk, (int) this->graph.size() + 1 }; |
48 | + this->addMiddleEdge(e); | ||
27 | } | 49 | } |
28 | } | 50 | } |
29 | } | 51 | } |
30 | 52 | ||
53 | +void FlexionGraph::minimizeGraph() { | ||
54 | + if (this->graph.size() > 2) { | ||
55 | + | ||
56 | + } | ||
57 | +} | ||
58 | + | ||
31 | void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { | 59 | void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { |
60 | + this->minimizeGraph(); | ||
32 | int endNode = graph.size(); | 61 | int endNode = graph.size(); |
33 | for (unsigned int i = 0; i < graph.size(); i++) { | 62 | for (unsigned int i = 0; i < graph.size(); i++) { |
34 | vector<Edge>& edges = graph[i]; | 63 | vector<Edge>& edges = graph[i]; |
@@ -41,3 +70,7 @@ void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterp | @@ -41,3 +70,7 @@ void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterp | ||
41 | } | 70 | } |
42 | } | 71 | } |
43 | } | 72 | } |
73 | + | ||
74 | +bool FlexionGraph::empty() const { | ||
75 | + return this->graph.empty(); | ||
76 | +} |
morfeusz/FlexionGraph.hpp
@@ -16,6 +16,22 @@ struct Edge { | @@ -16,6 +16,22 @@ struct Edge { | ||
16 | int nextNode; | 16 | int nextNode; |
17 | }; | 17 | }; |
18 | 18 | ||
19 | +//struct EdgeLabel { | ||
20 | +// int type; | ||
21 | +// const char* textStart; | ||
22 | +// int textLength; | ||
23 | +// | ||
24 | +// bool operator==(const EdgeLabel &el) const { | ||
25 | +// return this->type == el.type | ||
26 | +// && this->textStart == el.textStart | ||
27 | +// && this->textLength == el.textLength; | ||
28 | +// } | ||
29 | +// | ||
30 | +// bool operator<(const coord &o) { | ||
31 | +// return x < o.x || (x == o.x && y < o.y); | ||
32 | +// } | ||
33 | +//}; | ||
34 | + | ||
19 | class FlexionGraph { | 35 | class FlexionGraph { |
20 | public: | 36 | public: |
21 | 37 | ||
@@ -25,8 +41,17 @@ public: | @@ -25,8 +41,17 @@ public: | ||
25 | 41 | ||
26 | void appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results); | 42 | void appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results); |
27 | 43 | ||
44 | + bool empty() const; | ||
45 | + | ||
28 | // virtual ~FlexionGraph(); | 46 | // virtual ~FlexionGraph(); |
29 | private: | 47 | private: |
48 | + | ||
49 | + void addStartEdge(const Edge& e); | ||
50 | + | ||
51 | + void addMiddleEdge(const Edge& e); | ||
52 | + | ||
53 | + void minimizeGraph(); | ||
54 | + | ||
30 | int startNode; | 55 | int startNode; |
31 | std::vector< std::vector<Edge> > graph; | 56 | std::vector< std::vector<Edge> > graph; |
32 | }; | 57 | }; |
morfeusz/Morfeusz.cpp
@@ -38,27 +38,37 @@ static Tagset* initializeTagset(const string& filename) { | @@ -38,27 +38,37 @@ static Tagset* initializeTagset(const string& filename) { | ||
38 | } | 38 | } |
39 | 39 | ||
40 | Morfeusz::Morfeusz(const string& filename) | 40 | Morfeusz::Morfeusz(const string& filename) |
41 | -: fsa(initializeFSA(filename)), | ||
42 | - charsetConverter(initializeCharsetConverter()), | ||
43 | - tagset(initializeTagset(filename)) { | 41 | +: fsa(initializeFSA(filename)), |
42 | +charsetConverter(initializeCharsetConverter()), | ||
43 | +tagset(initializeTagset(filename)) { | ||
44 | 44 | ||
45 | } | 45 | } |
46 | 46 | ||
47 | Morfeusz::~Morfeusz() { | 47 | Morfeusz::~Morfeusz() { |
48 | - delete &this->fsa; | ||
49 | - delete &this->charsetConverter; | 48 | + // delete &this->fsa; |
49 | + // delete &this->charsetConverter; | ||
50 | } | 50 | } |
51 | 51 | ||
52 | void Morfeusz::processOneWord( | 52 | void Morfeusz::processOneWord( |
53 | const char*& inputData, | 53 | const char*& inputData, |
54 | const char* inputEnd, | 54 | const char* inputEnd, |
55 | - const int startNodeNum, | 55 | + int startNodeNum, |
56 | std::vector<MorphInterpretation>& results) const { | 56 | std::vector<MorphInterpretation>& results) const { |
57 | + while (inputData != inputEnd | ||
58 | + && isEndOfWord(this->charsetConverter->peek(inputData, inputEnd))) { | ||
59 | + this->charsetConverter->next(inputData, inputEnd); | ||
60 | + } | ||
61 | + const char* wordStart = inputData; | ||
57 | vector<InterpretedChunk> accum; | 62 | vector<InterpretedChunk> accum; |
58 | FlexionGraph graph(startNodeNum); | 63 | FlexionGraph graph(startNodeNum); |
59 | const char* currInput = inputData; | 64 | const char* currInput = inputData; |
60 | doProcessOneWord(currInput, inputEnd, accum, graph); | 65 | doProcessOneWord(currInput, inputEnd, accum, graph); |
61 | - graph.appendToResults(*this->tagset, results); | 66 | + if (!graph.empty()) { |
67 | + graph.appendToResults(*this->tagset, results); | ||
68 | + } | ||
69 | + else if (wordStart != currInput) { | ||
70 | + this->appendIgnotiumToResults(string(wordStart, currInput), startNodeNum, results); | ||
71 | + } | ||
62 | inputData = currInput; | 72 | inputData = currInput; |
63 | } | 73 | } |
64 | 74 | ||
@@ -67,38 +77,56 @@ void Morfeusz::doProcessOneWord( | @@ -67,38 +77,56 @@ void Morfeusz::doProcessOneWord( | ||
67 | const char* inputEnd, | 77 | const char* inputEnd, |
68 | vector<InterpretedChunk>& accum, | 78 | vector<InterpretedChunk>& accum, |
69 | FlexionGraph& graph) const { | 79 | FlexionGraph& graph) const { |
80 | + bool endOfWord = inputData == inputEnd; | ||
70 | const char* currInput = inputData; | 81 | const char* currInput = inputData; |
71 | - StateType state = this->fsa->getInitialState(); | ||
72 | - int codepoint = this->charsetConverter->next(currInput, inputEnd); | 82 | + const char* prevInput = inputData; |
83 | + int codepoint = endOfWord ? 0 : this->charsetConverter->next(currInput, inputEnd); | ||
73 | 84 | ||
74 | - if (!accum.empty() && isEndOfWord(codepoint)) { | ||
75 | - graph.addPath(accum); | ||
76 | - } | ||
77 | - else | ||
78 | - while (!isEndOfWord(codepoint)) { | ||
79 | - this->feedState(state, codepoint); | ||
80 | - codepoint = this->charsetConverter->next(currInput, inputEnd); | ||
81 | - if (state.isAccepting()) { | ||
82 | - for (InterpsGroup& ig : state.getValue()) { | ||
83 | - InterpretedChunk ic = {inputData, currInput - inputData, ig}; | ||
84 | - accum.push_back(ic); | ||
85 | - doProcessOneWord(currInput, inputEnd, accum, graph); | ||
86 | - accum.pop_back(); | ||
87 | - } | 85 | + StateType state = this->fsa->getInitialState(); |
86 | + | ||
87 | + while (!isEndOfWord(codepoint)) { | ||
88 | + this->feedState(state, codepoint); | ||
89 | + if (state.isAccepting()) { | ||
90 | + for (InterpsGroup& ig : state.getValue()) { | ||
91 | + InterpretedChunk ic = {inputData, currInput - inputData, ig}; | ||
92 | + accum.push_back(ic); | ||
93 | + const char* newCurrInput = currInput; | ||
94 | + doProcessOneWord(newCurrInput, inputEnd, accum, graph); | ||
95 | + accum.pop_back(); | ||
88 | } | 96 | } |
89 | } | 97 | } |
98 | + prevInput = currInput; | ||
99 | + codepoint = currInput == inputEnd ? 0 : this->charsetConverter->next(currInput, inputEnd); | ||
100 | + } | ||
101 | + if (state.isAccepting()) { | ||
102 | + for (InterpsGroup& ig : state.getValue()) { | ||
103 | + InterpretedChunk ic = {inputData, prevInput - inputData, ig}; | ||
104 | + accum.push_back(ic); | ||
105 | + graph.addPath(accum); | ||
106 | + accum.pop_back(); | ||
107 | + } | ||
108 | + } | ||
109 | + inputData = currInput; | ||
90 | } | 110 | } |
91 | 111 | ||
92 | void Morfeusz::feedState( | 112 | void Morfeusz::feedState( |
93 | StateType& state, | 113 | StateType& state, |
94 | - const int codepoint) const { | 114 | + int codepoint) const { |
95 | vector<char> chars; | 115 | vector<char> chars; |
96 | this->charsetConverter->append(codepoint, chars); | 116 | this->charsetConverter->append(codepoint, chars); |
97 | - for (char c: chars) { | 117 | + for (char c : chars) { |
98 | state.proceedToNext(c); | 118 | state.proceedToNext(c); |
99 | } | 119 | } |
100 | } | 120 | } |
101 | 121 | ||
122 | +void Morfeusz::appendIgnotiumToResults( | ||
123 | + const string& word, | ||
124 | + int startNodeNum, | ||
125 | + std::vector<MorphInterpretation>& results) const { | ||
126 | + MorphInterpretation interp = MorphInterpretation::createIgn(startNodeNum, word, *this->tagset); | ||
127 | + results.push_back(interp); | ||
128 | +} | ||
129 | + | ||
102 | ResultsIterator Morfeusz::analyze(const string& text) { | 130 | ResultsIterator Morfeusz::analyze(const string& text) { |
103 | // const char* textStart = text.c_str(); | 131 | // const char* textStart = text.c_str(); |
104 | // const char* textEnd = text.c_str() + text.length(); | 132 | // const char* textEnd = text.c_str() + text.length(); |
@@ -106,7 +134,12 @@ ResultsIterator Morfeusz::analyze(const string& text) { | @@ -106,7 +134,12 @@ ResultsIterator Morfeusz::analyze(const string& text) { | ||
106 | } | 134 | } |
107 | 135 | ||
108 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) { | 136 | void Morfeusz::analyze(const string& text, vector<MorphInterpretation>& results) { |
109 | - | 137 | + const char* input = text.c_str(); |
138 | + const char* inputEnd = input + text.length(); | ||
139 | + while (input != inputEnd) { | ||
140 | + int startNode = results.empty() ? 0 : results.back().getEndNode(); | ||
141 | + this->processOneWord(input, inputEnd, startNode, results); | ||
142 | + } | ||
110 | } | 143 | } |
111 | 144 | ||
112 | ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) | 145 | ResultsIterator::ResultsIterator(const string& text, const Morfeusz& morfeusz) |
morfeusz/Morfeusz.hpp
1 | /* | 1 | /* |
2 | * File: Morfeusz.hpp | 2 | * File: Morfeusz.hpp |
3 | - * Author: lennyn | 3 | + * Author: mlenart |
4 | * | 4 | * |
5 | * Created on November 13, 2013, 5:21 PM | 5 | * Created on November 13, 2013, 5:21 PM |
6 | */ | 6 | */ |
@@ -37,7 +37,7 @@ public: | @@ -37,7 +37,7 @@ public: | ||
37 | void processOneWord( | 37 | void processOneWord( |
38 | const char*& inputData, | 38 | const char*& inputData, |
39 | const char* inputEnd, | 39 | const char* inputEnd, |
40 | - const int startNodeNum, | 40 | + int startNodeNum, |
41 | std::vector<MorphInterpretation>& result) const; | 41 | std::vector<MorphInterpretation>& result) const; |
42 | 42 | ||
43 | // Morfeusz(); | 43 | // Morfeusz(); |
@@ -52,7 +52,12 @@ private: | @@ -52,7 +52,12 @@ private: | ||
52 | 52 | ||
53 | void feedState( | 53 | void feedState( |
54 | StateType& state, | 54 | StateType& state, |
55 | - const int codepoint) const; | 55 | + int codepoint) const; |
56 | + | ||
57 | + void appendIgnotiumToResults( | ||
58 | + const std::string& word, | ||
59 | + int startNodeNum, | ||
60 | + std::vector<MorphInterpretation>& results) const; | ||
56 | 61 | ||
57 | FSAType* fsa; | 62 | FSAType* fsa; |
58 | CharsetConverter* charsetConverter; | 63 | CharsetConverter* charsetConverter; |
morfeusz/MorphInterpretation.cpp
@@ -39,6 +39,25 @@ MorphInterpretation::MorphInterpretation( | @@ -39,6 +39,25 @@ MorphInterpretation::MorphInterpretation( | ||
39 | 39 | ||
40 | } | 40 | } |
41 | 41 | ||
42 | +MorphInterpretation::MorphInterpretation( | ||
43 | + int startNode, | ||
44 | + const std::string& orth, | ||
45 | + const Tagset& tagset) | ||
46 | +: startNode(startNode), | ||
47 | + endNode(startNode + 1), | ||
48 | + orth(orth), | ||
49 | + lemma(orth), | ||
50 | + tagnum(0), | ||
51 | + namenum(0), | ||
52 | + tag(tagset.getTag(0)), | ||
53 | + name(tagset.getName(0)) { | ||
54 | + | ||
55 | +} | ||
56 | + | ||
57 | +MorphInterpretation MorphInterpretation::createIgn(int startNode, const std::string& orth, const Tagset& tagset) { | ||
58 | + return MorphInterpretation(startNode, orth, tagset); | ||
59 | +} | ||
60 | + | ||
42 | MorphInterpretation::~MorphInterpretation() { | 61 | MorphInterpretation::~MorphInterpretation() { |
43 | } | 62 | } |
44 | 63 |
morfeusz/MorphInterpretation.hpp
@@ -20,6 +20,7 @@ public: | @@ -20,6 +20,7 @@ public: | ||
20 | const std::string& orth, | 20 | const std::string& orth, |
21 | const EncodedInterpretation& encodedInterp, | 21 | const EncodedInterpretation& encodedInterp, |
22 | const Tagset& tagset); | 22 | const Tagset& tagset); |
23 | + static MorphInterpretation createIgn(int startNode, const std::string& orth, const Tagset& tagset); | ||
23 | virtual ~MorphInterpretation(); | 24 | virtual ~MorphInterpretation(); |
24 | int getStartNode() const; | 25 | int getStartNode() const; |
25 | int getEndNode() const; | 26 | int getEndNode() const; |
@@ -30,6 +31,10 @@ public: | @@ -30,6 +31,10 @@ public: | ||
30 | const std::string& getTag() const; | 31 | const std::string& getTag() const; |
31 | const std::string& getName() const; | 32 | const std::string& getName() const; |
32 | private: | 33 | private: |
34 | + MorphInterpretation( | ||
35 | + int startNode, | ||
36 | + const std::string& orth, | ||
37 | + const Tagset& tagset); | ||
33 | int startNode; | 38 | int startNode; |
34 | int endNode; | 39 | int endNode; |
35 | std::string orth; | 40 | std::string orth; |
morfeusz/charset/CharsetConverter.cpp
@@ -6,6 +6,10 @@ | @@ -6,6 +6,10 @@ | ||
6 | 6 | ||
7 | using namespace std; | 7 | using namespace std; |
8 | 8 | ||
9 | +uint32_t UTF8CharsetConverter::peek(const char*& it, const char* end) const { | ||
10 | + return utf8::peek_next(it, end); | ||
11 | +} | ||
12 | + | ||
9 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { | 13 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
10 | return utf8::next(it, end); | 14 | return utf8::next(it, end); |
11 | } | 15 | } |
morfeusz/charset/CharsetConverter.hpp
@@ -10,6 +10,7 @@ | @@ -10,6 +10,7 @@ | ||
10 | 10 | ||
11 | class CharsetConverter { | 11 | class CharsetConverter { |
12 | public: | 12 | public: |
13 | + virtual uint32_t peek(const char*& it, const char* end) const = 0; | ||
13 | virtual uint32_t next(const char*& it, const char* end) const = 0; | 14 | virtual uint32_t next(const char*& it, const char* end) const = 0; |
14 | virtual void append(uint32_t cp, std::vector<char>& result) const = 0; | 15 | virtual void append(uint32_t cp, std::vector<char>& result) const = 0; |
15 | private: | 16 | private: |
@@ -17,6 +18,7 @@ private: | @@ -17,6 +18,7 @@ private: | ||
17 | 18 | ||
18 | class UTF8CharsetConverter: public CharsetConverter { | 19 | class UTF8CharsetConverter: public CharsetConverter { |
19 | public: | 20 | public: |
21 | + uint32_t peek(const char*& it, const char* end) const; | ||
20 | uint32_t next(const char*& it, const char* end) const; | 22 | uint32_t next(const char*& it, const char* end) const; |
21 | void append(uint32_t cp, std::vector<char>& result) const; | 23 | void append(uint32_t cp, std::vector<char>& result) const; |
22 | private: | 24 | private: |
@@ -24,6 +26,7 @@ private: | @@ -24,6 +26,7 @@ private: | ||
24 | 26 | ||
25 | class UTF16CharsetConverter: public CharsetConverter { | 27 | class UTF16CharsetConverter: public CharsetConverter { |
26 | public: | 28 | public: |
29 | + uint32_t peek(const char*& it, const char* end) const; | ||
27 | uint32_t next(const char*& it, const char* end) const; | 30 | uint32_t next(const char*& it, const char* end) const; |
28 | void append(uint32_t cp, std::vector<char>& result) const; | 31 | void append(uint32_t cp, std::vector<char>& result) const; |
29 | private: | 32 | private: |
@@ -31,6 +34,7 @@ private: | @@ -31,6 +34,7 @@ private: | ||
31 | 34 | ||
32 | class UTF32CharsetConverter: public CharsetConverter { | 35 | class UTF32CharsetConverter: public CharsetConverter { |
33 | public: | 36 | public: |
37 | + uint32_t peek(const char*& it, const char* end) const; | ||
34 | uint32_t next(const char*& it, const char* end) const; | 38 | uint32_t next(const char*& it, const char* end) const; |
35 | void append(uint32_t cp, std::vector<char>& result) const; | 39 | void append(uint32_t cp, std::vector<char>& result) const; |
36 | private: | 40 | private: |
@@ -38,6 +42,7 @@ private: | @@ -38,6 +42,7 @@ private: | ||
38 | 42 | ||
39 | class ISO8859_2_CharsetConverter: public CharsetConverter { | 43 | class ISO8859_2_CharsetConverter: public CharsetConverter { |
40 | public: | 44 | public: |
45 | + uint32_t peek(const char*& it, const char* end) const; | ||
41 | uint32_t next(const char*& it, const char* end) const; | 46 | uint32_t next(const char*& it, const char* end) const; |
42 | void append(uint32_t cp, std::vector<char>& result) const; | 47 | void append(uint32_t cp, std::vector<char>& result) const; |
43 | private: | 48 | private: |
morfeusz/morfeusz.cpp deleted
morfeusz/morfeusz.hpp deleted
morfeusz/test_simple.cpp
@@ -7,6 +7,7 @@ | @@ -7,6 +7,7 @@ | ||
7 | 7 | ||
8 | #include <cstdlib> | 8 | #include <cstdlib> |
9 | 9 | ||
10 | +#include "utils.hpp" | ||
10 | #include "Morfeusz.hpp" | 11 | #include "Morfeusz.hpp" |
11 | #include "MorphInterpretation.hpp" | 12 | #include "MorphInterpretation.hpp" |
12 | 13 | ||
@@ -16,11 +17,11 @@ using namespace std; | @@ -16,11 +17,11 @@ using namespace std; | ||
16 | * | 17 | * |
17 | */ | 18 | */ |
18 | int main(int argc, char** argv) { | 19 | int main(int argc, char** argv) { |
19 | - Morfeusz morfeusz(argv[1]); | 20 | + Morfeusz morfeusz("/tmp/test-SIMPLE-PoliMorfSmall.tab.fsa"); |
20 | vector<MorphInterpretation> res; | 21 | vector<MorphInterpretation> res; |
21 | - string word = "mijałem"; | ||
22 | - const char* ptr = word.c_str(); | ||
23 | - morfeusz.processOneWord(ptr, word.c_str() + word.size(), 0, res); | 22 | + string word = " mijałem fasdfasd abdominalności "; |
23 | + morfeusz.analyze(word, res); | ||
24 | + DEBUG("znaleziono "+to_string(res.size())); | ||
24 | for (MorphInterpretation& mi: res) { | 25 | for (MorphInterpretation& mi: res) { |
25 | cerr << mi.getStartNode() << " " << mi.getEndNode() << " " << mi.getLemma() << " " << mi.getTag() << " " << mi.getName() << endl; | 26 | cerr << mi.getStartNode() << " " << mi.getEndNode() << " " << mi.getLemma() << " " << mi.getTag() << " " << mi.getName() << endl; |
26 | } | 27 | } |
nbproject/configurations.xml
@@ -17,9 +17,9 @@ | @@ -17,9 +17,9 @@ | ||
17 | <in>MorphInterpretation.cpp</in> | 17 | <in>MorphInterpretation.cpp</in> |
18 | <in>Tagset.cpp</in> | 18 | <in>Tagset.cpp</in> |
19 | <in>main.cpp</in> | 19 | <in>main.cpp</in> |
20 | - <in>morfeusz.cpp</in> | ||
21 | <in>test_morfeusz.cpp</in> | 20 | <in>test_morfeusz.cpp</in> |
22 | <in>test_morph.cpp</in> | 21 | <in>test_morph.cpp</in> |
22 | + <in>test_simple.cpp</in> | ||
23 | </df> | 23 | </df> |
24 | <logicalFolder name="ExternalFiles" | 24 | <logicalFolder name="ExternalFiles" |
25 | displayName="Important Files" | 25 | displayName="Important Files" |
@@ -49,7 +49,7 @@ | @@ -49,7 +49,7 @@ | ||
49 | <buildCommandWorkingDir>build</buildCommandWorkingDir> | 49 | <buildCommandWorkingDir>build</buildCommandWorkingDir> |
50 | <buildCommand>${MAKE} -f Makefile</buildCommand> | 50 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
51 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> | 51 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
52 | - <executablePath>build/fsa/test_dict</executablePath> | 52 | + <executablePath>build/morfeusz/test_simple</executablePath> |
53 | </makeTool> | 53 | </makeTool> |
54 | </makefileType> | 54 | </makefileType> |
55 | <folder path="1"> | 55 | <folder path="1"> |
@@ -120,10 +120,6 @@ | @@ -120,10 +120,6 @@ | ||
120 | <ccTool> | 120 | <ccTool> |
121 | </ccTool> | 121 | </ccTool> |
122 | </item> | 122 | </item> |
123 | - <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> | ||
124 | - <ccTool> | ||
125 | - </ccTool> | ||
126 | - </item> | ||
127 | <item path="morfeusz/test_morfeusz.cpp" ex="false" tool="1" flavor2="4"> | 123 | <item path="morfeusz/test_morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
128 | <ccTool> | 124 | <ccTool> |
129 | </ccTool> | 125 | </ccTool> |
@@ -132,12 +128,8 @@ | @@ -132,12 +128,8 @@ | ||
132 | <ccTool> | 128 | <ccTool> |
133 | </ccTool> | 129 | </ccTool> |
134 | </item> | 130 | </item> |
135 | - <item path="morfeusz/test_simple.cpp" ex="false" tool="1" flavor2="0"> | 131 | + <item path="morfeusz/test_simple.cpp" ex="false" tool="1" flavor2="8"> |
136 | <ccTool> | 132 | <ccTool> |
137 | - <incDir> | ||
138 | - <pElem>fsa</pElem> | ||
139 | - <pElem>build/morfeusz</pElem> | ||
140 | - </incDir> | ||
141 | </ccTool> | 133 | </ccTool> |
142 | </item> | 134 | </item> |
143 | </conf> | 135 | </conf> |
testfiles/PoliMorfSmall.tab
@@ -579,3 +579,43 @@ abdominoplastyki abdominoplastyka subst:pl:voc:f pospolita | @@ -579,3 +579,43 @@ abdominoplastyki abdominoplastyka subst:pl:voc:f pospolita | ||
579 | abdominoplastyki abdominoplastyka subst:sg:gen:f pospolita | 579 | abdominoplastyki abdominoplastyka subst:sg:gen:f pospolita |
580 | abdominoplastyko abdominoplastyka subst:sg:voc:f pospolita | 580 | abdominoplastyko abdominoplastyka subst:sg:voc:f pospolita |
581 | abdominoplastykom abdominoplastyka subst:pl:dat:f pospolita | 581 | abdominoplastykom abdominoplastyka subst:pl:dat:f pospolita |
582 | +mijał mijać praet:sg:m1.m2.m3:imperf pospolita | ||
583 | +mijała mijać praet:sg:f:imperf pospolita | ||
584 | +mijało mijać praet:sg:n1.n2:imperf pospolita | ||
585 | +mijały mijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
586 | +omijał omijać praet:sg:m1.m2.m3:imperf pospolita | ||
587 | +omijała omijać praet:sg:f:imperf pospolita | ||
588 | +omijało omijać praet:sg:n1.n2:imperf pospolita | ||
589 | +omijały omijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
590 | +pomijał pomijać praet:sg:m1.m2.m3:imperf pospolita | ||
591 | +pomijała pomijać praet:sg:f:imperf pospolita | ||
592 | +pomijało pomijać praet:sg:n1.n2:imperf pospolita | ||
593 | +pomijały pomijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
594 | +powymijał powymijać praet:sg:m1.m2.m3:perf pospolita | ||
595 | +powymijała powymijać praet:sg:f:perf pospolita | ||
596 | +powymijało powymijać praet:sg:n1.n2:perf pospolita | ||
597 | +powymijały powymijać praet:pl:m2.m3.f.n1.n2.p2.p3:perf pospolita | ||
598 | +przemijał przemijać praet:sg:m1.m2.m3:imperf pospolita | ||
599 | +przemijała przemijać praet:sg:f:imperf pospolita | ||
600 | +przemijało przemijać praet:sg:n1.n2:imperf pospolita | ||
601 | +przemijały przemijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
602 | +rozmijał rozmijać praet:sg:m1.m2.m3:imperf pospolita | ||
603 | +rozmijała rozmijać praet:sg:f:imperf pospolita | ||
604 | +rozmijało rozmijać praet:sg:n1.n2:imperf pospolita | ||
605 | +rozmijały rozmijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
606 | +wymijał wymijać praet:sg:m1.m2.m3:imperf pospolita | ||
607 | +wymijała wymijać praet:sg:f:imperf pospolita | ||
608 | +wymijało wymijać praet:sg:n1.n2:imperf pospolita | ||
609 | +wymijały wymijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
610 | +zmijał zmijać praet:sg:m1.m2.m3:imperf pospolita | ||
611 | +zmijała zmijać praet:sg:f:imperf pospolita | ||
612 | +zmijało zmijać praet:sg:n1.n2:imperf pospolita | ||
613 | +zmijały zmijać praet:pl:m2.m3.f.n1.n2.p2.p3:imperf pospolita | ||
614 | +em być aglt:sg:pri:imperf:wok pospolita | ||
615 | +eś być aglt:sg:sec:imperf:wok pospolita | ||
616 | +eście być aglt:pl:sec:imperf:wok pospolita | ||
617 | +eśmy być aglt:pl:pri:imperf:wok pospolita | ||
618 | +m być aglt:sg:pri:imperf:nwok pospolita | ||
619 | +ś być aglt:sg:sec:imperf:nwok pospolita | ||
620 | +ście być aglt:pl:sec:imperf:nwok pospolita | ||
621 | +śmy być aglt:pl:pri:imperf:nwok pospolita |
testfiles/polimorf.tagset
@@ -2,7 +2,7 @@ | @@ -2,7 +2,7 @@ | ||
2 | 2 | ||
3 | [TAGS] | 3 | [TAGS] |
4 | 4 | ||
5 | -0 adj:pl:acc:m1.p1:com | 5 | +0 ign |
6 | 1 adj:pl:acc:m1.p1:pos | 6 | 1 adj:pl:acc:m1.p1:pos |
7 | 2 adj:pl:acc:m1.p1:sup | 7 | 2 adj:pl:acc:m1.p1:sup |
8 | 3 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com | 8 | 3 adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com |
@@ -576,6 +576,7 @@ | @@ -576,6 +576,7 @@ | ||
576 | 571 winien:sg:f:imperf | 576 | 571 winien:sg:f:imperf |
577 | 572 winien:sg:m1.m2.m3:imperf | 577 | 572 winien:sg:m1.m2.m3:imperf |
578 | 573 winien:sg:n1.n2:imperf | 578 | 573 winien:sg:n1.n2:imperf |
579 | +574 adj:pl:acc:m1.p1:com | ||
579 | 580 | ||
580 | [NAMES] | 581 | [NAMES] |
581 | 582 |