Commit 3cc7bcb1bbf51effb5019a70f7aabd383ece5679

Authored by Michał Lenart
1 parent a9d3e65c

- praca nad grafem fleksyjnym

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@21 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
morfeusz/CMakeLists.txt
... ... @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa)
7 7 add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
8 8 add_executable (morfeusz2_analyze main.cpp)
9 9 add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp)
10   -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp)
  10 +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp)
11 11  
12 12 # Link the morfeusz2_analyze executable to the morfeusz2 library.
13 13 target_link_libraries (morfeusz2_analyze morfeusz2)
... ...
morfeusz/FlexionGraph.cpp 0 → 100644
  1 +
  2 +#include "FlexionGraph.hpp"
  3 +
  4 +FlexionGraph::FlexionGraph(int startNode)
  5 +: startNode(startNode) {
  6 +
  7 +}
  8 +
  9 +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) {
  10 + for (const InterpretedChunk& chunk: path) {
  11 + if (&chunk == &(path.back())) {
  12 + Edge e = { chunk, -1 };
  13 + vector<Edge> v;
  14 + v.push_back(e);
  15 + this->graph.push_back(v);
  16 +// this->graph[node].push_back(e);
  17 + }
  18 + else if (&chunk == &(path.front())) {
  19 + Edge e = { chunk, (int) this->graph.size() };
  20 + this->graph[0].push_back(e);
  21 + }
  22 + else {
  23 + Edge e = { chunk, (int) this->graph.size() };
  24 + vector<Edge> v;
  25 + v.push_back(e);
  26 + this->graph.push_back(v);
  27 + }
  28 + }
  29 +}
  30 +
  31 +void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) {
  32 + int endNode = graph.size();
  33 + for (int i = 0; i < graph.size(); i++) {
  34 + vector<Edge>& edges = graph[i];
  35 + for (Edge& e: edges) {
  36 + int realStartNode = i + this->startNode;
  37 + int realEndNode = e.nextNode == -1 ? (endNode + this->startNode) : (i + e.nextNode);
  38 + string orth(e.chunk.chunk, e.chunk.chunkLength);
  39 + vector<MorphInterpretation> interps = e.chunk.interpsGroup.getRealInterps(orth, realStartNode, realEndNode, tagset);
  40 + results.insert(results.end(), interps.begin(), interps.end());
  41 + }
  42 + }
  43 +}
... ...
morfeusz/FlexionGraph.hpp 0 → 100644
  1 +/*
  2 + * File: FlexionGraph.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 18 listopad 2013, 15:03
  6 + */
  7 +
  8 +#ifndef FLEXIONGRAPH_HPP
  9 +#define FLEXIONGRAPH_HPP
  10 +
  11 +#include <vector>
  12 +#include "InterpretedChunk.hpp"
  13 +
  // Edge of a FlexionGraph: the chunk it covers plus the node it leads to.
  14 +struct Edge {
  15 + InterpretedChunk chunk;
  // Target node; FlexionGraph uses -1 for an edge that ends a path.
  16 + int nextNode;
  17 +};
  18 +
  // Collects chunk paths as a graph of edges and turns the edges into
  // MorphInterpretation results.
  19 +class FlexionGraph {
  20 +public:
  21 +
  // startNode is added to node indices when results are produced.
  22 + explicit FlexionGraph(int startNode);
  23 +
  // Adds the edges for one sequence of chunks.
  24 + void addPath(const std::vector<InterpretedChunk>& path);
  25 +
  // Appends the interpretations of every edge's chunk to `results`.
  26 + void appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results);
  27 +
  28 +// virtual ~FlexionGraph();
  29 +private:
  // Offset applied to node indices in appendToResults().
  30 + int startNode;
  // graph[i] holds the edges leaving node i.
  31 + std::vector< std::vector<Edge> > graph;
  32 +};
  33 +
  34 +#endif /* FLEXIONGRAPH_HPP */
  35 +
... ...
morfeusz/InterpretedChunk.hpp 0 → 100644
  1 +/*
  2 + * File: InterpretedChunk.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 18 listopad 2013, 15:00
  6 + */
  7 +
  8 +#ifndef INTERPRETEDCHUNK_HPP
  9 +#define INTERPRETEDCHUNK_HPP
  10 +
  11 +#include "InterpsGroup.hpp"
  12 +
  // A piece of the input text paired with the interpretations group found for it.
  13 +struct InterpretedChunk {
  // Start of the piece; read together with chunkLength (no terminator is assumed).
  14 + const char* chunk;
  // Length of the piece; FlexionGraph builds std::string(chunk, chunkLength) from it.
  15 + long chunkLength;
  // NOTE(review): held by reference, so the referenced InterpsGroup has to outlive
  // this struct and every copy of it - confirm at the places that create chunks.
  16 + InterpsGroup& interpsGroup;
  17 +};
  18 +
  19 +#endif /* INTERPRETEDCHUNK_HPP */
  20 +
... ...
morfeusz/InterpsGroup.hpp
1 1 /*
2 2 * File: InterpsGroup.hpp
3   - * Author: lennyn
  3 + * Author: mlenart
4 4 *
5 5 * Created on November 16, 2013, 7:58 PM
6 6 */
7 7  
8   -#ifndef GROUPEDINTERPRETATIONS_HPP
9   -#define GROUPEDINTERPRETATIONS_HPP
  8 +#ifndef INTERPSGROUP_HPP
  9 +#define INTERPSGROUP_HPP
10 10  
11 11 #include <vector>
12 12 #include <string>
... ... @@ -26,7 +26,11 @@ public:
26 26  
27 27 }
28 28  
29   - std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) {
  29 + std::vector<MorphInterpretation> getRealInterps(
  30 + const std::string& orth,
  31 + const int startNode,
  32 + const int endNode,
  33 + const Tagset& tagset) {
30 34 std::vector<MorphInterpretation> res;
31 35 for (EncodedInterpretation& ei: interps) {
32 36 res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset));
... ... @@ -39,8 +43,7 @@ public:
39 43 }
40 44  
41 45 int type;
42   - int startNode;
43   - int endNode;
  46 +
44 47 private:
45 48 std::vector<EncodedInterpretation> interps;
46 49 };
... ...
morfeusz/Morfeusz.cpp
... ... @@ -11,6 +11,9 @@
11 11 #include "Morfeusz.hpp"
12 12 #include "MorphDeserializer.hpp"
13 13 #include "charset/CharsetConverter.hpp"
  14 +#include "charset/charset_utils.hpp"
  15 +
  16 +// TODO - a copy constructor that works the way it should
14 17  
15 18 using namespace std;
16 19  
... ... @@ -30,16 +33,64 @@ Morfeusz::Morfeusz(const string& filename)
30 33  
31 34 }
32 35  
33   -//Morfeusz::Morfeusz(const Morfeusz& orig) {
34   -//}
35   -
36 36 Morfeusz::~Morfeusz() {
37 37 delete &this->fsa;
  38 + delete &this->charsetConverter;
  39 +}
  40 +
  41 +void Morfeusz::processOneWord(
  42 + const char*& inputData,
  43 + const char* inputEnd,
  44 + const int startNodeNum,
  45 + std::vector<MorphInterpretation>& results) const {
  46 + vector<InterpretedChunk> accum;
  47 + FlexionGraph graph(startNodeNum);
  48 + const char* currInput = inputData;
  49 + doProcessOneWord(currInput, inputEnd, accum, graph);
  50 + graph.appendToResults(this->tagset, results);
  51 + inputData = currInput;
  52 +}
  53 +
  54 +void Morfeusz::doProcessOneWord(
  55 + const char*& inputData,
  56 + const char* inputEnd,
  57 + vector<InterpretedChunk>& accum,
  58 + FlexionGraph& graph) const {
  59 + const char* currInput = inputData;
  60 + StateType state = this->fsa->getInitialState();
  61 + int codepoint = this->charsetConverter->next(currInput, inputEnd);
  62 +
  63 + if (!accum.empty() && isEndOfWord(codepoint)) {
  64 + graph.addPath(accum);
  65 + }
  66 + else
  67 + while (!isEndOfWord(codepoint)) {
  68 + this->feedState(state, codepoint);
  69 + codepoint = this->charsetConverter->next(currInput, inputEnd);
  70 + if (state.isAccepting()) {
  71 + for (InterpsGroup& ig : state.getValue()) {
  72 + InterpretedChunk ic = {inputData, currInput - inputData, ig};
  73 + accum.push_back(ic);
  74 + doProcessOneWord(currInput, inputEnd, accum, graph);
  75 + accum.pop_back();
  76 + }
  77 + }
  78 + }
  79 +}
  80 +
  81 +void Morfeusz::feedState(
  82 + StateType& state,
  83 + const int codepoint) const {
  84 + vector<char> chars;
  85 + this->charsetConverter->append(codepoint, chars);
  86 + for (char c: chars) {
  87 + state.proceedToNext(c);
  88 + }
38 89 }
39 90  
// Returns a ResultsIterator constructed from `text` and this Morfeusz instance.
// The commented-out lines below are an unused draft.
40 91 ResultsIterator Morfeusz::analyze(const std::string& text) {
41   -// const char* textStart = text.c_str();
42   -// const char* textEnd = text.c_str() + text.length();
  92 + // const char* textStart = text.c_str();
  93 + // const char* textEnd = text.c_str() + text.length();
43 94 return ResultsIterator(text, *this);
44 95 }
45 96  
... ... @@ -49,13 +100,13 @@ morfeusz(morfeusz) {
49 100 }
50 101  
// Stub: the whole body is the commented-out draft below.
// NOTE(review): the function is declared to return MorphInterpretation but has no
// return statement, so calling it is undefined behaviour until it is implemented.
// NOTE(review): the draft calls processOneWord(rawInput, startNode, back_inserter(...)),
// which does not match processOneWord(inputData, inputEnd, startNodeNum, results).
51 102 MorphInterpretation ResultsIterator::getNext() {
52   -// if (resultsBuffer.empty()) {
53   -// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
54   -// }
55   -// startNode = resultsBuffer.back().getEndNode();
56   -// MorphInterpretation res = resultsBuffer.front();
57   -// resultsBuffer.pop_front();
58   -// return res;
  103 + // if (resultsBuffer.empty()) {
  104 + // morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer));
  105 + // }
  106 + // startNode = resultsBuffer.back().getEndNode();
  107 + // MorphInterpretation res = resultsBuffer.front();
  108 + // resultsBuffer.pop_front();
  109 + // return res;
59 110 }
60 111  
61 112 bool ResultsIterator::hasNext() {
... ...
morfeusz/Morfeusz.hpp
... ... @@ -16,6 +16,8 @@
16 16 #include "MorphInterpretation.hpp"
17 17 #include "InterpsGroup.hpp"
18 18 #include "charset/CharsetConverter.hpp"
  19 +#include "InterpretedChunk.hpp"
  20 +#include "FlexionGraph.hpp"
19 21  
20 22 class Morfeusz;
21 23 //class AnalyzeResult;
... ... @@ -34,17 +36,26 @@ public:
34 36 // Morfeusz();
35 37 friend class ResultsIterator;
36 38 private:
37   - template <class OutputIterator>
38   -// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const;
  39 + void processOneWord(
  40 + const char*& inputData,
  41 + const char* inputEnd,
  42 + const int startNodeNum,
  43 + std::vector<MorphInterpretation>& result) const;
39 44  
40   - int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const;
  45 + void doProcessOneWord(
  46 + const char*& inputData,
  47 + const char* inputEnd,
  48 + std::vector<InterpretedChunk>& accum,
  49 + FlexionGraph& graph) const;
41 50  
42   - const FSAType* fsa;
  51 + void feedState(
  52 + StateType& state,
  53 + const int codepoint) const;
  54 +
  55 + FSAType* fsa;
43 56 CharsetConverter* charsetConverter;
44 57 };
45 58  
46   -#include "Morfeusz_impl.hpp"
47   -
48 59 class ResultsIterator {
49 60 public:
50 61 ResultsIterator(const std::string& text, const Morfeusz& morfeusz);
... ...
morfeusz/Morfeusz_impl.hpp deleted
1   -/*
2   - * File: Morfeusz_impl.hpp
3   - * Author: lennyn
4   - *
5   - * Created on November 15, 2013, 1:43 PM
6   - */
7   -
8   -#ifndef MORFEUSZ_IMPL_HPP
9   -#define MORFEUSZ_IMPL_HPP
10   -
11   -#include <cassert>
12   -#include "Morfeusz.hpp"
13   -
14   -//template <class OutputIterator>
15   -//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const {
16   -// if (inputData == inputEnd) {
17   -// return;
18   -// }
19   -// const char* start = inputData;
20   -// StateType state = fsa->getInitialState();
21   -// int currNodeNum = startNodeNum;
22   -// do {
23   -// int codepoint = this->charsetConverter->next(inputData, inputEnd);
24   -// if (!isSpace(codepoint) && codepoint != 0) {
25   -// feedAutomaton(state, codepoint);
26   -// if (state.isAccepting()) {
27   -// int currInput = inputData;
28   -// vector<MorphInterpretation> additionalInterps;
29   -// processOneWord(
30   -// currInput, inputEnd,
31   -// currNodeNum + 1,
32   -// back_inserter(additionalInterps), false);
33   -// if (!additionalInterps.empty()) {
34   -// currNodeNum = additionalInterps.back().getEndNode();
35   -// }
36   -// }
37   -// }
38   -// }
39   -//}
40   -
41   -#endif /* MORFEUSZ_IMPL_HPP */
42   -
morfeusz/charset/CharsetConverter.cpp
1   -/*
2   - * File: EncodingConverter.cpp
3   - * Author: mlenart
4   - *
5   - * Created on 14 listopad 2013, 17:28
6   - */
7 1  
  2 +#include <vector>
  3 +#include <iterator>
8 4 #include "utf8.h"
9 5 #include "CharsetConverter.hpp"
10 6  
  7 +using namespace std;
  8 +
// Returns the result of utf8::next(it, end); `it` is handed over by reference.
// NOTE(review): presumably utf8::next advances `it` past the decoded sequence and
// reports bad or truncated input by throwing - confirm against the utf8.h in use.
11 9 uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const {
12 10 return utf8::next(it, end);
13 11 }
14   -char* UTF8CharsetConverter::append(uint32_t cp, char* result) const {
15   - return utf8::append(cp, result);
  12 +
  // Appends the chars produced by utf8::append for codepoint `cp` to the end of
  // `result` (through a back_inserter); existing content of `result` is kept.
  13 +void UTF8CharsetConverter::append(uint32_t cp, vector<char>& result) const {
  14 + utf8::append(cp, back_inserter(result));
16 15 }
... ...
morfeusz/charset/CharsetConverter.hpp
... ... @@ -11,35 +11,35 @@
/**
 * Interface for reading codepoints from, and writing codepoints to, text in
 * one particular encoding.
 */
class CharsetConverter {
public:
    // Converters are destroyed through a CharsetConverter pointer, so the
    // destructor has to be virtual for the derived part to be destroyed too.
    virtual ~CharsetConverter() {}

    /**
     * Reads one codepoint from the range [it, end).
     *
     * @param it  position to read from, passed by reference
     * @param end end of the input
     * @return the codepoint read
     */
    virtual uint32_t next(const char*& it, const char* end) const = 0;

    /**
     * Appends the encoded form of codepoint `cp` to `result`.
     */
    virtual void append(uint32_t cp, std::vector<char>& result) const = 0;
};
17 17  
// CharsetConverter for UTF-8: declares next() and append() with the signatures
// of the base class.
18 18 class UTF8CharsetConverter: public CharsetConverter {
19 19 public:
20 20 uint32_t next(const char*& it, const char* end) const;
21   - char* append(uint32_t cp, char* result) const;
  21 + void append(uint32_t cp, std::vector<char>& result) const;
22 22 private:
23 23 };
24 24  
// CharsetConverter for UTF-16 (going by the class name): declares next() and
// append(). NOTE(review): no definitions are visible here - confirm they exist.
25 25 class UTF16CharsetConverter: public CharsetConverter {
26 26 public:
27 27 uint32_t next(const char*& it, const char* end) const;
28   - char* append(uint32_t cp, char* result) const;
  28 + void append(uint32_t cp, std::vector<char>& result) const;
29 29 private:
30 30 };
31 31  
// CharsetConverter for UTF-32 (going by the class name): declares next() and
// append(). NOTE(review): no definitions are visible here - confirm they exist.
32 32 class UTF32CharsetConverter: public CharsetConverter {
33 33 public:
34 34 uint32_t next(const char*& it, const char* end) const;
35   - char* append(uint32_t cp, char* result) const;
  35 + void append(uint32_t cp, std::vector<char>& result) const;
36 36 private:
37 37 };
38 38  
// CharsetConverter for ISO 8859-2 (going by the class name): declares next() and
// append(). NOTE(review): no definitions are visible here - confirm they exist.
39 39 class ISO8859_2_CharsetConverter: public CharsetConverter {
40 40 public:
41 41 uint32_t next(const char*& it, const char* end) const;
42   - char* append(uint32_t cp, char* result) const;
  42 + void append(uint32_t cp, std::vector<char>& result) const;
43 43 private:
44 44 };
45 45  
... ...
morfeusz/charset/charset_utils.hpp
... ... @@ -8,7 +8,12 @@
8 8 #ifndef CHARSET_UTILS_HPP
9 9 #define CHARSET_UTILS_HPP
10 10  
  11 +#include <set>
11 12  
/**
 * Tells whether a codepoint ends the current word.
 *
 * @param codepoint codepoint to classify
 * @return true for NUL (0x00), line feed (0x0A) and space (0x20),
 *         false for every other value
 */
inline bool isEndOfWord(int codepoint) {
    // `inline` because this definition lives in a header: without it every
    // translation unit that includes the header emits its own external
    // definition and the link fails with a duplicate symbol.
    switch (codepoint) {
        case 0x00:
        case 0x0A:
        case 0x20:
            return true;
        default:
            return false;
    }
}
12 17  
13 18 #endif /* CHARSET_UTILS_HPP */
14 19  
... ...
morfeusz/test_morph.cpp
... ... @@ -38,7 +38,7 @@ void doTest(
38 38 // vector<TaggedInterpretation> parsedValues;
39 39 bool found = false;
40 40 for (InterpsGroup gi: value2)
41   - for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) {
  41 + for (MorphInterpretation interp: gi.getRealInterps(orth, 0, 0, tagset)) {
42 42 // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp);
43 43 // (0, 0, orth, encodedInterp, tagset);
44 44 // parsedValues.push_back(parsedValue);
... ...
nbproject/configurations.xml
... ... @@ -10,11 +10,15 @@
10 10 <df root="morfeusz" name="1">
11 11 <df name="charset">
12 12 <in>CharsetConverter.cpp</in>
13   - <in>charset_utils.hpp</in>
14 13 </df>
15   - <in>InterpsGroup.hpp</in>
  14 + <df name="encoding">
  15 + <in>CharsetConverter.cpp</in>
  16 + </df>
  17 + <df name="flexion">
  18 + <in>FlexionGraph.cpp</in>
  19 + </df>
  20 + <in>FlexionGraph.cpp</in>
16 21 <in>Morfeusz.cpp</in>
17   - <in>Morfeusz_impl.hpp</in>
18 22 <in>MorphDeserializer.cpp</in>
19 23 <in>MorphInterpretation.cpp</in>
20 24 <in>Tagset.cpp</in>
... ... @@ -53,7 +57,7 @@
53 57 <executablePath>build/fsa/test_dict</executablePath>
54 58 </makeTool>
55 59 </makefileType>
56   - <folder path="1">
  60 + <folder path="1/charset">
57 61 <ccTool>
58 62 <incDir>
59 63 <pElem>fsa</pElem>
... ... @@ -90,24 +94,44 @@
90 94 </incDir>
91 95 </ccTool>
92 96 </item>
93   - <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0">
  97 + <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8">
  98 + <ccTool>
  99 + <incDir>
  100 + <pElem>fsa</pElem>
  101 + <pElem>build/morfeusz</pElem>
  102 + </incDir>
  103 + </ccTool>
94 104 </item>
95 105 <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8">
96 106 <ccTool>
  107 + <incDir>
  108 + <pElem>fsa</pElem>
  109 + <pElem>build/morfeusz</pElem>
  110 + </incDir>
97 111 </ccTool>
98 112 </item>
99   - <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0">
100   - </item>
101 113 <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8">
102 114 <ccTool>
  115 + <incDir>
  116 + <pElem>fsa</pElem>
  117 + <pElem>build/morfeusz</pElem>
  118 + </incDir>
103 119 </ccTool>
104 120 </item>
105 121 <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8">
106 122 <ccTool>
  123 + <incDir>
  124 + <pElem>fsa</pElem>
  125 + <pElem>build/morfeusz</pElem>
  126 + </incDir>
107 127 </ccTool>
108 128 </item>
109 129 <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8">
110 130 <ccTool>
  131 + <incDir>
  132 + <pElem>fsa</pElem>
  133 + <pElem>build/morfeusz</pElem>
  134 + </incDir>
111 135 </ccTool>
112 136 </item>
113 137 <item path="morfeusz/charset/CharsetConverter.cpp"
... ... @@ -117,18 +141,38 @@
117 141 <ccTool>
118 142 </ccTool>
119 143 </item>
120   - <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0">
  144 + <item path="morfeusz/encoding/CharsetConverter.cpp"
  145 + ex="false"
  146 + tool="1"
  147 + flavor2="4">
  148 + <ccTool>
  149 + </ccTool>
  150 + </item>
  151 + <item path="morfeusz/flexion/FlexionGraph.cpp" ex="false" tool="1" flavor2="4">
  152 + <ccTool>
  153 + </ccTool>
121 154 </item>
122 155 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
123 156 <ccTool>
  157 + <incDir>
  158 + <pElem>fsa</pElem>
  159 + <pElem>build/morfeusz</pElem>
  160 + </incDir>
124 161 </ccTool>
125 162 </item>
126 163 <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4">
127 164 <ccTool>
  165 + <incDir>
  166 + <pElem>morfeusz</pElem>
  167 + </incDir>
128 168 </ccTool>
129 169 </item>
130 170 <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8">
131 171 <ccTool>
  172 + <incDir>
  173 + <pElem>fsa</pElem>
  174 + <pElem>build/morfeusz</pElem>
  175 + </incDir>
132 176 </ccTool>
133 177 </item>
134 178 </conf>
... ...