Commit 3cc7bcb1bbf51effb5019a70f7aabd383ece5679
1 parent
a9d3e65c
- praca nad grafem fleksyjnym
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@21 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
13 changed files
with
258 additions
and
89 deletions
morfeusz/CMakeLists.txt
| @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) | @@ -7,7 +7,7 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) | ||
| 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) | 7 | add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) |
| 8 | add_executable (morfeusz2_analyze main.cpp) | 8 | add_executable (morfeusz2_analyze main.cpp) |
| 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) | 9 | add_executable (test_morph test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp) |
| 10 | -add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp) | 10 | +add_executable (test_morfeusz test_morph.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp MorphInterpretation.cpp Morfeusz.cpp charset/CharsetConverter.cpp FlexionGraph.cpp) |
| 11 | 11 | ||
| 12 | # Link the executable to the morfeusz2 library. | 12 | # Link the executable to the morfeusz2 library. |
| 13 | target_link_libraries (morfeusz2_analyze morfeusz2) | 13 | target_link_libraries (morfeusz2_analyze morfeusz2) |
morfeusz/FlexionGraph.cpp
0 → 100644
| 1 | + | ||
| 2 | +#include "FlexionGraph.hpp" | ||
| 3 | + | ||
| 4 | +FlexionGraph::FlexionGraph(int startNode) | ||
| 5 | +: startNode(startNode) { | ||
| 6 | + | ||
| 7 | +} | ||
| 8 | + | ||
| 9 | +void FlexionGraph::addPath(const std::vector<InterpretedChunk>& path) { | ||
| 10 | + for (const InterpretedChunk& chunk: path) { | ||
| 11 | + if (&chunk == &(path.back())) { | ||
| 12 | + Edge e = { chunk, -1 }; | ||
| 13 | + vector<Edge> v; | ||
| 14 | + v.push_back(e); | ||
| 15 | + this->graph.push_back(v); | ||
| 16 | +// this->graph[node].push_back(e); | ||
| 17 | + } | ||
| 18 | + else if (&chunk == &(path.front())) { | ||
| 19 | + Edge e = { chunk, (int) this->graph.size() }; | ||
| 20 | + this->graph[0].push_back(e); | ||
| 21 | + } | ||
| 22 | + else { | ||
| 23 | + Edge e = { chunk, (int) this->graph.size() }; | ||
| 24 | + vector<Edge> v; | ||
| 25 | + v.push_back(e); | ||
| 26 | + this->graph.push_back(v); | ||
| 27 | + } | ||
| 28 | + } | ||
| 29 | +} | ||
| 30 | + | ||
| 31 | +void FlexionGraph::appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results) { | ||
| 32 | + int endNode = graph.size(); | ||
| 33 | + for (int i = 0; i < graph.size(); i++) { | ||
| 34 | + vector<Edge>& edges = graph[i]; | ||
| 35 | + for (Edge& e: edges) { | ||
| 36 | + int realStartNode = i + this->startNode; | ||
| 37 | + int realEndNode = e.nextNode == -1 ? (endNode + this->startNode) : (i + e.nextNode); | ||
| 38 | + string orth(e.chunk.chunk, e.chunk.chunkLength); | ||
| 39 | + vector<MorphInterpretation> interps = e.chunk.interpsGroup.getRealInterps(orth, realStartNode, realEndNode, tagset); | ||
| 40 | + results.insert(results.end(), interps.begin(), interps.end()); | ||
| 41 | + } | ||
| 42 | + } | ||
| 43 | +} |
morfeusz/FlexionGraph.hpp
0 → 100644
| 1 | +/* | ||
| 2 | + * File: FlexionGraph.hpp | ||
| 3 | + * Author: mlenart | ||
| 4 | + * | ||
| 5 | + * Created on 18 listopad 2013, 15:03 | ||
| 6 | + */ | ||
| 7 | + | ||
| 8 | +#ifndef FLEXIONGRAPH_HPP | ||
| 9 | +#define FLEXIONGRAPH_HPP | ||
| 10 | + | ||
| 11 | +#include <vector> | ||
| 12 | +#include "InterpretedChunk.hpp" | ||
| 13 | + | ||
| 14 | +struct Edge { | ||
| 15 | + InterpretedChunk chunk; | ||
| 16 | + int nextNode; | ||
| 17 | +}; | ||
| 18 | + | ||
| 19 | +class FlexionGraph { | ||
| 20 | +public: | ||
| 21 | + | ||
| 22 | + explicit FlexionGraph(int startNode); | ||
| 23 | + | ||
| 24 | + void addPath(const std::vector<InterpretedChunk>& path); | ||
| 25 | + | ||
| 26 | + void appendToResults(const Tagset& tagset, std::vector<MorphInterpretation>& results); | ||
| 27 | + | ||
| 28 | +// virtual ~FlexionGraph(); | ||
| 29 | +private: | ||
| 30 | + int startNode; | ||
| 31 | + std::vector< std::vector<Edge> > graph; | ||
| 32 | +}; | ||
| 33 | + | ||
| 34 | +#endif /* FLEXIONGRAPH_HPP */ | ||
| 35 | + |
morfeusz/InterpretedChunk.hpp
0 → 100644
| 1 | +/* | ||
| 2 | + * File: InterpretedChunk.hpp | ||
| 3 | + * Author: mlenart | ||
| 4 | + * | ||
| 5 | + * Created on 18 listopad 2013, 15:00 | ||
| 6 | + */ | ||
| 7 | + | ||
| 8 | +#ifndef INTERPRETEDCHUNK_HPP | ||
| 9 | +#define INTERPRETEDCHUNK_HPP | ||
| 10 | + | ||
| 11 | +#include "InterpsGroup.hpp" | ||
| 12 | + | ||
| 13 | +struct InterpretedChunk { | ||
| 14 | + const char* chunk; | ||
| 15 | + long chunkLength; | ||
| 16 | + InterpsGroup& interpsGroup; | ||
| 17 | +}; | ||
| 18 | + | ||
| 19 | +#endif /* INTERPRETEDCHUNK_HPP */ | ||
| 20 | + |
morfeusz/InterpsGroup.hpp
| 1 | /* | 1 | /* |
| 2 | * File: InterpsGroup.hpp | 2 | * File: InterpsGroup.hpp |
| 3 | - * Author: lennyn | 3 | + * Author: mlenart |
| 4 | * | 4 | * |
| 5 | * Created on November 16, 2013, 7:58 PM | 5 | * Created on November 16, 2013, 7:58 PM |
| 6 | */ | 6 | */ |
| 7 | 7 | ||
| 8 | -#ifndef GROUPEDINTERPRETATIONS_HPP | ||
| 9 | -#define GROUPEDINTERPRETATIONS_HPP | 8 | +#ifndef INTERPSGROUP_HPP |
| 9 | +#define INTERPSGROUP_HPP | ||
| 10 | 10 | ||
| 11 | #include <vector> | 11 | #include <vector> |
| 12 | #include <string> | 12 | #include <string> |
| @@ -26,7 +26,11 @@ public: | @@ -26,7 +26,11 @@ public: | ||
| 26 | 26 | ||
| 27 | } | 27 | } |
| 28 | 28 | ||
| 29 | - std::vector<MorphInterpretation> getRealInterps(const std::string& orth, const Tagset& tagset) { | 29 | + std::vector<MorphInterpretation> getRealInterps( |
| 30 | + const std::string& orth, | ||
| 31 | + const int startNode, | ||
| 32 | + const int endNode, | ||
| 33 | + const Tagset& tagset) { | ||
| 30 | std::vector<MorphInterpretation> res; | 34 | std::vector<MorphInterpretation> res; |
| 31 | for (EncodedInterpretation& ei: interps) { | 35 | for (EncodedInterpretation& ei: interps) { |
| 32 | res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset)); | 36 | res.push_back(MorphInterpretation(startNode, endNode, orth, ei, tagset)); |
| @@ -39,8 +43,7 @@ public: | @@ -39,8 +43,7 @@ public: | ||
| 39 | } | 43 | } |
| 40 | 44 | ||
| 41 | int type; | 45 | int type; |
| 42 | - int startNode; | ||
| 43 | - int endNode; | 46 | + |
| 44 | private: | 47 | private: |
| 45 | std::vector<EncodedInterpretation> interps; | 48 | std::vector<EncodedInterpretation> interps; |
| 46 | }; | 49 | }; |
morfeusz/Morfeusz.cpp
| @@ -11,6 +11,9 @@ | @@ -11,6 +11,9 @@ | ||
| 11 | #include "Morfeusz.hpp" | 11 | #include "Morfeusz.hpp" |
| 12 | #include "MorphDeserializer.hpp" | 12 | #include "MorphDeserializer.hpp" |
| 13 | #include "charset/CharsetConverter.hpp" | 13 | #include "charset/CharsetConverter.hpp" |
| 14 | +#include "charset/charset_utils.hpp" | ||
| 15 | + | ||
| 16 | +// TODO - konstruktor kopiujący działający Tak-Jak-Trzeba | ||
| 14 | 17 | ||
| 15 | using namespace std; | 18 | using namespace std; |
| 16 | 19 | ||
| @@ -30,16 +33,64 @@ Morfeusz::Morfeusz(const string& filename) | @@ -30,16 +33,64 @@ Morfeusz::Morfeusz(const string& filename) | ||
| 30 | 33 | ||
| 31 | } | 34 | } |
| 32 | 35 | ||
| 33 | -//Morfeusz::Morfeusz(const Morfeusz& orig) { | ||
| 34 | -//} | ||
| 35 | - | ||
| 36 | Morfeusz::~Morfeusz() { | 36 | Morfeusz::~Morfeusz() { |
| 37 | delete &this->fsa; | 37 | delete &this->fsa; |
| 38 | + delete &this->charsetConverter; | ||
| 39 | +} | ||
| 40 | + | ||
| 41 | +void Morfeusz::processOneWord( | ||
| 42 | + const char*& inputData, | ||
| 43 | + const char* inputEnd, | ||
| 44 | + const int startNodeNum, | ||
| 45 | + std::vector<MorphInterpretation>& results) const { | ||
| 46 | + vector<InterpretedChunk> accum; | ||
| 47 | + FlexionGraph graph(startNodeNum); | ||
| 48 | + const char* currInput = inputData; | ||
| 49 | + doProcessOneWord(currInput, inputEnd, accum, graph); | ||
| 50 | + graph.appendToResults(this->tagset, results); | ||
| 51 | + inputData = currInput; | ||
| 52 | +} | ||
| 53 | + | ||
| 54 | +void Morfeusz::doProcessOneWord( | ||
| 55 | + const char*& inputData, | ||
| 56 | + const char* inputEnd, | ||
| 57 | + vector<InterpretedChunk>& accum, | ||
| 58 | + FlexionGraph& graph) const { | ||
| 59 | + const char* currInput = inputData; | ||
| 60 | + StateType state = this->fsa->getInitialState(); | ||
| 61 | + int codepoint = this->charsetConverter->next(currInput, inputEnd); | ||
| 62 | + | ||
| 63 | + if (!accum.empty() && isEndOfWord(codepoint)) { | ||
| 64 | + graph.addPath(accum); | ||
| 65 | + } | ||
| 66 | + else | ||
| 67 | + while (!isEndOfWord(codepoint)) { | ||
| 68 | + this->feedState(state, codepoint); | ||
| 69 | + codepoint = this->charsetConverter->next(currInput, inputEnd); | ||
| 70 | + if (state.isAccepting()) { | ||
| 71 | + for (InterpsGroup& ig : state.getValue()) { | ||
| 72 | + InterpretedChunk ic = {inputData, currInput - inputData, ig}; | ||
| 73 | + accum.push_back(ic); | ||
| 74 | + doProcessOneWord(currInput, inputEnd, accum, graph); | ||
| 75 | + accum.pop_back(); | ||
| 76 | + } | ||
| 77 | + } | ||
| 78 | + } | ||
| 79 | +} | ||
| 80 | + | ||
| 81 | +void Morfeusz::feedState( | ||
| 82 | + StateType& state, | ||
| 83 | + const int codepoint) const { | ||
| 84 | + vector<char> chars; | ||
| 85 | + this->charsetConverter->append(codepoint, chars); | ||
| 86 | + for (char c: chars) { | ||
| 87 | + state.proceedToNext(c); | ||
| 88 | + } | ||
| 38 | } | 89 | } |
| 39 | 90 | ||
| 40 | ResultsIterator Morfeusz::analyze(const std::string& text) { | 91 | ResultsIterator Morfeusz::analyze(const std::string& text) { |
| 41 | -// const char* textStart = text.c_str(); | ||
| 42 | -// const char* textEnd = text.c_str() + text.length(); | 92 | + // const char* textStart = text.c_str(); |
| 93 | + // const char* textEnd = text.c_str() + text.length(); | ||
| 43 | return ResultsIterator(text, *this); | 94 | return ResultsIterator(text, *this); |
| 44 | } | 95 | } |
| 45 | 96 | ||
| @@ -49,13 +100,13 @@ morfeusz(morfeusz) { | @@ -49,13 +100,13 @@ morfeusz(morfeusz) { | ||
| 49 | } | 100 | } |
| 50 | 101 | ||
| 51 | MorphInterpretation ResultsIterator::getNext() { | 102 | MorphInterpretation ResultsIterator::getNext() { |
| 52 | -// if (resultsBuffer.empty()) { | ||
| 53 | -// morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer)); | ||
| 54 | -// } | ||
| 55 | -// startNode = resultsBuffer.back().getEndNode(); | ||
| 56 | -// MorphInterpretation res = resultsBuffer.front(); | ||
| 57 | -// resultsBuffer.pop_front(); | ||
| 58 | -// return res; | 103 | + // if (resultsBuffer.empty()) { |
| 104 | + // morfeusz.processOneWord(rawInput, startNode, back_inserter(resultsBuffer)); | ||
| 105 | + // } | ||
| 106 | + // startNode = resultsBuffer.back().getEndNode(); | ||
| 107 | + // MorphInterpretation res = resultsBuffer.front(); | ||
| 108 | + // resultsBuffer.pop_front(); | ||
| 109 | + // return res; | ||
| 59 | } | 110 | } |
| 60 | 111 | ||
| 61 | bool ResultsIterator::hasNext() { | 112 | bool ResultsIterator::hasNext() { |
morfeusz/Morfeusz.hpp
| @@ -16,6 +16,8 @@ | @@ -16,6 +16,8 @@ | ||
| 16 | #include "MorphInterpretation.hpp" | 16 | #include "MorphInterpretation.hpp" |
| 17 | #include "InterpsGroup.hpp" | 17 | #include "InterpsGroup.hpp" |
| 18 | #include "charset/CharsetConverter.hpp" | 18 | #include "charset/CharsetConverter.hpp" |
| 19 | +#include "InterpretedChunk.hpp" | ||
| 20 | +#include "FlexionGraph.hpp" | ||
| 19 | 21 | ||
| 20 | class Morfeusz; | 22 | class Morfeusz; |
| 21 | //class AnalyzeResult; | 23 | //class AnalyzeResult; |
| @@ -34,17 +36,26 @@ public: | @@ -34,17 +36,26 @@ public: | ||
| 34 | // Morfeusz(); | 36 | // Morfeusz(); |
| 35 | friend class ResultsIterator; | 37 | friend class ResultsIterator; |
| 36 | private: | 38 | private: |
| 37 | - template <class OutputIterator> | ||
| 38 | -// void processOneWord(const char*& inputData, int startNodeNum, OutputIterator resInterps) const; | 39 | + void processOneWord( |
| 40 | + const char*& inputData, | ||
| 41 | + const char* inputEnd, | ||
| 42 | + const int startNodeNum, | ||
| 43 | + std::vector<MorphInterpretation>& result) const; | ||
| 39 | 44 | ||
| 40 | - int doProcessOneWord(const char*& inputData, int startNodeNum, std::vector<InterpsGroup>& interps) const; | 45 | + void doProcessOneWord( |
| 46 | + const char*& inputData, | ||
| 47 | + const char* inputEnd, | ||
| 48 | + std::vector<InterpretedChunk>& accum, | ||
| 49 | + FlexionGraph& graph) const; | ||
| 41 | 50 | ||
| 42 | - const FSAType* fsa; | 51 | + void feedState( |
| 52 | + StateType& state, | ||
| 53 | + const int codepoint) const; | ||
| 54 | + | ||
| 55 | + FSAType* fsa; | ||
| 43 | CharsetConverter* charsetConverter; | 56 | CharsetConverter* charsetConverter; |
| 44 | }; | 57 | }; |
| 45 | 58 | ||
| 46 | -#include "Morfeusz_impl.hpp" | ||
| 47 | - | ||
| 48 | class ResultsIterator { | 59 | class ResultsIterator { |
| 49 | public: | 60 | public: |
| 50 | ResultsIterator(const std::string& text, const Morfeusz& morfeusz); | 61 | ResultsIterator(const std::string& text, const Morfeusz& morfeusz); |
morfeusz/Morfeusz_impl.hpp deleted
| 1 | -/* | ||
| 2 | - * File: Morfeusz_impl.hpp | ||
| 3 | - * Author: lennyn | ||
| 4 | - * | ||
| 5 | - * Created on November 15, 2013, 1:43 PM | ||
| 6 | - */ | ||
| 7 | - | ||
| 8 | -#ifndef MORFEUSZ_IMPL_HPP | ||
| 9 | -#define MORFEUSZ_IMPL_HPP | ||
| 10 | - | ||
| 11 | -#include <cassert> | ||
| 12 | -#include "Morfeusz.hpp" | ||
| 13 | - | ||
| 14 | -//template <class OutputIterator> | ||
| 15 | -//void Morfeusz::processOneWord(const char*& inputData, const char* inputEnd, int startNodeNum, OutputIterator output, bool insertIgn = true) const { | ||
| 16 | -// if (inputData == inputEnd) { | ||
| 17 | -// return; | ||
| 18 | -// } | ||
| 19 | -// const char* start = inputData; | ||
| 20 | -// StateType state = fsa->getInitialState(); | ||
| 21 | -// int currNodeNum = startNodeNum; | ||
| 22 | -// do { | ||
| 23 | -// int codepoint = this->charsetConverter->next(inputData, inputEnd); | ||
| 24 | -// if (!isSpace(codepoint) && codepoint != 0) { | ||
| 25 | -// feedAutomaton(state, codepoint); | ||
| 26 | -// if (state.isAccepting()) { | ||
| 27 | -// int currInput = inputData; | ||
| 28 | -// vector<MorphInterpretation> additionalInterps; | ||
| 29 | -// processOneWord( | ||
| 30 | -// currInput, inputEnd, | ||
| 31 | -// currNodeNum + 1, | ||
| 32 | -// back_inserter(additionalInterps), false); | ||
| 33 | -// if (!additionalInterps.empty()) { | ||
| 34 | -// currNodeNum = additionalInterps.back().getEndNode(); | ||
| 35 | -// } | ||
| 36 | -// } | ||
| 37 | -// } | ||
| 38 | -// } | ||
| 39 | -//} | ||
| 40 | - | ||
| 41 | -#endif /* MORFEUSZ_IMPL_HPP */ | ||
| 42 | - |
morfeusz/charset/CharsetConverter.cpp
| 1 | -/* | ||
| 2 | - * File: EncodingConverter.cpp | ||
| 3 | - * Author: mlenart | ||
| 4 | - * | ||
| 5 | - * Created on 14 listopad 2013, 17:28 | ||
| 6 | - */ | ||
| 7 | 1 | ||
| 2 | +#include <vector> | ||
| 3 | +#include <iterator> | ||
| 8 | #include "utf8.h" | 4 | #include "utf8.h" |
| 9 | #include "CharsetConverter.hpp" | 5 | #include "CharsetConverter.hpp" |
| 10 | 6 | ||
| 7 | +using namespace std; | ||
| 8 | + | ||
| 11 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { | 9 | uint32_t UTF8CharsetConverter::next(const char*& it, const char* end) const { |
| 12 | return utf8::next(it, end); | 10 | return utf8::next(it, end); |
| 13 | } | 11 | } |
| 14 | -char* UTF8CharsetConverter::append(uint32_t cp, char* result) const { | ||
| 15 | - return utf8::append(cp, result); | 12 | + |
| 13 | +void UTF8CharsetConverter::append(uint32_t cp, vector<char>& result) const { | ||
| 14 | + utf8::append(cp, back_inserter(result)); | ||
| 16 | } | 15 | } |
morfeusz/charset/CharsetConverter.hpp
| @@ -11,35 +11,35 @@ | @@ -11,35 +11,35 @@ | ||
/**
 * Interface for converting between encoded text and code points.
 */
class CharsetConverter {
public:
    /**
     * Virtual so that a converter owned through a CharsetConverter*
     * (as Morfeusz does) is destroyed through the derived class.
     * Deleting a derived object through a base pointer without a virtual
     * destructor is undefined behaviour.
     */
    virtual ~CharsetConverter() {}

    /**
     * Decodes one code point.
     *
     * @param it  in/out read position.
     * @param end end of the input range.
     * @return the decoded code point.
     */
    virtual uint32_t next(const char*& it, const char* end) const = 0;

    /**
     * Encodes cp and appends the resulting chars to result.
     */
    virtual void append(uint32_t cp, std::vector<char>& result) const = 0;
};
| 17 | 17 | ||
| 18 | class UTF8CharsetConverter: public CharsetConverter { | 18 | class UTF8CharsetConverter: public CharsetConverter { |
| 19 | public: | 19 | public: |
| 20 | uint32_t next(const char*& it, const char* end) const; | 20 | uint32_t next(const char*& it, const char* end) const; |
| 21 | - char* append(uint32_t cp, char* result) const; | 21 | + void append(uint32_t cp, std::vector<char>& result) const; |
| 22 | private: | 22 | private: |
| 23 | }; | 23 | }; |
| 24 | 24 | ||
| 25 | class UTF16CharsetConverter: public CharsetConverter { | 25 | class UTF16CharsetConverter: public CharsetConverter { |
| 26 | public: | 26 | public: |
| 27 | uint32_t next(const char*& it, const char* end) const; | 27 | uint32_t next(const char*& it, const char* end) const; |
| 28 | - char* append(uint32_t cp, char* result) const; | 28 | + void append(uint32_t cp, std::vector<char>& result) const; |
| 29 | private: | 29 | private: |
| 30 | }; | 30 | }; |
| 31 | 31 | ||
| 32 | class UTF32CharsetConverter: public CharsetConverter { | 32 | class UTF32CharsetConverter: public CharsetConverter { |
| 33 | public: | 33 | public: |
| 34 | uint32_t next(const char*& it, const char* end) const; | 34 | uint32_t next(const char*& it, const char* end) const; |
| 35 | - char* append(uint32_t cp, char* result) const; | 35 | + void append(uint32_t cp, std::vector<char>& result) const; |
| 36 | private: | 36 | private: |
| 37 | }; | 37 | }; |
| 38 | 38 | ||
| 39 | class ISO8859_2_CharsetConverter: public CharsetConverter { | 39 | class ISO8859_2_CharsetConverter: public CharsetConverter { |
| 40 | public: | 40 | public: |
| 41 | uint32_t next(const char*& it, const char* end) const; | 41 | uint32_t next(const char*& it, const char* end) const; |
| 42 | - char* append(uint32_t cp, char* result) const; | 42 | + void append(uint32_t cp, std::vector<char>& result) const; |
| 43 | private: | 43 | private: |
| 44 | }; | 44 | }; |
| 45 | 45 |
morfeusz/charset/charset_utils.hpp
| @@ -8,7 +8,12 @@ | @@ -8,7 +8,12 @@ | ||
| 8 | #ifndef CHARSET_UTILS_HPP | 8 | #ifndef CHARSET_UTILS_HPP |
| 9 | #define CHARSET_UTILS_HPP | 9 | #define CHARSET_UTILS_HPP |
| 10 | 10 | ||
| 11 | +#include <set> | ||
| 11 | 12 | ||
/**
 * Tells whether a code point terminates a word.
 *
 * The terminators are NUL (0x00), line feed (0x0A) and space (0x20).
 *
 * Declared inline because the definition lives in a header: without it,
 * every translation unit including this file would emit its own external
 * definition and the program would fail to link.
 *
 * @param codepoint code point to test.
 * @return true for one of the three terminators, false otherwise.
 */
inline bool isEndOfWord(int codepoint) {
    switch (codepoint) {
        case 0x00:
        case 0x0A:
        case 0x20:
            return true;
        default:
            return false;
    }
}
| 12 | 17 | ||
| 13 | #endif /* CHARSET_UTILS_HPP */ | 18 | #endif /* CHARSET_UTILS_HPP */ |
| 14 | 19 |
morfeusz/test_morph.cpp
| @@ -38,7 +38,7 @@ void doTest( | @@ -38,7 +38,7 @@ void doTest( | ||
| 38 | // vector<TaggedInterpretation> parsedValues; | 38 | // vector<TaggedInterpretation> parsedValues; |
| 39 | bool found = false; | 39 | bool found = false; |
| 40 | for (InterpsGroup gi: value2) | 40 | for (InterpsGroup gi: value2) |
| 41 | - for (MorphInterpretation interp: gi.getRealInterps(orth, tagset)) { | 41 | + for (MorphInterpretation interp: gi.getRealInterps(orth, 0, 0, tagset)) { |
| 42 | // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); | 42 | // TaggedInterpretation parsedValue = interpsConverter.getInterpretation(key, interp); |
| 43 | // (0, 0, orth, encodedInterp, tagset); | 43 | // (0, 0, orth, encodedInterp, tagset); |
| 44 | // parsedValues.push_back(parsedValue); | 44 | // parsedValues.push_back(parsedValue); |
nbproject/configurations.xml
| @@ -10,11 +10,15 @@ | @@ -10,11 +10,15 @@ | ||
| 10 | <df root="morfeusz" name="1"> | 10 | <df root="morfeusz" name="1"> |
| 11 | <df name="charset"> | 11 | <df name="charset"> |
| 12 | <in>CharsetConverter.cpp</in> | 12 | <in>CharsetConverter.cpp</in> |
| 13 | - <in>charset_utils.hpp</in> | ||
| 14 | </df> | 13 | </df> |
| 15 | - <in>InterpsGroup.hpp</in> | 14 | + <df name="encoding"> |
| 15 | + <in>CharsetConverter.cpp</in> | ||
| 16 | + </df> | ||
| 17 | + <df name="flexion"> | ||
| 18 | + <in>FlexionGraph.cpp</in> | ||
| 19 | + </df> | ||
| 20 | + <in>FlexionGraph.cpp</in> | ||
| 16 | <in>Morfeusz.cpp</in> | 21 | <in>Morfeusz.cpp</in> |
| 17 | - <in>Morfeusz_impl.hpp</in> | ||
| 18 | <in>MorphDeserializer.cpp</in> | 22 | <in>MorphDeserializer.cpp</in> |
| 19 | <in>MorphInterpretation.cpp</in> | 23 | <in>MorphInterpretation.cpp</in> |
| 20 | <in>Tagset.cpp</in> | 24 | <in>Tagset.cpp</in> |
| @@ -53,7 +57,7 @@ | @@ -53,7 +57,7 @@ | ||
| 53 | <executablePath>build/fsa/test_dict</executablePath> | 57 | <executablePath>build/fsa/test_dict</executablePath> |
| 54 | </makeTool> | 58 | </makeTool> |
| 55 | </makefileType> | 59 | </makefileType> |
| 56 | - <folder path="1"> | 60 | + <folder path="1/charset"> |
| 57 | <ccTool> | 61 | <ccTool> |
| 58 | <incDir> | 62 | <incDir> |
| 59 | <pElem>fsa</pElem> | 63 | <pElem>fsa</pElem> |
| @@ -90,24 +94,44 @@ | @@ -90,24 +94,44 @@ | ||
| 90 | </incDir> | 94 | </incDir> |
| 91 | </ccTool> | 95 | </ccTool> |
| 92 | </item> | 96 | </item> |
| 93 | - <item path="morfeusz/InterpsGroup.hpp" ex="false" tool="3" flavor2="0"> | 97 | + <item path="morfeusz/FlexionGraph.cpp" ex="false" tool="1" flavor2="8"> |
| 98 | + <ccTool> | ||
| 99 | + <incDir> | ||
| 100 | + <pElem>fsa</pElem> | ||
| 101 | + <pElem>build/morfeusz</pElem> | ||
| 102 | + </incDir> | ||
| 103 | + </ccTool> | ||
| 94 | </item> | 104 | </item> |
| 95 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> | 105 | <item path="morfeusz/Morfeusz.cpp" ex="false" tool="1" flavor2="8"> |
| 96 | <ccTool> | 106 | <ccTool> |
| 107 | + <incDir> | ||
| 108 | + <pElem>fsa</pElem> | ||
| 109 | + <pElem>build/morfeusz</pElem> | ||
| 110 | + </incDir> | ||
| 97 | </ccTool> | 111 | </ccTool> |
| 98 | </item> | 112 | </item> |
| 99 | - <item path="morfeusz/Morfeusz_impl.hpp" ex="false" tool="3" flavor2="0"> | ||
| 100 | - </item> | ||
| 101 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> | 113 | <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> |
| 102 | <ccTool> | 114 | <ccTool> |
| 115 | + <incDir> | ||
| 116 | + <pElem>fsa</pElem> | ||
| 117 | + <pElem>build/morfeusz</pElem> | ||
| 118 | + </incDir> | ||
| 103 | </ccTool> | 119 | </ccTool> |
| 104 | </item> | 120 | </item> |
| 105 | <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> | 121 | <item path="morfeusz/MorphInterpretation.cpp" ex="false" tool="1" flavor2="8"> |
| 106 | <ccTool> | 122 | <ccTool> |
| 123 | + <incDir> | ||
| 124 | + <pElem>fsa</pElem> | ||
| 125 | + <pElem>build/morfeusz</pElem> | ||
| 126 | + </incDir> | ||
| 107 | </ccTool> | 127 | </ccTool> |
| 108 | </item> | 128 | </item> |
| 109 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> | 129 | <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> |
| 110 | <ccTool> | 130 | <ccTool> |
| 131 | + <incDir> | ||
| 132 | + <pElem>fsa</pElem> | ||
| 133 | + <pElem>build/morfeusz</pElem> | ||
| 134 | + </incDir> | ||
| 111 | </ccTool> | 135 | </ccTool> |
| 112 | </item> | 136 | </item> |
| 113 | <item path="morfeusz/charset/CharsetConverter.cpp" | 137 | <item path="morfeusz/charset/CharsetConverter.cpp" |
| @@ -117,18 +141,38 @@ | @@ -117,18 +141,38 @@ | ||
| 117 | <ccTool> | 141 | <ccTool> |
| 118 | </ccTool> | 142 | </ccTool> |
| 119 | </item> | 143 | </item> |
| 120 | - <item path="morfeusz/charset/charset_utils.hpp" ex="false" tool="3" flavor2="0"> | 144 | + <item path="morfeusz/encoding/CharsetConverter.cpp" |
| 145 | + ex="false" | ||
| 146 | + tool="1" | ||
| 147 | + flavor2="4"> | ||
| 148 | + <ccTool> | ||
| 149 | + </ccTool> | ||
| 150 | + </item> | ||
| 151 | + <item path="morfeusz/flexion/FlexionGraph.cpp" ex="false" tool="1" flavor2="4"> | ||
| 152 | + <ccTool> | ||
| 153 | + </ccTool> | ||
| 121 | </item> | 154 | </item> |
| 122 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> | 155 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
| 123 | <ccTool> | 156 | <ccTool> |
| 157 | + <incDir> | ||
| 158 | + <pElem>fsa</pElem> | ||
| 159 | + <pElem>build/morfeusz</pElem> | ||
| 160 | + </incDir> | ||
| 124 | </ccTool> | 161 | </ccTool> |
| 125 | </item> | 162 | </item> |
| 126 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> | 163 | <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> |
| 127 | <ccTool> | 164 | <ccTool> |
| 165 | + <incDir> | ||
| 166 | + <pElem>morfeusz</pElem> | ||
| 167 | + </incDir> | ||
| 128 | </ccTool> | 168 | </ccTool> |
| 129 | </item> | 169 | </item> |
| 130 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> | 170 | <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> |
| 131 | <ccTool> | 171 | <ccTool> |
| 172 | + <incDir> | ||
| 173 | + <pElem>fsa</pElem> | ||
| 174 | + <pElem>build/morfeusz</pElem> | ||
| 175 | + </incDir> | ||
| 132 | </ccTool> | 176 | </ccTool> |
| 133 | </item> | 177 | </item> |
| 134 | </conf> | 178 | </conf> |