From 4bf99e6b94697f082572eb35c0dcdd60c3711761 Mon Sep 17 00:00:00 2001 From: Michał Lenart <michall@ipipan.waw.pl> Date: Tue, 12 Nov 2013 17:53:25 +0000 Subject: [PATCH] - prawie działa rozpoznawanie informacji morfologicznej --- fsa/CMakeLists.txt | 11 +++++------ fsa/cfsa1_impl.hpp | 9 +++++---- fsa/cfsa2_impl.hpp | 2 +- fsa/const.cpp | 10 ++++++++++ fsa/const.hpp | 22 ++++++++++++++++++++++ fsa/fsa.hpp | 27 ++++++++------------------- fsa/fsa_impl.hpp | 41 +++++------------------------------------ fsa/interpretation.hpp | 29 ----------------------------- fsa/simplefsa_impl.hpp | 4 ++-- fsa/state_impl.hpp | 4 ++-- fsa/test_morph.cpp | 53 ----------------------------------------------------- fsa/utils.hpp | 29 +++++++++++++++-------------- morfeusz/CMakeLists.txt | 3 +++ morfeusz/MorphDeserializer.cpp | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/MorphDeserializer.hpp | 28 ++++++++++++++++++++++++++++ morfeusz/Tagset.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/Tagset.hpp | 25 +++++++++++++++++++++++++ morfeusz/interpretations.cpp | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/interpretations.hpp | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ morfeusz/main.cpp | 1 + morfeusz/test_morph.cpp | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nbproject/configurations.xml | 130 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------- nbproject/project.xml | 4 ++-- 23 files changed, 545 insertions(+), 187 deletions(-) create mode 100644 fsa/const.cpp create mode 100644 fsa/const.hpp delete mode 100644 fsa/interpretation.hpp delete mode 100644 fsa/test_morph.cpp create mode 100644 morfeusz/MorphDeserializer.cpp create mode 100644 morfeusz/MorphDeserializer.hpp create mode 100644 morfeusz/Tagset.cpp create mode 100644 morfeusz/Tagset.hpp create mode 100644 morfeusz/interpretations.cpp create mode 100644 morfeusz/interpretations.hpp create mode 100644 morfeusz/test_morph.cpp diff --git a/fsa/CMakeLists.txt b/fsa/CMakeLists.txt index a013849..ace2488 100644 --- a/fsa/CMakeLists.txt +++ b/fsa/CMakeLists.txt @@ -1,11 +1,10 @@ -add_executable (test_speed test_speed.cpp) -add_executable (test_speed_profile test_speed.cpp) -add_executable (test_recognize test_recognize.cpp) -add_executable (test_not_recognize test_not_recognize.cpp) -add_executable (test_morph test_morph.cpp) +add_executable (test_speed test_speed.cpp const.cpp) +add_executable (test_speed_profile test_speed.cpp const.cpp) +add_executable (test_recognize test_recognize.cpp const.cpp) +add_executable (test_not_recognize test_not_recognize.cpp const.cpp) + set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -pedantic -Wcast-align -Wextra -Wmissing-noreturn -Wconversion -Wcast-qual -Wcast-align" ) set_target_properties ( test_speed_profile PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2 -g" ) set_target_properties ( test_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) set_target_properties ( test_not_recognize PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) -set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) diff --git a/fsa/cfsa1_impl.hpp b/fsa/cfsa1_impl.hpp index e56f56f..3d41ae1 100644 --- a/fsa/cfsa1_impl.hpp +++ b/fsa/cfsa1_impl.hpp @@ -79,7 +79,7 @@ void CompressedFSA1<T>::doProceedToNextByList( TransitionData2 td; for (unsigned int i = 0; i < transitionsNum; i++) { // const_cast<Counter*>(&counter)->increment(1); - td = *((TransitionData2*) currPtr); + td = *(reinterpret_cast<const TransitionData2*>(currPtr)); if (td.shortLabel == shortLabel) { if (shortLabel == 0) { currPtr++; @@ -107,7 +107,8 @@ void CompressedFSA1<T>::doProceedToNextByList( if (!found) { // cerr << "SINK for " << c << endl; state.setNextAsSink(); - } else { + } + else { currPtr++; // cerr << "offset size " << td.offsetSize << endl; // cerr << "offset " << offset << endl; @@ -152,12 +153,12 @@ void CompressedFSA1<T>::proceedToNext(const char c, State<T>& state) const { // cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); unsigned char shortLabel = this->label2ShortLabel[(const unsigned char) c]; - unsigned int transitionsTableOffset = 1; + unsigned long transitionsTableOffset = 1; if (state.isAccepting()) { transitionsTableOffset += state.getValueSize(); // cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; } - StateData2* sd = (StateData2*) (fromPointer); + const StateData2* sd = reinterpret_cast<const StateData2*>(fromPointer); // cerr << "transitions num=" << sd->transitionsNum << endl; if (sd->array) { if (shortLabel > 0) { diff --git a/fsa/cfsa2_impl.hpp b/fsa/cfsa2_impl.hpp index 45767ac..fe59618 100644 --- a/fsa/cfsa2_impl.hpp +++ b/fsa/cfsa2_impl.hpp @@ -135,7 +135,7 @@ void CompressedFSA2<T>::proceedToNext(const char c, State<T>& state) const { cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; #endif const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); - unsigned int transitionsTableOffset = 0; + unsigned long transitionsTableOffset = 0; if (state.isAccepting()) { transitionsTableOffset += state.getValueSize(); } diff --git a/fsa/const.cpp b/fsa/const.cpp new file mode 100644 index 0000000..0bc9c6d --- /dev/null +++ b/fsa/const.cpp @@ -0,0 +1,10 @@ + +#include "const.hpp" + +extern const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; +extern const uint8_t VERSION_NUM = 9; + +extern const unsigned int VERSION_NUM_OFFSET = 4; +extern const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; +extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET = 6; +extern const unsigned int ADDITIONAL_DATA_OFFSET = 10; diff --git a/fsa/const.hpp b/fsa/const.hpp new file mode 100644 index 0000000..c37e921 --- /dev/null +++ b/fsa/const.hpp @@ -0,0 +1,22 @@ +/* + * File: const.hpp + * Author: mlenart + * + * Created on 12 listopad 2013, 14:11 + */ + +#ifndef CONST_HPP +#define CONST_HPP + +#include <netinet/in.h> + +extern const uint32_t MAGIC_NUMBER; +extern const uint8_t VERSION_NUM; + +extern const unsigned int VERSION_NUM_OFFSET; +extern const unsigned int IMPLEMENTATION_NUM_OFFSET; +extern const unsigned int ADDITIONAL_DATA_SIZE_OFFSET; +extern const unsigned int ADDITIONAL_DATA_OFFSET; + +#endif /* CONST_HPP */ + diff --git a/fsa/fsa.hpp b/fsa/fsa.hpp index 4296d1e..052f46a 100644 --- a/fsa/fsa.hpp +++ b/fsa/fsa.hpp @@ -9,13 +9,12 @@ #define FSA_HPP //#include <iostream> -//#include <cstring> -#include <typeinfo> +#include <cstring> #include <cassert> +#include <typeinfo> #include <exception> #include <string> #include <vector> -#include "interpretation.hpp" template <class T> class State; template <class T> class FSA; @@ -44,16 +43,12 @@ public: * Returns number of bytes read or -1 on error. */ long deserialize(const unsigned char* ptr, char*& text) const { - // text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); - // return strlen(text) + 1; - return 1; + text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); + return strlen(text) + 1; +// return 1; } }; -class MorphDeserializer: public Deserializer<std::vector<Interpretation>> { - long deserialize(const unsigned char* ptr, std::vector<Interpretation>& interp) const; -}; - class Counter { public: @@ -88,8 +83,6 @@ public: */ static FSA<T>* getFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); - static const uint32_t MAGIC_NUMBER = 0x8fc2bc1b; - static const uint8_t VERSION_NUM = 8; protected: /** @@ -105,10 +98,6 @@ protected: const Deserializer<T>& deserializer; friend class State<T>; private: - static int getMagicNumberOffset(); - static int getVersionNumOffset(); - static int getPopularCharsOffset(); - static int getInitialStateOffset(); // FSA(); }; @@ -220,12 +209,12 @@ public: * Makes sense only for accepting states. * For non-accepting states is throws an exception. */ - unsigned int getValueSize() const; + unsigned long getValueSize() const; unsigned long getOffset() const; void setNext(const unsigned long offset); - void setNext(const unsigned long offset, const T& value, const unsigned int valueSize); + void setNext(const unsigned long offset, const T& value, const unsigned long valueSize); void setNextAsSink(); explicit State(const FSA<T>& fsa); @@ -237,7 +226,7 @@ private: bool accepting; bool sink; T value; - int valueSize; + long valueSize; }; class FSAException : public std::exception { diff --git a/fsa/fsa_impl.hpp b/fsa/fsa_impl.hpp index 05f119e..6bc66ca 100644 --- a/fsa/fsa_impl.hpp +++ b/fsa/fsa_impl.hpp @@ -14,14 +14,11 @@ #include <iostream> #include <vector> #include <netinet/in.h> -#include "fsa.hpp" #include "utils.hpp" +#include "const.hpp" using namespace std; - -static const unsigned int VERSION_NUM_OFFSET = 4; -static const unsigned int IMPLEMENTATION_NUM_OFFSET = 5; -static const unsigned int FSA_OFFSET = 6; +//static const unsigned int FSA_OFFSET = 6; template <class T> bool FSA<T>::tryToRecognize(const char* input, T& value) const { @@ -73,7 +70,9 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial } uint8_t implementationNum = *(ptr + IMPLEMENTATION_NUM_OFFSET); - const unsigned char* startPtr = ptr + FSA_OFFSET; + + uint32_t additionalDataSize = ntohl(*(reinterpret_cast<const uint32_t*>(ptr + ADDITIONAL_DATA_SIZE_OFFSET))); + const unsigned char* startPtr = ptr + ADDITIONAL_DATA_OFFSET + additionalDataSize; switch (implementationNum) { case 0: return new SimpleFSA<T>(startPtr, deserializer); @@ -86,34 +85,4 @@ FSA<T>* FSA<T>::getFSA(const unsigned char* ptr, const Deserializer<T>& deserial } } -static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) { - // XXX uważać na poprawność danych - lemma.suffixToCut = *ptr; - ptr++; - lemma.suffixToAdd = (const char*) ptr; - ptr += strlen((const char*) ptr) + 1; -} - -static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) { - deserializeLemma(ptr, interp.lemma); - interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); - ptr += 2; - interp.nameClassifier = *ptr; - ptr++; -} - -long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const { - const unsigned char* currPtr = ptr; - uint8_t interpsNum = *ptr; - interps.clear(); - interps.reserve(interpsNum); - currPtr++; - for (unsigned int i = 0; i < interpsNum; i++) { - Interpretation interp; - deserializeInterp(currPtr, interp); - interps.push_back(interp); - } - return currPtr - ptr; -} - #endif /* _SIMPLE_FSA_IMPL_HPP */ diff --git a/fsa/interpretation.hpp b/fsa/interpretation.hpp deleted file mode 100644 index 6c83055..0000000 --- a/fsa/interpretation.hpp +++ /dev/null @@ -1,29 +0,0 @@ -/* - * File: interpretation.hpp - * Author: mlenart - * - * Created on November 4, 2013, 3:11 PM - */ - -#ifndef INTERPRETATION_HPP -#define INTERPRETATION_HPP - -#include <string> -#include <list> - -using namespace std; - -struct Lemma { - unsigned short suffixToCut; - const char* suffixToAdd; -}; - -struct Interpretation { - Lemma lemma; - unsigned int tag; // np. subst:sg:nom:m1 - unsigned short nameClassifier; // np. "pospolita" - unsigned short qualifier; // np. "dawne" lub "potoczne" -}; - -#endif /* INTERPRETATION_HPP */ - diff --git a/fsa/simplefsa_impl.hpp b/fsa/simplefsa_impl.hpp index fecef99..9445404 100644 --- a/fsa/simplefsa_impl.hpp +++ b/fsa/simplefsa_impl.hpp @@ -37,7 +37,7 @@ static unsigned int decodeOffset(const unsigned char* ptr) { template <class T> void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { const unsigned char* fromPointer = this->initialStatePtr + state.getOffset(); - int transitionsTableOffset = sizeof (StateData); + long transitionsTableOffset = sizeof (StateData); if (state.isAccepting()) { transitionsTableOffset += state.getValueSize(); } @@ -60,7 +60,7 @@ void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { const StateData* nextStateData = reinterpret_cast<const StateData*>(nextStatePointer); if (nextStateData->accepting) { T object; - int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); + long size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); state.setNext(offset, object, size); } else { state.setNext(offset); diff --git a/fsa/state_impl.hpp b/fsa/state_impl.hpp index 0fd40a9..200722e 100644 --- a/fsa/state_impl.hpp +++ b/fsa/state_impl.hpp @@ -51,7 +51,7 @@ T State<T>::getValue() const { } template <class T> -unsigned int State<T>::getValueSize() const { +unsigned long State<T>::getValueSize() const { assert(this->isAccepting()); return this->valueSize; } @@ -69,7 +69,7 @@ void State<T>::setNext(const unsigned long offset) { } template <class T> -void State<T>::setNext(const unsigned long offset, const T& value, const unsigned int valueSize) { +void State<T>::setNext(const unsigned long offset, const T& value, const unsigned long valueSize) { // assert(!this->isSink()); this->offset = offset; this->accepting = true; diff --git a/fsa/test_morph.cpp b/fsa/test_morph.cpp deleted file mode 100644 index 100d558..0000000 --- a/fsa/test_morph.cpp +++ /dev/null @@ -1,53 +0,0 @@ -/* - * File: test_morph.cpp - * Author: mlenart - * - * Created on November 8, 2013, 4:12 PM - */ - -#include <cstdlib> -#include <sstream> -#include <iostream> -#include "fsa.hpp" -#include "utils.hpp" - -using namespace std; - -void debug(const string& key, const vector<Interpretation> value) { - cerr << key << endl; - for (Interpretation i: value) { - cerr << "suffix to cut: " << i.lemma.suffixToCut << endl; - cerr << "suffix to add: " << i.lemma.suffixToAdd << endl; - cerr << "tag: " << i.tag << endl; - cerr << "name: " << i.nameClassifier << endl; - } - cerr << "==================" << endl; -} - -void doTest(const FSA<vector<Interpretation>>& fsa, const char* fname) { - ifstream ifs; - // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); - ifs.open(fname, ios::binary); - string line; - while (getline(ifs, line)) { - vector<string> splitVector(split(line, '\t')); - string key = splitVector[0]; - vector<Interpretation> value2; - fsa.tryToRecognize(key.c_str(), value2); - debug(key, value2); -// validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); - } - validate(ifs.eof(), "Failed to read the input file to the end"); -} - -int main(int argc, char** argv) { - validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); - const unsigned char* fsaData = readFile(argv[1]); - MorphDeserializer deserializer; - FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer); - doTest(*fsa, argv[2]); - // cout << argc << endl; - delete fsa; - return 0; -} - diff --git a/fsa/utils.hpp b/fsa/utils.hpp index 5475a0c..35e69c5 100644 --- a/fsa/utils.hpp +++ b/fsa/utils.hpp @@ -9,14 +9,15 @@ #define UTILS_HPP #include <iostream> +#include <fstream> #include <sstream> #include <string> #include <fstream> #include <vector> -using namespace std; +//using namespace std; -//#define DEBUG_BUILD +#define DEBUG_BUILD #ifdef DEBUG_BUILD # define DEBUG(x) do { std::cerr << x << std::endl; } while (0) @@ -24,14 +25,14 @@ using namespace std; # define DEBUG(x) #endif -void validate(const bool cond, const std::string& msg) { +inline void validate(const bool cond, const std::string& msg) { if (!cond) { std::cerr << msg << std::endl; exit(1); } } -std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { +inline std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { std::stringstream ss(s); std::string item; while (std::getline(ss, item, delim)) { @@ -41,25 +42,25 @@ std::vector<std::string> &split(const std::string &s, char delim, std::vector<st } -std::vector<std::string> split(const std::string &s, char delim) { +inline std::vector<std::string> split(const std::string &s, char delim) { std::vector<std::string> elems; split(s, delim, elems); return elems; } -string &rtrim(string &s) { - s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end()); - return s; -} +//string &rtrim(string &s) { +// s.erase(find_if(s.rbegin(), s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end()); +// return s; +//} -unsigned char* readFile(const char* fname) { - ifstream ifs; +inline unsigned char* readFile(const char* fname) { + std::ifstream ifs; ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); - ifs.open(fname, ios::in | ios::binary | ios::ate); + ifs.open(fname, std::ios::in | std::ios::binary | std::ios::ate); // if (ifs.is_open()) { - int size = ifs.tellg(); + long size = ifs.tellg(); unsigned char* memblock = new unsigned char [size]; - ifs.seekg(0, ios::beg); + ifs.seekg(0, std::ios::beg); ifs.read(reinterpret_cast<char*> (memblock), size); ifs.close(); return memblock; diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt index 641748b..ea4b19f 100644 --- a/morfeusz/CMakeLists.txt +++ b/morfeusz/CMakeLists.txt @@ -6,7 +6,10 @@ include_directories (${Morfeusz_SOURCE_DIR}/fsa) add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) add_executable (morfeusz2_analyze main.cpp) +add_executable (test_morph test_morph.cpp interpretations.cpp MorphDeserializer.cpp Tagset.cpp ../fsa/const.cpp) # Link the executable to the Hello library. target_link_libraries (morfeusz2_analyze morfeusz2) set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) + +set_target_properties ( test_morph PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" ) diff --git a/morfeusz/MorphDeserializer.cpp b/morfeusz/MorphDeserializer.cpp new file mode 100644 index 0000000..6d3605b --- /dev/null +++ b/morfeusz/MorphDeserializer.cpp @@ -0,0 +1,48 @@ +/* + * File: MorphDeserializer.cpp + * Author: mlenart + * + * Created on 12 listopad 2013, 15:31 + */ + +#include "MorphDeserializer.hpp" + +MorphDeserializer::MorphDeserializer() { +} + +MorphDeserializer::MorphDeserializer(const MorphDeserializer& orig) { +} + +MorphDeserializer::~MorphDeserializer() { +} + +static void deserializeLemma(const unsigned char*& ptr, Lemma& lemma) { + // XXX uważać na poprawność danych + lemma.suffixToCut = *ptr; + ptr++; + lemma.suffixToAdd = (const char*) ptr; + ptr += strlen((const char*) ptr) + 1; +} + +static void deserializeInterp(const unsigned char*& ptr, Interpretation& interp) { + deserializeLemma(ptr, interp.lemma); + interp.tag = ntohs(*(reinterpret_cast<const uint16_t*>(ptr))); + ptr += 2; + interp.nameClassifier = *ptr; + ptr++; +} + +long MorphDeserializer::deserialize(const unsigned char* ptr, vector<Interpretation>& interps) const { + const unsigned char* currPtr = ptr; + uint8_t interpsNum = *ptr; + interps.clear(); + interps.reserve(interpsNum); + currPtr++; + for (unsigned int i = 0; i < interpsNum; i++) { + Interpretation interp; + deserializeInterp(currPtr, interp); + interps.push_back(interp); + } + return currPtr - ptr; +} + diff --git a/morfeusz/MorphDeserializer.hpp b/morfeusz/MorphDeserializer.hpp new file mode 100644 index 0000000..ad4b73d --- /dev/null +++ b/morfeusz/MorphDeserializer.hpp @@ -0,0 +1,28 @@ +/* + * File: MorphDeserializer.hpp + * Author: mlenart + * + * Created on 12 listopad 2013, 15:31 + */ + +#ifndef MORPHDESERIALIZER_HPP +#define MORPHDESERIALIZER_HPP + +#include <vector> +#include "fsa.hpp" +#include "interpretations.hpp" + +class MorphDeserializer: public Deserializer<std::vector<Interpretation>> { +public: + MorphDeserializer(); + MorphDeserializer(const MorphDeserializer& orig); + virtual ~MorphDeserializer(); + long deserialize( + const unsigned char* ptr, + std::vector<Interpretation>& interps) const; +private: + +}; + +#endif /* MORPHDESERIALIZER_HPP */ + diff --git a/morfeusz/Tagset.cpp b/morfeusz/Tagset.cpp new file mode 100644 index 0000000..b5ed12d --- /dev/null +++ b/morfeusz/Tagset.cpp @@ -0,0 +1,56 @@ + +#include <string> +#include <netinet/in.h> +#include "Tagset.hpp" +#include "const.hpp" +#include "utils.hpp" + +using namespace std; + +static uint16_t readInt16(const unsigned char*& currPtr) { + DEBUG("readInt16"); + uint16_t res = htons(*reinterpret_cast<const uint16_t*>(currPtr)); + DEBUG("still alive " + to_string(res)); + currPtr += 2; + DEBUG("still alive after ptr add"); + return res; +} + +static string readString(const unsigned char*& currPtr) { + DEBUG("readString"); + string res(reinterpret_cast<const char*>(currPtr)); + currPtr += res.length(); + currPtr++; + return res; +} + +static void readTags(const unsigned char*& currPtr, vector<string>& tags) { + tags.clear(); + tags.resize(65536); + uint16_t tagsNum = readInt16(currPtr); + DEBUG("hi there"); + DEBUG("tagsNum="+to_string((int) tagsNum)); + for (unsigned int i = 0; i < tagsNum; i++) { + unsigned int tagNum = readInt16(currPtr); + tags[tagNum] = readString(currPtr); + } +} + +Tagset::Tagset(const unsigned char* fsaData) { + const unsigned char* currPtr = fsaData + ADDITIONAL_DATA_OFFSET; +// uint32_t tagsNum = ntohl(*reinterpret_cast<const uint32_t*>(currPtr)); +// uint32_t namesNum = ntohl(*reinterpret_cast<const uint32_t*>(fsaData + ADDITIONAL_DATA_OFFSET + 4)); +// const unsigned char* currPtr = fsaData + 8; + DEBUG("will read tags"); + readTags(currPtr, this->tags); + DEBUG("will read names"); + readTags(currPtr, this->names); +} + +const string& Tagset::getTag(const int tagNum) const { + return this->tags.at(tagNum); +} + +const string& Tagset::getName(const int nameNum) const { + return this->names.at(nameNum); +} diff --git a/morfeusz/Tagset.hpp b/morfeusz/Tagset.hpp new file mode 100644 index 0000000..83529f6 --- /dev/null +++ b/morfeusz/Tagset.hpp @@ -0,0 +1,25 @@ +/* + * File: tagset.hpp + * Author: mlenart + * + * Created on 12 listopad 2013, 14:09 + */ + +#ifndef TAGSET_HPP +#define TAGSET_HPP + +#include <string> +#include <vector> + +class Tagset { +public: + explicit Tagset(const unsigned char* fsaData); + const std::string& getTag(const int tagNum) const; + const std::string& getName(const int nameNum) const; +private: + std::vector<std::string> tags; + std::vector<std::string> names; +}; + +#endif /* TAGSET_HPP */ + diff --git a/morfeusz/interpretations.cpp b/morfeusz/interpretations.cpp new file mode 100644 index 0000000..fa29d1c --- /dev/null +++ b/morfeusz/interpretations.cpp @@ -0,0 +1,56 @@ + +#include "interpretations.hpp" +#include "Tagset.hpp" + +using namespace std; + +Interpretation::Interpretation() +: lemma(), tag(), nameClassifier() { + +} + +Interpretation::Interpretation(const Lemma& lemma, const int tag, const int name) +: lemma(lemma), tag(tag), nameClassifier(name) { + +} + +StringInterpretation::StringInterpretation( + const string& lemma, + const string& tag, + const string& name) +: lemma(lemma), tag(tag), name(name) { + +} + +string StringInterpretation::toString() const { + std::stringstream ss; + ss << lemma << ":" << tag << ":" << name; + return ss.str(); +} + +string LemmaConverter::convertLemma( + const string& orth, + const Lemma& lemma) const { + string res(orth); + res.erase( + res.end() - lemma.suffixToCut, + res.end()); + res.append(lemma.suffixToAdd); + return res; +} + +InterpretationsConverter::InterpretationsConverter(const unsigned char* data) +: tagset(Tagset(data)) { + +} + +StringInterpretation InterpretationsConverter::convertInterpretation( + const string& orth, + const Interpretation& interp) const { + string lemma = this->lemmaConverter.convertLemma(orth, interp.lemma); + const string& tag = this->tagset.getTag(interp.tag); + const string& name = this->tagset.getName(interp.nameClassifier); + return StringInterpretation(lemma, tag, name); +} + + diff --git a/morfeusz/interpretations.hpp b/morfeusz/interpretations.hpp new file mode 100644 index 0000000..4b3ac49 --- /dev/null +++ b/morfeusz/interpretations.hpp @@ -0,0 +1,58 @@ +/* + * File: interpretation.hpp + * Author: mlenart + * + * Created on November 4, 2013, 3:11 PM + */ + +#ifndef INTERPRETATION_HPP +#define INTERPRETATION_HPP + +#include <string> +#include <sstream> +#include "Tagset.hpp" + +using namespace std; + +struct Lemma { + int suffixToCut; + string suffixToAdd; +}; + +struct Interpretation { + Interpretation(); + Interpretation(const Lemma& lemma, const int tag, const int name); + Lemma lemma; + int tag; // np. subst:sg:nom:m1 + int nameClassifier; // np. "pospolita" +// int qualifier; // np. "dawne" lub "potoczne" +}; + +struct StringInterpretation { + StringInterpretation(const std::string& lemma, const std::string& tag, const std::string& name); + const std::string lemma; + const std::string& tag; // np. subst:sg:nom:m1 + const std::string& name; // np. "pospolita" +// std::string qualifier; // np. "dawne" lub "potoczne" + std::string toString() const; +}; + +class LemmaConverter { +public: + std::string convertLemma(const std::string& orth, const Lemma& interp) const; +}; + + +class InterpretationsConverter { +public: + explicit InterpretationsConverter(const unsigned char* data); + StringInterpretation convertInterpretation( + const std::string& orth, + const Interpretation& interp) const; +private: + LemmaConverter lemmaConverter; + Tagset tagset; +}; + +#endif /* INTERPRETATION_HPP */ + diff --git a/morfeusz/main.cpp b/morfeusz/main.cpp index 9713b42..d0b4b21 100644 --- a/morfeusz/main.cpp +++ b/morfeusz/main.cpp @@ -9,6 +9,7 @@ #include <iostream> #include "fsa.hpp" #include "default_fsa.hpp" +#include "Tagset.hpp" using namespace std; diff --git a/morfeusz/test_morph.cpp b/morfeusz/test_morph.cpp new file mode 100644 index 0000000..278afed --- /dev/null +++ b/morfeusz/test_morph.cpp @@ -0,0 +1,82 @@ +/* + * File: test_morph.cpp + * Author: mlenart + * + * Created on November 8, 2013, 4:12 PM + */ + +//#include <cstdlib> +#include <sstream> +#include <iostream> +#include "fsa.hpp" +#include "interpretations.hpp" +#include "utils.hpp" +#include "MorphDeserializer.hpp" + +using namespace std; + +void debug(const string& key, const vector<Interpretation> value) { + cerr << key << endl; + for (Interpretation i: value) { + cerr << "suffix to cut: " << i.lemma.suffixToCut << endl; + cerr << "suffix to add: " << i.lemma.suffixToAdd << endl; + cerr << "tag: " << i.tag << endl; + cerr << "name: " << i.nameClassifier << endl; + } + cerr << "==================" << endl; +} + +void debug(const string& key, const StringInterpretation& value) { + cerr << key << '\t' << value.toString() << endl; +} + +void doTest( + const FSA<vector<Interpretation>>& fsa, + const InterpretationsConverter& interpsConverter, + const char* fname) { + ifstream ifs; + // ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + ifs.open(fname, ios::binary); + string line; + while (getline(ifs, line)) { + vector<string> splitVector(split(line, '\t')); + string key = splitVector[0]; + string lemma = splitVector[1]; + string tag = splitVector[2]; + string name = splitVector[3]; + vector<Interpretation> value2; + fsa.tryToRecognize(key.c_str(), value2); + vector<StringInterpretation> parsedValues; + bool found = false; + for (Interpretation interp: value2) { + StringInterpretation parsedValue = interpsConverter.convertInterpretation(key, interp); +// parsedValues.push_back(parsedValue); + debug(key, parsedValue); + if (lemma == parsedValue.lemma && tag == parsedValue.tag && name == parsedValue.name) { + found = true; + } + } + validate(found, "Failed to recognize " + key + " " + lemma + ":" + tag + ":" + name); +// debug(key, value2); +// validate(fsa.tryToRecognize(key.c_str(), value2), "Failed to recognize " + key); + } + validate(ifs.eof(), "Failed to read the input file to the end"); +} + +int main(int argc, char** argv) { + DEBUG("start test"); + validate(argc == 3, "Must provide exactly two arguments - FSA filename, and dictionary filename."); + const unsigned char* fsaData = readFile(argv[1]); + MorphDeserializer deserializer; + DEBUG("will read FSA"); + FSA<vector<Interpretation>>* fsa = FSA<vector<Interpretation>>::getFSA(fsaData, deserializer); + DEBUG("DONE read FSA"); + DEBUG("will read tagset"); + InterpretationsConverter converter(fsaData); + DEBUG("DONE read tagset"); + DEBUG("still alive"); + doTest(*fsa, converter, argv[2]); + // cout << argc << endl; + delete fsa; + return 0; +} diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml index 47c78ca..58caadc 100644 --- a/nbproject/configurations.xml +++ b/nbproject/configurations.xml @@ -1,17 +1,34 @@ <?xml version="1.0" encoding="UTF-8"?> <configurationDescriptor version="90"> <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> + <logicalFolder name="2.8.11.2" + displayName="2.8.11.2" + projectFiles="true" + root="build/CMakeFiles/2.8.11.2"> + <logicalFolder name="CompilerIdC" displayName="CompilerIdC" projectFiles="true"> + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c</itemPath> + </logicalFolder> + <logicalFolder name="CompilerIdCXX" + displayName="CompilerIdCXX" + projectFiles="true"> + <itemPath>build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp</itemPath> + </logicalFolder> + </logicalFolder> <df root="fsa" name="0"> - <in>cfsa1_impl.hpp</in> - <in>cfsa2_impl.hpp</in> - <in>interpretation.hpp</in> - <in>simplefsa_impl.hpp</in> - <in>test_morph.cpp</in> <in>test_not_recognize.cpp</in> <in>test_recognize.cpp</in> <in>test_speed.cpp</in> </df> + <logicalFolder name="Modules" + displayName="Modules" + projectFiles="true" + root="/usr/share/cmake-2.8/Modules"> + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c</itemPath> + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp</itemPath> + <itemPath>/usr/share/cmake-2.8/Modules/CMakeCompilerABI.h</itemPath> + </logicalFolder> <df root="morfeusz" name="1"> + <in>interpretations.cpp</in> <in>main.cpp</in> <in>morfeusz.cpp</in> </df> @@ -22,7 +39,6 @@ <itemPath>CMakeLists.txt</itemPath> <itemPath>build/Makefile</itemPath> </logicalFolder> - <itemPath>cfsa1_impl.hpp</itemPath> </logicalFolder> <sourceFolderFilter>^(nbproject)$</sourceFolderFilter> <sourceRootList> @@ -44,10 +60,41 @@ <buildCommandWorkingDir>build</buildCommandWorkingDir> <buildCommand>${MAKE} -f Makefile</buildCommand> <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> - <executablePath>build/fsa/test_dict</executablePath> + <executablePath>build/fsa/test_speed</executablePath> + <cTool> + <incDir> + <pElem>build/CMakeFiles/CMakeTmp</pElem> + </incDir> + </cTool> </makeTool> </makefileType> - <item path="cfsa1_impl.hpp" ex="false" tool="3" flavor2="0"> + <item path="/usr/share/cmake-2.8/Modules/CMakeCCompilerABI.c" + ex="false" + tool="0" + flavor2="2"> + <cTool> + </cTool> + </item> + <item path="/usr/share/cmake-2.8/Modules/CMakeCXXCompilerABI.cpp" + ex="false" + tool="1" + flavor2="4"> + <ccTool> + </ccTool> + </item> + <item path="build/CMakeFiles/2.8.11.2/CompilerIdC/CMakeCCompilerId.c" + ex="false" + tool="0" + flavor2="2"> + <cTool> + </cTool> + </item> + <item path="build/CMakeFiles/2.8.11.2/CompilerIdCXX/CMakeCXXCompilerId.cpp" + ex="false" + tool="1" + flavor2="4"> + <ccTool> + </ccTool> </item> <folder path="0"> <ccTool> @@ -56,23 +103,27 @@ </incDir> </ccTool> </folder> - <folder path="1"> + <folder path="2.8.11.2"> + <ccTool> + <incDir> + <pElem>build/CMakeFiles/CMakeTmp</pElem> + </incDir> + </ccTool> + </folder> + <folder path="Modules"> + <ccTool> + <incDir> + <pElem>build/CMakeFiles/CMakeTmp</pElem> + </incDir> + </ccTool> + </folder> + <item path="fsa/const.cpp" ex="false" tool="1" flavor2="8"> <ccTool> <incDir> <pElem>fsa</pElem> <pElem>build/morfeusz</pElem> </incDir> </ccTool> - </folder> - <item path="fsa/cfsa1_impl.hpp" ex="false" tool="3" flavor2="0"> - </item> - <item path="fsa/cfsa2_impl.hpp" ex="false" tool="3" flavor2="0"> - </item> - <item path="fsa/interpretation.hpp" ex="false" tool="3" flavor2="0"> - </item> - <item path="fsa/simplefsa_impl.hpp" ex="false" tool="3" flavor2="0"> - </item> - <item path="fsa/test_morph.cpp" ex="false" tool="1" flavor2="0"> </item> <item path="fsa/test_not_recognize.cpp" ex="false" tool="1" flavor2="8"> <ccTool> @@ -86,12 +137,53 @@ <ccTool> </ccTool> </item> + <item path="morfeusz/MorphDeserializer.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </item> + <item path="morfeusz/Tagset.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </item> + <item path="morfeusz/interpretations.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </item> <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> </ccTool> </item> <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="4"> <ccTool> + <incDir> + <pElem>build/CMakeFiles/CMakeTmp</pElem> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </item> + <item path="morfeusz/test_morph.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> </ccTool> </item> </conf> diff --git a/nbproject/project.xml b/nbproject/project.xml index 84b35e8..6630483 100644 --- a/nbproject/project.xml +++ b/nbproject/project.xml @@ -4,9 +4,9 @@ <configuration> <data xmlns="http://www.netbeans.org/ns/make-project/1"> <name>morfeusz</name> - <c-extensions/> + <c-extensions>c</c-extensions> <cpp-extensions>cpp</cpp-extensions> - <header-extensions>hpp</header-extensions> + <header-extensions>h,hpp</header-extensions> <sourceEncoding>UTF-8</sourceEncoding> <make-dep-projects/> <sourceRootList> -- libgit2 0.22.2