diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..3c6fc4f --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,12 @@ +# The name of our project is "Morfeusz". CMakeLists files in this project can +# refer to the root source directory of the project as ${Morfeusz_SOURCE_DIR} and +# to the root binary directory of the project as ${Morfeusz_BINARY_DIR}. +cmake_minimum_required (VERSION 2.6) +project (Morfeusz) + +# Recurse into the "fsa" and "morfeusz" subdirectories. This does not actually +# cause another cmake executable to run. The same process will walk through +# the project's entire directory structure. +add_subdirectory (fsa) +add_subdirectory (morfeusz) + diff --git a/fsa/CMakeLists.txt b/fsa/CMakeLists.txt new file mode 100644 index 0000000..be16a40 --- /dev/null +++ b/fsa/CMakeLists.txt @@ -0,0 +1,3 @@ + +add_executable (test_dict test_dict.cpp) +set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" ) diff --git a/fsa/_simple_fsa_impl.hpp b/fsa/_simple_fsa_impl.hpp new file mode 100644 index 0000000..1539c91 --- /dev/null +++ b/fsa/_simple_fsa_impl.hpp @@ -0,0 +1,107 @@ +/* + * File: _simple_fsa_impl.hpp + * Author: mlenart + * + * Created on October 20, 2013, 12:25 PM + */ + +#ifndef _SIMPLE_FSA_IMPL_HPP +#define _SIMPLE_FSA_IMPL_HPP + +#include <algorithm> +#include <utility> +#include <iostream> +#include "fsa.hpp" + +using namespace std; + +#pragma pack(push) /* push current alignment to stack */ +#pragma pack(1) /* set alignment to 1 byte boundary */ + +struct StateData { + unsigned transitionsNum : 7; /* number of entries in the transitions table of this state */ + unsigned accepting : 1; /* non-zero when the state is accepting */ +}; + +struct TransitionData { + char label; /* character consumed by this transition */ + unsigned targetOffset : 24; /* offset of the target state, relative to the automaton start pointer */ +}; + +#pragma pack(pop) /* restore original alignment from stack */ + +static bool compareTransitions(TransitionData t1, TransitionData t2) { + return t1.label < t2.label; +} + +template <class T> +SimpleFSA<T>::SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer) +: FSA<T>(ptr, deserializer) 
{ +} + +template <class T> +SimpleFSA<T>::~SimpleFSA() { + +} + +static void debugState(const StateData* stateData) { + cerr << "STATE" << endl; + cerr << stateData->transitionsNum << " " << stateData->accepting << endl; +} + +static void debugTransitions(const TransitionData* transitionsTable, const TransitionData* transitionsEnd) { + int offset = 0; + cerr << "TRANSITIONS" << endl; + while (transitionsTable + offset < transitionsEnd) { + const TransitionData td = *(transitionsTable + offset); + if ((td.label <= 'z' && 'a' <= td.label)) + cerr << td.label << " " << td.targetOffset << endl; + else { + cerr << ((int) td.label) << " " << td.targetOffset << endl; + } + offset++; + } +} + +template <class T> +void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { + if (c<= 'z' && 'a' <= c) + cerr << "NEXT " << c << " from " << state.getOffset() << endl; + else + cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; + const unsigned char* fromPointer = this->startPtr + state.getOffset(); + int transitionsTableOffset = sizeof (StateData); + if (state.isAccepting()) { + transitionsTableOffset += state.getValueSize(); /* the serialized value sits between the state byte and the transitions table */ + cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; + } + const StateData* stateData = reinterpret_cast<const StateData*> (fromPointer); + const TransitionData* transitionsTable = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset); + const TransitionData* transitionsEnd = transitionsTable + stateData->transitionsNum; + debugState(stateData); + debugTransitions(transitionsTable, transitionsEnd); + const TransitionData* foundTransition = std::lower_bound( + transitionsTable, transitionsEnd, + TransitionData{c, 0}, + compareTransitions); + if (foundTransition == transitionsEnd || foundTransition->label != c) { /* no transition labelled c; never dereference transitionsEnd, it is one past the table */ + cerr << "SINK" << (foundTransition == transitionsEnd) << " " << (foundTransition == transitionsEnd ? '-' : foundTransition->label) << " for " << c << 
endl; + state.setNextAsSink(); + } + else { +// cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl; + const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset; + const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer); + if (nextStateData->accepting) { + cerr << "ACCEPTING" << endl; + T object; + int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); + state.setNext(foundTransition->targetOffset, object, size); + } + else { + state.setNext(foundTransition->targetOffset); + } + } +} + +#endif /* _SIMPLE_FSA_IMPL_HPP */ diff --git a/fsa/_state_impl.hpp b/fsa/_state_impl.hpp new file mode 100644 index 0000000..ac9a1ab --- /dev/null +++ b/fsa/_state_impl.hpp @@ -0,0 +1,87 @@ +/* + * File: _state_impl.hpp + * Author: mlenart + * + * Created on 21 październik 2013, 15:20 + */ + +#ifndef _STATE_IMPL_HPP +#define _STATE_IMPL_HPP + +#include <typeinfo> +#include "fsa.hpp" + +using namespace std; + +template <class T> +State<T>::State(const FSA<T>& fsa) +: fsa(fsa), offset(0), accepting(false), sink(false), value(), valueSize(0) { +} + +template <class T> +bool State<T>::isSink() const { + return this->sink; +} + +template <class T> +bool State<T>::isAccepting() const { + return this->accepting; +} + +template <class T> +void State<T>::proceedToNext(const char c) { + if (this->isSink()) { + return; + } + else { + this->fsa.proceedToNext(c, *this); + } +} + +template <class T> +unsigned int State<T>::getOffset() const { + assert(!this->isSink()); + return this->offset; +} + +template <class T> +T State<T>::getValue() const { + assert(this->isAccepting()); + return this->value; +} + +template <class T> +unsigned int State<T>::getValueSize() const { + assert(this->isAccepting()); + return this->valueSize; +} + +template <class T> +State<T>::~State() { + +} + +template <class T> +void State<T>::setNext(const unsigned int offset) { + 
assert(!this->isSink()); + this->offset = offset; + this->accepting = false; +} + +template <class T> +void State<T>::setNext(const unsigned int offset, const T& value, const unsigned int valueSize) { + assert(!this->isSink()); + this->offset = offset; + this->accepting = true; + this->value = value; + this->valueSize = valueSize; +} + +template <class T> +void State<T>::setNextAsSink() { + this->sink = true; + this->accepting = false; +} + +#endif /* _STATE_IMPL_HPP */ + diff --git a/fsa/fsa.hpp b/fsa/fsa.hpp new file mode 100644 index 0000000..a9d7e0a --- /dev/null +++ b/fsa/fsa.hpp @@ -0,0 +1,170 @@ +/* + * File: fsa.hpp + * Author: mlenart + * + * Created on October 17, 2013, 2:00 PM + */ + +#ifndef FSA_HPP +#define FSA_HPP + +//#include <iostream> +#include <cstring> +#include <typeinfo> +#include <cassert> + +template <class T> class State; +template <class T> class FSA; +template <class T> class Deserializer; + +template <class T> +class Deserializer { +public: + virtual ~Deserializer() {} /* polymorphic base: keep destruction through a base pointer well-defined */ + /** + * Deserialize object from ptr. + * Returns number of bytes read or -1 on error. + */ + virtual int deserialize(const unsigned char* ptr, T& object) const = 0; +}; + +class StringDeserializer : public Deserializer<char*> { +public: + + StringDeserializer() { + } + + /** + * Deserialize object from ptr. + * Returns number of bytes read or -1 on error. + */ + int deserialize(const unsigned char* ptr, char*& text) const { + text = const_cast<char*> (reinterpret_cast<const char*> (ptr)); /* points into the automaton buffer, no copy is made */ + return strlen(text) + 1; /* +1 accounts for the terminating NUL byte */ + } +}; + +/** + * Finite state automaton. + */ +template <class T> +class FSA { +public: + + /** + * Get this automaton's initial state. 
+ */ + State<T> getInitialState() const { + return State<T>(*this); + } + + bool tryToRecognize(const char* input, T& value) const { /* returns true and sets value only if input ends in an accepting state */ + State<T> currState = this->getInitialState(); + int i = 0; + while (!currState.isSink() && input[i] != '\0') { + currState.proceedToNext(input[i]); + i++; + } + if (currState.isAccepting()) { + value = currState.getValue(); + return true; + } + else { + return false; + } + } + + virtual ~FSA() { + } +protected: + + FSA(const unsigned char* ptr, const Deserializer<T>& deserializer) + : startPtr(ptr), deserializer(deserializer) { + } + /** + * Advance the given state along the transition labelled c. + * + * @param c - char for the transition. + * @param state - state to advance; it is modified in place. + * @return nothing, the result is stored in state. + */ + virtual void proceedToNext(const char c, State<T>& state) const = 0; + const unsigned char* startPtr; + const Deserializer<T>& deserializer; + friend class State<T>; +private: + // FSA(); +}; + +template <class T> +class SimpleFSA : public FSA<T> { +public: + SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer); + virtual ~SimpleFSA(); +protected: + void proceedToNext(const char c, State<T>& state) const; +private: + +}; + +#include "_simple_fsa_impl.hpp" + +/** + * A state in an FSA. + */ +template <class T> +class State { +public: + + /** + * Is this a "sink" state - non-accepting state without outgoing transitions + */ + bool isSink() const; + + /** + * Is this an accepting state + */ + bool isAccepting() const; + + /** + * Move this state along the transition for the given character (in place). + */ + void proceedToNext(const char c); + + /** + * Get value of this state. + * Makes sense only for accepting states. + * For non-accepting states an assertion fails. + */ + T getValue() const; + + /** + * Get the size (in bytes) of this state's value. + * Makes sense only for accepting states. + * For non-accepting states an assertion fails. 
+ */ + unsigned int getValueSize() const; + + unsigned int getOffset() const; + + void setNext(const unsigned int offset); + void setNext(const unsigned int offset, const T& value, const unsigned int valueSize); + void setNextAsSink(); + + State(const FSA<T>& fsa); + + virtual ~State(); +private: + const FSA<T>& fsa; + unsigned int offset; + bool accepting; + bool sink; + T value; + unsigned int valueSize; +}; + +#include "_state_impl.hpp" + +#endif /* FSA_HPP */ + + diff --git a/fsa/test_dict.cpp b/fsa/test_dict.cpp new file mode 100644 index 0000000..8eee700 --- /dev/null +++ b/fsa/test_dict.cpp @@ -0,0 +1,114 @@ +/* + * File: test_dict.cpp + * Author: lennyn + * + * Created on October 22, 2013, 2:11 PM + */ + +#include <cstdlib> +#include <iostream> +#include <fstream> +#include <string> +#include <sstream> +#include <algorithm> +#include <functional> +#include <cctype> +#include <locale> +#include <vector> +#include "fsa.hpp" + +using namespace std; + +void validate(const bool cond, const string& msg) { + if (!cond) { + cerr << msg << endl; + exit(1); + } +} + +unsigned char* readFile(const char* fname) { + ifstream ifs; + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + ifs.open(fname, ios::in | ios::binary | ios::ate); + // if (ifs.is_open()) { + const std::streamsize size = ifs.tellg(); // opened with ios::ate, so the position equals the file size + unsigned char* memblock = new unsigned char [size]; // ownership passes to the caller (release with delete[]) + ifs.seekg(0, ios::beg); + ifs.read(reinterpret_cast<char*> (memblock), size); + ifs.close(); + return memblock; + // } + // else { + // cerr << "Unable to open file " << fname << endl; + // } +} + +std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { + std::stringstream ss(s); + std::string item; + while (std::getline(ss, item, delim)) { + elems.push_back(item); + } + return elems; +} + + +std::vector<std::string> split(const std::string &s, char delim) { + std::vector<std::string> elems; + split(s, delim, elems); + return elems; +} + +static inline string &rtrim(string &s) { + s.erase(find_if(s.rbegin(), 
s.rend(), not1(ptr_fun<int, int>(isspace))).base(), s.end()); + return s; +} + +void testFSA(const FSA<char*>& fsa, const char* fname) { + ifstream ifs; +// ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); + ifs.open(fname, ios::binary); + string line; + while (getline(ifs, line)) { + vector<string> split1(split(line, '\t')); + if (split1.size() < 2) { cerr << "Skipping malformed line: " << line << endl; continue; } // need key<TAB>value before indexing + string key = split1[0]; + string value = split1[1]; + + for (unsigned int i = 0; i < key.length(); i++) { + cout << (int) key[i] << " "; + } + cout << endl; + + char* value2; + if (fsa.tryToRecognize(key.c_str(), value2)) { + if (string(value) != string(value2)) { + cout << "BAD INTERP " << key << " " << value << " != " << value2 << endl; + } + else { + cout << "OK! " << key << " " << value << endl; + } + } + else { + cout << "MISS " << key << " " << value << " not recognized" << endl; + } + } + cout << ifs.good() << endl; + cout << ifs.fail() << endl; + cout << ifs.eof() << endl; + cout << "done" << endl; +} + +/* + * + */ +int main(int argc, char** argv) { + validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename."); + const unsigned char* fsaData = readFile(argv[1]); + StringDeserializer deserializer; + SimpleFSA<char*> fsa(fsaData, deserializer); + testFSA(fsa, argv[2]); + cout << argc << endl; + return 0; +} + diff --git a/fsabuilder/.settings/org.eclipse.core.resources.prefs b/fsabuilder/.settings/org.eclipse.core.resources.prefs new file mode 100644 index 0000000..90c53a4 --- /dev/null +++ b/fsabuilder/.settings/org.eclipse.core.resources.prefs @@ -0,0 +1,3 @@ +eclipse.preferences.version=1 +encoding//fsa/buildfsa.py=utf-8 +encoding//fsa/test/testConstruction.py=utf-8 diff --git a/fsabuilder/fsa/__init__.py b/fsabuilder/fsa/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/fsabuilder/fsa/__init__.py diff --git a/fsabuilder/fsa/buildfsa.py b/fsabuilder/fsa/buildfsa.py new file mode 100644 index 0000000..39edb96 --- /dev/null +++ 
b/fsabuilder/fsa/buildfsa.py @@ -0,0 +1,95 @@ +# -*- coding:utf-8 -*- +''' +Created on 21 paź 2013 + +@author: mlenart +''' + +import sys +import logging +import codecs +import encode +import convertinput +from fsa import FSA +from serializer import SimpleSerializerWithStringValues +from visualizer import Visualizer +from optparse import OptionParser + +logging.basicConfig(level=logging.DEBUG) + +class OutputFormat(): + BINARY = 'BINARY' + CPP = 'CPP' + +class InputFormat(): + ENCODED = 'ENCODED' + POLIMORF = 'POLIMORF' + +def parseOptions(): + """ + Parses commandline args + """ + parser = OptionParser() + parser.add_option('-i', '--input-file', + dest='inputFile', + metavar='FILE', + help='path to input file') + parser.add_option('-o', '--output-file', + dest='outputFile', + metavar='FILE', + help='path to output file') + parser.add_option('--input-format', + dest='inputFormat', + help='input format - ENCODED or POLIMORF') + parser.add_option('--output-format', + dest='outputFormat', + help='output format - BINARY or CPP') + parser.add_option('--visualize', + dest='visualize', + action='store_true', + default=False, + help='visualize result') + + opts, args = parser.parse_args() + + if None in [opts.inputFile, opts.outputFile, opts.outputFormat, opts.inputFormat]: + parser.print_help() + exit(1) + if not opts.outputFormat.upper() in [OutputFormat.BINARY, OutputFormat.CPP]: + print >> sys.stderr, 'output format must be one of ('+str([OutputFormat.BINARY, OutputFormat.CPP])+')' + exit(1) + if not opts.inputFormat.upper() in [InputFormat.ENCODED, InputFormat.POLIMORF]: + print >> sys.stderr, 'input format must be one of ('+str([InputFormat.ENCODED, InputFormat.POLIMORF])+')' + exit(1) + return opts + +def readEncodedInput(inputFile): + with codecs.open(inputFile, 'r', 'utf8') as f: + for line in f.readlines(): + word, interps = line.strip().split() + yield word, interps.split(u'|') + +def readPolimorfInput(inputFile, encoder): + with codecs.open(inputFile, 'r', 
'utf8') as f: + for entry in convertinput.convertPolimorf(f.readlines(), lambda (word, interp): encoder.word2SortKey(word)): + yield entry + +if __name__ == '__main__': + opts = parseOptions() + encoder = encode.Encoder() + fsa = FSA(encoder) + serializer = SimpleSerializerWithStringValues() + + inputData = readEncodedInput(opts.inputFile) \ + if opts.inputFormat.upper() == InputFormat.ENCODED \ + else readPolimorfInput(opts.inputFile, encoder) + + logging.info('feeding FSA with data ...') + fsa.feed(inputData) + logging.info('states num: '+str(fsa.getStatesNum())) + if opts.outputFormat.upper() == OutputFormat.CPP: # parseOptions validates the upper-cased value, so compare the same way + serializer.serialize2CppFile(fsa, opts.outputFile) + else: + serializer.serialize2BinaryFile(fsa, opts.outputFile) + if opts.visualize: + Visualizer().visualize(fsa) diff --git a/fsabuilder/fsa/convertinput.py b/fsabuilder/fsa/convertinput.py new file mode 100644 index 0000000..986d3ee --- /dev/null +++ b/fsabuilder/fsa/convertinput.py @@ -0,0 +1,61 @@ +''' +Created on Oct 23, 2013 + +@author: mlenart +''' +import sys +import fileinput +import logging +from encode import Encoder + +def _encodeInterp(orth, base, tag, name): + removePrefix = 0 + root = u'' + for o, b in zip(orth, base): + if o == b: + root += o + else: + break + removeSuffixNum = len(orth) - len(root) + addSuffix = base[len(root):] + return u'+'.join([ + chr(ord('A')+removePrefix) + chr(ord('A')+removeSuffixNum) + addSuffix, + tag, + name]) + +def _parsePolimorf(inputLines): + for line0 in inputLines: + line = line0.strip(u'\n') + if line: +# print line + orth, base, tag, name = line.split(u'\t') + yield (orth, _encodeInterp(orth, base, tag, name)) + +def _sortAndMergeParsedInput(inputData, key=lambda k: k): + logging.info('sorting input...') + entries = list(inputData) + entries.sort(key=key) + logging.info('done sorting') + prevOrth = None + prevInterps = None + for orth, interp in entries + [(None, None)]: # trailing sentinel flushes the last group, which was otherwise never yielded + if prevOrth and prevOrth == orth: + prevInterps.append(interp) + else: + if prevOrth: + yield (prevOrth, 
sorted(set(prevInterps))) + prevOrth = orth + prevInterps = [interp] + +def convertPolimorf(inputLines, sortKey=lambda k: k): + for orth, interps in _sortAndMergeParsedInput(_parsePolimorf(inputLines), key=sortKey): + yield orth, interps + +def _decodeInputLines(rawInputLines, encoding): + for line in rawInputLines: + yield line.decode(encoding) + +if __name__ == '__main__': + encoder = Encoder() + for orth, interps in convertPolimorf(_decodeInputLines(fileinput.input(), 'utf8'), lambda (orth, interp): encoder.word2SortKey(orth)): + print u'\t'.join([orth, u'|'.join(interps)]).encode('utf8') diff --git a/fsabuilder/fsa/encode.py b/fsabuilder/fsa/encode.py new file mode 100644 index 0000000..db6f68d --- /dev/null +++ b/fsabuilder/fsa/encode.py @@ -0,0 +1,31 @@ +''' +Created on Oct 23, 2013 + +@author: lennyn +''' + +class Encoder(object): + ''' + classdocs + ''' + + + def __init__(self, encoding='utf8'): + ''' + Constructor + ''' + self.encoding = encoding + + def encodeWord(self, word): + return bytearray(word, self.encoding) + + def encodeData(self, data): + return bytearray(u'|'.join(data).encode(self.encoding)) + bytearray([0]) + + def decodeData(self, rawData): +# print unicode(str(rawData), self.encoding)[:-1] +# print unicode(str(rawData), self.encoding)[:-1].split(u'|') + return unicode(str(rawData), self.encoding)[:-1].split(u'|') + + def word2SortKey(self, word): + return word.encode(self.encoding) diff --git a/fsabuilder/fsa/fsa.py b/fsabuilder/fsa/fsa.py new file mode 100644 index 0000000..9243863 --- /dev/null +++ b/fsabuilder/fsa/fsa.py @@ -0,0 +1,92 @@ +''' +Created on Oct 8, 2013 + +@author: mlenart +''' + +import state +import register +import logging + +class FSA(object): + ''' + A finite state automaton + ''' + + + def __init__(self, encoder): + self.encodeWord = encoder.encodeWord + self.encodeData = encoder.encodeData + self.decodeData = encoder.decodeData + self.encodedPrevWord = None + self.initialState = state.State() + self.register = 
register.Register() + + def tryToRecognize(self, word): + return self.decodeData(self.initialState.tryToRecognize(self.encodeWord(word))) + + def feed(self, input): + + for n, (word, data) in enumerate(input, start=1): + assert data is not None + if type(data) in [str, unicode]: + data = [data] + encodedWord = self.encodeWord(word) + assert encodedWord > self.encodedPrevWord + self._addSorted(encodedWord, self.encodeData(data)) + self.encodedPrevWord = encodedWord + assert self.tryToRecognize(word) == data + if n % 10000 == 0: + logging.info(word) + + self.initialState = self._replaceOrRegister(self.initialState, self.encodedPrevWord or bytearray()) # encodedPrevWord is the last fed word; empty input no longer hits an unbound 'word' + self.encodedPrevWord = None + + def getStatesNum(self): + return self.register.getStatesNum() + + def _addSorted(self, encodedWord, data): + assert self.encodedPrevWord < encodedWord + q = self.initialState + i = 0 + while i < len(encodedWord) and q.hasNext(encodedWord[i]): # follow the prefix shared with already-added words; stop before indexing past the word + q = q.getNext(encodedWord[i]) + i += 1 + if self.encodedPrevWord and i < len(self.encodedPrevWord): + nextState = q.getNext(self.encodedPrevWord[i]) + q.setTransition( + self.encodedPrevWord[i], + self._replaceOrRegister(nextState, self.encodedPrevWord[i+1:])) + + while i < len(encodedWord): + q.setTransition(encodedWord[i], state.State()) + q = q.getNext(encodedWord[i]) + i += 1 + + assert q.encodedData is None +# print q, encodedData + q.encodedData = data + + def _replaceOrRegister(self, q, encodedWord): + if encodedWord: + nextState = q.getNext(encodedWord[0]) + q.setTransition( + encodedWord[0], + self._replaceOrRegister(nextState, encodedWord[1:])) + + if self.register.containsEquivalentState(q): + return self.register.getEquivalentState(q) + else: + self.register.addState(q) + return q + + def calculateOffsets(self, sizeCounter): + currReverseOffset = 0 + for state in self.initialState.dfs(set()): + currReverseOffset += sizeCounter(state) + state.reverseOffset = currReverseOffset + for state in self.initialState.dfs(set()): + state.offset = 
currReverseOffset - state.reverseOffset + + + \ No newline at end of file diff --git a/fsabuilder/fsa/register.py b/fsabuilder/fsa/register.py new file mode 100644 index 0000000..b20899f --- /dev/null +++ b/fsabuilder/fsa/register.py @@ -0,0 +1,26 @@ +''' +Created on Oct 8, 2013 + +@author: mlenart +''' + +class Register(object): + ''' + States register. + ''' + + + def __init__(self): + self._map = {} + + def addState(self, state): + self._map[state.getRegisterKey()] = state + + def getEquivalentState(self, state): + return self._map[state.getRegisterKey()] + + def containsEquivalentState(self, state): + return state.getRegisterKey() in self._map + + def getStatesNum(self): + return len(self._map) diff --git a/fsabuilder/fsa/serializer.py b/fsabuilder/fsa/serializer.py new file mode 100644 index 0000000..736b08c --- /dev/null +++ b/fsabuilder/fsa/serializer.py @@ -0,0 +1,93 @@ +''' +Created on Oct 20, 2013 + +@author: mlenart +''' + +class Serializer(object): + + def __init__(self): + pass + + def serialize2CppFile(self, fsa, fname): + res = [] + fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) + res.append('const unsigned char DEFAULT_FSA[] = {') + for idx, state in enumerate(sorted(fsa.initialState.dfs(set()), key=lambda state: state.offset)): + res.append('// state '+str(idx)) + partRes = [] + for byte in self.state2bytearray(state): + partRes.append(hex(byte)) + partRes.append(',') + res.append(' '.join(partRes)) + res.append('};') # a C++ array definition must be terminated with a semicolon + with open(fname, 'w') as f: + f.write('\n'.join(res)) + + def serialize2BinaryFile(self, fsa, fname): + res = bytearray() + fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) + for state in sorted(fsa.initialState.dfs(set()), key=lambda state: state.offset): +# res.append('// state '+str(idx)) + res.extend(self.state2bytearray(state)) + with open(fname, 'wb') as f: + f.write(res) + + def getStateSize(self, state): + raise NotImplementedError('Not implemented') + + def 
fsa2bytearray(self, fsa): + res = bytearray() + fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state)) + for state in sorted(fsa.initialState.dfs(set()), key=lambda state: state.offset): # key must be a callable; 'state' is not bound outside the loop + res.extend(self.state2bytearray(state)) + return res + + def state2bytearray(self, state): + raise NotImplementedError('Not implemented') + +class SimpleSerializer(Serializer): + + ACCEPTING_FLAG = 128 + + def getStateSize(self, state): + return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state) + + def getDataSize(self, state): + raise NotImplementedError('Not implemented') + + def state2bytearray(self, state): + res = bytearray() + res.extend(self._stateData2bytearray(state)) + res.extend(self._transitionsData2bytearray(state)) + return res + + def _stateData2bytearray(self, state): + res = bytearray() + firstByte = 0 + if state.isAccepting(): + firstByte |= SimpleSerializer.ACCEPTING_FLAG + firstByte |= len(state.transitionsMap) + assert len(state.transitionsMap) < SimpleSerializer.ACCEPTING_FLAG and firstByte > 0 # the count shares the byte with the flag, so it must fit in the low 7 bits + res.append(firstByte) + if state.isAccepting(): + res.extend(state.encodedData) + return res + + def _transitionsData2bytearray(self, state): + res = bytearray() + # must sort that strange way because it must be sorted according to char, not unsigned char + for byte, nextState in sorted(state.transitionsMap.iteritems(), key=lambda (c, _): c if (c >= 0 and c < 128) else c - 256): + res.append(byte) + offset = nextState.offset + res.append(offset & 0x0000FF) + res.append((offset & 0x00FF00) >> 8) + res.append((offset & 0xFF0000) >> 16) + return res + +class SimpleSerializerWithStringValues(SimpleSerializer): + + def getDataSize(self, state): + assert type(state.encodedData) == bytearray or not state.isAccepting() + return len(state.encodedData) if state.isAccepting() else 0 + \ No newline at end of file diff --git a/fsabuilder/fsa/state.py b/fsabuilder/fsa/state.py new file mode 100644 index 0000000..1c24ceb --- /dev/null +++ b/fsabuilder/fsa/state.py @@ -0,0 +1,49 @@ +''' +Created on Oct 
8, 2013 + +@author: mlenart +''' + +class State(object): + ''' + A state in an automaton + ''' + + def __init__(self): + self.transitionsMap = {} + self.encodedData = None + self.reverseOffset = None + self.offset = None + + def setTransition(self, byte, nextState): + self.transitionsMap[byte] = nextState + + def hasNext(self, byte): + return byte in self.transitionsMap + + def getNext(self, byte): + return self.transitionsMap.get(byte, None) + + def getRegisterKey(self): + return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None ) + + def isAccepting(self): + return self.encodedData is not None + + def tryToRecognize(self, word): + if word: + nextState = self.getNext(word[0]) + if nextState: + return nextState.tryToRecognize(word[1:]) + else: + return False + else: + return self.encodedData + + def dfs(self, alreadyVisited): + if not self in alreadyVisited: + for _, state in sorted(self.transitionsMap.iteritems()): + for state1 in state.dfs(alreadyVisited): + yield state1 + alreadyVisited.add(self) + yield self diff --git a/fsabuilder/fsa/test/__init__.py b/fsabuilder/fsa/test/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/fsabuilder/fsa/test/__init__.py diff --git a/fsabuilder/fsa/test/testConstruction.py b/fsabuilder/fsa/test/testConstruction.py new file mode 100644 index 0000000..6cbd142 --- /dev/null +++ b/fsabuilder/fsa/test/testConstruction.py @@ -0,0 +1,69 @@ +#-*- coding: utf-8 -*- +''' +Created on Oct 8, 2013 + +@author: lennyn +''' +import unittest +from fsa import fsa, visualizer, encode + +class Test(unittest.TestCase): + + + def testSimpleConstruction(self): + print 'dupa' + a = fsa.FSA(encode.Encoder()) + input = sorted([ + (u'bić', ''), + (u'bij', ''), + (u'biją', ''), + (u'bijcie', ''), + (u'bije', ''), + (u'bijecie', ''), + (u'bijemy', ''), + (u'bijesz', ''), + (u'biję', ''), + (u'bijmy', ''), + (u'bili', 'asd'), + (u'biliby', ''), + (u'bilibyście', ''), + 
(u'bilibyśmy', ''), + (u'biliście', 'asdfas'), + (u'biliśmy', ''), + (u'bił', 'wersadfas'), + (u'biła', 'asdfasd'), + (u'biłaby', 'asdfa'), + (u'biłabym', ''), + (u'biłabyś', 'asdfa'), + (u'biłam', 'dfas'), + (u'biłaś', 'asdfas'), + (u'biłby', ''), + (u'biłbym', 'asdfa'), + (u'biłbyś', ''), + (u'biłem', ''), + (u'biłeś', 'sadfa'), + (u'biły', ''), + (u'biłyby', ''), + (u'biłybyście', ''), + (u'biłybyśmy', ''), + (u'biłyście', ''), + (u'biłyśmy', ''), + ], key=lambda w: bytearray(w[0], 'utf8')) + a.feed(input) + print a.getStatesNum() +# print a.tryToRecognize(u'bi') +# print a.tryToRecognize(u'bić') +# print a.tryToRecognize(u'bili') + for w, res in input: + print w, res, a.tryToRecognize(w) + recognized = a.tryToRecognize(w) + if type(res) in [str, unicode]: + recognized = recognized[0] + assert recognized == res + a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0)) + visualizer.Visualizer().visualize(a) + print 'done' + +if __name__ == "__main__": + #import sys;sys.argv = ['', 'Test.testSimpleConstruction'] + unittest.main() \ No newline at end of file diff --git a/fsabuilder/fsa/visualizer.py b/fsabuilder/fsa/visualizer.py new file mode 100644 index 0000000..78c4410 --- /dev/null +++ b/fsabuilder/fsa/visualizer.py @@ -0,0 +1,45 @@ +''' +Created on Oct 20, 2013 + +@author: mlenart +''' + +import networkx as nx +import matplotlib.pyplot as plt + +class Visualizer(object): + + def __init__(self): + pass + + def visualize(self, fsa): + G = nx.DiGraph() + allStates = list(reversed(list(fsa.initialState.dfs(set())))) + edgeLabelsMap = {} + nodeLabelsMap = {} + for idx, state in enumerate(allStates): + G.add_node(idx, offset=state.offset) + for c, targetState in state.transitionsMap.iteritems(): + G.add_edge(idx, allStates.index(targetState)) + label = chr(c) if c <= 127 else '%' + edgeLabelsMap[(idx, allStates.index(targetState))] = label + nodeLabelsMap[idx] = state.offset if not 
state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')' + pos=nx.shell_layout(G) + nx.draw_networkx_nodes(G, + pos, + nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]), + node_shape='s', + node_color='w') + nx.draw_networkx_nodes(G, + pos, + nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]), + node_shape='s') +# nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), ) + nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys()) + nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap) + nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap) + plt.axis('off') + plt.draw() + plt.show() +# plt.savefig('dupa.png') + print 'done' \ No newline at end of file diff --git a/fsabuilder/input/__init__.py b/fsabuilder/input/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/fsabuilder/input/__init__.py diff --git a/morfeusz/CMakeLists.txt b/morfeusz/CMakeLists.txt new file mode 100644 index 0000000..641748b --- /dev/null +++ b/morfeusz/CMakeLists.txt @@ -0,0 +1,12 @@ +# Make sure the compiler can find include files from our Hello library. +#include_directories (${Morfeusz_SOURCE_DIR}/FSALibrary) + +# Make sure the linker can find the Hello library once it is built. +#link_directories (${Morfeusz_BINARY_DIR}/Hello) +include_directories (${Morfeusz_SOURCE_DIR}/fsa) +add_library (morfeusz2 morfeusz.hpp morfeusz.cpp) +add_executable (morfeusz2_analyze main.cpp) + +# Link the executable to the Hello library. 
+target_link_libraries (morfeusz2_analyze morfeusz2) +set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) diff --git a/morfeusz/default_fsa.hpp b/morfeusz/default_fsa.hpp new file mode 100644 index 0000000..9de435f --- /dev/null +++ b/morfeusz/default_fsa.hpp @@ -0,0 +1,14 @@ +/* + * File: default_fsa.hpp + * Author: mlenart + * + * Created on 21 październik 2013, 17:50 + */ + +#ifndef DEFAULT_FSA_HPP +#define DEFAULT_FSA_HPP + +extern const unsigned char DEFAULT_FSA[]; + +#endif /* DEFAULT_FSA_HPP */ + diff --git a/morfeusz/main.cpp b/morfeusz/main.cpp new file mode 100644 index 0000000..9713b42 --- /dev/null +++ b/morfeusz/main.cpp @@ -0,0 +1,25 @@ +/* + * File: main.cc + * Author: mlenart + * + * Created on October 8, 2013, 12:41 PM + */ + +#include <cstdlib> +#include <iostream> +#include "fsa.hpp" +#include "default_fsa.hpp" + +using namespace std; + +/* + * + */ +int main(int argc, char** argv) { + unsigned char dupa[3] = {0376 | 1, 0111, 0234, }; + char x = 255; + cout << *reinterpret_cast<int*>(&x) << endl; + return 0; +} + + diff --git a/morfeusz/morfeusz.cpp b/morfeusz/morfeusz.cpp new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/morfeusz/morfeusz.cpp diff --git a/morfeusz/morfeusz.hpp b/morfeusz/morfeusz.hpp new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/morfeusz/morfeusz.hpp diff --git a/nbproject/configurations.xml b/nbproject/configurations.xml new file mode 100644 index 0000000..444bba9 --- /dev/null +++ b/nbproject/configurations.xml @@ -0,0 +1,61 @@ +<?xml version="1.0" encoding="UTF-8"?> +<configurationDescriptor version="90"> + <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> + <df root="fsa" name="0"> + <in>_state_impl.hpp</in> + </df> + <df root="morfeusz" name="1"> + <in>default_fsa.hpp</in> + <in>main.cpp</in> + <in>morfeusz.cpp</in> + </df> + <logicalFolder name="ExternalFiles" + displayName="Important Files" + projectFiles="false" + 
kind="IMPORTANT_FILES_FOLDER"> + <itemPath>CMakeLists.txt</itemPath> + <itemPath>build/Makefile</itemPath> + </logicalFolder> + </logicalFolder> + <sourceFolderFilter>^(nbproject)$</sourceFolderFilter> + <sourceRootList> + <Elem>fsa</Elem> + <Elem>morfeusz</Elem> + </sourceRootList> + <projectmakefile>build/Makefile</projectmakefile> + <confs> + <conf name="Default" type="0"> + <toolsSet> + <compilerSet>default</compilerSet> + <dependencyChecking>false</dependencyChecking> + <rebuildPropChanged>false</rebuildPropChanged> + </toolsSet> + <codeAssistance> + </codeAssistance> + <makefileType> + <makeTool> + <buildCommandWorkingDir>build</buildCommandWorkingDir> + <buildCommand>${MAKE} -f Makefile</buildCommand> + <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> + <executablePath></executablePath> + <ccTool> + <incDir> + <pElem>fsa</pElem> + <pElem>build/morfeusz</pElem> + </incDir> + </ccTool> + </makeTool> + </makefileType> + <item path="fsa/_state_impl.hpp" ex="false" tool="3" flavor2="0"> + </item> + <item path="morfeusz/default_fsa.hpp" ex="false" tool="3" flavor2="0"> + </item> + <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> + <ccTool> + </ccTool> + </item> + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="0"> + </item> + </conf> + </confs> +</configurationDescriptor> diff --git a/nbproject/project.xml b/nbproject/project.xml new file mode 100644 index 0000000..84b35e8 --- /dev/null +++ b/nbproject/project.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://www.netbeans.org/ns/project/1"> + <type>org.netbeans.modules.cnd.makeproject</type> + <configuration> + <data xmlns="http://www.netbeans.org/ns/make-project/1"> + <name>morfeusz</name> + <c-extensions/> + <cpp-extensions>cpp</cpp-extensions> + <header-extensions>hpp</header-extensions> + <sourceEncoding>UTF-8</sourceEncoding> + <make-dep-projects/> + <sourceRootList> + <sourceRootElem>fsa</sourceRootElem> + 
<sourceRootElem>morfeusz</sourceRootElem> + </sourceRootList> + <confList> + <confElem> + <name>Default</name> + <type>0</type> + </confElem> + </confList> + <formatting> + <project-formatting-style>false</project-formatting-style> + </formatting> + </data> + </configuration> +</project>