Commit b314fe9d9e17aa2ed2c4b24739ff8156717e7948

Authored by Michał Lenart
1 parent 0b045fe6

- pierwszy commit

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@2 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
CMakeLists.txt 0 → 100644
# The name of our project is "Morfeusz". CMakeLists files in this project can
# refer to the root source directory of the project as ${Morfeusz_SOURCE_DIR}
# and to the root binary directory of the project as ${Morfeusz_BINARY_DIR}.
cmake_minimum_required (VERSION 2.6)
project (Morfeusz)

# Recurse into the "fsa" and "morfeusz" subdirectories. This does not actually
# cause another cmake executable to run. The same process will walk through
# the project's entire directory structure.
add_subdirectory (fsa)
add_subdirectory (morfeusz)
  12 +
... ...
fsa/CMakeLists.txt 0 → 100644
  1 +
# Build the test_dict executable from test_dict.cpp.
add_executable (test_dict test_dict.cpp)
# Compile it as GNU C++0x with all warnings and debug information enabled.
set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" )
... ...
fsa/_simple_fsa_impl.hpp 0 → 100644
  1 +/*
  2 + * File: _simple_fsa_impl.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on October 20, 2013, 12:25 PM
  6 + */
  7 +
  8 +#ifndef _SIMPLE_FSA_IMPL_HPP
  9 +#define _SIMPLE_FSA_IMPL_HPP
  10 +
  11 +#include <algorithm>
  12 +#include <utility>
  13 +#include <iostream>
  14 +#include "fsa.hpp"
  15 +
  16 +using namespace std;
  17 +
#pragma pack(push) /* push current alignment to stack */
#pragma pack(1) /* set alignment to 1 byte boundary */

/**
 * One-byte header found at the beginning of every serialized state:
 * the number of outgoing transitions (7 bits) and the accepting flag (1 bit).
 */
struct StateData {
    unsigned transitionsNum : 7;
    unsigned accepting : 1;
};

/**
 * Four-byte entry of a state's transitions table: the transition label
 * followed by the 24-bit offset of the target state.
 */
struct TransitionData {
    char label;
    unsigned targetOffset : 24;
};

#pragma pack(pop) /* restore original alignment from stack */

// Bit-field layout is implementation-defined, while the code in this header
// uses sizeof(StateData) as a byte offset and walks TransitionData entries by
// pointer arithmetic. Fail the build instead of misreading the automaton on a
// compiler that lays these structures out differently.
static_assert(sizeof (StateData) == 1, "StateData must occupy exactly one byte");
static_assert(sizeof (TransitionData) == 4, "TransitionData must occupy exactly four bytes");
  32 +
  33 +static bool compareTransitions(TransitionData t1, TransitionData t2) {
  34 + return t1.label < t2.label;
  35 +}
  36 +
/**
 * Constructs the automaton by forwarding both arguments unchanged
 * to the FSA<T> base-class constructor.
 */
template <class T>
SimpleFSA<T>::SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer)
: FSA<T>(ptr, deserializer) {
}
  41 +
/**
 * Destructor; performs no work of its own.
 */
template <class T>
SimpleFSA<T>::~SimpleFSA() {

}
  46 +
  47 +static void debugState(const StateData* stateData) {
  48 + cerr << "STATE" << endl;
  49 + cerr << stateData->transitionsNum << " " << stateData->accepting << endl;
  50 +}
  51 +
  52 +static void debugTransitions(const TransitionData* transitionsTable, const TransitionData* transitionsEnd) {
  53 + int offset = 0;
  54 + cerr << "TRANSITIONS" << endl;
  55 + while (transitionsTable + offset < transitionsEnd) {
  56 + const TransitionData td = *(transitionsTable + offset);
  57 + if ((td.label <= 'z' && 'a' <= td.label))
  58 + cerr << td.label << " " << td.targetOffset << endl;
  59 + else {
  60 + cerr << ((int) td.label) << " " << td.targetOffset << endl;
  61 + }
  62 + offset++;
  63 + }
  64 +}
  65 +
  66 +template <class T>
  67 +void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
  68 + if (c<= 'z' && 'a' <= c)
  69 + cerr << "NEXT " << c << " from " << state.getOffset() << endl;
  70 + else
  71 + cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
  72 + const unsigned char* fromPointer = this->startPtr + state.getOffset();
  73 + int transitionsTableOffset = sizeof (StateData);
  74 + if (state.isAccepting()) {
  75 + transitionsTableOffset += state.getValueSize();
  76 + cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
  77 + }
  78 + const StateData* stateData = reinterpret_cast<const StateData*> (fromPointer);
  79 + const TransitionData* transitionsTable = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset);
  80 + const TransitionData* transitionsEnd = transitionsTable + stateData->transitionsNum;
  81 + debugState(stateData);
  82 + debugTransitions(transitionsTable, transitionsEnd);
  83 + const TransitionData* foundTransition = std::lower_bound(
  84 + transitionsTable, transitionsEnd,
  85 + TransitionData{c, 0},
  86 + compareTransitions);
  87 + if (foundTransition == transitionsEnd || foundTransition->label != c) {
  88 + cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl;
  89 + state.setNextAsSink();
  90 + }
  91 + else {
  92 +// cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl;
  93 + const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset;
  94 + const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer);
  95 + if (nextStateData->accepting) {
  96 + cerr << "ACCEPTING" << endl;
  97 + T object;
  98 + int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
  99 + state.setNext(foundTransition->targetOffset, object, size);
  100 + }
  101 + else {
  102 + state.setNext(foundTransition->targetOffset);
  103 + }
  104 + }
  105 +}
  106 +
  107 +#endif /* _SIMPLE_FSA_IMPL_HPP */
... ...
fsa/_state_impl.hpp 0 → 100644
  1 +/*
  2 + * File: _state_impl.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 21 październik 2013, 15:20
  6 + */
  7 +
  8 +#ifndef _STATE_IMPL_HPP
  9 +#define _STATE_IMPL_HPP
  10 +
  11 +#include <typeinfo>
  12 +#include "fsa.hpp"
  13 +
  14 +using namespace std;
  15 +
/**
 * Creates a state of `fsa` at offset 0 that is neither accepting nor a sink,
 * with a value-initialized value and a value size of 0.
 */
template <class T>
State<T>::State(const FSA<T>& fsa)
: fsa(fsa), offset(0), accepting(false), sink(false), value(), valueSize(0) {
}
  20 +
  21 +template <class T>
  22 +bool State<T>::isSink() const {
  23 + return this->sink;
  24 +}
  25 +
  26 +template <class T>
  27 +bool State<T>::isAccepting() const {
  28 + return this->accepting;
  29 +}
  30 +
  31 +template <class T>
  32 +void State<T>::proceedToNext(const char c) {
  33 + if (this->isSink()) {
  34 + return;
  35 + }
  36 + else {
  37 + this->fsa.proceedToNext(c, *this);
  38 + }
  39 +}
  40 +
  41 +template <class T>
  42 +unsigned int State<T>::getOffset() const {
  43 + assert(!this->isSink());
  44 + return this->offset;
  45 +}
  46 +
  47 +template <class T>
  48 +T State<T>::getValue() const {
  49 + assert(this->isAccepting());
  50 + return this->value;
  51 +}
  52 +
  53 +template <class T>
  54 +unsigned int State<T>::getValueSize() const {
  55 + assert(this->isAccepting());
  56 + return this->valueSize;
  57 +}
  58 +
/**
 * Destructor; performs no work of its own.
 */
template <class T>
State<T>::~State() {

}
  63 +
  64 +template <class T>
  65 +void State<T>::setNext(const unsigned int offset) {
  66 + assert(!this->isSink());
  67 + this->offset = offset;
  68 + this->accepting = false;
  69 +}
  70 +
  71 +template <class T>
  72 +void State<T>::setNext(const unsigned int offset, const T& value, const unsigned int valueSize) {
  73 + assert(!this->isSink());
  74 + this->offset = offset;
  75 + this->accepting = true;
  76 + this->value = value;
  77 + this->valueSize = valueSize;
  78 +}
  79 +
  80 +template <class T>
  81 +void State<T>::setNextAsSink() {
  82 + this->sink = true;
  83 + this->accepting = false;
  84 +}
  85 +
  86 +#endif /* _STATE_IMPL_HPP */
  87 +
... ...
fsa/fsa.hpp 0 → 100644
  1 +/*
  2 + * File: fsa.hh
  3 + * Author: mlenart
  4 + *
  5 + * Created on October 17, 2013, 2:00 PM
  6 + */
  7 +
  8 +#ifndef FSA_HPP
  9 +#define FSA_HPP
  10 +
  11 +//#include <iostream>
  12 +#include <cstring>
  13 +#include <typeinfo>
  14 +#include <cassert>
  15 +
  16 +template <class T> class State;
  17 +template <class T> class FSA;
  18 +template <class T> class Deserializer;
  19 +
/**
 * Interface of objects that read a value of type T from raw bytes.
 */
template <class T>
class Deserializer {
public:

    /**
     * Deserialize object from ptr.
     * Returns number of bytes read or -1 on error.
     */
    virtual int deserialize(const unsigned char* ptr, T& object) const = 0;

    /**
     * Virtual destructor: implementations are used through this interface,
     * so deleting one through a Deserializer pointer must be well-defined.
     */
    virtual ~Deserializer() {
    }
};
  30 +
  31 +class StringDeserializer : public Deserializer<char*> {
  32 +public:
  33 +
  34 + StringDeserializer() {
  35 + }
  36 +
  37 + /**
  38 + * Deserialize object from ptr.
  39 + * Returns number of bytes read or -1 on error.
  40 + */
  41 + int deserialize(const unsigned char* ptr, char*& text) const {
  42 + text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
  43 + return strlen(text) + 1;
  44 + }
  45 +};
  46 +
  47 +/**
  48 + * Finite state automaton.
  49 + */
  50 +template <class T>
  51 +class FSA {
  52 +public:
  53 +
  54 + /**
  55 + * Get this automaton's initial state.
  56 + */
  57 + State<T> getInitialState() const {
  58 + return State<T>(*this);
  59 + }
  60 +
  61 + bool tryToRecognize(const char* input, T& value) const {
  62 + State<T> currState = this->getInitialState();
  63 + int i = 0;
  64 + while (!currState.isSink() && input[i] != '\0') {
  65 + currState.proceedToNext(input[i]);
  66 + i++;
  67 + }
  68 + if (currState.isAccepting()) {
  69 + value = currState.getValue();
  70 + return true;
  71 + }
  72 + else {
  73 + return false;
  74 + }
  75 + }
  76 +
  77 + virtual ~FSA() {
  78 + }
  79 +protected:
  80 +
  81 + FSA(const unsigned char* ptr, const Deserializer<T>& deserializer)
  82 + : startPtr(ptr), deserializer(deserializer) {
  83 + }
  84 + /**
  85 + * Proceed to next state
  86 + *
  87 + * @param fromPointer - wskaźnik
  88 + * @param c - char for the transition.
  89 + * @return next state
  90 + */
  91 + virtual void proceedToNext(const char c, State<T>& state) const = 0;
  92 + const unsigned char* startPtr;
  93 + const Deserializer<T>& deserializer;
  94 + friend class State<T>;
  95 +private:
  96 + // FSA();
  97 +};
  98 +
/**
 * FSA subclass that supplies the proceedToNext() transition step.
 * Its member functions are only declared here.
 */
template <class T>
class SimpleFSA : public FSA<T> {
public:
    SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
    virtual ~SimpleFSA();
protected:
    void proceedToNext(const char c, State<T>& state) const;
private:

};
  109 +
  110 +#include "_simple_fsa_impl.hpp"
  111 +
  112 +/**
  113 + * A state in an FSA.
  114 + */
  115 +template <class T>
  116 +class State {
  117 +public:
  118 +
  119 + /**
  120 + * Is this a "sink" state - non-accepting state without outgoing transitions
  121 + */
  122 + bool isSink() const;
  123 +
  124 + /**
  125 + * Is this an accepting state
  126 + */
  127 + bool isAccepting() const;
  128 +
  129 + /**
  130 + * Get next state proceeding a transition for given character.
  131 + */
  132 + void proceedToNext(const char c);
  133 +
  134 + /**
  135 + * Get value of this state.
  136 + * Makes sense only for accepting states.
  137 + * For non-accepting states is throws an exception.
  138 + */
  139 + T getValue() const;
  140 +
  141 + /**
  142 + * Get the size (in bytes) of this state's value.
  143 + * Makes sense only for accepting states.
  144 + * For non-accepting states is throws an exception.
  145 + */
  146 + unsigned int getValueSize() const;
  147 +
  148 + unsigned int getOffset() const;
  149 +
  150 + void setNext(const unsigned int offset);
  151 + void setNext(const unsigned int offset, const T& value, const unsigned int valueSize);
  152 + void setNextAsSink();
  153 +
  154 + State(const FSA<T>& fsa);
  155 +
  156 + virtual ~State();
  157 +private:
  158 + const FSA<T>& fsa;
  159 + unsigned int offset;
  160 + bool accepting;
  161 + bool sink;
  162 + T value;
  163 + int valueSize;
  164 +};
  165 +
  166 +#include "_state_impl.hpp"
  167 +
  168 +#endif /* FSA_HPP */
  169 +
  170 +
... ...
fsa/test_dict.cpp 0 → 100644
  1 +/*
  2 + * File: test.cpp
  3 + * Author: lennyn
  4 + *
  5 + * Created on October 22, 2013, 2:11 PM
  6 + */
  7 +
  8 +#include <cstdlib>
  9 +#include <iostream>
  10 +#include <fstream>
  11 +#include <string>
  12 +#include <sstream>
  13 +#include <algorithm>
  14 +#include <functional>
  15 +#include <cctype>
  16 +#include <locale>
  17 +#include "fsa.hpp"
  18 +
  19 +using namespace std;
  20 +
/**
 * Terminates the program with exit status 1, after printing `msg` to stderr,
 * when `cond` is false. Does nothing when `cond` is true.
 */
void validate(const bool cond, const std::string& msg) {
    if (cond) {
        return;
    }
    std::cerr << msg << std::endl;
    std::exit(1);
}
  27 +
/**
 * Reads the whole file `fname` into a newly allocated buffer.
 *
 * @param fname path of the file to read (opened in binary mode)
 * @return buffer holding the file's bytes; the caller owns it and must
 *         release it with delete[]
 * @throws std::ios_base::failure when the file cannot be opened or read
 */
unsigned char* readFile(const char* fname) {
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    // Open positioned at the end so that tellg() yields the file size.
    ifs.open(fname, std::ios::in | std::ios::binary | std::ios::ate);

    // Keep the size in the stream's own offset type: an int would truncate
    // the size of large files.
    const std::streamoff size = ifs.tellg();
    if (size < 0) {
        throw std::ios_base::failure("cannot determine file size");
    }

    unsigned char* memblock = new unsigned char [static_cast<std::size_t> (size)];
    try {
        ifs.seekg(0, std::ios::beg);
        ifs.read(reinterpret_cast<char*> (memblock), static_cast<std::streamsize> (size));
        ifs.close();
    } catch (...) {
        // Do not leak the buffer when reading fails.
        delete[] memblock;
        throw;
    }
    return memblock;
}
  44 +
/**
 * Splits `s` on `delim` and appends the pieces to `elems`.
 * Returns a reference to `elems`.
 */
std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
    std::stringstream stream(s);
    for (std::string piece; std::getline(stream, piece, delim);) {
        elems.push_back(piece);
    }
    return elems;
}
  53 +
  54 +
  55 +std::vector<std::string> split(const std::string &s, char delim) {
  56 + std::vector<std::string> elems;
  57 + split(s, delim, elems);
  58 + return elems;
  59 +}
  60 +
/**
 * Removes trailing whitespace from `s` in place.
 *
 * @param s string to trim
 * @return reference to `s`
 */
static inline std::string &rtrim(std::string &s) {
    std::string::size_type keep = s.size();
    // isspace() is only defined for values representable as unsigned char
    // (or EOF); a plain char holding a non-ASCII byte may be negative.
    while (keep > 0 && std::isspace(static_cast<unsigned char> (s[keep - 1]))) {
        --keep;
    }
    s.erase(keep);
    return s;
}
  65 +
  66 +void testFSA(const FSA<char*>& fsa, const char* fname) {
  67 + ifstream ifs;
  68 +// ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  69 + ifs.open(fname, ios::binary);
  70 + string line;
  71 + while (getline(ifs, line)) {
  72 + vector<string> split1(split(line, '\t'));
  73 + string key = split1[0];
  74 + key = "bijekcją";
  75 + string value = split1[1];
  76 +
  77 + for (unsigned int i = 0; i < key.length(); i++) {
  78 + cout << (int) key[i] << " ";
  79 + }
  80 + cout << endl;
  81 +
  82 + char* value2;
  83 + if (fsa.tryToRecognize(key.c_str(), value2)) {
  84 + if (string(value) != string(value2)) {
  85 + cout << "BAD INTERP " << key << " " << value << " != " << value2 << endl;
  86 + }
  87 + else {
  88 + cout << "OK! " << key << " " << value << endl;
  89 + }
  90 + }
  91 + else {
  92 + cout << "MISS " << key << " " << value << " not recognized" << endl;
  93 + }
  94 + }
  95 + cout << ifs.good() << endl;
  96 + cout << ifs.fail() << endl;
  97 + cout << ifs.eof() << endl;
  98 + cout << "done" << endl;
  99 +}
  100 +
  101 +/*
  102 + *
  103 + */
  104 +int main(int argc, char** argv) {
  105 + validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
  106 + const unsigned char* fsaData = readFile(argv[1]);
  107 + StringDeserializer deserializer;
  108 + SimpleFSA<char*> fsa(fsaData, deserializer);
  109 + testFSA(fsa, argv[2]);
  110 + cout << argc << endl;
  111 + return 0;
  112 +}
  113 +
... ...
fsabuilder/.settings/org.eclipse.core.resources.prefs 0 → 100644
  1 +eclipse.preferences.version=1
  2 +encoding//fsa/buildfsa.py=utf-8
  3 +encoding//fsa/test/testConstruction.py=utf-8
... ...
fsabuilder/fsa/__init__.py 0 → 100644
fsabuilder/fsa/buildfsa.py 0 → 100644
  1 +# -*- coding:utf-8 -*-
  2 +'''
  3 +Created on 21 paź 2013
  4 +
  5 +@author: mlenart
  6 +'''
  7 +
  8 +import sys
  9 +import logging
  10 +import codecs
  11 +import encode
  12 +import convertinput
  13 +from fsa import FSA
  14 +from serializer import SimpleSerializerWithStringValues
  15 +from visualizer import Visualizer
  16 +from optparse import OptionParser
  17 +
  18 +logging.basicConfig(level=logging.DEBUG)
  19 +
class OutputFormat():
    '''Names of the output formats accepted by the --output-format option.'''
    BINARY = 'BINARY'
    CPP = 'CPP'
  23 +
class InputFormat():
    '''Names of the input formats accepted by the --input-format option.'''
    ENCODED = 'ENCODED'
    POLIMORF = 'POLIMORF'
  27 +
def parseOptions():
    """
    Parses and validates commandline args.

    All of --input-file, --output-file, --input-format and --output-format are
    required. The formats are matched case-insensitively and returned
    upper-cased, so callers can compare them directly with the InputFormat and
    OutputFormat constants.

    Prints usage or an error message and exits with status 1 on invalid input.
    Returns the parsed options object.
    """
    parser = OptionParser()
    parser.add_option('-i', '--input-file',
                      dest='inputFile',
                      metavar='FILE',
                      help='path to input file')
    parser.add_option('-o', '--output-file',
                      dest='outputFile',
                      metavar='FILE',
                      help='path to output file')
    parser.add_option('--input-format',
                      dest='inputFormat',
                      help='input format - ENCODED or POLIMORF')
    parser.add_option('--output-format',
                      dest='outputFormat',
                      help='output format - BINARY or CPP')
    parser.add_option('--visualize',
                      dest='visualize',
                      action='store_true',
                      default=False,
                      help='visualize result')

    opts, args = parser.parse_args()

    if None in [opts.inputFile, opts.outputFile, opts.outputFormat, opts.inputFormat]:
        parser.print_help()
        sys.exit(1)

    # Normalize once, so that the validation below and later comparisons agree.
    opts.outputFormat = opts.outputFormat.upper()
    opts.inputFormat = opts.inputFormat.upper()

    outputFormats = [OutputFormat.BINARY, OutputFormat.CPP]
    if opts.outputFormat not in outputFormats:
        sys.stderr.write('output format must be one of (' + str(outputFormats) + ')\n')
        sys.exit(1)
    inputFormats = [InputFormat.ENCODED, InputFormat.POLIMORF]
    if opts.inputFormat not in inputFormats:
        sys.stderr.write('input format must be one of (' + str(inputFormats) + ')\n')
        sys.exit(1)
    return opts
  65 +
def readEncodedInput(inputFile):
    '''
    Yields (word, interpretations) pairs read from the UTF-8 file inputFile.

    Every line must consist of exactly two whitespace-separated fields:
    the word and its interpretations joined with "|".
    '''
    with codecs.open(inputFile, 'r', 'utf8') as encodedFile:
        for rawLine in encodedFile.readlines():
            fields = rawLine.strip().split()
            word, joinedInterps = fields
            yield word, joinedInterps.split(u'|')
  71 +
def readPolimorfInput(inputFile, encoder):
    '''
    Yields the entries produced by convertinput.convertPolimorf for the lines
    of the UTF-8 file inputFile, sorted by the encoder's sort key of each word.
    '''
    def sortKey(entry):
        word, _interp = entry
        return encoder.word2SortKey(word)

    with codecs.open(inputFile, 'r', 'utf8') as polimorfFile:
        converted = convertinput.convertPolimorf(polimorfFile.readlines(), sortKey)
        for entry in converted:
            yield entry
  76 +
if __name__ == '__main__':
    # Build an automaton from the input file and write it in the requested format.
    opts = parseOptions()
    # Formats are accepted case-insensitively on the command line, so they
    # must be normalized before being compared with the format constants.
    inputFormat = opts.inputFormat.upper()
    outputFormat = opts.outputFormat.upper()

    encoder = encode.Encoder()
    fsa = FSA(encoder)
    serializer = SimpleSerializerWithStringValues()

    if inputFormat == InputFormat.ENCODED:
        inputData = readEncodedInput(opts.inputFile)
    else:
        inputData = readPolimorfInput(opts.inputFile, encoder)

    logging.info('feeding FSA with data ...')
    fsa.feed(inputData)
    logging.info('states num: '+str(fsa.getStatesNum()))
    if outputFormat == OutputFormat.CPP:
        serializer.serialize2CppFile(fsa, opts.outputFile)
    else:
        serializer.serialize2BinaryFile(fsa, opts.outputFile)
    if opts.visualize:
        Visualizer().visualize(fsa)
... ...
fsabuilder/fsa/convertinput.py 0 → 100644
  1 +'''
  2 +Created on Oct 23, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +import sys
  7 +import fileinput
  8 +import logging
  9 +from encode import Encoder
  10 +
  11 +def _encodeInterp(orth, base, tag, name):
  12 + removePrefix = 0
  13 + root = u''
  14 + for o, b in zip(orth, base):
  15 + if o == b:
  16 + root += o
  17 + else:
  18 + break
  19 + removeSuffixNum = len(orth) - len(root)
  20 + addSuffix = base[len(root):]
  21 + return u'+'.join([
  22 + chr(ord('A')+removePrefix) + chr(ord('A')+removeSuffixNum) + addSuffix,
  23 + tag,
  24 + name])
  25 +
def _parsePolimorf(inputLines):
    '''
    Yields (orth, encodedInterpretation) for every non-empty input line.
    A line must hold exactly four tab-separated fields: orth, base, tag, name.
    '''
    for rawLine in inputLines:
        record = rawLine.strip(u'\n')
        if not record:
            continue
        orth, base, tag, name = record.split(u'\t')
        yield (orth, _encodeInterp(orth, base, tag, name))
  33 +
  34 +def _sortAndMergeParsedInput(inputData, key=lambda k: k):
  35 + logging.info('sorting input...')
  36 + entries = list(inputData)
  37 + entries.sort(key=key)
  38 + logging.info('done sorting')
  39 + prevOrth = None
  40 + prevInterps = None
  41 + for orth, interp in entries:
  42 + if prevOrth and prevOrth == orth:
  43 + prevInterps.append(interp)
  44 + else:
  45 + if prevOrth:
  46 + yield (prevOrth, sorted(set(prevInterps)))
  47 + prevOrth = orth
  48 + prevInterps = [interp]
  49 +
def convertPolimorf(inputLines, sortKey=lambda k: k):
    '''
    Parses the given lines, then sorts and merges the parsed entries using
    sortKey. Yields (orth, interps) pairs.
    '''
    parsedEntries = _parsePolimorf(inputLines)
    for mergedEntry in _sortAndMergeParsedInput(parsedEntries, key=sortKey):
        orth, interps = mergedEntry
        yield orth, interps
  53 +
  54 +def _decodeInputLines(rawInputLines, encoding):
  55 + for line in rawInputLines:
  56 + yield line.decode(encoding)
  57 +
if __name__ == '__main__':
    # Convert the lines supplied through fileinput and print one
    # "<orth>\t<interp>|<interp>..." line per word, encoded as UTF-8.
    encoder = Encoder()
    decodedLines = _decodeInputLines(fileinput.input(), 'utf8')

    def _sortKey(entry):
        orth, _interp = entry
        return encoder.word2SortKey(orth)

    for orth, interps in convertPolimorf(decodedLines, _sortKey):
        outputLine = u'\t'.join([orth, u'|'.join(interps)])
        print(outputLine.encode('utf8'))
... ...
fsabuilder/fsa/encode.py 0 → 100644
  1 +'''
  2 +Created on Oct 23, 2013
  3 +
  4 +@author: lennyn
  5 +'''
  6 +
class Encoder(object):
    '''
    Converts words and interpretation lists to and from the byte
    representation used by the automaton, using a configurable text encoding.
    '''

    def __init__(self, encoding='utf8'):
        '''
        Remembers the text encoding used by all conversions.
        '''
        self.encoding = encoding

    def encodeWord(self, word):
        '''Returns the word as a bytearray in this encoder's encoding.'''
        return bytearray(word, self.encoding)

    def encodeData(self, data):
        '''Joins the items with "|", encodes them and appends a zero byte.'''
        joined = u'|'.join(data)
        encoded = bytearray(joined.encode(self.encoding))
        encoded.append(0)
        return encoded

    def decodeData(self, rawData):
        '''Decodes the raw data, drops the last character and splits on "|".'''
        text = unicode(str(rawData), self.encoding)
        withoutTerminator = text[:-1]
        return withoutTerminator.split(u'|')

    def word2SortKey(self, word):
        '''Returns the encoded form of the word, used as its sort key.'''
        return word.encode(self.encoding)
... ...
fsabuilder/fsa/fsa.py 0 → 100644
  1 +'''
  2 +Created on Oct 8, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +import state
  8 +import register
  9 +import logging
  10 +
class FSA(object):
    '''
    A finite state automaton built incrementally from (word, data) pairs
    that arrive sorted by their encoded words.
    '''

    def __init__(self, encoder):
        self.encodeWord = encoder.encodeWord
        self.encodeData = encoder.encodeData
        self.decodeData = encoder.decodeData
        self.encodedPrevWord = None
        self.initialState = state.State()
        self.register = register.Register()

    def tryToRecognize(self, word):
        '''
        Returns the decoded data stored for word, or None when the automaton
        does not recognize it.
        '''
        rawData = self.initialState.tryToRecognize(self.encodeWord(word))
        # A miss is reported as False (no path) or None (path without data);
        # neither may be handed to the decoder.
        if rawData is None or rawData is False:
            return None
        return self.decodeData(rawData)

    def feed(self, input):
        '''
        Adds all (word, data) pairs from input, which must be strictly
        ascending with respect to the encoded words. data is either a single
        string or a list of strings.
        '''
        lastWord = None
        for n, (word, data) in enumerate(input, start=1):
            assert data is not None
            if type(data) in [str, unicode]:
                data = [data]
            encodedWord = self.encodeWord(word)
            assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord
            self._addSorted(encodedWord, self.encodeData(data))
            self.encodedPrevWord = encodedWord
            assert self.tryToRecognize(word) == data
            if n % 10000 == 0:
                logging.info(word)
            lastWord = word

        # Register the path of the last word; nothing to do for empty input.
        if lastWord is not None:
            self.initialState = self._replaceOrRegister(self.initialState, self.encodeWord(lastWord))
        self.encodedPrevWord = None

    def getStatesNum(self):
        '''Returns the number of states held by the register.'''
        return self.register.getStatesNum()

    def _addSorted(self, encodedWord, data):
        '''Adds one encoded word with its encoded data.'''
        assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord
        q = self.initialState
        i = 0
        # Follow the existing path; stop before indexing past the word's end.
        while i < len(encodedWord) and q.hasNext(encodedWord[i]):
            q = q.getNext(encodedWord[i])
            i += 1
        # Register the part of the previous word's path that diverges here.
        if self.encodedPrevWord and i < len(self.encodedPrevWord):
            nextState = q.getNext(self.encodedPrevWord[i])
            q.setTransition(
                self.encodedPrevWord[i],
                self._replaceOrRegister(nextState, self.encodedPrevWord[i+1:]))

        # Create fresh states for the remaining characters of the word.
        while i < len(encodedWord):
            q.setTransition(encodedWord[i], state.State())
            q = q.getNext(encodedWord[i])
            i += 1

        assert q.encodedData is None
        q.encodedData = data

    def _replaceOrRegister(self, q, encodedWord):
        '''
        Processes the states along encodedWord starting at q, deepest first:
        each one is replaced by its registered equivalent when there is one
        and registered otherwise. Returns the state to use in place of q.
        '''
        if encodedWord:
            nextState = q.getNext(encodedWord[0])
            q.setTransition(
                encodedWord[0],
                self._replaceOrRegister(nextState, encodedWord[1:]))

        if self.register.containsEquivalentState(q):
            return self.register.getEquivalentState(q)
        else:
            self.register.addState(q)
            return q

    def calculateOffsets(self, sizeCounter):
        '''
        Assigns reverseOffset and offset to every state reachable from the
        initial state; sizeCounter(state) gives the size of a single state.
        '''
        currReverseOffset = 0
        for node in self.initialState.dfs(set()):
            currReverseOffset += sizeCounter(node)
            node.reverseOffset = currReverseOffset
        for node in self.initialState.dfs(set()):
            node.offset = currReverseOffset - node.reverseOffset
  90 +
  91 +
  92 +
0 93 \ No newline at end of file
... ...
fsabuilder/fsa/register.py 0 → 100644
  1 +'''
  2 +Created on Oct 8, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
class Register(object):
    '''
    States register: maps a state's register key to the state stored under it.
    '''

    def __init__(self):
        self._map = {}

    def addState(self, state):
        '''Stores the state under its register key, replacing any previous entry.'''
        registerKey = state.getRegisterKey()
        self._map[registerKey] = state

    def getEquivalentState(self, state):
        '''Returns the stored state with the same register key (KeyError if absent).'''
        registerKey = state.getRegisterKey()
        return self._map[registerKey]

    def containsEquivalentState(self, state):
        '''Tells whether a state with the same register key is stored.'''
        registerKey = state.getRegisterKey()
        return registerKey in self._map

    def getStatesNum(self):
        '''Returns the number of stored states.'''
        return len(self._map)
... ...
fsabuilder/fsa/serializer.py 0 → 100644
  1 +'''
  2 +Created on Oct 20, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
class Serializer(object):
    '''
    Base class of automaton serializers. Subclasses provide getStateSize()
    and state2bytearray(); this class lays the states out by offset and
    writes them out.
    '''

    def __init__(self):
        pass

    def _statesByOffset(self, fsa):
        '''Calculates the states' offsets and returns the states ordered by offset.'''
        fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
        return sorted(fsa.initialState.dfs(set()), key=lambda state: state.offset)

    def serialize2CppFile(self, fsa, fname):
        '''Writes the automaton to fname as a C++ unsigned char array definition.'''
        res = ['const unsigned char DEFAULT_FSA[] = {']
        for idx, state in enumerate(self._statesByOffset(fsa)):
            res.append('// state '+str(idx))
            partRes = []
            for byte in self.state2bytearray(state):
                partRes.append(hex(byte))
                partRes.append(',')
            res.append(' '.join(partRes))
        # The array definition must be terminated with a semicolon.
        res.append('};')
        with open(fname, 'w') as f:
            f.write('\n'.join(res))

    def serialize2BinaryFile(self, fsa, fname):
        '''Writes the automaton's raw bytes to fname.'''
        with open(fname, 'wb') as f:
            f.write(self.fsa2bytearray(fsa))

    def getStateSize(self, state):
        '''Returns the serialized size of the state in bytes.'''
        raise NotImplementedError('Not implemented')

    def fsa2bytearray(self, fsa):
        '''Returns the serialized automaton: all states' bytes in offset order.'''
        res = bytearray()
        for state in self._statesByOffset(fsa):
            res.extend(self.state2bytearray(state))
        return res

    def state2bytearray(self, state):
        '''Returns the serialized form of a single state.'''
        raise NotImplementedError('Not implemented')
  48 +
class SimpleSerializer(Serializer):
    '''
    Serializes a state as one header byte (accepting flag in the top bit,
    number of transitions in the lower 7 bits), followed by the state's data
    when it is accepting, followed by 4 bytes per transition (label byte and
    the target's offset as 3 little-endian bytes).
    '''

    ACCEPTING_FLAG = 128
    # The header keeps the transitions count in the 7 bits below the flag.
    MAX_TRANSITIONS_NUM = 127
    # Target offsets are stored in 3 bytes.
    MAX_OFFSET = 0xFFFFFF

    def getStateSize(self, state):
        return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state)

    def getDataSize(self, state):
        '''Returns the size in bytes of the data stored in the state.'''
        raise NotImplementedError('Not implemented')

    def state2bytearray(self, state):
        res = bytearray()
        res.extend(self._stateData2bytearray(state))
        res.extend(self._transitionsData2bytearray(state))
        return res

    def _stateData2bytearray(self, state):
        res = bytearray()
        transitionsNum = len(state.transitionsMap)
        # A larger count would spill into the accepting flag bit.
        assert transitionsNum <= SimpleSerializer.MAX_TRANSITIONS_NUM
        firstByte = transitionsNum
        if state.isAccepting():
            firstByte |= SimpleSerializer.ACCEPTING_FLAG
        assert firstByte < 256 and firstByte > 0
        res.append(firstByte)
        if state.isAccepting():
            res.extend(state.encodedData)
        return res

    def _transitionsData2bytearray(self, state):
        res = bytearray()

        def signedCharOrder(transition):
            label = transition[0]
            return label if (label >= 0 and label < 128) else label - 256

        # must sort that strange way because it must be sorted according to char, not unsigned char
        for byte, nextState in sorted(state.transitionsMap.items(), key=signedCharOrder):
            res.append(byte)
            offset = nextState.offset
            # An offset that does not fit in 3 bytes would be silently truncated.
            assert 0 <= offset <= SimpleSerializer.MAX_OFFSET
            res.append(offset & 0x0000FF)
            res.append((offset & 0x00FF00) >> 8)
            res.append((offset & 0xFF0000) >> 16)
        return res
  87 +
class SimpleSerializerWithStringValues(SimpleSerializer):
    '''SimpleSerializer for states whose data is stored as a bytearray.'''

    def getDataSize(self, state):
        '''Returns the length of an accepting state's data, 0 for other states.'''
        accepting = state.isAccepting()
        assert (not accepting) or type(state.encodedData) == bytearray
        if accepting:
            return len(state.encodedData)
        return 0
  93 +
0 94 \ No newline at end of file
... ...
fsabuilder/fsa/state.py 0 → 100644
  1 +'''
  2 +Created on Oct 8, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
class State(object):
    '''
    A state in an automaton: a map of outgoing transitions (byte -> State)
    plus optional encoded data; a state holding data is accepting.
    '''

    def __init__(self):
        self.transitionsMap = {}
        self.encodedData = None
        self.reverseOffset = None
        self.offset = None

    def setTransition(self, byte, nextState):
        '''Sets (or replaces) the transition for the given byte.'''
        self.transitionsMap[byte] = nextState

    def hasNext(self, byte):
        '''Tells whether there is a transition for the given byte.'''
        return byte in self.transitionsMap

    def getNext(self, byte):
        '''Returns the target of the byte's transition, or None when there is none.'''
        return self.transitionsMap.get(byte, None)

    def getRegisterKey(self):
        '''
        Returns a hashable key built from the transitions and the data.
        '''
        # Compare against None explicitly: empty data still marks an accepting
        # state and must not produce the key of a state without data.
        data = tuple(self.encodedData) if self.encodedData is not None else None
        return (frozenset(self.transitionsMap.items()), data)

    def isAccepting(self):
        '''Tells whether this state holds data.'''
        return self.encodedData is not None

    def tryToRecognize(self, word):
        '''
        Follows the word from this state. Returns the data of the state
        reached (None when it holds none), or False when a transition is missing.
        '''
        if word:
            nextState = self.getNext(word[0])
            if nextState is not None:
                return nextState.tryToRecognize(word[1:])
            else:
                return False
        else:
            return self.encodedData

    def dfs(self, alreadyVisited):
        '''
        Yields the not yet visited states reachable from this one, targets
        before their source (post-order), following transitions in byte order.
        alreadyVisited is updated along the way.
        '''
        if not self in alreadyVisited:
            for _, state in sorted(self.transitionsMap.items()):
                for state1 in state.dfs(alreadyVisited):
                    yield state1
            alreadyVisited.add(self)
            yield self
... ...
fsabuilder/fsa/test/__init__.py 0 → 100644
fsabuilder/fsa/test/testConstruction.py 0 → 100644
  1 +#-*- coding: utf-8 -*-
  2 +'''
  3 +Created on Oct 8, 2013
  4 +
  5 +@author: lennyn
  6 +'''
  7 +import unittest
  8 +from fsa import fsa, visualizer, encode
  9 +
class Test(unittest.TestCase):
    '''Tests of the automaton construction.'''

    def testSimpleConstruction(self):
        '''
        Feeds a small sorted word list into the automaton and checks that
        every word is recognized with the data it was added with.
        '''
        a = fsa.FSA(encode.Encoder())
        input = sorted([
                (u'bić', ''),
                (u'bij', ''),
                (u'biją', ''),
                (u'bijcie', ''),
                (u'bije', ''),
                (u'bijecie', ''),
                (u'bijemy', ''),
                (u'bijesz', ''),
                (u'biję', ''),
                (u'bijmy', ''),
                (u'bili', 'asd'),
                (u'biliby', ''),
                (u'bilibyście', ''),
                (u'bilibyśmy', ''),
                (u'biliście', 'asdfas'),
                (u'biliśmy', ''),
                (u'bił', 'wersadfas'),
                (u'biła', 'asdfasd'),
                (u'biłaby', 'asdfa'),
                (u'biłabym', ''),
                (u'biłabyś', 'asdfa'),
                (u'biłam', 'dfas'),
                (u'biłaś', 'asdfas'),
                (u'biłby', ''),
                (u'biłbym', 'asdfa'),
                (u'biłbyś', ''),
                (u'biłem', ''),
                (u'biłeś', 'sadfa'),
                (u'biły', ''),
                (u'biłyby', ''),
                (u'biłybyście', ''),
                (u'biłybyśmy', ''),
                (u'biłyście', ''),
                (u'biłyśmy', ''),
                ], key=lambda w: bytearray(w[0], 'utf8'))
        a.feed(input)
        self.assertTrue(a.getStatesNum() > 0)
        for w, res in input:
            recognized = a.tryToRecognize(w)
            if type(res) in [str, unicode]:
                recognized = recognized[0]
            self.assertEqual(recognized, res)
        # Offsets calculation must work on the finished automaton. The
        # automaton is deliberately not visualized here: showing a plot window
        # would block the test run until the window is closed.
        a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0))
  66 +
# Run the tests of this module when the file is executed directly.
if __name__ == "__main__":
    #import sys;sys.argv = ['', 'Test.testSimpleConstruction']
    unittest.main()
0 70 \ No newline at end of file
... ...
fsabuilder/fsa/visualizer.py 0 → 100644
  1 +'''
  2 +Created on Oct 20, 2013
  3 +
  4 +@author: mlenart
  5 +'''
  6 +
  7 +import networkx as nx
  8 +import matplotlib.pyplot as plt
  9 +
class Visualizer(object):
    '''Draws an automaton as a directed graph using networkx and matplotlib.'''

    def __init__(self):
        pass

    def visualize(self, fsa):
        '''
        Draws all states reachable from the automaton's initial state together
        with their transitions and shows the plot.

        Nodes are labelled with the state's offset, preceded by the state's
        data for accepting states. Edges are labelled with the transition's
        character, or '%' for byte values above 127.
        '''
        G = nx.DiGraph()
        allStates = list(reversed(list(fsa.initialState.dfs(set()))))
        # Look every state's index up once instead of scanning the list with
        # allStates.index() for each edge and node.
        indexOf = dict((state, idx) for idx, state in enumerate(allStates))
        edgeLabelsMap = {}
        nodeLabelsMap = {}
        for idx, state in enumerate(allStates):
            G.add_node(idx, offset=state.offset)
            for c, targetState in state.transitionsMap.items():
                targetIdx = indexOf[targetState]
                G.add_edge(idx, targetIdx)
                label = chr(c) if c <= 127 else '%'
                edgeLabelsMap[(idx, targetIdx)] = label
            nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')'
        pos=nx.shell_layout(G)
        nx.draw_networkx_nodes(G,
                               pos,
                               nodelist=[idx for idx, s in enumerate(allStates) if not s.isAccepting()],
                               node_shape='s',
                               node_color='w')
        nx.draw_networkx_nodes(G,
                               pos,
                               nodelist=[idx for idx, s in enumerate(allStates) if s.isAccepting()],
                               node_shape='s')
        nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys())
        nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap)
        nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap)
        plt.axis('off')
        plt.draw()
        plt.show()
        print('done')
0 46 \ No newline at end of file
... ...
fsabuilder/input/__init__.py 0 → 100644
morfeusz/CMakeLists.txt 0 → 100644
  1 +# Make sure the compiler can find the header files in the fsa directory.
  2 +#include_directories (${Morfeusz_SOURCE_DIR}/FSALibrary)
  3 +
  4 +# Make sure the linker can find the Hello library once it is built.
  5 +#link_directories (${Morfeusz_BINARY_DIR}/Hello)
  6 +include_directories (${Morfeusz_SOURCE_DIR}/fsa)
  7 +add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
  8 +add_executable (morfeusz2_analyze main.cpp)
  9 +
  10 +# Link the morfeusz2_analyze executable to the morfeusz2 library.
  11 +target_link_libraries (morfeusz2_analyze morfeusz2)
  12 +set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" )
... ...
morfeusz/default_fsa.hpp 0 → 100644
  1 +/*
  2 + * File: default_fsa.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 21 październik 2013, 17:50
  6 + */
  7 +
  8 +#ifndef DEFAULT_FSA_HPP
  9 +#define DEFAULT_FSA_HPP
  10 +
  11 +extern const unsigned char DEFAULT_FSA[];
  12 +
  13 +#endif /* DEFAULT_FSA_HPP */
  14 +
... ...
morfeusz/main.cpp 0 → 100644
  1 +/*
  2 + * File: main.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on October 8, 2013, 12:41 PM
  6 + */
  7 +
  8 +#include <cstdlib>
  9 +#include <iostream>
  10 +#include "fsa.hpp"
  11 +#include "default_fsa.hpp"
  12 +
  13 +using namespace std;
  14 +
  15 +/*
  16 + *
  17 + */
  18 +int main(int argc, char** argv) {
  19 + unsigned char dupa[3] = {0376 | 1, 0111, 0234, };
  20 + char x = 255;
  21 + cout << *reinterpret_cast<int*>(&x) << endl;
  22 + return 0;
  23 +}
  24 +
  25 +
... ...
morfeusz/morfeusz.cpp 0 → 100644
morfeusz/morfeusz.hpp 0 → 100644
nbproject/configurations.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<configurationDescriptor version="90">
  3 + <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
  4 + <df root="fsa" name="0">
  5 + <in>_state_impl.hpp</in>
  6 + </df>
  7 + <df root="morfeusz" name="1">
  8 + <in>default_fsa.hpp</in>
  9 + <in>main.cpp</in>
  10 + <in>morfeusz.cpp</in>
  11 + </df>
  12 + <logicalFolder name="ExternalFiles"
  13 + displayName="Important Files"
  14 + projectFiles="false"
  15 + kind="IMPORTANT_FILES_FOLDER">
  16 + <itemPath>CMakeLists.txt</itemPath>
  17 + <itemPath>build/Makefile</itemPath>
  18 + </logicalFolder>
  19 + </logicalFolder>
  20 + <sourceFolderFilter>^(nbproject)$</sourceFolderFilter>
  21 + <sourceRootList>
  22 + <Elem>fsa</Elem>
  23 + <Elem>morfeusz</Elem>
  24 + </sourceRootList>
  25 + <projectmakefile>build/Makefile</projectmakefile>
  26 + <confs>
  27 + <conf name="Default" type="0">
  28 + <toolsSet>
  29 + <compilerSet>default</compilerSet>
  30 + <dependencyChecking>false</dependencyChecking>
  31 + <rebuildPropChanged>false</rebuildPropChanged>
  32 + </toolsSet>
  33 + <codeAssistance>
  34 + </codeAssistance>
  35 + <makefileType>
  36 + <makeTool>
  37 + <buildCommandWorkingDir>build</buildCommandWorkingDir>
  38 + <buildCommand>${MAKE} -f Makefile</buildCommand>
  39 + <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
  40 + <executablePath></executablePath>
  41 + <ccTool>
  42 + <incDir>
  43 + <pElem>fsa</pElem>
  44 + <pElem>build/morfeusz</pElem>
  45 + </incDir>
  46 + </ccTool>
  47 + </makeTool>
  48 + </makefileType>
  49 + <item path="fsa/_state_impl.hpp" ex="false" tool="3" flavor2="0">
  50 + </item>
  51 + <item path="morfeusz/default_fsa.hpp" ex="false" tool="3" flavor2="0">
  52 + </item>
  53 + <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
  54 + <ccTool>
  55 + </ccTool>
  56 + </item>
  57 + <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="0">
  58 + </item>
  59 + </conf>
  60 + </confs>
  61 +</configurationDescriptor>
... ...
nbproject/project.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://www.netbeans.org/ns/project/1">
  3 + <type>org.netbeans.modules.cnd.makeproject</type>
  4 + <configuration>
  5 + <data xmlns="http://www.netbeans.org/ns/make-project/1">
  6 + <name>morfeusz</name>
  7 + <c-extensions/>
  8 + <cpp-extensions>cpp</cpp-extensions>
  9 + <header-extensions>hpp</header-extensions>
  10 + <sourceEncoding>UTF-8</sourceEncoding>
  11 + <make-dep-projects/>
  12 + <sourceRootList>
  13 + <sourceRootElem>fsa</sourceRootElem>
  14 + <sourceRootElem>morfeusz</sourceRootElem>
  15 + </sourceRootList>
  16 + <confList>
  17 + <confElem>
  18 + <name>Default</name>
  19 + <type>0</type>
  20 + </confElem>
  21 + </confList>
  22 + <formatting>
  23 + <project-formatting-style>false</project-formatting-style>
  24 + </formatting>
  25 + </data>
  26 + </configuration>
  27 +</project>
... ...