- pierwszy commit (b314fe9d) | Commits | SGJP / Morfeusz

Commit b314fe9d9e17aa2ed2c4b24739ff8156717e7948

Authored by Michał Lenart 12 years ago

- pierwszy commit

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@2 ff4e3ee1-f430-4e82-ade0-24591c43f1fd

Inline Side-by-side

Showing 26 changed files with 1195 additions and 0 deletions

CMakeLists.txt 0 → 100644

View file @b314fe9

	1	+# The name of our project is "Morfeusz". CMakeLists files in this project can
	2	+# refer to the root source directory of the project as ${Morfeusz_SOURCE_DIR} and
	3	+# to the root binary directory of the project as ${Morfeusz_BINARY_DIR}.
	4	+cmake_minimum_required (VERSION 2.6)
	5	+project (Morfeusz)
	6	+
	7	+# Recurse into the "fsa" and "morfeusz" subdirectories. This does not actually
	8	+# cause another cmake executable to run. The same process will walk through
	9	+# the project's entire directory structure.
	10	+add_subdirectory (fsa)
	11	+add_subdirectory (morfeusz)
	12	+
...	...

fsa/CMakeLists.txt 0 → 100644

View file @b314fe9

	1	+
	2	+add_executable (test_dict test_dict.cpp)
	3	+set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" )
...	...

fsa/_simple_fsa_impl.hpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: _simple_fsa_impl.hpp
	3	+ * Author: mlenart
	4	+ *
	5	+ * Created on October 20, 2013, 12:25 PM
	6	+ */
	7	+
	8	+#ifndef _SIMPLE_FSA_IMPL_HPP
	9	+#define _SIMPLE_FSA_IMPL_HPP
	10	+
	11	+#include <algorithm>
	12	+#include <utility>
	13	+#include <iostream>
	14	+#include "fsa.hpp"
	15	+
	16	+using namespace std;
	17	+
	18	+#pragma pack(push) /* push current alignment to stack */
	19	+#pragma pack(1) /* set alignment to 1 byte boundary */
	20	+
	21	+struct StateData {
	22	+ unsigned transitionsNum : 7;
	23	+ unsigned accepting : 1;
	24	+};
	25	+
	26	+struct TransitionData {
	27	+ char label;
	28	+ unsigned targetOffset : 24;
	29	+};
	30	+
	31	+#pragma pack(pop) /* restore original alignment from stack */
	32	+
	33	+static bool compareTransitions(TransitionData t1, TransitionData t2) {
	34	+ return t1.label < t2.label;
	35	+}
	36	+
	37	+template <class T>
	38	+SimpleFSA<T>::SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer)
	39	+: FSA<T>(ptr, deserializer) {
	40	+}
	41	+
	42	+template <class T>
	43	+SimpleFSA<T>::~SimpleFSA() {
	44	+
	45	+}
	46	+
	47	+static void debugState(const StateData* stateData) {
	48	+ cerr << "STATE" << endl;
	49	+ cerr << stateData->transitionsNum << " " << stateData->accepting << endl;
	50	+}
	51	+
	52	+static void debugTransitions(const TransitionData* transitionsTable, const TransitionData* transitionsEnd) {
	53	+ int offset = 0;
	54	+ cerr << "TRANSITIONS" << endl;
	55	+ while (transitionsTable + offset < transitionsEnd) {
	56	+ const TransitionData td = *(transitionsTable + offset);
	57	+ if ((td.label <= 'z' && 'a' <= td.label))
	58	+ cerr << td.label << " " << td.targetOffset << endl;
	59	+ else {
	60	+ cerr << ((int) td.label) << " " << td.targetOffset << endl;
	61	+ }
	62	+ offset++;
	63	+ }
	64	+}
	65	+
	66	+template <class T>
	67	+void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
	68	+ if (c<= 'z' && 'a' <= c)
	69	+ cerr << "NEXT " << c << " from " << state.getOffset() << endl;
	70	+ else
	71	+ cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
	72	+ const unsigned char* fromPointer = this->startPtr + state.getOffset();
	73	+ int transitionsTableOffset = sizeof (StateData);
	74	+ if (state.isAccepting()) {
	75	+ transitionsTableOffset += state.getValueSize();
	76	+ cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
	77	+ }
	78	+ const StateData* stateData = reinterpret_cast<const StateData*> (fromPointer);
	79	+ const TransitionData* transitionsTable = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset);
	80	+ const TransitionData* transitionsEnd = transitionsTable + stateData->transitionsNum;
	81	+ debugState(stateData);
	82	+ debugTransitions(transitionsTable, transitionsEnd);
	83	+ const TransitionData* foundTransition = std::lower_bound(
	84	+ transitionsTable, transitionsEnd,
	85	+ TransitionData{c, 0},
	86	+ compareTransitions);
	87	+ if (foundTransition == transitionsEnd \|\| foundTransition->label != c) {
	88	+ cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl;
	89	+ state.setNextAsSink();
	90	+ }
	91	+ else {
	92	+// cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl;
	93	+ const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset;
	94	+ const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer);
	95	+ if (nextStateData->accepting) {
	96	+ cerr << "ACCEPTING" << endl;
	97	+ T object;
	98	+ int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
	99	+ state.setNext(foundTransition->targetOffset, object, size);
	100	+ }
	101	+ else {
	102	+ state.setNext(foundTransition->targetOffset);
	103	+ }
	104	+ }
	105	+}
	106	+
	107	+#endif /* _SIMPLE_FSA_IMPL_HPP */
...	...

fsa/_state_impl.hpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: _state_impl.hpp
	3	+ * Author: mlenart
	4	+ *
	5	+ * Created on 21 październik 2013, 15:20
	6	+ */
	7	+
	8	+#ifndef _STATE_IMPL_HPP
	9	+#define _STATE_IMPL_HPP
	10	+
	11	+#include <typeinfo>
	12	+#include "fsa.hpp"
	13	+
	14	+using namespace std;
	15	+
	16	+template <class T>
	17	+State<T>::State(const FSA<T>& fsa)
	18	+: fsa(fsa), offset(0), accepting(false), sink(false), value(), valueSize(0) {
	19	+}
	20	+
	21	+template <class T>
	22	+bool State<T>::isSink() const {
	23	+ return this->sink;
	24	+}
	25	+
	26	+template <class T>
	27	+bool State<T>::isAccepting() const {
	28	+ return this->accepting;
	29	+}
	30	+
	31	+template <class T>
	32	+void State<T>::proceedToNext(const char c) {
	33	+ if (this->isSink()) {
	34	+ return;
	35	+ }
	36	+ else {
	37	+ this->fsa.proceedToNext(c, *this);
	38	+ }
	39	+}
	40	+
	41	+template <class T>
	42	+unsigned int State<T>::getOffset() const {
	43	+ assert(!this->isSink());
	44	+ return this->offset;
	45	+}
	46	+
	47	+template <class T>
	48	+T State<T>::getValue() const {
	49	+ assert(this->isAccepting());
	50	+ return this->value;
	51	+}
	52	+
	53	+template <class T>
	54	+unsigned int State<T>::getValueSize() const {
	55	+ assert(this->isAccepting());
	56	+ return this->valueSize;
	57	+}
	58	+
	59	+template <class T>
	60	+State<T>::~State() {
	61	+
	62	+}
	63	+
	64	+template <class T>
	65	+void State<T>::setNext(const unsigned int offset) {
	66	+ assert(!this->isSink());
	67	+ this->offset = offset;
	68	+ this->accepting = false;
	69	+}
	70	+
	71	+template <class T>
	72	+void State<T>::setNext(const unsigned int offset, const T& value, const unsigned int valueSize) {
	73	+ assert(!this->isSink());
	74	+ this->offset = offset;
	75	+ this->accepting = true;
	76	+ this->value = value;
	77	+ this->valueSize = valueSize;
	78	+}
	79	+
	80	+template <class T>
	81	+void State<T>::setNextAsSink() {
	82	+ this->sink = true;
	83	+ this->accepting = false;
	84	+}
	85	+
	86	+#endif /* _STATE_IMPL_HPP */
	87	+
...	...

fsa/fsa.hpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: fsa.hh
	3	+ * Author: mlenart
	4	+ *
	5	+ * Created on October 17, 2013, 2:00 PM
	6	+ */
	7	+
	8	+#ifndef FSA_HPP
	9	+#define FSA_HPP
	10	+
	11	+//#include <iostream>
	12	+#include <cstring>
	13	+#include <typeinfo>
	14	+#include <cassert>
	15	+
	16	+template <class T> class State;
	17	+template <class T> class FSA;
	18	+template <class T> class Deserializer;
	19	+
	/**
	 * Interface for reading a value of type T out of a byte buffer.
	 */
	template <class T>
	class Deserializer {
	public:

	    /**
	     * Deserialize object from ptr.
	     * Returns number of bytes read or -1 on error.
	     */
	    virtual int deserialize(const unsigned char* ptr, T& object) const = 0;

	    /**
	     * Virtual, so that an implementation can safely be destroyed through
	     * a pointer or reference to this base class.
	     */
	    virtual ~Deserializer() {
	    }
	};

	/**
	 * Deserializer for NUL-terminated strings.
	 */
	class StringDeserializer : public Deserializer<char*> {
	public:

	    StringDeserializer() {
	    }

	    /**
	     * Point text at the NUL-terminated string that starts at ptr.
	     * No copy is made, so text aliases the buffer ptr points into.
	     * Returns the number of bytes occupied, including the terminating NUL.
	     */
	    int deserialize(const unsigned char* ptr, char*& text) const {
	        text = const_cast<char*> (reinterpret_cast<const char*> (ptr));
	        return static_cast<int> (std::strlen(text) + 1);
	    }
	};
	46	+
	47	+/**
	48	+ * Finite state automaton.
	49	+ */
	50	+template <class T>
	51	+class FSA {
	52	+public:
	53	+
	54	+ /**
	55	+ * Get this automaton's initial state.
	56	+ */
	57	+ State<T> getInitialState() const {
	58	+ return State<T>(*this);
	59	+ }
	60	+
	61	+ bool tryToRecognize(const char* input, T& value) const {
	62	+ State<T> currState = this->getInitialState();
	63	+ int i = 0;
	64	+ while (!currState.isSink() && input[i] != '\0') {
	65	+ currState.proceedToNext(input[i]);
	66	+ i++;
	67	+ }
	68	+ if (currState.isAccepting()) {
	69	+ value = currState.getValue();
	70	+ return true;
	71	+ }
	72	+ else {
	73	+ return false;
	74	+ }
	75	+ }
	76	+
	77	+ virtual ~FSA() {
	78	+ }
	79	+protected:
	80	+
	81	+ FSA(const unsigned char* ptr, const Deserializer<T>& deserializer)
	82	+ : startPtr(ptr), deserializer(deserializer) {
	83	+ }
	84	+ /**
	85	+ * Proceed to the next state.
	86	+ *
	87	+ * @param c - char for the transition.
	88	+ * @param state - state to advance; it is updated in place.
	89	+ * (There is no return value: the result is stored in state.)
	90	+ */
	91	+ virtual void proceedToNext(const char c, State<T>& state) const = 0;
	92	+ const unsigned char* startPtr;
	93	+ const Deserializer<T>& deserializer;
	94	+ friend class State<T>;
	95	+private:
	96	+ // FSA();
	97	+};
	98	+
	99	+template <class T>
	100	+class SimpleFSA : public FSA<T> {
	101	+public:
	102	+ SimpleFSA(const unsigned char* ptr, const Deserializer<T>& deserializer);
	103	+ virtual ~SimpleFSA();
	104	+protected:
	105	+ void proceedToNext(const char c, State<T>& state) const;
	106	+private:
	107	+
	108	+};
	109	+
	110	+#include "_simple_fsa_impl.hpp"
	111	+
	112	+/**
	113	+ * A state in an FSA.
	114	+ */
	115	+template <class T>
	116	+class State {
	117	+public:
	118	+
	119	+ /**
	120	+ * Is this a "sink" state - non-accepting state without outgoing transitions
	121	+ */
	122	+ bool isSink() const;
	123	+
	124	+ /**
	125	+ * Is this an accepting state
	126	+ */
	127	+ bool isAccepting() const;
	128	+
	129	+ /**
	130	+ * Get next state proceeding a transition for given character.
	131	+ */
	132	+ void proceedToNext(const char c);
	133	+
	134	+ /**
	135	+ * Get value of this state.
	136	+ * Makes sense only for accepting states.
	137	+ * For non-accepting states it fails an assertion (it does not throw).
	138	+ */
	139	+ T getValue() const;
	140	+
	141	+ /**
	142	+ * Get the size (in bytes) of this state's value.
	143	+ * Makes sense only for accepting states.
	144	+ * For non-accepting states it fails an assertion (it does not throw).
	145	+ */
	146	+ unsigned int getValueSize() const;
	147	+
	148	+ unsigned int getOffset() const;
	149	+
	150	+ void setNext(const unsigned int offset);
	151	+ void setNext(const unsigned int offset, const T& value, const unsigned int valueSize);
	152	+ void setNextAsSink();
	153	+
	154	+ State(const FSA<T>& fsa);
	155	+
	156	+ virtual ~State();
	157	+private:
	158	+ const FSA<T>& fsa;
	159	+ unsigned int offset;
	160	+ bool accepting;
	161	+ bool sink;
	162	+ T value;
	163	+ int valueSize;
	164	+};
	165	+
	166	+#include "_state_impl.hpp"
	167	+
	168	+#endif /* FSA_HPP */
	169	+
	170	+
...	...

fsa/test_dict.cpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: test.cpp
	3	+ * Author: lennyn
	4	+ *
	5	+ * Created on October 22, 2013, 2:11 PM
	6	+ */
	7	+
	8	+#include <cstdlib>
	9	+#include <iostream>
	10	+#include <fstream>
	11	+#include <string>
	12	+#include <sstream>
	13	+#include <algorithm>
	14	+#include <functional>
	15	+#include <cctype>
	16	+#include <locale>
	17	+#include "fsa.hpp"
	18	+
	19	+using namespace std;
	20	+
	21	+void validate(const bool cond, const string& msg) {
	22	+ if (!cond) {
	23	+ cerr << msg << endl;
	24	+ exit(1);
	25	+ }
	26	+}
	27	+
	28	+unsigned char* readFile(const char* fname) {
	29	+ ifstream ifs;
	30	+ ifs.exceptions(std::ifstream::failbit \| std::ifstream::badbit);
	31	+ ifs.open(fname, ios::in \| ios::binary \| ios::ate);
	32	+ // if (ifs.is_open()) {
	33	+ int size = ifs.tellg();
	34	+ unsigned char* memblock = new unsigned char [size];
	35	+ ifs.seekg(0, ios::beg);
	36	+ ifs.read(reinterpret_cast<char*> (memblock), size);
	37	+ ifs.close();
	38	+ return memblock;
	39	+ // }
	40	+ // else {
	41	+ // cerr << "Unable to open file " << fname << endl;
	42	+ // }
	43	+}
	44	+
	45	+std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
	46	+ std::stringstream ss(s);
	47	+ std::string item;
	48	+ while (std::getline(ss, item, delim)) {
	49	+ elems.push_back(item);
	50	+ }
	51	+ return elems;
	52	+}
	53	+
	54	+
	55	+std::vector<std::string> split(const std::string &s, char delim) {
	56	+ std::vector<std::string> elems;
	57	+ split(s, delim, elems);
	58	+ return elems;
	59	+}
	60	+
	/**
	 * Strip trailing whitespace from s in place.
	 *
	 * Each char is converted to unsigned char before std::isspace sees it:
	 * passing a negative char (any non-ASCII byte where char is signed) to
	 * std::isspace is undefined behaviour.
	 *
	 * @param s - string to trim; modified in place.
	 * @return reference to s, to allow chaining.
	 */
	static inline std::string &rtrim(std::string &s) {
	    s.erase(
	            std::find_if(s.rbegin(), s.rend(),
	                    [](unsigned char ch) { return !std::isspace(ch); }).base(),
	            s.end());
	    return s;
	}
	65	+
	66	+void testFSA(const FSA<char>& fsa, const char fname) {
	67	+ ifstream ifs;
	68	+// ifs.exceptions(std::ifstream::failbit \| std::ifstream::badbit);
	69	+ ifs.open(fname, ios::binary);
	70	+ string line;
	71	+ while (getline(ifs, line)) {
	72	+ vector<string> split1(split(line, '\t'));
	73	+ string key = split1[0];
	74	+ key = "bijekcją";
	75	+ string value = split1[1];
	76	+
	77	+ for (unsigned int i = 0; i < key.length(); i++) {
	78	+ cout << (int) key[i] << " ";
	79	+ }
	80	+ cout << endl;
	81	+
	82	+ char* value2;
	83	+ if (fsa.tryToRecognize(key.c_str(), value2)) {
	84	+ if (string(value) != string(value2)) {
	85	+ cout << "BAD INTERP " << key << " " << value << " != " << value2 << endl;
	86	+ }
	87	+ else {
	88	+ cout << "OK! " << key << " " << value << endl;
	89	+ }
	90	+ }
	91	+ else {
	92	+ cout << "MISS " << key << " " << value << " not recognized" << endl;
	93	+ }
	94	+ }
	95	+ cout << ifs.good() << endl;
	96	+ cout << ifs.fail() << endl;
	97	+ cout << ifs.eof() << endl;
	98	+ cout << "done" << endl;
	99	+}
	100	+
	101	+/*
	102	+ *
	103	+ */
	104	+int main(int argc, char** argv) {
	105	+ validate(argc == 3, "Must provide exactly two arguments - FSA filename and dictionary filename.");
	106	+ const unsigned char* fsaData = readFile(argv[1]);
	107	+ StringDeserializer deserializer;
	108	+ SimpleFSA<char*> fsa(fsaData, deserializer);
	109	+ testFSA(fsa, argv[2]);
	110	+ cout << argc << endl;
	111	+ return 0;
	112	+}
	113	+
...	...

fsabuilder/.settings/org.eclipse.core.resources.prefs 0 → 100644

View file @b314fe9

	1	+eclipse.preferences.version=1
	2	+encoding//fsa/buildfsa.py=utf-8
	3	+encoding//fsa/test/testConstruction.py=utf-8
...	...

fsabuilder/fsa/__init__.py 0 → 100644

View file @b314fe9

fsabuilder/fsa/buildfsa.py 0 → 100644

View file @b314fe9

	1	+# -*- coding:utf-8 -*-
	2	+'''
	3	+Created on 21 paź 2013
	4	+
	5	+@author: mlenart
	6	+'''
	7	+
	8	+import sys
	9	+import logging
	10	+import codecs
	11	+import encode
	12	+import convertinput
	13	+from fsa import FSA
	14	+from serializer import SimpleSerializerWithStringValues
	15	+from visualizer import Visualizer
	16	+from optparse import OptionParser
	17	+
	18	+logging.basicConfig(level=logging.DEBUG)
	19	+
	20	+class OutputFormat():
	21	+ BINARY = 'BINARY'
	22	+ CPP = 'CPP'
	23	+
	24	+class InputFormat():
	25	+ ENCODED = 'ENCODED'
	26	+ POLIMORF = 'POLIMORF'
	27	+
	def parseOptions():
	    """
	    Parse and validate command-line arguments.

	    All of --input-file, --output-file, --input-format and --output-format
	    are required; when any is missing the help text is printed and the
	    process exits with status 1. The two format options are matched
	    case-insensitively and stored upper-cased on the returned object, so
	    that later equality checks against InputFormat / OutputFormat constants
	    behave the same as the validation done here.

	    Returns the optparse options object.
	    """
	    parser = OptionParser()
	    parser.add_option('-i', '--input-file',
	                      dest='inputFile',
	                      metavar='FILE',
	                      help='path to input file')
	    parser.add_option('-o', '--output-file',
	                      dest='outputFile',
	                      metavar='FILE',
	                      help='path to output file')
	    parser.add_option('--input-format',
	                      dest='inputFormat',
	                      help='input format - ENCODED or POLIMORF')
	    parser.add_option('--output-format',
	                      dest='outputFormat',
	                      help='output format - BINARY or CPP')
	    parser.add_option('--visualize',
	                      dest='visualize',
	                      action='store_true',
	                      default=False,
	                      help='visualize result')

	    opts, args = parser.parse_args()

	    if None in [opts.inputFile, opts.outputFile, opts.outputFormat, opts.inputFormat]:
	        parser.print_help()
	        sys.exit(1)

	    # Normalise once: validating the upper-cased value but keeping the raw
	    # one would accept e.g. "encoded" here and then fail to match it later.
	    opts.outputFormat = opts.outputFormat.upper()
	    opts.inputFormat = opts.inputFormat.upper()

	    if opts.outputFormat not in [OutputFormat.BINARY, OutputFormat.CPP]:
	        sys.stderr.write('output format must be one of ('+str([OutputFormat.BINARY, OutputFormat.CPP])+')\n')
	        sys.exit(1)
	    if opts.inputFormat not in [InputFormat.ENCODED, InputFormat.POLIMORF]:
	        sys.stderr.write('input format must be one of ('+str([InputFormat.ENCODED, InputFormat.POLIMORF])+')\n')
	        sys.exit(1)
	    return opts
	65	+
	66	+def readEncodedInput(inputFile):
	67	+ with codecs.open(inputFile, 'r', 'utf8') as f:
	68	+ for line in f.readlines():
	69	+ word, interps = line.strip().split()
	70	+ yield word, interps.split(u'\|')
	71	+
	72	+def readPolimorfInput(inputFile, encoder):
	73	+ with codecs.open(inputFile, 'r', 'utf8') as f:
	74	+ for entry in convertinput.convertPolimorf(f.readlines(), lambda (word, interp): encoder.word2SortKey(word)):
	75	+ yield entry
	76	+
	77	+if __name__ == '__main__':
	78	+ opts = parseOptions()
	79	+ encoder = encode.Encoder()
	80	+ fsa = FSA(encoder)
	81	+ serializer = SimpleSerializerWithStringValues()
	82	+
	83	+ inputData = readEncodedInput(opts.inputFile) \
	84	+ if opts.inputFormat == InputFormat.ENCODED \
	85	+ else readPolimorfInput(opts.inputFile, encoder)
	86	+
	87	+ logging.info('feeding FSA with data ...')
	88	+ fsa.feed(inputData)
	89	+ logging.info('states num: '+str(fsa.getStatesNum()))
	90	+ if opts.outputFormat == 'CPP':
	91	+ serializer.serialize2CppFile(fsa, opts.outputFile)
	92	+ else:
	93	+ serializer.serialize2BinaryFile(fsa, opts.outputFile)
	94	+ if opts.visualize:
	95	+ Visualizer().visualize(fsa)
...	...

fsabuilder/fsa/convertinput.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 23, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+import sys
	7	+import fileinput
	8	+import logging
	9	+from encode import Encoder
	10	+
	11	+def _encodeInterp(orth, base, tag, name):
	12	+ removePrefix = 0
	13	+ root = u''
	14	+ for o, b in zip(orth, base):
	15	+ if o == b:
	16	+ root += o
	17	+ else:
	18	+ break
	19	+ removeSuffixNum = len(orth) - len(root)
	20	+ addSuffix = base[len(root):]
	21	+ return u'+'.join([
	22	+ chr(ord('A')+removePrefix) + chr(ord('A')+removeSuffixNum) + addSuffix,
	23	+ tag,
	24	+ name])
	25	+
	26	+def _parsePolimorf(inputLines):
	27	+ for line0 in inputLines:
	28	+ line = line0.strip(u'\n')
	29	+ if line:
	30	+# print line
	31	+ orth, base, tag, name = line.split(u'\t')
	32	+ yield (orth, _encodeInterp(orth, base, tag, name))
	33	+
	34	+def _sortAndMergeParsedInput(inputData, key=lambda k: k):
	35	+ logging.info('sorting input...')
	36	+ entries = list(inputData)
	37	+ entries.sort(key=key)
	38	+ logging.info('done sorting')
	39	+ prevOrth = None
	40	+ prevInterps = None
	41	+ for orth, interp in entries:
	42	+ if prevOrth and prevOrth == orth:
	43	+ prevInterps.append(interp)
	44	+ else:
	45	+ if prevOrth:
	46	+ yield (prevOrth, sorted(set(prevInterps)))
	47	+ prevOrth = orth
	48	+ prevInterps = [interp]
	49	+
	50	+def convertPolimorf(inputLines, sortKey=lambda k: k):
	51	+ for orth, interps in _sortAndMergeParsedInput(_parsePolimorf(inputLines), key=sortKey):
	52	+ yield orth, interps
	53	+
	54	+def _decodeInputLines(rawInputLines, encoding):
	55	+ for line in rawInputLines:
	56	+ yield line.decode(encoding)
	57	+
	58	+if __name__ == '__main__':
	59	+ encoder = Encoder()
	60	+ for orth, interps in convertPolimorf(_decodeInputLines(fileinput.input(), 'utf8'), lambda (orth, interp): encoder.word2SortKey(orth)):
	61	+ print u'\t'.join([orth, u'\|'.join(interps)]).encode('utf8')
...	...

fsabuilder/fsa/encode.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 23, 2013
	3	+
	4	+@author: lennyn
	5	+'''
	6	+
	7	+class Encoder(object):
	8	+ '''
	9	+ classdocs
	10	+ '''
	11	+
	12	+
	13	+ def __init__(self, encoding='utf8'):
	14	+ '''
	15	+ Constructor
	16	+ '''
	17	+ self.encoding = encoding
	18	+
	19	+ def encodeWord(self, word):
	20	+ return bytearray(word, self.encoding)
	21	+
	22	+ def encodeData(self, data):
	23	+ return bytearray(u'\|'.join(data).encode(self.encoding)) + bytearray([0])
	24	+
	25	+ def decodeData(self, rawData):
	26	+# print unicode(str(rawData), self.encoding)[:-1]
	27	+# print unicode(str(rawData), self.encoding)[:-1].split(u'\|')
	28	+ return unicode(str(rawData), self.encoding)[:-1].split(u'\|')
	29	+
	30	+ def word2SortKey(self, word):
	31	+ return word.encode(self.encoding)
...	...

fsabuilder/fsa/fsa.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 8, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+
	7	+import state
	8	+import register
	9	+import logging
	10	+
	class FSA(object):
	    '''
	    A finite state automaton built incrementally from sorted (word, data) pairs.

	    Words and data are converted to bytes by the encoder passed to the
	    constructor. Equivalent states are shared through a register.
	    '''

	    def __init__(self, encoder):
	        '''
	        encoder -- object providing encodeWord, encodeData and decodeData.
	        '''
	        self.encodeWord = encoder.encodeWord
	        self.encodeData = encoder.encodeData
	        self.decodeData = encoder.decodeData
	        self.encodedPrevWord = None
	        self.initialState = state.State()
	        self.register = register.Register()

	    def tryToRecognize(self, word):
	        '''
	        Return the decoded data stored for word, or None when word is not
	        accepted by the automaton.
	        '''
	        rawData = self.initialState.tryToRecognize(self.encodeWord(word))
	        # State.tryToRecognize signals a miss with False (no such path) or
	        # None (path ends in a non-accepting state); neither is decodable.
	        if rawData is None or rawData is False:
	            return None
	        return self.decodeData(rawData)

	    def feed(self, input):
	        '''
	        Add every (word, data) pair from input to the automaton.

	        The words must arrive in strictly increasing order of their encoded
	        form. data may be a single string or a list of strings. An empty
	        input leaves the automaton unchanged.
	        '''
	        lastEncodedWord = None
	        for n, (word, data) in enumerate(input, start=1):
	            assert data is not None
	            if type(data) in [str, unicode]:
	                data = [data]
	            encodedWord = self.encodeWord(word)
	            assert self.encodedPrevWord is None or encodedWord > self.encodedPrevWord
	            self._addSorted(encodedWord, self.encodeData(data))
	            self.encodedPrevWord = encodedWord
	            lastEncodedWord = encodedWord
	            assert self.tryToRecognize(word) == data
	            if n % 10000 == 0:
	                logging.info(word)

	        # Register the path of the last word; nothing to do if input was empty.
	        if lastEncodedWord is not None:
	            self.initialState = self._replaceOrRegister(self.initialState, lastEncodedWord)
	        self.encodedPrevWord = None

	    def getStatesNum(self):
	        '''Return the number of states held in the register.'''
	        return self.register.getStatesNum()

	    def _addSorted(self, encodedWord, data):
	        '''
	        Add one encoded word with its encoded data. The word must be greater
	        than the previously added one.
	        '''
	        assert self.encodedPrevWord is None or self.encodedPrevWord < encodedWord
	        q = self.initialState
	        i = 0
	        # Follow the part of the word that is already in the automaton;
	        # the bound must be strict so encodedWord[i] is always a valid index.
	        while i < len(encodedWord) and q.hasNext(encodedWord[i]):
	            q = q.getNext(encodedWord[i])
	            i += 1
	        # The rest of the previous word's path can no longer change, so
	        # replace it with registered equivalents.
	        if self.encodedPrevWord and i < len(self.encodedPrevWord):
	            nextState = q.getNext(self.encodedPrevWord[i])
	            q.setTransition(
	                self.encodedPrevWord[i],
	                self._replaceOrRegister(nextState, self.encodedPrevWord[i+1:]))

	        # Append fresh states for the remaining suffix of the new word.
	        while i < len(encodedWord):
	            q.setTransition(encodedWord[i], state.State())
	            q = q.getNext(encodedWord[i])
	            i += 1

	        assert q.encodedData is None
	        q.encodedData = data

	    def _replaceOrRegister(self, q, encodedWord):
	        '''
	        Walk from q along encodedWord and, from the far end back to q,
	        replace each state with an equivalent registered state or register
	        it. Returns the state that should be used in place of q.
	        '''
	        if encodedWord:
	            nextState = q.getNext(encodedWord[0])
	            q.setTransition(
	                encodedWord[0],
	                self._replaceOrRegister(nextState, encodedWord[1:]))

	        if self.register.containsEquivalentState(q):
	            return self.register.getEquivalentState(q)
	        else:
	            self.register.addState(q)
	            return q

	    def calculateOffsets(self, sizeCounter):
	        '''
	        Set the offset attribute of every state reachable from the initial state.

	        sizeCounter -- callable returning the serialized size of a state.

	        States are accumulated in the order produced by dfs(); each offset is
	        then the total size minus the running total up to and including that
	        state, so the state visited last gets offset 0.
	        '''
	        currReverseOffset = 0
	        for st in self.initialState.dfs(set()):
	            currReverseOffset += sizeCounter(st)
	            st.reverseOffset = currReverseOffset
	        for st in self.initialState.dfs(set()):
	            st.offset = currReverseOffset - st.reverseOffset
	90	+
	91	+
	92	+
0	93	\ No newline at end of file
...	...

fsabuilder/fsa/register.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 8, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+
	7	+class Register(object):
	8	+ '''
	9	+ States register.
	10	+ '''
	11	+
	12	+
	13	+ def __init__(self):
	14	+ self._map = {}
	15	+
	16	+ def addState(self, state):
	17	+ self._map[state.getRegisterKey()] = state
	18	+
	19	+ def getEquivalentState(self, state):
	20	+ return self._map[state.getRegisterKey()]
	21	+
	22	+ def containsEquivalentState(self, state):
	23	+ return state.getRegisterKey() in self._map
	24	+
	25	+ def getStatesNum(self):
	26	+ return len(self._map)
...	...

fsabuilder/fsa/serializer.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 20, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+
	class Serializer(object):
	    '''
	    Base class for writing an automaton out as bytes.

	    Subclasses supply getStateSize and state2bytearray; this class lays the
	    states out in offset order and writes them as raw bytes or as a C++
	    array definition.
	    '''

	    def __init__(self):
	        pass

	    def _statesInOffsetOrder(self, fsa):
	        '''
	        Recalculate state offsets and return all states reachable from the
	        initial state, sorted by offset.
	        '''
	        fsa.calculateOffsets(sizeCounter=lambda state: self.getStateSize(state))
	        return sorted(fsa.initialState.dfs(set()), key=lambda state: state.offset)

	    def serialize2CppFile(self, fsa, fname):
	        '''
	        Write fsa to fname as the C++ definition of DEFAULT_FSA, one comment
	        line and one line of hex bytes per state.
	        '''
	        res = ['const unsigned char DEFAULT_FSA[] = {']
	        for idx, state in enumerate(self._statesInOffsetOrder(fsa)):
	            res.append('// state '+str(idx))
	            partRes = []
	            for byte in self.state2bytearray(state):
	                partRes.append(hex(byte))
	                partRes.append(',')
	            res.append(' '.join(partRes))
	        # An array definition has to end with a semicolon to compile.
	        res.append('};')
	        with open(fname, 'w') as f:
	            f.write('\n'.join(res))

	    def serialize2BinaryFile(self, fsa, fname):
	        '''Write the raw bytes of fsa to fname.'''
	        with open(fname, 'wb') as f:
	            f.write(self.fsa2bytearray(fsa))

	    def getStateSize(self, state):
	        '''Return the serialized size of state in bytes. Must be overridden.'''
	        raise NotImplementedError('Not implemented')

	    def fsa2bytearray(self, fsa):
	        '''Return the serialized states of fsa, concatenated in offset order.'''
	        res = bytearray()
	        for state in self._statesInOffsetOrder(fsa):
	            res.extend(self.state2bytearray(state))
	        return res

	    def state2bytearray(self, state):
	        '''Return the serialized form of a single state. Must be overridden.'''
	        raise NotImplementedError('Not implemented')
	48	+
	class SimpleSerializer(Serializer):
	    '''
	    Serializer producing, per state: one header byte (accepting flag in the
	    top bit, number of transitions in the low 7 bits), the state's encoded
	    data if it is accepting, then 4 bytes per transition (label byte plus a
	    24-bit little-endian target offset).
	    '''

	    ACCEPTING_FLAG = 128
	    # The transition count shares the header byte with the accepting flag.
	    MAX_TRANSITIONS = ACCEPTING_FLAG - 1
	    # Target offsets are written as 3 bytes.
	    MAX_OFFSET = 0xFFFFFF

	    def getStateSize(self, state):
	        '''Return the serialized size of state in bytes.'''
	        return 1 + 4 * len(state.transitionsMap.keys()) + self.getDataSize(state)

	    def getDataSize(self, state):
	        '''Return the size of the state's data in bytes. Must be overridden.'''
	        raise NotImplementedError('Not implemented')

	    def state2bytearray(self, state):
	        '''Return header, data and transitions of state as one bytearray.'''
	        res = bytearray()
	        res.extend(self._stateData2bytearray(state))
	        res.extend(self._transitionsData2bytearray(state))
	        return res

	    def _stateData2bytearray(self, state):
	        '''Return the header byte followed by the data of an accepting state.'''
	        res = bytearray()
	        transitionsNum = len(state.transitionsMap)
	        # A larger count would overwrite the accepting flag in the header.
	        assert transitionsNum <= SimpleSerializer.MAX_TRANSITIONS
	        firstByte = transitionsNum
	        if state.isAccepting():
	            firstByte |= SimpleSerializer.ACCEPTING_FLAG
	        # Zero would mean a non-accepting state without transitions.
	        assert firstByte < 256 and firstByte > 0
	        res.append(firstByte)
	        if state.isAccepting():
	            res.extend(state.encodedData)
	        return res

	    def _transitionsData2bytearray(self, state):
	        '''Return the transitions table of state, sorted by label.'''
	        res = bytearray()

	        def asSignedChar(c):
	            # must sort that strange way because it must be sorted according to char, not unsigned char
	            return c if (c >= 0 and c < 128) else c - 256

	        for byte, nextState in sorted(state.transitionsMap.items(), key=lambda item: asSignedChar(item[0])):
	            offset = nextState.offset
	            # Anything wider than 24 bits would be silently truncated below.
	            assert 0 <= offset <= SimpleSerializer.MAX_OFFSET
	            res.append(byte)
	            res.append(offset & 0x0000FF)
	            res.append((offset & 0x00FF00) >> 8)
	            res.append((offset & 0xFF0000) >> 16)
	        return res
	87	+
	88	+class SimpleSerializerWithStringValues(SimpleSerializer):
	89	+
	90	+ def getDataSize(self, state):
	91	+ assert type(state.encodedData) == bytearray or not state.isAccepting()
	92	+ return len(state.encodedData) if state.isAccepting() else 0
	93	+
0	94	\ No newline at end of file
...	...

fsabuilder/fsa/state.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 8, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+
	7	+class State(object):
	8	+ '''
	9	+ A state in an automaton
	10	+ '''
	11	+
	12	+ def __init__(self):
	13	+ self.transitionsMap = {}
	14	+ self.encodedData = None
	15	+ self.reverseOffset = None
	16	+ self.offset = None
	17	+
	18	+ def setTransition(self, byte, nextState):
	19	+ self.transitionsMap[byte] = nextState
	20	+
	21	+ def hasNext(self, byte):
	22	+ return byte in self.transitionsMap
	23	+
	24	+ def getNext(self, byte):
	25	+ return self.transitionsMap.get(byte, None)
	26	+
	27	+ def getRegisterKey(self):
	28	+ return ( frozenset(self.transitionsMap.iteritems()), tuple(self.encodedData) if self.encodedData else None )
	29	+
	30	+ def isAccepting(self):
	31	+ return self.encodedData is not None
	32	+
	33	+ def tryToRecognize(self, word):
	34	+ if word:
	35	+ nextState = self.getNext(word[0])
	36	+ if nextState:
	37	+ return nextState.tryToRecognize(word[1:])
	38	+ else:
	39	+ return False
	40	+ else:
	41	+ return self.encodedData
	42	+
	43	+ def dfs(self, alreadyVisited):
	44	+ if not self in alreadyVisited:
	45	+ for _, state in sorted(self.transitionsMap.iteritems()):
	46	+ for state1 in state.dfs(alreadyVisited):
	47	+ yield state1
	48	+ alreadyVisited.add(self)
	49	+ yield self
...	...

fsabuilder/fsa/test/__init__.py 0 → 100644

View file @b314fe9

fsabuilder/fsa/test/testConstruction.py 0 → 100644

View file @b314fe9

	1	+#-*- coding: utf-8 -*-
	2	+'''
	3	+Created on Oct 8, 2013
	4	+
	5	+@author: lennyn
	6	+'''
	7	+import unittest
	8	+from fsa import fsa, visualizer, encode
	9	+
	10	+class Test(unittest.TestCase):
	11	+
	12	+
	13	+ def testSimpleConstruction(self):
	14	+ print 'dupa'
	15	+ a = fsa.FSA(encode.Encoder())
	16	+ input = sorted([
	17	+ (u'bić', ''),
	18	+ (u'bij', ''),
	19	+ (u'biją', ''),
	20	+ (u'bijcie', ''),
	21	+ (u'bije', ''),
	22	+ (u'bijecie', ''),
	23	+ (u'bijemy', ''),
	24	+ (u'bijesz', ''),
	25	+ (u'biję', ''),
	26	+ (u'bijmy', ''),
	27	+ (u'bili', 'asd'),
	28	+ (u'biliby', ''),
	29	+ (u'bilibyście', ''),
	30	+ (u'bilibyśmy', ''),
	31	+ (u'biliście', 'asdfas'),
	32	+ (u'biliśmy', ''),
	33	+ (u'bił', 'wersadfas'),
	34	+ (u'biła', 'asdfasd'),
	35	+ (u'biłaby', 'asdfa'),
	36	+ (u'biłabym', ''),
	37	+ (u'biłabyś', 'asdfa'),
	38	+ (u'biłam', 'dfas'),
	39	+ (u'biłaś', 'asdfas'),
	40	+ (u'biłby', ''),
	41	+ (u'biłbym', 'asdfa'),
	42	+ (u'biłbyś', ''),
	43	+ (u'biłem', ''),
	44	+ (u'biłeś', 'sadfa'),
	45	+ (u'biły', ''),
	46	+ (u'biłyby', ''),
	47	+ (u'biłybyście', ''),
	48	+ (u'biłybyśmy', ''),
	49	+ (u'biłyście', ''),
	50	+ (u'biłyśmy', ''),
	51	+ ], key=lambda w: bytearray(w[0], 'utf8'))
	52	+ a.feed(input)
	53	+ print a.getStatesNum()
	54	+# print a.tryToRecognize(u'bi')
	55	+# print a.tryToRecognize(u'bić')
	56	+# print a.tryToRecognize(u'bili')
	57	+ for w, res in input:
	58	+ print w, res, a.tryToRecognize(w)
	59	+ recognized = a.tryToRecognize(w)
	60	+ if type(res) in [str, unicode]:
	61	+ recognized = recognized[0]
	62	+ assert recognized == res
	63	+ a.calculateOffsets(lambda state: 1 + 4 * len(state.transitionsMap.keys()) + (len(state.encodedData) if state.isAccepting() else 0))
	64	+ visualizer.Visualizer().visualize(a)
	65	+ print 'done'
	66	+
	67	+if __name__ == "__main__":
	68	+ #import sys;sys.argv = ['', 'Test.testSimpleConstruction']
	69	+ unittest.main()
0	70	\ No newline at end of file
...	...

fsabuilder/fsa/visualizer.py 0 → 100644

View file @b314fe9

	1	+'''
	2	+Created on Oct 20, 2013
	3	+
	4	+@author: mlenart
	5	+'''
	6	+
	7	+import networkx as nx
	8	+import matplotlib.pyplot as plt
	9	+
	10	+class Visualizer(object):  # Draws an automaton as a directed graph using networkx (nx) and matplotlib (plt).
	11	+
	12	+ def __init__(self):  # Nothing to set up.
	13	+ pass
	14	+
	15	+ def visualize(self, fsa):  # fsa must expose initialState.dfs(); each yielded state needs offset, transitionsMap, isAccepting() and (when accepting) encodedData.
	16	+ G = nx.DiGraph()
	17	+ allStates = list(reversed(list(fsa.initialState.dfs(set()))))  # states in reverse dfs() order; a state's position in this list is its node id
	18	+ edgeLabelsMap = {}  # (source id, target id) -> edge label
	19	+ nodeLabelsMap = {}  # node id -> node label
	20	+ for idx, state in enumerate(allStates):
	21	+ G.add_node(idx, offset=state.offset)  # offset is stored as a node attribute
	22	+ for c, targetState in state.transitionsMap.iteritems():  # c is a numeric label (compared with 127 and passed to chr below)
	23	+ G.add_edge(idx, allStates.index(targetState))
	24	+ label = chr(c) if c <= 127 else '%'  # codes above 127 are drawn as '%'
	25	+ edgeLabelsMap[(idx, allStates.index(targetState))] = label
	26	+ nodeLabelsMap[idx] = state.offset if not state.isAccepting() else state.encodedData + '(' + str(state.offset) + ')'  # accepting states show encodedData followed by the offset in parentheses, others the offset only. NOTE(review): presumably runs once per state (outer loop); indentation is not recoverable here - confirm
	27	+ pos=nx.shell_layout(G)  # node positions used by all drawing calls below
	28	+ nx.draw_networkx_nodes(G,  # non-accepting states: square ('s'), white ('w')
	29	+ pos,
	30	+ nodelist=list([allStates.index(s) for s in allStates if not s.isAccepting()]),
	31	+ node_shape='s',
	32	+ node_color='w')
	33	+ nx.draw_networkx_nodes(G,  # accepting states: square, no node_color passed
	34	+ pos,
	35	+ nodelist=list([allStates.index(s) for s in allStates if s.isAccepting()]),
	36	+ node_shape='s')
	37	+# nx.draw_networkx_nodes(G, pos, nodelist=list([allStates.index(s) for s in allStates if s.isFinal()])), )
	38	+ nx.draw_networkx_edges(G, pos, edgelist=edgeLabelsMap.keys())
	39	+ nx.draw_networkx_edge_labels(G, pos, edge_labels = edgeLabelsMap)
	40	+ nx.draw_networkx_labels(G, pos, labels=nodeLabelsMap)
	41	+ plt.axis('off')  # hide the axes around the graph
	42	+ plt.draw()
	43	+ plt.show()  # display the figure
	44	+# plt.savefig('dupa.png')
	45	+ print 'done'
0	46	\ No newline at end of file
...	...

fsabuilder/input/__init__.py 0 → 100644

View file @b314fe9

morfeusz/CMakeLists.txt 0 → 100644

View file @b314fe9

	1	+# Make sure the compiler can find the include files in the fsa directory (see include_directories below).
	2	+#include_directories (${Morfeusz_SOURCE_DIR}/FSALibrary)
	3	+
	4	+# Disabled linker search path; it still points at a "Hello" directory, not at anything built here.
	5	+#link_directories (${Morfeusz_BINARY_DIR}/Hello)
	6	+include_directories (${Morfeusz_SOURCE_DIR}/fsa)
	7	+add_library (morfeusz2 morfeusz.hpp morfeusz.cpp)
	8	+add_executable (morfeusz2_analyze main.cpp)
	9	+
	10	+# Link the executable to the morfeusz2 library.
	11	+target_link_libraries (morfeusz2_analyze morfeusz2)
	12	+set_target_properties ( morfeusz2_analyze PROPERTIES COMPILE_FLAGS "-std=gnu++0x" ) # flags are set on the executable target only
...	...

morfeusz/default_fsa.hpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: default_fsa.hpp
	3	+ * Author: mlenart
	4	+ *
	5	+ * Created on October 21, 2013, 5:50 PM
	6	+ */
	7	+
	8	+#ifndef DEFAULT_FSA_HPP
	9	+#define DEFAULT_FSA_HPP
	10	+
	11	+extern const unsigned char DEFAULT_FSA[]; // declaration only; the array is defined elsewhere. NOTE(review): presumably the serialized default automaton - confirm
	12	+
	13	+#endif /* DEFAULT_FSA_HPP */
	14	+
...	...

morfeusz/main.cpp 0 → 100644

View file @b314fe9

	1	+/*
	2	+ * File: main.cpp
	3	+ * Author: mlenart
	4	+ *
	5	+ * Created on October 8, 2013, 12:41 PM
	6	+ */
	7	+
	8	+#include <cstdlib>
	9	+#include <iostream>
	10	+#include "fsa.hpp"
	11	+#include "default_fsa.hpp"
	12	+
	13	+using namespace std;
	14	+
	15	+/*
	16	+ * Scratch entry point: prints the address of a local char to stdout, then returns 0. Arguments are unused.
	17	+ */
	18	+int main(int argc, char** argv) {
	19	+ char x = static_cast<char>(255); // 255 is out of range when char is signed; make the conversion explicit
	20	+ // Print the address via const void*: a char* would be streamed as a C string, and casting a pointer to int is ill-formed where int is narrower than a pointer.
	21	+ cout << static_cast<const void*>(&x) << endl;
	22	+ return 0;
	23	+}
	24	+
	25	+
...	...

morfeusz/morfeusz.cpp 0 → 100644

View file @b314fe9

morfeusz/morfeusz.hpp 0 → 100644

View file @b314fe9

nbproject/configurations.xml 0 → 100644

View file @b314fe9

	1	+<?xml version="1.0" encoding="UTF-8"?>
	2	+<configurationDescriptor version="90">
	3	+ <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
	4	+ <df root="fsa" name="0">
	5	+ <in>_state_impl.hpp</in>
	6	+ </df>
	7	+ <df root="morfeusz" name="1">
	8	+ <in>default_fsa.hpp</in>
	9	+ <in>main.cpp</in>
	10	+ <in>morfeusz.cpp</in>
	11	+ </df>
	12	+ <logicalFolder name="ExternalFiles"
	13	+ displayName="Important Files"
	14	+ projectFiles="false"
	15	+ kind="IMPORTANT_FILES_FOLDER">
	16	+ <itemPath>CMakeLists.txt</itemPath>
	17	+ <itemPath>build/Makefile</itemPath>
	18	+ </logicalFolder>
	19	+ </logicalFolder>
	20	+ <sourceFolderFilter>^(nbproject)$</sourceFolderFilter>
	21	+ <sourceRootList>
	22	+ <Elem>fsa</Elem>
	23	+ <Elem>morfeusz</Elem>
	24	+ </sourceRootList>
	25	+ <projectmakefile>build/Makefile</projectmakefile>
	26	+ <confs>
	27	+ <conf name="Default" type="0">
	28	+ <toolsSet>
	29	+ <compilerSet>default</compilerSet>
	30	+ <dependencyChecking>false</dependencyChecking>
	31	+ <rebuildPropChanged>false</rebuildPropChanged>
	32	+ </toolsSet>
	33	+ <codeAssistance>
	34	+ </codeAssistance>
	35	+ <makefileType>
	36	+ <makeTool>
	37	+ <buildCommandWorkingDir>build</buildCommandWorkingDir>
	38	+ <buildCommand>${MAKE} -f Makefile</buildCommand>
	39	+ <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
	40	+ <executablePath></executablePath>
	41	+ <ccTool>
	42	+ <incDir>
	43	+ <pElem>fsa</pElem>
	44	+ <pElem>build/morfeusz</pElem>
	45	+ </incDir>
	46	+ </ccTool>
	47	+ </makeTool>
	48	+ </makefileType>
	49	+ <item path="fsa/_state_impl.hpp" ex="false" tool="3" flavor2="0">
	50	+ </item>
	51	+ <item path="morfeusz/default_fsa.hpp" ex="false" tool="3" flavor2="0">
	52	+ </item>
	53	+ <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
	54	+ <ccTool>
	55	+ </ccTool>
	56	+ </item>
	57	+ <item path="morfeusz/morfeusz.cpp" ex="false" tool="1" flavor2="0">
	58	+ </item>
	59	+ </conf>
	60	+ </confs>
	61	+</configurationDescriptor>
...	...

nbproject/project.xml 0 → 100644

View file @b314fe9

	1	+<?xml version="1.0" encoding="UTF-8"?>
	2	+<project xmlns="http://www.netbeans.org/ns/project/1">
	3	+ <type>org.netbeans.modules.cnd.makeproject</type>
	4	+ <configuration>
	5	+ <data xmlns="http://www.netbeans.org/ns/make-project/1">
	6	+ <name>morfeusz</name>
	7	+ <c-extensions/>
	8	+ <cpp-extensions>cpp</cpp-extensions>
	9	+ <header-extensions>hpp</header-extensions>
	10	+ <sourceEncoding>UTF-8</sourceEncoding>
	11	+ <make-dep-projects/>
	12	+ <sourceRootList>
	13	+ <sourceRootElem>fsa</sourceRootElem>
	14	+ <sourceRootElem>morfeusz</sourceRootElem>
	15	+ </sourceRootList>
	16	+ <confList>
	17	+ <confElem>
	18	+ <name>Default</name>
	19	+ <type>0</type>
	20	+ </confElem>
	21	+ </confList>
	22	+ <formatting>
	23	+ <project-formatting-style>false</project-formatting-style>
	24	+ </formatting>
	25	+ </data>
	26	+ </configuration>
	27	+</project>
...	...