Commit 41cbc0cc6470d77c8edd1081993b282bb8b955b0

Authored by Michał Lenart
1 parent 9f473443

- dodanie testów szybkości

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@4 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
fsa/CMakeLists.txt
1 1  
2 2 add_executable (test_dict test_dict.cpp)
  3 +add_executable (test_speed test_speed.cpp)
3 4 set_target_properties ( test_dict PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -g" )
  5 +set_target_properties ( test_speed PROPERTIES COMPILE_FLAGS "-std=gnu++0x -Wall -O2" )
... ...
fsa/_simple_fsa_impl.hpp renamed to fsa/_fsa_impl.hpp
... ... @@ -55,9 +55,9 @@ static void debugTransitions(const TransitionData* transitionsTable, const Trans
55 55 while (transitionsTable + offset < transitionsEnd) {
56 56 const TransitionData td = *(transitionsTable + offset);
57 57 if ((td.label <= 'z' && 'a' <= td.label))
58   - cerr << td.label << " " << td.targetOffset << endl;
  58 + cerr << td.label << " " << td.targetOffset << endl;
59 59 else {
60   - cerr << ((int) td.label) << " " << td.targetOffset << endl;
  60 + cerr << ((int) td.label) << " " << td.targetOffset << endl;
61 61 }
62 62 offset++;
63 63 }
... ... @@ -65,43 +65,62 @@ static void debugTransitions(const TransitionData* transitionsTable, const Trans
65 65  
66 66 template <class T>
67 67 void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const {
68   - if (c<= 'z' && 'a' <= c)
69   - cerr << "NEXT " << c << " from " << state.getOffset() << endl;
70   - else
71   - cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
  68 +// if (c <= 'z' && 'a' <= c)
  69 +// cerr << "NEXT " << c << " from " << state.getOffset() << endl;
  70 +// else
  71 +// cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl;
72 72 const unsigned char* fromPointer = this->startPtr + state.getOffset();
73 73 int transitionsTableOffset = sizeof (StateData);
74 74 if (state.isAccepting()) {
75 75 transitionsTableOffset += state.getValueSize();
76   - cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
  76 +// cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl;
77 77 }
78 78 const StateData* stateData = reinterpret_cast<const StateData*> (fromPointer);
79 79 const TransitionData* transitionsTable = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset);
80 80 const TransitionData* transitionsEnd = transitionsTable + stateData->transitionsNum;
81   - debugState(stateData);
82   - debugTransitions(transitionsTable, transitionsEnd);
  81 +// debugState(stateData);
  82 +// debugTransitions(transitionsTable, transitionsEnd);
83 83 const TransitionData* foundTransition = std::lower_bound(
84 84 transitionsTable, transitionsEnd,
85   - TransitionData{c, 0},
86   - compareTransitions);
  85 + TransitionData{c, 0}, compareTransitions);
87 86 if (foundTransition == transitionsEnd || foundTransition->label != c) {
88   - cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl;
  87 +// cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl;
89 88 state.setNextAsSink();
90 89 }
91 90 else {
92   -// cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl;
  91 + // cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl;
93 92 const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset;
94 93 const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer);
95 94 if (nextStateData->accepting) {
96   - cerr << "ACCEPTING" << endl;
  95 +// cerr << "ACCEPTING" << endl;
97 96 T object;
98 97 int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object);
99 98 state.setNext(foundTransition->targetOffset, object, size);
100   - }
101   - else {
  99 + } else {
102 100 state.setNext(foundTransition->targetOffset);
103 101 }
104 102 }
105 103 }
106 104  
  105 +template <class T>
  106 +bool FSA<T>::tryToRecognize(const char* input, T& value) const {
  107 + State<T> currState = this->getInitialState();
  108 + int i = 0;
  109 + while (!currState.isSink() && input[i] != '\0') {
  110 + currState.proceedToNext(input[i]);
  111 + i++;
  112 + }
  113 + if (currState.isAccepting()) {
  114 + value = currState.getValue();
  115 + return true;
  116 + } else {
  117 + return false;
  118 + }
  119 +}
  120 +
  121 +template <class T>
  122 +State<T> FSA<T>::getInitialState() const {
  123 + return State<T>(*this);
  124 +}
  125 +
107 126 #endif /* _SIMPLE_FSA_IMPL_HPP */
... ...
fsa/fsa.hpp
... ... @@ -54,25 +54,9 @@ public:
54 54 /**
55 55 * Get this automaton's initial state.
56 56 */
57   - State<T> getInitialState() const {
58   - return State<T>(*this);
59   - }
  57 + State<T> getInitialState() const;
60 58  
61   - bool tryToRecognize(const char* input, T& value) const {
62   - State<T> currState = this->getInitialState();
63   - int i = 0;
64   - while (!currState.isSink() && input[i] != '\0') {
65   - currState.proceedToNext(input[i]);
66   - i++;
67   - }
68   - if (currState.isAccepting()) {
69   - value = currState.getValue();
70   - return true;
71   - }
72   - else {
73   - return false;
74   - }
75   - }
  59 + bool tryToRecognize(const char* input, T& value) const;
76 60  
77 61 virtual ~FSA() {
78 62 }
... ... @@ -83,10 +67,6 @@ protected:
83 67 }
84 68 /**
85 69 * Proceed from the given state along the transition labelled c, updating the state in place.
86   - *
87   - * @param fromPointer - wskaźnik
88   - * @param c - char for the transition.
89   - * @return next state
90 70 */
91 71 virtual void proceedToNext(const char c, State<T>& state) const = 0;
92 72 const unsigned char* startPtr;
... ... @@ -107,7 +87,7 @@ private:
107 87  
108 88 };
109 89  
110   -#include "_simple_fsa_impl.hpp"
  90 +#include "_fsa_impl.hpp"
111 91  
112 92 /**
113 93 * A state in an FSA.
... ...
fsa/test_dict.cpp
... ... @@ -14,34 +14,12 @@
14 14 #include <functional>
15 15 #include <cctype>
16 16 #include <locale>
  17 +#include <vector>
17 18 #include "fsa.hpp"
  19 +#include "utils.hpp"
18 20  
19 21 using namespace std;
20 22  
21   -void validate(const bool cond, const string& msg) {
22   - if (!cond) {
23   - cerr << msg << endl;
24   - exit(1);
25   - }
26   -}
27   -
28   -unsigned char* readFile(const char* fname) {
29   - ifstream ifs;
30   - ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
31   - ifs.open(fname, ios::in | ios::binary | ios::ate);
32   - // if (ifs.is_open()) {
33   - int size = ifs.tellg();
34   - unsigned char* memblock = new unsigned char [size];
35   - ifs.seekg(0, ios::beg);
36   - ifs.read(reinterpret_cast<char*> (memblock), size);
37   - ifs.close();
38   - return memblock;
39   - // }
40   - // else {
41   - // cerr << "Unable to open file " << fname << endl;
42   - // }
43   -}
44   -
45 23 std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) {
46 24 std::stringstream ss(s);
47 25 std::string item;
... ...
fsa/test_speed.cpp 0 → 100644
  1 +/*
  2 + * File: test_speed.cpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 24 październik 2013, 17:47
  6 + */
  7 +
  8 +#include <cstdlib>
  9 +#include <fstream>
  10 +#include "fsa.hpp"
  11 +#include "utils.hpp"
  12 +
  13 +#define NDEBUG
  14 +
  15 +using namespace std;
  16 +
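  17 +/*
  18 + * Speed test: loads the FSA from argv[1] and calls tryToRecognize on every line of argv[2], discarding the results.
  19 + */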
  20 +int main(int argc, char** argv) {
  21 + validate(argc == 3, "Must provide exactly two arguments - FSA filename and test data filename.");
  22 + const unsigned char* fsaData = readFile(argv[1]);
  23 + StringDeserializer deserializer;
  24 + SimpleFSA<char*> fsa(fsaData, deserializer);
  25 + ifstream ifs;
  26 +// ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  27 + ifs.open(argv[2], ios::binary);
  28 + string line;
  29 + while (getline(ifs, line)) {
  30 + char* val;
  31 + if (fsa.tryToRecognize(line.c_str(), val)) {
  32 +// printf("%s: *OK*\n", line.c_str());
  33 + }
  34 + else {
  35 +// printf("%s: NOT FOUND", line.c_str());
  36 + }
  37 + }
  38 + return 0;
  39 +}
  40 +
... ...
fsa/utils.hpp 0 → 100644
  1 +/*
  2 + * File: utils.hpp
  3 + * Author: mlenart
  4 + *
  5 + * Created on 24 październik 2013, 17:56
  6 + */
  7 +
  8 +#ifndef UTILS_HPP
  9 +#define UTILS_HPP
  10 +
  11 +#include <iostream>
  12 +#include <string>
  13 +
  14 +void validate(const bool cond, const std::string& msg) {
  15 + if (!cond) {
  16 + std::cerr << msg << std::endl;
  17 + exit(1);
  18 + }
  19 +}
  20 +
  21 +unsigned char* readFile(const char* fname) {
  22 + ifstream ifs;
  23 + ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
  24 + ifs.open(fname, ios::in | ios::binary | ios::ate);
  25 + // if (ifs.is_open()) {
  26 + int size = ifs.tellg();
  27 + unsigned char* memblock = new unsigned char [size];
  28 + ifs.seekg(0, ios::beg);
  29 + ifs.read(reinterpret_cast<char*> (memblock), size);
  30 + ifs.close();
  31 + return memblock;
  32 + // }
  33 + // else {
  34 + // cerr << "Unable to open file " << fname << endl;
  35 + // }
  36 +}
  37 +
  38 +#endif /* UTILS_HPP */
  39 +
... ...
fsabuilder/fsa/buildfsa.py
... ... @@ -24,6 +24,7 @@ class OutputFormat():
24 24 class InputFormat():
25 25 ENCODED = 'ENCODED'
26 26 POLIMORF = 'POLIMORF'
  27 + PLAIN = 'PLAIN'
27 28  
28 29 def parseOptions():
29 30 """
... ... @@ -58,7 +59,7 @@ def parseOptions():
58 59 if not opts.outputFormat.upper() in [OutputFormat.BINARY, OutputFormat.CPP]:
59 60 print >> sys.stderr, 'output format must be one of ('+str([OutputFormat.BINARY, OutputFormat.CPP])+')'
60 61 exit(1)
61   - if not opts.inputFormat.upper() in [InputFormat.ENCODED, InputFormat.POLIMORF]:
  62 + if not opts.inputFormat.upper() in [InputFormat.ENCODED, InputFormat.POLIMORF, InputFormat.PLAIN]:
62 63 print >> sys.stderr, 'input format must be one of ('+str([InputFormat.ENCODED, InputFormat.POLIMORF])+')'
63 64 exit(1)
64 65 return opts
... ... @@ -74,15 +75,23 @@ def readPolimorfInput(inputFile, encoder):
74 75 for entry in convertinput.convertPolimorf(f.readlines(), lambda (word, interp): encoder.word2SortKey(word)):
75 76 yield entry
76 77  
  78 +def readPlainInput(inputFile, encoder):
  79 + with codecs.open(inputFile, 'r', 'utf8') as f:
  80 + for line in sorted(f.readlines(), key=encoder.word2SortKey):
  81 + word = line.strip()
  82 + yield word, ''
  83 +
77 84 if __name__ == '__main__':
78 85 opts = parseOptions()
79 86 encoder = encode.Encoder()
80 87 fsa = FSA(encoder)
81 88 serializer = SimpleSerializerWithStringValues()
82 89  
83   - inputData = readEncodedInput(opts.inputFile) \
84   - if opts.inputFormat == InputFormat.ENCODED \
85   - else readPolimorfInput(opts.inputFile, encoder)
  90 + inputData = {
  91 + InputFormat.ENCODED: readEncodedInput(opts.inputFile),
  92 + InputFormat.POLIMORF: readPolimorfInput(opts.inputFile, encoder),
  93 + InputFormat.PLAIN: readPlainInput(opts.inputFile, encoder)
  94 + }[opts.inputFormat]
86 95  
87 96 logging.info('feeding FSA with data ...')
88 97 fsa.feed(inputData)
... ...
nbproject/configurations.xml
... ... @@ -2,10 +2,11 @@
2 2 <configurationDescriptor version="90">
3 3 <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT">
4 4 <df root="fsa" name="0">
5   - <in>_state_impl.hpp</in>
  5 + <in>test_dict.cpp</in>
  6 + <in>test_speed.cpp</in>
  7 + <in>utils.hpp</in>
6 8 </df>
7 9 <df root="morfeusz" name="1">
8   - <in>default_fsa.hpp</in>
9 10 <in>main.cpp</in>
10 11 <in>morfeusz.cpp</in>
11 12 </df>
... ... @@ -38,17 +39,30 @@
38 39 <buildCommand>${MAKE} -f Makefile</buildCommand>
39 40 <cleanCommand>${MAKE} -f Makefile clean</cleanCommand>
40 41 <executablePath></executablePath>
41   - <ccTool>
42   - <incDir>
43   - <pElem>fsa</pElem>
44   - <pElem>build/morfeusz</pElem>
45   - </incDir>
46   - </ccTool>
47 42 </makeTool>
48 43 </makefileType>
49   - <item path="fsa/_state_impl.hpp" ex="false" tool="3" flavor2="0">
  44 + <folder path="0">
  45 + <ccTool>
  46 + <incDir>
  47 + <pElem>build/fsa</pElem>
  48 + </incDir>
  49 + </ccTool>
  50 + </folder>
  51 + <folder path="1">
  52 + <ccTool>
  53 + <incDir>
  54 + <pElem>fsa</pElem>
  55 + <pElem>build/morfeusz</pElem>
  56 + </incDir>
  57 + </ccTool>
  58 + </folder>
  59 + <item path="fsa/test_dict.cpp" ex="false" tool="1" flavor2="8">
  60 + <ccTool>
  61 + </ccTool>
  62 + </item>
  63 + <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="0">
50 64 </item>
51   - <item path="morfeusz/default_fsa.hpp" ex="false" tool="3" flavor2="0">
  65 + <item path="fsa/utils.hpp" ex="false" tool="3" flavor2="0">
52 66 </item>
53 67 <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8">
54 68 <ccTool>
... ...