Commit 41cbc0cc6470d77c8edd1081993b282bb8b955b0
1 parent
9f473443
- dodanie testów szybkości
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/morfeusz@4 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
8 changed files
with
158 additions
and
77 deletions
fsa/CMakeLists.txt
fsa/_simple_fsa_impl.hpp renamed to fsa/_fsa_impl.hpp
... | ... | @@ -55,9 +55,9 @@ static void debugTransitions(const TransitionData* transitionsTable, const Trans |
55 | 55 | while (transitionsTable + offset < transitionsEnd) { |
56 | 56 | const TransitionData td = *(transitionsTable + offset); |
57 | 57 | if ((td.label <= 'z' && 'a' <= td.label)) |
58 | - cerr << td.label << " " << td.targetOffset << endl; | |
58 | + cerr << td.label << " " << td.targetOffset << endl; | |
59 | 59 | else { |
60 | - cerr << ((int) td.label) << " " << td.targetOffset << endl; | |
60 | + cerr << ((int) td.label) << " " << td.targetOffset << endl; | |
61 | 61 | } |
62 | 62 | offset++; |
63 | 63 | } |
... | ... | @@ -65,43 +65,62 @@ static void debugTransitions(const TransitionData* transitionsTable, const Trans |
65 | 65 | |
66 | 66 | template <class T> |
67 | 67 | void SimpleFSA<T>::proceedToNext(const char c, State<T>& state) const { |
68 | - if (c<= 'z' && 'a' <= c) | |
69 | - cerr << "NEXT " << c << " from " << state.getOffset() << endl; | |
70 | - else | |
71 | - cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; | |
68 | +// if (c <= 'z' && 'a' <= c) | |
69 | +// cerr << "NEXT " << c << " from " << state.getOffset() << endl; | |
70 | +// else | |
71 | +// cerr << "NEXT " << (short) c << " from " << state.getOffset() << endl; | |
72 | 72 | const unsigned char* fromPointer = this->startPtr + state.getOffset(); |
73 | 73 | int transitionsTableOffset = sizeof (StateData); |
74 | 74 | if (state.isAccepting()) { |
75 | 75 | transitionsTableOffset += state.getValueSize(); |
76 | - cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; | |
76 | +// cerr << "transitionsTableOffset " << transitionsTableOffset + state.getOffset() << " because value is " << state.getValue() << endl; | |
77 | 77 | } |
78 | 78 | const StateData* stateData = reinterpret_cast<const StateData*> (fromPointer); |
79 | 79 | const TransitionData* transitionsTable = reinterpret_cast<const TransitionData*> (fromPointer + transitionsTableOffset); |
80 | 80 | const TransitionData* transitionsEnd = transitionsTable + stateData->transitionsNum; |
81 | - debugState(stateData); | |
82 | - debugTransitions(transitionsTable, transitionsEnd); | |
81 | +// debugState(stateData); | |
82 | +// debugTransitions(transitionsTable, transitionsEnd); | |
83 | 83 | const TransitionData* foundTransition = std::lower_bound( |
84 | 84 | transitionsTable, transitionsEnd, |
85 | - TransitionData{c, 0}, | |
86 | - compareTransitions); | |
85 | + TransitionData{c, 0}, compareTransitions); | |
87 | 86 | if (foundTransition == transitionsEnd || foundTransition->label != c) { |
88 | - cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl; | |
87 | +// cerr << "SINK" << (foundTransition == transitionsEnd) << " " << foundTransition->label << " for " << c << endl; | |
89 | 88 | state.setNextAsSink(); |
90 | 89 | } |
91 | 90 | else { |
92 | -// cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl; | |
91 | + // cerr << "FOUND " << foundTransition->label << " " << foundTransition->targetOffset << endl; | |
93 | 92 | const unsigned char* nextStatePointer = this->startPtr + foundTransition->targetOffset; |
94 | 93 | const StateData* nextStateData = reinterpret_cast<const StateData*> (nextStatePointer); |
95 | 94 | if (nextStateData->accepting) { |
96 | - cerr << "ACCEPTING" << endl; | |
95 | +// cerr << "ACCEPTING" << endl; | |
97 | 96 | T object; |
98 | 97 | int size = this->deserializer.deserialize(nextStatePointer + sizeof (StateData), object); |
99 | 98 | state.setNext(foundTransition->targetOffset, object, size); |
100 | - } | |
101 | - else { | |
99 | + } else { | |
102 | 100 | state.setNext(foundTransition->targetOffset); |
103 | 101 | } |
104 | 102 | } |
105 | 103 | } |
106 | 104 | |
105 | +template <class T> | |
106 | +bool FSA<T>::tryToRecognize(const char* input, T& value) const { | |
107 | + State<T> currState = this->getInitialState(); | |
108 | + int i = 0; | |
109 | + while (!currState.isSink() && input[i] != '\0') { | |
110 | + currState.proceedToNext(input[i]); | |
111 | + i++; | |
112 | + } | |
113 | + if (currState.isAccepting()) { | |
114 | + value = currState.getValue(); | |
115 | + return true; | |
116 | + } else { | |
117 | + return false; | |
118 | + } | |
119 | +} | |
120 | + | |
121 | +template <class T> | |
122 | +State<T> FSA<T>::getInitialState() const { | |
123 | + return State<T>(*this); | |
124 | +} | |
125 | + | |
107 | 126 | #endif /* _SIMPLE_FSA_IMPL_HPP */ |
... | ... |
fsa/fsa.hpp
... | ... | @@ -54,25 +54,9 @@ public: |
54 | 54 | /** |
55 | 55 | * Get this automaton's initial state. |
56 | 56 | */ |
57 | - State<T> getInitialState() const { | |
58 | - return State<T>(*this); | |
59 | - } | |
57 | + State<T> getInitialState() const; | |
60 | 58 | |
61 | - bool tryToRecognize(const char* input, T& value) const { | |
62 | - State<T> currState = this->getInitialState(); | |
63 | - int i = 0; | |
64 | - while (!currState.isSink() && input[i] != '\0') { | |
65 | - currState.proceedToNext(input[i]); | |
66 | - i++; | |
67 | - } | |
68 | - if (currState.isAccepting()) { | |
69 | - value = currState.getValue(); | |
70 | - return true; | |
71 | - } | |
72 | - else { | |
73 | - return false; | |
74 | - } | |
75 | - } | |
59 | + bool tryToRecognize(const char* input, T& value) const; | |
76 | 60 | |
77 | 61 | virtual ~FSA() { |
78 | 62 | } |
... | ... | @@ -83,10 +67,6 @@ protected: |
83 | 67 | } |
84 | 68 | /** |
85 | 69 | * Proceed to next state |
86 | - * | |
87 | - * @param fromPointer - wskaźnik | |
88 | - * @param c - char for the transition. | |
89 | - * @return next state | |
90 | 70 | */ |
91 | 71 | virtual void proceedToNext(const char c, State<T>& state) const = 0; |
92 | 72 | const unsigned char* startPtr; |
... | ... | @@ -107,7 +87,7 @@ private: |
107 | 87 | |
108 | 88 | }; |
109 | 89 | |
110 | -#include "_simple_fsa_impl.hpp" | |
90 | +#include "_fsa_impl.hpp" | |
111 | 91 | |
112 | 92 | /** |
113 | 93 | * A state in an FSA. |
... | ... |
fsa/test_dict.cpp
... | ... | @@ -14,34 +14,12 @@ |
14 | 14 | #include <functional> |
15 | 15 | #include <cctype> |
16 | 16 | #include <locale> |
17 | +#include <vector> | |
17 | 18 | #include "fsa.hpp" |
19 | +#include "utils.hpp" | |
18 | 20 | |
19 | 21 | using namespace std; |
20 | 22 | |
21 | -void validate(const bool cond, const string& msg) { | |
22 | - if (!cond) { | |
23 | - cerr << msg << endl; | |
24 | - exit(1); | |
25 | - } | |
26 | -} | |
27 | - | |
28 | -unsigned char* readFile(const char* fname) { | |
29 | - ifstream ifs; | |
30 | - ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | |
31 | - ifs.open(fname, ios::in | ios::binary | ios::ate); | |
32 | - // if (ifs.is_open()) { | |
33 | - int size = ifs.tellg(); | |
34 | - unsigned char* memblock = new unsigned char [size]; | |
35 | - ifs.seekg(0, ios::beg); | |
36 | - ifs.read(reinterpret_cast<char*> (memblock), size); | |
37 | - ifs.close(); | |
38 | - return memblock; | |
39 | - // } | |
40 | - // else { | |
41 | - // cerr << "Unable to open file " << fname << endl; | |
42 | - // } | |
43 | -} | |
44 | - | |
45 | 23 | std::vector<std::string> &split(const std::string &s, char delim, std::vector<std::string> &elems) { |
46 | 24 | std::stringstream ss(s); |
47 | 25 | std::string item; |
... | ... |
fsa/test_speed.cpp
0 → 100644
1 | +/* | |
2 | + * File: test_speed.cpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 24 październik 2013, 17:47 | |
6 | + */ | |
7 | + | |
8 | +#include <cstdlib> | |
9 | +#include <fstream> | |
10 | +#include "fsa.hpp" | |
11 | +#include "utils.hpp" | |
12 | + | |
13 | +#define NDEBUG | |
14 | + | |
15 | +using namespace std; | |
16 | + | |
17 | +/* | |
18 | + * | |
19 | + */ | |
20 | +int main(int argc, char** argv) { | |
21 | + validate(argc == 3, "Must provide exactly two arguments - FSA filename and test data filename."); | |
22 | + const unsigned char* fsaData = readFile(argv[1]); | |
23 | + StringDeserializer deserializer; | |
24 | + SimpleFSA<char*> fsa(fsaData, deserializer); | |
25 | + ifstream ifs; | |
26 | +// ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit); | |
27 | + ifs.open(argv[2], ios::binary); | |
28 | + string line; | |
29 | + while (getline(ifs, line)) { | |
30 | + char* val; | |
31 | + if (fsa.tryToRecognize(line.c_str(), val)) { | |
32 | +// printf("%s: *OK*\n", line.c_str()); | |
33 | + } | |
34 | + else { | |
35 | +// printf("%s: NOT FOUND", line.c_str()); | |
36 | + } | |
37 | + } | |
38 | + return 0; | |
39 | +} | |
40 | + | |
... | ... |
fsa/utils.hpp
0 → 100644
1 | +/* | |
2 | + * File: utils.hpp | |
3 | + * Author: mlenart | |
4 | + * | |
5 | + * Created on 24 październik 2013, 17:56 | |
6 | + */ | |
7 | + | |
8 | +#ifndef UTILS_HPP | |
9 | +#define UTILS_HPP | |
10 | + | |
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
13 | + | |
/**
 * Abort the program when a required condition does not hold.
 *
 * Declared inline because it is defined in a header; a non-inline definition
 * would break the one-definition rule as soon as two translation units of
 * the same program include this file.
 *
 * @param cond condition that must be true for execution to continue.
 * @param msg  message written to standard error when `cond` is false.
 *
 * When `cond` is false the process terminates with exit status 1.
 */
inline void validate(const bool cond, const std::string& msg) {
    if (!cond) {
        std::cerr << msg << std::endl;
        std::exit(1);
    }
}
20 | + | |
/**
 * Read a whole file into a newly allocated buffer.
 *
 * Declared inline because it is defined in a header. All stream names are
 * qualified with std:: so the header does not depend on a using-directive
 * in whichever file includes it.
 *
 * @param fname path of the file to read (opened in binary mode).
 * @return pointer to a buffer allocated with new[] holding the file's bytes;
 *         the caller owns it and must release it with delete[].
 * @throws std::ios_base::failure if the file cannot be opened or read
 *         (failbit/badbit exceptions are enabled on the stream).
 */
inline unsigned char* readFile(const char* fname) {
    std::ifstream ifs;
    ifs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
    // Open positioned at the end so tellg() reports the file size.
    ifs.open(fname, std::ios::in | std::ios::binary | std::ios::ate);
    // Keep the full stream offset; an int would truncate large sizes.
    const std::streamoff size = ifs.tellg();
    unsigned char* memblock = new unsigned char[static_cast<std::size_t>(size)];
    try {
        ifs.seekg(0, std::ios::beg);
        ifs.read(reinterpret_cast<char*>(memblock), static_cast<std::streamsize>(size));
        ifs.close();
    } catch (...) {
        // Do not leak the buffer when seeking or reading throws.
        delete[] memblock;
        throw;
    }
    return memblock;
}
37 | + | |
38 | +#endif /* UTILS_HPP */ | |
39 | + | |
... | ... |
fsabuilder/fsa/buildfsa.py
... | ... | @@ -24,6 +24,7 @@ class OutputFormat(): |
24 | 24 | class InputFormat(): |
25 | 25 | ENCODED = 'ENCODED' |
26 | 26 | POLIMORF = 'POLIMORF' |
27 | + PLAIN = 'PLAIN' | |
27 | 28 | |
28 | 29 | def parseOptions(): |
29 | 30 | """ |
... | ... | @@ -58,7 +59,7 @@ def parseOptions(): |
58 | 59 | if not opts.outputFormat.upper() in [OutputFormat.BINARY, OutputFormat.CPP]: |
59 | 60 | print >> sys.stderr, 'output format must be one of ('+str([OutputFormat.BINARY, OutputFormat.CPP])+')' |
60 | 61 | exit(1) |
61 | - if not opts.inputFormat.upper() in [InputFormat.ENCODED, InputFormat.POLIMORF]: | |
62 | + if not opts.inputFormat.upper() in [InputFormat.ENCODED, InputFormat.POLIMORF, InputFormat.PLAIN]: | |
62 | 63 | print >> sys.stderr, 'input format must be one of ('+str([InputFormat.ENCODED, InputFormat.POLIMORF])+')' |
63 | 64 | exit(1) |
64 | 65 | return opts |
... | ... | @@ -74,15 +75,23 @@ def readPolimorfInput(inputFile, encoder): |
74 | 75 | for entry in convertinput.convertPolimorf(f.readlines(), lambda (word, interp): encoder.word2SortKey(word)): |
75 | 76 | yield entry |
76 | 77 | |
def readPlainInput(inputFile, encoder):
    """
    Read a plain word list (one word per line, UTF-8) and yield its entries
    in the order defined by encoder.word2SortKey.

    Lines are stripped before sorting so the sort key is computed from the
    word itself rather than from the raw line with its trailing newline.
    Blank lines are skipped instead of being emitted as empty words.

    Yields (word, '') pairs; the value part is always an empty string.
    """
    with codecs.open(inputFile, 'r', 'utf8') as f:
        words = [line.strip() for line in f.readlines()]
    for word in sorted((w for w in words if w), key=encoder.word2SortKey):
        yield word, ''
83 | + | |
77 | 84 | if __name__ == '__main__': |
78 | 85 | opts = parseOptions() |
79 | 86 | encoder = encode.Encoder() |
80 | 87 | fsa = FSA(encoder) |
81 | 88 | serializer = SimpleSerializerWithStringValues() |
82 | 89 | |
83 | - inputData = readEncodedInput(opts.inputFile) \ | |
84 | - if opts.inputFormat == InputFormat.ENCODED \ | |
85 | - else readPolimorfInput(opts.inputFile, encoder) | |
90 | + inputData = { | |
91 | + InputFormat.ENCODED: readEncodedInput(opts.inputFile), | |
92 | + InputFormat.POLIMORF: readPolimorfInput(opts.inputFile, encoder), | |
93 | + InputFormat.PLAIN: readPlainInput(opts.inputFile, encoder) | |
94 | + }[opts.inputFormat] | |
86 | 95 | |
87 | 96 | logging.info('feeding FSA with data ...') |
88 | 97 | fsa.feed(inputData) |
... | ... |
nbproject/configurations.xml
... | ... | @@ -2,10 +2,11 @@ |
2 | 2 | <configurationDescriptor version="90"> |
3 | 3 | <logicalFolder name="root" displayName="root" projectFiles="true" kind="ROOT"> |
4 | 4 | <df root="fsa" name="0"> |
5 | - <in>_state_impl.hpp</in> | |
5 | + <in>test_dict.cpp</in> | |
6 | + <in>test_speed.cpp</in> | |
7 | + <in>utils.hpp</in> | |
6 | 8 | </df> |
7 | 9 | <df root="morfeusz" name="1"> |
8 | - <in>default_fsa.hpp</in> | |
9 | 10 | <in>main.cpp</in> |
10 | 11 | <in>morfeusz.cpp</in> |
11 | 12 | </df> |
... | ... | @@ -38,17 +39,30 @@ |
38 | 39 | <buildCommand>${MAKE} -f Makefile</buildCommand> |
39 | 40 | <cleanCommand>${MAKE} -f Makefile clean</cleanCommand> |
40 | 41 | <executablePath></executablePath> |
41 | - <ccTool> | |
42 | - <incDir> | |
43 | - <pElem>fsa</pElem> | |
44 | - <pElem>build/morfeusz</pElem> | |
45 | - </incDir> | |
46 | - </ccTool> | |
47 | 42 | </makeTool> |
48 | 43 | </makefileType> |
49 | - <item path="fsa/_state_impl.hpp" ex="false" tool="3" flavor2="0"> | |
44 | + <folder path="0"> | |
45 | + <ccTool> | |
46 | + <incDir> | |
47 | + <pElem>build/fsa</pElem> | |
48 | + </incDir> | |
49 | + </ccTool> | |
50 | + </folder> | |
51 | + <folder path="1"> | |
52 | + <ccTool> | |
53 | + <incDir> | |
54 | + <pElem>fsa</pElem> | |
55 | + <pElem>build/morfeusz</pElem> | |
56 | + </incDir> | |
57 | + </ccTool> | |
58 | + </folder> | |
59 | + <item path="fsa/test_dict.cpp" ex="false" tool="1" flavor2="8"> | |
60 | + <ccTool> | |
61 | + </ccTool> | |
62 | + </item> | |
63 | + <item path="fsa/test_speed.cpp" ex="false" tool="1" flavor2="0"> | |
50 | 64 | </item> |
51 | - <item path="morfeusz/default_fsa.hpp" ex="false" tool="3" flavor2="0"> | |
65 | + <item path="fsa/utils.hpp" ex="false" tool="3" flavor2="0"> | |
52 | 66 | </item> |
53 | 67 | <item path="morfeusz/main.cpp" ex="false" tool="1" flavor2="8"> |
54 | 68 | <ccTool> |
... | ... |