Commit 5e5f3286b0a583c8dbf208c92550152abca79030
1 parent
1d174433
- poprawiona obsługa słowników w morfeuszbuilderze (popoprawiane opcje)
- poprawki testów (by uwzględniały nową obsługę słowników) git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/trunk@256 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
12 changed files
with
180 additions
and
137 deletions
fsabuilder/morfeusz_builder
... | ... | @@ -23,8 +23,8 @@ def _checkOption(opt, parser, msg): |
23 | 23 | parser.print_help() |
24 | 24 | exit(1) |
25 | 25 | |
26 | -def _checkExactlyOneOptionSet(optsList, parser, msg): | |
27 | - if optsList.count(True) != 1: | |
26 | +def _checkCondition(cond, parser, msg): | |
27 | + if not cond: | |
28 | 28 | print >> sys.stderr, msg |
29 | 29 | parser.print_help() |
30 | 30 | exit(1) |
... | ... | @@ -42,6 +42,11 @@ def _checkOpen(filename, mode): |
42 | 42 | print >> sys.stderr, str(ex) |
43 | 43 | exit(1) |
44 | 44 | |
45 | +def _getDictFilename(opts, isGenerator): | |
46 | + typeCode = 's' if isGenerator else 'a' | |
47 | + fname = '%s-%s.dict' % (opts.dictName, typeCode) | |
48 | + return os.path.join(opts.dictDir, fname) | |
49 | + | |
45 | 50 | def _parseOptions(): |
46 | 51 | """ |
47 | 52 | Parses commandline args |
... | ... | @@ -53,7 +58,7 @@ def _parseOptions(): |
53 | 58 | action='callback', |
54 | 59 | callback=_parseListCallback, |
55 | 60 | metavar='FILES', |
56 | - help='comma separated list of files') | |
61 | + help='comma separated list of dictionary files') | |
57 | 62 | parser.add_option('--tagset-file', |
58 | 63 | dest='tagsetFile', |
59 | 64 | metavar='FILE', |
... | ... | @@ -62,37 +67,45 @@ def _parseOptions(): |
62 | 67 | dest='segmentsFile', |
63 | 68 | metavar='FILE', |
64 | 69 | help='path to the file with segment rules') |
65 | - parser.add_option('--trim-supneg', | |
66 | - dest='trimSupneg', | |
67 | - default=False, | |
68 | - action='store_true', | |
69 | - help='this option is ignored and exists only for backwards compatibility') | |
70 | - parser.add_option('-o', '--output-file', | |
71 | - dest='outputFile', | |
70 | + #~ parser.add_option('--trim-supneg', | |
71 | + #~ dest='trimSupneg', | |
72 | + #~ default=False, | |
73 | + #~ action='store_true', | |
74 | + #~ help='this option is ignored and exists only for backwards compatibility') | |
75 | + parser.add_option('--dict-name', | |
76 | + dest='dictName', | |
77 | + help='the name of result dictionary') | |
78 | + parser.add_option('--dict-dir', | |
79 | + dest='dictDir', | |
72 | 80 | metavar='FILE', |
73 | - help='path to output file') | |
74 | - parser.add_option('-a', '--analyzer', | |
75 | - dest='analyzer', | |
76 | - action='store_true', | |
77 | - default=False, | |
78 | - help='Generate FSA for morphological analysis') | |
79 | - parser.add_option('-g', '--generator', | |
80 | - dest='generator', | |
81 | - action='store_true', | |
82 | - default=False, | |
83 | - help='Generate FSA for morphological synthesis') | |
84 | - parser.add_option('--cpp', | |
85 | - dest='cpp', | |
81 | + default=os.getcwd(), | |
82 | + help='path to output directory (the default is current dir)') | |
83 | + parser.add_option('--only-analyzer', | |
84 | + dest='onlyAnalyzer', | |
86 | 85 | action='store_true', |
87 | 86 | default=False, |
88 | - help='Encode binary data in c++ file') | |
89 | - parser.add_option('--use-arrays', | |
90 | - dest='useArrays', | |
87 | + help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)') | |
88 | + parser.add_option('--only-generator', | |
89 | + dest='onlyGenerator', | |
91 | 90 | action='store_true', |
92 | 91 | default=False, |
93 | - help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)') | |
92 | + help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)') | |
93 | + parser.add_option('--analyzer-cpp', | |
94 | + dest='analyzerCpp', | |
95 | + metavar='FILE', | |
96 | + help='Encode analyzer dictionary data in given c++ file') | |
97 | + parser.add_option('--generator-cpp', | |
98 | + dest='generatorCpp', | |
99 | + metavar='FILE', | |
100 | + help='Encode generator dictionary data in given c++ file') | |
101 | + #~ parser.add_option('--use-arrays', | |
102 | + #~ dest='useArrays', | |
103 | + #~ action='store_true', | |
104 | + #~ default=False, | |
105 | + #~ help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)') | |
94 | 106 | parser.add_option('--serialization-method', |
95 | 107 | dest='serializationMethod', |
108 | + default='V1', | |
96 | 109 | help="FSA serialization method: \ |
97 | 110 | SIMPLE - fixed-length transitions, fastest and weakest compression \ |
98 | 111 | V1 - variable-length transitions, compressed labels - strongest compression \ |
... | ... | @@ -102,9 +115,12 @@ def _parseOptions(): |
102 | 115 | #~ action='store_true', |
103 | 116 | #~ default=False, |
104 | 117 | #~ help='visualize result') |
105 | - parser.add_option('--train-file', | |
106 | - dest='trainFile', | |
107 | - help='A text file used for training. Should contain words from some large corpus - one word in each line') | |
118 | + parser.add_option('--analyzer-train-file', | |
119 | + dest='analyzerTrainFile', | |
120 | + help='A text file used for analyzer training. Should contain words from some large corpus - one word in each line. Resulting analysis automaton should be faster with proper train file.') | |
121 | + parser.add_option('--generator-train-file', | |
122 | + dest='generatorTrainFile', | |
123 | + help='A text file used for generator training. Should contain words from some large corpus - one word in each line. Resulting synthesis automaton should be faster with proper train file.') | |
108 | 124 | parser.add_option('--debug', |
109 | 125 | dest='debug', |
110 | 126 | action='store_true', |
... | ... | @@ -119,22 +135,33 @@ def _parseOptions(): |
119 | 135 | opts, args = parser.parse_args() |
120 | 136 | |
121 | 137 | _checkOption(opts.inputFiles, parser, "Input file is missing") |
122 | - _checkOption(opts.outputFile, parser, "Output file is missing") | |
138 | + _checkOption(opts.dictDir, parser, "Output dictionary dir is missing") | |
139 | + _checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True), | |
140 | + parser, 'Cannot set both --only-analyzer and --only-generator') | |
141 | + writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None} | |
142 | + _checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing") | |
143 | + _checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing") | |
144 | + _checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing") | |
145 | + #~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None), | |
146 | + #~ parser, 'Must set at least one of: --dict-name, --output-cpp') | |
147 | + #~ _checkOption(opts.outputFile, parser, "Output file is missing") | |
123 | 148 | _checkOption(opts.tagsetFile, parser, "Tagset file is missing") |
124 | - _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") | |
125 | - _checkExactlyOneOptionSet([opts.analyzer, opts.generator], | |
126 | - parser, 'Must set exactly one FSA type: --analyzer or --generator') | |
149 | + _checkOption(opts.segmentsFile, parser, "Segmentation file is missing") | |
150 | + #~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing") | |
151 | + #~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator], | |
152 | + #~ parser, 'Must set exactly one FSA type: --analyzer or --generator') | |
127 | 153 | |
128 | 154 | _checkOpen(opts.tagsetFile, 'r') |
155 | + _checkOpen(opts.segmentsFile, 'r') | |
129 | 156 | for filename in opts.inputFiles: |
130 | 157 | _checkOpen(filename, 'r') |
131 | - _checkOpen(opts.outputFile, 'w') | |
132 | - _checkOption(opts.segmentsFile, parser, "Segment rules file is missing") | |
133 | - if opts.analyzer: | |
134 | - _checkOpen(opts.segmentsFile, 'r') | |
158 | + if not opts.onlyGenerator: | |
159 | + _checkOpen(_getDictFilename(opts, isGenerator=False), 'w') | |
160 | + if not opts.onlyAnalyzer: | |
161 | + _checkOpen(_getDictFilename(opts, isGenerator=True), 'w') | |
135 | 162 | |
136 | - if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]: | |
137 | - print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')' | |
163 | + if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]: | |
164 | + print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')' | |
138 | 165 | parser.print_help() |
139 | 166 | exit(1) |
140 | 167 | |
... | ... | @@ -211,49 +238,43 @@ def buildGeneratorFromPoliMorf(inputFiles, tagset, segmentRulesManager): |
211 | 238 | _printStats(fsa) |
212 | 239 | return fsa, encoder.qualifiersMap |
213 | 240 | |
241 | +def _doBuildDictionaryPart(opts, isGenerator): | |
242 | + tagset = Tagset(opts.tagsetFile) | |
243 | + rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR | |
244 | + segmentRulesManager = rulesParser.RulesParser(tagset, rulesParserVersion).parse(opts.segmentsFile) | |
245 | + fsa, qualifiersMap = (buildGeneratorFromPoliMorf if isGenerator else buildAnalyzerFromPoliMorf)(opts.inputFiles, tagset, segmentRulesManager) | 
246 | + segmentationRulesData = segmentRulesManager.serialize() | |
247 | + | |
248 | + trainFile = opts.generatorTrainFile if isGenerator else opts.analyzerTrainFile | 
249 | + if trainFile: | 
250 | + logging.info('training with '+trainFile+' ...') | 
251 | + fsa.train(_readTrainData(trainFile)) | 
252 | + logging.info('done training') | 
253 | + serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData) | |
254 | + if opts.generatorCpp and isGenerator: | |
255 | + serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator) | |
256 | + if opts.analyzerCpp and not isGenerator: | |
257 | + serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator) | |
258 | + | |
259 | + if opts.dictDir: | |
260 | + serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator) | |
261 | + | |
262 | + logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset)) | |
263 | + | |
214 | 264 | def main(opts): |
215 | 265 | if opts.debug: |
216 | 266 | logging.basicConfig(level=logging.DEBUG) |
217 | 267 | else: |
218 | 268 | logging.basicConfig(level=logging.INFO) |
219 | 269 | |
220 | - if opts.analyzer: | |
221 | - logging.info('*** building analyzer ***') | |
222 | - else: | |
223 | - logging.info('*** building generator ***') | |
224 | - | |
225 | 270 | logging.info('reading tagset from %s', opts.tagsetFile) |
226 | 271 | tagset = Tagset(opts.tagsetFile) |
227 | - rulesType = rulesParser.RulesParser.PARSE4ANALYZER if opts.analyzer else rulesParser.RulesParser.PARSE4GENERATOR | |
228 | - segmentRulesManager = rulesParser.RulesParser(tagset, rulesType).parse(opts.segmentsFile) | |
229 | - segmentationRulesData = segmentRulesManager.serialize() | |
230 | 272 | |
231 | - if opts.analyzer: | |
232 | - fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager) | |
233 | - else: | |
234 | - fsa, qualifiersMap = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager) | |
235 | - if opts.trainFile: | |
236 | - logging.info('training with '+opts.trainFile+' ...') | |
237 | - fsa.train(_readTrainData(opts.trainFile)) | |
238 | - logging.info('done training') | |
239 | - | |
240 | -# serializer = { | |
241 | -# SerializationMethod.SIMPLE: SimpleSerializer, | |
242 | -# SerializationMethod.V1: VLengthSerializer1, | |
243 | -# SerializationMethod.V2: VLengthSerializer2, | |
244 | -# }[opts.serializationMethod](fsa) | |
245 | - serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData) | |
246 | - | |
247 | - if opts.cpp: | |
248 | - serializer.serialize2CppFile(opts.outputFile, isGenerator=opts.generator) | |
249 | - else: | |
250 | - serializer.serialize2BinaryFile(opts.outputFile, isGenerator=opts.generator) | |
251 | - | |
252 | - logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset)) | |
253 | -# { | |
254 | -# OutputFormat.CPP: serializer.serialize2CppFile, | |
255 | -# OutputFormat.BINARY: serializer.serialize2BinaryFile | |
256 | -# }[opts.outputFormat](opts.outputFile) | |
273 | + if not opts.onlyGenerator: | |
274 | + _doBuildDictionaryPart(opts, isGenerator=False) | |
275 | + | |
276 | + if not opts.onlyAnalyzer: | 
277 | + _doBuildDictionaryPart(opts, isGenerator=True) | |
257 | 278 | |
258 | 279 | if __name__ == '__main__': |
259 | 280 | import os |
... | ... |
morfeusz/CMakeLists.txt
... | ... | @@ -6,21 +6,15 @@ if (SKIP_DICTIONARY_BUILDING) |
6 | 6 | message ("SKIPPING dictionary building") |
7 | 7 | else () |
8 | 8 | add_custom_command ( |
9 | - OUTPUT "${ANALYZER_DICTIONARY_CPP}" | |
10 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${ANALYZER_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 | |
11 | - DEPENDS "${INPUT_DICTIONARY}" | |
12 | - COMMENT "Building default dictionary C++ file" | |
13 | - ) | |
14 | - add_custom_command ( | |
15 | - OUTPUT "${GENERATOR_DICTIONARY_CPP}" | |
16 | - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${GENERATOR_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1 | |
9 | + OUTPUT "${ANALYZER_DICTIONARY_CPP}" "${GENERATOR_DICTIONARY_CPP}" | |
10 | + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --input-files="${INPUT_DICTIONARIES}" --analyzer-cpp="${ANALYZER_DICTIONARY_CPP}" --generator-cpp="${GENERATOR_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" | |
17 | 11 | DEPENDS "${INPUT_DICTIONARY}" |
18 | 12 | COMMENT "Building default dictionary C++ file" |
19 | 13 | ) |
20 | 14 | endif() |
21 | 15 | |
22 | -add_custom_target ( analyzer-dictionary DEPENDS "${INPUT_DICTIONARY_CPP}") | |
23 | -add_custom_target ( generator-dictionary DEPENDS "${INPUT_SYNTH_DICTIONARY_CPP}") | |
16 | +add_custom_target ( analyzer-dictionary DEPENDS "${ANALYZER_DICTIONARY_CPP}") | |
17 | +add_custom_target ( generator-dictionary DEPENDS "${GENERATOR_DICTIONARY_CPP}") | |
24 | 18 | add_custom_target ( dictionaries DEPENDS analyzer-dictionary generator-dictionary) |
25 | 19 | |
26 | 20 | include_directories( ${CMAKE_CURRENT_SOURCE_DIR} ) |
... | ... |
morfeusz/Dictionary.cpp
... | ... | @@ -33,8 +33,8 @@ namespace morfeusz { |
33 | 33 | } |
34 | 34 | |
35 | 35 | Dictionary::Dictionary(const unsigned char* fsaFileStartPtr, MorfeuszProcessorType processorType) |
36 | - : idResolver(fsaFileStartPtr, &UTF8CharsetConverter::getInstance()), | |
37 | - fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), | |
36 | + : fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))), | |
37 | + idResolver(fsaFileStartPtr, &UTF8CharsetConverter::getInstance()), | |
38 | 38 | separatorsList(getSeparatorsList(fsaFileStartPtr)), |
39 | 39 | segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)), |
40 | 40 | defaultSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)), |
... | ... |
morfeusz/Dictionary.hpp
... | ... | @@ -24,9 +24,9 @@ namespace morfeusz { |
24 | 24 | |
25 | 25 | struct Dictionary { |
26 | 26 | Dictionary(const unsigned char* ptr, MorfeuszProcessorType processorType); |
27 | - | |
28 | - IdResolverImpl idResolver; | |
27 | + | |
29 | 28 | FSAType* fsa; |
29 | + IdResolverImpl idResolver; | |
30 | 30 | std::vector<uint32_t> separatorsList; |
31 | 31 | std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap; |
32 | 32 | SegrulesOptions defaultSegrulesOptions; |
... | ... |
morfeusz/Environment.cpp
... | ... | @@ -173,8 +173,8 @@ namespace morfeusz { |
173 | 173 | } |
174 | 174 | |
175 | 175 | void Environment::setDictionary(const std::string& dictName) { |
176 | + cerr << "SETTING DICT: " << dictName << endl; | |
176 | 177 | this->dictionary = DictionariesRepository::instance.getDictionary(dictName, this->processorType); |
177 | - | |
178 | 178 | idResolver = dictionary->idResolver; |
179 | 179 | this->idResolver.setCharsetConverter(currentCharsetConverter); |
180 | 180 | currSegrulesOptions = dictionary->defaultSegrulesOptions; |
... | ... |
morfeusz/MorfeuszImpl.cpp
... | ... | @@ -156,14 +156,6 @@ namespace morfeusz { |
156 | 156 | return getAnyEnvironment().getAvailablePraetOptions(); |
157 | 157 | } |
158 | 158 | |
159 | - // void MorfeuszImpl::setAnalyzerDictionary(const string& filename) { | |
160 | - // this->analyzerEnv.setDictionaryFile(filename); | |
161 | - // } | |
162 | - // | |
163 | - // void MorfeuszImpl::setGeneratorDictionary(const string& filename) { | |
164 | - // this->generatorEnv.setDictionaryFile(filename); | |
165 | - // } | |
166 | - | |
167 | 159 | MorfeuszImpl::~MorfeuszImpl() { |
168 | 160 | } |
169 | 161 | |
... | ... |
morfeusz/tests/TestMorfeusz.cpp
... | ... | @@ -101,27 +101,28 @@ void TestMorfeusz::testAnalyzeVector1() { |
101 | 101 | CPPUNIT_ASSERT_EQUAL(string("AAAAbbbbCCCC"), res[0].lemma); |
102 | 102 | } |
103 | 103 | |
104 | -static inline string prepareErrorneusTmpFile() { | |
105 | - char* filename = tmpnam(NULL); | |
104 | +static inline string prepareErrorneusDictFile(const string& dictName) { | |
105 | + string filename = dictName + "-a.dict"; | |
106 | 106 | ofstream out; |
107 | - out.open(filename); | |
107 | + out.open(filename.c_str()); | |
108 | 108 | out << "asfasdfa" << endl; |
109 | - out.close(); | |
110 | - return string(filename); | |
109 | + return filename; | |
111 | 110 | } |
112 | 111 | |
113 | -void TestMorfeusz::testOpenInvalidFile() { | |
114 | - CPPUNIT_FAIL("not implemented yet"); | |
115 | -// cerr << "testOpenInvalidFile" << endl; | |
116 | -// string filename(prepareErrorneusTmpFile()); | |
117 | -// CPPUNIT_ASSERT_THROW(morfeusz->setAnalyzerDictionary(filename), FileFormatException); | |
112 | +void TestMorfeusz::testOpenInvalidDict() { | |
113 | + cerr << "testOpenInvalidDict" << endl; | |
114 | + string dictName = "asdfasdfasdfa"; | |
115 | + string filename = prepareErrorneusDictFile(dictName); | |
116 | + morfeusz->dictionarySearchPaths.push_front("."); | |
117 | + cerr << "still alive..." << endl; | |
118 | + CPPUNIT_ASSERT_THROW(morfeusz->setDictionary(dictName), FileFormatException); | |
119 | + remove(filename.c_str()); | |
118 | 120 | } |
119 | 121 | |
120 | -void TestMorfeusz::testOpenNonExistentFile() { | |
121 | - CPPUNIT_FAIL("not implemented yet"); | |
122 | +void TestMorfeusz::testOpenNonExistentDict() { | |
122 | 123 | // cerr << "testOpenNonExistentFile" << endl; |
123 | 124 | // string filename(tmpnam(NULL)); |
124 | -// CPPUNIT_ASSERT_THROW(morfeusz->setAnalyzerDictionary(filename), std::ios_base::failure); | |
125 | + CPPUNIT_ASSERT_THROW(morfeusz->setDictionary("asdfasdfa"), MorfeuszException); | |
125 | 126 | } |
126 | 127 | |
127 | 128 | void TestMorfeusz::testSetInvalidAgglOption() { |
... | ... |
morfeusz/tests/TestMorfeusz.hpp
... | ... | @@ -19,8 +19,8 @@ class TestMorfeusz : public CPPUNIT_NS::TestFixture { |
19 | 19 | CPPUNIT_TEST(testAnalyzeIterateWithWhitespaceHandlingKEEP); |
20 | 20 | CPPUNIT_TEST(testAnalyzeIterateWithWhitespaceHandlingAPPEND); |
21 | 21 | CPPUNIT_TEST(testAnalyzeVector1); |
22 | - CPPUNIT_TEST(testOpenInvalidFile); | |
23 | - CPPUNIT_TEST(testOpenNonExistentFile); | |
22 | + CPPUNIT_TEST(testOpenInvalidDict); | |
23 | + CPPUNIT_TEST(testOpenNonExistentDict); | |
24 | 24 | CPPUNIT_TEST(testSetInvalidAgglOption); |
25 | 25 | CPPUNIT_TEST(testSetInvalidPraetOption); |
26 | 26 | CPPUNIT_TEST(testWhitespaceHandlingKEEP); |
... | ... | @@ -39,8 +39,8 @@ private: |
39 | 39 | void testAnalyzeIterateWithWhitespaceHandlingKEEP(); |
40 | 40 | void testAnalyzeIterateWithWhitespaceHandlingAPPEND(); |
41 | 41 | void testAnalyzeVector1(); |
42 | - void testOpenInvalidFile(); | |
43 | - void testOpenNonExistentFile(); | |
42 | + void testOpenInvalidDict(); | |
43 | + void testOpenNonExistentDict(); | |
44 | 44 | void testSetInvalidAgglOption(); |
45 | 45 | void testSetInvalidPraetOption(); |
46 | 46 | void testWhitespaceHandlingKEEP(); |
... | ... |
morfeusz/wrappers/java/JMorfeuszTest.java
1 | 1 | |
2 | 2 | import java.io.File; |
3 | 3 | import java.io.IOException; |
4 | +import java.io.PrintStream; | |
4 | 5 | import java.util.List; |
5 | 6 | import java.util.NoSuchElementException; |
6 | 7 | import org.junit.After; |
... | ... | @@ -52,12 +53,11 @@ public class JMorfeuszTest { |
52 | 53 | try { |
53 | 54 | res.get(2); |
54 | 55 | fail(); |
55 | - } | |
56 | - catch (IndexOutOfBoundsException ex) { | |
57 | - | |
56 | + } catch (IndexOutOfBoundsException ex) { | |
57 | + | |
58 | 58 | } |
59 | 59 | } |
60 | - | |
60 | + | |
61 | 61 | @Test |
62 | 62 | public void testAnalyzeAsIterator() { |
63 | 63 | ResultsIterator it = morfeusz.analyseAsIterator("Aaaa żżżż"); |
... | ... | @@ -68,9 +68,8 @@ public class JMorfeuszTest { |
68 | 68 | try { |
69 | 69 | it.next(); |
70 | 70 | fail(); |
71 | - } | |
72 | - catch (NoSuchElementException ex) { | |
73 | - | |
71 | + } catch (NoSuchElementException ex) { | |
72 | + | |
74 | 73 | } |
75 | 74 | } |
76 | 75 | |
... | ... | @@ -78,7 +77,7 @@ public class JMorfeuszTest { |
78 | 77 | public void testInvalidAgglOption() { |
79 | 78 | morfeusz.setAggl("XXXXYYYYZZZZ"); |
80 | 79 | } |
81 | - | |
80 | + | |
82 | 81 | @Test(expected = MorfeuszException.class) |
83 | 82 | public void testInvalidPraetOption() { |
84 | 83 | morfeusz.setPraet("XXXXYYYYZZZZ"); |
... | ... | @@ -94,18 +93,21 @@ public class JMorfeuszTest { |
94 | 93 | morfeusz.setCaseHandling(null); |
95 | 94 | } |
96 | 95 | |
97 | - @Test(expected = IOException.class) | |
98 | - public void testNonExistingDictionaryFile() throws IOException { | |
99 | - fail("not implemented yet"); | |
100 | -// File tmpFile = File.createTempFile("morfeusz_invalid_dict", ".test"); | |
101 | -// tmpFile.delete(); | |
102 | -// morfeusz.setGeneratorDictionary(tmpFile.getAbsolutePath()); | |
96 | + @Test(expected = MorfeuszException.class) | |
97 | + public void testNonExistingDictionary() throws IOException { | |
98 | + morfeusz.setDictionary("ee2rmtsq"); | |
103 | 99 | } |
104 | 100 | |
105 | 101 | @Test(expected = IOException.class) |
106 | - public void testInvalidDictionaryFile() throws IOException { | |
107 | - fail("not implemented yet"); | |
108 | -// File tmpFile = File.createTempFile("morfeusz_invalid_dict", ".test"); | |
109 | -// morfeusz.setGeneratorDictionary(tmpFile.getAbsolutePath()); | |
102 | + public void testInvalidDictionary() throws Exception { | |
103 | + String dictName = "6J1vMiqY"; | |
104 | + File tmpFile = new File(dictName + "-a.dict"); | |
105 | + assertTrue(tmpFile.createNewFile()); | |
106 | + tmpFile.deleteOnExit(); | |
107 | + try (PrintStream out = new PrintStream(tmpFile)) { | |
108 | + out.print("IzEne9FXuc"); | |
109 | + } | |
110 | + morfeusz.getDictionarySearchPaths().add(0, "."); | |
111 | + morfeusz.setDictionary(dictName); | |
110 | 112 | } |
111 | 113 | } |
... | ... |
morfeusz/wrappers/morfeusz_java.i
... | ... | @@ -147,11 +147,21 @@ import java.util.ArrayList; |
147 | 147 | jenv->ThrowNew(clazz, "Invalid file format"); |
148 | 148 | return $null; |
149 | 149 | } |
150 | + catch(morfeusz::MorfeuszException & e) { | |
151 | + jclass clazz = jenv->FindClass("pl/waw/ipipan/morfeusz/MorfeuszException"); | |
152 | + jenv->ThrowNew(clazz, e.what()); | |
153 | + return $null; | |
154 | + } | |
150 | 155 | catch(std::ios_base::failure & e) { |
151 | 156 | jclass clazz = jenv->FindClass("java/io/IOException"); |
152 | 157 | jenv->ThrowNew(clazz, e.what()); |
153 | 158 | return $null; |
154 | 159 | } |
160 | + catch(...) { | |
161 | + jclass clazz = jenv->FindClass("java/lang/RuntimeException"); | |
162 | + jenv->ThrowNew(clazz, "Unknown exception"); | |
163 | + return $null; | |
164 | + } | |
155 | 165 | } |
156 | 166 | |
157 | 167 | //%javaexception("java.io.IOException") morfeusz::Morfeusz::setGeneratorDictionary { |
... | ... |
morfeusz/wrappers/morfeusz_python.i
... | ... | @@ -23,6 +23,27 @@ |
23 | 23 | } |
24 | 24 | } |
25 | 25 | |
26 | +%exception morfeusz::Morfeusz::setDictionary { | |
27 | + try{ | |
28 | + $action | |
29 | + } | |
30 | + catch(const std::ios_base::failure& e) { | |
31 | + SWIG_exception(SWIG_IOError, const_cast<char*>(e.what())); | |
32 | + } | |
33 | + catch(const morfeusz::MorfeuszException& e) { | |
34 | + SWIG_exception(SWIG_IOError, const_cast<char*>(e.what())); | |
35 | + } | |
36 | + catch(const std::invalid_argument& e) { | |
37 | + SWIG_exception(SWIG_ValueError, const_cast<char*>(e.what())); | |
38 | + } | |
39 | + catch(const std::string& e) { | |
40 | + SWIG_exception(SWIG_RuntimeError, const_cast<char*>(e.c_str())); | |
41 | + } | |
42 | + catch(...) { | |
43 | + SWIG_exception(SWIG_RuntimeError, "Unknown exception"); | |
44 | + } | |
45 | +} | |
46 | + | |
26 | 47 | %ignore morfeusz::MorfeuszException; |
27 | 48 | %ignore morfeusz::FileFormatException; |
28 | 49 | |
... | ... |
morfeusz/wrappers/python/test.py
... | ... | @@ -76,18 +76,20 @@ class TestSequenceFunctions(unittest.TestCase): |
76 | 76 | pass |
77 | 77 | |
78 | 78 | def testNonExistingDictionaryFile(self): |
79 | - _, path = tempfile.mkstemp() | |
80 | - os.remove(path) | |
81 | 79 | try: |
82 | - self.morfeusz.setGeneratorDictionary(path) | |
80 | + self.morfeusz.setDictionary("1P4sEBuWv") | |
83 | 81 | self.fail() |
84 | 82 | except IOError: |
85 | 83 | pass |
86 | 84 | |
87 | 85 | def testInvalidDictionaryFile(self): |
88 | - _, path = tempfile.mkstemp() | |
86 | + dirpath = tempfile.mkdtemp() | |
87 | + dictName = '6J1vMiqY' | |
88 | + path = os.path.join(dirpath, dictName + '-a.dict') | |
89 | + with open(path, "a+") as f: | |
90 | + f.write('ee2rmtsq') | |
89 | 91 | try: |
90 | - self.morfeusz.setGeneratorDictionary(path) | |
92 | + self.morfeusz.setDictionary(dictName) | |
91 | 93 | self.fail() |
92 | 94 | except IOError: |
93 | 95 | pass |
... | ... |