Commit 5e5f3286b0a583c8dbf208c92550152abca79030

Authored by Michał Lenart
1 parent 1d174433

- fixed dictionary handling in morfeusz_builder (cleaned up the command-line options; a usage sketch follows below)

- test fixes (so that they account for the new dictionary handling)

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/trunk@256 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
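
In short (a sketch, not the builder code itself): instead of a single -o output file plus --analyzer/--generator switches, morfeusz_builder now takes --dict-name and --dict-dir and writes the analyzer dictionary to <dict-name>-a.dict and the generator dictionary to <dict-name>-s.dict, with --only-analyzer/--only-generator restricting the run to one of the two. The snippet below mirrors _getDictFilename and the new _checkCondition constraint from the diff; 'sgjp' and '/tmp/dicts' are example values only.

    # Minimal sketch of the new naming scheme and flag constraint (illustrative,
    # not the builder's actual code; 'sgjp' and '/tmp/dicts' are example values).
    import os.path

    def dict_filename(dict_dir, dict_name, is_generator):
        # analyzer output ends in "-a.dict", generator output in "-s.dict"
        type_code = 's' if is_generator else 'a'
        return os.path.join(dict_dir, '%s-%s.dict' % (dict_name, type_code))

    def check_only_flags(only_analyzer, only_generator):
        # --only-analyzer and --only-generator are mutually exclusive
        if only_analyzer and only_generator:
            raise ValueError('Cannot set both --only-analyzer and --only-generator')

    check_only_flags(only_analyzer=True, only_generator=False)
    print(dict_filename('/tmp/dicts', 'sgjp', is_generator=False))  # /tmp/dicts/sgjp-a.dict
    print(dict_filename('/tmp/dicts', 'sgjp', is_generator=True))   # /tmp/dicts/sgjp-s.dict
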
fsabuilder/morfeusz_builder
... ... @@ -23,8 +23,8 @@ def _checkOption(opt, parser, msg):
23 23 parser.print_help()
24 24 exit(1)
25 25  
26   -def _checkExactlyOneOptionSet(optsList, parser, msg):
27   - if optsList.count(True) != 1:
  26 +def _checkCondition(cond, parser, msg):
  27 + if not cond:
28 28 print >> sys.stderr, msg
29 29 parser.print_help()
30 30 exit(1)
... ... @@ -42,6 +42,11 @@ def _checkOpen(filename, mode):
42 42 print >> sys.stderr, str(ex)
43 43 exit(1)
44 44  
  45 +def _getDictFilename(opts, isGenerator):
  46 + typeCode = 's' if isGenerator else 'a'
  47 + fname = '%s-%s.dict' % (opts.dictName, typeCode)
  48 + return os.path.join(opts.dictDir, fname)
  49 +
45 50 def _parseOptions():
46 51 """
47 52 Parses commandline args
... ... @@ -53,7 +58,7 @@ def _parseOptions():
53 58 action='callback',
54 59 callback=_parseListCallback,
55 60 metavar='FILES',
56   - help='comma separated list of files')
  61 + help='comma separated list of dictionary files')
57 62 parser.add_option('--tagset-file',
58 63 dest='tagsetFile',
59 64 metavar='FILE',
... ... @@ -62,37 +67,45 @@ def _parseOptions():
62 67 dest='segmentsFile',
63 68 metavar='FILE',
64 69 help='path to the file with segment rules')
65   - parser.add_option('--trim-supneg',
66   - dest='trimSupneg',
67   - default=False,
68   - action='store_true',
69   - help='this option is ignored and exists only for backwards compatibility')
70   - parser.add_option('-o', '--output-file',
71   - dest='outputFile',
  70 + #~ parser.add_option('--trim-supneg',
  71 + #~ dest='trimSupneg',
  72 + #~ default=False,
  73 + #~ action='store_true',
  74 + #~ help='this option is ignored and exists only for backwards compatibility')
  75 + parser.add_option('--dict-name',
  76 + dest='dictName',
  77 + help='the name of result dictionary')
  78 + parser.add_option('--dict-dir',
  79 + dest='dictDir',
72 80 metavar='FILE',
73   - help='path to output file')
74   - parser.add_option('-a', '--analyzer',
75   - dest='analyzer',
76   - action='store_true',
77   - default=False,
78   - help='Generate FSA for morphological analysis')
79   - parser.add_option('-g', '--generator',
80   - dest='generator',
81   - action='store_true',
82   - default=False,
83   - help='Generate FSA for morphological synthesis')
84   - parser.add_option('--cpp',
85   - dest='cpp',
  81 + default=os.getcwd(),
  82 + help='path to output directory (the default is current dir)')
  83 + parser.add_option('--only-analyzer',
  84 + dest='onlyAnalyzer',
86 85 action='store_true',
87 86 default=False,
88   - help='Encode binary data in c++ file')
89   - parser.add_option('--use-arrays',
90   - dest='useArrays',
  87 + help='Generate dictionary for morphological analysis only (default is both analysis and synthesis)')
  88 + parser.add_option('--only-generator',
  89 + dest='onlyGenerator',
91 90 action='store_true',
92 91 default=False,
93   - help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
  92 + help='Generate dictionary for morphological synthesis only (default is both analysis and synthesis)')
  93 + parser.add_option('--analyzer-cpp',
  94 + dest='analyzerCpp',
  95 + metavar='FILE',
  96 + help='Encode analyzer dictionary data in given c++ file')
  97 + parser.add_option('--generator-cpp',
  98 + dest='generatorCpp',
  99 + metavar='FILE',
  100 + help='Encode generator dictionary data in given c++ file')
  101 + #~ parser.add_option('--use-arrays',
  102 + #~ dest='useArrays',
  103 + #~ action='store_true',
  104 + #~ default=False,
  105 + #~ help='store states reachable by 2 transitions in arrays (should speed up recognition, available only when --serialization-method=V1)')
94 106 parser.add_option('--serialization-method',
95 107 dest='serializationMethod',
  108 + default='V1',
96 109 help="FSA serialization method: \
97 110 SIMPLE - fixed-length transitions, fastest and weakest compression \
98 111 V1 - variable-length transitions, compressed labels - strongest compression \
... ... @@ -102,9 +115,12 @@ def _parseOptions():
102 115 #~ action='store_true',
103 116 #~ default=False,
104 117 #~ help='visualize result')
105   - parser.add_option('--train-file',
106   - dest='trainFile',
107   - help='A text file used for training. Should contain words from some large corpus - one word in each line')
  118 + parser.add_option('--analyzer-train-file',
  119 + dest='analyzerTrainFile',
  120 + help='A text file used for analyzer training. Should contain words from some large corpus - one word in each line. Resulting analysis automaton should be faster with proper train file.')
  121 + parser.add_option('--generator-train-file',
  122 + dest='generatorTrainFile',
  123 + help='A text file used for generator training. Should contain words from some large corpus - one word in each line. Resulting synthesis automaton should be faster with proper train file.')
108 124 parser.add_option('--debug',
109 125 dest='debug',
110 126 action='store_true',
... ... @@ -119,22 +135,33 @@ def _parseOptions():
119 135 opts, args = parser.parse_args()
120 136  
121 137 _checkOption(opts.inputFiles, parser, "Input file is missing")
122   - _checkOption(opts.outputFile, parser, "Output file is missing")
  138 + _checkOption(opts.dictDir, parser, "Output dictionary dir is missing")
  139 + _checkCondition((opts.onlyAnalyzer, opts.onlyGenerator) != (True, True),
  140 + parser, 'Cannot set both --only-analyzer and --only-generator')
  141 + writeCpp = {opts.analyzerCpp, opts.generatorCpp} != {None}
  142 + _checkCondition(opts.dictName or writeCpp, parser, "Dictionary name is missing")
  143 + _checkCondition(opts.onlyGenerator or opts.analyzerCpp or not writeCpp, parser, "Analyzer .cpp output file path is missing")
  144 + _checkCondition(opts.onlyAnalyzer or opts.generatorCpp or not writeCpp, parser, "Generator .cpp output file path is missing")
  145 + #~ _checkCondition((opts.dictName, opts.outputCpp) != (None, None),
  146 + #~ parser, 'Must set at least one of: --dict-name, --output-cpp')
  147 + #~ _checkOption(opts.outputFile, parser, "Output file is missing")
123 148 _checkOption(opts.tagsetFile, parser, "Tagset file is missing")
124   - _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
125   - _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
126   - parser, 'Must set exactly one FSA type: --analyzer or --generator')
  149 + _checkOption(opts.segmentsFile, parser, "Segmentation file is missing")
  150 + #~ _checkOption(opts.serializationMethod, parser, "Serialization method file is missing")
  151 + #~ _checkExactlyOneOptionSet([opts.analyzer, opts.generator],
  152 + #~ parser, 'Must set exactly one FSA type: --analyzer or --generator')
127 153  
128 154 _checkOpen(opts.tagsetFile, 'r')
  155 + _checkOpen(opts.segmentsFile, 'r')
129 156 for filename in opts.inputFiles:
130 157 _checkOpen(filename, 'r')
131   - _checkOpen(opts.outputFile, 'w')
132   - _checkOption(opts.segmentsFile, parser, "Segment rules file is missing")
133   - if opts.analyzer:
134   - _checkOpen(opts.segmentsFile, 'r')
  158 + if not opts.onlyGenerator:
  159 + _checkOpen(_getDictFilename(opts, isGenerator=False), 'w')
  160 + if not opts.onlyAnalyzer:
  161 + _checkOpen(_getDictFilename(opts, isGenerator=True), 'w')
135 162  
136   - if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2]:
137   - print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1, SerializationMethod.V2])+')'
  163 + if not opts.serializationMethod.upper() in [SerializationMethod.SIMPLE, SerializationMethod.V1]:
  164 + print >> sys.stderr, '--serialization-method must be one of ('+str([SerializationMethod.SIMPLE, SerializationMethod.V1])+')'
138 165 parser.print_help()
139 166 exit(1)
140 167  
... ... @@ -211,49 +238,43 @@ def buildGeneratorFromPoliMorf(inputFiles, tagset, segmentRulesManager):
211 238 _printStats(fsa)
212 239 return fsa, encoder.qualifiersMap
213 240  
  241 +def _doBuildDictionaryPart(opts, isGenerator):
  242 + tagset = Tagset(opts.tagsetFile)
  243 + rulesParserVersion = rulesParser.RulesParser.PARSE4ANALYZER if not isGenerator else rulesParser.RulesParser.PARSE4GENERATOR
  244 + segmentRulesManager = rulesParser.RulesParser(tagset, rulesParserVersion).parse(opts.segmentsFile)
  245 + fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager)
  246 + segmentationRulesData = segmentRulesManager.serialize()
  247 +
  248 + if opts.analyzerTrainFile:
  249 + logging.info('training with '+opts.analyzerTrainFile+' ...')
  250 + fsa.train(_readTrainData(opts.analyzerTrainFile))
  251 + logging.info('done training')
  252 +
  253 + serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData)
  254 + if opts.generatorCpp and isGenerator:
  255 + serializer.serialize2CppFile(opts.generatorCpp, isGenerator=isGenerator)
  256 + if opts.analyzerCpp and not isGenerator:
  257 + serializer.serialize2CppFile(opts.analyzerCpp, isGenerator=isGenerator)
  258 +
  259 + if opts.dictDir:
  260 + serializer.serialize2BinaryFile(_getDictFilename(opts, isGenerator=isGenerator), isGenerator=isGenerator)
  261 +
  262 + logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
  263 +
214 264 def main(opts):
215 265 if opts.debug:
216 266 logging.basicConfig(level=logging.DEBUG)
217 267 else:
218 268 logging.basicConfig(level=logging.INFO)
219 269  
220   - if opts.analyzer:
221   - logging.info('*** building analyzer ***')
222   - else:
223   - logging.info('*** building generator ***')
224   -
225 270 logging.info('reading tagset from %s', opts.tagsetFile)
226 271 tagset = Tagset(opts.tagsetFile)
227   - rulesType = rulesParser.RulesParser.PARSE4ANALYZER if opts.analyzer else rulesParser.RulesParser.PARSE4GENERATOR
228   - segmentRulesManager = rulesParser.RulesParser(tagset, rulesType).parse(opts.segmentsFile)
229   - segmentationRulesData = segmentRulesManager.serialize()
230 272  
231   - if opts.analyzer:
232   - fsa, qualifiersMap = buildAnalyzerFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager)
233   - else:
234   - fsa, qualifiersMap = buildGeneratorFromPoliMorf(opts.inputFiles, tagset, segmentRulesManager)
235   - if opts.trainFile:
236   - logging.info('training with '+opts.trainFile+' ...')
237   - fsa.train(_readTrainData(opts.trainFile))
238   - logging.info('done training')
239   -
240   -# serializer = {
241   -# SerializationMethod.SIMPLE: SimpleSerializer,
242   -# SerializationMethod.V1: VLengthSerializer1,
243   -# SerializationMethod.V2: VLengthSerializer2,
244   -# }[opts.serializationMethod](fsa)
245   - serializer = Serializer.getSerializer(opts.serializationMethod, fsa, tagset, qualifiersMap, segmentationRulesData)
246   -
247   - if opts.cpp:
248   - serializer.serialize2CppFile(opts.outputFile, isGenerator=opts.generator)
249   - else:
250   - serializer.serialize2BinaryFile(opts.outputFile, isGenerator=opts.generator)
251   -
252   - logging.info('total FSA size (in bytes): '+str(fsa.initialState.reverseOffset))
253   -# {
254   -# OutputFormat.CPP: serializer.serialize2CppFile,
255   -# OutputFormat.BINARY: serializer.serialize2BinaryFile
256   -# }[opts.outputFormat](opts.outputFile)
  273 + if not opts.onlyGenerator:
  274 + _doBuildDictionaryPart(opts, isGenerator=False)
  275 +
  276 +    if not opts.onlyAnalyzer:
  277 + _doBuildDictionaryPart(opts, isGenerator=True)
257 278  
258 279 if __name__ == '__main__':
259 280 import os
... ...
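
Reading the new flow end to end (a condensed paraphrase, not the exact code above): main() now builds up to two dictionary parts in a single run, and each part goes through the same pipeline inside _doBuildDictionaryPart. The outline below assumes the generator pass is meant to be skipped only when --only-analyzer is set, mirroring the analyzer side.

    # Condensed outline of the two-pass build; the numbered comments summarize
    # what _doBuildDictionaryPart does for each part (see the diff above).
    def build_part(opts, is_generator):
        # 1. parse segmentation rules in the analyzer or generator variant
        # 2. build the FSA from --input-files
        # 3. optionally train it with --analyzer-train-file / --generator-train-file
        # 4. dump C++ data when --analyzer-cpp / --generator-cpp is given
        # 5. write the binary <dict-name>-a.dict or <dict-name>-s.dict into --dict-dir
        pass

    def run(opts):
        if not opts.onlyGenerator:
            build_part(opts, is_generator=False)   # analyzer part
        if not opts.onlyAnalyzer:
            build_part(opts, is_generator=True)    # generator part
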
morfeusz/CMakeLists.txt
... ... @@ -6,21 +6,15 @@ if (SKIP_DICTIONARY_BUILDING)
6 6 message ("SKIPPING dictionary building")
7 7 else ()
8 8 add_custom_command (
9   - OUTPUT "${ANALYZER_DICTIONARY_CPP}"
10   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --analyzer --input-files="${INPUT_DICTIONARIES}" -o "${ANALYZER_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
11   - DEPENDS "${INPUT_DICTIONARY}"
12   - COMMENT "Building default dictionary C++ file"
13   - )
14   - add_custom_command (
15   - OUTPUT "${GENERATOR_DICTIONARY_CPP}"
16   - COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --generator --input-files="${INPUT_DICTIONARIES}" -o "${GENERATOR_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}" --cpp --serialization-method=V1
  9 + OUTPUT "${ANALYZER_DICTIONARY_CPP}" "${GENERATOR_DICTIONARY_CPP}"
  10 + COMMAND python ${PROJECT_SOURCE_DIR}/fsabuilder/morfeusz_builder --input-files="${INPUT_DICTIONARIES}" --analyzer-cpp="${ANALYZER_DICTIONARY_CPP}" --generator-cpp="${GENERATOR_DICTIONARY_CPP}" --tagset-file="${INPUT_TAGSET}" --segments-file="${SEGMENT_RULES_FILE}"
17 11 DEPENDS "${INPUT_DICTIONARY}"
18 12 COMMENT "Building default dictionary C++ file"
19 13 )
20 14 endif()
21 15  
22   -add_custom_target ( analyzer-dictionary DEPENDS "${INPUT_DICTIONARY_CPP}")
23   -add_custom_target ( generator-dictionary DEPENDS "${INPUT_SYNTH_DICTIONARY_CPP}")
  16 +add_custom_target ( analyzer-dictionary DEPENDS "${ANALYZER_DICTIONARY_CPP}")
  17 +add_custom_target ( generator-dictionary DEPENDS "${GENERATOR_DICTIONARY_CPP}")
24 18 add_custom_target ( dictionaries DEPENDS analyzer-dictionary generator-dictionary)
25 19  
26 20 include_directories( ${CMAKE_CURRENT_SOURCE_DIR} )
... ...
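
The two per-dictionary custom commands collapse into a single builder invocation that produces both C++ files at once (note that --cpp, -o and --serialization-method disappear from the command line). A rough standalone equivalent, with made-up paths standing in for the CMake variables:

    # Rough equivalent of the merged custom command; the paths are placeholders
    # for ${INPUT_DICTIONARIES}, ${ANALYZER_DICTIONARY_CPP}, ${INPUT_TAGSET}, etc.
    import subprocess

    subprocess.check_call([
        'python', 'fsabuilder/morfeusz_builder',
        '--input-files=input/dictionary.tab',
        '--analyzer-cpp=build/analyzer_dictionary.cpp',
        '--generator-cpp=build/generator_dictionary.cpp',
        '--tagset-file=input/morfeusz.tagset',
        '--segments-file=input/segment_rules.dat',
    ])
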
morfeusz/Dictionary.cpp
... ... @@ -33,8 +33,8 @@ namespace morfeusz {
33 33 }
34 34  
35 35 Dictionary::Dictionary(const unsigned char* fsaFileStartPtr, MorfeuszProcessorType processorType)
36   - : idResolver(fsaFileStartPtr, &UTF8CharsetConverter::getInstance()),
37   - fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
  36 + : fsa(FSAType::getFSA(fsaFileStartPtr, initializeDeserializer(processorType))),
  37 + idResolver(fsaFileStartPtr, &UTF8CharsetConverter::getInstance()),
38 38 separatorsList(getSeparatorsList(fsaFileStartPtr)),
39 39 segrulesFSAsMap(createSegrulesFSAsMap(fsaFileStartPtr)),
40 40 defaultSegrulesOptions(getDefaultSegrulesOptions(fsaFileStartPtr)),
... ...
morfeusz/Dictionary.hpp
... ... @@ -24,9 +24,9 @@ namespace morfeusz {
24 24  
25 25 struct Dictionary {
26 26 Dictionary(const unsigned char* ptr, MorfeuszProcessorType processorType);
27   -
28   - IdResolverImpl idResolver;
  27 +
29 28 FSAType* fsa;
  29 + IdResolverImpl idResolver;
30 30 std::vector<uint32_t> separatorsList;
31 31 std::map<SegrulesOptions, SegrulesFSA*> segrulesFSAsMap;
32 32 SegrulesOptions defaultSegrulesOptions;
... ...
morfeusz/Environment.cpp
... ... @@ -173,8 +173,8 @@ namespace morfeusz {
173 173 }
174 174  
175 175 void Environment::setDictionary(const std::string& dictName) {
  176 + cerr << "SETTING DICT: " << dictName << endl;
176 177 this->dictionary = DictionariesRepository::instance.getDictionary(dictName, this->processorType);
177   -
178 178 idResolver = dictionary->idResolver;
179 179 this->idResolver.setCharsetConverter(currentCharsetConverter);
180 180 currSegrulesOptions = dictionary->defaultSegrulesOptions;
... ...
morfeusz/MorfeuszImpl.cpp
... ... @@ -156,14 +156,6 @@ namespace morfeusz {
156 156 return getAnyEnvironment().getAvailablePraetOptions();
157 157 }
158 158  
159   - // void MorfeuszImpl::setAnalyzerDictionary(const string& filename) {
160   - // this->analyzerEnv.setDictionaryFile(filename);
161   - // }
162   - //
163   - // void MorfeuszImpl::setGeneratorDictionary(const string& filename) {
164   - // this->generatorEnv.setDictionaryFile(filename);
165   - // }
166   -
167 159 MorfeuszImpl::~MorfeuszImpl() {
168 160 }
169 161  
... ...
morfeusz/tests/TestMorfeusz.cpp
... ... @@ -101,27 +101,28 @@ void TestMorfeusz::testAnalyzeVector1() {
101 101 CPPUNIT_ASSERT_EQUAL(string("AAAAbbbbCCCC"), res[0].lemma);
102 102 }
103 103  
104   -static inline string prepareErrorneusTmpFile() {
105   - char* filename = tmpnam(NULL);
  104 +static inline string prepareErrorneusDictFile(const string& dictName) {
  105 + string filename = dictName + "-a.dict";
106 106 ofstream out;
107   - out.open(filename);
  107 + out.open(filename.c_str());
108 108 out << "asfasdfa" << endl;
109   - out.close();
110   - return string(filename);
  109 + return filename;
111 110 }
112 111  
113   -void TestMorfeusz::testOpenInvalidFile() {
114   - CPPUNIT_FAIL("not implemented yet");
115   -// cerr << "testOpenInvalidFile" << endl;
116   -// string filename(prepareErrorneusTmpFile());
117   -// CPPUNIT_ASSERT_THROW(morfeusz->setAnalyzerDictionary(filename), FileFormatException);
  112 +void TestMorfeusz::testOpenInvalidDict() {
  113 + cerr << "testOpenInvalidDict" << endl;
  114 + string dictName = "asdfasdfasdfa";
  115 + string filename = prepareErrorneusDictFile(dictName);
  116 + morfeusz->dictionarySearchPaths.push_front(".");
  117 + cerr << "still alive..." << endl;
  118 + CPPUNIT_ASSERT_THROW(morfeusz->setDictionary(dictName), FileFormatException);
  119 + remove(filename.c_str());
118 120 }
119 121  
120   -void TestMorfeusz::testOpenNonExistentFile() {
121   - CPPUNIT_FAIL("not implemented yet");
  122 +void TestMorfeusz::testOpenNonExistentDict() {
122 123 // cerr << "testOpenNonExistentFile" << endl;
123 124 // string filename(tmpnam(NULL));
124   -// CPPUNIT_ASSERT_THROW(morfeusz->setAnalyzerDictionary(filename), std::ios_base::failure);
  125 + CPPUNIT_ASSERT_THROW(morfeusz->setDictionary("asdfasdfa"), MorfeuszException);
125 126 }
126 127  
127 128 void TestMorfeusz::testSetInvalidAgglOption() {
... ...
morfeusz/tests/TestMorfeusz.hpp
... ... @@ -19,8 +19,8 @@ class TestMorfeusz : public CPPUNIT_NS::TestFixture {
19 19 CPPUNIT_TEST(testAnalyzeIterateWithWhitespaceHandlingKEEP);
20 20 CPPUNIT_TEST(testAnalyzeIterateWithWhitespaceHandlingAPPEND);
21 21 CPPUNIT_TEST(testAnalyzeVector1);
22   - CPPUNIT_TEST(testOpenInvalidFile);
23   - CPPUNIT_TEST(testOpenNonExistentFile);
  22 + CPPUNIT_TEST(testOpenInvalidDict);
  23 + CPPUNIT_TEST(testOpenNonExistentDict);
24 24 CPPUNIT_TEST(testSetInvalidAgglOption);
25 25 CPPUNIT_TEST(testSetInvalidPraetOption);
26 26 CPPUNIT_TEST(testWhitespaceHandlingKEEP);
... ... @@ -39,8 +39,8 @@ private:
39 39 void testAnalyzeIterateWithWhitespaceHandlingKEEP();
40 40 void testAnalyzeIterateWithWhitespaceHandlingAPPEND();
41 41 void testAnalyzeVector1();
42   - void testOpenInvalidFile();
43   - void testOpenNonExistentFile();
  42 + void testOpenInvalidDict();
  43 + void testOpenNonExistentDict();
44 44 void testSetInvalidAgglOption();
45 45 void testSetInvalidPraetOption();
46 46 void testWhitespaceHandlingKEEP();
... ...
morfeusz/wrappers/java/JMorfeuszTest.java
1 1  
2 2 import java.io.File;
3 3 import java.io.IOException;
  4 +import java.io.PrintStream;
4 5 import java.util.List;
5 6 import java.util.NoSuchElementException;
6 7 import org.junit.After;
... ... @@ -52,12 +53,11 @@ public class JMorfeuszTest {
52 53 try {
53 54 res.get(2);
54 55 fail();
55   - }
56   - catch (IndexOutOfBoundsException ex) {
57   -
  56 + } catch (IndexOutOfBoundsException ex) {
  57 +
58 58 }
59 59 }
60   -
  60 +
61 61 @Test
62 62 public void testAnalyzeAsIterator() {
63 63 ResultsIterator it = morfeusz.analyseAsIterator("Aaaa żżżż");
... ... @@ -68,9 +68,8 @@ public class JMorfeuszTest {
68 68 try {
69 69 it.next();
70 70 fail();
71   - }
72   - catch (NoSuchElementException ex) {
73   -
  71 + } catch (NoSuchElementException ex) {
  72 +
74 73 }
75 74 }
76 75  
... ... @@ -78,7 +77,7 @@ public class JMorfeuszTest {
78 77 public void testInvalidAgglOption() {
79 78 morfeusz.setAggl("XXXXYYYYZZZZ");
80 79 }
81   -
  80 +
82 81 @Test(expected = MorfeuszException.class)
83 82 public void testInvalidPraetOption() {
84 83 morfeusz.setPraet("XXXXYYYYZZZZ");
... ... @@ -94,18 +93,21 @@ public class JMorfeuszTest {
94 93 morfeusz.setCaseHandling(null);
95 94 }
96 95  
97   - @Test(expected = IOException.class)
98   - public void testNonExistingDictionaryFile() throws IOException {
99   - fail("not implemented yet");
100   -// File tmpFile = File.createTempFile("morfeusz_invalid_dict", ".test");
101   -// tmpFile.delete();
102   -// morfeusz.setGeneratorDictionary(tmpFile.getAbsolutePath());
  96 + @Test(expected = MorfeuszException.class)
  97 + public void testNonExistingDictionary() throws IOException {
  98 + morfeusz.setDictionary("ee2rmtsq");
103 99 }
104 100  
105 101 @Test(expected = IOException.class)
106   - public void testInvalidDictionaryFile() throws IOException {
107   - fail("not implemented yet");
108   -// File tmpFile = File.createTempFile("morfeusz_invalid_dict", ".test");
109   -// morfeusz.setGeneratorDictionary(tmpFile.getAbsolutePath());
  102 + public void testInvalidDictionary() throws Exception {
  103 + String dictName = "6J1vMiqY";
  104 + File tmpFile = new File(dictName + "-a.dict");
  105 + assertTrue(tmpFile.createNewFile());
  106 + tmpFile.deleteOnExit();
  107 + try (PrintStream out = new PrintStream(tmpFile)) {
  108 + out.print("IzEne9FXuc");
  109 + }
  110 + morfeusz.getDictionarySearchPaths().add(0, ".");
  111 + morfeusz.setDictionary(dictName);
110 112 }
111 113 }
... ...
morfeusz/wrappers/morfeusz_java.i
... ... @@ -147,11 +147,21 @@ import java.util.ArrayList;
147 147 jenv->ThrowNew(clazz, "Invalid file format");
148 148 return $null;
149 149 }
  150 + catch(morfeusz::MorfeuszException & e) {
  151 + jclass clazz = jenv->FindClass("pl/waw/ipipan/morfeusz/MorfeuszException");
  152 + jenv->ThrowNew(clazz, e.what());
  153 + return $null;
  154 + }
150 155 catch(std::ios_base::failure & e) {
151 156 jclass clazz = jenv->FindClass("java/io/IOException");
152 157 jenv->ThrowNew(clazz, e.what());
153 158 return $null;
154 159 }
  160 + catch(...) {
  161 + jclass clazz = jenv->FindClass("java/lang/RuntimeException");
  162 + jenv->ThrowNew(clazz, "Unknown exception");
  163 + return $null;
  164 + }
155 165 }
156 166  
157 167 //%javaexception("java.io.IOException") morfeusz::Morfeusz::setGeneratorDictionary {
... ...
morfeusz/wrappers/morfeusz_python.i
... ... @@ -23,6 +23,27 @@
23 23 }
24 24 }
25 25  
  26 +%exception morfeusz::Morfeusz::setDictionary {
  27 + try{
  28 + $action
  29 + }
  30 + catch(const std::ios_base::failure& e) {
  31 + SWIG_exception(SWIG_IOError, const_cast<char*>(e.what()));
  32 + }
  33 + catch(const morfeusz::MorfeuszException& e) {
  34 + SWIG_exception(SWIG_IOError, const_cast<char*>(e.what()));
  35 + }
  36 + catch(const std::invalid_argument& e) {
  37 + SWIG_exception(SWIG_ValueError, const_cast<char*>(e.what()));
  38 + }
  39 + catch(const std::string& e) {
  40 + SWIG_exception(SWIG_RuntimeError, const_cast<char*>(e.c_str()));
  41 + }
  42 + catch(...) {
  43 + SWIG_exception(SWIG_RuntimeError, "Unknown exception");
  44 + }
  45 +}
  46 +
26 47 %ignore morfeusz::MorfeuszException;
27 48 %ignore morfeusz::FileFormatException;
28 49  
... ...
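
With the %exception block above, failures inside Morfeusz::setDictionary reach Python as ordinary exceptions: std::ios_base::failure and MorfeuszException map to IOError, std::invalid_argument to ValueError, and anything else to RuntimeError. A small usage sketch (the morfeusz argument stands for a Morfeusz instance, as used in test.py below):

    # Illustrative helper, not part of the wrappers: relies only on setDictionary
    # and the exception mapping defined above.
    def try_set_dictionary(morfeusz, dict_name):
        try:
            morfeusz.setDictionary(dict_name)
            return True
        except (IOError, ValueError) as e:
            # missing/corrupted dictionaries and bad arguments end up here
            print('could not load dictionary %r: %s' % (dict_name, e))
            return False
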
morfeusz/wrappers/python/test.py
... ... @@ -76,18 +76,20 @@ class TestSequenceFunctions(unittest.TestCase):
76 76 pass
77 77  
78 78 def testNonExistingDictionaryFile(self):
79   - _, path = tempfile.mkstemp()
80   - os.remove(path)
81 79 try:
82   - self.morfeusz.setGeneratorDictionary(path)
  80 + self.morfeusz.setDictionary("1P4sEBuWv")
83 81 self.fail()
84 82 except IOError:
85 83 pass
86 84  
87 85 def testInvalidDictionaryFile(self):
88   - _, path = tempfile.mkstemp()
  86 + dirpath = tempfile.mkdtemp()
  87 + dictName = '6J1vMiqY'
  88 + path = os.path.join(dirpath, dictName + '-a.dict')
  89 + with open(path, "a+") as f:
  90 + f.write('ee2rmtsq')
89 91 try:
90   - self.morfeusz.setGeneratorDictionary(path)
  92 + self.morfeusz.setDictionary(dictName)
91 93 self.fail()
92 94 except IOError:
93 95 pass
... ...