Commit d7a3318a05cf145e814613f9380fa988d1cd3c52
1 parent
b9b7d313
poprawiony wrapper pythonowy
git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/trunk@353 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
Showing
1 changed file
with
155 additions
and
6 deletions
morfeusz/wrappers/morfeusz_python.i
... | ... | @@ -55,6 +55,8 @@ |
55 | 55 | } |
56 | 56 | }; |
57 | 57 | |
58 | +%rename (_Morfeusz) morfeusz::Morfeusz; | |
59 | + | |
58 | 60 | //%ignore morfeusz::Morfeusz::createInstance(morfeusz::MorfeuszUsage); |
59 | 61 | %extend morfeusz::Morfeusz { |
60 | 62 | std::vector<morfeusz::MorphInterpretation> morfeusz::Morfeusz::_generateByTagId(const std::string& lemma, int tagId) const { |
... | ... | @@ -87,7 +89,7 @@ def analyse(self, text): |
87 | 89 | Analyse given text and return a list of MorphInterpretation objects. |
88 | 90 | """ |
89 | 91 | res = InterpsList() |
90 | - $action(self, text, res) | |
92 | + $action(self, text.encode('utf-8'), res) | |
91 | 93 | return res |
92 | 94 | %} |
93 | 95 | |
... | ... | @@ -216,9 +218,10 @@ def getTag(self, morfeusz): |
216 | 218 | %feature("shadow") morfeusz::MorphInterpretation::getName %{ |
217 | 219 | def getName(self, morfeusz): |
218 | 220 | """ |
219 | - Returns this interpretation named entity as string | |
221 | + Returns this interpretation's named entities as a list of unicode strings (empty list if there are none) | |
220 | 222 | """ |
221 | - return $action(self, morfeusz) | |
223 | + name = $action(self, morfeusz).decode('utf8') | |
224 | + return name.split('|') if name else [] | |
222 | 225 | %} |
223 | 226 | |
224 | 227 | %feature("shadow") morfeusz::MorphInterpretation::getLabelsAsString %{ |
... | ... | @@ -232,9 +235,9 @@ def getLabelsAsUnicode(self, morfeusz): |
232 | 235 | %feature("shadow") morfeusz::MorphInterpretation::getLabels %{ |
233 | 236 | def getLabels(self, morfeusz): |
234 | 237 | """ |
235 | - Returns this interpretation labels as a set of strings | |
238 | + Returns this interpretation labels as a list of strings | |
236 | 239 | """ |
237 | - return { l.decode('utf8') for l in $action(self, morfeusz) } | |
240 | + return [l.decode('utf8') for l in $action(self, morfeusz)] | |
238 | 241 | %} |
239 | 242 | |
240 | 243 | %feature("shadow") morfeusz::MorphInterpretation::createIgn %{ |
... | ... | @@ -282,7 +285,7 @@ def getLabelsAsUnicode(self, labelsId): |
282 | 285 | |
283 | 286 | %feature("shadow") morfeusz::IdResolver::getLabels %{ |
284 | 287 | def getLabels(self, labelsId): |
285 | - return { l.decode('utf8') for l in $action(self, labelsId) } | |
288 | + return [l.decode('utf8') for l in $action(self, labelsId)] | |
286 | 289 | %} |
287 | 290 | |
288 | 291 | %feature("shadow") morfeusz::IdResolver::getLabelsId %{ |
... | ... | @@ -290,6 +293,152 @@ def getLabelsId(self, labelsStr): |
290 | 293 | return $action(self, labelsStr.encode('utf8')) |
291 | 294 | %} |
292 | 295 | |
296 | +%pythoncode %{ | |
297 | +import collections | |
298 | +import os.path | |
299 | + | |
# Copied here explicitly, because this code gets pasted at the beginning
# instead of at the end (presumably of the SWIG-generated module, i.e. before
# the generated constant definitions exist -- TODO confirm).
# Token-numbering, case-handling and whitespace-handling options used by the
# Morfeusz class below.
CONTINUOUS_NUMBERING = _morfeusz2.CONTINUOUS_NUMBERING
CONDITIONALLY_CASE_SENSITIVE = _morfeusz2.CONDITIONALLY_CASE_SENSITIVE
SKIP_WHITESPACES = _morfeusz2.SKIP_WHITESPACES
# Usage modes handed to _Morfeusz.createInstance().
ANALYSE_ONLY = _morfeusz2.ANALYSE_ONLY
GENERATE_ONLY = _morfeusz2.GENERATE_ONLY
BOTH_ANALYSE_AND_GENERATE = _morfeusz2.BOTH_ANALYSE_AND_GENERATE

# Library version string as reported by the native module.
__version__ = _morfeusz2._Morfeusz_getVersion()

# Copyright notice as reported by the native module.
__copyright__ = _morfeusz2._Morfeusz_getCopyright()

# Values substituted for a '_' tag chunk when Morfeusz.expand_underscore is set.
GENDERS = ['m1', 'm2', 'm3', 'f', 'n1', 'n2', 'p1', 'p2', 'p3']
313 | + | |
314 | + | |
class Morfeusz(_object):
    """
    Convenience wrapper around the SWIG-generated _Morfeusz object.

    Results of analyse() and generate() are returned as plain tuples and
    lists instead of MorphInterpretation objects, optionally with tags
    expanded and the interpretation graph unrolled into separate paths.
    """

    def __init__(self, dict_name=None, dict_path=None,
                 analyse=True, generate=True, expand_dag=False,
                 expand_tags=False, expand_dot=True, expand_underscore=True,
                 aggl=None, praet=None, separate_numbering=True,
                 case_handling=CONDITIONALLY_CASE_SENSITIVE,
                 whitespace=SKIP_WHITESPACES):
        """
        Create the underlying _Morfeusz instance and configure it.

        dict_name -- if given, passed to _Morfeusz.createInstance() together
            with the usage mode; otherwise only the usage mode is passed
        dict_path -- if given, prepended to the dictionary search paths
            before the instance is created
        analyse, generate -- select the usage mode; at least one must be
            true, otherwise ValueError is raised
        expand_dag -- make analyse() return a list of paths instead of
            (start, end, interpretation) triples
        expand_tags -- make analyse() and generate() expand every tag into
            its variants (see expand_dot and expand_underscore)
        expand_dot -- when expanding tags, split dot-separated alternatives
            into separate tags
        expand_underscore -- when expanding tags, treat a '_' chunk as all
            values listed in GENDERS
        aggl, praet -- if given, passed to setAggl() / setPraet()
        separate_numbering -- if false, token numbering is switched to
            CONTINUOUS_NUMBERING
        case_handling options:
            CONDITIONALLY_CASE_SENSITIVE, STRICTLY_CASE_SENSITIVE, IGNORE_CASE
        whitespace options:
            SKIP_WHITESPACES, KEEP_WHITESPACES, APPEND_WHITESPACES
        """
        if analyse and generate:
            usage = BOTH_ANALYSE_AND_GENERATE
        elif analyse:
            usage = ANALYSE_ONLY
        elif generate:
            usage = GENERATE_ONLY
        else:
            raise ValueError(
                'At least one of "analyse" and "generate" must be True')
        self.expand_dag = expand_dag
        self.expand_tags = expand_tags
        self.expand_dot = expand_dot
        self.expand_underscore = expand_underscore
        # The search path has to be registered before the instance is created.
        if dict_path:
            self.add_dictionary_path(dict_path)
        if dict_name:
            m = _Morfeusz.createInstance(dict_name, usage)
        else:
            m = _Morfeusz.createInstance(usage)
        self._morfeusz_obj = m
        if aggl:
            m.setAggl(aggl)
        if praet:
            m.setPraet(praet)
        if not separate_numbering:
            m.setTokenNumbering(CONTINUOUS_NUMBERING)
        m.setCaseHandling(case_handling)
        m.setWhitespaceHandling(whitespace)

    def add_dictionary_path(self, dict_path):
        """
        Put dict_path at the front of the dictionary search paths unless it
        is already among them.
        """
        dict_paths = _morfeusz2._Morfeusz_dictionarySearchPaths_get()
        if dict_path not in dict_paths:
            _morfeusz2._Morfeusz_dictionarySearchPaths_set(
                (dict_path,) + dict_paths)

    def _expand_tag(self, tag):
        """
        Yield the tags obtained by expanding a single tag string.

        The tag is split on ':' into chunks and each chunk on '.' into
        alternative values; a '_' chunk stands for all GENDERS when
        expand_underscore is set.  With expand_dot unset a single tag is
        yielded in which the alternatives stay dot-joined; otherwise one tag
        is yielded for every combination of alternatives.
        """
        # Imported locally so that this class does not depend on the
        # module-level import block.
        import itertools

        chunks = [
            GENDERS if chunk == '_' and self.expand_underscore
            else chunk.split('.')
            for chunk in tag.split(':')
        ]

        if not self.expand_dot:
            yield ':'.join('.'.join(values) for values in chunks)
            return

        # Cartesian product of the per-chunk alternatives; the first chunk
        # varies slowest.
        for combination in itertools.product(*chunks):
            yield ':'.join(combination)

    def _expand_interp(self, interp):
        """
        Yield a copy of the 5-tuple interp for every expansion of its tag,
        which is expected at index 2.
        """
        tags = self._expand_tag(interp[2])
        for tag in tags:
            yield (interp[0], interp[1], tag, interp[3], interp[4])

    @staticmethod
    def _dag_to_list(interps):
        """
        Unroll (start, end, interp) edges into a list of paths.

        Every path is a list of interp values obtained by following edges
        from node 0 until a node without outgoing edges is reached.  Paths
        are produced depth-first, following edges in their input order.  For
        empty input the result is [[]].
        """
        dag = collections.defaultdict(list)
        for start, end, interp in interps:
            dag[start].append((interp, end))

        # An explicit stack is used instead of recursion so that long paths
        # cannot exceed the interpreter's recursion limit.
        paths = []
        stack = [(0, [])]
        while stack:
            node, prefix = stack.pop()
            nexts = dag.get(node)
            if not nexts:
                paths.append(prefix)
                continue
            # Pushed in reverse so that the first edge is explored first.
            for head, end in reversed(nexts):
                stack.append((end, prefix + [head]))
        return paths

    def analyse(self, text):
        """
        Analyse text with the underlying object.

        Returns a list of (start_node, end_node, interp) triples where
        interp is (lemma, orth, tag, name, labels).  With expand_tags set
        every triple is repeated for each expanded tag; with expand_dag set
        the triples are converted by _dag_to_list() into a list of paths.
        """
        m = self._morfeusz_obj
        interps = m.analyse(text)
        # NOTE(review): the payload built here is (lemma, orth, ...) whereas
        # generate() builds (orth, lemma, ...) -- confirm that this ordering
        # difference is intended.
        interp_tuples = [
            (i.startNode, i.endNode,
             (i.lemma, i.orth, i.getTag(m), i.getName(m), i.getLabels(m)))
            for i in interps]

        def expand_interps():
            for start, end, interp in interp_tuples:
                for exp_interp in self._expand_interp(interp):
                    yield start, end, exp_interp

        if self.expand_tags:
            interp_tuples = list(expand_interps())
        if self.expand_dag:
            interp_tuples = self._dag_to_list(interp_tuples)
        return interp_tuples

    def generate(self, lemma, tag_id=None):
        """
        Generate forms for lemma with the underlying object; tag_id is
        passed through unchanged.

        Returns a list of (orth, lemma, tag, name, labels) tuples, repeated
        for each expanded tag when expand_tags is set.
        """
        m = self._morfeusz_obj
        interps = m.generate(lemma, tag_id)
        interp_tuples = [
            (i.orth, i.lemma, i.getTag(m), i.getName(m), i.getLabels(m))
            for i in interps]

        def expand_interps():
            for interp in interp_tuples:
                for exp_interp in self._expand_interp(interp):
                    yield exp_interp

        if self.expand_tags:
            interp_tuples = list(expand_interps())
        return interp_tuples
440 | +%} | |
441 | + | |
293 | 442 | %include "std_vector.i" |
294 | 443 | %include "std_string.i" |
295 | 444 | %include "std_list.i" |
... | ... |