Commit d7a3318a05cf145e814613f9380fa988d1cd3c52

Authored by Jan Szejko
1 parent b9b7d313

poprawiony wrapper pythonowy

git-svn-id: svn://svn.nlp.ipipan.waw.pl/morfeusz/trunk@353 ff4e3ee1-f430-4e82-ade0-24591c43f1fd
morfeusz/wrappers/morfeusz_python.i
... ... @@ -55,6 +55,8 @@
55 55 }
56 56 };
57 57  
  58 +%rename (_Morfeusz) morfeusz::Morfeusz;
  59 +
58 60 //%ignore morfeusz::Morfeusz::createInstance(morfeusz::MorfeuszUsage);
59 61 %extend morfeusz::Morfeusz {
60 62 std::vector<morfeusz::MorphInterpretation> morfeusz::Morfeusz::_generateByTagId(const std::string& lemma, int tagId) const {
... ... @@ -87,7 +89,7 @@ def analyse(self, text):
87 89 Analyse given text and return a list of MorphInterpretation objects.
88 90 """
89 91 res = InterpsList()
90   - $action(self, text, res)
  92 + $action(self, text.encode('utf-8'), res)
91 93 return res
92 94 %}
93 95  
... ... @@ -216,9 +218,10 @@ def getTag(self, morfeusz):
216 218 %feature("shadow") morfeusz::MorphInterpretation::getName %{
217 219 def getName(self, morfeusz):
218 220 """
219   - Returns this interpretation named entity as string
  221 + Returns this interpretation's named entities as a list of unicode strings: the UTF-8 decoded name split on '|', or an empty list when the name is empty
220 222 """
221   - return $action(self, morfeusz)
  223 + name = $action(self, morfeusz).decode('utf8')
  224 + return name.split('|') if name else []
222 225 %}
223 226  
224 227 %feature("shadow") morfeusz::MorphInterpretation::getLabelsAsString %{
... ... @@ -232,9 +235,9 @@ def getLabelsAsUnicode(self, morfeusz):
232 235 %feature("shadow") morfeusz::MorphInterpretation::getLabels %{
233 236 def getLabels(self, morfeusz):
234 237 """
235   - Returns this interpretation labels as a set of strings
  238 + Returns this interpretation's labels as a list of unicode strings, each decoded from UTF-8
236 239 """
237   - return { l.decode('utf8') for l in $action(self, morfeusz) }
  240 + return [l.decode('utf8') for l in $action(self, morfeusz)]
238 241 %}
239 242  
240 243 %feature("shadow") morfeusz::MorphInterpretation::createIgn %{
... ... @@ -282,7 +285,7 @@ def getLabelsAsUnicode(self, labelsId):
282 285  
283 286 %feature("shadow") morfeusz::IdResolver::getLabels %{
284 287 def getLabels(self, labelsId):
285   - return { l.decode('utf8') for l in $action(self, labelsId) }
  288 + return [l.decode('utf8') for l in $action(self, labelsId)]
286 289 %}
287 290  
288 291 %feature("shadow") morfeusz::IdResolver::getLabelsId %{
... ... @@ -290,6 +293,152 @@ def getLabelsId(self, labelsStr):
290 293 return $action(self, labelsStr.encode('utf8'))
291 294 %}
292 295  
  296 +%pythoncode %{
  297 +import collections
  298 +import os.path
  299 +
  300 +# copied here, because this code gets pasted at the beginning instead of at the end
  301 +CONTINUOUS_NUMBERING = _morfeusz2.CONTINUOUS_NUMBERING
  302 +CONDITIONALLY_CASE_SENSITIVE = _morfeusz2.CONDITIONALLY_CASE_SENSITIVE
  303 +SKIP_WHITESPACES = _morfeusz2.SKIP_WHITESPACES
  304 +ANALYSE_ONLY = _morfeusz2.ANALYSE_ONLY
  305 +GENERATE_ONLY = _morfeusz2.GENERATE_ONLY
  306 +BOTH_ANALYSE_AND_GENERATE = _morfeusz2.BOTH_ANALYSE_AND_GENERATE
  307 +
  308 +__version__ = _morfeusz2._Morfeusz_getVersion()
  309 +
  310 +__copyright__ = _morfeusz2._Morfeusz_getCopyright()
  311 +
  312 +GENDERS = ['m1', 'm2', 'm3', 'f', 'n1', 'n2', 'p1', 'p2', 'p3']
  313 +
  314 +
class Morfeusz(_object):
    """
    Convenience wrapper around the SWIG-generated _Morfeusz object.

    Results of analyse() and generate() are returned as plain tuples built
    from the attributes of the underlying interpretation objects.
    """

    def __init__(self, dict_name=None, dict_path=None,
                 analyse=True, generate=True, expand_dag=False,
                 expand_tags=False, expand_dot=True, expand_underscore=True,
                 aggl=None, praet=None, separate_numbering=True,
                 case_handling=CONDITIONALLY_CASE_SENSITIVE,
                 whitespace=SKIP_WHITESPACES):
        """
        Create and configure the underlying _Morfeusz instance.

        dict_name -- passed to _Morfeusz.createInstance together with the
            usage mode; when empty or None only the usage mode is passed
        dict_path -- when given, added to the dictionary search paths
            (see add_dictionary_path) before the instance is created
        analyse, generate -- select the usage mode; at least one must be true
        expand_dag -- when true, analyse() returns a list of paths instead
            of a flat list of edges
        expand_tags -- when true, every tag is expanded with _expand_tag
        expand_dot -- during tag expansion, turn dot-separated alternatives
            into separate tags
        expand_underscore -- during tag expansion, replace a '_' chunk with
            the values listed in GENDERS
        aggl, praet -- when truthy, passed to setAggl / setPraet
        separate_numbering -- when false, token numbering is switched to
            CONTINUOUS_NUMBERING
        case_handling options:
            CONDITIONALLY_CASE_SENSITIVE, STRICTLY_CASE_SENSITIVE, IGNORE_CASE
        whitespace options:
            SKIP_WHITESPACES, KEEP_WHITESPACES, APPEND_WHITESPACES

        Raises ValueError when both analyse and generate are false.
        """
        if analyse and generate:
            usage = BOTH_ANALYSE_AND_GENERATE
        elif analyse:
            usage = ANALYSE_ONLY
        elif generate:
            usage = GENERATE_ONLY
        else:
            raise ValueError(
                'At least one of "analyse" and "generate" must be True')
        self.expand_dag = expand_dag
        self.expand_tags = expand_tags
        self.expand_dot = expand_dot
        self.expand_underscore = expand_underscore
        # The search path has to be extended before the instance is created.
        if dict_path:
            self.add_dictionary_path(dict_path)
        if dict_name:
            m = _Morfeusz.createInstance(dict_name, usage)
        else:
            m = _Morfeusz.createInstance(usage)
        self._morfeusz_obj = m
        if aggl:
            m.setAggl(aggl)
        if praet:
            m.setPraet(praet)
        if not separate_numbering:
            m.setTokenNumbering(CONTINUOUS_NUMBERING)
        m.setCaseHandling(case_handling)
        m.setWhitespaceHandling(whitespace)

    def add_dictionary_path(self, dict_path):
        """
        Put dict_path in front of the dictionary search paths.

        Nothing is changed when dict_path is already among the paths. The
        paths are read and written through module-level accessors of
        _morfeusz2; no state of this instance is used.
        """
        dict_paths = _morfeusz2._Morfeusz_dictionarySearchPaths_get()
        if dict_path not in dict_paths:
            _morfeusz2._Morfeusz_dictionarySearchPaths_set(
                (dict_path,) + dict_paths)

    def _expand_tag(self, tag):
        """
        Yield the tags obtained by expanding a compact tag.

        The tag is split on ':' into chunks. A chunk equal to '_' is
        replaced by GENDERS when expand_underscore is set; every other chunk
        is split on '.' into its alternatives. With expand_dot unset a
        single tag is yielded, with the alternatives of each chunk joined
        back with '.'. Otherwise one tag is yielded for every combination
        of alternatives, the last chunk varying fastest.
        """
        # Imported locally so that this block does not depend on changes to
        # the import lines of the surrounding module.
        import itertools

        chunks = [
            GENDERS if chunk == '_' and self.expand_underscore
            else chunk.split('.')
            for chunk in tag.split(':')
        ]

        if not self.expand_dot:
            yield ':'.join('.'.join(values) for values in chunks)
            return

        for variant in itertools.product(*chunks):
            yield ':'.join(variant)

    def _expand_interp(self, interp):
        """
        Yield one copy of the 5-tuple interp per expansion of its tag.

        The tag is taken from position 2; the other positions are copied
        unchanged.
        """
        for tag in self._expand_tag(interp[2]):
            yield (interp[0], interp[1], tag, interp[3], interp[4])

    @staticmethod
    def _dag_to_list(interps):
        """
        Turn a list of (start, end, interp) edges into a list of paths.

        Every path is a list of interp values obtained by following edges
        from node 0 until a node without outgoing edges is reached. For an
        empty input the result is a list holding one empty path.
        """
        dag = collections.defaultdict(list)
        for start, end, interp in interps:
            dag[start].append((interp, end))

        # NOTE(review): recursion depth grows with the length of a path.
        def expand_dag(start):
            nexts = dag[start]
            if not nexts:
                yield []
            else:
                for head, end in nexts:
                    for tail in expand_dag(end):
                        yield [head] + tail

        return list(expand_dag(0))

    def analyse(self, text):
        """
        Analyse text with the underlying object and return tuples.

        Returns a list of (startNode, endNode, interp) triples where interp
        is (lemma, orth, tag, name, labels). With expand_tags set each
        triple is repeated for every expansion of its tag. With expand_dag
        set the triples are converted by _dag_to_list into a list of paths.
        """
        m = self._morfeusz_obj
        interps = m.analyse(text)
        # NOTE(review): lemma comes before orth here, whereas generate()
        # puts orth first - confirm which order is intended.
        interp_tuples = [
            (i.startNode, i.endNode,
             (i.lemma, i.orth, i.getTag(m), i.getName(m), i.getLabels(m)))
            for i in interps]

        if self.expand_tags:
            interp_tuples = [
                (start, end, expanded)
                for start, end, interp in interp_tuples
                for expanded in self._expand_interp(interp)]
        if self.expand_dag:
            interp_tuples = self._dag_to_list(interp_tuples)
        return interp_tuples

    def generate(self, lemma, tag_id=None):
        """
        Generate forms for lemma with the underlying object.

        lemma and tag_id are passed unchanged to the underlying generate().
        Returns a list of (orth, lemma, tag, name, labels) tuples. With
        expand_tags set each tuple is repeated for every expansion of its
        tag.
        """
        m = self._morfeusz_obj
        interps = m.generate(lemma, tag_id)
        interp_tuples = [
            (i.orth, i.lemma, i.getTag(m), i.getName(m), i.getLabels(m))
            for i in interps]

        if self.expand_tags:
            interp_tuples = [
                expanded
                for interp in interp_tuples
                for expanded in self._expand_interp(interp)]
        return interp_tuples
  440 +%}
  441 +
293 442 %include "std_vector.i"
294 443 %include "std_string.i"
295 444 %include "std_list.i"
... ...