# get_lexes_to_check.py
# -*- coding:utf-8 -*-

import codecs
import itertools
import os

from django.core.management.base import BaseCommand

from dictionary.models import Lemma
from settings import PROJECT_PATH

# Output file written by write_schemata_to_check(): <PROJECT_PATH>/data/<fixed
# file name>. The name is hard-coded, so every run overwrites the same file.
TO_CHECK_PATH = os.path.join(PROJECT_PATH, 'data', 'lemmas2check-20180907.csv')


class Command(BaseCommand):
    """Management command that runs write_schemata_to_check()."""
    def handle(self, **options):
        """Entry point of the command; ``options`` are accepted but not read."""
        write_schemata_to_check()


def write_schemata_to_check():
    """Write to TO_CHECK_PATH the lemmas that own a suspicious pair of frames.

    Every lemma with ``old=False`` is visited in ``entry_obj__name`` order.
    A lemma is reported, once, as a ``<entry name> (<status>)`` line when at
    least one unordered pair of its frames satisfies ``only_lex_diff``.
    Each visited lemma is also printed to stdout as a progress trace.

    The output file is UTF-8 encoded and is overwritten on every run.
    """
    lemmas = Lemma.objects.filter(old=False).order_by('entry_obj__name')
    # codecs.open always opens the underlying file in binary mode, so a plain
    # 'w' is the right mode here ('wt' ends up as the contradictory 'wtb').
    # The context manager closes the file even when a query or write fails.
    with codecs.open(TO_CHECK_PATH, 'w', 'utf-8') as to_check_file:
        for lemma in lemmas:
            print(lemma)
            # combinations() yields nothing for fewer than two frames, so no
            # separate count query is needed to guard this.
            frame_pairs = itertools.combinations(lemma.frames.all(), 2)
            # any() stops at the first matching pair: one line per lemma.
            if any(only_lex_diff(first, second) for first, second in frame_pairs):
                to_check_file.write('%s (%s)\n' % (lemma.entry_obj.name, lemma.status.status))


def only_lex_diff(schema1, schema2):
    """Tell whether the positions that differ between two schemata pair up lexically.

    Returns False straight away unless all of the following hold:

    * both schemata have equal characteristics of the four compared types,
    * both have the same number of positions,
    * at least one of them is flagged ``phraseologic``.

    Otherwise the positions of each schema whose ``text_rep`` does not occur
    in the other schema are collected, and the result is True when every such
    position of ``schema1`` has a partner among those of ``schema2`` according
    to ``positions_lexically_match`` (trivially True when ``schema1`` has no
    such position).

    NOTE(review): only leftovers of ``schema1`` are checked for a partner;
    leftovers of ``schema2`` that nobody matched are not inspected, and one
    ``schema2`` position may serve as partner for several ``schema1`` ones.
    """
    # Compared in this fixed order; the first mismatch ends the check.
    characteristic_types = (u'ZWROTNOŚĆ', u'ASPEKT', u'NEGATYWNOŚĆ', u'PREDYKATYWNOŚĆ')
    for characteristic_type in characteristic_types:
        if not (schema1.characteristics.get(type=characteristic_type) ==
                schema2.characteristics.get(type=characteristic_type)):
            return False

    if schema1.positions.count() != schema2.positions.count():
        return False
    if not (schema1.phraseologic or schema2.phraseologic):
        return False

    # Positions present (by text representation) in one schema only.
    pos_diff1 = schema1.positions.exclude(text_rep__in=[pos.text_rep for pos in schema2.positions.all()])
    pos_diff2 = schema2.positions.exclude(text_rep__in=[pos.text_rep for pos in schema1.positions.all()])

    for pos1 in pos_diff1:
        # any() stops at the first partner instead of scanning all of pos_diff2.
        if not any(positions_lexically_match(pos1, pos2) for pos2 in pos_diff2):
            return False
    return True


def positions_lexically_match(pos1, pos2):
    """Check that two positions share their categories and have matching phrase types.

    True only when both positions have the same number of categories, the
    intersection of their categories is as large as ``pos1``'s category count,
    and ``phrase_types_match`` accepts the pair.
    """
    if pos1.categories.count() != pos2.categories.count():
        return False

    # Equal sizes plus a full-size intersection means the same categories.
    shared_categories = pos1.categories.all() & pos2.categories.all()
    if shared_categories.count() != pos1.categories.count():
        return False

    return phrase_types_match(pos1, pos2)


def phrase_types_match(pos1, pos2):
    """Check that the arguments specific to ``pos1`` each have a lexical partner in ``pos2``.

    Returns False when the two positions have a different number of
    arguments. Otherwise the arguments found in one position only are
    collected for each side, and the result is True when every such argument
    of ``pos1`` can be paired, through ``lex_to_phrase_type_match`` tried in
    both directions, with one of the arguments specific to ``pos2``
    (trivially True when ``pos1`` has no argument of its own).
    """
    if pos1.arguments.count() != pos2.arguments.count():
        return False

    only_in_first = pos1.arguments.exclude(pk__in=pos2.arguments.all())
    only_in_second = pos2.arguments.exclude(pk__in=pos1.arguments.all())

    def has_counterpart(phrase_type):
        # Either of the two may be the lexicalised one, so try both orders.
        return any(lex_to_phrase_type_match(phrase_type, candidate) or
                   lex_to_phrase_type_match(candidate, phrase_type)
                   for candidate in only_in_second)

    return all(has_counterpart(phrase_type) for phrase_type in only_in_first)


def lex_to_phrase_type_match(pt1, pt2):
    """Check whether the 'lex' phrase type ``pt1`` lexicalises a phrase compatible with ``pt2``.

    Only a ``pt1`` of type 'lex' paired with a ``pt2`` of type 'adjp' or
    'prepadjp' can match. The lexicalised phrase type is read from the first
    value of ``pt1``'s 'TYP FRAZY' attribute; it must belong to the family
    accepted for ``pt2``'s type, and both text representations must be equal
    in the segment that follows their first '('.
    """
    # Types of the lexicalised phrase accepted for each plain phrase type.
    accepted_for = {
        'adjp': ['adjp', 'ppasp', 'pactp'],
        'prepadjp': ['prepadjp', 'prepppasp', 'preppactp'],
    }

    if pt1.type != 'lex':
        return False
    accepted_types = accepted_for.get(pt2.type)
    if accepted_types is None:
        return False

    lexicalized = pt1.atributes.get(type='TYP FRAZY').values.all()[0].argument
    if lexicalized.type not in accepted_types:
        return False

    # Compare the parameter segment, i.e. what follows the first '('.
    return pt2.text_rep.split('(')[1] == lexicalized.text_rep.split('(')[1]