# remove_duplicates.py 1.87 KB
import codecs
import os

from django.core.management.base import BaseCommand

from webapp.models import Meaning
from settings import PROJECT_PATH


# Report file written by remove_meanings(): one line per removed meaning,
# listing that meaning's expression texts.
REMOVED_PATH = os.path.join(PROJECT_PATH, 'data', 'removed.txt')


class Command(BaseCommand):
    """Management command that runs ``remove_meanings``."""
    help = 'Remove duplicate meanings from database.'

    def handle(self, *args, **options):
        """Run the duplicate removal; ``args`` and ``options`` are not used."""
        remove_meanings()


def _main_expression_texts(meaning):
    """Return the set of texts of ``meaning``'s expressions with no ``main_expression``."""
    return {expr.text for expr in meaning.expressions.filter(main_expression=None)}


def _can_merge_into(to_remove, bigger):
    """Tell whether ``to_remove``'s wikilink and synset allow merging it into ``bigger``.

    Each attribute must either equal the one on ``bigger`` or be unset on
    ``to_remove`` (falsy wikilink; ``plWN_synset == 0`` -- presumably the
    "no synset" marker, confirm against the model).
    """
    wikilink_ok = not to_remove.wikilink or to_remove.wikilink == bigger.wikilink
    synset_ok = (to_remove.plWN_synset == 0
                 or to_remove.plWN_synset == bigger.plWN_synset)
    return wikilink_ok and synset_ok


def remove_meanings():
    """Delete every meaning that is subsumed by another meaning.

    A meaning is removed when some other meaning contains all of its main
    expression texts (expressions whose ``main_expression`` is ``None``) and
    its wikilink / plWN synset are either unset or equal to the other
    meaning's.  The removed meaning's domains are added to the surviving one
    before deletion.

    Meanings without any main expression are skipped: an empty text set is a
    subset of every other set, so they would otherwise match arbitrary
    meanings.

    The expression texts of each removed meaning are written to
    ``REMOVED_PATH`` (one line per meaning).  The file is written in a
    ``finally`` clause, so it also records the removals done before an
    exception interrupted the run.
    """
    removed_meanings = []
    try:
        for to_remove in Meaning.objects.all():
            to_remove_texts = _main_expression_texts(to_remove)
            if not to_remove_texts:
                continue

            # Start from a real QuerySet (a bare Manager is not iterable) and
            # narrow it to meanings that have an expression for every text.
            same_meanings = Meaning.objects.all()
            for text in to_remove_texts:
                same_meanings = same_meanings.filter(expressions__text=text)

            for bigger in same_meanings:
                # The candidate query always matches to_remove itself.
                if bigger.id == to_remove.id:
                    continue
                # The filter above matched any expression; re-check against
                # the candidate's main expressions only.
                if not to_remove_texts <= _main_expression_texts(bigger):
                    continue
                if not _can_merge_into(to_remove, bigger):
                    continue

                print('to remove:', to_remove.id)
                print('bigger:', bigger.id)
                for domain in to_remove.domains.all():
                    bigger.domains.add(domain)
                to_remove.delete()
                # Record only after the delete succeeded so the report never
                # lists a meaning that is still in the database.
                removed_meanings.append(to_remove_texts)
                break

    finally:
        with codecs.open(REMOVED_PATH, 'w', 'utf-8') as output_file:
            for removed in removed_meanings:
                # Sort for a deterministic line; set order is arbitrary.
                output_file.write(u' <---> '.join(sorted(removed)) + '\n')