remove_duplicates.py
1.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import codecs
import os
from django.core.management.base import BaseCommand
from webapp.models import Meaning
from settings import PROJECT_PATH
# Text file (written as UTF-8) that receives one line per removed meaning,
# listing the expression texts of that meaning.
REMOVED_PATH = os.path.join(PROJECT_PATH, 'data', 'removed.txt')
class Command(BaseCommand):
    """Management command that removes duplicate meanings from the database."""

    help = 'Remove duplicate meanings from database.'

    def handle(self, *args, **options):
        """Entry point of the command; ``args`` and ``options`` are not used."""
        remove_meanings()
def _can_merge_into(to_remove, bigger):
    """Return True when ``to_remove``'s metadata does not conflict with ``bigger``.

    The wikilink must be equal or empty on ``to_remove``, and the plWN synset
    must be equal or ``0`` on ``to_remove``.
    """
    wikilink_ok = (to_remove.wikilink == bigger.wikilink
                   or not to_remove.wikilink)
    synset_ok = (to_remove.plWN_synset == bigger.plWN_synset
                 or to_remove.plWN_synset == 0)
    return wikilink_ok and synset_ok


def remove_meanings():
    """Delete meanings that are duplicated by a richer meaning.

    A meaning is removed when another meaning ("bigger") contains all of its
    expression texts and their ``wikilink`` / ``plWN_synset`` values do not
    conflict (see ``_can_merge_into``).  Before deletion, the domains of the
    removed meaning are added to the bigger one.

    Meanings without any expression are skipped: they cannot be matched by
    expression text, and an empty set would be a subset of every other
    meaning.

    The expression sets of all removed meanings are written to
    ``REMOVED_PATH`` (one meaning per line, texts joined with `` <---> ``).
    The file is written in a ``finally`` clause, so deletions performed
    before an exception are still recorded.
    """
    removed_meanings = []
    try:
        for to_remove in Meaning.objects.all():
            to_remove_as_set = set(
                expr.text for expr in to_remove.expressions.all())
            if not to_remove_as_set:
                # Nothing to compare by.  Previously this case left
                # ``same_meanings`` as the bare manager, which is not
                # iterable and raised TypeError further down.
                continue

            # Start from a real QuerySet (not the manager) and narrow it
            # with one filter() call per text, so that every text has to be
            # matched by some expression of the candidate meaning.
            same_meanings = Meaning.objects.all()
            for text in to_remove_as_set:
                same_meanings = same_meanings.filter(expressions__text=text)

            # The meaning always matches itself; more than one hit means
            # there is at least one other candidate.
            if same_meanings.count() <= 1:
                continue

            for bigger in same_meanings:
                if bigger.id == to_remove.id:
                    continue
                bigger_as_set = set(
                    expr.text for expr in bigger.expressions.all())
                if not to_remove_as_set <= bigger_as_set:
                    continue
                if not _can_merge_into(to_remove, bigger):
                    continue

                print('to remove:', to_remove.id)
                print('bigger:', bigger.id)
                removed_meanings.append(to_remove_as_set)
                # Keep the domain assignments of the removed meaning.
                for domain in to_remove.domains.all():
                    bigger.domains.add(domain)
                to_remove.delete()
                # The meaning is gone; do not look for further candidates.
                break
    finally:
        with codecs.open(REMOVED_PATH, 'w', 'utf-8') as output_file:
            for removed in removed_meanings:
                output_file.write(u' <---> '.join(removed) + '\n')