get_keywords2eurovoc_mappings.py
3.92 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
from django.core.management.base import BaseCommand
from projects.marcell.models import Keyword2EuroVoc
from storage.models import Keyword
class Command(BaseCommand):
    """Export Keyword2EuroVoc mappings to a tab-separated text file.

    The file starts with a header row and is followed by one row per
    exported mapping.  Three export modes are supported:

    * ``--best``: for every keyword, write its mapping with the highest
      ``score_pl`` and then its mapping with the highest ``score_en``.
    * all four score thresholds given: write only the mappings that pass
      the threshold filter (see ``_passes_thresholds``).
    * otherwise: write every mapping.
    """

    help = 'Get Keyword2EuroVoc mappings.'

    # Names of the optional float threshold options; each doubles as the
    # argparse ``dest`` and as the help text.
    _SCORE_OPTIONS = (
        'main_score_pl',
        'main_score_en',
        'secondary_score_pl',
        'secondary_score_en',
    )

    def add_arguments(self, parser):
        """Register the command-line options on ``parser``."""
        parser.add_argument('-o',
                            '--output',
                            action='store',
                            dest='output',
                            type=str,
                            help='output file path',
                            required=True)
        # The comma after '-b' matters: without it the two adjacent string
        # literals are concatenated into the single option '-b--best' and
        # '--best' is never registered.
        parser.add_argument('-b',
                            '--best',
                            action='store_true',
                            dest='best',
                            help='get best keyword matches')
        for name in self._SCORE_OPTIONS:
            parser.add_argument(f'--{name}',
                                action='store',
                                dest=name,
                                type=float,
                                help=name)

    def handle(self, *args, **options):
        """Write the header row and the selected mappings to the output file.

        Reads ``options['output']`` (destination path), ``options['best']``
        and, when ``best`` is not set, the four score threshold options.
        """
        # Explicit UTF-8 so that non-ASCII labels are written the same way
        # regardless of the platform's default locale encoding.
        with open(options['output'], 'w', encoding='utf-8') as mappings_file:
            self._write_labels(mappings_file)

            if options['best']:
                self._write_best_mappings(mappings_file)
                return

            thresholds = {name: options[name] for name in self._SCORE_OPTIONS}
            # Compare against None rather than relying on truthiness, so an
            # explicit threshold of 0.0 still counts as "provided".
            use_thresholds = all(value is not None
                                 for value in thresholds.values())

            for k2ev in Keyword2EuroVoc.objects.all():
                if use_thresholds and not self._passes_thresholds(k2ev, thresholds):
                    continue
                self._write_mapping(k2ev, mappings_file)

    def _write_best_mappings(self, mappings_file):
        """Write, per keyword, its top ``score_pl`` and top ``score_en`` mapping.

        Keywords without any mapping are skipped.  When the same mapping is
        the best one for both scores it is written twice, once per score.
        """
        for keyword in Keyword.objects.all():
            k2evs = Keyword2EuroVoc.objects.filter(keyword=keyword)
            for ordering in ('-score_pl', '-score_en'):
                # first() returns None for an empty queryset, whereas
                # indexing with [0] would raise IndexError.
                best = k2evs.order_by(ordering).first()
                if best is not None:
                    self._write_mapping(best, mappings_file)

    @staticmethod
    def _passes_thresholds(k2ev, thresholds):
        """Return True if ``k2ev`` satisfies the score thresholds.

        A mapping passes when either score reaches its main threshold, or
        when both scores reach their secondary thresholds.
        """
        if k2ev.score_pl >= thresholds['main_score_pl']:
            return True
        if k2ev.score_en >= thresholds['main_score_en']:
            return True
        return (k2ev.score_pl >= thresholds['secondary_score_pl'] and
                k2ev.score_en >= thresholds['secondary_score_en'])

    def _write_labels(self, mappings_file):
        """Write the tab-separated header row to ``mappings_file``."""
        labels = ['isap_kw', 'domain_pl', 'eurovoc_label_pl', 'eurovoc_label_en', 'score_pl', 'score_en']
        mappings_file.write('{}\n'.format('\t'.join(labels)))

    def _write_mapping(self, k2ev, mappings_file):
        """Write one tab-separated row describing ``k2ev`` to ``mappings_file``.

        Columns follow the header written by ``_write_labels``.  Multiple
        labels or domains within one column are joined with ``|``.
        """
        isap_kw = k2ev.keyword.label
        domain_pl = self._get_pl_domains(k2ev)
        eurovoc_label_pl = '|'.join(
            label.text for label in k2ev.eurovoc.labels.filter(lang='pl'))
        eurovoc_label_en = '|'.join(
            label.text for label in k2ev.eurovoc.labels.filter(lang='en'))
        score_pl = k2ev.score_pl
        score_en = k2ev.score_en
        mappings_file.write(f'{isap_kw}\t{domain_pl}\t{eurovoc_label_pl}\t{eurovoc_label_en}\t{score_pl}\t{score_en}\n')

    def _get_pl_domains(self, k2ev):
        """Return the ``lang='pl'`` label texts of ``k2ev``'s EuroVoc domains.

        The texts of all domains returned by ``k2ev.eurovoc.get_domains()``
        are joined with ``|`` in iteration order.
        """
        domains_pl = []
        for domain in k2ev.eurovoc.get_domains():
            domains_pl.extend(
                label.text for label in domain.labels.filter(lang='pl'))
        return '|'.join(domains_pl)