# get_keywords2eurovoc_mappings.py
from django.core.management.base import BaseCommand

from projects.marcell.models import Keyword2EuroVoc
from storage.models import Keyword


class Command(BaseCommand):
    """Export Keyword2EuroVoc mappings to a tab-separated file.

    Three export modes are supported:

    * ``--best``: for every ``Keyword``, write the mapping with the highest
      ``score_pl`` and the mapping with the highest ``score_en``.
    * all four score thresholds given: write only the mappings that pass
      the threshold test (see ``_passes_thresholds``).
    * otherwise: write every mapping.
    """

    help = 'Get Keyword2EuroVoc mappings.'

    # Option names of the score thresholds; filtering is applied only when
    # every one of them has been supplied.
    _THRESHOLD_OPTIONS = (
        'main_score_pl',
        'main_score_en',
        'secondary_score_pl',
        'secondary_score_en',
    )

    def add_arguments(self, parser):
        """Register the command-line options of this command."""
        parser.add_argument('-o',
                            '--output',
                            action='store',
                            dest='output',
                            type=str,
                            help='output file path',
                            required=True)

        # The comma after '-b' matters: without it the two literals are
        # concatenated into the single (unusable) flag '-b--best'.
        parser.add_argument('-b',
                            '--best',
                            action='store_true',
                            dest='best',
                            help='get best keyword matches')

        parser.add_argument('--main_score_pl',
                            action='store',
                            dest='main_score_pl',
                            type=float,
                            help='main_score_pl')

        parser.add_argument('--main_score_en',
                            action='store',
                            dest='main_score_en',
                            type=float,
                            help='main_score_en')

        parser.add_argument('--secondary_score_pl',
                            action='store',
                            dest='secondary_score_pl',
                            type=float,
                            help='secondary_score_pl')

        parser.add_argument('--secondary_score_en',
                            action='store',
                            dest='secondary_score_en',
                            type=float,
                            help='secondary_score_en')

    def handle(self, *args, **options):
        """Write the header row and the selected mappings to the output file."""
        # Explicit UTF-8 so that non-ASCII labels are written identically
        # regardless of the platform's locale encoding.
        with open(options['output'], 'w', encoding='utf-8') as mappings_file:
            self._write_labels(mappings_file)

            if options['best']:
                self._write_best_mappings(mappings_file)
                return

            thresholds = {name: options[name] for name in self._THRESHOLD_OPTIONS}
            # Compare against None rather than relying on truthiness, so an
            # explicit threshold of 0.0 still counts as "provided".
            provided = [name for name, value in thresholds.items() if value is not None]
            filter_by_score = len(provided) == len(thresholds)

            if provided and not filter_by_score:
                # Exporting everything is the historical fallback; keep it,
                # but tell the user that the thresholds were not applied.
                missing = [name for name in self._THRESHOLD_OPTIONS if name not in provided]
                self.stderr.write(self.style.WARNING(
                    'Score thresholds ignored; missing: {}'.format(', '.join(missing))))

            for k2ev in Keyword2EuroVoc.objects.all():
                if not filter_by_score or self._passes_thresholds(k2ev, thresholds):
                    self._write_mapping(k2ev, mappings_file)

    def _write_best_mappings(self, mappings_file):
        """Write the top ``score_pl`` and top ``score_en`` mapping of each keyword.

        Keywords without any mapping are skipped. When the same mapping is
        the best one for both scores it is written twice, once per score.
        """
        for keyword in Keyword.objects.all():
            k2evs = Keyword2EuroVoc.objects.filter(keyword=keyword)
            for ordering in ('-score_pl', '-score_en'):
                # first() returns None for an empty queryset instead of
                # raising IndexError like indexing with [0] does.
                best = k2evs.order_by(ordering).first()
                if best is not None:
                    self._write_mapping(best, mappings_file)

    @staticmethod
    def _passes_thresholds(k2ev, thresholds):
        """Return True when ``k2ev`` satisfies the score thresholds.

        A mapping passes when either score reaches its main threshold, or
        when both scores reach their secondary thresholds.
        """
        if k2ev.score_pl >= thresholds['main_score_pl']:
            return True
        if k2ev.score_en >= thresholds['main_score_en']:
            return True
        return (k2ev.score_pl >= thresholds['secondary_score_pl'] and
                k2ev.score_en >= thresholds['secondary_score_en'])

    def _write_labels(self, mappings_file):
        """Write the tab-separated header row."""
        labels = ['isap_kw', 'domain_pl', 'eurovoc_label_pl', 'eurovoc_label_en', 'score_pl', 'score_en']
        mappings_file.write('{}\n'.format('\t'.join(labels)))

    def _write_mapping(self, k2ev, mappings_file):
        """Write one mapping as a tab-separated row.

        Multiple EuroVoc labels (or domains) for the same language are
        joined with ``|`` inside a single column.
        """
        isap_kw = k2ev.keyword.label
        domain_pl = self._get_pl_domains(k2ev)
        eurovoc_label_pl = '|'.join([label.text for label in k2ev.eurovoc.labels.filter(lang='pl')])
        eurovoc_label_en = '|'.join([label.text for label in k2ev.eurovoc.labels.filter(lang='en')])
        score_pl = k2ev.score_pl
        score_en = k2ev.score_en
        mappings_file.write(f'{isap_kw}\t{domain_pl}\t{eurovoc_label_pl}\t{eurovoc_label_en}\t{score_pl}\t{score_en}\n')

    def _get_pl_domains(self, k2ev):
        """Return the Polish labels of the mapping's EuroVoc domains, joined with ``|``."""
        domains_pl = [
            label.text
            for domain in k2ev.eurovoc.get_domains()
            for label in domain.labels.filter(lang='pl')
        ]
        return '|'.join(domains_pl)