# models.py 3.78 KB
from django.contrib.postgres.fields import ArrayField, JSONField
from django.db import models

from storage.models import Keyword


class EuroVocTerm(models.Model):
    """A term stored in the ``eurovoc_term`` table.

    A term is identified by the pair (``tid``, ``type``).  The hierarchy
    helpers below recognise three ``type`` values -- ``'domain'``,
    ``'thesaurus'`` and ``'descriptor'`` -- and return an empty list for
    anything else.
    """

    tid = models.CharField(max_length=10)
    type = models.CharField(max_length=10)
    # Directed self-relation: a term listed in ``subterms`` reaches this term
    # again through its ``superterms`` reverse accessor.
    subterms = models.ManyToManyField('EuroVocTerm', related_name='superterms')

    class Meta:
        db_table = 'eurovoc_term'
        ordering = ['tid']
        unique_together = ['tid', 'type']

    def __str__(self):
        """Return the string forms of all attached labels joined by ``' | '``."""
        return ' | '.join(str(label) for label in self.labels.all())

    def get_domains(self):
        """Return the distinct ``'domain'`` terms found at or above this term.

        A domain returns a list holding only itself.  A thesaurus or a
        descriptor asks each of its ``superterms`` recursively and merges the
        answers, keeping the first occurrence of every domain.  Any other
        ``type`` gives an empty list.

        Returns:
            list: ``EuroVocTerm`` instances in first-discovery order.
        """
        if self.type == 'domain':
            return [self]

        found = []
        if self.type in ('thesaurus', 'descriptor'):
            # ``superterms.all()`` is evaluated for every term visited on the
            # way up, so this walks the hierarchy one level per recursion.
            for parent in self.superterms.all():
                for candidate in parent.get_domains():
                    if candidate not in found:
                        found.append(candidate)
        return found

    def get_subdomains(self):
        """Return the distinct ``'thesaurus'`` terms found at or above this term.

        A thesaurus returns a list holding only itself.  A descriptor asks
        each of its ``superterms`` recursively and merges the answers, keeping
        the first occurrence of every thesaurus.  A domain, or any other
        ``type``, gives an empty list.

        Returns:
            list: ``EuroVocTerm`` instances in first-discovery order.
        """
        if self.type == 'thesaurus':
            return [self]
        if self.type != 'descriptor':
            # Domains (and unrecognised types) have no subdomains to report.
            return []

        found = []
        for parent in self.superterms.all():
            for candidate in parent.get_subdomains():
                if candidate not in found:
                    found.append(candidate)
        return found


class EuroVocLabel(models.Model):
    """A text label in one language attached to an :class:`EuroVocTerm`.

    Rows live in the ``eurovoc_label`` table and are ordered by the owning
    term's ``tid`` and then by ``lang``.
    """

    # Language tag of ``text`` (at most 5 characters).
    lang = models.CharField(max_length=5)
    text = models.CharField(max_length=150)
    # Owning term; labels are deleted together with it and are reachable from
    # the term as ``term.labels``.
    term = models.ForeignKey(
        EuroVocTerm,
        on_delete=models.CASCADE,
        related_name='labels',
    )
    # Optional JSON payload.  NOTE(review): structure is not visible in this
    # file -- confirm against the code that populates it.
    lemmatization_graph = JSONField(null=True, blank=True)
    # NOTE(review): presumably marks a "used for" (non-preferred) label;
    # confirm against the importer.
    used_for = models.BooleanField(default=False)
    # Optional array of floats.  NOTE(review): presumably an embedding of
    # ``text`` -- confirm.
    vector = ArrayField(models.FloatField(), null=True)

    class Meta:
        db_table = 'eurovoc_label'
        ordering = ['term__tid', 'lang']

    def __str__(self):
        """Return ``'<lang>: <text>'``."""
        return '%s: %s' % (self.lang, self.text)


class IATETerm(models.Model):
    """A term stored in the ``iate_term`` table, keyed by ``tid``.

    ``subject_field`` holds zero or more subject names separated by ``';'``.
    """

    tid = models.CharField(max_length=10, primary_key=True)
    subject_field = models.TextField(blank=True)

    class Meta:
        db_table = 'iate_term'
        ordering = ['tid']

    def eurovoc_terms(self):
        """Return the EuroVoc domain/thesaurus terms named in ``subject_field``.

        ``subject_field`` is split on ``';'`` and each fragment is stripped.
        A fragment matches when an English (``lang='en'``) ``EuroVocLabel``
        with ``used_for=False`` has exactly that text.  The result contains
        every ``EuroVocTerm`` of type ``'domain'`` or ``'thesaurus'`` whose
        ``tid`` equals the ``tid`` of a matched label's term.

        Blank fragments (an empty ``subject_field``, a trailing ``';'``, ...)
        are ignored rather than being looked up as empty label text.

        Returns:
            QuerySet: lazily evaluated ``EuroVocTerm`` queryset; empty when
            ``subject_field`` contains no non-blank subject.
        """
        subjects = [
            fragment.strip()
            for fragment in self.subject_field.split(';')
            if fragment.strip()
        ]
        if not subjects:
            return EuroVocTerm.objects.none()

        # One ``IN`` lookup replaces a query per subject plus a query per
        # matched label (the old ``evlabel.term.tid`` access).  The default
        # label ordering is irrelevant inside the subquery, so it is cleared.
        matching_tids = (
            EuroVocLabel.objects
            .filter(lang='en', text__in=subjects, used_for=False)
            .order_by()
            .values_list('term__tid', flat=True)
        )
        # Matching is deliberately by ``tid`` (not primary key), as before.
        return EuroVocTerm.objects.filter(
            tid__in=matching_tids,
            type__in=['domain', 'thesaurus'],
        )

    def __str__(self):
        """Return the string forms of all attached labels joined by ``' | '``."""
        return ' | '.join(str(label) for label in self.labels.all())


class IATELabel(models.Model):
    """A text label in one language attached to an :class:`IATETerm`.

    Rows live in the ``iate_label`` table and are ordered by the owning
    term's ``tid`` and then by ``lang``.
    """

    # Language tag of ``text`` (at most 5 characters).
    lang = models.CharField(max_length=5)
    text = models.CharField(max_length=1000)
    # Owning term; labels are deleted together with it and are reachable from
    # the term as ``term.labels``.
    term = models.ForeignKey(
        IATETerm,
        on_delete=models.CASCADE,
        related_name='labels',
    )
    # NOTE(review): the allowed values of the next three fields are not
    # visible in this file -- confirm against the importer.
    type = models.CharField(max_length=12)
    administrative_status = models.CharField(blank=True, max_length=24)
    reliability_code = models.PositiveIntegerField()
    # Optional JSON payload.  NOTE(review): structure is not visible in this
    # file -- confirm against the code that populates it.
    lemmatization_graph = JSONField(null=True, blank=True)

    class Meta:
        db_table = 'iate_label'
        ordering = ['term__tid', 'lang']

    def __str__(self):
        """Return ``'<lang>: <text>'``."""
        return '%s: %s' % (self.lang, self.text)


class Keyword2EuroVoc(models.Model):
    """Scored link between a ``Keyword`` and an :class:`EuroVocTerm`.

    Rows live in the ``keyword2eurovoc`` table and are ordered by descending
    ``score_pl`` and then descending ``score_en``.
    """

    # Both sides expose these rows as ``.similarities``; a row is deleted when
    # either side is deleted.
    keyword = models.ForeignKey(
        Keyword,
        on_delete=models.CASCADE,
        related_name='similarities',
    )
    eurovoc = models.ForeignKey(
        EuroVocTerm,
        on_delete=models.CASCADE,
        related_name='similarities',
    )
    # NOTE(review): presumably similarity scores computed for Polish and
    # English text respectively -- confirm scale and meaning with the producer.
    score_pl = models.FloatField(default=0.0)
    score_en = models.FloatField(default=0.0)

    class Meta:
        db_table = 'keyword2eurovoc'
        ordering = ['-score_pl', '-score_en']

    def __str__(self):
        """Return ``'<keyword> <eurovoc> (<score_pl>, <score_en>)'``."""
        return '%s %s (%s, %s)' % (
            self.keyword,
            self.eurovoc,
            self.score_pl,
            self.score_en,
        )