Commit 970954d2c83e9aef84c020d5a7f7bbe1c569bf56

Authored by Bartłomiej Nitoń
1 parent fa5dc747

Add political ontology while updating ppc docs.

collector/projects/ppc/management/commands/update_ppc_docs.py
1 1 import os
2 2  
3 3 from django.core.management.base import BaseCommand, CommandError
  4 +from django.db.models import Q
4 5  
  6 +from collector import settings
  7 +from datetime import datetime
5 8 from loaders import ppc_tei
6   -from writers import tei_anno_and_mtas
7   -from storage.models import Document
  9 +from writers import tei, tei_anno_and_mtas
  10 +from storage.models import Document, Participant
  11 +from projects.ppc.ppo import PPO
  12 +from projects.ppc.models import Function
8 13  
9 14  
10 15 class Command(BaseCommand):
... ... @@ -48,3 +53,94 @@ class Command(BaseCommand):
48 53 document.pipeline.index(document)
49 54 document.changed = False
50 55 document.save()
  56 +
  57 + self._add_political_ontology(docs2update)
  58 + for document in docs2update.all():
  59 + tei.write_header(document)
  60 +
  61 + def _add_political_ontology(self, documents):
  62 + ppo = PPO(settings.PPO_PATH)
  63 +
  64 + for politician in ppo.get_politicians():
  65 + for public_function in politician.hasFunction:
  66 + self._add_function(ppo, documents, politician, public_function)
  67 +
  68 + for person in ppo.get_other_persons():
  69 + for public_function in person.hasFunction:
  70 + self._add_function(ppo, documents, person, public_function)
  71 +
  72 + def _add_function(self, ppo, documents, politician, public_function):
  73 + if public_function.position:
  74 + start_date, end_date = self._get_function_term_of_office(public_function)
  75 +
  76 + if start_date and end_date:
  77 + documents = documents.filter(publication_date__gte=start_date)
  78 + documents = documents.filter(publication_date__lte=end_date)
  79 + else:
  80 + print(public_function, 'No start/end date!!')
  81 + return
  82 + else:
  83 + start_date, end_date = self._get_house_term_of_office(public_function)
  84 +
  85 + election = str(public_function.occursWith[0]).split('.')[-1]
  86 + system = ppo.get_system(election)
  87 +
  88 + house, term = self._get_house_and_term(public_function)
  89 + documents = documents.filter(metadata__name='system', metadata__value=system)
  90 + documents = documents.filter(metadata__name='house', metadata__value=house)
  91 + documents = documents.filter(metadata__name='termNo', metadata__value=term)
  92 + if start_date:
  93 + documents = documents.filter(publication_date__gte=start_date)
  94 + if end_date:
  95 + documents = documents.filter(publication_date__lte=end_date)
  96 +
  97 + q_names = Q()
  98 + for lastName in politician.lastName:
  99 + q_names |= Q(name__regex=r'(^|\s)((%s\s(.+\s)?%s)|(%s\s(.+\s)?%s))(\s|$)' % (
  100 + politician.firstName[0], lastName, lastName, politician.firstName[0]))
  101 +
  102 + name_surname_match = Participant.objects.filter(Q(type='person') & Q(document__in=documents) &
  103 + q_names).distinct()
  104 +
  105 + print(public_function, 'full match: %d' % (name_surname_match.count()))
  106 + function_obj, _ = Function.objects.get_or_create(iri=public_function.iri)
  107 +
  108 + for participant in name_surname_match.all():
  109 + participant.functions.add(function_obj)
  110 +
  111 + def _get_function_term_of_office(self, public_function):
  112 + start = None
  113 + end = None
  114 + if public_function.startTime[0]:
  115 + date_parts = public_function.startTime[0].split('-')
  116 + if len(date_parts) == 3:
  117 + start = datetime.strptime(public_function.startTime[0], '%d-%m-%Y')
  118 + elif len(date_parts) == 2:
  119 + start = datetime.strptime(public_function.startTime[0], '%m-%Y')
  120 + else:
  121 + start = datetime.strptime(public_function.startTime[0], '%Y')
  122 + if public_function.stopTime[0]:
  123 + date_parts = public_function.stopTime[0].split('-')
  124 + if len(date_parts) == 3:
  125 + end = datetime.strptime(public_function.stopTime[0], '%d-%m-%Y')
  126 + elif len(date_parts) == 2:
  127 + end = datetime.strptime(public_function.stopTime[0], '%m-%Y')
  128 + else:
  129 + end = datetime.strptime(public_function.stopTime[0], '%Y')
  130 + return start, end
  131 +
  132 + def _get_house_term_of_office(self, public_function):
  133 + start = None
  134 + end = None
  135 + if public_function.dateFrom:
  136 + start = datetime.strptime(public_function.dateFrom.split('T')[0], '%Y-%m-%d')
  137 + if public_function.dateTo:
  138 + end = datetime.strptime(public_function.dateTo.split('T')[0], '%Y-%m-%d')
  139 + return start, end
  140 +
  141 + def _get_house_and_term(self, public_function):
  142 + house = 'Senat'
  143 + if public_function.isLowerHouse[0]:
  144 + house = 'Sejm'
  145 + term = str(public_function).split('_')[-1].replace(house, '').lstrip('-')
  146 + return house, term
... ...