Commit 970954d2c83e9aef84c020d5a7f7bbe1c569bf56
1 parent
fa5dc747
Add political ontology while updating ppc docs.
Showing
1 changed file
with
98 additions
and
2 deletions
collector/projects/ppc/management/commands/update_ppc_docs.py
1 | 1 | import os |
2 | 2 | |
3 | 3 | from django.core.management.base import BaseCommand, CommandError |
4 | +from django.db.models import Q | |
4 | 5 | |
6 | +from collector import settings | |
7 | +from datetime import datetime | |
5 | 8 | from loaders import ppc_tei |
6 | -from writers import tei_anno_and_mtas | |
7 | -from storage.models import Document | |
9 | +from writers import tei, tei_anno_and_mtas | |
10 | +from storage.models import Document, Participant | |
11 | +from projects.ppc.ppo import PPO | |
12 | +from projects.ppc.models import Function | |
8 | 13 | |
9 | 14 | |
10 | 15 | class Command(BaseCommand): |
... | ... | @@ -48,3 +53,94 @@ class Command(BaseCommand): |
48 | 53 | document.pipeline.index(document) |
49 | 54 | document.changed = False |
50 | 55 | document.save() |
56 | + | |
57 | + self._add_political_ontology(docs2update) | |
58 | + for document in docs2update.all(): | |
59 | + tei.write_header(document) | |
60 | + | |
def _add_political_ontology(self, documents):
    """Attach ontology functions to participants of the given documents.

    Loads the ontology from ``settings.PPO_PATH`` and calls
    ``_add_function`` once for every function of every politician, and
    then once for every function of every other person.
    """
    ontology = PPO(settings.PPO_PATH)

    # Politicians are processed first, other persons second; each getter is
    # only called when its turn comes.
    for list_people in (ontology.get_politicians, ontology.get_other_persons):
        for person in list_people():
            for public_function in person.hasFunction:
                self._add_function(ontology, documents, person, public_function)
71 | + | |
72 | + def _add_function(self, ppo, documents, politician, public_function): | |
73 | + if public_function.position: | |
74 | + start_date, end_date = self._get_function_term_of_office(public_function) | |
75 | + | |
76 | + if start_date and end_date: | |
77 | + documents = documents.filter(publication_date__gte=start_date) | |
78 | + documents = documents.filter(publication_date__lte=end_date) | |
79 | + else: | |
80 | + print(public_function, 'No start/end date!!') | |
81 | + return | |
82 | + else: | |
83 | + start_date, end_date = self._get_house_term_of_office(public_function) | |
84 | + | |
85 | + election = str(public_function.occursWith[0]).split('.')[-1] | |
86 | + system = ppo.get_system(election) | |
87 | + | |
88 | + house, term = self._get_house_and_term(public_function) | |
89 | + documents = documents.filter(metadata__name='system', metadata__value=system) | |
90 | + documents = documents.filter(metadata__name='house', metadata__value=house) | |
91 | + documents = documents.filter(metadata__name='termNo', metadata__value=term) | |
92 | + if start_date: | |
93 | + documents = documents.filter(publication_date__gte=start_date) | |
94 | + if end_date: | |
95 | + documents = documents.filter(publication_date__lte=end_date) | |
96 | + | |
97 | + q_names = Q() | |
98 | + for lastName in politician.lastName: | |
99 | + q_names |= Q(name__regex=r'(^|\s)((%s\s(.+\s)?%s)|(%s\s(.+\s)?%s))(\s|$)' % ( | |
100 | + politician.firstName[0], lastName, lastName, politician.firstName[0])) | |
101 | + | |
102 | + name_surname_match = Participant.objects.filter(Q(type='person') & Q(document__in=documents) & | |
103 | + q_names).distinct() | |
104 | + | |
105 | + print(public_function, 'full match: %d' % (name_surname_match.count())) | |
106 | + function_obj, _ = Function.objects.get_or_create(iri=public_function.iri) | |
107 | + | |
108 | + for participant in name_surname_match.all(): | |
109 | + participant.functions.add(function_obj) | |
110 | + | |
111 | + def _get_function_term_of_office(self, public_function): | |
112 | + start = None | |
113 | + end = None | |
114 | + if public_function.startTime[0]: | |
115 | + date_parts = public_function.startTime[0].split('-') | |
116 | + if len(date_parts) == 3: | |
117 | + start = datetime.strptime(public_function.startTime[0], '%d-%m-%Y') | |
118 | + elif len(date_parts) == 2: | |
119 | + start = datetime.strptime(public_function.startTime[0], '%m-%Y') | |
120 | + else: | |
121 | + start = datetime.strptime(public_function.startTime[0], '%Y') | |
122 | + if public_function.stopTime[0]: | |
123 | + date_parts = public_function.stopTime[0].split('-') | |
124 | + if len(date_parts) == 3: | |
125 | + end = datetime.strptime(public_function.stopTime[0], '%d-%m-%Y') | |
126 | + elif len(date_parts) == 2: | |
127 | + end = datetime.strptime(public_function.stopTime[0], '%m-%Y') | |
128 | + else: | |
129 | + end = datetime.strptime(public_function.stopTime[0], '%Y') | |
130 | + return start, end | |
131 | + | |
132 | + def _get_house_term_of_office(self, public_function): | |
133 | + start = None | |
134 | + end = None | |
135 | + if public_function.dateFrom: | |
136 | + start = datetime.strptime(public_function.dateFrom.split('T')[0], '%Y-%m-%d') | |
137 | + if public_function.dateTo: | |
138 | + end = datetime.strptime(public_function.dateTo.split('T')[0], '%Y-%m-%d') | |
139 | + return start, end | |
140 | + | |
141 | + def _get_house_and_term(self, public_function): | |
142 | + house = 'Senat' | |
143 | + if public_function.isLowerHouse[0]: | |
144 | + house = 'Sejm' | |
145 | + term = str(public_function).split('_')[-1].replace(house, '').lstrip('-') | |
146 | + return house, term | |
... | ... |