speakers.py
1.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
'A list of speakers'
import re
from unicodedata import normalize, combining
class Speakers:
    """Registry that assigns an ID to every speaker label it is asked about.

    Two mappings are maintained:

    * ``names``    -- raw speaker label -> speaker ID
    * ``speakers`` -- speaker ID -> display name (label without the
      trailing bracketed part)

    Iterating over an instance yields ``(speaker_id, display_name)`` pairs.
    """

    # Labels starting with one of these prefixes get the 'chair' role.
    _CHAIR_PREFIXES = ('Marszałek', 'Wicemarszałek')
    # Labels starting with one of these prefixes get the 'speaker' role.
    _SPEAKER_PREFIXES = ('Poseł', 'Posłanka', 'Głos z sali', 'Głosy z sali')
    # Labels that must match exactly to get the 'commentator' role.
    _COMMENTATOR_LABELS = ('komentarz', 'zdarzenie')
    # 'Ł'/'ł' are not split into base letter + combining mark by NFKD, so
    # they are transliterated explicitly. Case is preserved so that a word
    # starting with 'Ł' keeps its capital in the CamelCase ID.
    _STROKE_L = str.maketrans({'Ł': 'L', 'ł': 'l'})

    def __init__(self):
        """Create an empty registry."""
        # speaker ID -> display name
        self.speakers = {}
        # raw speaker label -> speaker ID
        self.names = {}

    def find_id(self, speaker: str) -> str:
        """Return the ID of ``speaker``, registering it on first use.

        On the first call for a given label the ID is generated, remembered
        in ``names`` and the display name is stored in ``speakers`` under
        that ID. Later calls with the same label return the remembered ID.
        Distinct labels that produce the same ID share one ``speakers``
        entry; the most recently registered display name wins.
        """
        speaker_id = self.names.get(speaker)
        if speaker_id is None:
            speaker_id = self._id(speaker)
            self.names[speaker] = speaker_id
            self.speakers[speaker_id] = self._format(speaker)
        return speaker_id

    def role(self, speaker: str) -> str:
        """Return the role of ``speaker`` derived from its label.

        Returns:
            ``'chair'`` if the label starts with a chair prefix,
            ``'speaker'`` if it starts with a speaker prefix,
            ``'commentator'`` if it is exactly ``'komentarz'`` or
            ``'zdarzenie'``, and ``'guest'`` otherwise.
        """
        if speaker.startswith(self._CHAIR_PREFIXES):
            return 'chair'
        if speaker.startswith(self._SPEAKER_PREFIXES):
            return 'speaker'
        if speaker in self._COMMENTATOR_LABELS:
            return 'commentator'
        return 'guest'

    def __iter__(self):
        """Yield ``(speaker_id, display_name)`` for every registered speaker."""
        yield from self.speakers.items()

    def _id(self, speaker: str) -> str:
        """Generate an ID for ``speaker``.

        A label without any space is returned unchanged (note: it is not
        transliterated, so it may still contain non-ASCII characters).
        Otherwise the label is title-cased, bracketed text is removed,
        diacritics are stripped and everything except ASCII letters is
        dropped, e.g. ``'Poseł Łukasz Żółć (PO)'`` -> ``'PoselLukaszZolc'``.
        """
        if ' ' not in speaker:
            return speaker
        without_brackets = re.sub(r'\(.*\)', '', speaker.title())
        # NFKD splits accented letters into base letter + combining mark;
        # dropping the combining marks leaves the plain base letters.
        decomposed = normalize(
            'NFKD', without_brackets.translate(self._STROKE_L))
        base_letters = ''.join(
            char for char in decomposed if not combining(char))
        return re.sub(r'[^A-Za-z]', '', base_letters)

    def _format(self, speaker: str) -> str:
        """Return ``speaker`` without its bracketed part and preceding space."""
        return re.sub(r'\s+\(.*\)', '', speaker)