metadata.py
1.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
'Retrieve metadata'
from datetime import date
import os
import re
def metadata(headers, filename):
    """Build the metadata dictionary for one transcript file.

    The term number is the part of the file's base name before the first
    underscore and the file id is the part before the first dot.  The date,
    session type and house are derived from ``headers`` by the private
    helpers of this module; title, system, session number and day number
    are fixed values.  The resulting dictionary is printed to stdout and
    then returned.
    """
    name = os.path.basename(filename)
    # partition(sep)[0] yields the same prefix as split(sep)[0].
    term_no = name.partition('_')[0]
    file_id = name.partition('.')[0]

    result = {
        'title': 'Tytuł',
        'system': 'III RP',
        'termNo': term_no,
        'sessionNo': '3',
        'dayNo': '1',
    }
    # Helpers are invoked in the same order as the keys are inserted.
    result['date'] = _date(headers)
    result['type'] = _type(headers)
    result['house'] = _house(headers)
    result['file_id'] = file_id

    print(result)
    return result
def _house(headers):
    """Determine the house of Parliament.

    Only headers whose ``who`` attribute equals 'komentarz' are inspected.
    The first such header whose text (``u``) mentions 'Senat' or 'Sejm'
    decides the result, with 'Senat' checked first.  Falls back to 'Sejm'
    when no header gives a hint.
    """
    comment_texts = (entry.u for entry in headers if entry.who == 'komentarz')
    for text in comment_texts:
        for chamber in ('Senat', 'Sejm'):
            if chamber in text:
                return chamber
    return 'Sejm'
def _type(headers):
    """Determine the type of session.

    Headers are examined in order; for each one the committee marker is
    tried before the plenary-session phrase, and the first hit decides the
    result.  Defaults to 'session' when nothing matches.
    """
    rules = (
        (r'KOMISJA', 'committee'),
        ('z posiedzenia (Sejmu|Senatu)', 'session'),
    )
    for entry in headers:
        for pattern, label in rules:
            if re.search(pattern, entry.u):
                return label
    return 'session'
def _date(headers):
    """Determine the date of the session.

    Each header's text (``u``) is searched, in order, for a date written
    either with a Polish month name in the genitive ("dnia 5 maja 2001" /
    "dniu 5 maja 2001") or numerically as day.month.year ("5.05.2001").
    Within one header the worded form takes precedence over the numeric one.

    Candidates that cannot be turned into a real calendar date (unknown
    month word, impossible day/month combination) are skipped instead of
    raising, so one malformed header does not abort the whole search.

    Returns:
        The first valid date as an ISO 'YYYY-MM-DD' string, or None when no
        header contains a usable date.
    """
    # Built once per call rather than once per matching header.
    months = {
        'stycznia': 1, 'lutego': 2, 'marca': 3, 'kwietnia': 4,
        'maja': 5, 'czerwca': 6, 'lipca': 7, 'sierpnia': 8,
        'września': 9, 'października': 10, 'listopada': 11, 'grudnia': 12,
    }

    def iso(year, month, day):
        # date() raises ValueError for impossible dates such as 31.02.
        try:
            return str(date(int(year), int(month), int(day)))
        except ValueError:
            return None

    for header in headers:
        text = header.u

        # Worded form; the word after the day may be something other than
        # a month name, so unknown words are ignored.
        for match in re.finditer(r'dni[ua] (\d+) (\w+) (\d{4})', text):
            month = months.get(match.group(2).lower())
            if month is None:
                continue
            found = iso(match.group(3), month, match.group(1))
            if found is not None:
                return found

        # Numeric form: day.month.year.
        for match in re.finditer(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', text):
            found = iso(match.group(3), match.group(2), match.group(1))
            if found is not None:
                return found

    return None