# metadata.py (1.74 KB)
'Retrieve metadata'

from datetime import date
import os
import re


def metadata(headers, filename):
    """Build the metadata dictionary for one file.

    The term number is the part of the file's base name before the first
    underscore and the file id is the part before the first dot.  Date,
    type and house are derived from ``headers``; the remaining entries
    ('title', 'system', 'sessionNo', 'dayNo') are fixed literal values.

    The resulting dictionary is printed to standard output and returned.
    """
    name = os.path.basename(filename)
    term_no = name.partition('_')[0]
    file_id = name.partition('.')[0]

    # Keep the helper call order (date, type, house) in case ``headers``
    # is a one-shot iterable.
    session_date = _date(headers)
    session_type = _type(headers)
    session_house = _house(headers)

    data = {
        'title': 'Tytuł',
        'system': 'III RP',
        'termNo': term_no,
        'sessionNo': '3',
        'dayNo': '1',
        'date': session_date,
        'type': session_type,
        'house': session_house,
        'file_id': file_id,
    }
    print(data)
    return data


def _house(headers):
    'Determine house of Parliament'
    for header in headers:
        if header.who != 'komentarz':
            continue
        if 'Senat' in header.u:
            return 'Senat'
        if 'Sejm' in header.u:
            return 'Sejm'
    return 'Sejm'


def _type(headers):
    'Determine type of session'
    for header in headers:
        if re.search(r'KOMISJA', header.u):
            return 'committee'
        if re.search('z posiedzenia (Sejmu|Senatu)', header.u):
            return 'session'
    return 'session'


def _date(headers):
    'Determine date of the session'
    for header in headers:
        match = re.search(r'dni[ua] (\d+) (\w+) (\d{4})', header.u)
        if match:
            month = {'stycznia': 1, 'lutego': 2, 'marca': 3, 'kwietnia': 4, 'maja': 5, 'czerwca': 6,
                     'lipca': 7, 'sierpnia': 8, 'września': 9, 'października': 10, 'listopada': 11,
                     'grudnia': 12}
            return str(date(int(match.group(3)), month[match.group(2)], int(match.group(1))))
        match = re.search(r'(\d{1,2})\.(\d{1,2})\.(\d{4})', header.u)
        if match:
            return str(date(int(match.group(3)), int(match.group(2)), int(match.group(1))))