# metadata.py 6.26 KB
'Retrieve metadata'

import datetime
import os
import re

from committees import committee


def metadata(headers, filename):
    """Create a dictionary with metadata based on paragraph content.

    headers  -- iterable of header paragraphs; the helpers read their
                ``who`` and ``u`` attributes
    filename -- path of the source file; the part of its base name before
                the first underscore becomes ``termNo``

    The returned dictionary holds, in this order: original_file, title,
    house, abbreviation, type, termNo, dayNo, sessionNo, date, system and
    file_id.  Values that could not be determined are reported by
    ``_validate`` before the dictionary is returned.
    """
    basename = os.path.basename(filename)
    title = _title(headers)
    house = _house(headers)
    abbreviation, session_type = _type(house, headers)
    term_no = basename.split('_')[0]
    session_no = _session(headers)
    date = _date(headers)

    data = {
        'original_file': filename,
        'title': title,
        'house': house,
        'abbreviation': abbreviation,
        'type': session_type,
        'termNo': term_no,
        'dayNo': '1',  # the day number is always set to '1' here
        'sessionNo': session_no,
        'date': date,
        'system': _system(date),
    }
    # The identifier is derived from the fields collected above.
    data['file_id'] = _file_id(data)
    _validate(data)
    return data


def _validate(data):
    """Print a warning for each field whose value contains '??'.

    The 'abbreviation' and 'file_id' fields are never checked.  The warning
    is prefixed with data['original_file'].
    """
    unchecked = ('abbreviation', 'file_id')
    for field in data:
        content = data[field]
        if field in unchecked or '??' not in content:
            continue
        print(
            f'{data["original_file"]}: warning: invalid value for {field}: {content}')

def _title(headers):
    """Determine title of the document.

    The title is the text of the first header whose ``who`` is 'komentarz',
    with leading and trailing parentheses removed.  Returns '???' when there
    is no such header.
    """
    comment_texts = (entry.u.strip('()')
                     for entry in headers if entry.who == 'komentarz')
    return next(comment_texts, '???')


def _house(headers):
    """Determine house of Parliament.

    Only headers whose ``who`` is 'komentarz' are inspected.  The first one
    mentioning 'Senat' or 'Sejm' decides ('Senat' wins when both appear in
    the same header).  Falls back to 'Sejm' when nothing is found.
    """
    for entry in headers:
        if entry.who == 'komentarz':
            for name in ('Senat', 'Sejm'):
                if name in entry.u:
                    return name
    return 'Sejm'


def _type(house, headers):
    """Determine type of session.

    house   -- name of the house, used to recognise the phrase
               'z posiedzenia <house>u' (e.g. 'z posiedzenia Sejmu')
    headers -- iterable of header paragraphs; only their ``u`` text is read

    Returns an (abbreviation, type) pair.  For the first header naming a
    committee, team or council (KOMISJA / ZESPÓŁ / RADA) the result comes
    from ``committee(house, name)`` -- presumably also a pair, since the
    caller unpacks two values; verify against ``committees.committee``.
    Plenary sittings, and documents where nothing is recognised, yield
    ('pp', 'Posiedzenie plenarne').
    """
    plenary = ('pp', 'Posiedzenie plenarne')
    committee_pattern = r'((KOMISJA|ZESPÓŁ|RADA) .*?)(?:, obradując|\s\[|\s\()'
    plenary_pattern = f'z posiedzenia {house}u'
    for header in headers:
        match = re.search(committee_pattern, header.u, re.I)
        if match:
            return committee(house, match.group(1))
        if re.search(plenary_pattern, header.u):
            # Must be a pair like the fallback below: the caller unpacks the
            # result into abbreviation and type.
            return plenary
    return plenary


def _date(headers):
    """Determine date of the session.

    Each header's ``u`` text is searched, in this order, for
      * a numeric date               -- 12.02.1980
      * a date with a month name     -- dnia 12 maja 1980,
                                        w dniach 12 i 13 maja 1980, 12 maja 1980
      * a date with a Roman month    -- 12.IV.1980

    Returns the first real calendar date found as 'YYYY-MM-DD', or
    '????-??-??' when no header contains one.

    A candidate that is not a real date (unknown month word, month 13,
    31 February, ...) is skipped and the search continues, so text such as
    '12 IV 1980' or '10 kadencji 1980' no longer aborts with KeyError or
    ValueError.
    """
    def named_month(token):
        # Month names are looked up case-insensitively; None if unknown.
        return _MONTHS.get(token.lower())

    def roman_month(token):
        return _ROMANS.get(token)

    # (pattern, converter for the month group); group 1 is the day,
    # group 2 the month and group 3 the year in every pattern.
    patterns = (
        (r'(\d{1,2})\.(\d{1,2})[\.\s](\d{4})', int),  # 12.02.1980
        (r'dni[ua] (\d{1,2}) (\w+) (\d{4})', named_month),  # dnia 12 maja 1980
        (r'dniach (\d{1,2}) i \d{1,2} (\w+) (\d{4})', named_month),  # first of two days
        (r'(\d{1,2}) (\w+) (\d{4})', named_month),  # 12 maja 1980
        (r'(\d{1,2})[\.\s]([IXV]+)[\.\s](\d{4})', roman_month),  # 12.IV.1980
    )
    for header in headers:
        text = header.u
        for pattern, to_month in patterns:
            for match in re.finditer(pattern, text):
                found = _iso_date(match.group(3), to_month(match.group(2)),
                                  match.group(1))
                if found:
                    return found
    return '????-??-??'


def _iso_date(year, month, day):
    'Return the ISO date string for the given parts, or None if they do not form a real date'
    if month is None:
        return None
    try:
        return str(datetime.date(int(year), month, int(day)))
    except ValueError:
        # datetime.date rejects out-of-range years, months and days.
        return None


def _session(headers):
    """Determine session.

    Only headers whose ``who`` is 'komentarz' are inspected.  For each of
    them, in order:
      1. a number written in digits before 'posiedzenie'/'posiedzenia', or
         in parentheses after a committee/team/council name;
      2. a number written in words before 'posiedzenie'/'posiedzenia',
         converted with ``_numeral``;
      3. a bare committee/team/council name, which counts as session '1'.

    Returns the session number as a string, or '???' if none was found.
    """
    digit_patterns = [r'(\d+) posiedzeni[ae]',
                      r'(?:KOMISJA|ZESPÓŁ|RADA) .*? \((\d+)\)']
    for header in headers:
        if header.who != 'komentarz':
            continue
        for regex in digit_patterns:
            match = re.search(regex, header.u, re.I)
            if match:
                return match.group(1)
        match = re.search(r'(?:^|z )(.*) posiedzeni[ea]', header.u, re.I)
        if match:
            session = _numeral(match.group(1))
            # _numeral returns the string '0' when it recognises no numeral
            # word; '0' is truthy, so it has to be compared explicitly or
            # every unnumbered "... posiedzenia" would become session '0'.
            if session != '0':
                return session
        # Initial session often has no number
        if re.search(r'(KOMISJA|ZESPÓŁ|RADA)\s', header.u, re.I):
            return '1'
    return '???'


def _system(date):
    """Return system based on date.

    date -- date string in 'YYYY-MM-DD' form

    Returns 'II RP' for dates before 1940, 'PRL' for dates before July 1989
    and 'III RP' otherwise.  The comparison is lexicographic, so the
    boundaries must use the same '-' separator as the date: '-' sorts
    before '.', which made the former '1989.07' boundary classify every
    1989 date as 'PRL'.

    A date that does not start with a digit (for example the
    '????-??-??' placeholder) compares greater than both boundaries and
    therefore yields 'III RP'.
    """
    if date < '1940':
        return 'II RP'
    if date < '1989-07':
        return 'PRL'
    return 'III RP'


def _file_id(data):
    """Build the file identifier from already collected metadata.

    Reads the 'house', 'date', 'sessionNo', 'dayNo', 'abbreviation' and
    'original_file' entries of ``data``.  The identifier has the form
    '<term start><last two digits of term end>-<house code>-<abbreviation
    padded with x to 5 characters>-<session, 5 digits>-<day, 2 digits>'.

    A session number that is not an integer is reported with a printed
    warning and replaced by 1.
    """
    term_start, term_end = _years(data['house'], data['date'])
    house_code = _house_codes(data['house'])
    try:
        session_no = int(data['sessionNo'])
    except ValueError:
        print(f'Warning: wrong session in {data["original_file"]}')
        session_no = 1
    day_no = int(data['dayNo'])
    # Pad short abbreviations with 'x' up to five characters.
    padding = 'x' * (5 - len(data['abbreviation']))
    parts = (
        term_start + term_end[2:],
        house_code,
        data['abbreviation'] + padding,
        f'{session_no:05d}',
        f'{day_no:02d}',
    )
    return '-'.join(parts)


def _years(house, date):
    """Return years of given term.

    house -- 'Senat' selects the Senat boundaries; any other value selects
             the Sejm boundaries
    date  -- date string in 'YYYY-MM-DD' form

    Returns a (start year, end year) pair of four-character strings for the
    term containing ``date``.  A term runs from one boundary date (inclusive)
    up to the next boundary date (exclusive).  Dates before the first
    boundary, on or after the last one, or not comparable to a date at all
    (such as '????-??-??') yield ('????', '????').
    """
    sejm = ['1919-02-10', '1922-11-28', '1928-03-27', '1930-12-09', '1935-10-04', '1938-11-28',
            '1947-02-04', '1952-11-20', '1957-02-20', '1961-04-16', '1965-05-30', '1969-06-01',
            '1972-03-19', '1976-03-21', '1980-03-23', '1985-10-13', '1989-06-18', '1991-11-25',
            '1993-09-19', '1997-10-20', '2001-10-19', '2005-10-19', '2007-11-05', '2011-11-08',
            '2015-11-12', '2019-11-12', '2023-11-12']
    # NOTE(review): this list has no boundaries between 1993-10-15 and
    # 2005-10-20 and contains the adjacent dates 1991-11-25 / 1991-11-26 --
    # confirm whether that is intended.
    senat = ['1922-11-12', '1928-03-11', '1930-11-23', '1935-10-04', '1938-11-13', '1991-11-25',
             '1991-11-26', '1993-10-15', '2005-10-20', '2007-11-05', '2011-11-08', '2015-11-12',
             '2019-11-12', '2023-11-12']
    unknown = ('????', '????')
    dates = senat if house == 'Senat' else sejm
    for (index, start) in enumerate(dates):
        # Strict comparison: a sitting held on a boundary date belongs to the
        # term that starts on that date, not to the one that just ended.
        if date < start:
            if index == 0:
                # Earlier than the first known term; dates[index - 1] would
                # wrap around to the last entry.
                return unknown
            return (dates[index - 1][:4], start[:4])
    return unknown


def _house_codes(house):
    """Return the file ID section for given house.

    Known houses are 'Sejm', 'Senat' and 'Krajowa Rada Narodowa'; any other
    name raises KeyError.
    """
    return {
        'Sejm': 'sjm',
        'Senat': 'snt',
        'Krajowa Rada Narodowa': 'krn',
    }[house]


def _numeral(text):
    """Convert text numerals to numbers.

    The text is lower-cased and split on whitespace; every word that starts
    with one of the ``_NUMERALS`` prefixes contributes that prefix's value.
    Returns the sum as a string, which is '0' when no word matched.
    """
    total = sum(
        value
        for word in text.lower().split()
        for prefix, value in _NUMERALS.items()
        if word.startswith(prefix)
    )
    return str(total)


# Word prefixes of Polish numerals and the values they stand for; matched
# with str.startswith, so one prefix covers several word endings.
_NUMERALS = {
    # units
    'pierwsz': 1,
    'drugi': 2,
    'trzeci': 3,
    'czwart': 4,
    'piąt': 5,
    'szóst': 6,
    'siódm': 7,
    'ósm': 8,
    'dziewiąt': 9,
    'dziesiąt': 10,
    # teens
    'jedenast': 11,
    'dwunast': 12,
    'trzynast': 13,
    'czternast': 14,
    'piętnast': 15,
    'szesnast': 16,
    'siedemnast': 17,
    'osiemnast': 18,
    'dziewiętnast': 19,
    # tens
    'dwudziest': 20,
    'trzydziest': 30,
    'czterdziest': 40,
    'pięćdziesiąt': 50,
    'sześćdziesiąt': 60,
    'siedemdziesiąt': 70,
    'osiemdziesiąt': 80,
    'dziewięćdziesiąt': 90,
}

# Polish month names as they appear in dates such as '12 maja 1980',
# mapped to month numbers 1-12.
_MONTHS = {
    name: number
    for number, name in enumerate(
        ('stycznia', 'lutego', 'marca', 'kwietnia', 'maja', 'czerwca',
         'lipca', 'sierpnia', 'września', 'października', 'listopada',
         'grudnia'),
        start=1)
}

# Roman numerals I-XII mapped to month numbers 1-12.
_ROMANS = {
    numeral: number
    for number, numeral in enumerate(
        ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI',
         'XII'),
        start=1)
}