# metadata.py 7.12 KB
'Retrieve metadata'

import datetime
import re

from committees import committee


def metadata(headers, filename):
    """Assemble the metadata dictionary for one document.

    headers: header paragraphs; they are passed unchanged to the
        ``_house``, ``_type``, ``_date``, ``_session`` and ``_title`` helpers.
    filename: stored as-is under ``original_file``.

    Returns a dict with the keys ``original_file``, ``house``,
    ``abbreviation``, ``type``, ``date``, ``termNo``, ``dayNo``,
    ``sessionNo``, ``system``, ``title`` and ``file_id``, inserted in that
    order.
    """
    house = _house(headers)
    abbreviation, kind = _type(house, headers)
    date = _date(headers)
    data = {
        'original_file': filename,
        'house': house,
        'abbreviation': abbreviation,
        'type': kind,
        'date': date,
        'termNo': _term(house, date),
        # The day number is a fixed value here.
        'dayNo': '1',
        'sessionNo': _session(headers),
        'system': _system(date),
    }
    # Title and file ID are derived from the entries collected above.
    data['title'] = _title(headers, data)
    data['file_id'] = _file_id(data)
    return data


def validate(data):
    """Print a warning for every metadata value that contains '??'.

    The ``abbreviation`` and ``file_id`` entries are never reported.
    ``data['original_file']`` is read to prefix each warning line.
    Nothing is returned.
    """
    unchecked = ('abbreviation', 'file_id')
    for key, value in data.items():
        if key in unchecked:
            continue
        if '??' in value:
            source = data["original_file"]
            print(f"{source}: warning: invalid value for {key}: '{value}'")

def _title(headers, data):
    """Pick the document title.

    When ``data['abbreviation']`` is ``'ppxxx'`` the title is the text of
    the first header whose ``who`` is ``'komentarz'``, with leading and
    trailing parentheses stripped, or ``'???'`` if there is no such header.

    Otherwise the title is ``data['type']``, followed by the session
    number in parentheses unless that number contains a ``'?'``.
    """
    if data['abbreviation'] == 'ppxxx':
        comment = next((h for h in headers if h.who == 'komentarz'), None)
        if comment is None:
            return '???'
        return comment.u.strip('()')
    session = data["sessionNo"]
    return data["type"] if '?' in session else f'{data["type"]} ({session})'


def _house(headers):
    """Work out which house the document belongs to.

    Only headers whose ``who`` is ``'komentarz'`` are inspected. The first
    one whose text contains 'Senat' or 'Sejm' decides the result, with
    'Senat' checked before 'Sejm'. Falls back to 'Sejm' when nothing
    matches.
    """
    comment_texts = (h.u for h in headers if h.who == 'komentarz')
    for text in comment_texts:
        for name in ('Senat', 'Sejm'):
            if name in text:
                return name
    return 'Sejm'


def _type(house, headers):
    """Determine the kind of sitting described by the headers.

    house: house name; it is interpolated into the plenary-session pattern
        and passed on to ``committee``.
    headers: header paragraphs; only their ``u`` text is examined.

    Returns a 2-tuple that ``metadata`` stores as ``(abbreviation, type)``:
    * the result of ``committee(house, name)`` for the first header that
      names a committee-like body (KOMISJA / ZESPÓŁ / RADA, any case);
    * ``('pp', 'Posiedzenie plenarne')`` for a header mentioning a sitting
      of the house;
    * ``('??', '????')`` when neither is found before the ``[TREŚĆ]``
      marker or the end of the headers.
    """
    committee_pattern = r'((KOMISJA|ZESPÓŁ|RADA) .*?)(?:,? obradując|,? pod przewodnictwem|,?któr| oraz| komisja| i komisja| z udziałem|\s\[|\s\(|$)'
    plenary_pattern = f'(z posiedzenia|posiedzenie) {house}u'
    for header in headers:
        text = header.u
        match = re.search(committee_pattern, text, re.I)
        if match:
            return committee(house, match.group(1))
        if re.search(plenary_pattern, text):
            return ('pp', 'Posiedzenie plenarne')
        # Look for the marker in the header text, like every other check
        # in this module; testing the header object itself is a TypeError
        # for objects that do not support the ``in`` operator.
        if '[TREŚĆ]' in text:
            break
    return ('??', '????')


def _date(headers):
    """Determine the date of the session from the header texts.

    Each header's ``u`` text is tried against three notations, in order:
    numeric (``12.02.1980``), month written as a word (``12 maja 1980``,
    optionally introduced by ``dnia``/``dniu``/``dniach``) and month as a
    Roman numeral (``12.IV.1980``). The first usable hit is passed to
    ``_format_date`` and its result is returned.

    A word that is not a key of ``_MONTHS``, or a Roman numeral that is not
    a key of ``_ROMANS``, is skipped instead of raising ``KeyError``; the
    search then continues with the remaining candidates and headers.

    Returns ``'????-??-??'`` when no header yields a date.
    """
    numeric_pattern = r'(\d{1,2})\.(\d{1,2})[\.\s](\d{4})'
    # Ordered from the most specific wording to the most generic one.
    word_patterns = (
        r'dni[ua] (\d{1,2}) ([a-ząćęłńóśćżź]+) (\d{4})',
        r'dniach (\d{1,2}) i \d{1,2} ([a-ząćęłńóśćżź]+) (\d{4})',
        r'(\d{1,2}) ([a-ząćęłńóśćżź]+) (\d{4})',
    )
    roman_pattern = r'(\d{1,2})[\.\s]([IXV]+)[\.\s](\d{4})'

    for header in headers:
        text = header.u
        match = re.search(numeric_pattern, text)
        if match:  # 12.02.1980
            return _format_date(match.group(3), match.group(2), match.group(1))
        for pattern in word_patterns:  # 12 maja 1980
            for match in re.finditer(pattern, text):
                month = _MONTHS.get(match.group(2))
                # The generic pattern also captures ordinary words between
                # two numbers; only real month names are accepted.
                if month is not None:
                    return _format_date(match.group(3), month, match.group(1))
        for match in re.finditer(roman_pattern, text):  # 12.IV.1980
            month = _ROMANS.get(match.group(2))
            if month is not None:
                return _format_date(match.group(3), month, match.group(1))
    return '????-??-??'


def _format_date(year, month, day):
    """Return the date as a ``YYYY-MM-DD`` string.

    Each argument is converted with ``int``. If the conversion or the date
    construction raises ``ValueError``, a warning is printed and
    ``'????-??-??'`` is returned instead.
    """
    try:
        parsed = datetime.date(int(year), int(month), int(day))
    except ValueError:
        print(f'Warning: bad date {year}-{month}-{day}')
        return '????-??-??'
    return parsed.isoformat()


def _session(headers):
    """Determine the session number from the header texts.

    For each header's ``u`` text, in order:
    1. digits directly before ``posiedzenie``/``posiedzenia``, or digits in
       parentheses after a KOMISJA / ZESPÓŁ / RADA name, are returned as
       they appear;
    2. otherwise the words before ``posiedzenie``/``posiedzenia`` are
       converted with ``_numeral`` and the result is returned if it is a
       real number.

    Returns the number as a string, or ``'???'`` when no header provides
    one (an initial session often carries no number).
    """
    digit_patterns = (r'(\d+) posiedzeni[ae]',
                      r'(?:KOMISJA|ZESPÓŁ|RADA) .*?\((\d+)\)')
    for header in headers:
        text = header.u
        for regex in digit_patterns:
            match = re.search(regex, text, re.I)
            if match:
                return match.group(1)
        match = re.search(r'(?:^|z )(.*) posiedzeni[ea]', text, re.I)
        if match:
            session = _numeral(match.group(1))
            # _numeral answers '0' when it recognizes no numeral word; that
            # string is truthy, so it has to be rejected explicitly or
            # every unnumbered sitting would be reported as session 0.
            if session and session != '0':
                return session
    return '???'


def _system(date):
    """Return the political system in force on the given date.

    date: date string in ``YYYY-MM-DD`` form, compared lexicographically.

    Returns ``'II RP'`` for dates before 1940, ``'PRL'`` for dates before
    July 1989 and ``'III RP'`` otherwise. The ``'????-??-??'`` placeholder
    sorts after every digit string and therefore also yields ``'III RP'``.
    """
    if date < '1940':
        return 'II RP'
    # The threshold must use the same '-' separator as the dates: with a
    # '.' separator every 1989 date compared as smaller, because '-' sorts
    # before '.', and the whole year was classified as PRL.
    if date < '1989-07':
        return 'PRL'
    return 'III RP'


def _file_id(data):
    """Build the file identifier from already collected metadata.

    Reads ``house``, ``date``, ``dayNo``, ``abbreviation`` and
    ``sessionNo`` from ``data``. The result has five dash-separated parts:
    the start year followed by the last two digits of the end year (both
    from ``_years``), the house code from ``_house_codes``, the
    abbreviation right-padded with 'x' to five characters, the session
    number zero-padded to five digits ('?????' if it is not an integer)
    and the day number zero-padded to two digits.
    """
    first_year, last_year = _years(data['house'], data['date'])
    house_code = _house_codes(data['house'])
    day_number = int(data['dayNo'])
    padded_abbreviation = data['abbreviation'].ljust(5, 'x')
    try:
        session_part = f'{int(data["sessionNo"]):05d}'
    except ValueError:
        # Placeholder session numbers such as '???' are not integers.
        session_part = '?????'
    parts = [
        f'{first_year}{last_year[2:]}',
        house_code,
        padded_abbreviation,
        session_part,
        f'{day_number:02d}',
    ]
    return '-'.join(parts)


def _years(house, date):
    """Return the (start, end) years of the term containing ``date``.

    The keys of ``_YEARS[house]`` are walked in their stored order. The
    first key that is >= ``date`` gives the end year; the key before it
    gives the start year, which is an empty string when that first key is
    the very first entry. Both are the first four characters of the key.

    Returns ``('????', '????')`` when ``date`` is later than every key.
    """
    boundaries = list(_YEARS[house])
    preceding = [''] + boundaries[:-1]
    for earlier, later in zip(preceding, boundaries):
        if date <= later:
            return (earlier[:4], later[:4])
    return ('????', '????')


def _term(house, date):
    """Return the term number for the given date, as a string.

    The entries of ``_YEARS[house]`` are scanned in their stored order and
    the value of the first key that is >= ``date`` is returned. ``'???'``
    is returned when ``date`` is later than every key.
    """
    candidates = (
        str(number)
        for boundary, number in _YEARS[house].items()
        if date <= boundary
    )
    return next(candidates, '???')


def _house_codes(house):
    """Return the three-letter file ID segment for a house name.

    Known names are 'Sejm', 'Senat' and 'Krajowa Rada Narodowa'; any other
    name raises ``KeyError``.
    """
    pairs = (
        ('Sejm', 'sjm'),
        ('Senat', 'snt'),
        ('Krajowa Rada Narodowa', 'krn'),
    )
    return dict(pairs)[house]


def _numeral(text):
    """Convert a numeral written in words to a digit string.

    The text is lower-cased and split on whitespace. Every word that
    starts with a key of ``_NUMERALS`` contributes that key's value, and
    the sum is returned as a string. Text without any recognized word
    therefore gives ``'0'``.
    """
    total = sum(
        value
        for word in text.lower().split()
        for stem, value in _NUMERALS.items()
        if word.startswith(stem)
    )
    return str(total)


# Stems of Polish ordinal numerals mapped to their numeric value. _numeral
# matches them with str.startswith, so a single stem covers all inflected
# endings, and it adds up the values of every matching word (tens + units).
# The table covers units, teens and tens only; there is no entry for
# hundreds.
_NUMERALS = {'pierwsz': 1, 'drugi': 2, 'trzeci': 3, 'czwart': 4, 'piąt': 5,
             'szóst': 6, 'siódm': 7, 'ósm': 8, 'dziewiąt': 9, 'dziesiąt': 10,
             'jedenast': 11, 'dwunast': 12, 'trzynast': 13, 'czternast': 14,
             'piętnast': 15, 'szesnast': 16, 'siedemnast': 17, 'osiemnast': 18,
             'dziewiętnast': 19, 'dwudziest': 20, 'trzydziest': 30, 'czterdziest': 40,
             'pięćdziesiąt': 50, 'sześćdziesiąt': 60, 'siedemdziesiąt': 70,
             'osiemdziesiąt': 80, 'dziewięćdziesiąt': 90}

# Polish month names mapped to the month number, two spellings per month
# (the form used inside a date, e.g. 'maja', and the base form, e.g. 'maj').
# _date looks up the lower-case word captured between the day and the year.
_MONTHS = {'stycznia': 1, 'styczeń': 1, 'lutego': 2, 'luty': 2,
           'marca': 3, 'marzec': 3, 'kwietnia': 4, 'kwiecień': 4,
           'maja': 5, 'maj': 5, 'czerwca': 6, 'czerwiec': 6,
           'lipca': 7, 'lipiec': 7, 'sierpnia': 8, 'sierpień': 8,
           'września': 9, 'wrzesień': 9, 'października': 10, 'październik': 10,
           'listopada': 11, 'listopad': 11, 'grudnia': 12, 'grudzień': 12,
           }

# Roman numerals I-XII mapped to the month number; used by _date for dates
# written like 12.IV.1980.
_ROMANS = {'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6, 'VII': 7, 'VIII': 8, 'IX': 9,
           'X': 10, 'XI': 11, 'XII': 12}

# Term boundaries per house: ISO date -> term number.
# _term returns the number stored under the first key that is >= the date,
# so each key acts as the last date attributed to that term. _years uses
# the same key and the one before it as the end and start year of the term.
# Both functions rely on the entries being listed in chronological order.
# NOTE(review): the Senat keys '1991-11-25' and '1991-11-26' are one day
# apart, and the Senat table has fewer entries after 1991 than the Sejm
# table - confirm these dates against the intended term boundaries.
_YEARS = {
    'Sejm': {'1919-02-10': 1, '1922-11-28': 2, '1928-03-27': 3, '1930-12-09': 4,
             '1935-10-04': 5, '1938-11-28': 6, '1947-02-04': 1, '1952-11-20': 1,
             '1957-02-20': 1, '1961-04-16': 2, '1965-05-30': 3, '1969-06-01': 4,
             '1972-03-19': 5, '1976-03-21': 6, '1980-03-23': 7, '1985-10-13': 8,
             '1989-06-18': 9, '1991-11-25': 10, '1993-09-19': 1, '1997-10-20': 2,
             '2001-10-19': 3, '2005-10-19': 4, '2007-11-05': 5, '2011-11-08': 6,
             '2015-11-12': 7, '2019-11-12': 8, '2023-11-12': 9},
    'Senat': {'1922-11-12': 1, '1928-03-11': 2, '1930-11-23': 3, '1935-10-04': 4,
              '1938-11-13': 5, '1991-11-25': 1, '1991-11-26': 2, '1993-10-15': 3,
              '2005-10-20': 4, '2007-11-05': 5, '2011-11-08': 6, '2015-11-12': 7,
              '2019-11-12': 8, '2023-11-12': 9},
}