# add_additional_ne.py
import argparse
import os
import re


# Regular expressions (anchored with ^...$) used to recognise additional named
# entities in detokenised sentence text. All patterns are raw strings so that
# escapes such as \d and \. reach the `re` module verbatim — non-raw strings
# with unknown escapes raise SyntaxWarning on Python 3.12+ and will become
# errors in a future release.
REGEXPS = {
    'EMAIL': r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$',
    'URL': r'^(?:(?:https?|ftp|sftp|file)://|(www|ftp)\.)[-A-Za-z0-9+&@#/%?=~_|$!:,.;()]*$',
    'IP': r'^(?:(?:25[0-5]|2[0-4]\d|1?\d?\d)(?:\.(?!$)|$)){4}$',
    'IBAN': r'^(AD\d{10}[A-Z0-9]{12}|AE\d{21}|AL\d{10}[A-Z0-9]{16}|AT\d{18}|AZ\d{2}[A-Z]{4}[A-Z0-9]{20}|BA\d{18}|'
            r'BE\d{14}|BG\d{2}[A-Z]{4}\d{6}[A-Z0-9]{8}|BH\d{2}[A-Z]{4}[A-Z0-9]{14}|BR\d{25}[A-Z]{1}[A-Z0-9]{1}|'
            r'BY\d{2}[A-Z0-9]{4}\d{4}[A-Z0-9]{16}|CH\d{7}[A-Z0-9]{12}|CR\d{20}|CY\d{10}[A-Z0-9]{16}|CZ\d{22}|DE\d{20}|'
            r'DK\d{16}|DO\d{2}[A-Z0-9]{4}\d{20}|EE\d{18}|EG\d{27}|ES\d{22}|FI\d{16}|FO\d{16}|FR\d{12}[A-Z0-9]{11}\d{2}|'
            r'GB\d{2}[A-Z]{4}\d{14}|GE\d{2}[A-Z]{2}\d{16}|GI\d{2}[A-Z]{4}[A-Z0-9]{15}|GL\d{16}|GR\d{9}[A-Z0-9]{16}|'
            r'GT\d{2}[A-Z0-9]{24}|HR\d{19}|HU\d{26}|IE\d{2}[A-Z]{4}\d{14}|IL\d{21}|IQ\d{2}[A-Z]{4}\d{15}|IS\d{24}|'
            r'IT\d{2}[A-Z]{1}\d{10}[A-Z0-9]{12}|JO\d{2}[A-Z]{4}\d{4}[A-Z0-9]{18}|KW\d{2}[A-Z]{4}[A-Z0-9]{22}|'
            r'KZ\d{5}[A-Z0-9]{13}|LB\d{6}[A-Z0-9]{20}|LC\d{2}[A-Z]{4}[A-Z0-9]{24}|LI\d{7}[A-Z0-9]{12}|LT\d{18}|'
            r'LU\d{5}[A-Z0-9]{13}|LV\d{2}[A-Z]{4}[A-Z0-9]{13}|MC\d{12}[A-Z0-9]{11}\d{2}|MD\d{2}[A-Z0-9]{20}|ME\d{20}|'
            r'MK\d{5}[A-Z0-9]{10}\d{2}|MR\d{25}|MT\d{2}[A-Z]{4}\d{5}[A-Z0-9]{18}|MU\d{2}[A-Z]{4}\d{19}[A-Z]{3}|'
            r'NL\d{2}[A-Z]{4}\d{10}|NO\d{13}|PK\d{2}[A-Z]{4}[A-Z0-9]{16}|PL\d{26}|PS\d{2}[A-Z]{4}[A-Z0-9]{21}|PT\d{23}|'
            r'QA\d{2}[A-Z]{4}[A-Z0-9]{21}|RO\d{2}[A-Z]{4}[A-Z0-9]{16}|RS\d{20}|SA\d{4}[A-Z0-9]{18}|'
            r'SC\d{2}[A-Z]{4}\d{20}[A-Z]{3}|SE\d{22}|SI\d{17}|SK\d{22}|SM\d{2}[A-Z]{1}\d{10}[A-Z0-9]{12}|ST\d{23}|'
            r'SV\d{2}[A-Z]{4}\d{20}|TL\d{21}|TN\d{22}|TR\d{8}[A-Z0-9]{16}|UA\d{8}[A-Z0-9]{19}|VA\d{20}|'
            r'VG\d{2}[A-Z]{4}\d{16}|XK\d{18})$',
    # # vat/tax number https://www.oreilly.com/library/view/regular-expressions-cookbook/9781449327453/ch04s21.html
    # 'ID': '^((AT)?U[0-9]{8}|(BE)?0[0-9]{9}|(BG)?[0-9]{9,10}|(CY)?[0-9]{8}L|(CZ)?[0-9]{8,10}|(DE)?[0-9]{9}|(DK)?[0-9]{8}|'
    #       '(EE)?[0-9]{9}|(EL|GR)?[0-9]{9}|(ES)?[0-9A-Z][0-9]{7}[0-9A-Z]|(FI)?[0-9]{8}|(FR)?[0-9A-Z]{2}[0-9]{9}|'
    #       '(GB)?([0-9]{9}([0-9]{3})?|[A-Z]{2}[0-9]{3})|(HU)?[0-9]{8}|(IE)?[0-9]S[0-9]{5}L|(IT)?[0-9]{11}|'
    #       '(LT)?([0-9]{9}|[0-9]{12})|(LU)?[0-9]{8}|(LV)?[0-9]{11}|(MT)?[0-9]{8}|(NL)?[0-9]{9}B[0-9]{2}|(PL)?[0-9]{10}|'
    #       '(PT)?[0-9]{9}|(RO)?[0-9]{2,10}|(SE)?[0-9]{12}|(SI)?[0-9]{8}|(SK)?[0-9]{10})|'
    # # personal identification https://ipsec.pl/european-personal-data-regexp-patterns.html
    #       '[0-9]{12}|[0-9]{10}|[A-Za-z0-9+/]{22}[A-Za-z0-9+/=][A-Za-z0-9+/=]|'
    #       '[0-9]{2}\.?[0-9]{2}\.?[0-9]{2}-[0-9]{3}\.?[0-9]{2}|[0-9]{2}[0,1,2,4][0-9][0-9]{2}[0-9]{4}|'
    #       '[0-9]{2}[0,1,5][0-9][0-9]{2}/?[0-9]{4}|[A-Z]{2}[0-9]{6}|[0-9]{2}[0,1][0-9][0-9]{2}-[0-9]{4}|'
    #       '[1-6][0-9]{2}[1,2][0-9][0-9]{2}[0-9]{4}|[A-Z]{2}?[ ]?[0-9]{2}[ ]?[0-9]{4}[ ]?[0-9]{4}[ ]?[0-9]{4}[ ]?'
    #       '[0-9]{4}[ ]?[0-9]{4}|[0-9]{2}\.?[0,1][0-9]\.?[0-9]{2}[-+A][0-9]{3}[A-Z]|'
    #       '[1,2][ ]?[0-9]{2}[ ]?[0,1,2,3,5][0-9][ ]?[0-9A-Z]{5}[ ]?[0-9]{3}[ ]?[0-9]{2}|'
    #       '[0-9]{2}[0,1][0-9][0-9]{2}-[A-Z]-[0-9]{5}|[0-9]{3}/?[0-9]{4}/?[0-9]{4}|'
    #       '[0-9]{2}[0-9]{2}[0,1][0-9][0-9]{2}[A-Z][0-9]{2}[0-9]|[A-Z][ -]?[0-9]{6}|[0-9]{3}[ ]?[0-9]{3}[ ][0-9]{3}|'
    #       '[1-8][ ]?[0-9]{2}[0,1][0-9][0-9]{2}[ ]?[0-9]{4}|[0-9]{7}[A-Z]W?|'
    #       '[A-Z]{6}[0-9]{2}[A-E,H,L,M,P,R-T][0-9]{2}[A-Z0-9]{5}|'
    #       '[0-9]{2}[0,1][0-9][0-9]-[0-9]{5}|[3-6][0-9]{2}[0,1][0-9][0-9]{2}[0-9]{4}|[0-9]{9}|'
    #       '[0-9]{2}[0,1][0-9][0-9]{2}[ ]?[0-9]{5}|[0-9]{4}[0-3]{1}[0-9}{1}[0-9]{5}|'
    #       '[1-8][0-9]{2}[0,1][0-9][0-9]{2}[0-9]{6}|[0-9,X,M,L,K,Y][0-9]{7}[A-Z]|[0-9]{2}[0-1][0-9][0-9]{2}[-+][0-9]{4}|'
    #       '[0-9]{3}\.?[0-9]{2}\.?[0-9]{3}\.?[0-9]{3}|756\.?[0-9]{4}\.?[0-9]{4}\.?[0-9]{2}|'
    #       '[A-CEGHJ-PR-TW-Z][A-CEGHJ-NPR-TW-Z]{1}[0-9]{6}[A-DFM]?|^([ACEHJLMOPRSW-Yacehjlmoprsw-y]'
    #       '[A-CEGHJ-NPRSTW-Za-ceghj-nprstw-z]|[Bb][A-CEHJ-NPRSTW-Za-cehj-nprstw-z]|[Gg][ACEGHJ-NPRSTW-Zaceghj-nprstw-z]'
    #       '|[Kk][A-CEGHJ-MPRSTW-Za-ceghj-mprstw-z]|[Nn][A-CEGHJLMNPRSW-Za-ceghjlmnprsw-z]|'
    #       '[Tt][A-CEGHJ-MPRSTW-Za-ceghj-mprstw-z]|[Zz][A-CEGHJ-NPRSTW-Ya-ceghj-nprstw-y])[0-9]{6}[A-Da-d ]?$|'
    #       '[0-9]{3}[ -]?[0-9]{3}[ -]?[0-9]{4}$',
    # # https://ihateregex.io/expr/phone/
    'PHONE': r'^[\+]?[(]?[0-9]{3}[)]?[-\s\.]?[0-9]{3}[-\s\.]?[0-9]{4,6}$',
    # 'GPS': '^(-?[1-8]?\d(?:\.\d{1,18})?|90(?:\.0{1,18})?),\s+?(-?(?:1[0-7]|[1-9])?\d(?:\.\d{1,18})?|180(?:\.0{1,18})?)$'
}

def main():
    """Entry point: parse CLI arguments and augment the corpora with extra NEs."""
    args = parse_arguments()
    if not args.input or not args.output:
        # Bail out early: proceeding with a missing path would make
        # add_ne_by_year crash on os.path.join(None, ...).
        print('Error: Input and output must be selected!')
        return
    add_ne_by_year(args.input, args.output)


def parse_arguments():
    """Build the CLI parser and return the parsed command-line arguments."""
    arg_parser = argparse.ArgumentParser(description='Add additional NE to the corpora.')
    arg_parser.add_argument('-o', '--output', help='output directory')
    required = arg_parser.add_argument_group('required arguments')
    required.add_argument('-i', '--input', help='corpora root directory', required=True)
    return arg_parser.parse_args()


def add_ne_by_year(root_directory, out_corpora_directory):
    """Walk *root_directory* and write an NE-augmented copy of every
    CoNLL-U(-Plus) file under *out_corpora_directory*/<year>/.

    The year subdirectory comes from the file's PublicationDate metadata
    ('0' when no date is present, see get_year).
    """
    for root, _dirs, files in os.walk(root_directory):
        for filename in files:
            # Tuple form of endswith checks both extensions in one call.
            if filename.endswith(('.conllup', '.conllu')):
                src = os.path.join(root, filename)
                year = get_year(src)
                year_path = os.path.join(out_corpora_directory, year)
                os.makedirs(year_path, exist_ok=True)
                dst = os.path.join(year_path, filename)
                add_nes(src, dst)


def get_year(filepath):
    """Return the publication year of a CoNLL-U file as a string.

    Scans the '#' metadata lines for a 'PublicationDate = YYYY[-MM-DD]' entry
    and returns its year component; returns '0' when no date is found.
    """
    # Explicit encoding: corpus files are expected to be UTF-8, so do not
    # depend on the (platform-specific) locale default.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line):
                # Token lines start with a digit and cannot carry metadata.
                continue
            elif is_metadata(line):
                name, value = get_metadata(line)
                if name == 'PublicationDate':
                    return value.split('-')[0]
    return '0'


def is_segment(line):
    """Return True when *line* is a token (segment) line, i.e. starts with a digit."""
    return bool(line) and line[0].isdigit()


def is_metadata(line):
    """Return True when *line* is a metadata/comment line (starts with '#')."""
    return line.startswith('#')


def get_metadata(line):
    """Split a metadata line '# name = value' into a (name, value) tuple.

    Uses str.partition so that metadata lines without an '=' sign
    (e.g. '# newdoc') yield an empty value instead of raising IndexError
    as the previous split-based implementation did.
    """
    name, _sep, value = line.partition('=')
    return name.lstrip('#').strip(), value.strip()


def add_nes(src, dst):
    """Read the CoNLL-U(-Plus) file *src*, inject regex-based NE annotations
    into every sentence, and write the result to *dst*.

    Sentences are buffered until the next 'newpar id' / 'sent_id' metadata
    line (or EOF), run through add_additional_nes_to_sentence(), and then
    appended — each followed by a blank separator line — to the current
    paragraph buffer, which in turn is flushed to the output on paragraph
    boundaries.
    """
    cleaned_lines = []   # fully processed output lines
    paragraph = []       # lines of the paragraph currently being assembled
    sentence = []        # lines of the sentence currently being assembled

    # NOTE(review): no explicit encoding — relies on the locale default.
    with open(src, 'r') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line):
                sentence.append(line)
            elif is_metadata(line):
                name, value = get_metadata(line)
                if name == 'newpar id':
                    # New paragraph: flush the buffered sentence first ...
                    if sentence:
                        add_additional_nes_to_sentence(sentence)
                        sentence.append('')  # blank line after each sentence
                        paragraph.extend(sentence)
                    sentence = []
                    # ... then the buffered paragraph, but only if it holds
                    # more than its own 'newpar id' header line.
                    if len(paragraph) > 1:
                        cleaned_lines.extend(paragraph)
                    paragraph = [line]
                elif name == 'sent_id':
                    # New sentence: flush the previous one into the paragraph.
                    if sentence:
                        add_additional_nes_to_sentence(sentence)
                        sentence.append('')
                        paragraph.extend(sentence)
                    sentence = [line]
                elif name == 'text':
                    sentence.append(line)
                else:
                    # Any other metadata (document-level etc.) passes through.
                    cleaned_lines.append(line)

        # EOF: flush the final sentence and paragraph.
        if sentence:
            add_additional_nes_to_sentence(sentence)
            sentence.append('')
            paragraph.extend(sentence)
        if len(paragraph) > 1:
            cleaned_lines.extend(paragraph)

        # NOTE(review): assumes at least one output line was produced; an
        # input with no usable content would raise IndexError here.
        if not cleaned_lines[-1]:
            cleaned_lines.append('')

    # Drop a trailing whitespace-only line so the join below ends cleanly.
    if not cleaned_lines[-1].strip():
        cleaned_lines.pop()

    with open(dst, 'w') as dst_file:
        dst_file.write('\n'.join(cleaned_lines))


def add_additional_nes_to_sentence(sentence):
    """Mark regex-detected named entities on the token lines of *sentence*,
    in place.

    *sentence* is a list of lines whose first two entries are the 'sent_id'
    and 'text' metadata; token lines start at index 2, hence the range below.
    For every start index i the longest span sentence[i:j] is tried first
    (j descending). A span is marked only when none of its tokens already
    carries an NE tag and its reconstructed surface text matches one of the
    REGEXPS patterns.
    """
    for i in range(2, len(sentence)+1):
        for j in reversed(range(i + 1, len(sentence)+1)):
            tokens = sentence[i:j]
            if not contains_ne(tokens):
                text = tokens_to_orth(tokens)
                ne_name = check_for_ne(text)
                if ne_name:
                    # mark_ne returns the same number of lines, so this
                    # in-place slice assignment keeps later indices valid.
                    sentence[i:j] = mark_ne(tokens, ne_name)
                    break


def contains_ne(tokens):
    """Return True when any token line already carries a non-'O' NE tag
    (tab-separated column 11)."""
    return any(token.split('\t')[10] != 'O' for token in tokens)


def tokens_to_orth(tokens):
    """Rebuild the surface text of the token lines in *tokens*.

    Uses the word form (column 2); a token whose column 10 equals
    'SpaceAfter=No' is glued to the next one, otherwise a space follows it.
    The trailing space, if any, is stripped.
    """
    pieces = []
    for token in tokens:
        fields = token.split('\t')
        word = fields[1]
        pieces.append(word if fields[9] == 'SpaceAfter=No' else word + ' ')
    return ''.join(pieces).strip()


def check_for_ne(text):
    """Return the name of the first REGEXPS pattern matching *text*, or ''
    when no pattern matches."""
    labels = (label for label, pattern in REGEXPS.items() if re.search(pattern, text))
    return next(labels, '')


def mark_ne(tokens, ne_name):
    """Return the token lines of *tokens* with NE tags for *ne_name* written
    into column 11: 'B-<name>' on the first token, 'I-<name>' on the rest.

    A trailing PUNCT token is left untagged — unless its lemma is '/' or ')',
    which may legitimately end the entity.
    """
    marked_tokens = []
    last = len(tokens) - 1
    for position, token in enumerate(tokens):
        fields = token.split('\t')
        trailing_punct = (position == last and fields[3] == 'PUNCT'
                          and fields[2] not in ['/', ')'])
        if trailing_punct:
            print(ne_name, 'removed col', fields)
        else:
            prefix = 'B' if position == 0 else 'I'
            fields[10] = f'{prefix}-{ne_name}'
        marked_tokens.append('\t'.join(fields))
    return marked_tokens


# Run the CLI entry point only when executed as a script (not on import).
if __name__ == '__main__':
    main()