compos_alt.py 3.1 KB
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016.
# This file is intended to check the (partially tagged) NKJP1M frequency list against list of exce-
# ptions from morphological rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.

import re

# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained
# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking
# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better
def strict_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        if not item in tag2_items[i].split('.'):
            return False

    return True

def liberal_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        # remove tags n1, f1...
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
        if not item in model and model[0] != '_': # underscore as a catchall
            return False

    return True

# the bulk of the following ripped from check_rule_compos.py
def esccurl(string) :
    "Escape the curly brackets in the string, for using it with the string formatter."
    return string.replace('{', '{{').replace('}', '}}')

alt_idx = dict() # indexed by data[0] - word form
with open('../resources/SGJP/alt.tab') as alt_src:
    for line in alt_src:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 3:
                print('Skipped line in the alt list: '+line)
                continue
            alt_idx[data[0]] = [data[1], data[2]]

with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
    with open('freq_with_alt.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 8: # column count of TAGGED frequency list
                print('Skipped line in the list: '+line)
                continue

            # The following was added to work on partially done tagged frequency, to get rid of the
            # previous COMPOS classification. Otherwise we'd want to use something like this:
            # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list
            # previous COMPOS column is in data[4], so we skip it below
            fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))

            if (data[0] in alt_idx and data[1] == alt_idx[data[0]][0]
                    and liberal_tagcomp(data[2], alt_idx[data[0]][1])):
                print(fmt.format('COMPOS-ALT'), file=out)
            else:
                print(line, file=out)