# check_rule_compos.py
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.

import re

def load_rules_file(fname):
    rule_list = []
    contents = ''

    with open(fname) as inp:
        contents = inp.read()

    contents = contents.split('\n')

    for line in contents:
        if line == '':
            continue
        data = line.split('\t')
        if len(data) != 7:
            print('Skipped line in rules: '+line)
            continue
        rule_list.append(tuple(data))

    return rule_list
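
# The rules file is expected to be tab-separated, with seven columns per line:
# name, frequency, classification, prefix, suffix, stem ending, tag (see make_rules_table below).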

def make_rules_table(rule_list):
    "Given rule_list as list of tuples (name, freq, classification, prefix, suffix, stem ending, \
    tag), create a dictionary: ending -> list of applicable rules, also as tuples. Indices are \
    prefixes followed by - (hyphen) and suffixes preced by -, up to three characters; longer \
    affixes are included in the lists for their outermost three-character parts. If both empty \
    affixes are empty, rule gets listed under '-'."

    rtable = dict()

    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: "+str(rl))
            continue

        index = '-'

        if rl[3] != '':
            index = rl[3] + '-'
        elif rl[4] != '':
            index = '-' + rl[4]

        if len(index) > 4:
            if index[0] == '-': # suffix
                index = '-' + index[-3:]
            else: # prefix
                index = index[:3] + '-'
        
        if index in rtable:
            rtable[index].append(rl)
        else:
            rtable[index] = [ rl ]

    return rtable
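
# A minimal illustration of the indexing scheme; the rule tuple below is invented for
# demonstration and is not taken from the real SGJP-derived rule set: a rule with the
# four-character suffix 'iego' and no prefix gets filed under its last three characters, '-ego'.
_demo_rule = ('r-demo', '1', '', '', 'iego', 'i', 'adj:sg:gen:m1:pos')
assert make_rules_table([_demo_rule]) == {'-ego': [_demo_rule]}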

# copied from compare_morphosyn.py - it seems better to keep these scripts self-contained
# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking
# a resource that follows a more SGJP-like tagging convention, strict_tagcomp will be the better choice
def strict_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        if item not in tag2_items[i].split('.'):
            return False

    return True
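
# A quick sanity check of strict_tagcomp (the tags are made up for demonstration): every position
# of the first tag has to occur among the dot-separated alternatives at the same position of the second.
assert strict_tagcomp('subst:sg:nom:m1', 'subst:sg:nom:m1.m2')
assert not strict_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n')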

def liberal_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        # collapse the sub-gender values (n1, n2, n3; additionally p2, p3 in the model) into n
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
        if item not in model and model[0] != '_': # underscore as a catchall
            return False

    return True
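
# What liberal_tagcomp additionally accepts (again with made-up tags): n1/n2/n3 (and p2/p3 in the
# model) are collapsed into n, and an underscore in the model position matches anything.
assert liberal_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n')
assert liberal_tagcomp('subst:sg:nom:m1', 'subst:sg:_:m1')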

def is_recognizable(entry, rules_table):
    "Check whether entry, given as triple (word_form, lemma, tags) is recognizable using \
    rules_table as obtained from make_rules_table() function. Return the rule's class \
    (third column, usually empty string)."

    for chunk_size in range(3, -1, -1):
        if len(entry[0]) < chunk_size:
            continue

        rule_candidates = []

        pref_ind = entry[0][:chunk_size]+'-'
        suf_ind = '-'+entry[0][-chunk_size:]
        if pref_ind in rules_table:
            rule_candidates += rules_table[ pref_ind ]
        if suf_ind in rules_table:
            rule_candidates += rules_table[ suf_ind ]

        if len(rule_candidates) == 0:
            continue
        for rl in rule_candidates:
            # first check the prefix, the suffix and the tag (the lookup above only finds rules
            # that are potentially relevant); then proceed to reconstructing the lemma
            if (entry[0][:len(rl[3])] == rl[3] and
                    # check for empty suffix, since string[-0:] returns the string unchanged
                    (len(rl[4]) == 0 or entry[0][-len(rl[4]):] == rl[4]) and
                    liberal_tagcomp(entry[2], rl[6])):
                # trim the prefix and suffix, and glue the ending suggested by the rule;
                # compare with the original lemma
                if (entry[0][len(rl[3]):-len(rl[4])]+rl[5] == entry[1]
                        # another corner case, str[:-0] would be ''
                        or (len(rl[4]) == 0 and entry[0][len(rl[3]):]+rl[5] == entry[1])):
                    return rl[2]

    return False
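
# An end-to-end sketch on a toy rule table; both the rule and the entry are invented for
# illustration. The rule strips the suffix 'ego' and glues on 'y', so the word form 'nowego'
# with the lemma 'nowy' is recognized and the rule's (empty) class from the third column is returned.
_demo_rtable = make_rules_table([('r-demo2', '1', '', '', 'ego', 'y', 'adj:sg:gen:m1:pos')])
assert is_recognizable(('nowego', 'nowy', 'adj:sg:gen:m1:pos'), _demo_rtable) == ''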

rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)

def esccurl(string) :
    "Escape the curly brackets in the string, for using it with the string formatter."
    return string.replace('{', '{{').replace('}', '}}')
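# e.g. esccurl('a {b} c') == 'a {{b}} c', which str.format() renders back as 'a {b} c'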

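# Classify each entry of the tagged frequency list and write the result into the COMPOS column:
#   COMPOS           - recognized by a rule with an empty class,
#   COMPOS-<class>   - recognized by a rule that carries a non-empty class,
#   COMPOS-LWR[-...] - recognized only after lowercasing the word form and the lemma,
#   NCOMPOS          - not covered by any rule.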
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
    with open('freq_with_rules.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 8: # column count of TAGGED frequency list
                print('Skipped line in the list: '+line)
                continue

            # The following was added to work on a partially processed tagged frequency list, in order
            # to discard its previous COMPOS classification. Otherwise we'd want something like this:
            # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to the raw frequency list
            # The previous COMPOS column sits in data[4], so we skip it below.
            fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))

            rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
            if rl_class == '':
                print(fmt.format('COMPOS'), file=out)
            elif rl_class is not False:
                print(fmt.format('COMPOS-'+rl_class), file=out)
            else:
                # Try again, with lowered lemma and word form.
                rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                        rtable)
                if rl_class_low == '':
                    print(fmt.format('COMPOS-LWR'), file=out)
                elif rl_class_low is not False:
                    print(fmt.format('COMPOS-LWR-'+rl_class_low), file=out)
                else:
                    print(fmt.format('NCOMPOS'), file=out)