# compos_alt.py 4.38 KB
#
# Edit Raw Blame History Permalink

# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016.
# This file is intended to check the (partially tagged) NKJP1M frequency list against the list of
# exceptions from morphological rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.

import re

# Copied from compare_morphosyn.py; it seems better to keep these scripts self-contained.
# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking
# a resource that follows a more SGJP-like tagging convention, strict_tagcomp is the better choice.
def strict_tagcomp(tag1, tag2):
    """Say whether tag1 fits the pattern tag2, field by field.

    Both tags are colon-separated. They must share the first field (the POS)
    and have the same number of fields. Each field of tag2 may list several
    dot-separated alternatives; the corresponding field of tag1 has to be
    equal to one of them. Returns True on a full match, False otherwise.
    """
    fields = tag1.split(':')
    patterns = tag2.split(':')

    # Different length or different POS can never match.
    if len(fields) != len(patterns) or fields[0] != patterns[0]:
        return False

    return all(field in pattern.split('.')
               for field, pattern in zip(fields, patterns))

def liberal_tagcomp(tag1, tag2):
    """Say whether tag1 fits the pattern tag2, with relaxed gender handling.

    Both tags are colon-separated and must share the first field (the POS)
    and the number of fields. Each field of tag2 may list dot-separated
    alternatives. Before comparing, n1/n2/n3 in tag1 and n1/n2/n3/p2/p3 in
    tag2 are all rewritten to plain 'n'. A tag2 field whose first alternative
    is '_' accepts anything in that position (the POS still has to be equal).
    Returns True on a full match, False otherwise.
    """
    fields = tag1.split(':')
    patterns = tag2.split(':')

    # Different length or different POS can never match.
    if len(fields) != len(patterns) or fields[0] != patterns[0]:
        return False

    for field, pattern in zip(fields, patterns):
        alternatives = re.sub(r'(n1|n2|n3|p2|p3)', 'n', pattern).split('.')
        if alternatives[0] == '_':
            # underscore is a catch-all for this position
            continue
        if re.sub(r'(n1|n2|n3)', 'n', field) not in alternatives:
            return False

    return True

# the bulk of the following ripped from check_rule_compos.py
def esccurl(string):
    """Double every curly bracket in the string so that str.format treats it literally."""
    doubled = {ord('{'): '{{', ord('}'): '}}'}
    return string.translate(doubled)

# Index of the exception list, keyed by word form (first column of alt.tab).
# Each value is a pair of parallel lists, [lemmas, tags]: lemmas[i] and tags[i]
# come from the same input line.
alt_idx = dict()

with open('../resources/SGJP/alt.tab') as alt_src:
    for line in alt_src:
        line = line.strip()
        data = line.split('\t')
        if len(data) != 3:
            print('Skipped line in the alt list: '+line)
            continue
        form, lemma, tag = data

        # Handle lemmas with a subclassification after a colon: keep only the
        # part before it. The cut is made only when the colon is not the first
        # character, so a lemma that itself begins with a colon (":" alone,
        # but also e.g. ":)") is stored whole instead of as an empty string.
        colon_at = lemma.find(':')
        if colon_at > 0:
            lemma = lemma[:colon_at]

        # Append to the entry for this form, creating it on first sight.
        lemmas, tags = alt_idx.setdefault(form, [[], []])
        lemmas.append(lemma)
        tags.append(tag)

def _has_alt_entry(form, lemma, tag):
    """Tell whether alt_idx lists `form` with exactly this lemma and a compatible tag.

    Every (lemma, tag) pair stored for the form is examined, and the lemma is
    required to belong to the very pair whose tag matches. The tag from the
    frequency list is passed as the first argument of liberal_tagcomp and the
    tag from the exception list as the second (the pattern).

    Looking at all pairs matters: taking only the index of the first compatible
    tag and then testing it with `!= False` rejects index 0 (0 == False in
    Python) and never looks past the first compatible tag.
    """
    if form not in alt_idx:
        return False
    lemmas, tags = alt_idx[form]
    return any(alt_lemma == lemma and liberal_tagcomp(tag, alt_tag)
               for alt_lemma, alt_tag in zip(lemmas, tags))


with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp, \
        open('freq_with_alt.tab', 'w+') as out:
    for line in inp:
        line = line.strip()
        data = line.split('\t')
        if len(data) != 8: # column count of TAGGED frequency list
            print('Skipped line in the list: '+line)
            continue

        # columns used here: 0 - word form, 1 - lemma, 2 - tag, 4 - previous COMPOS label
        form, lemma, tag = data[0], data[1], data[2]

        if _has_alt_entry(form, lemma, tag):
            label = 'COMPOS-ALT'
        # try again with lowered word form and lemma (the tag stays the same):
        elif _has_alt_entry(form.lower(), lemma.lower(), tag):
            label = 'COMPOS-LWR-ALT'
        else:
            # no match: the line is copied unchanged, old COMPOS column included
            print(line, file=out)
            continue

        # The list is partially tagged already, so the previous COMPOS
        # classification in column 4 is replaced by the new label. For a raw
        # frequency list one would append the label to the line instead.
        print('\t'.join(data[:4] + [label] + data[5:]), file=out)