compos_alt.py
4.38 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016.
# This file is intended to check the (partially tagged) NKJP1M frequency list against list of exce-
# ptions from morphological rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
import re
# just ripped from compare_morphosyn.py, guess it'll be better to keep those scripts self-contained
# note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP, when checking
# a resource obeying more SJGP'ish tagging convention the strict_tagcomp will be better
def strict_tagcomp(tag1, tag2):
tag1_items = tag1.split(':')
tag2_items = tag2.split(':')
if (tag1_items[0] != tag2_items[0] # POS
or len(tag1_items) != len(tag2_items)):
return False
for (i, item) in enumerate(tag1_items):
if not item in tag2_items[i].split('.'):
return False
return True
def liberal_tagcomp(tag1, tag2):
tag1_items = tag1.split(':')
tag2_items = tag2.split(':')
if (tag1_items[0] != tag2_items[0] # POS
or len(tag1_items) != len(tag2_items)):
return False
for (i, item) in enumerate(tag1_items):
# remove tags n1, f1...
item = re.sub(r'(n1|n2|n3)', 'n', item)
model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
if not item in model and model[0] != '_': # underscore as a catchall
return False
return True
# the bulk of the following ripped from check_rule_compos.py
def esccurl(string) :
"Escape the curly brackets in the string, for using it with the string formatter."
return string.replace('{', '{{').replace('}', '}}')
alt_idx = dict() # indexed by data[0] - word form
with open('../resources/SGJP/alt.tab') as alt_src:
for line in alt_src:
line = line.strip()
data = line.split('\t')
if len(data) != 3:
print('Skipped line in the alt list: '+line)
continue
# handle lemmas with subclassification after colon
if data[1].find(':') != -1 and data[1] != ':':
data[1] = data[1][: data[1].find(':')]
# each entry consists of 0 - list of lemmas, 1 - list of tags
if not data[0] in alt_idx:
alt_idx[data[0]] = [[data[1]], [data[2]]]
else:
alt_idx[data[0]][0].append(data[1])
alt_idx[data[0]][1].append(data[2])
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
with open('freq_with_alt.tab', 'w+') as out:
for line in inp:
line = line.strip()
data = line.split('\t')
if len(data) != 8: # column count of TAGGED frequency list
print('Skipped line in the list: '+line)
continue
# The following was added to work on partially done tagged frequency, to get rid of the
# previous COMPOS classification. Otherwise we'd want to use something like this:
# fmt = esccurl(line) + '\t{0}' # simple format string, applicable to raw frequency list
# previous COMPOS column is in data[4], so we skip it below
fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))
matched = False
if data[0] in alt_idx:
tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), alt_idx[data[0]][1]))
tagnum = True in tagcomps and tagcomps.index(True)
# (make sure that if lemma is matching, it belongs to the matching tag)
if tagnum != -1 and tagnum != False and alt_idx[data[0]][0][tagnum] == data[1]:
print(fmt.format('COMPOS-ALT'), file=out)
matched = True
# try again with lowering word form and lemma:
if not matched and data[0].lower() in alt_idx:
tagcomps = list(map(lambda x: liberal_tagcomp(data[2], x), # data[2] - tag stays the same
alt_idx[data[0].lower()][1]))
tagnum = True in tagcomps and tagcomps.index(True)
if tagnum != -1 and tagnum != False and alt_idx[data[0].lower()][0][tagnum] == data[1].lower():
print(fmt.format('COMPOS-LWR-ALT'), file=out)
matched = True
if not matched:
print(line, file=out)