compos_alt.py
3.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016.
# This file is intended to check the (partially tagged) NKJP1M frequency list against the list of
# exceptions from morphological rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
import re
# The two tag comparators below are copied from compare_morphosyn.py, to keep these scripts
# self-contained. Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when
# checking a resource that follows a more SGJP-like tagging convention, strict_tagcomp is better.
def strict_tagcomp(tag1, tag2):
    """Return True when the concrete tag `tag1` fits the tag pattern `tag2`.

    Both tags are colon-separated field lists. The first field (POS) must be
    identical in both tags and the number of fields must agree. Every field of
    `tag1` must then literally equal one of the dot-separated alternatives of
    the field at the same position in `tag2`. The check is not symmetric: a
    dotted field in `tag1` is treated as one opaque value.
    """
    fields = tag1.split(':')
    patterns = tag2.split(':')
    same_pos = fields[0] == patterns[0]
    if not same_pos or len(fields) != len(patterns):
        return False
    # the lengths are equal at this point, so zip pairs every field with its pattern
    return all(field in pattern.split('.')
               for field, pattern in zip(fields, patterns))
def liberal_tagcomp(tag1, tag2):
    """Return True when `tag1` fits the tag pattern `tag2`, comparing leniently.

    Both tags are colon-separated field lists. The first field (POS) must be
    identical in both tags and the number of fields must agree. Before the
    per-field comparison, the values n1/n2/n3 are collapsed to plain `n` on
    both sides, and p2/p3 are collapsed to `n` on the `tag2` side as well.
    A `tag2` field whose first dot-separated alternative is `_` accepts any
    value.
    """
    fields = tag1.split(':')
    patterns = tag2.split(':')
    if fields[0] != patterns[0] or len(fields) != len(patterns):
        return False
    for field, pattern in zip(fields, patterns):
        # collapse the numbered values (n1, n2, n3, and p2/p3 in the pattern) to plain 'n'
        field = re.sub(r'(n1|n2|n3)', 'n', field)
        alternatives = re.sub(r'(n1|n2|n3|p2|p3)', 'n', pattern).split('.')
        if alternatives[0] == '_':  # underscore as a catchall
            continue
        if field not in alternatives:
            return False
    return True
# the bulk of the following ripped from check_rule_compos.py
def esccurl(string):
    """Double every curly bracket in `string` so that str.format() treats it as literal text."""
    brace_escapes = str.maketrans({'{': '{{', '}': '}}'})
    return string.translate(brace_escapes)
# Index of the alt (exception) list, keyed by data[0] - the word form. Each form maps to a list of
# (data[1], data[2]) pairs, so that several readings of the same form can coexist; a plain
# form -> single-entry mapping would silently keep only the last line seen for a given form.
# data[2] is the tag pattern handed to liberal_tagcomp; data[1] is presumably the lemma.
alt_idx = {}
# The encoding is passed explicitly so that Polish diacritics are decoded the same way on every
# platform, instead of depending on the locale default.
# NOTE(review): assumes the resource files are UTF-8 - confirm against the actual data.
with open('../resources/SGJP/alt.tab', encoding='utf-8') as alt_src:
    for line in alt_src:
        # NOTE: strip() also removes leading/trailing tabs, so a row whose first or last column
        # is empty fails the column-count check below and is skipped.
        line = line.strip()
        data = line.split('\t')
        if len(data) != 3:
            print('Skipped line in the alt list: '+line)
            continue
        alt_idx.setdefault(data[0], []).append((data[1], data[2]))

# Copy the tagged frequency list to freq_with_alt.tab, relabelling the rows found in the alt list.
# Rows with an unexpected column count are reported on stdout and NOT written to the output.
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab', encoding='utf-8') as inp, \
        open('freq_with_alt.tab', 'w', encoding='utf-8') as out:
    for line in inp:
        line = line.strip()
        data = line.split('\t')
        if len(data) != 8:  # column count of TAGGED frequency list
            print('Skipped line in the list: '+line)
            continue
        # A row is an alt-list hit when some reading stored for its word form (data[0]) has the
        # same second column (data[1]) and a tag pattern that liberally matches the row's tag
        # (data[2]).
        in_alt_list = any(
            data[1] == alt_second and liberal_tagcomp(data[2], alt_tag)
            for (alt_second, alt_tag) in alt_idx.get(data[0], ()))
        if in_alt_list:
            # This script works on a partially done tagged frequency list: the previous COMPOS
            # classification sits in data[4] and is overwritten here. For a raw frequency list
            # the label would instead be appended to the line as a new trailing column.
            data[4] = 'COMPOS-ALT'
            print('\t'.join(data), file=out)
        else:
            print(line, file=out)