check_rule_compos.py
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
import re
def load_rules_file(fname):
    rule_list = []
    with open(fname) as inp:
        contents = inp.read()
    for line in contents.split('\n'):
        data = line.split('\t')
        if len(data) != 7:
            print('Skipped line in rules: '+line)
            continue
        rule_list.append(tuple(data))
    return rule_list
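# A tiny round-trip sketch of the expected input (a hypothetical rule line; the
# real freq_rules.tab layout is assumed to match the docstring of
# make_rules_table below):
import os, tempfile
_tmp = tempfile.NamedTemporaryFile('w', suffix='.tab', delete=False)
_tmp.write('r1\t1\t\t\tami\ta\tsubst:pl:inst:f')
_tmp.close()
assert load_rules_file(_tmp.name) == [('r1', '1', '', '', 'ami', 'a', 'subst:pl:inst:f')]
os.unlink(_tmp.name)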
def make_rules_table(rule_list):
    "Given rule_list as list of tuples (name, freq, classification, prefix, suffix, stem ending, \
tag), create a dictionary: ending -> list of applicable rules, also as tuples. Indices are \
prefixes followed by - (hyphen) and suffixes preceded by -, up to three characters; longer \
affixes are included in the lists for their outermost three-character parts. If both \
affixes are empty, the rule gets listed under '-'."
    rtable = dict()
    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: "+str(rl))
            continue
        index = '-'
        if rl[3] != '':          # the rule has a prefix
            index = rl[3] + '-'
        elif rl[4] != '':        # the rule has a suffix
            index = '-' + rl[4]
        if len(index) > 4:       # affix longer than three characters
            if index[0] == '-':  # suffix: keep the last three characters
                index = '-' + index[-3:]
            else:                # prefix: keep the first three characters
                index = index[:3] + '-'
        if index in rtable:
            rtable[index].append(rl)
        else:
            rtable[index] = [rl]
    return rtable
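# A minimal sanity sketch of the indexing (toy rule tuples, not taken from the
# real freq_rules.tab): a three-character suffix is indexed as-is, a longer
# one lands under its outermost three characters.
_demo_rules = [('r1', '1', '', '', 'ami', 'a', 'subst:pl:inst:f'),
               ('r2', '1', '', '', 'onego', 'y', 'adj:sg:gen:m3')]
assert set(make_rules_table(_demo_rules).keys()) == {'-ami', '-ego'}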
# Copied from compare_morphosyn.py; it seems better to keep these scripts self-contained.
# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking
# a resource that follows a more SGJP-like tagging convention, strict_tagcomp will be better.
def strict_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')
    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False
    for (i, item) in enumerate(tag1_items):
        # each value of tag1 must appear among the dot-separated alternatives
        # allowed by tag2 at the same position
        if item not in tag2_items[i].split('.'):
            return False
    return True
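# Hedged examples (plausible NKJP-style tags, not taken from the data): a
# dotted value on the second tag's side acts as a set of alternatives, but
# subgender values like n1 are NOT folded together here.
assert strict_tagcomp('subst:sg:gen:f', 'subst:sg:gen.dat:f')
assert not strict_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n')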
def liberal_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')
    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False
    for (i, item) in enumerate(tag1_items):
        # collapse the subgender values n1, n2, n3 (and p2, p3 on the model's
        # side) into plain n before comparing
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
        if item not in model and model[0] != '_':  # underscore as a catchall
            return False
    return True
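# Hedged examples again (tags are plausible, not from the actual resources):
# n1 on the NKJP side matches plain n, and a dotted list still acts as a set
# of alternatives.
assert liberal_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n')
assert liberal_tagcomp('subst:sg:nom:f', 'subst:sg:nom.acc:f')
assert not liberal_tagcomp('subst:sg:nom:f', 'adj:sg:nom:f')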
def is_recognizable(entry, rules_table):
    "Check whether entry, given as triple (word_form, lemma, tags), is recognizable using \
rules_table as obtained from the make_rules_table() function. Return the rule's class \
(third column, usually empty string), or False when no rule applies."
    for chunk_size in range(3, -1, -1):
        if len(entry[0]) < chunk_size:
            continue
        rule_candidates = []
        pref_ind = entry[0][:chunk_size] + '-'
        suf_ind = '-' + entry[0][-chunk_size:]
        if pref_ind in rules_table:
            rule_candidates += rules_table[pref_ind]
        if suf_ind in rules_table:
            rule_candidates += rules_table[suf_ind]
        if len(rule_candidates) == 0:
            continue
        for rl in rule_candidates:
            # check first the prefix, the suffix and the tag (the code above only
            # finds rules that are potentially relevant), then proceed to
            # reconstructing the lemma
            if (entry[0].startswith(rl[3]) and entry[0].endswith(rl[4])
                    and liberal_tagcomp(entry[2], rl[6])):
                # trim the prefix and suffix, and glue on the ending suggested by
                # the rule; slicing up to len(entry[0])-len(rl[4]) instead of
                # -len(rl[4]) avoids the str[:-0] == '' corner case
                stem = entry[0][len(rl[3]):len(entry[0])-len(rl[4])]
                if stem + rl[5] == entry[1]:
                    return rl[2]
    return False
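# End-to-end toy check (a hypothetical rule with an empty prefix, suffix 'ami'
# and stem ending 'a'; the word and lemma are real Polish but the rule is
# invented): 'rybami' should reduce to the lemma 'ryba', and nothing else.
_demo_table = make_rules_table(
    [('r1', '1', '', '', 'ami', 'a', 'subst:pl:inst:f')])
assert is_recognizable(('rybami', 'ryba', 'subst:pl:inst:f'), _demo_table) == ''
assert is_recognizable(('rybami', 'rybam', 'subst:pl:inst:f'), _demo_table) is False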
rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)
def esccurl(string):
    "Escape the curly brackets in the string, for use with the string formatter."
    return string.replace('{', '{{').replace('}', '}}')
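# e.g. esccurl('a{b}c') == 'a{{b}}c', so literal braces in the data survive a
# later str.format() call unchanged:
assert esccurl('a{b}c').format() == 'a{b}c'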
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
    with open('freq_with_rules.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 8: # column count of the TAGGED frequency list
                print('Skipped line in the list: '+line)
                continue
            # The following was added to work on a partially processed tagged frequency
            # list, to get rid of the previous COMPOS classification. Otherwise we'd want
            # to use something like this:
            # fmt = esccurl(line) + '\t{0}' # simple format string for the raw frequency list
            # The previous COMPOS column is in data[4], so we skip it below.
            fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))
            rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
            if rl_class == '':
                print(fmt.format('COMPOS'), file=out)
            elif rl_class is not False:
                print(fmt.format('COMPOS-'+rl_class), file=out)
            else:
                # Try again, with lowercased lemma and word form.
                rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                                               rtable)
                if rl_class_low == '':
                    print(fmt.format('COMPOS-LWR'), file=out)
                elif rl_class_low is not False:
                    print(fmt.format('COMPOS-LWR-'+rl_class_low), file=out)
                else:
                    print(fmt.format('NCOMPOS'), file=out)