# check_rule_compos.py
import re


def load_rules_file(fname):
    "Load rules from a tab-separated file, one rule per line, as a list of tuples."
    rule_list = []
    with open(fname) as inp:
        contents = inp.read()
    for line in contents.split('\n'):
        data = line.split('\t')
        if len(data) != 7:
            print('Skipped line in rules: '+line)
            continue
        rule_list.append(tuple(data))
    return rule_list
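
# Each rule line is expected to carry seven tab-separated fields: name, frequency,
# classification, prefix, suffix, stem ending, tag -- e.g. a hypothetical suffix rule
# "r1<TAB>154<TAB><TAB><TAB>ami<TAB>a<TAB>subst:pl:inst:f" (empty classification and
# prefix), which strips -ami and glues on -a to reconstruct the lemma. The example is
# illustrative, not taken from the actual SGJP file.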

def make_rules_table(rule_list):
    """Given rule_list as a list of tuples (name, freq, classification, prefix, suffix,
    stem ending, tag), create a dictionary: ending -> list of applicable rules, also as
    tuples. Indices are prefixes followed by - (hyphen) and suffixes preceded by -, up
    to three characters; longer affixes are included in the lists for their outermost
    three-character parts. If both affixes are empty, the rule gets listed under '-'."""
    rtable = dict()
    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: "+str(rl))
            continue
        # index the rule by its prefix ('pre-') or, failing that, its suffix ('-suf')
        index = '-'
        if rl[3] != '':
            index = rl[3] + '-'
        elif rl[4] != '':
            index = '-' + rl[4]
        # truncate a long affix to its outermost three characters
        if len(index) > 4:
            if index[0] == '-':  # suffix
                index = '-' + index[-3:]
            else:  # prefix
                index = index[:3] + '-'
        if index in rtable:
            rtable[index].append(rl)
        else:
            rtable[index] = [rl]
    return rtable
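
# Indexing examples (hypothetical affixes): a rule with prefix 'nie' and no suffix is
# filed under 'nie-'; a rule with suffix 'owski' under '-ski' (its outermost three
# characters); a rule with both affixes empty under '-'.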

# copied from compare_morphosyn.py; it seems better to keep these scripts self-contained.
# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when
# checking a resource that follows a more SGJP-ish tagging convention, strict_tagcomp
# will be better.
def strict_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')
    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False
    for (i, item) in enumerate(tag1_items):
        if item not in tag2_items[i].split('.'):
            return False
    return True
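
# For illustration (made-up tags): strict_tagcomp('subst:sg:nom:m1', 'subst:sg:nom.acc:m1')
# is True, since every position of tag1 appears among the dot-separated alternatives at
# the same position of tag2, while strict_tagcomp('subst:sg:nom:m1', 'subst:sg:nom') is
# False (different number of positions).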

def liberal_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')
    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False
    for (i, item) in enumerate(tag1_items):
        # collapse the subgender values n1, n2, n3 (and p2, p3 on the model side) to plain n
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
        if item not in model and model[0] != '_':  # underscore as a catchall
            return False
    return True
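
# For illustration (made-up tags): liberal_tagcomp('subst:pl:nom:n1', 'subst:pl:nom:n')
# is True thanks to the subgender collapsing, and liberal_tagcomp('adj:sg:nom:m1:pos',
# 'adj:sg:_:m1:pos') is True thanks to the underscore catchall.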

def is_recognizable(entry, rules_table):
    """Check whether entry, given as a triple (word_form, lemma, tags), is recognizable
    using rules_table as obtained from the make_rules_table() function. Return the rule's
    class (third column, usually an empty string), or False if no rule applies."""
    for chunk_size in range(3, -1, -1):
        if len(entry[0]) < chunk_size:
            continue
        rule_candidates = []
        pref_ind = entry[0][:chunk_size] + '-'
        suf_ind = '-' + entry[0][-chunk_size:]
        if pref_ind in rules_table:
            rule_candidates += rules_table[pref_ind]
        if suf_ind in rules_table:
            rule_candidates += rules_table[suf_ind]
        if len(rule_candidates) == 0:
            continue
        for rl in rule_candidates:
            # check first the prefix, the suffix (the code above just finds rules that
            # are potentially relevant) and the tag; then reconstruct the lemma
            if (entry[0][:len(rl[3])] == rl[3] and
                    # check for an empty suffix, since string[-0:] returns the string unchanged
                    (len(rl[4]) == 0 or entry[0][-len(rl[4]):] == rl[4]) and
                    liberal_tagcomp(entry[2], rl[6])):
                # trim the prefix and suffix, glue on the ending suggested by the rule,
                # and compare with the original lemma
                if (entry[0][len(rl[3]):-len(rl[4])] + rl[5] == entry[1]
                        # another corner case: str[:-0] would be ''
                        or (len(rl[4]) == 0 and entry[0][len(rl[3]):] + rl[5] == entry[1])):
                    return rl[2]
    return False
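
# Walkthrough with the hypothetical rule ('r1', '154', '', '', 'ami', 'a',
# 'subst:pl:inst:f'), indexed under '-ami': for the entry ('żonami', 'żona',
# 'subst:pl:inst:f') the empty prefix, the suffix 'ami' and the tag all match, and
# trimming 'ami' and gluing on 'a' reconstructs the lemma 'żona', so the rule's
# class ('') is returned and the entry counts as COMPOS.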

# Annotate the NKJP1M frequency list: mark each entry as COMPOS (recognized by some
# rule, optionally with the rule's class appended) or NCOMPOS (not recognized).
rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)
with open('../resources/NKJP1M/NKJP1M-frequency.tab') as inp:
    with open('freq_with_rules.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 4:
                print('Skipped line in the list: '+line)
                continue
            rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
            if rl_class is False:
                print(line+'\tNCOMPOS', file=out)
            elif rl_class == '':
                print(line+'\tCOMPOS', file=out)
            else:
                print(line+'\tCOMPOS-'+rl_class, file=out)