Commit 9945eec3c13fc40bbd37b00ce26873de1a2e4691
1 parent fb5972b2
Error corrections in the frequency list, new script for COMPOS
Showing 6 changed files with 549669 additions and 164 deletions
Too many changes to show. To preserve performance, only 1 of 6 files is displayed.
morphology/check_rule_compos.py deleted
# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
# If you want to use this, review the end of this file (filenames, column structure) and run with python3.

import re

def load_rules_file(fname):
    rule_list = []
    contents = ''

    with open(fname) as inp:
        contents = inp.read()

    contents = contents.split('\n')

    for line in contents:
        data = line.split('\t')
        if len(data) != 7:
            print('Skipped line in rules: '+line)
            continue
        rule_list.append(tuple(data))

    return rule_list
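
# Illustration (hypothetical row, not from the real freq_rules.tab): each rule line
# carries 7 tab-separated columns, (name, freq, classification, prefix, suffix,
# stem ending, tag), e.g.
#   R1 <TAB> 10 <TAB> <TAB> <TAB> iego <TAB> i <TAB> adj:sg:gen:m1:pos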

def make_rules_table(rule_list):
    "Given rule_list as list of tuples (name, freq, classification, prefix, suffix, stem ending, \
tag), create a dictionary: ending -> list of applicable rules, also as tuples. Indices are \
prefixes followed by - (hyphen) and suffixes preceded by -, up to three characters; longer \
affixes are included in the lists for their outermost three-character parts. If both \
affixes are empty, the rule gets listed under '-'."

    rtable = dict()

    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: "+str(rl))
            continue

        index = '-'

        if rl[3] != '':
            index = rl[3] + '-'
        elif rl[4] != '':
            index = '-' + rl[4]

        if len(index) > 4:
            if index[0] == '-': # suffix
                index = '-' + index[-3:]
            else: # prefix
                index = index[:3] + '-'

        if index in rtable:
            rtable[index].append(rl)
        else:
            rtable[index] = [ rl ]

    return rtable
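
# Illustration (hypothetical rules, not from the real data): the index is the outermost
# three (or fewer) characters of one affix, the prefix taking precedence over the suffix:
#   ('R1', '10', '', '', 'iego', 'i', 'adj:sg:gen:m1:pos') -> indexed under '-ego'
#   ('R2', '5', '', 'naj', '', '', 'adj:sg:nom:m1:sup')    -> indexed under 'naj-'
#   ('R3', '2', '', '', '', '', 'qub')                     -> indexed under '-'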

# Copied from compare_morphosyn.py; it's probably better to keep these scripts self-contained.
# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking
# a resource that follows a more SGJP'ish tagging convention, strict_tagcomp will be better.
def strict_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
        or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        if item not in tag2_items[i].split('.'):
            return False

    return True

def liberal_tagcomp(tag1, tag2):
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0] # POS
        or len(tag1_items) != len(tag2_items)):
        return False

    for (i, item) in enumerate(tag1_items):
        # collapse subgender values (n1, n2, n3; also p2, p3 on the model side) to plain n
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', tag2_items[i]).split('.')
        if item not in model and model[0] != '_': # underscore as a catchall
            return False

    return True
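
# Illustration (not part of the original script): strict comparison requires each tag1
# value to appear among the dot-alternatives of tag2; the liberal variant additionally
# collapses subgenders, so e.g.:
#   strict_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n2')    -> False
#   liberal_tagcomp('subst:sg:nom:n1', 'subst:sg:nom:n2')   -> True (both sides reduce to n)
#   liberal_tagcomp('subst:sg:nom:f', 'subst:sg:nom.acc:f') -> True ('nom' is one alternative)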

def is_recognizable(entry, rules_table):
    "Check whether entry, given as a triple (word_form, lemma, tags), is recognizable using \
rules_table as obtained from the make_rules_table() function. Return the rule's class \
(third column, usually an empty string), or False if no rule matches."

    for chunk_size in range(3, -1, -1):
        if len(entry[0]) < chunk_size:
            continue

        rule_candidates = []

        pref_ind = entry[0][:chunk_size]+'-'
        suf_ind = '-'+entry[0][-chunk_size:]
        if pref_ind in rules_table:
            rule_candidates += rules_table[ pref_ind ]
        if suf_ind in rules_table:
            rule_candidates += rules_table[ suf_ind ]

        if len(rule_candidates) == 0:
            continue
        for rl in rule_candidates:
            # check first the prefix and suffix (the above code just finds rules that are
            # potentially relevant), and tag; then proceed to reconstructing the lemma
            if (entry[0][:len(rl[3])] == rl[3] and
                # check for empty suffix, since string[-0:] returns the string unchanged
                (len(rl[4]) == 0 or entry[0][-len(rl[4]):] == rl[4]) and
                liberal_tagcomp(entry[2], rl[6])):
                # trim the prefix and suffix, and glue the ending suggested by the rule;
                # compare with the original lemma
                if (entry[0][len(rl[3]):-len(rl[4])]+rl[5] == entry[1]
                    # another corner case, str[:-0] would be ''
                    or (len(rl[4]) == 0 and entry[0][len(rl[3]):]+rl[5] == entry[1])):
                    return rl[2]

    return False
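
# Illustration (hypothetical rule, not from the real data): with the rule
#   ('R1', '10', '', '', 'iego', 'i', 'adj:sg:gen:m1:pos')
# indexed under '-ego', the entry ('polskiego', 'polski', 'adj:sg:gen:m1:pos') matches:
# stripping the suffix 'iego' and gluing on the stem ending 'i' reconstructs the
# lemma 'polski', so is_recognizable() returns '' (the rule's empty class column).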

rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)

def esccurl(string):
    "Escape the curly brackets in the string, for using it with the string formatter."
    return string.replace('{', '{{').replace('}', '}}')
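
# Illustration (not part of the original script): esccurl('{x}') == '{{x}}', so a line
# containing literal braces passes through str.format() below intact, with only the
# explicit {0} placeholder expanded.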

with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
    with open('freq_with_rules.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 8: # column count of TAGGED frequency list
                print('Skipped line in the list: '+line)
                continue

            # The following was added to work on a partially processed tagged frequency list,
            # to get rid of the previous COMPOS classification. Otherwise we'd want to use
            # something like this:
            # fmt = esccurl(line) + '\t{0}' # simple format string, applicable to the raw frequency list
            # the previous COMPOS column is in data[4], so we skip it below
            fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))

            rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
            if rl_class == '':
                print(fmt.format('COMPOS'), file=out)
            elif rl_class is not False:
                print(fmt.format('COMPOS-'+rl_class), file=out)
            else:
                # Try again, with lowercased lemma and word form.
                rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                                               rtable)
                if rl_class_low == '':
                    print(fmt.format('COMPOS-LWR'), file=out)
                elif rl_class_low is not False:
                    print(fmt.format('COMPOS-LWR-'+rl_class_low), file=out)
                else:
                    print(fmt.format('NCOMPOS'), file=out)
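
# Illustration (hypothetical data): the output keeps the input columns except the old
# COMPOS column (data[4]), which is replaced by the fresh classification, one of:
# COMPOS, COMPOS-<class>, COMPOS-LWR, COMPOS-LWR-<class>, NCOMPOS.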