Commit 9945eec3c13fc40bbd37b00ce26873de1a2e4691

Authored by Szymon Rutkowski
1 parent fb5972b2

korekta błędów na liście frekw., nowy skrypt do COMPOS

Too many changes to show.

To preserve performance only 1 of 6 files are displayed.

morphology/check_rule_compos.py deleted
1   -# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
2   -# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
3   -# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
4   -
5   -import re
6   -
def load_rules_file(fname, encoding='utf-8'):
    """Load rules from a tab-separated file.

    Every usable line must consist of exactly seven tab-separated columns;
    each such line is returned as a 7-tuple of strings.  Lines with any other
    column count are reported on stdout and left out of the result.  Empty
    lines (such as the one following the final newline) are ignored silently.

    Args:
        fname: path of the rules file.
        encoding: text encoding used to read the file.

    Returns:
        A list of 7-tuples of str, in file order.
    """
    rule_list = []

    with open(fname, encoding=encoding) as inp:
        for line in inp:
            line = line.rstrip('\n')
            if not line:
                continue

            data = line.split('\t')
            if len(data) != 7:
                print('Skipped line in rules: ' + line)
                # Actually skip it, so the result contains only well-formed rules.
                continue

            rule_list.append(tuple(data))

    return rule_list
23   -
def make_rules_table(rule_list):
    """Build a lookup table from affix index to the rules filed under it.

    Each rule is a 7-tuple (name, freq, classification, prefix, suffix,
    stem ending, tag).  A rule is filed under exactly one index:

    * a non-empty prefix gives ``prefix[:3] + '-'``;
    * otherwise a non-empty suffix gives ``'-' + suffix[-3:]``;
    * if both affixes are empty the index is ``'-'``.

    Longer affixes are therefore listed under their outermost three
    characters.  Tuples that do not have seven elements are reported on
    stdout and skipped.

    Args:
        rule_list: iterable of rule tuples.

    Returns:
        dict mapping index string -> list of rule tuples, in input order.
    """
    rtable = {}

    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: " + str(rl))
            continue

        prefix, suffix = rl[3], rl[4]

        # Decide on the affix kind from the rule's own columns rather than from
        # the first character of the combined index, so that a prefix which
        # itself begins with '-' is not mistaken for a suffix.
        if prefix:
            index = prefix[:3] + '-'
        elif suffix:
            index = '-' + suffix[-3:]
        else:
            index = '-'

        rtable.setdefault(index, []).append(rl)

    return rtable
57   -
58   -# Copied from compare_morphosyn.py, so that each of these scripts stays self-contained.
59   -# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking
60   -# a resource that follows a more SGJP-like tagging convention, strict_tagcomp is the better choice.
def strict_tagcomp(tag1, tag2):
    """Check whether the concrete tag ``tag1`` fits the tag pattern ``tag2``.

    Both tags are colon-separated.  They must have the same number of
    positions and an identical first position (the part of speech).  In
    ``tag2`` every position may list dot-separated alternatives; each position
    of ``tag1`` is compared as a whole string and must equal one of the
    alternatives at the same position of ``tag2``.

    Returns:
        True if ``tag1`` fits ``tag2``, False otherwise.
    """
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    return all(item in alternatives.split('.')
               for item, alternatives in zip(tag1_items, tag2_items))
74   -
def liberal_tagcomp(tag1, tag2):
    """Check whether ``tag1`` fits the tag pattern ``tag2``, leniently.

    Works like a position-by-position comparison of colon-separated tags in
    which ``tag2`` may list dot-separated alternatives, with two relaxations:

    * the values ``n1``, ``n2``, ``n3`` in ``tag1`` and ``n1``, ``n2``, ``n3``,
      ``p2``, ``p3`` in ``tag2`` are all collapsed to plain ``n`` before
      comparing (presumably fine-grained gender values -- TODO confirm);
    * a ``tag2`` position whose first alternative is ``_`` accepts anything.

    The first position (the part of speech) must still be literally identical
    in both tags, and both tags must have the same number of positions.

    Returns:
        True if ``tag1`` fits ``tag2``, False otherwise.
    """
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for item, pattern in zip(tag1_items, tag2_items):
        # Collapse the fine-grained values to a bare 'n' on both sides.
        item = re.sub(r'(n1|n2|n3)', 'n', item)
        model = re.sub(r'(n1|n2|n3|p2|p3)', 'n', pattern).split('.')
        if item not in model and model[0] != '_':  # underscore as a catchall
            return False

    return True
91   -
def is_recognizable(entry, rules_table):
    """Check whether ``entry`` can be derived with one of the rules.

    Args:
        entry: triple (word_form, lemma, tags).
        rules_table: dict as produced by make_rules_table(), mapping an affix
            index to a list of rule tuples (name, freq, classification,
            prefix, suffix, stem ending, tag).

    Returns:
        The classification (third column, usually an empty string) of the
        first rule that applies, or False if no rule applies.  Callers must
        tell the empty string apart from False.
    """
    word, lemma, tags = entry[0], entry[1], entry[2]

    # Try the longest affix chunks first, down to the empty affix ('-').
    for chunk_size in range(3, -1, -1):
        if len(word) < chunk_size:
            continue

        pref_ind = word[:chunk_size] + '-'
        # word[-0:] would be the whole word, so compute the start explicitly;
        # for chunk_size == 0 this yields the intended '-' index.
        suf_ind = '-' + word[len(word) - chunk_size:]

        rule_candidates = list(rules_table.get(pref_ind, []))
        if suf_ind != pref_ind:
            rule_candidates += rules_table.get(suf_ind, [])

        for rl in rule_candidates:
            prefix, suffix, stem_ending = rl[3], rl[4], rl[5]

            # The table lookup only finds potentially relevant rules; verify
            # the complete prefix and suffix, and the tag, before
            # reconstructing the lemma.
            if not (word.startswith(prefix) and word.endswith(suffix)
                    and liberal_tagcomp(tags, rl[6])):
                continue

            # Trim the prefix and suffix and glue on the ending suggested by
            # the rule.  An explicit end bound is used because word[a:-0] is
            # '' and would make an empty-suffix rule match on the stem ending
            # alone.
            stem = word[len(prefix):len(word) - len(suffix)]
            if stem + stem_ending == lemma:
                return rl[2]

    return False
127   -
# Load the rules and build the lookup table once, at module level.  The path is
# relative to the current working directory; adjust it before running.
rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)
130   -
def esccurl(string):
    """Escape the curly brackets in ``string`` for use with str.format().

    Every ``{`` becomes ``{{`` and every ``}`` becomes ``}}``, so that the
    result, used as a format template, reproduces ``string`` literally.
    """
    # One pass over the string instead of two chained replace() calls.
    return string.translate(str.maketrans({'{': '{{', '}': '}}'}))
134   -
# Classify every entry of the tagged frequency list and write the list back out
# with the classification column (index 4) replaced by the new label:
#   COMPOS[-<class>]      a rule applies to the entry as it stands,
#   COMPOS-LWR[-<class>]  a rule applies once word form and lemma are lowercased,
#   NCOMPOS               no rule applies.
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab', encoding='utf-8') as inp, \
        open('freq_with_rules.tab', 'w+', encoding='utf-8') as out:
    for line in inp:
        # Remove only the newline: strip() would also eat leading/trailing tabs,
        # i.e. empty first or last columns, and the row would then fail the
        # column-count check below.
        line = line.rstrip('\n')
        data = line.split('\t')
        if len(data) != 8:  # column count of TAGGED frequency list
            print('Skipped line in the list: ' + line)
            continue

        # is_recognizable() returns the rule's class (possibly '') or False,
        # so False has to be tested by identity.
        label = 'COMPOS'
        rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
        if rl_class is False:
            # Try again, with lowered lemma and word form.
            label = 'COMPOS-LWR'
            rl_class = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                                       rtable)

        if rl_class is False:
            label = 'NCOMPOS'
        elif rl_class != '':
            label += '-' + rl_class

        # The previous COMPOS classification is in data[4]; it is dropped and
        # the new label takes its place.  For a raw frequency list without
        # that column one would append the label to the line instead.
        print('\t'.join(data[:4] + [label] + data[5:]), file=out)