korekta błędów na liście frekw., nowy skrypt do COMPOS (9945eec3) | Commits | Wojciech Jaworski / ENIAM

Browse Code »

Commit 9945eec3c13fc40bbd37b00ce26873de1a2e4691

Authored by Szymon Rutkowski 8 years ago

1 parent fb5972b2

korekta błędów na liście frekw., nowy skrypt do COMPOS

Inline Side-by-side

Showing 6 changed files with 549669 additions and 164 deletions

Too many changes to show.
Reload with full diff Plain diff Email patch

To preserve performance only 1 of 6 files are displayed.

morphology/check_rule_compos.py deleted

View file @fb5972b

1		-# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016.
2		-# This file is intended to check the NKJP1M frequency list against rules derived from SGJP.
3		-# If you want to use this, review the end of this file (filenames, column structure) and run with python3.
4		-
5		-import re
6		-
def load_rules_file(fname):
    """Load rules from a tab-separated file.

    Every line of *fname* is split on tab characters. A line that does not
    yield exactly seven columns is reported on stdout and left out of the
    result, so the "Skipped" message is actually true.

    Returns a list of 7-tuples of strings, in file order.
    """
    rule_list = []

    # Explicit encoding: the result must not depend on the machine's locale.
    # NOTE(review): assumes the rules file is UTF-8 -- confirm.
    with open(fname, encoding='utf-8') as inp:
        contents = inp.read()

    for line in contents.split('\n'):
        data = line.split('\t')
        if len(data) != 7:
            print('Skipped line in rules: ' + line)
            continue
        rule_list.append(tuple(data))

    return rule_list
23		-
def make_rules_table(rule_list):
    """Group rules under a short affix key for candidate lookup.

    Each rule is a 7-tuple (name, freq, classification, prefix, suffix,
    stem ending, tag). The key is:

    * ``prefix + '-'`` when the prefix is non-empty (the prefix wins when a
      rule has both affixes),
    * ``'-' + suffix`` when only the suffix is non-empty,
    * ``'-'`` when both affixes are empty.

    Affixes longer than three characters are cut to their outermost three
    characters: the first three of a prefix, the last three of a suffix.
    Tuples that do not have seven items are reported on stdout and ignored.

    Returns a dict mapping each key to the list of rules filed under it.
    """
    rtable = {}

    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: " + str(rl))
            continue

        prefix, suffix = rl[3], rl[4]

        # Truncate the affix itself rather than inspecting the first character
        # of the finished key: a prefix that happens to start with '-' must
        # still be treated as a prefix.
        if prefix != '':
            index = prefix[:3] + '-'
        elif suffix != '':
            index = '-' + suffix[-3:]
        else:
            index = '-'

        rtable.setdefault(index, []).append(rl)

    return rtable
57		-
# Copied from compare_morphosyn.py so that this script stays self-contained.
# liberal_tagcomp is mainly suited to checking NKJP against SGJP; for a resource
# that follows a more SGJP-like tagging convention, strict_tagcomp is the better fit.
def strict_tagcomp(tag1, tag2):
    """Return True when tag1 fits the pattern tag2.

    Both tags are colon-separated. Each position of tag2 may list several
    dot-separated alternatives. The tags match when their first items (POS)
    are equal, they have the same number of positions, and every item of
    tag1 is one of the alternatives at the same position of tag2.
    """
    items = tag1.split(':')
    patterns = tag2.split(':')

    if items[0] != patterns[0]:  # POS
        return False
    if len(items) != len(patterns):
        return False

    return all(item in pattern.split('.')
               for item, pattern in zip(items, patterns))
74		-
def liberal_tagcomp(tag1, tag2):
    """Return True when tag1 fits the pattern tag2, ignoring gender subtypes.

    Both tags are colon-separated; each position of tag2 may list several
    dot-separated alternatives. Before comparing, ``n1``/``n2``/``n3`` in
    tag1 are collapsed to ``n``, and ``n1``/``n2``/``n3``/``p2``/``p3`` in
    tag2 are collapsed to ``n``. A position of tag2 whose first alternative
    is ``_`` accepts anything.

    The tags must still share the same first item (POS) and the same number
    of positions.
    """
    tag1_items = tag1.split(':')
    tag2_items = tag2.split(':')

    if (tag1_items[0] != tag2_items[0]  # POS
            or len(tag1_items) != len(tag2_items)):
        return False

    for item, pattern in zip(tag1_items, tag2_items):
        # The alternation bar must be unescaped: in a regex, '\|' is a literal
        # pipe character, which would make these substitutions no-ops.
        item = re.sub(r'n1|n2|n3', 'n', item)
        model = re.sub(r'n1|n2|n3|p2|p3', 'n', pattern).split('.')
        if item not in model and model[0] != '_':  # underscore as a catchall
            return False

    return True
91		-
def is_recognizable(entry, rules_table):
    """Check whether an entry can be derived by one of the rules.

    *entry* is a triple (word_form, lemma, tags). *rules_table* is the dict
    produced by make_rules_table(). Candidate rules are looked up by the
    first and last ``chunk_size`` characters of the word form, for chunk
    sizes 3 down to 0. A candidate applies when the word form starts with
    the rule's prefix, ends with its suffix, and liberal_tagcomp() accepts
    the entry's tags against the rule's tag. The rule recognizes the entry
    when stripping both affixes and appending the rule's stem ending
    reproduces the lemma.

    Returns the class (third column, usually an empty string) of the first
    rule that recognizes the entry, or False when none does. Note that the
    empty string is a successful result, so callers must not test the
    return value for truthiness.
    """
    word, lemma, tags = entry[0], entry[1], entry[2]

    for chunk_size in range(3, -1, -1):
        if len(word) < chunk_size:
            continue

        pref_ind = word[:chunk_size] + '-'
        # Slice from an explicit start index: word[-0:] would be the whole
        # word rather than the empty string.
        suf_ind = '-' + word[len(word) - chunk_size:]

        rule_candidates = list(rules_table.get(pref_ind, []))
        if suf_ind != pref_ind:  # both keys are '-' when chunk_size is 0
            rule_candidates += rules_table.get(suf_ind, [])

        for rl in rule_candidates:
            prefix, suffix, stem_ending = rl[3], rl[4], rl[5]

            # The table lookup only finds potentially relevant rules; check
            # the complete affixes and the tag before rebuilding the lemma.
            if not (word.startswith(prefix) and word.endswith(suffix)):
                continue
            if not liberal_tagcomp(tags, rl[6]):
                continue

            # An explicit end index keeps the stem intact for an empty suffix
            # (word[a:-0] would be '' and could match the lemma by accident).
            stem = word[len(prefix):len(word) - len(suffix)]
            if stem + stem_ending == lemma:
                return rl[2]

    return False
127		-
# Load the rules and build the affix-keyed lookup table once, at module level;
# rtable is the table passed to is_recognizable() by the script below.
rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)
130		-
def esccurl(string):
    """Double every curly bracket so the text survives str.format() literally."""
    doubled = str.maketrans({'{': '{{', '}': '}}'})
    return string.translate(doubled)
134		-
# Re-classify every entry of the tagged frequency list and write the result to
# freq_with_rules.tab. Encodings are explicit so the run does not depend on the
# machine's locale; the output is opened write-only since it is never read back.
# NOTE(review): assumes both files are UTF-8 -- confirm.
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab', encoding='utf-8') as inp, \
        open('freq_with_rules.tab', 'w', encoding='utf-8') as out:
    for line in inp:
        # NOTE(review): strip() also removes trailing tabs, so a row whose last
        # column is empty ends up with fewer than 8 fields and is skipped.
        line = line.strip()
        data = line.split('\t')
        if len(data) != 8:  # column count of TAGGED frequency list
            print('Skipped line in the list: ' + line)
            continue

        rl_class = is_recognizable((data[0], data[1], data[2]), rtable)
        if rl_class == '':
            label = 'COMPOS'
        elif rl_class is not False:
            label = 'COMPOS-' + rl_class
        else:
            # Try again, with lowered lemma and word form.
            rl_class_low = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                                           rtable)
            if rl_class_low == '':
                label = 'COMPOS-LWR'
            elif rl_class_low is not False:
                label = 'COMPOS-LWR-' + rl_class_low
            else:
                label = 'NCOMPOS'

        # The input is a partially done tagged list whose previous COMPOS
        # classification sits in data[4]; that column is replaced by the new
        # label. For a raw frequency list one would instead append the label
        # to the unchanged line.
        print('\t'.join(data[0:4] + [label] + data[5:]), file=out)

korekta błędów na liście frekw., nowy skrypt do COMPOS

Too many changes to show. Reload with full diff Plain diff Email patch

Too many changes to show.
Reload with full diff Plain diff Email patch