Commit eace3a197039fab3294a6e38a5d492f9b628db4f
Merge branch 'guesser' into integration
Showing
33 changed files
with
508360 additions
and
50 deletions
Too many changes to show.
To preserve performance only 7 of 33 files are displayed.
morphology/check_rule_compos.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016. | |
2 | +# This file is intended to check the NKJP1M frequency list against rules derived from SGJP. | |
3 | +# If you want to use this, review the end of this file (filenames, column structure) and run with python3. | |
4 | + | |
5 | +import re | |
6 | + | |
def load_rules_file(fname):
    """Read morphological rules from a tab-separated file.

    Every valid line must consist of exactly 7 tab-separated columns; each
    such line is returned as a 7-tuple of strings.  Lines with a different
    column count are reported on stdout and left out of the result (empty
    lines, e.g. the one after the final newline, are dropped silently).

    fname -- path of the rules file.
    Returns a list of 7-tuples in file order.
    """
    rule_list = []

    with open(fname) as inp:
        for raw_line in inp:
            line = raw_line.rstrip('\n')
            data = line.split('\t')
            if len(data) != 7:
                # Only complain about lines that actually carry content.
                if line != '':
                    print('Skipped line in rules: '+line)
                continue
            rule_list.append(tuple(data))

    return rule_list
23 | + | |
def make_rules_table(rule_list):
    """Build a lookup table: affix index -> list of applicable rules.

    rule_list is a list of tuples (name, freq, classification, prefix,
    suffix, stem ending, tag).  The returned dictionary maps an index string
    to the list of rules (the same tuples) filed under it:

    * a rule with a non-empty prefix is filed under the first (up to) three
      characters of the prefix followed by '-' (hyphen);
    * otherwise a rule with a non-empty suffix is filed under '-' followed
      by the last (up to) three characters of the suffix;
    * a rule with both affixes empty is filed under '-'.

    Tuples whose length is not 7 are reported on stdout and skipped.
    """
    rtable = dict()

    for rl in rule_list:
        if len(rl) != 7:
            print("Skipped invalid rule: "+str(rl))
            continue

        prefix, suffix = rl[3], rl[4]

        # Decide on the affix kind from the rule's columns, not from the first
        # character of the index: a prefix may itself begin with a hyphen.
        if prefix != '':
            index = prefix[:3] + '-'
        elif suffix != '':
            index = '-' + suffix[-3:]
        else:
            index = '-'

        rtable.setdefault(index, []).append(rl)

    return rtable
57 | + | |
58 | +# Copied from compare_morphosyn.py, so that each of these scripts stays self-contained. | |
59 | +# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking | |
60 | +# a resource that follows a more SGJP-like tagging convention, strict_tagcomp is the better choice. | |
def strict_tagcomp(tag1, tag2):
    """Tell whether the concrete tag1 is covered by the tag pattern tag2.

    Both tags are colon-separated.  They must have the same number of fields
    and an identical first field (the part of speech); every field of tag1
    must then be one of the dot-separated alternatives listed in the
    corresponding field of tag2.
    """
    left = tag1.split(':')
    right = tag2.split(':')

    if len(left) != len(right) or left[0] != right[0]:
        return False

    return all(value in alternatives.split('.')
               for value, alternatives in zip(left, right))
74 | + | |
def liberal_tagcomp(tag1, tag2):
    """Like strict_tagcomp(), but tolerant about gender subclasses.

    The tags must agree in field count and in the first field (part of
    speech).  Before each field is compared, n1/n2/n3 are rewritten to plain
    'n' in tag1, and n1/n2/n3/p2/p3 are rewritten to 'n' in tag2.  A tag2
    field whose first alternative is '_' accepts any value.
    """
    fields = tag1.split(':')
    model_fields = tag2.split(':')

    if len(fields) != len(model_fields) or fields[0] != model_fields[0]:
        return False

    for value, model_field in zip(fields, model_fields):
        value = re.sub(r'(n1|n2|n3)', 'n', value)
        accepted = re.sub(r'(n1|n2|n3|p2|p3)', 'n', model_field).split('.')
        # underscore works as a catch-all
        if accepted[0] == '_' or value in accepted:
            continue
        return False

    return True
91 | + | |
def is_recognizable(entry, rules_table):
    """Check whether entry can be derived with one of the rules.

    entry       -- triple (word_form, lemma, tags).
    rules_table -- dictionary as produced by make_rules_table().

    A rule applies when the word form starts with the rule's prefix, ends
    with its suffix, the tags pass liberal_tagcomp() against the rule's tag,
    and the word form with both affixes removed, followed by the rule's stem
    ending, equals the lemma.

    Returns the class of the first applicable rule (third column, usually an
    empty string), or False when no rule applies.  Callers must therefore
    tell '' and False apart explicitly.
    """
    word_form, lemma, tags = entry[0], entry[1], entry[2]

    # Try the longest affix indices first.
    for chunk_size in range(3, -1, -1):
        if len(word_form) < chunk_size:
            continue

        rule_candidates = []

        pref_ind = word_form[:chunk_size]+'-'
        # NOTE: for chunk_size == 0 the slice [-0:] yields the whole word.
        suf_ind = '-'+word_form[-chunk_size:]
        if pref_ind in rules_table:
            rule_candidates += rules_table[ pref_ind ]
        if suf_ind in rules_table:
            rule_candidates += rules_table[ suf_ind ]

        for rl in rule_candidates:
            prefix, suffix, stem_ending, rule_tag = rl[3], rl[4], rl[5], rl[6]

            # The index lookup only finds potentially relevant rules; verify
            # the full affixes and the tag before reconstructing the lemma.
            if not (word_form.startswith(prefix) and word_form.endswith(suffix)):
                continue
            if not liberal_tagcomp(tags, rule_tag):
                continue

            # Use an explicit end position: word_form[a:-0] would be '' and
            # could make a rule with an empty suffix match any word form.
            stem = word_form[len(prefix):len(word_form) - len(suffix)]
            if stem + stem_ending == lemma:
                return rl[2]

    return False
127 | + | |
rlist = load_rules_file('../resources/SGJP/freq_rules.tab')
rtable = make_rules_table(rlist)


def esccurl(string):
    """Double every curly bracket, so the string survives str.format() intact."""
    doubled_open = string.replace('{', '{{')
    return doubled_open.replace('}', '}}')


# Re-tag the (already tagged) NKJP1M frequency list: the fifth column of every
# row is replaced with a label saying whether the row is derivable by a rule.
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp, \
        open('freq_with_rules.tab', 'w+') as out:
    for line in inp:
        line = line.strip()
        data = line.split('\t')
        if len(data) != 8: # column count of TAGGED frequency list
            print('Skipped line in the list: '+line)
            continue

        # The previous COMPOS classification lives in data[4] and is dropped
        # here.  For a raw (untagged) frequency list one would rather append a
        # column:  fmt = esccurl(line) + '\t{0}'
        fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))

        label = 'COMPOS'
        verdict = is_recognizable((data[0], data[1], data[2]), rtable)
        if verdict is False:
            # Try again, with lowered lemma and word form.
            label = 'COMPOS-LWR'
            verdict = is_recognizable((data[0].lower(), data[1].lower(), data[2]),
                                      rtable)

        if verdict is False:
            label = 'NCOMPOS'
        elif verdict != '':
            # a non-empty rule class is appended to the label
            label = label + '-' + verdict

        print(fmt.format(label), file=out)
... | ... |
morphology/compare_morphosyn.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Oct 2016. | |
2 | +# | |
3 | +# Given a frequency list and groundtruth dictionary, tag the entries on the frequency list with some | |
4 | +# automatic tags (can be seen at the end of this file). | |
5 | +# | |
6 | +# Run from Python3, with -i (inspect option), eg. `python3 -i compare_morphosyn.py`. | |
7 | +# Then invoke something like (with # representing Python prompt): | |
8 | +# # sgjp = load_sgjp('../../NLP resources/sgjp-20160724.tab') | |
9 | +# # nkjp = load_nkjp('../resources/NKJP1M/NKJP1M-frequency.tab') | |
10 | +# # notmatching(nkjp, sgjp, liberal_tagcomp, 'raw_tagged_frequency.tab') # (may take a while) | |
11 | +# # ^D # Ctrl-D when done | |
12 | +# The last argument points the result file, liberal_tagcomp is the most sane tag comparing function. | |
13 | + | |
14 | +import functools | |
15 | +import re | |
16 | +import unicodedata | |
17 | + | |
def load_sgjp(fname):
    """Load an SGJP dictionary dump into a dictionary keyed by lemma.

    The file is tab-separated: word form, lemma (optionally followed by
    ':' and a subidentifier), tags, and optionally notes.  Lines with fewer
    than three columns are reported on stdout and skipped.

    Returns a dict mapping lemma -> list of [word_form, tags, notes,
    lemma_sub] lists, in file order.  Underscores in the lemma are replaced
    with spaces unless the word form itself contains an underscore.
    """
    sgjp = dict()
    with open(fname) as inp:
        for line in inp:
            data = line.strip().split('\t')

            if len(data) < 3:
                print('Skipped line: ' + line.strip())
                continue

            word_form = data[0]

            lemma_sub = ''
            if data[1] == ':':
                # The colon itself is the lemma; keep it a (hashable) string.
                lemma = ':'
            else:
                lemma_parts = data[1].split(':')
                lemma = lemma_parts[0]
                if len(lemma_parts) > 1:
                    lemma_sub = lemma_parts[1] # lemma subidentifier
            if word_form.find('_') == -1:
                lemma = lemma.replace('_', ' ')

            tags = data[2]

            notes = ''
            if len(data) == 4:
                notes = data[3]

            # Every entry has the same four-element shape.
            sgjp.setdefault(lemma, []).append([word_form, tags, notes, lemma_sub])
    return sgjp
52 | + | |
53 | + | |
def load_nkjp(fname):
    """Load a tab-separated NKJP frequency list.

    Returns a list with one list of column strings per valid line.  A line is
    valid when it has exactly 5 tab-separated columns (word form, lemma, tags
    and frequency come first; the meaning of the fifth column is not visible
    here -- TODO confirm).  The zero-based numbers of invalid lines are
    reported on stdout.
    """
    with open(fname) as inp:
        lines = inp.read().split('\n')

    # Build a fresh list instead of deleting from the one being iterated:
    # deleting in place skips the line that follows each invalid one.
    nkjp = []
    for (n, line) in enumerate(lines):
        fields = line.split('\t')
        if len(fields) != 5:
            print('Skipped line: ' + str(n))
            continue
        nkjp.append(fields)
    return nkjp
64 | + | |
def naive_tagcomp(tag1, tag2):
    """Compare two tags for plain equality."""
    same = tag1 == tag2
    return same
67 | + | |
def strict_tagcomp(tag1, tag2):
    """Tell whether the concrete tag1 is covered by the tag pattern tag2.

    Both tags are colon-separated.  They must have the same number of fields
    and an identical first field (the part of speech); every field of tag1
    must then be one of the dot-separated alternatives listed in the
    corresponding field of tag2.
    """
    left = tag1.split(':')
    right = tag2.split(':')

    if len(left) != len(right) or left[0] != right[0]:
        return False

    return all(value in alternatives.split('.')
               for value, alternatives in zip(left, right))
81 | + | |
def liberal_tagcomp(tag1, tag2):
    """Like strict_tagcomp(), but tolerant about gender subclasses.

    The tags must agree in field count and in the first field (part of
    speech).  Before each field is compared, n1/n2/n3 are rewritten to plain
    'n' in tag1, and n1/n2/n3/p2/p3 are rewritten to 'n' in tag2.  A tag2
    field whose first alternative is '_' accepts any value.
    """
    fields = tag1.split(':')
    model_fields = tag2.split(':')

    if len(fields) != len(model_fields) or fields[0] != model_fields[0]:
        return False

    for value, model_field in zip(fields, model_fields):
        value = re.sub(r'(n1|n2|n3)', 'n', value)
        accepted = re.sub(r'(n1|n2|n3|p2|p3)', 'n', model_field).split('.')
        # underscore works as a catch-all
        if accepted[0] == '_' or value in accepted:
            continue
        return False

    return True
98 | + | |
def compare_entries(nkjp_entry, sgjp_forms, tagcomp_func):
    """Look for an NKJP entry among the SGJP forms of its lemma.

    nkjp_entry   -- sequence whose item 0 is the word form and item 2 the tag
                    (item 1 is only used in a diagnostic message).
    sgjp_forms   -- sequences whose item 0 is a word form and item 1 a tag.
    tagcomp_func -- callable(nkjp_tag, sgjp_tag) returning a truth value.

    A trailing colon is removed from the NKJP tag (a message is printed for
    every inspected SGJP form when that happens).  Only forms with a matching
    tag are considered.  Returns a tuple of four booleans:
    (exact word form found,
     found after Aaaa -> aaaa,
     found after AAAA -> Aaaa,
     found after AAAA -> aaaa or A -> a).
    The search stops at the first exact match.
    """
    exact = uncapitalized = capitalized = lowered = False

    for candidate in sgjp_forms:
        word = nkjp_entry[0]
        tag = re.sub(r':$', '', nkjp_entry[2])
        if tag != nkjp_entry[2]:
            print("Corrected tag %s for %s %s" % (nkjp_entry[2], nkjp_entry[0], nkjp_entry[1]))
        cand_word = candidate[0]
        cand_tag = candidate[1]

        if not tagcomp_func(tag, cand_tag):
            continue

        if cand_word == word:
            exact = True
            break

        # tag okay, try with other letter cases
        word_lower = word.lower()
        if (len(word) > 1 and word_lower.capitalize() == word
                and cand_word == word_lower): # Aaaa -> aaaa
            uncapitalized = True
        if not uncapitalized and word_lower != word:
            if cand_word == word.capitalize(): # AAAA -> Aaaa
                capitalized = True
            elif cand_word == word_lower: # AAAA -> aaaa, A -> a
                lowered = True

    return (exact, uncapitalized, capitalized, lowered)
128 | + | |
def tab_format(collection, label):
    """Convert a collection used by notmatching() to a string of tabbed entries.

    Every entry (a sequence of strings) becomes one line: its items joined
    with tabs, followed by a tab, the label and a newline.  An empty
    collection gives an empty string.
    """
    # join() keeps this linear; repeated '+' on a growing string is quadratic
    return ''.join('\t'.join(etr) + '\t' + label + '\n' for etr in collection)
136 | + | |
def nonalphab(string):
    """Return True when the string contains no character of a Unicode letter category."""
    has_letter = any(unicodedata.category(char)[0] == 'L' # 'letter'
                     for char in string)
    return not has_letter
142 | + | |
def notmatching(nkjp, sgjp, tagcomp_func, result_file):
    """Classify NKJP frequency-list entries against SGJP and save the result.

    nkjp         -- list of entries; item 0 is the word form, item 1 the
                    lemma, item 2 the tag and item 3 the frequency (a string
                    convertible to int).
    sgjp         -- dictionary lemma -> list of forms, as from load_sgjp().
    tagcomp_func -- tag comparing function handed over to compare_entries().
    result_file  -- path of the file to write the labelled entries to.

    Every entry lands in exactly one category: exact match, one of three
    letter-case variants, match with lowered lemma, symbol (abbreviations and
    entries without letters), unmatched numeric, or unmatched other.  Summary
    statistics are printed to stdout, and every category (sorted by
    descending frequency) is written to result_file with its label columns.
    """
    matching = []
    case1_notmatching = [] # Aaaa -> aaaa
    case2_notmatching = [] # AAAA -> Aaaa
    case3_notmatching = [] # AAAA -> aaaa, A -> a
    lower_matching = [] # matching with form and lemma converted to lowercase
    symbols = []
    notmatching_numeric = []
    notmatching_other = []

    for nkjp_entry in nkjp:

        lemma = nkjp_entry[1].strip()
        form = nkjp_entry[0].strip()
        # Warn about stripped whitespaces.
        if lemma != nkjp_entry[1]:
            print("Stripped whitespaces in lemma: %s" % nkjp_entry[1])
        if form != nkjp_entry[0]:
            print("Stripped whitespaces in form: %s" % nkjp_entry[0])

        # Abbreviations are automatically classified as symbols.
        if nkjp_entry[2][:4] == 'brev':
            symbols.append(nkjp_entry)
            continue

        sgjp_forms = None
        lowered_lemma = False # indicates if lemma was converted to lowercase
        if lemma in sgjp: # lemma matching
            sgjp_forms = sgjp[lemma]
        elif lemma.lower() in sgjp:
            lowered_lemma = True
            sgjp_forms = sgjp[lemma.lower()]

        if sgjp_forms is None:
            # Even the lowered lemma is unknown to SGJP: nothing can match, the
            # entry is classified below exactly once.
            found = case1 = case2 = case3 = False
        else:
            case = compare_entries(nkjp_entry, sgjp_forms, tagcomp_func)

            # one more desperate attempt at lowering the lemma, if nothing was found
            if (not lowered_lemma and True not in case
                    and lemma.lower() != lemma and lemma.lower() in sgjp):
                retry = compare_entries(nkjp_entry, sgjp[lemma.lower()], tagcomp_func)
                if True in retry:
                    case = retry
                    lowered_lemma = True

            found, case1, case2, case3 = case

        if lowered_lemma and (found or case1 or case2 or case3):
            lower_matching.append(nkjp_entry)
        elif found:
            matching.append(nkjp_entry)
        elif nonalphab(form) and nonalphab(lemma):
            symbols.append(nkjp_entry)
        # No re.LOCALE flag here: it is rejected for str patterns.
        elif re.match(r"^[123456789]", form) is not None:
            notmatching_numeric.append(nkjp_entry)
        elif case1:
            case1_notmatching.append(nkjp_entry)
        elif case2:
            case2_notmatching.append(nkjp_entry)
        elif case3:
            case3_notmatching.append(nkjp_entry)
        else:
            # when everything failed:
            notmatching_other.append(nkjp_entry)

    collections = [nkjp, matching, case1_notmatching, case2_notmatching, case3_notmatching,
                   lower_matching, symbols, notmatching_numeric, notmatching_other]
    # sort the entries in collections by frequency
    collections = [sorted(coll, reverse=True, key=(lambda etr: int(etr[3])))
                   for coll in collections]
    # sum of sets' frequencies; sum() copes with empty collections
    freqs = [sum(int(etr[3]) for etr in coll) for coll in collections]
    descs = ["Total:",
             "Found:",
             "Found when Aaa -> aaa (lemma):",
             "Found when AAA -> Aaa (lemma):",
             "Found when AAA -> aaa (lemma):",
             "Found when word form and lemma are converted to lowercase:",
             "Symbols:",
             "Not found, numeric:",
             "Not found, other:"]

    def percent(part, whole):
        # guard against an empty list or a zero total frequency
        return 100.0*(part/whole) if whole else 0.0

    for (i, coll) in enumerate(collections):
        info = (len(coll), percent(len(coll), len(collections[0])),
                freqs[i], percent(freqs[i], freqs[0]))
        print((descs[i]+" %d entries (%.2f%%), %d occurences (%.2f%%)") % info)

    # below we skip nkjp, which contains everything
    labels = ['SGJP-EXACT\tNCH\tCORR', 'SGJP-LMM-UNCAPITAL\tNCH\tCORR',
              'SGJP-LMM-CAPITAL\tNCH\tCORR', 'SGJP-LMM-LOWER\tNCH\tCORR',
              'SGJP-BTH-LOWER\tNCH\tCORR', 'NON-SGJP\tSYMB\tCORR',
              'NON-SGJP\tLATEK\tCORR', 'NON-SGJP\tCW\tCORR']
    with open(result_file, 'w+') as out:
        for (c, coll) in enumerate(collections[1:]):
            print(tab_format(coll, labels[c]), file=out)
... | ... |
morphology/compos_alt.py
0 → 100644
1 | +# Blame Szymon Rutkowski - szymon@szymonrutkowski.pl - Nov 2016. | |
2 | +# This file is intended to check the (partially tagged) NKJP1M frequency list against the list of | |
3 | +# exceptions from the morphological rules derived from SGJP. | |
4 | +# If you want to use this, review the end of this file (filenames, column structure) and run with python3. | |
5 | + | |
6 | +import re | |
7 | + | |
8 | +# Copied from compare_morphosyn.py, so that each of these scripts stays self-contained. | |
9 | +# Note that liberal_tagcomp is mainly suitable for checking NKJP against SGJP; when checking | |
10 | +# a resource that follows a more SGJP-like tagging convention, strict_tagcomp is the better choice. | |
def strict_tagcomp(tag1, tag2):
    """Tell whether the concrete tag1 is covered by the tag pattern tag2.

    Both tags are colon-separated.  They must have the same number of fields
    and an identical first field (the part of speech); every field of tag1
    must then be one of the dot-separated alternatives listed in the
    corresponding field of tag2.
    """
    left = tag1.split(':')
    right = tag2.split(':')

    if len(left) != len(right) or left[0] != right[0]:
        return False

    return all(value in alternatives.split('.')
               for value, alternatives in zip(left, right))
24 | + | |
def liberal_tagcomp(tag1, tag2):
    """Like strict_tagcomp(), but tolerant about gender subclasses.

    The tags must agree in field count and in the first field (part of
    speech).  Before each field is compared, n1/n2/n3 are rewritten to plain
    'n' in tag1, and n1/n2/n3/p2/p3 are rewritten to 'n' in tag2.  A tag2
    field whose first alternative is '_' accepts any value.
    """
    fields = tag1.split(':')
    model_fields = tag2.split(':')

    if len(fields) != len(model_fields) or fields[0] != model_fields[0]:
        return False

    for value, model_field in zip(fields, model_fields):
        value = re.sub(r'(n1|n2|n3)', 'n', value)
        accepted = re.sub(r'(n1|n2|n3|p2|p3)', 'n', model_field).split('.')
        # underscore works as a catch-all
        if accepted[0] == '_' or value in accepted:
            continue
        return False

    return True
41 | + | |
42 | +# the bulk of the following ripped from check_rule_compos.py | |
def esccurl(string):
    """Double every curly bracket, so the string survives str.format() intact."""
    doubled_open = string.replace('{', '{{')
    return doubled_open.replace('}', '}}')
46 | + | |
# Index of the alternative (exceptional) forms, keyed by word form.  Each value
# is a pair of parallel lists: [lemmas, tags].
alt_idx = dict()

with open('../resources/SGJP/alt.tab') as alt_src:
    for line in alt_src:
        line = line.strip()
        data = line.split('\t')
        if len(data) != 3:
            print('Skipped line in the alt list: '+line)
            continue

        alt_form, alt_lemma, alt_tag = data
        # handle lemmas with subclassification after colon (a lemma that is
        # just a colon stays as it is)
        colon_at = alt_lemma.find(':')
        if colon_at != -1 and alt_lemma != ':':
            alt_lemma = alt_lemma[:colon_at]

        known_lemmas, known_tags = alt_idx.setdefault(alt_form, [[], []])
        known_lemmas.append(alt_lemma)
        known_tags.append(alt_tag)
65 | + | |
def _matches_alt(word_form, lemma, tag):
    "Tell whether alt_idx lists word_form with this lemma and a tag compatible with tag."
    if word_form not in alt_idx:
        return False
    alt_lemmas, alt_tags = alt_idx[word_form]
    # The lemma has to belong to the very alternative whose tag matches.  All
    # alternatives are examined, the first one (index 0) included.
    return any(alt_lemma == lemma and liberal_tagcomp(tag, alt_tag)
               for (alt_lemma, alt_tag) in zip(alt_lemmas, alt_tags))


# Re-tag the (already tagged) NKJP1M frequency list: rows found on the alt list
# get their fifth column replaced; all other rows are copied unchanged.
with open('../resources/NKJP1M/NKJP1M-tagged-frequency.tab') as inp:
    with open('freq_with_alt.tab', 'w+') as out:
        for line in inp:
            line = line.strip()
            data = line.split('\t')
            if len(data) != 8: # column count of TAGGED frequency list
                print('Skipped line in the list: '+line)
                continue

            # The previous COMPOS classification lives in data[4] and is
            # dropped here.  For a raw (untagged) frequency list one would
            # rather append a column:  fmt = esccurl(line) + '\t{0}'
            fmt = esccurl('\t'.join(data[0:4])) + '\t{0}\t' + esccurl('\t'.join(data[5:]))

            if _matches_alt(data[0], data[1], data[2]):
                print(fmt.format('COMPOS-ALT'), file=out)
            # try again with lowering word form and lemma (the tag stays the same):
            elif _matches_alt(data[0].lower(), data[1].lower(), data[2]):
                print(fmt.format('COMPOS-LWR-ALT'), file=out)
            else:
                print(line, file=out)
... | ... |
morphology/data/interps_general.tab
0 → 100644
1 | +adj-sup adj:sg:nom.voc:n1.n2:sup Ca | |
2 | +adj-sup adj:sg:nom.voc:m1.m2.m3:sup Cb | |
3 | +adj-sup adj:sg:nom.voc:f:sup Cc | |
4 | +adj-sup adj:sg:loc:m1.m2.m3.n1.n2:sup Cd | |
5 | +adj-sup adj:sg:loc:f:sup Ce | |
6 | +adj-sup adj:sg:inst:m1.m2.m3.n1.n2:sup Cf | |
7 | +adj-sup adj:sg:inst:f:sup Cg | |
8 | +adj-sup adj:sg:gen:m1.m2.m3.n1.n2:sup Ch | |
9 | +adj-sup adj:sg:gen:f:sup Ci | |
10 | +adj-sup adj:sg:dat:m1.m2.m3.n1.n2:sup Cj | |
11 | +adj-sup adj:sg:dat:f:sup Ck | |
12 | +adj-sup adj:sg:acc:n1.n2:sup Cl | |
13 | +adj-sup adj:sg:acc:m3:sup Cm | |
14 | +adj-sup adj:sg:acc:m1.m2:sup Cn | |
15 | +adj-sup adj:sg:acc:f:sup Co | |
16 | +adj-sup adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:sup Cp | |
17 | +adj-sup adj:pl:nom.voc:m1.p1:sup Cq | |
18 | +adj-sup adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cr | |
19 | +adj-sup adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cs | |
20 | +adj-sup adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Ct | |
21 | +adj-sup adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:sup Cu | |
22 | +adj-sup adj:pl:acc:m2.m3.f.n1.n2.p2.p3:sup Cv | |
23 | +adj-sup adj:pl:acc:m1.p1:sup Cw | |
24 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:perf:neg Ua | |
25 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf:neg Ua | |
26 | +verb-neg ppas:sg:nom.voc:m1.m2.m3:imperf.perf:neg Ua | |
27 | +verb-neg ppas:sg:nom.voc:f:perf:neg Ub | |
28 | +verb-neg ppas:sg:nom.voc:f:imperf:neg Ub | |
29 | +verb-neg ppas:sg:nom.voc:f:imperf.perf:neg Ub | |
30 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:perf:neg Uc | |
31 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf:neg Uc | |
32 | +verb-neg ppas:sg:nom.acc.voc:n1.n2:imperf.perf:neg Uc | |
33 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:neg Ud | |
34 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Ud | |
35 | +verb-neg ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Ud | |
36 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:perf:neg Ue | |
37 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ue | |
38 | +verb-neg ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ue | |
39 | +verb-neg ppas:sg:gen.dat.loc:f:perf:neg Uf | |
40 | +verb-neg ppas:sg:gen.dat.loc:f:imperf:neg Uf | |
41 | +verb-neg ppas:sg:gen.dat.loc:f:imperf.perf:neg Uf | |
42 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:perf:neg Ug | |
43 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf:neg Ug | |
44 | +verb-neg ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Ug | |
45 | +verb-neg ppas:sg:acc:m3:perf:neg Uh | |
46 | +verb-neg ppas:sg:acc:m3:imperf:neg Uh | |
47 | +verb-neg ppas:sg:acc:m3:imperf.perf:neg Uh | |
48 | +verb-neg ppas:sg:acc:m1.m2:perf:neg Ui | |
49 | +verb-neg ppas:sg:acc:m1.m2:imperf:neg Ui | |
50 | +verb-neg ppas:sg:acc:m1.m2:imperf.perf:neg Ui | |
51 | +verb-neg ppas:sg:acc.inst:f:perf:neg Uj | |
52 | +verb-neg ppas:sg:acc.inst:f:imperf:neg Uj | |
53 | +verb-neg ppas:sg:acc.inst:f:imperf.perf:neg Uj | |
54 | +verb-neg ppas:pl:nom.voc:m1.p1:perf:neg Uk | |
55 | +verb-neg ppas:pl:nom.voc:m1.p1:imperf:neg Uk | |
56 | +verb-neg ppas:pl:nom.voc:m1.p1:imperf.perf:neg Uk | |
57 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:neg Ul | |
58 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Ul | |
59 | +verb-neg ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Ul | |
60 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Um | |
61 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Um | |
62 | +verb-neg ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Um | |
63 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Un | |
64 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Un | |
65 | +verb-neg ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Un | |
66 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:neg Uo | |
67 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Uo | |
68 | +verb-neg ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Uo | |
69 | +verb-neg ppas:pl:acc:m1.p1:perf:neg Up | |
70 | +verb-neg ppas:pl:acc:m1.p1:imperf:neg Up | |
71 | +verb-neg ppas:pl:acc:m1.p1:imperf.perf:neg Up | |
72 | +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf:neg Va | |
73 | +verb-neg pact:sg:nom.voc:m1.m2.m3:imperf.perf:neg Va | |
74 | +verb-neg pact:sg:nom.voc:f:imperf:neg Vb | |
75 | +verb-neg pact:sg:nom.voc:f:imperf.perf:neg Vb | |
76 | +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf:neg Vc | |
77 | +verb-neg pact:sg:nom.acc.voc:n1.n2:imperf.perf:neg Vc | |
78 | +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:neg Vd | |
79 | +verb-neg pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:neg Vd | |
80 | +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf:neg Ve | |
81 | +verb-neg pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:neg Ve | |
82 | +verb-neg pact:sg:gen.dat.loc:f:imperf:neg Vf | |
83 | +verb-neg pact:sg:gen.dat.loc:f:imperf.perf:neg Vf | |
84 | +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf:neg Vg | |
85 | +verb-neg pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:neg Vg | |
86 | +verb-neg pact:sg:acc:m3:imperf:neg Vh | |
87 | +verb-neg pact:sg:acc:m3:imperf.perf:neg Vh | |
88 | +verb-neg pact:sg:acc:m1.m2:imperf:neg Vi | |
89 | +verb-neg pact:sg:acc:m1.m2:imperf.perf:neg Vi | |
90 | +verb-neg pact:sg:acc.inst:f:imperf:neg Vj | |
91 | +verb-neg pact:sg:acc.inst:f:imperf.perf:neg Vj | |
92 | +verb-neg pact:pl:nom.voc:m1.p1:imperf:neg Vk | |
93 | +verb-neg pact:pl:nom.voc:m1.p1:imperf.perf:neg Vk | |
94 | +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:neg Vl | |
95 | +verb-neg pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:neg Vl | |
96 | +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vm | |
97 | +verb-neg pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vm | |
98 | +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vn | |
99 | +verb-neg pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vn | |
100 | +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:neg Vo | |
101 | +verb-neg pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:neg Vo | |
102 | +verb-neg pact:pl:acc:m1.p1:imperf:neg Vp | |
103 | +verb-neg pact:pl:acc:m1.p1:imperf.perf:neg Vp | |
104 | +verb-neg ger:sg:nom.acc:n2:perf:neg Ta | |
105 | +verb-neg ger:sg:nom.acc:n2:imperf:neg Ta | |
106 | +verb-neg ger:sg:nom.acc:n2:imperf.perf:neg Ta | |
107 | +verb-neg ger:sg:inst:n2:perf:neg Tb | |
108 | +verb-neg ger:sg:inst:n2:imperf:neg Tb | |
109 | +verb-neg ger:sg:inst:n2:imperf.perf:neg Tb | |
110 | +verb-neg ger:sg:gen:n2:perf:neg Tc | |
111 | +verb-neg ger:sg:gen:n2:imperf:neg Tc | |
112 | +verb-neg ger:sg:gen:n2:imperf.perf:neg Tc | |
113 | +verb-neg ger:sg:dat.loc:n2:perf:neg Td | |
114 | +verb-neg ger:sg:dat.loc:n2:imperf:neg Td | |
115 | +verb-neg ger:sg:dat.loc:n2:imperf.perf:neg Td | |
116 | +verb-neg ger:pl:nom.acc:n2:perf:neg Te | |
117 | +verb-neg ger:pl:nom.acc:n2:imperf:neg Te | |
118 | +verb-neg ger:pl:nom.acc:n2:imperf.perf:neg Te | |
119 | +verb-neg ger:pl:loc:n2:perf:neg Tf | |
120 | +verb-neg ger:pl:loc:n2:imperf:neg Tf | |
121 | +verb-neg ger:pl:loc:n2:imperf.perf:neg Tf | |
122 | +verb-neg ger:pl:inst:n2:perf:neg Tg | |
123 | +verb-neg ger:pl:inst:n2:imperf:neg Tg | |
124 | +verb-neg ger:pl:inst:n2:imperf.perf:neg Tg | |
125 | +verb-neg ger:pl:gen:n2:perf:neg Th | |
126 | +verb-neg ger:pl:gen:n2:imperf:neg Th | |
127 | +verb-neg ger:pl:gen:n2:imperf.perf:neg Th | |
128 | +verb-neg ger:pl:dat:n2:perf:neg Ti | |
129 | +verb-neg ger:pl:dat:n2:imperf:neg Ti | |
130 | +verb-neg ger:pl:dat:n2:imperf.perf:neg Ti | |
131 | +other winien:sg:n1.n2:ter:imperf W | |
132 | +other winien:sg:n1.n2:sec:imperf W | |
133 | +other winien:sg:n1.n2:pri:imperf W | |
134 | +other winien:sg:n1.n2:imperf W | |
135 | +other winien:sg:m1.m2.m3:ter:imperf W | |
136 | +other winien:sg:m1.m2.m3:sec:imperf W | |
137 | +other winien:sg:m1.m2.m3:pri:imperf W | |
138 | +other winien:sg:m1.m2.m3:imperf W | |
139 | +other winien:sg:f:ter:imperf W | |
140 | +other winien:sg:f:sec:imperf W | |
141 | +other winien:sg:f:pri:imperf W | |
142 | +other winien:sg:f:imperf W | |
143 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf W | |
144 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf W | |
145 | +other winien:pl:m2.m3.f.n1.n2.p2.p3:imperf W | |
146 | +other winien:pl:m1.p1:ter:imperf W | |
147 | +other winien:pl:m1.p1:sec:imperf W | |
148 | +other winien:pl:m1.p1:pri:imperf W | |
149 | +other winien:pl:m1.p1:imperf W | |
150 | +noun subst:sg:voc:n2 Gva | |
151 | +noun subst:sg:voc:n1 Gvb | |
152 | +noun subst:sg:voc:m3 Gvc | |
153 | +noun subst:sg:voc:m2 Gvd | |
154 | +noun subst:sg:voc:m1 Gve | |
155 | +noun subst:sg:voc:f Gvf | |
156 | +noun subst:sg:nom:n2 Gna | |
157 | +noun subst:sg:nom:n1 Gnb | |
158 | +noun subst:sg:nom:m3 Gnc | |
159 | +noun subst:sg:nom:m2 Gnd | |
160 | +noun subst:sg:nom:m1 Gne | |
161 | +noun subst:sg:nom:f Gnf | |
162 | +noun subst:sg:loc:n2 Gla | |
163 | +noun subst:sg:loc:n1 Glb | |
164 | +noun subst:sg:loc:m3 Glc | |
165 | +noun subst:sg:loc:m2 Gld | |
166 | +noun subst:sg:loc:m1 Gle | |
167 | +noun subst:sg:loc:f Glf | |
168 | +noun subst:sg:inst:n2 Gia | |
169 | +noun subst:sg:inst:n1 Gib | |
170 | +noun subst:sg:inst:m3 Gic | |
171 | +noun subst:sg:inst:m2 Gid | |
172 | +noun subst:sg:inst:m1 Gie | |
173 | +noun subst:sg:inst:f Gif | |
174 | +noun subst:sg:gen:n2 Gga | |
175 | +noun subst:sg:gen:n1 Ggb | |
176 | +noun subst:sg:gen:m3 Ggc | |
177 | +noun subst:sg:gen:m2 Ggd | |
178 | +noun subst:sg:gen:m1 Gge | |
179 | +noun subst:sg:gen:f Ggf | |
180 | +noun subst:sg:dat:n2 Gda | |
181 | +noun subst:sg:dat:n1 Gdb | |
182 | +noun subst:sg:dat:m3 Gdc | |
183 | +noun subst:sg:dat:m2 Gdd | |
184 | +noun subst:sg:dat:m1 Gde | |
185 | +noun subst:sg:dat:f Gdf | |
186 | +noun subst:sg:acc:n2 Gaa | |
187 | +noun subst:sg:acc:n1 Gab | |
188 | +noun subst:sg:acc:m3 Gac | |
189 | +noun subst:sg:acc:m2 Gad | |
190 | +noun subst:sg:acc:m1 Gae | |
191 | +noun subst:sg:acc:f Gaf | |
192 | +noun subst:pl:voc:p3 Yvp | |
193 | +noun subst:pl:voc:p2 Yvq | |
194 | +noun subst:pl:voc:p1 Yvr | |
195 | +noun subst:pl:voc:n2 Yva | |
196 | +noun subst:pl:voc:n1 Yvb | |
197 | +noun subst:pl:voc:m3 Yvc | |
198 | +noun subst:pl:voc:m2 Yvd | |
199 | +noun subst:pl:voc:m1 Yve | |
200 | +noun subst:pl:voc:f Yvf | |
201 | +noun subst:pl:nom:p3 Ynp | |
202 | +noun subst:pl:nom:p2 Ynq | |
203 | +noun subst:pl:nom:p1 Ynr | |
204 | +noun subst:pl:nom:n2 Yna | |
205 | +noun subst:pl:nom:n1 Ynb | |
206 | +noun subst:pl:nom:m3 Ync | |
207 | +noun subst:pl:nom:m2 Ynd | |
208 | +noun subst:pl:nom:m1 Yne | |
209 | +noun subst:pl:nom:f Ynf | |
210 | +noun subst:pl:loc:p3 Ylp | |
211 | +noun subst:pl:loc:p2 Ylq | |
212 | +noun subst:pl:loc:p1 Ylr | |
213 | +noun subst:pl:loc:n2 Yla | |
214 | +noun subst:pl:loc:n1 Ylb | |
215 | +noun subst:pl:loc:m3 Ylc | |
216 | +noun subst:pl:loc:m2 Yld | |
217 | +noun subst:pl:loc:m1 Yle | |
218 | +noun subst:pl:loc:f Ylf | |
219 | +noun subst:pl:inst:p3 Yip | |
220 | +noun subst:pl:inst:p2 Yiq | |
221 | +noun subst:pl:inst:p1 Yir | |
222 | +noun subst:pl:inst:n2 Yia | |
223 | +noun subst:pl:inst:n1 Yib | |
224 | +noun subst:pl:inst:m3 Yic | |
225 | +noun subst:pl:inst:m2 Yid | |
226 | +noun subst:pl:inst:m1 Yie | |
227 | +noun subst:pl:inst:f Yif | |
228 | +noun subst:pl:gen:p3 Ygp | |
229 | +noun subst:pl:gen:p2 Ygq | |
230 | +noun subst:pl:gen:p1 Ygr | |
231 | +noun subst:pl:gen:n2 Yga | |
232 | +noun subst:pl:gen:n1 Ygb | |
233 | +noun subst:pl:gen:m3 Ygc | |
234 | +noun subst:pl:gen:m2 Ygd | |
235 | +noun subst:pl:gen:m1 Yge | |
236 | +noun subst:pl:gen:f Ygf | |
237 | +noun subst:pl:dat:p3 Ydp | |
238 | +noun subst:pl:dat:p2 Ydq | |
239 | +noun subst:pl:dat:p1 Ydr | |
240 | +noun subst:pl:dat:n2 Yda | |
241 | +noun subst:pl:dat:n1 Ydb | |
242 | +noun subst:pl:dat:m3 Ydc | |
243 | +noun subst:pl:dat:m2 Ydd | |
244 | +noun subst:pl:dat:m1 Yde | |
245 | +noun subst:pl:dat:f Ydf | |
246 | +noun subst:pl:acc:p3 Yap | |
247 | +noun subst:pl:acc:p2 Yaq | |
248 | +noun subst:pl:acc:p1 Yar | |
249 | +noun subst:pl:acc:n2 Yaa | |
250 | +noun subst:pl:acc:n1 Yab | |
251 | +noun subst:pl:acc:m3 Yac | |
252 | +noun subst:pl:acc:m2 Yad | |
253 | +noun subst:pl:acc:m1 Yae | |
254 | +noun subst:pl:acc:f Yaf | |
255 | +other qub W | |
256 | +other prep:nom W | |
257 | +other prep:loc:wok W | |
258 | +other prep:loc:nwok W | |
259 | +other prep:loc W | |
260 | +other prep:inst:wok W | |
261 | +other prep:inst:nwok W | |
262 | +other prep:inst W | |
263 | +other prep:gen:wok W | |
264 | +other prep:gen:nwok W | |
265 | +other prep:gen W | |
266 | +other prep:dat W | |
267 | +other prep:acc:wok W | |
268 | +other prep:acc:nwok W | |
269 | +other prep:acc W | |
270 | +other pred W | |
271 | +verb praet:sg:n1.n2:ter:perf Ja | |
272 | +verb praet:sg:n1.n2:ter:imperf.perf Ja | |
273 | +verb praet:sg:n1.n2:ter:imperf Ja | |
274 | +verb praet:sg:n1.n2:sec:perf Jb | |
275 | +verb praet:sg:n1.n2:sec:imperf.perf Jb | |
276 | +verb praet:sg:n1.n2:sec:imperf Jb | |
277 | +verb praet:sg:n1.n2:pri:perf Jc | |
278 | +verb praet:sg:n1.n2:pri:imperf.perf Jc | |
279 | +verb praet:sg:n1.n2:pri:imperf Jc | |
280 | +verb praet:sg:n1.n2:perf Jd | |
281 | +verb praet:sg:n1.n2:imperf.perf Jd | |
282 | +verb praet:sg:n1.n2:imperf Jd | |
283 | +verb praet:sg:m1.m2.m3:ter:perf Je | |
284 | +verb praet:sg:m1.m2.m3:ter:imperf.perf Je | |
285 | +verb praet:sg:m1.m2.m3:ter:imperf Je | |
286 | +verb praet:sg:m1.m2.m3:sec:perf Jf | |
287 | +verb praet:sg:m1.m2.m3:sec:imperf.perf Jf | |
288 | +verb praet:sg:m1.m2.m3:sec:imperf Jf | |
289 | +verb praet:sg:m1.m2.m3:pri:perf Jg | |
290 | +verb praet:sg:m1.m2.m3:pri:imperf.perf Jg | |
291 | +verb praet:sg:m1.m2.m3:pri:imperf Jg | |
292 | +verb praet:sg:m1.m2.m3:perf:nagl.agl Jh | |
293 | +verb praet:sg:m1.m2.m3:imperf:nagl.agl Jh | |
294 | +verb praet:sg:m1.m2.m3:imperf.perf Jh | |
295 | +verb praet:sg:f:ter:perf Ji | |
296 | +verb praet:sg:f:ter:imperf Ji | |
297 | +verb praet:sg:f:sec:perf Jj | |
298 | +verb praet:sg:f:sec:imperf Jj | |
299 | +verb praet:sg:f:pri:perf Jk | |
300 | +verb praet:sg:f:pri:imperf Jk | |
301 | +verb praet:sg:f:perf Jl | |
302 | +verb praet:sg:f:imperf Jl | |
303 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Jm | |
304 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Jm | |
305 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Jm | |
306 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Jn | |
307 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Jn | |
308 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Jn | |
309 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Jo | |
310 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Jo | |
311 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Jo | |
312 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:perf Jp | |
313 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf.perf Jp | |
314 | +verb praet:pl:m2.m3.f.n1.n2.p2.p3:imperf Jp | |
315 | +verb praet:pl:m1.p1:ter:perf Jq | |
316 | +verb praet:pl:m1.p1:ter:imperf Jq | |
317 | +verb praet:pl:m1.p1:sec:perf Jr | |
318 | +verb praet:pl:m1.p1:sec:imperf Jr | |
319 | +verb praet:pl:m1.p1:pri:perf Js | |
320 | +verb praet:pl:m1.p1:pri:imperf Js | |
321 | +verb praet:pl:m1.p1:perf Jt | |
322 | +verb praet:pl:m1.p1:imperf Jt | |
323 | +other ppron3:sg:nom:n1.n2:ter:akc.nakc:praep.npraep W | |
324 | +other ppron3:sg:nom:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
325 | +other ppron3:sg:nom:f:ter:akc.nakc:praep.npraep W | |
326 | +other ppron3:sg:loc:n1.n2:ter:akc.nakc:praep.npraep W | |
327 | +other ppron3:sg:loc:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
328 | +other ppron3:sg:loc:f:ter:akc.nakc:praep.npraep W | |
329 | +other ppron3:sg:inst:n1.n2:ter:akc.nakc:praep.npraep W | |
330 | +other ppron3:sg:inst:m1.m2.m3:ter:akc.nakc:praep.npraep W | |
331 | +other ppron3:sg:inst:f:ter:akc.nakc:praep.npraep W | |
332 | +other ppron3:sg:gen:n1.n2:ter:nakc:npraep W | |
333 | +other ppron3:sg:gen:n1.n2:ter:akc:npraep W | |
334 | +other ppron3:sg:gen:n1.n2:ter:akc.nakc:praep W | |
335 | +other ppron3:sg:gen:m1.m2.m3:ter:nakc:praep W | |
336 | +other ppron3:sg:gen:m1.m2.m3:ter:nakc:npraep W | |
337 | +other ppron3:sg:gen:m1.m2.m3:ter:akc:praep W | |
338 | +other ppron3:sg:gen:m1.m2.m3:ter:akc:npraep W | |
339 | +other ppron3:sg:gen:f:ter:akc.nakc:praep W | |
340 | +other ppron3:sg:gen:f:ter:akc.nakc:npraep W | |
341 | +other ppron3:sg:dat:n1.n2:ter:nakc:npraep W | |
342 | +other ppron3:sg:dat:n1.n2:ter:akc:npraep W | |
343 | +other ppron3:sg:dat:n1.n2:ter:akc.nakc:praep W | |
344 | +other ppron3:sg:dat:m1.m2.m3:ter:nakc:npraep W | |
345 | +other ppron3:sg:dat:m1.m2.m3:ter:akc:npraep W | |
346 | +other ppron3:sg:dat:m1.m2.m3:ter:akc.nakc:praep W | |
347 | +other ppron3:sg:dat:f:ter:akc.nakc:praep W | |
348 | +other ppron3:sg:dat:f:ter:akc.nakc:npraep W | |
349 | +other ppron3:sg:acc:n1.n2:ter:akc.nakc:praep W | |
350 | +other ppron3:sg:acc:n1.n2:ter:akc.nakc:npraep W | |
351 | +other ppron3:sg:acc:m1.m2.m3:ter:nakc:praep W | |
352 | +other ppron3:sg:acc:m1.m2.m3:ter:nakc:npraep W | |
353 | +other ppron3:sg:acc:m1.m2.m3:ter:akc:praep W | |
354 | +other ppron3:sg:acc:m1.m2.m3:ter:akc:npraep W | |
355 | +other ppron3:sg:acc:f:ter:akc.nakc:praep W | |
356 | +other ppron3:sg:acc:f:ter:akc.nakc:npraep W | |
357 | +other ppron3:pl:nom:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep.npraep W | |
358 | +other ppron3:pl:nom:m1.p1:ter:akc.nakc:praep.npraep W | |
359 | +other ppron3:pl:loc:_:ter:akc.nakc:praep.npraep W | |
360 | +other ppron3:pl:inst:_:ter:akc.nakc:praep.npraep W | |
361 | +other ppron3:pl:gen:_:ter:akc.nakc:praep W | |
362 | +other ppron3:pl:gen:_:ter:akc.nakc:npraep W | |
363 | +other ppron3:pl:dat:_:ter:akc.nakc:praep W | |
364 | +other ppron3:pl:dat:_:ter:akc.nakc:npraep W | |
365 | +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:praep W | |
366 | +other ppron3:pl:acc:m2.m3.f.n1.n2.p2.p3:ter:akc.nakc:npraep W | |
367 | +other ppron3:pl:acc:m1.p1:ter:akc.nakc:praep W | |
368 | +other ppron3:pl:acc:m1.p1:ter:akc.nakc:npraep W | |
369 | +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:sec W | |
370 | +other ppron12:sg:voc:m1.m2.m3.f.n1.n2:pri W | |
371 | +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:sec W | |
372 | +other ppron12:sg:nom:m1.m2.m3.f.n1.n2:pri W | |
373 | +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:sec W | |
374 | +other ppron12:sg:loc:m1.m2.m3.f.n1.n2:pri W | |
375 | +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:sec W | |
376 | +other ppron12:sg:inst:m1.m2.m3.f.n1.n2:pri W | |
377 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:nakc W | |
378 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:sec:akc W | |
379 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:nakc W | |
380 | +other ppron12:sg:gen:m1.m2.m3.f.n1.n2:pri:akc W | |
381 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:nakc W | |
382 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:sec:akc W | |
383 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:nakc W | |
384 | +other ppron12:sg:dat:m1.m2.m3.f.n1.n2:pri:akc W | |
385 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:nakc W | |
386 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:sec:akc W | |
387 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:nakc W | |
388 | +other ppron12:sg:acc:m1.m2.m3.f.n1.n2:pri:akc W | |
389 | +other ppron12:pl:voc:_:sec W | |
390 | +other ppron12:pl:voc:_:pri W | |
391 | +other ppron12:pl:nom:_:sec W | |
392 | +other ppron12:pl:nom:_:pri W | |
393 | +other ppron12:pl:loc:_:sec W | |
394 | +other ppron12:pl:loc:_:pri W | |
395 | +other ppron12:pl:inst:_:sec W | |
396 | +other ppron12:pl:inst:_:pri W | |
397 | +other ppron12:pl:gen:_:sec W | |
398 | +other ppron12:pl:gen:_:pri W | |
399 | +other ppron12:pl:dat:_:sec W | |
400 | +other ppron12:pl:dat:_:pri W | |
401 | +other ppron12:pl:acc:_:sec W | |
402 | +other ppron12:pl:acc:_:pri W | |
403 | +verb ppas:sg:nom.voc:m1.m2.m3:perf:aff Ra | |
404 | +verb ppas:sg:nom.voc:m1.m2.m3:imperf:aff Ra | |
405 | +verb ppas:sg:nom.voc:m1.m2.m3:imperf.perf:aff Ra | |
406 | +verb ppas:sg:nom.voc:f:perf:aff Rb | |
407 | +verb ppas:sg:nom.voc:f:imperf:aff Rb | |
408 | +verb ppas:sg:nom.voc:f:imperf.perf:aff Rb | |
409 | +verb ppas:sg:nom.acc.voc:n1.n2:perf:aff Rc | |
410 | +verb ppas:sg:nom.acc.voc:n1.n2:imperf:aff Rc | |
411 | +verb ppas:sg:nom.acc.voc:n1.n2:imperf.perf:aff Rc | |
412 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:perf:aff Rd | |
413 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Rd | |
414 | +verb ppas:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Rd | |
415 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:perf:aff Re | |
416 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf:aff Re | |
417 | +verb ppas:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Re | |
418 | +verb ppas:sg:gen.dat.loc:f:perf:aff Rf | |
419 | +verb ppas:sg:gen.dat.loc:f:imperf:aff Rf | |
420 | +verb ppas:sg:gen.dat.loc:f:imperf.perf:aff Rf | |
421 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:perf:aff Rg | |
422 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf:aff Rg | |
423 | +verb ppas:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Rg | |
424 | +verb ppas:sg:acc:m3:perf:aff Rh | |
425 | +verb ppas:sg:acc:m3:imperf:aff Rh | |
426 | +verb ppas:sg:acc:m3:imperf.perf:aff Rh | |
427 | +verb ppas:sg:acc:m1.m2:perf:aff Ri | |
428 | +verb ppas:sg:acc:m1.m2:imperf:aff Ri | |
429 | +verb ppas:sg:acc:m1.m2:imperf.perf:aff Ri | |
430 | +verb ppas:sg:acc.inst:f:perf:aff Rj | |
431 | +verb ppas:sg:acc.inst:f:imperf:aff Rj | |
432 | +verb ppas:sg:acc.inst:f:imperf.perf:aff Rj | |
433 | +verb ppas:pl:nom.voc:m1.p1:perf:aff Rk | |
434 | +verb ppas:pl:nom.voc:m1.p1:imperf:aff Rk | |
435 | +verb ppas:pl:nom.voc:m1.p1:imperf.perf:aff Rk | |
436 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:perf:aff Rl | |
437 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Rl | |
438 | +verb ppas:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Rl | |
439 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rm | |
440 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rm | |
441 | +verb ppas:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rm | |
442 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Rn | |
443 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Rn | |
444 | +verb ppas:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Rn | |
445 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:perf:aff Ro | |
446 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Ro | |
447 | +verb ppas:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Ro | |
448 | +verb ppas:pl:acc:m1.p1:perf:aff Rp | |
449 | +verb ppas:pl:acc:m1.p1:imperf:aff Rp | |
450 | +verb ppas:pl:acc:m1.p1:imperf.perf:aff Rp | |
451 | +verb pcon:imperf O | |
452 | +verb pant:perf P | |
453 | +verb pact:sg:nom.voc:m1.m2.m3:imperf:aff Qa | |
454 | +verb pact:sg:nom.voc:m1.m2.m3:imperf.perf:aff Qa | |
455 | +verb pact:sg:nom.voc:f:imperf:aff Qb | |
456 | +verb pact:sg:nom.voc:f:imperf.perf:aff Qb | |
457 | +verb pact:sg:nom.acc.voc:n1.n2:imperf:aff Qc | |
458 | +verb pact:sg:nom.acc.voc:n1.n2:imperf.perf:aff Qc | |
459 | +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf:aff Qd | |
460 | +verb pact:sg:inst.loc:m1.m2.m3.n1.n2:imperf.perf:aff Qd | |
461 | +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf:aff Qe | |
462 | +verb pact:sg:gen:m1.m2.m3.n1.n2:imperf.perf:aff Qe | |
463 | +verb pact:sg:gen.dat.loc:f:imperf:aff Qf | |
464 | +verb pact:sg:gen.dat.loc:f:imperf.perf:aff Qf | |
465 | +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf:aff Qg | |
466 | +verb pact:sg:dat:m1.m2.m3.n1.n2:imperf.perf:aff Qg | |
467 | +verb pact:sg:acc:m3:imperf:aff Qh | |
468 | +verb pact:sg:acc:m3:imperf.perf:aff Qh | |
469 | +verb pact:sg:acc:m1.m2:imperf:aff Qi | |
470 | +verb pact:sg:acc:m1.m2:imperf.perf:aff Qi | |
471 | +verb pact:sg:acc.inst:f:imperf:aff Qj | |
472 | +verb pact:sg:acc.inst:f:imperf.perf:aff Qj | |
473 | +verb pact:pl:nom.voc:m1.p1:imperf:aff Qk | |
474 | +verb pact:pl:nom.voc:m1.p1:imperf.perf:aff Qk | |
475 | +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf:aff Ql | |
476 | +verb pact:pl:nom.acc.voc:m2.m3.f.n1.n2.p2.p3:imperf.perf:aff Ql | |
477 | +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qm | |
478 | +verb pact:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qm | |
479 | +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qn | |
480 | +verb pact:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qn | |
481 | +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf:aff Qo | |
482 | +verb pact:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:imperf.perf:aff Qo | |
483 | +verb pact:pl:acc:m1.p1:imperf:aff Qp | |
484 | +verb pact:pl:acc:m1.p1:imperf.perf:aff Qp | |
485 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.n1.n2:rec W | |
486 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2:rec W | |
487 | +other num:sg:nom.gen.dat.inst.acc.loc.voc:f:rec W | |
488 | +other num:sg:nom.acc:m1.m2.m3.f.n1.n2:rec W | |
489 | +other num:sg.pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
490 | +other num:pl:nom.voc:m1:rec W | |
491 | +other num:pl:nom.voc:m1:congr W | |
492 | +other num:pl:nom.gen.dat.inst.acc.loc.voc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
493 | +other num:pl:nom.acc:m1.m2.m3.f.n1.n2.p1.p2:rec W | |
494 | +other num:pl:nom.acc.voc:n1.p1.p2:rec W | |
495 | +other num:pl:nom.acc.voc:m2.m3.n2:congr W | |
496 | +other num:pl:nom.acc.voc:m2.m3.n2.f:congr W | |
497 | +other num:pl:nom.acc.voc:m2.m3.f.n2:rec W | |
498 | +other num:pl:nom.acc.voc:m2.m3.f.n1.n2.p1.p2:rec W | |
499 | +other num:pl:nom.acc.voc:m1:rec W | |
500 | +other num:pl:nom.acc.voc:f:congr W | |
501 | +other num:pl:inst:n1.p1.p2:rec W | |
502 | +other num:pl:inst:m1.m2.m3.n2:congr W | |
503 | +other num:pl:inst:m1.m2.m3.n2.f:congr W | |
504 | +other num:pl:inst:m1.m2.m3.f.n2:congr W | |
505 | +other num:pl:inst:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
506 | +other num:pl:inst:f:congr W | |
507 | +other num:pl:gen:n1.p1.p2:rec W | |
508 | +other num:pl:gen.loc:m1.m2.m3.n2.f:congr W | |
509 | +other num:pl:gen.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
510 | +other num:pl:gen.dat.loc:m1.m2.m3.n2.f:congr W | |
511 | +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n2:congr W | |
512 | +other num:pl:gen.dat.inst.loc:m1.m2.m3.f.n1.n2.p1.p2:congr W | |
513 | +other num:pl:dat:m1.m2.m3.n2.f:congr W | |
514 | +other num:pl:dat.loc:n1.p1.p2:congr.rec W | |
515 | +other num:pl:acc:m1:rec W | |
516 | +other num:comp W | |
517 | +other interj W | |
518 | +verb inf:perf I | |
519 | +verb inf:imperf.perf I | |
520 | +verb inf:imperf I | |
521 | +verb impt:sg:sec:perf Ma | |
522 | +verb impt:sg:sec:imperf.perf Ma | |
523 | +verb impt:sg:sec:imperf Ma | |
524 | +verb impt:pl:sec:perf Mb | |
525 | +verb impt:pl:sec:imperf.perf Mb | |
526 | +verb impt:pl:sec:imperf Mb | |
527 | +verb impt:pl:pri:perf Mc | |
528 | +verb impt:pl:pri:imperf.perf Mc | |
529 | +verb impt:pl:pri:imperf Mc | |
530 | +verb imps:perf N | |
531 | +verb imps:imperf.perf N | |
532 | +verb imps:imperf N | |
533 | +verb ger:sg:nom.acc:n2:perf:aff Xa | |
534 | +verb ger:sg:nom.acc:n2:imperf:aff Xa | |
535 | +verb ger:sg:nom.acc:n2:imperf.perf:aff Xa | |
536 | +verb ger:sg:inst:n2:perf:aff Xb | |
537 | +verb ger:sg:inst:n2:imperf:aff Xb | |
538 | +verb ger:sg:inst:n2:imperf.perf:aff Xb | |
539 | +verb ger:sg:gen:n2:perf:aff Xc | |
540 | +verb ger:sg:gen:n2:imperf:aff Xc | |
541 | +verb ger:sg:gen:n2:imperf.perf:aff Xc | |
542 | +verb ger:sg:dat.loc:n2:perf:aff Xd | |
543 | +verb ger:sg:dat.loc:n2:imperf:aff Xd | |
544 | +verb ger:sg:dat.loc:n2:imperf.perf:aff Xd | |
545 | +verb ger:pl:nom.acc:n2:perf:aff Xe | |
546 | +verb ger:pl:nom.acc:n2:imperf:aff Xe | |
547 | +verb ger:pl:nom.acc:n2:imperf.perf:aff Xe | |
548 | +verb ger:pl:loc:n2:perf:aff Xf | |
549 | +verb ger:pl:loc:n2:imperf:aff Xf | |
550 | +verb ger:pl:loc:n2:imperf.perf:aff Xf | |
551 | +verb ger:pl:inst:n2:perf:aff Xg | |
552 | +verb ger:pl:inst:n2:imperf:aff Xg | |
553 | +verb ger:pl:inst:n2:imperf.perf:aff Xg | |
554 | +verb ger:pl:gen:n2:perf:aff Xh | |
555 | +verb ger:pl:gen:n2:imperf:aff Xh | |
556 | +verb ger:pl:gen:n2:imperf.perf:aff Xh | |
557 | +verb ger:pl:dat:n2:perf:aff Xi | |
558 | +verb ger:pl:dat:n2:imperf:aff Xi | |
559 | +verb ger:pl:dat:n2:imperf.perf:aff Xi | |
560 | +verb fin:sg:ter:perf La | |
561 | +verb fin:sg:ter:imperf.perf La | |
562 | +verb fin:sg:ter:imperf La | |
563 | +verb fin:sg:sec:perf Lb | |
564 | +verb fin:sg:sec:imperf.perf Lb | |
565 | +verb fin:sg:sec:imperf Lb | |
566 | +verb fin:sg:pri:perf Lc | |
567 | +verb fin:sg:pri:imperf.perf Lc | |
568 | +verb fin:sg:pri:imperf Lc | |
569 | +verb fin:pl:ter:perf Ld | |
570 | +verb fin:pl:ter:imperf.perf Ld | |
571 | +verb fin:pl:ter:imperf Ld | |
572 | +verb fin:pl:sec:perf Le | |
573 | +verb fin:pl:sec:imperf.perf Le | |
574 | +verb fin:pl:sec:imperf Le | |
575 | +verb fin:pl:pri:perf Lf | |
576 | +verb fin:pl:pri:imperf.perf Lf | |
577 | +verb fin:pl:pri:imperf Lf | |
578 | +noun depr:pl:voc:m2 Hv | |
579 | +noun depr:pl:nom:m2 Hn | |
580 | +other conj W | |
581 | +verb cond:sg:n1.n2:ter:perf Ka | |
582 | +verb cond:sg:n1.n2:ter:imperf.perf Ka | |
583 | +verb cond:sg:n1.n2:ter:imperf Ka | |
584 | +verb cond:sg:n1.n2:sec:perf Kb | |
585 | +verb cond:sg:n1.n2:sec:imperf.perf Kb | |
586 | +verb cond:sg:n1.n2:sec:imperf Kb | |
587 | +verb cond:sg:n1.n2:pri:perf Kc | |
588 | +verb cond:sg:n1.n2:pri:imperf.perf Kc | |
589 | +verb cond:sg:n1.n2:pri:imperf Kc | |
590 | +verb cond:sg:n1.n2:perf Kd | |
591 | +verb cond:sg:n1.n2:imperf.perf Kd | |
592 | +verb cond:sg:n1.n2:imperf Kd | |
593 | +verb cond:sg:m1.m2.m3:ter:perf Ke | |
594 | +verb cond:sg:m1.m2.m3:ter:imperf.perf Ke | |
595 | +verb cond:sg:m1.m2.m3:ter:imperf Ke | |
596 | +verb cond:sg:m1.m2.m3:sec:perf Kf | |
597 | +verb cond:sg:m1.m2.m3:sec:imperf.perf Kf | |
598 | +verb cond:sg:m1.m2.m3:sec:imperf Kf | |
599 | +verb cond:sg:m1.m2.m3:pri:perf Kg | |
600 | +verb cond:sg:m1.m2.m3:pri:imperf.perf Kg | |
601 | +verb cond:sg:m1.m2.m3:pri:imperf Kg | |
602 | +verb cond:sg:f:ter:perf Kh | |
603 | +verb cond:sg:f:ter:imperf.perf Kh | |
604 | +verb cond:sg:f:ter:imperf Kh | |
605 | +verb cond:sg:f:sec:perf Ki | |
606 | +verb cond:sg:f:sec:imperf.perf Ki | |
607 | +verb cond:sg:f:sec:imperf Ki | |
608 | +verb cond:sg:f:pri:perf Kj | |
609 | +verb cond:sg:f:pri:imperf.perf Kj | |
610 | +verb cond:sg:f:pri:imperf Kj | |
611 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:perf Kk | |
612 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf.perf Kk | |
613 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:ter:imperf Kk | |
614 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:perf Kl | |
615 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf.perf Kl | |
616 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:sec:imperf Kl | |
617 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:perf Km | |
618 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf.perf Km | |
619 | +verb cond:pl:m2.m3.f.n1.n2.p2.p3:pri:imperf Km | |
620 | +verb cond:pl:m1.p1:ter:perf Kn | |
621 | +verb cond:pl:m1.p1:ter:imperf.perf Kn | |
622 | +verb cond:pl:m1.p1:ter:imperf Kn | |
623 | +verb cond:pl:m1.p1:sec:perf Ko | |
624 | +verb cond:pl:m1.p1:sec:imperf.perf Ko | |
625 | +verb cond:pl:m1.p1:sec:imperf Ko | |
626 | +verb cond:pl:m1.p1:pri:perf Kp | |
627 | +verb cond:pl:m1.p1:pri:imperf.perf Kp | |
628 | +verb cond:pl:m1.p1:pri:imperf Kp | |
629 | +other comp W | |
630 | +other burk W | |
631 | +other brev:pun W | |
632 | +other brev:npun W | |
633 | +other bedzie:sg:ter:imperf W | |
634 | +other bedzie:sg:sec:imperf W | |
635 | +other bedzie:sg:pri:imperf W | |
636 | +other bedzie:pl:ter:imperf W | |
637 | +other bedzie:pl:sec:imperf W | |
638 | +other bedzie:pl:pri:imperf W | |
639 | +other aglt:sg:sec:imperf:wok W | |
640 | +other aglt:sg:sec:imperf:nwok W | |
641 | +other aglt:sg:pri:imperf:wok W | |
642 | +other aglt:sg:pri:imperf:nwok W | |
643 | +other aglt:pl:sec:imperf:wok W | |
644 | +other aglt:pl:sec:imperf:nwok W | |
645 | +other aglt:pl:pri:imperf:wok W | |
646 | +other aglt:pl:pri:imperf:nwok W | |
647 | +adv-sup adv:sup W | |
648 | +adv adv:pos W | |
649 | +adv-com adv:com W | |
650 | +adv adv W | |
651 | +adj adjp F | |
652 | +adj adjc E | |
653 | +adj adja D | |
654 | +adj adj:sg:nom.voc:n1.n2:pos Aa | |
655 | +adj-com adj:sg:nom.voc:n1.n2:com Ba | |
656 | +adj adj:sg:nom.voc:m1.m2.m3:pos Ab | |
657 | +adj-com adj:sg:nom.voc:m1.m2.m3:com Bb | |
658 | +adj adj:sg:nom.voc:f:pos Ac | |
659 | +adj-com adj:sg:nom.voc:f:com Bc | |
660 | +adj adj:sg:loc:m1.m2.m3.n1.n2:pos Ad | |
661 | +adj-com adj:sg:loc:m1.m2.m3.n1.n2:com Bd | |
662 | +adj adj:sg:loc:f:pos Ae | |
663 | +adj-com adj:sg:loc:f:com Be | |
664 | +adj adj:sg:inst:m1.m2.m3.n1.n2:pos Af | |
665 | +adj-com adj:sg:inst:m1.m2.m3.n1.n2:com Bf | |
666 | +adj adj:sg:inst:f:pos Ag | |
667 | +adj-com adj:sg:inst:f:com Bg | |
668 | +adj adj:sg:gen:m1.m2.m3.n1.n2:pos Ah | |
669 | +adj-com adj:sg:gen:m1.m2.m3.n1.n2:com Bh | |
670 | +adj adj:sg:gen:f:pos Ai | |
671 | +adj-com adj:sg:gen:f:com Bi | |
672 | +adj adj:sg:dat:m1.m2.m3.n1.n2:pos Aj | |
673 | +adj-com adj:sg:dat:m1.m2.m3.n1.n2:com Bj | |
674 | +adj adj:sg:dat:f:pos Ak | |
675 | +adj-com adj:sg:dat:f:com Bk | |
676 | +adj adj:sg:acc:n1.n2:pos Al | |
677 | +adj-com adj:sg:acc:n1.n2:com Bl | |
678 | +adj adj:sg:acc:m3:pos Am | |
679 | +adj-com adj:sg:acc:m3:com Bm | |
680 | +adj adj:sg:acc:m1.m2:pos An | |
681 | +adj-com adj:sg:acc:m1.m2:com Bn | |
682 | +adj adj:sg:acc:f:pos Ao | |
683 | +adj-com adj:sg:acc:f:com Bo | |
684 | +adj adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:pos Ap | |
685 | +adj-com adj:pl:nom.voc:m2.m3.f.n1.n2.p2.p3:com Bp | |
686 | +adj adj:pl:nom.voc:m1.p1:pos Aq | |
687 | +adj-com adj:pl:nom.voc:m1.p1:com Bq | |
688 | +adj adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Ar | |
689 | +adj-com adj:pl:loc:m1.m2.m3.f.n1.n2.p1.p2.p3:com Br | |
690 | +adj adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:pos As | |
691 | +adj-com adj:pl:inst:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bs | |
692 | +adj adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:pos At | |
693 | +adj-com adj:pl:gen:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bt | |
694 | +adj adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:pos Au | |
695 | +adj-com adj:pl:dat:m1.m2.m3.f.n1.n2.p1.p2.p3:com Bu | |
696 | +adj adj:pl:acc:m2.m3.f.n1.n2.p2.p3:pos Av | |
697 | +adj-com adj:pl:acc:m2.m3.f.n1.n2.p2.p3:com Bv | |
698 | +adj adj:pl:acc:m1.p1:pos Aw | |
699 | +adj-com adj:pl:acc:m1.p1:com Bw | |
700 | +verb pacta XYZ | |
... | ... |
morphology/doc/decyzje-scalanie.txt
0 → 100644
1 | +Plik freqListInterps.ml wykorzystuje plik data/interps_general.tab, skąd usunięte zostały niektóre wpisy, | |
2 | +głównie dotyczące adj (powodujące dwuznaczności przy scalaniu). | |
3 | + | |
4 | +Pozostawione interpretacje: | |
5 | +-siebie, numcol (nieobecne w SGJP) pozostają jak są | |
6 | +-ppron12, ppron3 pozostają jak są | |
7 | +(tzn. nie ma znalezionych odpowiedników, są pomijane) | |
8 | + | |
9 | +W przypadku praet, imps, impt, fin, inf, ger, pact, ppas wybrana została interpretacja najbardziej podobna | |
10 | +do oryginalnej licząc od końca, co rozwiązuje problemy imperf.perf, neg.aff | |
11 | + | |
12 | +Przekształcenia: | |
13 | +-qub:wok, qub:nwok -> qub | |
14 | + | |
15 | +Co do num spoza SGJP: | |
16 | +-jeżeli forma składa się wyłącznie z cyfr arabskich i rzymskich, wybierana jest najdłuższa interpretacja | |
17 | +Zapewne w rzeczywistości powinny wtedy obejmować wszystkie możliwe tagi. | |
18 | +-w przeciwnym wypadku wybierana jest najkrótsza (najwęższa) interpretacja | |
... | ... |
morphology/doc/model.pdf
No preview for this file type
morphology/doc/model.tex
... | ... | @@ -6,89 +6,162 @@ |
6 | 6 | \usepackage[polish]{babel} |
7 | 7 | % \usepackage{tikz} |
8 | 8 | % \usetikzlibrary{conceptgraph} |
9 | +\usepackage{amsthm} | |
9 | 10 | |
10 | 11 | \parindent 0pt |
11 | 12 | \parskip 4pt |
12 | 13 | |
13 | -% \newcommand{\tensor}{\otimes} | |
14 | -% \newcommand{\forward}{\operatorname{/}} | |
15 | -% \newcommand{\backward}{\operatorname{\backslash}} | |
16 | -% \newcommand{\both}{\mid} | |
17 | -% \newcommand{\plus}{\oplus} | |
18 | -% \newcommand{\zero}{0} | |
19 | -% \newcommand{\one}{1} | |
20 | -% \newcommand{\letin}[2]{{\bf let}\;#1\;{\bf in}\;#2} | |
21 | -% \newcommand{\caseof}[2]{{\bf case}\;#1\;{\bf of}\;#2} | |
22 | -% \newcommand{\emp}{{\bf emp}} | |
23 | -% \newcommand{\inl}{{\bf inl}} | |
24 | -% \newcommand{\inr}{{\bf inr}} | |
25 | -% \newcommand{\coord}[1]{{#1}^\star} | |
26 | -% \newcommand{\map}[2]{{\bf map}\;#1\;#2} | |
27 | -% \newcommand{\concat}[1]{{\bf concat}\;#1} | |
28 | -% \newcommand{\makeset}[1]{{\bf makeset}\;#1} | |
29 | -% \newcommand{\maketerm}[1]{{\bf maketerm}\;#1} | |
30 | -% \newcommand{\addlist}[2]{{\bf add}\;#1\;#2} | |
31 | -% \newcommand{\ana}[1]{{\bf ana}(#1)} | |
32 | -% \newcommand{\One}{\bullet} | |
33 | - | |
34 | - | |
35 | -\title{Model probabilistyczny guessera dla języka polskiego} | |
14 | +\newcommand{\form}{{\it form}} | |
15 | +\newcommand{\lemma}{{\it lemma}} | |
16 | +\newcommand{\cat}{{\it cat}} | |
17 | +\newcommand{\interp}{{\it interp}} | |
18 | +\newcommand{\fsuf}{{\it fsuf}} | |
19 | +\newcommand{\lsuf}{{\it lsuf}} | |
20 | + | |
21 | +\newtheorem{task}{Zadanie} | |
22 | +\newtheorem{answer}{Odpowiedź} | |
23 | + | |
24 | +\title{Model probabilistyczny fleksji języka polskiego} | |
36 | 25 | \author{Wojciech Jaworski} |
37 | 26 | %\date{} |
38 | 27 | |
39 | 28 | \begin{document} |
40 | 29 | \maketitle |
41 | 30 | |
42 | -Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (form,lemma,cat,interp), | |
31 | +Zakładamy, że język jest rozkładem probabilistycznym na czwórkach (\form,\lemma,\cat,\interp), | |
43 | 32 | czyli, że wystąpienia kolejnych słów w tekście są od siebie niezależne. |
44 | -Interpretacja interp jest zbiorem tagów zgodnym a tagsetem SGJP. | |
45 | -Kategoria $cat \in \{ noun, adj, adv, verb, other \}$ | |
33 | +Interpretacja \interp{} jest zbiorem tagów zgodnym z tagsetem SGJP. | |
34 | +Kategoria $\cat \in \{ {\rm noun}, {\rm adj}, {\rm adv}, {\rm verb}, {\rm other} \}$ | |
46 | 35 | Zakładamy też, że język jest poprawny, tzn. nie ma literówek, ani błędów gramatycznych. |
47 | 36 | |
48 | 37 | Dysponujemy następującymi danymi: |
49 | 38 | \begin{itemize} |
50 | 39 | \item słownikiem gramatycznym S, czyli zbiorem czwórek, o których wiemy, że należą do języka; |
51 | -\item zbiorem reguł, czyli zbiorem czwórek (fsuf,lsuf,cat,interp) | |
40 | +\item zbiorem reguł, czyli zbiorem czwórek (\fsuf,\lsuf,\cat,\interp) | |
52 | 41 | \item zbiorem wyjątków, czyli zbiorem czwórek, o których wiemy, że należą do języka, które nie są opisywane przez reguły |
53 | 42 | \item otagowaną listą frekwencyjną. |
54 | 43 | \end{itemize} |
55 | -Reguła przyłożona do formy ucina fsuf i przykleja lsuf. | |
44 | +Reguła przyłożona do formy ucina \fsuf{} i przykleja \lsuf. | |
45 | + | |
46 | +Lista frekwencyjna wytworzona jest na podstawie NKJP1M. Usunięte zostały z niej symbole | |
47 | +(formy, do których odczytania nie wystarczy znajomość reguł wymowy, takie jak liczby zapisane cyframi, oznaczenia godzin i lat, | |
48 | +znaki interpunkcyjne, skróty, emotikony). Usunięte zostały również formy odmienialne z użyciem myślnika i apostrofu | |
49 | +(np. odmienione akronimy i nazwiska obce, formy takie jak ,,12-latek''). | |
50 | +Interpretacje na liście frekwencyjnej zostały skonwertowane do postaci takiej, jaka występuje w SGJP, | |
51 | +łączącej interpretacje form identycznych. Na przykład interpretacje adj:pl:nom:m1:pos, adj:pl:voc:m1:pos, adj:pl:nom:p1:pos i adj:pl:voc:p1:pos | |
52 | +zostały złączone w adj:pl:nom.voc:m1.p1:pos, a frekwencje form zsumowane. | |
53 | + | |
54 | +Celem jest aproksymacja wartości P(\lemma,\cat,\interp|\form). | |
55 | + | |
56 | +%Jakość aproksymacji mierzymy licząc jak często wśród $k$ najbardziej prawdopodobnych trójek $\lemma,\cat,\interp$ | |
57 | +%wskazanych przez model dla zadanej formy znajduje się trójka poprawna. Wyniki dla poszczególnych form agregujemy | |
58 | +%za pomocą średniej ważonej po ich częstościach. | |
59 | + | |
60 | +%Pytanie 0: Ile wynosi powyższa miara liczona z użyciem p-stw wziętych z listy frekwencyjnej? (To jest ograniczenie górne dla modelu) | |
61 | + | |
62 | +%Pytanie 0': Ile wynosi powyższa miara liczona z użyciem częstości wziętych ze zbioru reguł? (To jest ograniczenie dolne dla modelu) | |
63 | + | |
64 | +Pierwszym kryterium jest przynależność formy do słownika S. | |
65 | +Jeśli forma należy do S zakładamy, że jedno z haseł S zawierające tę formę | |
66 | +poprawnie opisuje jej lemat, kategorię i interpretację. | |
67 | + | |
68 | +\begin{task} | |
69 | +Jakie jest prawdopodobieństwo trafienia na formę, której lemat, kategoria i interpretacja należy do słownika, czyli | |
70 | +\[P((\form,\lemma,\cat,\interp) \in S)\] | |
71 | +Jakie jest prawdopodobieństwo trafienia na formę, która należy do słownika, ale jej lemat, kategoria lub interpretacja nie należy do słownika, czyli | |
72 | +\[P((\form,\lemma,\cat,\interp) \not\in S \wedge \form \in S)\] | |
73 | +\end{task} | |
74 | + | |
75 | +\begin{answer} | |
76 | +Prawdopodobieństwo natrafienia na formę należącą do słownika wynosi 95,67\%, zaś natrafienia na formę należącą do SGJP bez odpowiedniej | |
77 | +interpretacji -- 3,92\% (lista tych form znajduje się w pliku traps.txt). | |
78 | +\end{answer} | |
79 | + | |
80 | +W przypadku form należących do słownika różnorodność interpretacji będzie niewielka, | |
81 | +natomiast istotne będzie prawdopodobieństwo wystąpienia danego lematu. | |
82 | +Zaś w przypadku form nie należących do słownika prawdopodobieństwo wystąpienia lematu | |
83 | +będzie zawsze małe. | |
56 | 84 | |
57 | -Celem jest aproksymacja wartości P(lemma,cat,interp|form). | |
85 | +Dzielimy teraz listę frekwencyjną na część należącą do S i nie należącą do S. | |
86 | +Od tej pory budujemy model osobno dla każdej z części. | |
58 | 87 | |
59 | -Pytanie 1: $P((form,lemma,cat,interp) \in S)$ | |
88 | +W przypadku części należącej do S zauważamy, że \[P(\lemma,\cat,\interp|\form)=P(\form|\lemma,\cat,\interp)\frac{P(\lemma,\cat,\interp)}{P(\form)}\] | |
60 | 89 | |
61 | -Pytanie 2: $P((form,lemma,cat,interp) \not\in S \wedge form \in S)$ | |
90 | +Zakładamy, że \interp{} jest niezależne od \lemma, pod warunkiem określonego \cat | |
91 | +\[P(\lemma,\cat,\interp)=P(\lemma,\cat)P(\interp|\lemma,\cat)=P(\lemma,\cat)P(\interp|\cat)\] | |
62 | 92 | |
63 | -Załóżmy, że reguły i wyjątki mają postać taką, że do danej formy można zaaplikować tylko jedną z nich | |
64 | -(dla żadnej reguły sufix nie jest podciągiem innego sufixu). Wtedy | |
65 | -\[P(lemma,cat,interp|form)\approx P(rule|form)=P(rule|fsuf)\] | |
66 | -(W powyższym drzewie sufixowym w każdym węźle mamy dowiązania do sufixów o jeden znak dłuższych oraz kategorię pozostałe traktową łącznie | |
93 | +$P(\form)$, $P(\lemma,\cat)$ i $P(\interp|\cat)$ szacujemy na podstawie listy frekwencyjnej, | |
94 | +w przypadku pierwszych dwu stosując wygładzanie. Wyliczenie $P(\form)$ zawiera uogólniona lista frekwencyjna | |
95 | +(ścieżka {\tt resources/NKJP1M/NKJP1M-generalized-frequency.tab} w repozytorium ENIAM), $P(\lemma,\cat)$ -- plik | |
96 | + {\tt prob\_lemmacat.txt}, zaś $P(\interp|\cat)$ -- {\tt prob\_itp\_givencat.txt} (oba zawarte w katalogu {\tt morphology/doc}). | |
67 | 97 | |
68 | -Pytanie 3: Czy faktycznie zachodzi powyższa zależność? Jak zmierzyć podobieństwo? | |
98 | +$P(\form|\lemma,\cat,\interp)$ wynosi 0, gdy w S nie ma krotki postaci (\form,\lemma,\cat,\interp); | |
99 | +1, gdy jest dokładnie jedna krotka z (\lemma,\cat,\interp). Gdy jest ich więcej, oznacza to, że | |
100 | +lemat ma przynajmniej dwa warianty odmiany. Są to przypadki rzadkie. Przypisujemy każdej z możliwości | |
101 | +prawdopodobieństwo 1. | |
69 | 102 | |
70 | -Problem tu jest taki, że lista frekwencyjna jest zbyt mała by precyzyjnie określić p-stwo ok. 40000 reguł | |
103 | +\begin{task} | |
104 | +Przejrzeć SGJP i znaleźć wszystkie przykłady, w których dla ustalonego lematu, kategorii i interpretacji | |
105 | +jest więcej niż jedna forma. Znaleźć wystąpienia tych krotek na liście frekwencyjnej. | |
106 | +\end{task} | |
71 | 107 | |
72 | -\[P(rule|fsuf)=P(lsuf,cat,interp|fsuf)=P(fsuf|lsuf,cat,interp)\frac{P(lsuf,cat,interp)}{P(fsuf)}\] | |
108 | +\begin{answer} | |
109 | +Lista takich form znajduje się w pliku multi\_forms.txt. | |
110 | +\end{answer} | |
73 | 111 | |
74 | -$P(fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufixie. | |
112 | +Teraz zanalizujemy drugą część listy frekwencyjnej. | |
113 | +Załóżmy, że reguły mają postać taką, że sufiks żadnej reguły nie jest podciągiem sufiksu innej z nich. | |
114 | +Sufiksy reguł tworzą drzewo, które w każdym węźle ma dowiązania do sufiksów o jeden znak dłuższych oraz kategorię ,,pozostałe'' traktowaną łącznie. | |
115 | +Przyjmujemy następujące założenie modelowe: | |
116 | +\[P(\lemma,\cat,\interp|\form)\approx P(rule|\form)=P(rule|\fsuf)\] | |
117 | +Wynika ono z tego, że mając nieznaną formę musimy oprzeć się na ogólnych regułach | |
118 | +odmiany i nie możemy korzystać z tego, że ma ona jakieś konkretne brzmienie. | |
119 | +Korzystamy tutaj tylko z reguł oznaczonych jako produktywne. | |
120 | + | |
121 | +Problem tu jest taki, że lista frekwencyjna jest zbyt mała, by precyzyjnie określić prawdopodobieństwo ok. 40000 reguł. | |
122 | +Dlatego znowu stosujemy zabieg z prawdopodobieństwem warunkowym. | |
123 | + | |
124 | +\[P(rule|\fsuf)=P(\lsuf,\cat,\interp|\fsuf)=P(\fsuf|\lsuf,\cat,\interp)\frac{P(\lsuf,\cat,\interp)}{P(\fsuf)}\] | |
125 | + | |
126 | +$P(\fsuf)$ jest prawdopodobieństwem tego, że do języka należy słowo o zadanym sufiksie. | |
75 | 127 | Można je oszacować za pomocą listy frekwencyjnej. |
76 | 128 | |
77 | -Zakładamy, że interp jest niezależne od lsuf, pod warunkiem określonego cat | |
78 | -$P(lsuf,cat,interp)=P(lsuf,cat)P(interp|lsuf,cat)=P(lsuf,cat)P(interp|cat)$ | |
129 | +Zakładamy, że \interp{} jest niezależne od \lsuf, pod warunkiem określonego \cat | |
130 | +\[P(\lsuf,\cat,\interp)=P(\lsuf,\cat)P(\interp|\lsuf,\cat)=P(\lsuf,\cat)P(\interp|\cat)\] | |
79 | 131 | |
80 | -$P(lsuf,cat)$ i $P(interp|cat)$ można oszacować na podstawie listy frekwencyjnej. | |
132 | +$P(\lsuf,\cat)$ i $P(\interp|\cat)$ można oszacować na podstawie listy frekwencyjnej. | |
81 | 133 | |
82 | -$P(fsuf|lsuf,cat,interp)$ wynosi 0, gdy nie ma reguły postaci (fsuf,lsuf,cat,interp); | |
83 | -1, gdy jest dokładnie jedna reguła z (lsuf,cat,interp), a gdy jest ich więcej trzeba | |
84 | -oszacować z listy frekwencyjnej. | |
134 | +\begin{task} | |
135 | +Oszacować $P(\fsuf)$ i $P(\lsuf,\cat)$ na podstawie listy frekwencyjnej. | |
136 | +Sprawdzić dla jakich sufiksów próbka jest mała albo nie ma jej wcale. | |
137 | +\end{task} | |
85 | 138 | |
86 | -Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas p-stwa? | |
139 | +% w razie gdyby był problem można próbować dzielić sufiksy na części i założyć niezależność tych części | |
140 | + | |
141 | +$P(\fsuf|\lsuf,\cat,\interp)$ wynosi 0, gdy nie ma reguły postaci (\fsuf,\lsuf,\cat,\interp); | |
142 | +1, gdy jest dokładnie jedna reguła z (\lsuf,\cat,\interp). Ustawiamy produktywność reguł tak, | |
143 | +by nie pojawiało się więcej pasujących reguł. | |
144 | + | |
145 | +\begin{task} | |
146 | +Określić produktywność reguł i sprawdzić, czy nie ma niejednoznacznych dopasowań. | |
147 | +\end{task} | |
87 | 148 | |
88 | -Pytanie 5: Co zrobić z niejednoznacznymi interpretacjami? | |
149 | +\begin{task} | |
150 | +Określić jakość modelu. | |
151 | +\end{task} | |
89 | 152 | |
90 | -Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie zbioru reguł, wskazanie, które reguły opisują sytuacje wyjątkowe. | |
153 | +\begin{answer} | |
154 | +Wyliczona jakość modelu (stopień pokrycia listy frekwencyjnej przez co najmniej 95\% najbardziej prawdopodobnych interpretacji wg modelu) wyniosła 79,90\%. | |
155 | +\end{answer} | |
156 | + | |
157 | +%czasowniki produktywne to te z lematem ać ować ywać, ić, yć, (nąć) | |
158 | + | |
159 | +Pytanie 4: Czy powyższe przybliżenie jest poprawne, jak często jest więcej niż jedna reguła i ile wynoszą wówczas prawdopodobieństwa? | |
160 | + | |
161 | +Zadania poboczne: wytworzenie otagowanej listy frekwencyjnej, wytworzenie (uzupełnienie) zbioru reguł na podstawie SGJP i listy frekwencyjnej, wskazanie, które reguły opisują sytuacje wyjątkowe. | |
91 | 162 | |
92 | 163 | Zadanie na przyszłość: reguły słowotwórstwa i ich interpretacja semantyczna. |
93 | 164 | |
94 | -\end{document} | |
95 | 165 | \ No newline at end of file |
166 | +Do powyższego modelu trzeba jeszcze dodać prefiksy nie i naj. | |
167 | + | |
168 | +\end{document} | |
... | ... |