# import_kipi.py
#-*- coding:utf-8 -*-
import sys
from django.core.management.base import BaseCommand, CommandError
from common.util import debug, suffixes, cut_end
from dictionary.models import Lexeme, Vocabulary, LexemeAssociation, Pattern, \
all_forms, InflectionCharacteristic, get_root, Ending, BaseFormLabel
from dictionary.management.commands.import_morfologik import create_lexeme, \
create_lip, print_data, find_minimal_sets, blacklist_filter, join_many, join, print_forms
class Command(BaseCommand):
    """Management command that imports lexemes from a KIPI 1.0 dump."""

    args = '<input file name>'
    help = 'importuje leksemy z KIPI 1.0'

    def handle(self, filename, **options):
        """Run the import on the file named ``filename``.

        The file is opened in the default mode and handed to
        ``import_kipi``; the context manager closes it whether the import
        finishes or raises.
        """
        with open(filename) as input_file:
            import_kipi(input_file)
# When true, the pattern-matching helpers print diagnostics and
# process_forms() does not emit any lexeme data.
DEBUG = False

# Commonness code found in the input file -> commonness label attached to
# the generated lexeme data.
COMMONNESS = {
    'geog': u'geograficzna',
    'imie': u'imię',
    'inna': u'własna',
    'nazw': u'nazwisko',
    'orga': u'organizacja',
    'posp': u'pospolita',
}
def inflection_characteristic(forms, pos):
    """Work out the inflection characteristic symbol of a lexeme.

    ``forms`` is a sequence of ``(form, tag)`` pairs and ``pos`` is the part
    of speech.  For ``'subst'`` the answer is read from the tag of the first
    form: ``'m1'`` when that tag contains ``'depr'`` or ends with ``'m1'``,
    otherwise whatever follows the last colon.  For ``'adj'`` the answer is
    ``''`` when any form is tagged exactly ``'adja'`` and ``'0-'`` otherwise.

    Any other ``pos`` leaves the result unassigned, so the final ``return``
    raises ``UnboundLocalError``.
    """
    # KIPI only contains subst and adj
    head_tag = forms[0][1]
    if pos == 'subst':
        is_m1 = 'depr' in head_tag or head_tag.endswith('m1')
        ic = 'm1' if is_m1 else head_tag.rsplit(':', 1)[1]
    elif pos == 'adj':
        # "3+" forms do not occur here
        has_adja = any(tag == 'adja' for _form, tag in forms)
        ic = '' if has_adja else '0-'
    return ic
# COPYPASTA HEAVEN
def get_basic_endings(lexical_class, ic):
    """Return the endings that carry the basic form label of ``ic``.

    The queryset is limited to endings whose ``base_form_label`` equals
    ``ic.basic_form_label`` and whose pattern type belongs to the lexical
    class with symbol ``lexical_class``.
    """
    criteria = {
        'base_form_label': ic.basic_form_label,
        'pattern__type__lexical_class__symbol': lexical_class,
    }
    return Ending.objects.filter(**criteria)
# Querysets of basic-form endings, built once at import time and keyed by
# (part of speech symbol, inflection characteristic symbol).
basic_form_endings_dict = {}
for pos in ('adj', 'subst'):
    for ic in InflectionCharacteristic.objects.filter(
            part_of_speech__symbol=pos):
        basic_form_endings_dict[(pos, ic.symbol)] = get_basic_endings(pos, ic)

# Primary keys of the singular base form labels that are passed to the raw
# SQL check in basic_form_endings().
sure_bfls_sg = tuple(
    BaseFormLabel.objects
    .filter(symbol__in=['sg:dat', 'sg:gen', 'sg:inst'])
    .values_list('pk', flat=True))

# Same, for the plural base form labels.
sure_bfls_pl = tuple(
    BaseFormLabel.objects
    .filter(symbol__in=['pl:dat', 'pl:inst', 'pl:loc'])
    .values_list('pk', flat=True))
def basic_form_endings(lexical_class, ic, basic_form, form_set):
    """Return basic-form endings that could end ``basic_form``.

    For every lexical class other than ``'subst'`` this is simply the cached
    queryset for ``(lexical_class, ic)`` restricted to endings whose string
    is one of ``suffixes(basic_form)``.

    For ``'subst'`` each suffix is examined separately: the endings equal to
    that suffix are kept only when their pattern passes a raw SQL check.
    The check accepts pattern ``'0000'``, or a pattern that has no row in
    ``zakonczenia`` with ``zak`` outside the endings observed in
    ``form_set`` for that root and ``efobaz`` among ``sure_bfls_sg``, or no
    such row with ``efobaz`` among ``sure_bfls_pl``.  The per-suffix results
    are OR-ed together.
    """
    candidates = basic_form_endings_dict[(lexical_class, ic)]
    if lexical_class != 'subst':
        return candidates.filter(string__in=suffixes(basic_form))

    # breakneck, but it speeds things up a bit
    sure_forms_clause = (
        "(id = '0000' or not exists "
        "(select id from zakonczenia where w_id = wzory.id "
        "and zak not in %s and efobaz in %s) or not exists "
        "(select id from zakonczenia where w_id = wzory.id "
        "and zak not in %s and efobaz in %s))")
    result = Ending.objects.none()
    for suffix in suffixes(basic_form):
        stem = cut_end(basic_form, suffix)
        stem_length = len(stem)
        # Endings actually observed after this stem among the given forms.
        observed = tuple(
            form[stem_length:] for form in form_set if form.startswith(stem))
        with_suffix = candidates.filter(string=suffix)
        pattern_pks = with_suffix.values_list('pattern', flat=True)
        viable_patterns = Pattern.objects.filter(pk__in=pattern_pks).extra(
            where=[sure_forms_clause],
            params=[observed, sure_bfls_sg, observed, sure_bfls_pl])
        result = result | with_suffix.filter(pattern__in=viable_patterns)
    return result
# Cache of bad_pattern_subst() verdicts keyed by (pattern, ic).
memoized_pattern_ics = {}


def bad_pattern_subst(pattern, ic):
    """Tell whether ``pattern`` should be rejected for characteristic ``ic``.

    A pattern is rejected when no lexeme-pattern link with that inflection
    characteristic exists outside lexemes of status ``'cand'``, when its
    type symbol occurs in ``'mn'`` and ``ic`` is ``'f'``, or when its type
    symbol occurs in ``'fm'`` and ``ic`` starts with ``'n'``.  Verdicts are
    memoized per ``(pattern, ic)``.
    """
    cache_key = (pattern, ic)
    if cache_key not in memoized_pattern_ics:
        confirmed_uses = pattern.lexemeinflectionpattern_set.filter(
            inflection_characteristic__symbol=ic).exclude(
            lexeme__status='cand')
        if not confirmed_uses:
            rejected = True
        else:
            type_symbol = pattern.type.symbol
            rejected = ((type_symbol in 'mn' and ic == 'f') or
                        (type_symbol in 'fm' and ic[0] == 'n'))
        memoized_pattern_ics[cache_key] = rejected
    return memoized_pattern_ics[cache_key]
# Cache of ending strings keyed by (pattern, ic); filled by
# good_ending_set_subst().
memoized_good_endings = {}


def good_ending_set_subst(pattern, ic, root):
    """Return the set of forms ``root + ending`` valid for a noun pattern.

    The endings of ``pattern`` are narrowed according to ``ic``:

    * ``'pl:nom:mo'`` is dropped unless ``ic`` is ``'m1'`` or ``'p1'``;
    * when ``ic`` starts with ``'p'`` only labels starting with ``'pl'``
      stay, otherwise ``'pl:gen:<g>'`` labels are dropped for every gender
      letter of ``'mfn'`` other than the first letter of ``ic``;
    * for ``'p3'``, ``'pl:gen:m'`` is dropped for pattern type ``'f'`` and
      ``'pl:gen:n'`` for pattern type ``'n'``.

    The resulting ending strings are memoized per ``(pattern, ic)``.
    """
    cache_key = (pattern, ic)
    if cache_key not in memoized_good_endings:
        selected = pattern.endings
        if ic not in ('m1', 'p1'):
            selected = selected.exclude(base_form_label__symbol='pl:nom:mo')
        if ic[0] == 'p':
            selected = selected.filter(
                base_form_label__symbol__startswith='pl')
        else:
            for gender in set('mfn') - set(ic[0]):
                selected = selected.exclude(
                    base_form_label__symbol__startswith='pl:gen:' + gender)
        if ic == 'p3':
            unwanted = {'f': 'pl:gen:m', 'n': 'pl:gen:n'}.get(
                pattern.type.symbol)
            if unwanted is not None:
                selected = selected.exclude(base_form_label__symbol=unwanted)
        memoized_good_endings[cache_key] = list(
            selected.values_list('string', flat=True))
    return set(root + suffix for suffix in memoized_good_endings[cache_key])
def good_ending_set(lexical_class, ic, pattern, root=''):
    """Return the set of forms ``pattern`` yields for ``root``.

    Nouns (``'subst'``) go through ``good_ending_set_subst`` so that the
    inflection characteristic is taken into account; every other lexical
    class uses ``pattern.ending_set(root)`` directly.
    """
    if lexical_class == 'subst':
        return good_ending_set_subst(pattern, ic, root)
    return pattern.ending_set(root)
def relevant_subst(ending, ic):
    """Tell whether a noun ``ending`` applies to characteristic ``ic``.

    The ending's base form label is split on ``':'``.  The ending is
    irrelevant when any of the following holds:

    * ``ic`` is ``'m1'``/``'p1'`` and the label is exactly ``'pl:nom'``;
    * the label has a third component, ``ic`` does not start with ``'p'``
      and the first letter of that component differs from the first letter
      of ``ic``;
    * ``ic`` starts with ``'p'`` and the label is not a ``'pl'`` one;
    * ``ic`` is ``'p3'``, the label starts with ``'pl:gen:'`` and the
      (pattern type, third component) pair is ``('n', 'n')`` or
      ``('f', 'm')``;
    * ``ic`` is neither ``'m1'`` nor ``'p1'`` and the label is
      ``'pl:nom:mo'``.
    """
    label = ending.base_form_label.symbol
    parts = label.split(':')
    pattern_type = ending.pattern.type.symbol
    personal = ic in ('m1', 'p1')

    if personal and label == 'pl:nom':
        return False
    if len(parts) >= 3 and ic[0] != 'p' and parts[2][0] != ic[0]:
        return False
    if ic[0] == 'p' and parts[0] != 'pl':
        return False
    if ic == 'p3' and label.startswith('pl:gen:') and (
            (pattern_type == 'n' and parts[2] == 'n') or
            (pattern_type == 'f' and parts[2] == 'm')):
        return False
    return personal or label != 'pl:nom:mo'
def relevant_adj(ending):
    """Tell whether an adjective ``ending`` should be checked.

    Endings whose base form label symbol is ``'0'`` or ``'3+'`` are not.
    """
    label = ending.base_form_label.symbol
    return not (label == '0' or label == '3+')
def relevant(lexical_class, ending, ic):
    """Dispatch the relevance check of ``ending`` on ``lexical_class``.

    ``'subst'`` uses ``relevant_subst`` (which needs ``ic``) and ``'adj'``
    uses ``relevant_adj``.  Any other class yields ``None``.
    """
    if lexical_class == 'subst':
        return relevant_subst(ending, ic)
    if lexical_class == 'adj':
        return relevant_adj(ending)
    return None
def find_patterns(basic_form, pos, ic, forms):
    """Match the observed ``forms`` of a lexeme against inflection patterns.

    ``forms`` is a sequence of ``(form, tag)`` pairs.  Every basic-form
    ending returned by ``basic_form_endings`` proposes a pattern and a root
    (``basic_form`` without that ending).  Each proposal is classified:

    * *included* - every relevant ending of the pattern, attached to the
      root, is one of the observed forms;
    * *including* - the pattern's good ending set covers all observed forms
      but some relevant pattern forms were not observed (these are recorded
      next to the pattern);
    * *matching* - both at once.

    Returns a 4-tuple ``(kind, patterns, included, including)``.  When at
    least one pattern matches, ``kind`` is ``'match'`` and ``patterns`` is
    the list of matching ``(pattern, root)`` pairs.  Otherwise ``kind`` and
    ``patterns`` come from ``find_many_patterns``.  ``included`` is a list
    of ``(pattern, root)`` and ``including`` a list of
    ``((pattern, root), bad_forms)``.
    """
    # find all included and including patterns
    form_set = set(form for form, tag in forms)
    # NOTE(review): keyed by pattern alone, so if one pattern is proposed
    # with two different roots the later set overwrites the earlier one.
    ending_sets = {}
    included_patterns = set()
    including_patterns = set()
    matching_patterns = set()
    for basic_ending in basic_form_endings(pos, ic, basic_form, form_set):
        pattern = basic_ending.pattern
        if pos == 'subst' and bad_pattern_subst(pattern, ic):
            # TODO(review): original note asks whether to skip the comments
            # about forms rejected because of the inflection characteristic
            continue
        root = basic_form[:len(basic_form) - len(basic_ending.string)]
        ending_sets[pattern] = good_ending_set(pos, ic, pattern, root)
        including = form_set.issubset(ending_sets[pattern])
        # Pattern forms that should exist but were not observed.
        bad_forms = set(
            root + ending.string
            for ending in pattern.endings.all()
            if relevant(pos, ending, ic) and
            root + ending.string not in form_set)
        if not bad_forms:
            included_patterns.add((pattern, root))
            if including:
                matching_patterns.add((pattern, root))
        elif including:
            including_patterns.add(((pattern, root), tuple(bad_forms)))

    # not sure this is needed, but just in case
    included_patterns = list(included_patterns)
    including_patterns = list(including_patterns)
    matching_patterns = list(matching_patterns)
    if matching_patterns:
        if DEBUG:
            print(u'dokładne wzory: %s' % join(matching_patterns))
        return 'match', matching_patterns, included_patterns, including_patterns

    # nothing fits, or several patterns have to be combined
    if DEBUG and including_patterns:
        print(u'zawierające: %s' % join(p for p, b_f in including_patterns))
    if DEBUG and included_patterns:
        print(u'zawarte: %s' % join(included_patterns))
    return find_many_patterns(
        pos, ic, form_set, basic_form, included_patterns, ending_sets) + (
        included_patterns, including_patterns)
def find_many_patterns(pos, ic, form_set, basic_form, included_patterns,
                       ending_sets):
    """Try to cover ``form_set`` with several included patterns.

    ``included_patterns`` is an iterable of ``(pattern, root)`` pairs and
    ``ending_sets`` maps each pattern to the set of forms it produces.

    Returns ``('none', [])`` as soon as some form is produced by no pattern
    (including an empty-string form).  Otherwise returns ``('many', sets)``:
    when the patterns that are the only source of some form already cover
    everything, ``sets`` holds that single combination; if they do not, it
    is whatever ``find_minimal_sets`` returns.

    ``pos``, ``ic`` and ``basic_form`` are accepted for interface
    compatibility and are not used here.
    """
    necessary_patterns = set()
    for form in form_set:
        having = [(pattern, root) for pattern, root in included_patterns
                  if form in ending_sets[pattern]]
        if not having:
            # Decide on the list being empty rather than on the form's
            # truthiness, so an uncovered '' form is reported as well.
            if DEBUG:
                print(u"brak formy: %s" % form)
            return 'none', []
        if len(having) == 1:
            # The only pattern able to produce this form is mandatory.
            necessary_patterns.add(having[0])

    covered_forms = set()
    for pattern, root in necessary_patterns:
        covered_forms |= ending_sets[pattern]
    if form_set.issubset(covered_forms):
        if DEBUG:
            print(u"pokryte koniecznymi wzorami: %s" % join(necessary_patterns))
        return 'many', [list(necessary_patterns)]

    minimal_sets = find_minimal_sets(
        form_set, covered_forms, necessary_patterns, included_patterns,
        ending_sets)
    return 'many', minimal_sets
def filter_patterns(filter, action_name, type, patterns, included, including,
                    lexical_class, form_set, entry, ic):
    """Apply ``filter`` to a pattern-matching result and repair it if needed.

    ``filter`` is a callable taking a collection of ``(pattern, root)``
    pairs and returning the ones to keep; ``action_name`` only appears in
    debug messages.  ``type``/``patterns``/``included``/``including`` have
    the shape produced by ``find_patterns``.

    * ``'many'``: if filtering would change any pattern set, ``included`` is
      filtered and the multi-pattern search is redone; should that no longer
      give ``'many'``, the original patterns and ``included`` are restored
      and the result is flagged.
    * ``'none'``: only ``including`` is filtered.
    * ``'match'``: all three collections are filtered; if every matching
      pattern disappears, the multi-pattern search is tried, and if that
      yields ``'none'`` the original patterns are restored and the result
      is flagged (``included`` stays filtered).

    Returns ``(type, patterns, included, including, bad_patterns)`` where
    ``bad_patterns`` is the flag mentioned above.
    """
    def rematch(candidates):
        # Rebuild the ending sets for the surviving patterns and search
        # again for a multi-pattern cover.
        sets = {}
        for pattern, root in candidates:
            sets[pattern] = good_ending_set(lexical_class, ic, pattern, root)
        return find_many_patterns(
            lexical_class, ic, form_set, entry, candidates, sets)

    def filtered_including():
        by_key = dict(including)
        return [(key, by_key[key]) for key in filter(by_key)]

    original_patterns, original_included = patterns, included
    bad_patterns = False

    if type == 'many':
        changed = any(
            pattern_set != filter(pattern_set) for pattern_set in patterns)
        if changed:
            included = filter(included)
            type, patterns = rematch(included)
            if type != 'many':
                debug(entry, u'mnogie dopasowanie zepsute przez %s (%s)' %
                      (action_name, join_many(original_patterns)))
                type = 'many'
                patterns, included = original_patterns, original_included
                bad_patterns = True
    elif type == 'none':
        including = filtered_including()
    else:
        # type == 'match'
        patterns = filter(patterns)
        including = filtered_including()
        included = filter(included)
        if original_patterns and not patterns:
            type, patterns = rematch(included)
            if type == 'none':
                debug(entry, u'znikły wzory przez %s (%s)' %
                      (action_name, join(original_patterns)))
                type = 'match'
                patterns = original_patterns
                bad_patterns = True

    return type, patterns, included, including, bad_patterns
def process_forms(forms, base, pos, commonness):
    """Build and print import data for one lexeme.

    ``forms`` is the list of ``(form, tag)`` pairs collected for the entry
    ``base`` with part of speech ``pos``; ``commonness`` is the label stored
    on the lexeme data.

    Nothing happens when a ``Lexeme`` with this entry already exists.
    Otherwise patterns are searched with ``find_patterns``, passed through
    the blacklist filter, and one choice is made:

    * ``'none'`` - no pattern is chosen; for adjectives that have including
      patterns the forms are printed with ``print_forms`` and processing
      stops;
    * ``'match'`` - matching patterns are sorted by pattern name and the
      first one is chosen;
    * ``'many'`` - the first pattern set is chosen.

    Unless ``DEBUG`` is on, the lexeme data is printed with ``print_data``.
    Its status is ``'desc'`` only for a single exact match whose commonness
    is not ``u'własna'`` and which did not rely on blacklisted patterns;
    in every other case it is ``'cand'``.
    """
    # exists() answers the question without loading the matching rows.
    if Lexeme.objects.filter(entry=base).exists():
        return
    ic = inflection_characteristic(forms, pos)
    form_set = set(form for form, tag in forms)
    match_type, patterns, included, including = find_patterns(
        base, pos, ic, forms)
    match_type, patterns, included, including, bad_patterns = filter_patterns(
        blacklist_filter, u'czarną listę', match_type, patterns, included,
        including, pos, form_set, base, ic)
    # the patterns will not change from this point on
    if match_type == 'none':
        debug(base, u'zawiera się w %s' % join(p for p, b_f in including))
        chosen = []
        fitting = including
        if pos == 'adj' and including:
            print_forms(forms, 'rzeczownik#')
            return
    elif match_type == 'match':
        patterns.sort(key=lambda p: p[0].name)
        fitting = patterns
        chosen = patterns[:1]
    elif match_type == 'many':
        chosen = patterns[0]
        if DEBUG:
            print(u'zestawy wielu wzorów: %s' % join_many(patterns))
        fitting = patterns

    if DEBUG:
        return

    comments = [u'z Korpusu IPI 1.0']
    if commonness == u'własna' or match_type != 'match' or len(fitting) > 1:
        status = 'cand'
    else:
        status = 'desc'
    if bad_patterns:
        comments.append(u'Wzory z czarnej listy!')
        status = 'cand'
    if len(fitting) > 1 or (match_type == 'none' and fitting):
        if match_type == 'none':
            comments.append(u'Zawierające wzory:')
            for (pattern, root), bad_forms in fitting:
                comments.append(
                    u'%s: %s' % (pattern.name, ', '.join(bad_forms)))
        elif match_type != 'many':
            comments.append(u'Pasujące wzory: %s' % join(fitting))
        else:
            comments.append(
                u'Pasujące zestawy wzorów: %s' % join_many(fitting))
    comment = '\n'.join(comments)

    # One lexeme-pattern link per chosen (pattern, root), numbered from 1.
    lips = [create_lip(pattern[0], pattern[1], i + 1, ic, pos)
            for i, pattern in enumerate(chosen)]
    lexeme_data = create_lexeme(base, 1, pos, status, comment)
    lexeme_data['commonness'] = commonness
    print_data({
        'lexeme': lexeme_data,
        'lips': lips,
    })
def import_kipi(input_file):
    """Read a KIPI dump and process it lexeme by lexeme.

    ``input_file`` is an iterable of UTF-8 encoded lines, each holding four
    tab-separated fields: form, base form, commonness code and tag.
    Consecutive lines sharing the same (base, part of speech, commonness)
    are collected into one group and handed to ``process_forms``.  The part
    of speech is ``'subst'`` when the tag starts with ``'subst'`` and
    ``'adj'`` otherwise.

    Blank lines are skipped and an input without any data lines is a no-op.
    A non-blank line that does not have exactly four fields raises
    ``ValueError``; an unknown commonness code raises ``KeyError``.
    """
    last_key = None
    forms = []
    for line in input_file:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line at the end of the dump
            continue
        form, base, comm, tag = line.decode('utf-8').split('\t')
        # startswith rather than split, because of the bare 'adja' tag
        pos = 'subst' if tag.startswith('subst') else 'adj'
        key = (base, pos, comm)
        if key != last_key:
            if last_key is not None:
                process_forms(
                    forms, last_key[0], last_key[1], COMMONNESS[last_key[2]])
            last_key = key
            forms = []
        forms.append((form, tag))
    # Flush the final group, if the input contained any data at all.
    if last_key is not None:
        process_forms(forms, last_key[0], last_key[1], COMMONNESS[last_key[2]])