Commit 3aca112b76a6a084dcc86e8e05760bdd696fdfbb
1 parent
34f731c6
forgot 2 files
Showing
2 changed files
with
1544 additions
and
0 deletions
importer/RealizationDescriptionUtils.py
0 → 100644
1 | +import morfeusz2 | |
2 | + | |
3 | +from entries.phrase_descriptions.descriptions import make_phraseologisms | |
4 | + | |
5 | +from importer.Phrase import Case, Preposition, Modification, Words, NP, LexNP, PrepNP, LexPrepNP, AdjP, LexAdjP, PActP, LexPActP, Compar, LexCompar, Fixed | |
6 | +from importer.Position import Position | |
7 | + | |
# Shared Morfeusz 2 analyser for the lookup helpers in this module:
# analysis only (generation switched off), with expand_tags=True.
morfeusz = morfeusz2.Morfeusz(analyse=True, generate=False, expand_tags=True)
9 | + | |
# Ids of synsets that are dropped before a lemma is chosen for an argument.
SYNSET_KILL_LIST = (
    # forma skupienia substancji-1
    # (seen with: miksować na masę/pianę/sok/.../formę skupienia substancji)
    5564,
    # kopytny-1 kopytowiec-1 (seen in: potratować)
    24299,
    # wyrywanie-1 (seen in: wyplenić; an annoying gerund, we take
    # herbicyd-1 / środek chwastobójczy-1 instead)
    87011,
    # GERUNDIUM-1
    102594,
)
16 | + | |
# (base, sense) pairs of lexical units that must never be selected as the
# lemma representing a synset.  The note above each group quotes the original
# annotation: other unit(s), presumably the synset-mates that are meant to be
# picked instead -- TODO confirm.
UNIT_KILL_LIST = (
    # część ciała-1
    ('członek', '3'),
    # kobieta-1
    ('pani', '2'),
    # mężczyzna-1
    ('pan', '2'),
    # wersja-1
    ('rodzaj', '1'),
    ('twarz', '3'),
    # nazwa własna-1
    ('rzeczownik własny', '1'),
    # środek transportu-1
    ('środek lokomocji', '1'),
    # autor-1, twórca-1
    ('ojciec', '3'),
    # dyscyplina sportowa-1
    ('dyscyplina sportu', '1'),
    # pupa-1 etc.
    ('siedzenie', '7'),
    # jednostka długości
    ('jednostka odległości', '1'),
    # sprzęt sportowy
    ('przyrząd sportowy', '1'),
    # produkt spożywczy
    ('środek spożywczy', '1'),
    # wytwór umysłu-1
    ('płód', '3'),
    # czynność prawna
    ('działanie prawne', '1'),
    # dwutlenek węgla-1
    ('CO2', '1'),
    ('ditlenek węgla', '1'),
    # kondycja fizyczna
    ('forma fizyczna', '1'),
    ('sprawność fizyczna', '1'),
    # paliwo kopalne
    ('surowiec energetyczny', '1'),
    # stopień Celsjusza
    ('C', '6'),
    ('°C', '1'),
    ('celsjusz', '1'),
    # ceremoniał-1, obrządek-1, ...
    ('akt', '4'),
    # udaremnianie-1
    ('uniemożliwianie', '1'),
    # zły uczynek
    ('zły czyn', '1'),
    # utwór literacki
    ('dzieło literackie', '1'),
    # profesor zwyczajny
    ('profesor nadzwyczajny', '1'),
    # dyscyplina naukowa
    ('nauka', '1'),
    ('dziedzina wiedzy', '1'),
    # ciąg zdarzeń
    ('ciąg wydarzeń', '1'),
    # uchybienie-1 wykroczenie-1
    ('zaburzenie', '1'),
    ('naruszenie', '2'),
    ('przekroczenie', '3'),
    ('złamanie', '2'),
    # podmiot gospodarczy
    ('agent gospodarczy', '1'),
    ('jednostka gospodarcza', '1'),
    # obróbka cieplna
    ('obróbka termiczna', '1'),
    # dzieło-2
    ('praca', '6'),
    # pojazd szynowy-1
    ('pojazd kolejowy', '1'),
    # wonności-1
    ('aromaty', '1'),
    # statek kosmiczny-1
    ('pojazd kosmiczny', '1'),
    # zbieg okoliczności
    ('splot wydarzeń', '1'),
    # cios-1, uderzenie-10
    ('raz', '2'),
    # przychylność-2 sympatia-1
    ('zainteresowanie', '2'),
    ('inklinacja', '2'),
)
56 | + | |
# Synset id -> key of a predefined category; a synset listed here is
# described with the predefined lemma instead of one of its own units.
# TODO? heuristic: for every lexical unit longer than 2 (3?) words, move on
# to a hyponym/hypernym instead?
SYNSET2LEMMA = {
    6047: 'LUDZIE',    # człowiek-1, istota ludzka-1, jednostka-2, osoba-1
    6822: 'LUDZIE',    # człowiek ze względu na pełnioną funkcję-1
    6797: 'LUDZIE',    # człowiek ze względu na swoje zajęcie-1
    6775: 'LUDZIE',    # człowiek ze względu na relacje społeczne-1
    20542: 'LUDZIE',   # człowiek o określonej narodowości, przynależności państwowej-1
    6779: 'LUDZIE',    # człowiek charakteryzowany ze względu na kwalifikacje-1
    241977: 'LUDZIE',  # człowiek, który coś robi-1
    25121: 'MIEJSCE',  # miejsce ze względu na przeznaczenie-1
    105470: 'MIEJSCE', # miejsce wyróżniające się z całości obiektu-1
    26983: 'ALL',      # rzecz oceniana negatywnie-1
    31937: 'ALL',      # rzecz oceniana pozytywnie-1
    39190: 'ALL',      # własność komunikatu, wypowiedzi-1
    103163: 'ALL',     # zbiór obiektów - kolekcja elementów-1
    97278: 'ALL',      # doświadczenie negatywnego uczucia-1
    102592: 'ALL',     # GERUNDIUM OD CZASOWNIKA DYNAMICZNEGO (AKCJA)-1
}
90 | + | |
# Synset id -> id of the synset to use in its place when choosing a lemma.
# Each entry is annotated "source synset -> replacement synset".
# Hyponyms usually make more sense as replacements.
SYNSET_MAP = {
    225273: 129,     # człowiek płci żeńskiej-1 -> kobieta-1
    239405: 683,     # człowiek, który wykonuje czynności religijne-1 -> kapłan-1
    225272: 6709,    # człowiek płci męskiej-1 -> mężczyzna-1
    6379: 597,       # pracownik służby zdrowia-1 -> doktor-1 lek.-1 lekarz-1
    41644: 7909,     # człowiek ukarany-1 -> skazaniec-1 skazany-1
    4821: 8858,      # placówka służby zdrowia-1 -> lecznica-2 przychodnia-1
    225184: 7354,    # pojazd nawodny lub podwodny-1 -> statek-1
    227639: 225095,  # pojazd o przeznaczeniu wojskowym-1 -> pojazd wojskowy-1
    42387: 14273,    # narzędzie do łowienia-1 -> wędka-1
    237614: 17750,   # urządzenie wspomagające proces liczenia-1 -> liczydło-1
    # zdarzenie oceniane negatywnie-1
    #   -> cios-2 dramat-3 nieszczęście-1 tragedia-1 zguba-1
    27419: 1911,
    # czynność o charakterze sakralnym-1
    #   -> akt-4 ceremoniał-1 obrządek-1 obrzęd-1 ryt-2 rytuał-1
    105440: 6566,
    # zrobienie czegoś niezgodnego z prawem lub z normami społecznymi-1
    #   -> naruszenie-2 przekroczenie-3 uchybienie-1 wykroczenie-1
    #      zaburzenie-1 złamanie-2
    99630: 99214,
    233727: 25206,   # instrument dęty blaszany-1 -> róg-5 waltornia-1
    41774: 41779,    # jednostka miary kąta-1 -> deg-1 stopień-6 °-1
    # czynność o charakterze fizjologicznym-1 -> czynność-1
    # (the hyponyms are unsuitable here: respiracja-1, plozja-1)
    105442: 10765,
    80502: 18234,    # zamachowiec-samobójca-1 -> zamachowiec-1
    59620: 969,      # gracz w karty-1 -> gracz-1
    227275: 5519,    # sztuczne źródło światła-1 -> światło-10 źródło światła-1
    # czytnik kodu paskowego-1 skaner do kodów kreskowych-1 -> czytnik-1
    237869: 13600,
    # środek transportu publicznego-1 -> środek lokomocji-1 środek transportu-1
    228654: 8674,
    238882: 8194,    # sposób poruszania się-1 -> sposób-1
    # właściwość zdarzenia, sytuacji-1
    #   -> atrybut-1 cecha-1 przymiot-1 własność-2 właściwość-1
    5479: 323,
    36346: 323,      # cecha żywego organizmu-1 -> cecha-1
    249629: 323,     # cecha uwarunkowana kulturowo-1 -> cecha-1
    234138: 4286,    # zawodnik w drużynie-1 -> zawodnik-1
    231263: 7946,    # metryczna jednostka masy-1 -> jednostka masy-1
    # część garderoby-1 element garderoby-1 element ubioru-1 sztuka odzieży-1
    #   -> strój-1 ubiór-1 ubranie-3   (meronymy)
    227081: 3289,
    # żywienie do kogoś pozytywnych uczuć-1
    #   -> inklinacja-2 przychylność-2 sympatia-1 zainteresowanie-2
    # (a completely unrelated synset; "żywienie..." comes from the
    # GERUNDIUM hierarchy)
    81622: 3367,
}
155 | + | |
# Base forms that are discarded from the analyser output.
LEMMA_KILLLIST = ('odzieża',)


def get_interps(word, lemma=None, tag_constraints=None):
    """Return the set of (base, tag) interpretations of *word*.

    The base form is cut at the first ':' before any comparison.  An
    interpretation is kept when its base is not in LEMMA_KILLLIST, equals
    *lemma* (if one is given) and its colon-separated tag elements include
    every element of *tag_constraints* (if any are given).
    """
    matches = set()
    for _i, _j, (_orth, base, tag, _p, _k) in morfeusz.analyse(word):
        base = base.split(':')[0]
        if base in LEMMA_KILLLIST:
            continue
        if lemma and base != lemma:
            continue
        if tag_constraints and not set(tag.split(':')).issuperset(tag_constraints):
            continue
        matches.add((base, tag))
    return matches
166 | + | |
def get_adj_lemma(word):
    """Return (lemma, degree) for the adjective form *word*.

    The degree is read from the fifth colon-separated element of the tag.
    Asserts that the 'adj' interpretations agree on a single lemma and a
    single degree.
    """
    lemmata = set()
    degrees = set()
    for base, tag in get_interps(word, tag_constraints=['adj']):
        lemmata.add(base)
        degrees.add(tag.split(':')[4])
    assert len(lemmata) == 1
    assert len(degrees) == 1
    return lemmata.pop(), degrees.pop()
174 | + | |
def get_pact_lemma(word):
    """Return the lemma of *word* analysed with a 'pact' tag.

    Asserts that all such interpretations share one lemma.
    """
    candidates = {base for base, _tag in get_interps(word, tag_constraints=['pact'])}
    assert len(candidates) == 1
    return candidates.pop()
180 | + | |
def get_subst_lemma(word, case):
    """Return (lemma, number) for the noun form *word* in the given *case*.

    The number is 'sg' whenever any matching interpretation is singular,
    otherwise 'pl'.

    Raises:
        AssertionError: if the matching interpretations do not agree on
            exactly one lemma.  The offending word, case and interpretations
            are printed first.
    """
    if (word, case) == ('zbrodni', 'gen'):
        # Hard-wired: the wanted lemma is 'zbrodnia', not 'zbrodzień'.
        return 'zbrodnia', 'sg'
    interps = get_interps(word, tag_constraints=['subst', case])
    lemmata = {lemma for lemma, _tag in interps}
    nums = {tag.split(':')[1] for _lemma, tag in interps}
    # Explicit check instead of `assert` inside a bare `except:` -- it keeps
    # working under `python -O` and does not intercept unrelated exceptions.
    # AssertionError is kept so existing callers see the same exception type.
    if len(lemmata) != 1:
        print(word, case, interps)
        raise AssertionError(
            'expected exactly one noun lemma for {!r} ({}), got: {}'.format(
                word, case, sorted(lemmata)))
    # TODO? a heuristic for the number, e.g. for <sztuka *odzieży*>
    return lemmata.pop(), 'sg' if 'sg' in nums else 'pl'
195 | + | |
def adj2adv(lemma):
    """Derive an adverb from an adjective lemma ending in 'ny'.

    The final 'y' is replaced with 'ie' (e.g. 'ładny' -> 'ładnie').

    Raises:
        ZeroDivisionError: for any lemma not ending in 'ny'.  The original
            code crashed on purpose with ``1/0``; the exception type is kept
            for compatibility, but the message now names the lemma.
    """
    if lemma.endswith('ny'):
        return lemma[:-1] + 'ie'
    raise ZeroDivisionError(
        'adj2adv: no adverb derivation rule for {!r}'.format(lemma))
200 | + | |
201 | +''' | |
202 | +def get_degree(word): | |
203 | + interps = morfeusz.analyse(word) | |
204 | + adj_tags = [interp[2][2] for interp in interps if interp[2][2].startswith('adj:')] | |
205 | + degrees = set(tag.split(':')[4] for tag in adj_tags) | |
206 | + assert(len(degrees) == 1) | |
207 | + return degrees.pop() | |
208 | +''' | |
209 | + | |
210 | +''' | |
211 | +def get_num(word, case): | |
212 | + interps = morfeusz.analyse(word) | |
213 | + subst_tags = [interp[2][2] for interp in interps if interp[2][2].startswith('subst:')] | |
214 | + nums = set(tag.split(':')[1] for tag in subst_tags if case in tag) | |
215 | + # TODO heuristic | |
216 | + return 'sg' if 'sg' in nums else 'pl' | |
217 | +''' | |
218 | + | |
# Lemmas that process_lemma returns directly as neuter singular 'subst'.
GERUNDS = {'przyrządzanie', 'szarpnięcie'}
222 | + | |
# Hand-picked grammatical gender per lemma, grouped by gender.  get_gender
# returns the value from GENDER when all interpretations share one form that
# is listed here, instead of reading the gender from the tags.
_GENDER_GROUPS = (
    ('n', (
        'braterstwo', 'DNA', 'narzeczeństwo',
    )),
    ('f', (
        'burda', 'nuda', 'pasza', 'przekora',
        'reżyser',  # grl pwr!
        'sędzia', 'włóczęga',
    )),
    ('m1', (
        'aspirant', 'bojownik', 'champion',
        'członek',  # as in: członek rodziny
        'faworyt', 'gad', 'głąb', 'idol', 'informator', 'inicjator',
        'mediator', 'orzeł', 'pilot', 'poseł', 'pośrednik', 'prekursor',
        'profesor', 'przewodnik', 'reprezentant', 'śpiewak', 'tchórz',
        'tłumacz', 'zbieg',
    )),
    ('m2', (
        'blues',  # as in: czuje bluesa
        'całus', 'całusek', 'czort', 'diabeł', 'drapieżnik', 'fioł', 'grzyb',
        'guz', 'kaduk', 'kogut', 'konik', 'kopniak', 'kosz', 'kot',
        'koziołek', 'lód', 'Oscar', 'pająk', 'papieros',
        'pies',  # the dog that fetches, not the one with a truncheon
        'pion',  # the chess piece
        'samiec', 'ssak',
        'świr',  # as in: dostać świra
        'wąż', 'zając', 'ziemniak',
    )),
    ('m3', (
        'dobry',
        'antytalent', 'aparat', 'asfalt', 'bas', 'bełkot', 'beton', 'bis',
        'blog', 'bodziec', 'bruderszaft', 'but', 'bzik', 'cep', 'chichot',
        'czubek', 'drapak', 'drut',
        'dzień',  # not f
        'dół', 'e-mail', 'fart', 'fenomen', 'figiel', 'flak', 'flis',
        'galop', 'gnat', 'grill', 'grzmot', 'ideał', 'klawisz', 'klops',
        'kołek', 'kozioł', 'Księżyc', 'kuksaniec', 'kurek', 'lek',
        'licencjat', 'link', 'łeb', 'marsz', 'mat', 'mejl', 'metal',
        'mikser', 'minus', 'młot', 'model', 'news', 'odpowiednik',
        'operator', 'palec', 'paluch', 'pas', 'plaster', 'plus', 'pomruk',
        'popłoch', 'post', 'powód', 'rapsod', 'ryj', 'Rzym', 'sen', 'skręt',
        'smród', 'sport', 'stop', 'strach', 'sygnalizator', 'szampan',
        'talent', 'wist', 'walkower', 'wał', 'wyż', 'zajob', 'ząb',
        'zwiastun',
    )),
)

GENDER = {
    lemma: gender
    for gender, lemmas in _GENDER_GROUPS
    for lemma in lemmas
}
373 | + | |
def get_gender(interps):
    """Return the grammatical gender shared by *interps*, or None.

    *interps* is an iterable of (form, tag) pairs.  When every pair has the
    same form and that form is listed in GENDER, the GENDER value is
    returned.  Otherwise the gender is the fourth colon-separated element of
    each tag; if the interpretations do not agree on exactly one gender they
    are printed and None is returned.
    """
    # TODO?
    forms = {form for form, _tag in interps}
    if len(forms) == 1:
        (form,) = forms
        if form in GENDER:
            return GENDER[form]
    genders = {tag.split(':')[3] for _form, tag in interps}
    # Explicit check rather than `assert` wrapped in a bare `except:`: under
    # `python -O` the assert vanished and an arbitrary gender was returned
    # for ambiguous input; the bare except also swallowed KeyboardInterrupt.
    if len(genders) == 1:
        return genders.pop()
    print(interps)
    return None
385 | + | |
def get_simplified_tags(word):
    """Return the set of simplified tags over all analyses of *word*.

    'subst' and 'ger' tags are reduced to '<pos>:<third tag element>' and
    'prep' tags to 'prep:<second tag element>' (POS + case in both cases);
    every other tag is reduced to its part of speech alone.
    """
    simplified = set()
    for interp in morfeusz.analyse(word):
        fields = interp[2][2].split(':')
        pos = fields[0]
        if pos == 'prep':
            simplified.add('prep:{}'.format(fields[1]))
        elif pos in ('subst', 'ger'):
            simplified.add('{}:{}'.format(pos, fields[2]))
        else:
            simplified.add(pos)
    return simplified
400 | + | |
def make_comprepnp(comprep, words, num, mod):
    """Return a one-element list: *comprep* followed by a genitive phrase.

    A lexicalised genitive NP is built from *words*, *num* and *mod* and
    rendered with make_phraseologisms; exactly one distinct rendering is
    asserted.
    """
    genitive_np = NP(Case('gen'), None)
    lexicalised = LexNP(genitive_np, num, words, mod, None)
    renderings = set(make_phraseologisms(lexicalised, None, None))
    assert len(renderings) == 1
    rendered = renderings.pop()
    return ['{} {}'.format(comprep, rendered)]
408 | + | |
def make_compar(compar, words, num, mod, controller):
    """Build a LexCompar for *compar* around a single lexicalised NP.

    The NP takes its case from ``controller.getCase()``; a truthy
    *controller* is asserted.
    """
    assert controller
    case = Case(controller.getCase())
    lexicalised = LexNP(NP(case, None), num, words, mod, None)
    return LexCompar(compar, [lexicalised], None)
416 | + | |
def make_adjp_mod(word):
    """Build a 'ratr1' Modification with one lexicalised adjective phrase.

    The lemma and degree of *word* come from get_adj_lemma; case, and the
    two arguments following the phrase, are all 'agr'.
    """
    lemma, degree = get_adj_lemma(word)
    lexicalised = LexAdjP(
        AdjP(Case('agr'), None),
        'agr',
        'agr',
        degree,
        Words('concat', 'xor', [lemma]),
        NATR,
        None,
    )
    position = Position(None, None, None, [lexicalised], None)
    return Modification('ratr1', [position])
425 | + | |
def make_pactp_mod(word):
    """Build a 'ratr1' Modification with one lexicalised 'pact' phrase.

    The lemma of *word* comes from get_pact_lemma.
    """
    lemma = get_pact_lemma(word)
    lexicalised = LexPActP(
        PActP(Case('agr'), None),
        'agr',
        'agr',
        'aff',
        Words('concat', 'xor', [lemma]),
        '',
        NATR,
        None,
    )
    position = Position(None, None, None, [lexicalised], None)
    return Modification('ratr1', [position])
434 | + | |
def make_npgen_mod(word):
    """Build a 'ratr1' Modification with one lexicalised genitive NP.

    The lemma and number of *word* come from get_subst_lemma(word, 'gen').
    """
    lemma, num = get_subst_lemma(word, 'gen')
    lexicalised = LexNP(
        NP(Case('gen'), None),
        num,
        Words('concat', 'xor', [lemma]),
        NATR,
        None,
    )
    position = Position(None, None, None, [lexicalised], None)
    return Modification('ratr1', [position])
443 | + | |
def make_prepnp_mod(word, prep, case):
    """Build a 'ratr1' Modification with one lexicalised prepositional NP.

    The phrase uses preposition *prep* with *case*; the lemma and number of
    *word* come from get_subst_lemma(word, case).
    """
    preposition = Preposition(prep, Case(case))
    lemma, num = get_subst_lemma(word, case)
    lexicalised = LexPrepNP(
        PrepNP(preposition, None),
        num,
        Words('concat', 'xor', [lemma]),
        NATR,
        None,
    )
    position = Position(None, None, None, [lexicalised], None)
    return Modification('ratr1', [position])
452 | + | |
def make_fixed_mod(text):
    """Build a 'ratr1' Modification with a single Fixed phrase for *text*."""
    position = Position(None, None, None, [Fixed(None, text, None)], None)
    return Modification('ratr1', [position])
458 | + | |
# The 'natr' modification (constructed with no positions).
NATR = Modification('natr', None)
# Modification by the adjective 'jakiś'.
JAKIS = make_adjp_mod('jakiś')


def _noun_with_jakis(lemma, gender, num='sg'):
    """Return the default ('_') entry for a noun lemma modified by JAKIS."""
    return {'_': (lemma, gender, num, 'subst', JAKIS)}


# Predefined category -> {phrase type -> (lemma, gender, number, pos, mod)}.
# The '_' entry is the fallback for phrase types without their own entry.
PREDEF2LEMMA = {
    'CECHA': {
        'adjp': ('jakiś', None, 'sg', 'adj', NATR),
        **_noun_with_jakis('cecha', 'f'),
    },
    'CZAS': _noun_with_jakis('moment', 'm3'),
    'CZEMU': _noun_with_jakis('powód', 'm3'),
    'CZYNNOŚĆ': _noun_with_jakis('czynność', 'f'),
    'DOBRA': _noun_with_jakis('dobra', 'n', 'pl'),
    'ILOŚĆ': _noun_with_jakis('ilość', 'f'),
    'ISTOTY': _noun_with_jakis('istota', 'f'),
    'JADŁO': _noun_with_jakis('pożywienie', 'n'),
    # Ela suggested 'sytuacja', but then we get e.g. "absolutorium za jakąś
    # sytuację", which looks odd.
    'KIEDY': _noun_with_jakis('czas', 'm3'),
    'KOMUNIKAT': _noun_with_jakis('komunikat', 'm3'),
    'KONCEPCJA': _noun_with_jakis('koncepcja', 'f'),
    'LUDZIE': {
        'possp': ('czyjś', None, 'sg', 'adj', NATR),
        '_': ('ktoś', 'm1', 'sg', 'subst', NATR),
    },
    'MIEJSCE': _noun_with_jakis('miejsce', 'n'),
    'OBIEKTY': _noun_with_jakis('obiekt', 'm3'),
    'OTOCZENIE': _noun_with_jakis('otoczenie', 'n'),
    # PODMIOTY has no entry of its own: it is handled as LUDZIE.
    'POŁOŻENIE': _noun_with_jakis('położenie', 'n'),
    'SYTUACJA': _noun_with_jakis('sytuacja', 'f'),
    'WYTWÓR': _noun_with_jakis('wytwór', 'm3'),
    'ALL': {
        '_': ('coś', 'n', 'sg', 'subst', NATR),
    },
}
529 | + | |
# Ready-made Polish realisations, keyed first by an xp-type key ('caus',
# 'dest', ...) and then by predefined category.
PREDEFXP = {
    'caus': {
        'LUDZIE': 'z czyjegoś powodu',
        'ISTOTY': 'z powodu jakiejś istoty',
        'ALL': 'z jakiegoś powodu',
    },
    'dest': {
        'LUDZIE': 'dla kogoś',
        'ALL': 'w jakimś celu',
    },
    'mod': {
        'ISTOTY': 'w jakiś sposób',
        'ALL': 'w jakiś sposób',
    },
    'instr': {
        'LUDZIE': 'z czyjąś pomocą',
        'ISTOTY': 'z pomocą jakiejś istoty',
        'ALL': 'za pomocą czegoś',
    },
    'abl': {
        'LUDZIE': 'od kogoś',
        'ISTOTY': 'od jakiejś istoty',
        'ALL': 'skądś',
    },
    'adl': {
        'LUDZIE': 'do kogoś',
        'ISTOTY': 'do jakiejś istoty',
        'MIEJSCE': 'dokądś',
        'ALL': 'dokądś',
    },
    'locat': {
        'LUDZIE': 'u kogoś',
        'ISTOTY': 'u jakiejś istoty',
        'MIEJSCE': 'gdzieś',
        'ALL': 'gdzieś',
    },
    'perl': {
        # "jak żebrak, jak bandyta czołgam się *pod nimi*" -- this should
        # rather be a prepnp than xp(perl), but something has to be assigned,
        # so "po kimś": if anything, something/someone can walk over someone.
        'LUDZIE': 'po kimś',
        # Fixed: was 'po jakieś istocie'; the locative form is 'jakiejś',
        # as in the other ISTOTY entries.
        'ISTOTY': 'po jakiejś istocie',
        'ALL': 'którędyś',
    },
    'temp': {
        'LUDZIE': 'kiedyś',  # e.g. bezprawie
        'ALL': 'kiedyś',
    },
    'dur': {
        'ALL': 'przez jakiś czas',
    },
}
... | ... |
importer/RealizationDescriptions.py
0 → 100644
1 | +import os | |
2 | + | |
3 | +from collections import Counter, defaultdict | |
4 | +from itertools import chain | |
5 | + | |
6 | +from shellvalier.settings import BASE_DIR | |
7 | + | |
8 | +from meanings.models import LexicalUnit, Synset | |
9 | +from semantics.models import SemanticRole, RoleAttribute | |
10 | + | |
11 | +from entries.phrase_descriptions.utils import get_form | |
12 | +from entries.phrase_descriptions.polish_strings import TO | |
13 | +from entries.phrase_descriptions.descriptions import make_phraseologisms | |
14 | + | |
15 | +from importer.Phrase import Case, Preposition, Modification, Words, LexPhrase, Fixed, NP, LexNP, LexNumP, PrepNP, LexPrepNP, LexPrepGerP, AdjP, LexAdjP, LexPrepAdjP, PActP, LexPActP | |
16 | +from importer.RealizationDescriptionUtils import * | |
17 | + | |
def get_prefs_list(argument):
    """Return the argument's preferences as one list of strings.

    Predefined preferences come first, then synsets, then relations; each
    group is converted with str() and sorted on its own.
    """
    prefs = []
    for manager in (argument.predefined, argument.synsets, argument.relations):
        prefs.extend(sorted(str(item) for item in manager.all()))
    return prefs
26 | + | |
# Role names for which get_predefined_lemma does not collapse
# LUDZIE/PODMIOTY preferences into ['LUDZIE'].
LOCATION_ROLES = {'Path', 'Location'}
28 | + | |
def select_predefined(predefs):
    """Pick one predefined category key from *predefs*.

    Returns the only element when there is exactly one, otherwise 'ALL'
    (including for an empty sequence).
    """
    # TODO: other heuristics for several predefined categories?
    # (The unreachable print / 1/0 that followed the return was removed.)
    if len(predefs) == 1:
        return predefs[0]
    return 'ALL'
36 | + | |
def select_predefined_for_xp(predefs, role):
    """Pick the predefined category key used for an xp realisation.

    Returns 'ISTOTY' only when *predefs* is exactly ['ISTOTY'], otherwise
    'ALL'.  *role* is accepted but not consulted.
    """
    # TODO: heuristics?
    # (The unreachable print / 1/0 that followed the return was removed.)
    if predefs == ['ISTOTY']:
        return 'ISTOTY'
    return 'ALL'
44 | + | |
def get_predefined_lemma(argument, xp=False):
    """Return a one-element list with the predefined category key, or None.

    None is returned when the argument has no predefined preferences.
    Outside LOCATION_ROLES, any LUDZIE or PODMIOTY preference yields
    ['LUDZIE'].  Otherwise the key is chosen by select_predefined_for_xp
    (when *xp* is true) or select_predefined.
    """
    predefined = argument.predefined.all()
    if not predefined:
        return None
    keys = sorted(item.key for item in predefined)
    role = argument.role.role.role
    human = {'LUDZIE', 'PODMIOTY'}.intersection(keys)
    if human and role not in LOCATION_ROLES:
        return ['LUDZIE']
    chosen = select_predefined_for_xp(keys, role) if xp else select_predefined(keys)
    return [chosen]
57 | + | |
def get_hyponyms(synset, seen=None, tab=' '):
    """Return the set of all direct and indirect hyponyms of *synset*.

    *seen* (a set, updated in place) holds synsets that were already
    visited; they are neither returned nor expanded again.  *tab* is
    accepted but not used.
    """
    if seen is None:
        seen = set()
    collected = set()
    pending = [synset]
    while pending:
        current = pending.pop()
        for child in current.hyponyms.all():
            if child in seen:
                continue
            seen.add(child)
            collected.add(child)
            pending.append(child)
    return collected
68 | + | |
def select_synsets(synsets):
    """Return, as a list, the synsets that have the most hyponyms.

    Hyponyms are counted with get_hyponyms; all synsets tied for the
    maximum are returned.
    """
    grouped = defaultdict(set)
    for synset in synsets:
        grouped[len(get_hyponyms(synset))].add(synset)
    largest = max(grouped)
    return list(grouped[largest])
77 | + | |
# Lemma -> summed frequency, loaded once at import time from a tab-separated
# list with three columns: lemma, part of speech, frequency.  Only 'adj' and
# 'subst' rows with a frequency of at least 10 are counted.
FREQ = Counter()
# The encoding is given explicitly so that reading the Polish lemmas does not
# depend on the process locale.
# NOTE(review): assumes the frequency list is UTF-8 encoded -- confirm.
with open(os.path.join(BASE_DIR, 'data/freq/sgjp-freq-23032021.tab'), encoding='utf-8') as f:
    for line in f:
        lemma, pos, freq = line.strip('\n').split('\t')
        if pos not in ('adj', 'subst'):
            continue
        freq = int(freq)
        if freq < 10:
            continue
        # Inaccurate, but rows sharing a lemma are conflated into one count.
        FREQ[lemma] += freq
89 | + | |
def rank_units(units, ranker):
    """Map each unit to the rank of its ``ranker(unit)`` value.

    Ranks are consecutive integers starting at 0, assigned in ascending
    order of the distinct ranker values; units with equal values share a
    rank.
    """
    scored = [(unit, ranker(unit)) for unit in units]
    distinct_scores = sorted({score for _unit, score in scored})
    rank_of = {score: rank for rank, score in enumerate(distinct_scores)}
    return {unit: rank_of[score] for unit, score in scored}
99 | + | |
# Rankers for select_units: each maps a lexical unit to a sortable value,
# and a lower value is preferred.

def meaning_no_ranker(unit):
    """Rank by the unit's sense number, read as an integer."""
    return int(unit.sense)


# TODO: is it better to have fewer meanings (more specific -> more precise?)
# or more of them (more frequent -> easier to understand?)
def num_meanings_ranker(unit):
    """Rank by the number of lexical units sharing this unit's base."""
    return LexicalUnit.objects.filter(base=unit.base).count()


def freq_ranker(unit):
    """Rank by negated frequency of the base, so frequent bases come first.

    Bases missing from FREQ count as 0, so multi-word units also get the
    lowest priority whenever a one-word unit is on the frequency list.
    """
    return -FREQ.get(unit.base, 0)


def words_ranker(unit):
    """Rank by the number of whitespace-separated words in the base."""
    return len(unit.base.split())
106 | + | |
107 | + | |
108 | +# różnice przejrzane oczami na próbce dla: | |
109 | +# [meaning_no_ranker, freq_ranker, num_meanings_ranker] | |
110 | +# [freq_ranker, meaning_no_ranker, num_meanings_ranker] -> [freq_ranker, num_meanings_ranker, meaning_no_ranker] -> takie same wyniki na próbce, TODO sugestia Eli: druga opcja brzmi intuicyjniej | |
111 | +# [num_meanings_ranker, meaning_no_ranker, freq_ranker] | |
112 | +# [meaning_no_ranker, num_meanings_ranker, freq_ranker] | |
113 | + | |
def select_units(units, rankers=(freq_ranker, num_meanings_ranker, meaning_no_ranker, words_ranker)):
    """Return the set of best-ranked lexical units from *units*.

    Units whose (base, sense) pair is in UNIT_KILL_LIST are discarded.  Each
    remaining unit gets one rank per ranker (see rank_units); the rank
    tuples are compared in the order of *rankers* and every unit sharing the
    smallest tuple is returned.

    Raises:
        IndexError: if no unit is left to choose from.
    """
    # The default for `rankers` is a tuple: the original used a list, i.e. a
    # mutable default argument shared between calls.
    units = [unit for unit in units if (unit.base, unit.sense) not in UNIT_KILL_LIST]
    unit2rank = defaultdict(lambda: [0] * len(rankers))
    for i, ranker in enumerate(rankers):
        for unit, rank in rank_units(units, ranker).items():
            unit2rank[unit][i] = rank
    by_rank = defaultdict(set)
    for unit, rank in unit2rank.items():
        by_rank[tuple(rank)].add(unit)
    # Rank tuples are unique dict keys, so sorting the keys alone gives the
    # same winner as sorting the (rank, units) pairs.
    best_rank = sorted(by_rank)[0]
    return by_rank[best_rank]
126 | + | |
# Results of get_synsets_lemma, keyed by the sorted string forms of the
# synsets left after the kill list and SYNSET_MAP were applied.
LEMMA_CACHE = dict()


def get_synsets_lemma(argument, pos):
    """Choose the lemma(ta) describing the argument's synsets for *pos*.

    Returns None when the argument has no usable synsets for *pos*;
    otherwise a pair ``([lemmata], is_predef)``, where ``is_predef`` is True
    when the lemma is a predefined category key taken from SYNSET2LEMMA.

    Every computed result is stored in LEMMA_CACHE, including SYNSET2LEMMA
    hits and the 'cecha' special case.  These used to bypass the cache, so
    select_synsets was re-run for the same synsets.  Cached results are
    shared between calls, as the cached path already did.
    """
    synsets = [
        Synset.objects.get(id=SYNSET_MAP[s.id]) if s.id in SYNSET_MAP else s
        for s in argument.synsets.filter(lexical_units__pos=pos).distinct()
        if s.id not in SYNSET_KILL_LIST
    ]
    if not synsets:
        return None
    key = tuple(sorted(map(str, synsets)))
    if key in LEMMA_CACHE:
        return LEMMA_CACHE[key]

    # With several synsets, keep only those with the most hyponyms.
    selected = synsets if len(synsets) == 1 else select_synsets(synsets)

    result = None
    for synset in selected:
        if synset.id in SYNSET2LEMMA:
            result = ([SYNSET2LEMMA[synset.id]], True)
            break
    if result is None:
        units = list(chain.from_iterable(synset.lexical_units.all() for synset in selected))
        units = [units[0]] if len(units) == 1 else select_units(units)
        result = (sorted(unit.base for unit in units), False)
        if result[0] == ['cecha czynności', 'cecha działania']:
            result = (['cecha'], False)

    LEMMA_CACHE[key] = result
    return result
149 | + | |
150 | +#LEMMA_CACHE = dict() | |
151 | + | |
def get_argument_lemma(argument, xp=False):
    """Return ``([lemmata], is_predef)`` for *argument*.

    Predefined preferences win (see get_predefined_lemma); next come the
    noun synsets and then the adjective synsets (see get_synsets_lemma).
    When none of these yields a lemma the result is ``(['ALL'], True)``.
    """
    lemma = get_predefined_lemma(argument, xp=xp)
    if lemma:
        return lemma, True
    for pos in ('noun', 'adj'):
        # get_synsets_lemma already returns ([lemmata], is_predef)
        lemma = get_synsets_lemma(argument, pos)
        if lemma:
            return lemma
    # TODO: arguments with neither predefined preferences nor usable
    # noun/adjective synsets (e.g. for 'akuratność') fall back to ALL.
    # Relation-based lemmas are not implemented: the unreachable code that
    # followed this return called an undefined get_relations_lemma and was
    # removed.
    return ['ALL'], True
168 | + | |
169 | +# nie powinny występować razem: | |
170 | +# * LUDZIE + PODMIOTY | |
171 | +# * MIEJSCE + OTOCZENIE + POŁOŻENIE | |
172 | + | |
173 | +def process_lemma(lemma, phrase_type): | |
174 | + mod = NATR | |
175 | + if lemma in PREDEF2LEMMA: | |
176 | + lemma, gend, num, pos, mod = PREDEF2LEMMA[lemma].get(phrase_type, PREDEF2LEMMA[lemma]['_']) | |
177 | + return lemma, gend, num, pos, mod | |
178 | + | |
179 | + if ' ' in lemma: | |
180 | + # eg. ‹środki pieniężne› | |
181 | + words = lemma.split(' ') | |
182 | + tags = [] | |
183 | + for i, word in enumerate(words): | |
184 | + tags.append(sorted(get_simplified_tags(word))) | |
185 | + if len(words) == 2 and 'subst:nom' in tags[0] and 'adj' in tags[1]: | |
186 | + # np. ‹środki pieniężne› | |
187 | + # ‹napój wyskokowy› -> ‹napój› również impt, | |
188 | + # ‹stan psychiczny› -> ‹psychiczny› również subst, | |
189 | + lemma = words[0] | |
190 | + mod = make_adjp_mod(words[1]) | |
191 | + mod._order = 'post' | |
192 | + elif len(words) == 2 and 'subst:nom' in tags[1] and 'adj' in tags[0]: | |
193 | + # np. ‹zły uczynek› | |
194 | + lemma = words[1] | |
195 | + mod = make_adjp_mod(words[0]) | |
196 | + elif len(words) == 2 and 'subst:nom' in tags[0] and 'pact' in tags[1]: | |
197 | + # np. ‹pojazd latający› | |
198 | + lemma = words[0] | |
199 | + mod = make_pactp_mod(words[1]) | |
200 | + mod._order = 'post' | |
201 | + elif len(words) == 2 and 'subst:nom' in tags[0] and 'subst:gen' in tags[1]: | |
202 | + # np. ‹dziedzina wiedzy› | |
203 | + lemma = words[0] | |
204 | + mod = make_npgen_mod(words[1]) | |
205 | + elif len(words) == 2 and 'subst:nom' in tags[0] and 'ger:gen' in tags[1]: | |
206 | + # np. ‹język programowania› | |
207 | + lemma = words[0] | |
208 | + # nie mamy lexgerp, więc używamy fixed | |
209 | + mod = make_fixed_mod(words[1]) | |
210 | + mod._order = 'post' | |
211 | + elif len(words) == 3 and 'subst:nom' in tags[0] and 'prep:gen' in tags[1] and 'subst:gen' in tags[2]: | |
212 | + # np. ‹maszyna do szycia› | |
213 | + lemma = words[0] | |
214 | + mod = make_prepnp_mod(words[2], words[1], 'gen') | |
215 | + else: | |
216 | + print(lemma, tags) | |
217 | + 1/0 | |
218 | + | |
219 | + if lemma == 'lata': | |
220 | + return 'rok', 'm3', 'pl', 'subst', mod | |
221 | + if lemma in GERUNDS: | |
222 | + return lemma, 'n', 'sg', 'subst', mod | |
223 | + | |
224 | + subst_sg_interps = get_interps(lemma, lemma=lemma, tag_constraints=['subst', 'sg', 'nom']) | |
225 | + if subst_sg_interps: | |
226 | + return lemma, get_gender(subst_sg_interps), 'sg', 'subst', mod | |
227 | + subst_pl_interps = get_interps(lemma, lemma=lemma, tag_constraints=['subst', 'pl', 'nom']) | |
228 | + if subst_pl_interps: | |
229 | + # lemat „mnogi” notowany w Morfeuszu jako plurale tantum, np. ‹środki› | |
230 | + return lemma, get_gender(subst_pl_interps), 'pl', 'subst', mod | |
231 | + pt_interps = get_interps(lemma, tag_constraints=['subst', 'pl', 'nom']) | |
232 | + if pt_interps: | |
233 | + # lemat „mnogi” nie notowany w Morfeuszu, jako plurale tantum, np. ‹pieniądze› | |
234 | + lemmata = set(lemma for lemma, tag in pt_interps) | |
235 | + if len(lemmata) == 1: | |
236 | + return lemmata.pop(), get_gender(pt_interps), 'pl', 'subst', mod | |
237 | + if get_interps(lemma, lemma=lemma, tag_constraints=['adj', 'sg', 'nom', 'm1']): | |
238 | + # przymiotnik | |
239 | + return lemma, None, 'sg', 'adj', mod | |
240 | + ger_interps = get_interps(lemma, tag_constraints=['ger', 'sg', 'nom']) | |
241 | + if ger_interps: | |
242 | + # gerundium | |
243 | + lemmata = set(lemma for lemma, tag in ger_interps) | |
244 | + if len(lemmata) == 1: | |
245 | + return lemmata.pop(), 'n', 'sg', 'ger', mod | |
246 | + | |
247 | + print('\n\n***===============================') | |
248 | + print(lemma) | |
249 | + print(get_interps(lemma)) | |
250 | + 1/0 | |
251 | + print('***===============================\n\n') | |
252 | + | |
253 | + ''' | |
254 | + # TODO rodzaj w zależności od hiperonimów? | |
255 | + if lemma == 'członek': | |
256 | + return lemma, 'sg', 'subst', mod | |
257 | + try: | |
258 | + get_form(lemma, ['subst', 'sg', 'nom']) | |
259 | + return lemma, 'sg', 'subst', mod | |
260 | + except: | |
261 | + pass | |
262 | + try: | |
263 | + # lemat „mnogi” notowany w Morfeuszu jako plurale tantum, np. ‹środki› | |
264 | + get_form(lemma, ['subst', 'pl', 'nom']) | |
265 | + return lemma, 'pl', 'subst', mod | |
266 | + except: | |
267 | + pass | |
268 | + try: | |
269 | + # przymiotnik | |
270 | + get_form(lemma, ['adj', 'sg', 'nom', 'm1']) | |
271 | + return lemma, 'sg', 'adj', mod | |
272 | + except: | |
273 | + # lemat „mnogi” nie notowany w Morfeuszu, jako plurale tantum, np. ‹pieniądze› | |
274 | + subst_pl_nom_lemmata = set(interp[2][1].split(':')[0] for interp in morfeusz.analyse(lemma) if interp[2][2].startswith('subst:pl:nom')) | |
275 | + if len(subst_pl_nom_lemmata) == 1: | |
276 | + return subst_pl_nom_lemmata.pop(), 'pl', 'subst', mod | |
277 | + print('============', lemma) | |
278 | + print('============', subst_pl_nom_lemmata) | |
279 | + raise | |
280 | + ''' | |
281 | + | |
# Frequencies of "<preposition> <word form>" bigrams, loaded once at import
# time from a tab-separated file (one "<bigram>\t<count>" record per line).
# Missing bigrams read as 0 because this is a Counter.
PREP_2GRAMS = Counter()
# The file contains Polish word forms, so decode it explicitly as UTF-8
# instead of relying on the platform's locale default.
# NOTE(review): assumes the data file is UTF-8 encoded — confirm.
with open(os.path.join(BASE_DIR, 'data/freq/2grams_prep_nkjp'), encoding='utf-8') as f:
    for line in f:
        digram, freq = line.strip('\n').split('\t')
        PREP_2GRAMS[digram] = int(freq)
288 | + | |
# Candidate (preposition, case) pairs used to realise an adverbial xp
# category as a prepositional nominal phrase.
XP2PREPNP = {
    'abl': (
        ('z', 'gen'),
    ),
    # do domu / na basen
    'adl': (
        ('do', 'gen'),
        ('na', 'acc'),
    ),
    # w mieście, na wsi, u Janka
    'locat': (
        ('w', 'loc'),
        ('na', 'loc'),
        ('u', 'gen'),
    ),
    'perl': (
        ('przez', 'acc'),
    ),
    'temp': (
        ('podczas', 'gen'),
    ),
    'dur': (
        ('przez', 'acc'),
    ),
}
299 | + | |
def xp2prepnp(advcat, lemma, num):
    """Pick the (preposition, case) pair realising xp category *advcat*.

    Returns ``(None, None)`` when the category has no entry in XP2PREPNP.
    A category with a single candidate returns it directly; otherwise the
    candidate whose "<preposition> <inflected lemma>" bigram has the highest
    count in PREP_2GRAMS wins (ties are broken by the ordering of the
    (preposition, case) tuples).
    """
    candidates = XP2PREPNP.get(advcat)
    if candidates is None:
        return None, None
    if len(candidates) == 1:
        return candidates[0]
    scored = []
    for preposition, case in candidates:
        inflected = get_form(lemma, ['subst', num, case])[0]
        bigram = '{} {}'.format(preposition, inflected)
        # negate the count so that ascending sort puts the most frequent first
        scored.append((-PREP_2GRAMS[bigram], (preposition, case)))
    scored.sort()
    return scored[0][1]
314 | + | |
# Complex preposition used to realise an adverbial xp category.
XP2COMPREPNP = {
    'caus': 'z powodu',
    # TODO: animate: dla ..., inanimate: w celu ...
    'dest': 'w celu',
    'instr': 'za pomocą',
}
321 | + | |
def generate_phrases(function, negativity, phrase, lemma, is_predef, head_gender, controller=None, controller_grammar=None):
    """Generate surface descriptions realising *lemma* as *phrase*.

    Parameters:
        function: syntactic function value of the position (compared with
            'subj' / None here) and forwarded to make_phraseologisms().
        negativity: negativity marker; only the value 'neg' is treated
            specially in this function.
        phrase: importer phrase object; its ``_name`` selects the branch.
        lemma: lemma to realise; normalised through process_lemma().
        is_predef: when true and *phrase* is an unrestricted xp, the result
            is looked up in PREDEFXP instead of being generated.
        head_gender: gender used for agreeing forms (adjp falls back to
            'm1' when it is falsy).
        controller, controller_grammar: forwarded to make_compar() /
            make_phraseologisms().

    Returns:
        A ``(phrases, gender, number)`` tuple where *phrases* is a list of
        strings. Gender and number are only meaningful for some phrase
        types and are ``None`` otherwise.

    Raises:
        ZeroDivisionError: raised on purpose (``1 / 0``) for clause types
            this function does not know how to describe.
        AssertionError: when no phrase could be generated.
    """
    phrase_type = phrase._name
    dummy_id = None

    if is_predef and phrase_type == 'xp' and not phrase._category._limitations:
        advcat = phrase._category._value
        # e.g. „komuś podobało się gdzieś”
        return [PREDEFXP[advcat][lemma]], 'n', 'sg'

    distrp = False
    processed_lemma, gend, num, pos, mod = process_lemma(lemma, phrase_type)
    if phrase_type in ('adjp', 'prepadjp') and pos != 'adj':
        # e.g. aborcja - Manner - lek - adjp(agr)/xp(instr) -> ‹jakaś aborcja›
        processed_lemma, gend, pos, mod = 'jakiś', None, 'adj', NATR
    if phrase_type == 'nonch':
        phrase_type = 'np'
        phrase = NP(Case('nom'), dummy_id)
        # nonch can only be realised by ‹coś› and the like
        processed_lemma, gend, pos, mod = 'coś', 'n', 'subst', NATR
        # ... and is processed further as an np
    if phrase_type == 'distrp':
        # ‘po jabłku’ would be fine, but e.g. ‘po pieniądzach’ sounds silly,
        # so build np(gen) and prepend ‘po ileś’ (of something) later
        distrp = True
        phrase_type = 'np'
        phrase = NP(Case('gen'), dummy_id)
        # ... and is processed further as an np

    print('PHRASE TYPE:', phrase_type, 'LEMMA:', processed_lemma, 'MODIFICATION:', mod, 'FUNCTION:', function)
    words = Words('concat', 'xor', [processed_lemma])

    # TODO
    if phrase_type in ('cp', 'ncp', 'prepncp'):
        cptype = phrase._type._value
        assert(cptype in ('int', 'rel') or not phrase._type._realisations)
        phr = None
        if cptype == 'int':
            if phrase._type._realisations:
                phr = '/'.join(phrase._type._realisations) + ' …'
            else:
                phr = 'kto/co/jak/… robi'
        elif cptype == 'rel':
            if phrase._type._realisations:
                phr = '/'.join(phrase._type._realisations) + ' …'
            else:
                print(phrase)
                # deliberate crash: relative clause without realisations
                1 / 0
        elif cptype == 'żeby2':
            # NOTE(review): the previous code computed a complementiser
            # ('że', or 'żeby' under negation) but never used it; the
            # emitted text is kept unchanged — confirm the intent.
            phr = 'że coś się stało'
        elif cptype in ('żeby', 'jakoby', 'jakby',):
            phr = '{} coś się stało'.format(cptype)
        elif cptype in ('że', 'bo', 'gdy', 'jak', 'jeśli', 'kiedy',):
            phr = '{} coś się dzieje'.format(cptype)
        elif cptype in ('aż', 'zanim',):
            phr = '{} coś się stanie'.format(cptype)
        else:
            print(phrase)
            # deliberate crash: unsupported clause type
            1 / 0
        if phrase_type == 'cp':
            return [phr], 'n', 'sg'
        if phrase_type == 'ncp':
            return ['{}, {}'.format(TO[phrase._case._value], phr)], 'n', 'sg'
        if phrase_type == 'prepncp':
            return ['{} {}, {}'.format(phrase._prep._value, TO[phrase._prep._case._value], phr)], 'n', 'sg'
    if phrase_type == 'or':
        # TODO? absurd „coś się dzieje”? absurd: coś się dzieje?
        return ['„coś się dzieje”'], 'n', 'sg'
    if phrase_type in ('refl', 'recip'):
        # TODO?
        return ['się'], None, None
    if phrase_type == 'advp':
        # TODO!
        if pos == 'adj':
            return [adj2adv(processed_lemma)], None, None
        # nothing better can be produced for non-adjectives anyway
        return ['jakoś'], None, None
    if phrase_type == 'infp':
        # TODO?
        return ['coś robić' if negativity != 'neg' else 'czegoś robić'], 'n', 'sg'
    if phrase_type == 'E':
        # TODO?
        return ['∅'], 'n', 'sg'

    if pos == 'adj' and phrase_type not in ('possp', 'adjp', 'prepadjp',):
        # TODO? e.g. aktualizacja - Manner - automatyczny - xp(instr)
        # TODO generated badly for chlastać, but there the Instrument has the
        # adjectival preference ‹ostry›; it should rather be ‹ostrze›
        phrase_type = 'adjp'
        phrase = AdjP(Case('agr'), dummy_id)
        # ... and is processed further as an adjp

    if phrase_type == 'possp' and processed_lemma == 'czyjś':
        return [get_form(processed_lemma, ['sg', 'nom', head_gender, 'pos'])[0]], None, None
    if phrase_type == 'comprepnp':
        # TODO multi-word! ‹abonament w wysokości środków pieniężnych›
        # TODO „w czyjejś sprawie”, „na czyjąś rzecz” might be nicer, but
        # that is somewhat harder
        return make_comprepnp(phrase._prep._value, words, num, mod), None, None

    lex_phrases = []
    phrases = []

    if phrase_type == 'np':
        # gerund; TODO? a list of exceptions if there are more
        if (processed_lemma, function, phrase._case._value) == ('przyrządzanie', 'subj', 'str'):
            return ['przyrządzanie'], 'n', 'sg'
        if (processed_lemma, function, phrase._case._value) == ('szarpnięcie', None, 'inst'):
            return ['szarpnięciem'], 'n', 'sg'
        lex_phrases.append(LexNP(phrase, num, words, mod, dummy_id))
    if phrase_type == 'possp':
        np = NP(Case('gen'), dummy_id)
        lex_phrases.append(LexNP(np, num, words, mod, dummy_id))
    if phrase_type == 'prepnp':
        # gerund; TODO? a list of exceptions if there are more
        if (processed_lemma, phrase._prep._case._value, phrase._prep._value) == ('przyrządzanie', 'gen', 'do'):
            return ['do przyrządzania'], None, None
        if phrase._prep._value in ('między', 'pomiędzy', 'wśród', 'pośród') and processed_lemma not in ('ktoś', 'coś'):
            num = 'pl'
        if pos == 'subst':
            lex_phrases.append(LexPrepNP(phrase, num, words, mod, dummy_id))
        if pos == 'ger':
            lex_phrases.append(LexPrepGerP(phrase, num, 'aff', words, '', mod, dummy_id))
    if phrase_type == 'adjp':
        # TODO! gender & control
        lex_phrases.append(LexAdjP(phrase, 'sg', head_gender if head_gender else 'm1', 'pos', words, mod, dummy_id))
    if phrase_type == 'prepadjp':
        lex_phrases.append(LexPrepAdjP(phrase, 'sg', 'm1', 'pos', words, mod, dummy_id))
    if phrase_type == 'compar':
        lex_phrases.append(make_compar(phrase, words, num, mod, controller))
    if phrase_type == 'xp':
        if phrase._category._limitations:
            # a restricted xp is described by the union of its realisations
            for realisation in phrase._category._limitations:
                phrs, g, n = generate_phrases(function, negativity, realisation, lemma, is_predef, head_gender)
                for phr in phrs:
                    if phr not in phrases:
                        phrases.append(phr)
            return phrases, 'n', 'sg'
        else:
            advcat = phrase._category._value
            if advcat == 'mod':
                phrase2 = NP(Case('inst'), dummy_id)
                lex_phrases.append(LexNP(phrase2, num, words, mod, dummy_id))
            prep, case = xp2prepnp(advcat, processed_lemma, num)
            if prep:
                phrase2 = PrepNP(Preposition(prep, Case(case)), dummy_id)
                lex_phrases.append(LexPrepNP(phrase2, num, words, mod, dummy_id))
            if advcat in XP2COMPREPNP:
                # Bound before both branches: it used to be assigned only in
                # the 'subst' branch, so the 'ger' branch failed with
                # UnboundLocalError.
                comprep = XP2COMPREPNP[advcat]
                if pos == 'subst':
                    return make_comprepnp(comprep, words, num, mod), None, None
                if pos == 'ger':
                    assert(mod == NATR)
                    return ['{} {}'.format(comprep, get_form(processed_lemma, ['ger', num, 'gen', head_gender])[0])], 'n', 'sg'

    for lex_phrase in lex_phrases:
        for phr in make_phraseologisms(lex_phrase, function, negativity, controller=controller, controller_grammar=controller_grammar):
            # TODO? a proper list of exceptions if there are more
            if phr == 'na członek rodziny':
                phr = 'na członka rodziny'
            if distrp:
                # po iluś facetów/po ileś dziewczyn/kotów...
                phr = 'po {} {}'.format('iluś' if gend == 'm1' else 'ileś', phr)
            # De-duplicate on the final text; checking before the rewrites
            # above let rewritten duplicates through.
            if phr not in phrases:
                phrases.append(phr)

    assert(phrases)
    return phrases, gend if phrase_type == 'np' else None, num if phrase_type == 'np' else None
490 | + | |
def get_lex_gender_number(phrase):
    """Return ``(gender, number)`` for a lexicalised nominal or numeral phrase.

    Only LexNP and LexNumP instances are handled; any other phrase yields
    ``(None, None)``. In both cases only the first lemma is inspected, since
    the first expansion is taken for the whole meaning description.
    """
    if isinstance(phrase, LexNP):
        number = phrase._number
        head = phrase._words._lemmas[0]
        # lemmas whose gender is fixed by hand instead of being looked up
        fixed_genders = {'siebie': 'm1', 'łupień': 'm2'}
        if head in fixed_genders:
            gender = fixed_genders[head]
        else:
            gender = get_gender(get_interps(head, lemma=head, tag_constraints=['subst', 'nom']))
        # an unspecified number ('_') is described as singular
        return gender, ('sg' if number == '_' else number)
    if isinstance(phrase, LexNumP):
        noun = phrase._words._lemmas[0]
        gender = get_gender(get_interps(noun, lemma=noun, tag_constraints=['subst', 'nom']))
        numeral = phrase._nums._lemmas[0]
        if numeral == '2':
            accommodations = {'congr'}
        else:
            # the last tag field of the numeral's interpretations
            accommodations = {
                interp[1].split(':')[-1]
                for interp in get_interps(numeral, lemma=numeral, tag_constraints=['num', 'nom'])
            }
        assert(len(accommodations) == 1)
        (accommodation,) = accommodations
        if accommodation == 'rec':
            # wiele/pięciu/trzydzieści osiem kotów/facetów/kobiet przyszło
            return 'n', 'sg'
        # trzy kobiety/koty przyszły/trzej faceci przyszli
        return gender, 'pl'
    return None, None
537 | + | |
# Module-level memo of generated phrase descriptions.
PHRASE_CACHE = {}

# Separator placed between alternative phrase descriptions.
PHRASE_SEP = ' / '
541 | + | |
def get_phrase_description(subentry, argument, position, phrase, controller_grammar=None):
    """Describe how *phrase* realises *argument* within *position*.

    Parameters:
        subentry, argument: DB model objects.
        position, phrase: importer objects.
        controller_grammar: mapping indexed by the controlling position;
            only consulted when the position is a controllee.

    Returns:
        ``(description, gender, number)`` where *description* joins the
        alternative phrase texts with PHRASE_SEP.

    Raises:
        AssertionError: when the argument does not resolve to exactly one
            lemma (or the position has more than one control).
    """
    print()
    print(argument)
    print(phrase)
    gender, number = None, None
    function = position._function._value if position._function else None
    control = None
    if position._control:
        assert(len(position._control) == 1)
        control = position._control[0]._function
    negativity = subentry.negativity.name if subentry.negativity else '_'
    head_lemma, head_gender = subentry.entry.name, None

    controller, controller_features, controller_function = None, None, None
    if control and control.endswith('controllee'):
        print('==================', control)
        controller = position._schema.getController(control)
        controller_features = controller_grammar[controller]
        controller_function = controller._function._value if controller._function else None

    if subentry.entry.pos.tag == 'noun':
        interps = get_interps(head_lemma, lemma=head_lemma, tag_constraints=['subst', 'nom'])
        head_gender = get_gender(interps)
    # TODO
    # TODO gender, number
    # TODO (‹jakieś›) oko * (‹jakieś›) oczy *błyszczy* z powodu substancji
    if isinstance(phrase, (LexPhrase, Fixed)):
        phrs = []
        # TODO this should only be the missing [...] in lex(cp)
        try:
            for phr in make_phraseologisms(phrase, function, negativity, controller=controller, controller_grammar=controller_features):
                if phr not in phrs:
                    phrs.append(phr)
        except Exception:
            # Best-effort placeholder when generation fails; KeyboardInterrupt
            # and SystemExit are no longer swallowed here.
            phrs.append('!!!???')
        gender, number = get_lex_gender_number(phrase)
        return PHRASE_SEP.join(phrs), gender, number
    lemmata, is_predef = get_argument_lemma(argument, xp=(phrase._name == 'xp' and not phrase._category._limitations))
    if len(lemmata) != 1:
        # Explicit check (not ``assert``) so it also fires under ``python -O``.
        print(lemmata, is_predef)
        raise AssertionError('expected exactly one lemma, got {!r}'.format(lemmata))
    (lemma,) = lemmata
    key = (function, negativity, str(phrase), lemma, str(head_gender), control, controller_features, controller_function)
    if key in PHRASE_CACHE:
        lemma_phrases, gender, number = PHRASE_CACHE[key]
    else:
        lemma_phrases, gender, number = generate_phrases(function, negativity, phrase, lemma, is_predef, head_gender, controller=controller, controller_grammar=controller_features)
        PHRASE_CACHE[key] = (lemma_phrases, gender, number)
    return PHRASE_SEP.join(lemma_phrases), gender, number
598 | + | |
def get_only_value(d):
    """Return the first value of mapping *d* (IndexError when it is empty)."""
    values = tuple(d.values())
    return values[0]
601 | + | |
# Keys of the per-phrase-type records stored in PHRASE_PRIORITY.
PRIORITY, ATTR, SUBPRIORITY = 'priority', 'attr', 'subpriority'
# Rank given to anything without an explicit (sub)priority; lower wins.
LOW_PRIORITY = 200
# Sub-priorities of clause types, shared by cp, ncp and prepncp.
CP_PRIO = {
    'żeby': 0,  # że
    'kiedy': 0,  # gdy, jak
    'żeby2': 1,  # jak
    'że': 2,  # jak
    # prefer phrases introduced by complementisers where present
    'int': LOW_PRIORITY + 1,
}
# For each phrase type: its main priority, a function extracting the
# attribute that is ranked, and the sub-priority of known attribute values.
PHRASE_PRIORITY = {
    'xp': {
        PRIORITY: 10,
        ATTR: lambda phrase: phrase._category._value,
        SUBPRIORITY: {
            'adl': 0,  # nawigacja xp(adl)/xp(locat)
            'locat': 1,  # powycierać xp(abl)/xp(locat)
            'caus': 2,  # ucierpieć xp(caus)/xp(temp)
        },
    },
    'np': {
        PRIORITY: 20,
        ATTR: lambda phrase: phrase._case._value,
        SUBPRIORITY: {
            'str': 0,
        },
    },
    'prepnp': {
        PRIORITY: 22,
        ATTR: lambda phrase: (phrase._prep._value, phrase._prep._case._value),
        SUBPRIORITY: {
            ('do', 'gen'): 0,  # adekwatny do/dla; kolejka do/za
            ('za', 'inst'): 1,  # agitować za/przeciw
            # NOTE(review): ('o', 'acc') used to be listed twice (1: apel
            # o/przeciw, 3: kampania o/za); the later value silently won, so
            # only the effective value is kept — confirm which was intended.
            ('o', 'acc'): 3,
            ('w', 'acc'): 1,  # całować w/po
            ('w', 'loc'): 1,  # defilada w/na pojeździe
            ('między', 'inst'): 2,  # debata między/z/wśród
            # NOTE(review): ('o', 'loc') used to be listed twice (2: debata
            # o/wokół/nad, 4: książka o czymś/z czegoś); the later value
            # silently won, so only the effective value is kept — confirm.
            ('o', 'loc'): 4,
            ('wobec', 'gen'): 2,  # dług wobec/względem, konsekwentny wobec/dla
            ('dla', 'gen'): 3,  # certyfikat dla/za
            ('z', 'gen'): 2,  # dochód z/za/od
            ('pod', 'inst'): 4,  # kruszyć się pod/od
            ('po', 'loc'): 5,  # odlatywać od/po
            ('od', 'gen'): 6,  # podatek od/za
            ('przeciw', 'dat'): 7,  # przestępstwo z/przeciw
            ('na', 'loc'): 7,  # skoncentrować się na/nad
            ('za', 'acc'): 7,  # zabulić na/za
            ('z', 'acc'): LOW_PRIORITY + 1,  # mandat – data error, za:acc is there as well
        },
    },
    'comprepnp': {
        PRIORITY: 24,
        ATTR: lambda phrase: phrase._prep._value,
        SUBPRIORITY: {
            'w sprawie': 0,  # w kwestii
            'w zakresie': 0,  # dyletant w zakresie/w kwestii
            'w kwestii': 1,  # dyskrecja co do/w kwestii
            'z dziedziny': 1,  # referat w dziedzinie/z dziedziny
        },
    },
    'cp': {
        PRIORITY: 30,
        ATTR: lambda phrase: phrase._type._value,
        SUBPRIORITY: CP_PRIO,
    },
    'ncp': {
        PRIORITY: 32,
        ATTR: lambda phrase: phrase._type._value,
        SUBPRIORITY: CP_PRIO,
    },
    'prepncp': {
        PRIORITY: 34,
        ATTR: lambda phrase: phrase._type._value,
        SUBPRIORITY: CP_PRIO,
    },
}
680 | + | |
def get_phrase_priority(phrase):
    """Return the ``(priority, subpriority)`` pair ranking *phrase*.

    A lexicalised phrase is ranked through the phrase returned by its
    ``_lex_phrase()`` method, one step below the plain phrase of that type.
    A restricted xp is ranked by its first limitation. Phrase types absent
    from PHRASE_PRIORITY get ``(LOW_PRIORITY, LOW_PRIORITY)``.
    """
    is_lex = isinstance(phrase, LexPhrase)
    if is_lex:
        phrase = phrase._lex_phrase()
    kind = phrase._name
    if kind == 'xp' and phrase._category._limitations:
        # TODO? heuristic: take the first one
        phrase = phrase._category._limitations[0]
        kind = phrase._name
    spec = PHRASE_PRIORITY.get(kind)
    if spec is None:
        return (LOW_PRIORITY, LOW_PRIORITY)
    ranked_attr = spec[ATTR](phrase)
    # lower the priority by 1 for lexes, eg. dostępność prepnp(dla, gen)/lex(prepnp(‹dla kieszeni›))
    main = spec[PRIORITY] + (1 if is_lex else 0)
    secondary = spec[SUBPRIORITY].get(ranked_attr, LOW_PRIORITY)
    return (main, secondary)
695 | + | |
# Known unresolved cases (two lexicalised phrases of the same type):
# TODO!! dzwonić – two lex(prepnp(w,loc))!
# TODO!! kapać – two lex(np(inst))!
# TODO!! popukać – two lex(prepnp(do,gen))!
# TODO!! przeczyć – two lex(np(dat))!
# TODO!! pukać – two lex(prepnp(do,gen))!
# TODO!! regenerować – two lex(np(str))!
# TODO!! rosić – two lex(np(inst))!
# TODO!! spychać – two lex(prepnp(na,acc))!
# TODO!! szwankować – two lex(prepnp(na,loc))!
# TODO!! wypchać – two lex(np(inst))!
# TODO!! zapukać – two lex(prepnp(do,gen))!
# TODO!! zepchnąć – two lex(prepnp(na,acc))!
# TODO!! zrosić – two lex(np(inst))!
def select_phrase_description(position, phrase_descriptions):
    """Choose the phrase description to use in the realisation description.

    Parameters:
        position: importer object (not inspected here).
        phrase_descriptions: dict mapping a phrase importer object to its
            ``(description, gender, number)`` tuple.

    Returns:
        The ``(description, gender, number)`` tuple of the single phrase
        with the best (lowest) priority.

    Raises:
        ZeroDivisionError: raised on purpose (``1/0``), after printing
            diagnostics, when several phrases share the best priority.
    """
    if len(phrase_descriptions) == 1:
        chosen = get_only_value(phrase_descriptions)
        assert(chosen[0] != '???')
        return chosen
    by_priority = defaultdict(set)
    for candidate, description in phrase_descriptions.items():
        by_priority[get_phrase_priority(candidate)].add((candidate, description))
    best = by_priority[min(by_priority.keys())]
    if len(best) == 1:
        _, chosen = best.pop()
        assert (chosen[0] != '???')
        return chosen
    # TODO? napsuć zdrowia/nerwów
    if {description[0] for description in phrase_descriptions.values()} == {'zdrowia', 'nerwów'}:
        return ('zdrowia i nerwów', 'n', 'pl')
    # ambiguous: dump everything and crash deliberately
    for candidate, description in phrase_descriptions.items():
        print('***', type(candidate))
        print('*** ', candidate, description)
    for priority, group in sorted(by_priority.items()):
        print('===', priority)
        print('=== ', group)
    1/0
740 | + | |
741 | + | |
# Base ordering rank of an argument by the syntactic function of its
# position (lower comes first); None stands for "no function".
FUNCTION_RANK = {
    'subj': 0, 'head': 0,
    'obj': 2,
    None: 4,
}
748 | + | |
def is_np(phrase, case):
    """Tell whether *phrase* is an np (plain or lexicalised) in *case*."""
    if phrase._name != 'np':
        return False
    # a lexicalised phrase keeps the underlying np in ``_np``
    nominal = phrase._np if isinstance(phrase, LexPhrase) else phrase
    return nominal._case._value == case
756 | + | |
# TODO: possp at the front only if it is adjectival
def get_argument_realisation_priority(ar, entry_pos):
    """Compute the three-level sort key of an argument realisation.

    Returns ``[rank1, rank2, rank3]``: *rank1* orders by syntactic function
    with several overrides applied in sequence (later ones win), *rank2* by
    phrase type (refl/recip, then nominal, then the rest) and *rank3* is the
    ``(role priority, attribute priority)`` pair read from the database.
    """
    position = ar._position
    function = position._function._value if position._function else None
    # first rank by subj or possp, obj, rest
    rank1 = FUNCTION_RANK[function]
    phrases = position._phrases
    names = {p._name for p in phrases}
    if names == {'possp'} or (names == {'adjp'} and entry_pos == 'noun'):
        # jakieś COŚ, but UCZYNIĆ kogoś jakimś
        rank1 = 0
    # np(dat) after verb ‹ktoś daje komuś coś›
    if any(is_np(p, 'dat') for p in phrases):
        rank1 = 1
    # np(str) without function (TODO? error in data, e.g. chwytać ustami *powietrze* – should be obj?)
    if function is None and any(is_np(p, 'str') for p in phrases):
        rank1 = 3
    # clauses at the end
    if names <= {'cp', 'ncp', 'prepncp'}:
        rank1 = 5
    # then rank by phrase type: refl/recip, then nominal, then rest
    if names & {'refl', 'recip'}:
        rank2 = 0
    elif 'np' in names:
        rank2 = 1
    else:
        rank2 = 2
    # finally rank by semantic argument priority
    sem_role = ar._argument._semantic_role
    role_prio = SemanticRole.objects.get(role=sem_role._value).priority
    if sem_role._attribute:
        attribute_prio = RoleAttribute.objects.get(attribute=sem_role._attribute).priority
    else:
        attribute_prio = 0
    return [rank1, rank2, (role_prio, attribute_prio)]
788 | + | |
def rerank(ars):
    """Move np(dat) arguments to the front when nothing else is there.

    *ars* is a sequence of ``(rank, fallback, ar)`` triples. If any triple
    already has ``rank[0] == 0`` the input is returned unchanged; otherwise
    the np(dat) triples get their first rank component replaced by 0 and
    are placed before the remaining ones.
    """
    print(ars)
    leading, trailing, datives = [], [], []
    for rank, fb, ar in ars:
        triple = (rank, fb, ar)
        if rank[0] == 0:
            leading.append(triple)
        elif any(is_np(p, 'dat') for p in ar._position._phrases):
            datives.append(triple)
        else:
            trailing.append(triple)
    if leading:
        return ars
    #assert(len(np_dat) <= 1) #TODO? entry: daleki
    promoted = [([0] + rank[1:], fb, ar) for rank, fb, ar in datives]
    return promoted + trailing
805 | + | |
# Ordering hints for multi-position Lemma arguments, e.g. dostać się
# z deszczu pod rynnę: within each idiom the part that comes first is
# ranked 1 and the part that follows is ranked 2.
FALLBACK = {
    'z deszczu': 1, 'pod rynnę': 2,
    'od ściany': 1, 'do ściany': 2,
    'żywcem': 1, 'ze skóry': 2,
    'pięknym': 1, 'za nadobne': 2,
    'od Annasza': 1, 'do Kajfasza': 2,
    'z (brudnymi) buciorami / z (swoimi) buciorami / z (brudnymi swoimi) buciorami / z (brudnymi) butami / z (swoimi) butami / z (brudnymi swoimi) butami': 1,
    'do łóżka / do łóżek': 2,
    'samego': 1, 'w (‹jakieś›) ręce': 2,
    'z (‹jakiejś›) radości / z (‹jakiegoś›) szczęścia': 1, 'pod sufit': 2,
    'z jednej skrajności': 1, 'w drugą': 2,
    'ze skrajności': 1, 'w skrajność': 2,
    'z motyką': 1, 'na słońce': 2,
    'z nogi': 1, 'na nogę': 2,
    'z pustego': 1, 'w próżne': 2,
    'z (‹jakiejś›) klasy': 1, 'do (‹jakiejś›) klasy': 2,
    'z (‹jakiegoś›) kwiatka': 1, 'na (‹jakiś›) kwiatek': 2,
    'w dno': 1, 'od spodu': 2,
    'po rozum': 1, 'do głowy': 2,
    'z pazurami / z pięściami': 1, 'do oczu': 2,
    'na ziemię': 1, 'z obłoków': 2,
    'prosto': 1, 'w (‹jakieś›) serce / w (‹jakieś›) serca': 2,
    'z rąk': 1, 'do rąk': 2,
    'z ręki': 1, 'do ręki': 2,
    'o pomstę': 1, 'do nieba': 2,
    'ze zbiornika': 1, 'do zbiornika': 2,
}
858 | + | |
def fallback(description):
    """Return the FALLBACK ordering hint for *description*, or 0 if none."""
    try:
        return FALLBACK[description]
    except KeyError:
        return 0
861 | + | |
# Entries inflected with the 'winien' tag rather than as regular verbs.
WINIEN = ('powinien', 'winien')
863 | + | |
864 | +# realisation: importer object | |
865 | +# subentry: DB model object | |
866 | +# TODO wszystkie lex-y chyba powinny wejść do tej reprezentacji, | |
867 | +# np. ktoś babrze ‹sobie› ‹rączki›: ‹sobie› nie jest powiązane z argumentem... | |
868 | +def get_realisation_description(realisation, subentry, aspect): | |
869 | + entry = subentry.entry | |
870 | + ars = [(get_argument_realisation_priority(ar, entry.pos.tag), fallback(ar._description), ar) for ar in realisation._argument_realizations] | |
871 | + print([(p1, p2, ar._description) for p1, p2, ar in ars]) | |
872 | + ars = sorted(ars) | |
873 | + if entry.pos.tag == 'verb': | |
874 | + # dla innych nie przesuwamy np(dat): bliski *komuś* | |
875 | + ars = rerank(ars) | |
876 | + before = [ar._description.split(PHRASE_SEP)[0] for rank, fallback, ar in ars if rank[0] == 0] | |
877 | + after = [ar._description.split(PHRASE_SEP)[0] for rank, fallback, ar in ars if rank[0] > 0] | |
878 | + subj_ars = [ar for ar in realisation._argument_realizations if ar._position._function and ar._position._function._value == 'subj'] | |
879 | + assert(len(subj_ars) <= 1) | |
880 | + subj_ar = subj_ars[0] if subj_ars else None | |
881 | + entry_form = entry.name | |
882 | + if entry.name == 'naleźć': | |
883 | + #TODO błąd w słowniku | |
884 | + aspect = 'perf' | |
885 | + if entry.name == 'bootować': | |
886 | + # nienotowane w Morfeuszu | |
887 | + entry_form = 'bootuje' | |
888 | + elif entry.name == 'wtyczkować': | |
889 | + # nienotowane w Morfeuszu | |
890 | + entry_form = 'wtyczkuje' | |
891 | + elif entry.pos.tag == 'verb': | |
892 | + assert(aspect) | |
893 | + entry_base = entry.name | |
894 | + if entry_base == 'doprząc': | |
895 | + entry_base = 'doprzęgnąć' | |
896 | + if aspect == '_': | |
897 | + # eg. aresztować | |
898 | + aspect = 'imperf' | |
899 | + try: | |
900 | + subj_num = subj_ar._number if subj_ar else 'sg' | |
901 | + if subj_ar and (aspect == 'perf' or entry_base in WINIEN): | |
902 | + # potrzebne tylko dla dokonanych (zrobił/a/o) i winien/na | |
903 | + if subj_ar._gender: | |
904 | + subj_gend = subj_ar._gender | |
905 | + else: | |
906 | + print('##############', subj_ar) | |
907 | + print('##############', subj_ar._position._phrases) | |
908 | + print('##############', subj_ar._argument) | |
909 | + 1/0 | |
910 | + else: | |
911 | + # no subject: ‹jestem kotem — olśniło kogoś› | |
912 | + subj_gend = 'n' | |
913 | + if entry_base in WINIEN: | |
914 | + entry_form = get_form(entry_base, ['winien', subj_num, subj_gend, 'imperf'])[0] | |
915 | + elif aspect == 'imperf': | |
916 | + # niedokonane: fin (cz. teraźnieszy) | |
917 | + # TODO? lista wyjątków, jeśli będzie więcej | |
918 | + if entry_base == 'sparować' and subj_num == 'sg': | |
919 | + # bokser sparuje — imperf nienotowane w Morfeuszu | |
920 | + entry_form = 'sparuje' | |
921 | + else: | |
922 | + entry_form = get_form(entry_base, ['fin', subj_num, 'ter', 'imperf'])[0] | |
923 | + else: | |
924 | + # dokonane: praet (cz. przeszły) | |
925 | + # TODO? lista wyjątków, jeśli będzie więcej | |
926 | + if entry_base == 'nasuwać' and (subj_num, subj_gend) == ('sg', 'm1'): | |
927 | + # „Nasuwał się mebli przy odnawianiu mieszkania.” — perf nienotowane w Morfeuszu | |
928 | + entry_form = 'nasuwał' | |
929 | + elif entry_base == 'wybzykać' and (subj_num, subj_gend) == ('sg', 'm1'): | |
930 | + # nienotowane w Morfeuszu | |
931 | + entry_form = 'wybzykał' | |
932 | + elif entry_base == 'wytuszować' and (subj_num, subj_gend) == ('sg', 'm1'): | |
933 | + # nienotowane w Morfeuszu | |
934 | + entry_form = 'wytuszował' | |
935 | + elif entry_base == 'zależeć' and (subj_num, subj_gend) == ('sg', 'm2'): | |
936 | + # nienotowane w Morfeuszu | |
937 | + entry_form = 'zależał' | |
938 | + elif entry_base == 'zemdlić' and (subj_num, subj_gend) == ('sg', 'f'): | |
939 | + # formy inne niż „zemdliło” nienotowane w Morfeuszu | |
940 | + entry_form = 'zemdliła' | |
941 | + else: | |
942 | + entry_form = get_form(entry_base, ['praet', subj_num, subj_gend, 'perf', ['nagl', '']])[0] | |
943 | + except: | |
944 | + entry_form = get_form(entry_base, ['pred'])[0] | |
945 | + if entry.name == 'napaść' and {'wal_69620-mng', 'wal_80242-mng', 'wal_174604-mng', 'wal_174605-mng', 'wal_174603-mng', 'wal_174606-mng'}.issuperset(realisation._frame._meanings): | |
946 | + # znaczenie ‹napaść (się) jedzeniem› | |
947 | + entry_form = entry_form.replace('dł', 'sł') | |
948 | + if entry.name == 'oblec' and {'wal_85605-mng', 'wal_85615-mng'}.issuperset(realisation._frame._meanings): | |
949 | + # znaczenie ‹oblec twierdzę› | |
950 | + entry_form = entry_form.replace('kł', 'gł') | |
951 | + if entry.name == 'odpaść' and {'wal_68230-mng', 'wal_68225-mng', 'wal_79689-mng'}.issuperset(realisation._frame._meanings): | |
952 | + # znaczenie ‹odpaść (się) jedzeniem› | |
953 | + entry_form = entry_form.replace('dł', 'sł') | |
954 | + if entry.name == 'podpaść' and {'wal_86356-mng', 'wal_86350-mng', 'wal_174582-mng', 'wal_174584-mng', 'wal_174585-mng', 'wal_174586-mng'}.issuperset(realisation._frame._meanings): | |
955 | + # znaczenie ‹podpaść (się) jedzeniem› | |
956 | + entry_form = entry_form.replace('dł', 'sł') | |
957 | + if entry.name == 'popaść' and {'wal_174529-mng', 'wal_174530-mng'}.issuperset(realisation._frame._meanings): | |
958 | + # znaczenie ‹popaść (się) jedzeniem› | |
959 | + entry_form = entry_form.replace('dł', 'sł') | |
960 | + | |
961 | + if subentry.negativity and subentry.negativity.name == 'neg': | |
962 | + entry_form = 'nie ' + entry_form | |
963 | + if subentry.inherent_sie.name == 'true': | |
964 | + entry_form += ' się' | |
965 | + elements = before + ['<b>{}</b>'.format(entry_form)] + after | |
966 | + return ' '.join(elements) | |
... | ... |