Commit 1e5542913a5001225fd08dba7f8a8034b396fd2b (1 parent: f8298e06)
phraseology generator - subjects; preparation for synchronization with Słowosieć (plWordNet)
Showing 4 changed files with 138 additions and 40 deletions
semantics/phraseology_generator.py
@@ -3,7 +3,8 @@
 from dictionary.models import sort_arguments, sort_positions, sortatributes
 from settings import MORFEUSZ2
 
-def lexicalisation(argument):
+def lexicalisation(argument, categories, base):
+    subj = is_subj(categories)
     b = argument.type
     if b == 'fixed':
         return get_words(sortatributes(argument)[-1])
@@ -14,15 +15,21 @@ def lexicalisation(argument):
         lexicalisation_type = lexicalisation_parameters[0].values.all()[0].argument.type
         lexicalisation_parameters = sortatributes(lexicalisation_parameters[0].values.all()[0].argument)
         if lexicalisation_type == 'np': # np(case), number, nouns, atr
-            nps = get_nps(get_case(lexicalisation_parameters[0]), get_number(attributes[1]), get_words(attributes[2]), attributes[3])
-            return nps
+            nps = get_nps(get_case(lexicalisation_parameters[0], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3])
+            return (nps, get_verb(base, get_number(attributes[1], subj), subj))
         elif lexicalisation_type == 'prepnp': #prepnp(prep, case), number, nouns, atr
-            prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1]), get_number(attributes[1]), get_words(attributes[2]), attributes[3])
-            return prepnps
+            prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3])
+            return (prepnps, [])
         else:
             return []
     return []
 
+def is_subj(categories):
+    for cat in categories:
+        if cat.category == u'subj':
+            return True
+    return False
+
 def get_preposition(attribute):
     return attribute.values.all()[0].parameter.type.name
 
@@ -30,14 +37,20 @@ def get_words(attribute):
     words = [word.text[1:-1] for word in attribute.values.all()]
     return words
 
-def get_case(attribute):
+def get_case(attribute, is_subj):
     case = attribute.values.all()[0].parameter.type.name
     if case == u'str':
-        case = u'acc'
+        if is_subj:
+            case = u'nom'
+        else:
+            case = u'acc'
     return case
 
-def get_number(attribute):
+def get_number(attribute, is_subj):
     number = attribute.values.all()[0].parameter.type.name
+    if number == u'_':
+        if is_subj:
+            number = u'sg'
     return number
 
 def get_nps(case, number, nouns, _atr):
@@ -65,3 +78,15 @@ def get_prepnps(prep, case, number, nouns, _atr):
     nps = get_nps(case, number, nouns, _atr)
     return [prep + ' ' + np for np in nps]
 
+def get_verb(inf, number, is_subj):
+    if not is_subj:
+        return None
+    else:
+        options = [(interp.orth, interp.getTag(MORFEUSZ2)) for interp in MORFEUSZ2.generate(inf.encode('utf8'))]
+        filtered = []
+        for option in options:
+            (orth, tag) = option
+            if u'fin' in tag and u'sg' in tag and u'ter' in tag:
+                filtered.append(option)
+        options = filtered
+        return [orth for orth, _ in options]
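Taken together, the Python changes make lexicalisation() subject-aware: a position's categories are scanned by is_subj(), the function now returns a pair (phrases, verb forms) instead of a bare list, and get_verb() asks Morfeusz to generate finite third-person singular forms of the entry's base verb for subject positions. The resolution rules are easy to isolate; the sketch below is illustrative only, with resolve_case and resolve_number as hypothetical stand-ins for the ORM-bound get_case and get_number above.

    # Sketch (not part of the commit): the subject-aware resolution rules.
    def resolve_case(case, is_subj):
        # Structural case ('str') realises as nominative for subjects,
        # accusative for all other positions.
        if case == u'str':
            return u'nom' if is_subj else u'acc'
        return case

    def resolve_number(number, is_subj):
        # Underspecified number ('_') defaults to singular, but only for subjects.
        if number == u'_' and is_subj:
            return u'sg'
        return number

    assert resolve_case(u'str', True) == u'nom'
    assert resolve_case(u'str', False) == u'acc'
    assert resolve_number(u'_', False) == u'_'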
semantics/static/js/semantics_lexical_units.js
@@ -220,20 +220,40 @@ function getMeaningsSelectionForFrame(frame_id) {
     var j;
     for (j = 0; j < rows.length; j++) {
         var options = [];
+        var vrb = [];
+        var pre = [];
         sid_alt = rows[j].split('_');
         var sch = "schema_" + sid_alt[0] + "_";
         var k;
         for (k = 0; k < schemas_content[sch].display.arguments[0].length; k++) {
             var proper = schemas_content[sch].display.arguments[0][k].csv_id + "alt_" + sid_alt[1] + "_";
             if (connected[lem].indexOf(proper) != -1) {
-                options.push(schemas_content[sch].display.arguments[0][k].lex);
+                if (schemas_content[sch].display.arguments[0][k].vrb.length > 0) {
+                    pre.push(schemas_content[sch].display.arguments[0][k].lex);
+                    vrb = schemas_content[sch].display.arguments[0][k].vrb;
+                } else {
+                    options.push(schemas_content[sch].display.arguments[0][k].lex);
+                }
             }
         }
-        var lex = {lemma: base, args: options};
-        if (hasRefl(sch)) {
-            lex.lemma = base + " się";
-        }
-        lexicalisation.push(lex);
+        if (vrb.length == 0) {
+            var lex = {lemma: [base], pre: pre, args: options};
+            if (hasRefl(sch)) {
+                lex.lemma = [base + " się"];
+            }
+            lexicalisation.push(lex);
+        } else {
+            var lex = {lemma: vrb, pre: pre, args: options};
+            if (hasRefl(sch)) {
+                var l = [];
+                var k;
+                for (k=0; k < vrb.length; k++) {
+                    l.push(vrb[k] + " się");
+                }
+                lex.lemma = l;
+            }
+            lexicalisation.push(lex);
+        }
     }
 }
 
@@ -248,12 +268,11 @@ function getFormForLexicalisation(lexicalisation) {
     var result = "";
     var i;
     for (i = 0; i < lexicalisation.length; i++) {
-        var perms = permute(lexicalisation[i].args);
-        var j;
-        for (j = 0; j < perms.length; j++) {
-            result += lexicalisationForm(lexicalisation[i].lemma, cartesian(perms[j]))
-        }
-        result += '<br\>';
+        var perms = permute(lexicalisation[i].args);
+        var j;
+        for (j = 0; j < perms.length; j++) {
+            result += lexicalisationForm(lexicalisation[i].lemma, lexicalisation[i].pre, cartesian(perms[j]));
+        }
     }
     return result;
 }
@@ -302,26 +321,79 @@ function cartesian(llist) {
     return result;
 }
 
-function lexicalisationForm(lemma, tokenised) {
+function lexicalisationForm(lemma, pre, tokenised) {
+    var list;
+    if (pre.length == 0) {
+        list = noSubjUnits(lemma, tokenised);
+    } else {
+        list = subjUnits(pre, lemma, tokenised);
+    }
+
     var display = "";
-    var i;
-    for (i = 0; i < tokenised.length; i++) {
-        if (tokenised[i].length == 0) {
-            display += "<br\>";
-        } else {
-            var j;
-            for (j = 0; j < lexical_units.length; j++) {
-                if (lemma + " " + tokenised[i].join(" ") == lexical_units[j].base) {
-                    return "";
-                }
-            }
-            display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + lemma + " " + tokenised[i].join(" ") + "\">"; // TODO: unique values; value => adding the appropriate unit (unit name in the value?)
-            display += lemma + " " + tokenised[i].join(" ") + "<br\>";
+    var i, j;
+    for (i = 0; i < list.length; i++) {
+        var included = false;
+        for (j = 0; j < lexical_units.length; j++) {
+            if (list[i] == lexical_units[j].base) {
+                included = true;
+            }
+        }
+        if (!included) {
+            display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + list[i] + "\">" + list[i] + "<br\>";
         }
     }
     return display;
 }
 
+function noSubjUnits(lemmata, dependants) {
+    var result = [];
+    var i, j;
+    for (i = 0; i < lemmata.length; i++) {
+        if (dependants.length == 0) {
+            result.push(lemmata[i]);
+        } else {
+            for (j = 0; j < dependants.length; j++) {
+                result.push(lemmata[i] + dependants[j].join(" "));
+            }
+        }
+    }
+    return result;
+}
+
+function subjUnits(pre, lemmata, dependants) {
+    var result = [];
+    var i, j;
+    var temp = noSubjUnits(lemmata, dependants);
+    i = decapitate(dependants);
+    var pre2 = i.heads;
+    var temp2 = noSubjUnits(lemmata, i.bodies);
+    for (i = 0; i < pre.length; i++) {
+        for (j = 0; j < temp.length; j++) {
+            result.push(pre[i] + " " + temp[j]);
+        }
+        for (j = 0; j < pre2.length; j++) {
+            result.push(pre[i] + " " + pre2[j] + " " + temp2[j]);
+        }
+    }
+    return result;
+}
+
+function decapitate(llist) {
+    var heads = [];
+    var bodies = [];
+    var i;
+    for (i = 0; i < llist.length; i++) {
+        if (llist[i].length > 0) {
+            var body = llist[i].slice();
+            var head = body[0];
+            body.splice(0, 1);
+            heads.push(head);
+            bodies.push(body);
+        }
+    }
+    return {heads: heads, bodies: bodies};
+}
+
 
 // get readable form of lexical unit
 function getLexicalUnit(luid) {
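The new noSubjUnits/subjUnits/decapitate trio is the heart of the change: an argument that arrived with verb forms (vrb) is a subject lexicalisation, its phrases go into pre, and subjUnits then emits two word orders per subject phrase: "subject + verb + dependants" and "subject + fronted first dependant token + verb + remaining tokens". A simplified Python mirror of that combination logic (names are illustrative, and the one-to-one head/unit pairing assumes the single-lemma case):

    # Sketch only: a simplified Python mirror of the JS combination logic.
    def no_subj_units(lemmata, dependants):
        # Each verb form alone, or combined with each dependant token group.
        if not dependants:
            return list(lemmata)
        return [lemma + " " + " ".join(group)
                for lemma in lemmata for group in dependants]

    def decapitate(groups):
        # Split every non-empty group into its first token and the remainder.
        nonempty = [g for g in groups if g]
        return [g[0] for g in nonempty], [g[1:] for g in nonempty]

    def subj_units(pre, lemmata, dependants):
        # Order 1: subject phrase, verb, full dependant groups.
        result = [s + " " + u for s in pre
                  for u in no_subj_units(lemmata, dependants)]
        # Order 2: subject phrase, fronted head token, verb, remaining tokens.
        heads, bodies = decapitate(dependants)
        for s in pre:
            for head, unit in zip(heads, no_subj_units(lemmata, bodies)):
                result.append(s + " " + head + " " + unit)
        return result

    print(subj_units(["SUBJ"], ["VERB"], [["dep1", "dep2"]]))
    # ['SUBJ VERB dep1 dep2', 'SUBJ dep1 VERB dep2']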
semantics/static/js/semantics_schemas.js
@@ -130,10 +130,11 @@ function schemaBody(schema, alternation, lex){
     for (l = 0; l < display.arguments[k].length; l++) {
         schema_body += '<td id="' + display.arguments[k][l].csv_id + 'alt_' + alternation + '_" class="' + display.arguments[k][l].csv_class + 'alt_' + alternation + '_" onclick="schemaClick(\'' + display.arguments[k][l].csv_id + 'alt_' + alternation +'_\', ';
         if (display.arguments[k][l].lex.length != 0) {
-            schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\'])">';
+            schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\']';
         } else {
-            schema_body += '[])">';
+            schema_body += '[]';
         }
+        schema_body += ')">';
         schema_body += display.arguments[k][l].argument;
         schema_body += '</td>';
         if (parseInt(display.arguments[k][l].csv_id.split('_')[5]) >= 0) {
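The semantics_schemas.js tweak is a small refactor: the closing ')">' of the onclick attribute is factored out of the if/else, so the handler string is terminated exactly once whichever branch runs. Either way the generated cell keeps the shape (attribute values here are illustrative):

    <td id="arg_..._alt_0_" class="arg_..._alt_0_" onclick="schemaClick('arg_..._alt_0_', ['kolano', 'kolana'])">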
semantics/views.py
@@ -450,13 +450,13 @@ def ajax_schemas(request, lemma_id):
             # identifier, class, argument
             arg = []
             #ma["ala"] = kot
-            for i, c, a in zip(idents, schema_ids, row):
+            for i, c, a, p in zip(idents, schema_ids, row, ordered_positions):
                 astr, aobj = a
                 if aobj is not None and aobj.is_phraseologic():
-                    lex = lexicalisation(aobj)
+                    lex, vrb = lexicalisation(aobj, p.categories.all(), lemma.entry_obj.name)
                 else:
-                    lex = []
-                arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex})
+                    lex, vrb = ([], [])
+                arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex, "vrb": vrb})
             display["arguments"].append(arg)
 
             schema_display["schemas"].append({"schema_id": str(schema.id), "grade": lemma.get_schema_opinion(schema), "colspan": str(max(len(schema_categories), 1)), "rowspan": str(schema_arguments_rowspan), "display": display, "phraseologic": schema.phraseologic})
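On the view side, ajax_schemas now zips ordered_positions into the loop so that each argument's position categories reach lexicalisation(), and every argument entry in the JSON payload gains a "vrb" field, which the JavaScript above uses to recognise subject lexicalisations. One entry now has roughly this shape (keys are from the code; values are illustrative):

    {
        "csv_id": "arg_1_",           # cell identifier
        "csv_class": "...",
        "argument": "subj{np(str)}",  # rendered argument label
        "lex": ["blady strach"],      # lexicalised phrases, [] if none
        "vrb": ["padnie"],            # finite verb forms of the entry verb for subjects; empty otherwise
    }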