Commit 1e5542913a5001225fd08dba7f8a8034b396fd2b
1 parent: f8298e06
generator frazeologii - podmioty; przygotowanie do synchronizacji ze Słowosiecią
Showing 4 changed files with 138 additions and 40 deletions
semantics/phraseology_generator.py
@@ -3,7 +3,8 @@ | @@ -3,7 +3,8 @@ | ||
3 | from dictionary.models import sort_arguments, sort_positions, sortatributes | 3 | from dictionary.models import sort_arguments, sort_positions, sortatributes |
4 | from settings import MORFEUSZ2 | 4 | from settings import MORFEUSZ2 |
5 | 5 | ||
6 | -def lexicalisation(argument): | 6 | +def lexicalisation(argument, categories, base): |
7 | + subj = is_subj(categories) | ||
7 | b = argument.type | 8 | b = argument.type |
8 | if b == 'fixed': | 9 | if b == 'fixed': |
9 | return get_words(sortatributes(argument)[-1]) | 10 | return get_words(sortatributes(argument)[-1]) |
@@ -14,15 +15,21 @@ def lexicalisation(argument): | @@ -14,15 +15,21 @@ def lexicalisation(argument): | ||
14 | lexicalisation_type = lexicalisation_parameters[0].values.all()[0].argument.type | 15 | lexicalisation_type = lexicalisation_parameters[0].values.all()[0].argument.type |
15 | lexicalisation_parameters = sortatributes(lexicalisation_parameters[0].values.all()[0].argument) | 16 | lexicalisation_parameters = sortatributes(lexicalisation_parameters[0].values.all()[0].argument) |
16 | if lexicalisation_type == 'np': # np(case), number, nouns, atr | 17 | if lexicalisation_type == 'np': # np(case), number, nouns, atr |
17 | - nps = get_nps(get_case(lexicalisation_parameters[0]), get_number(attributes[1]), get_words(attributes[2]), attributes[3]) | ||
18 | - return nps | 18 | + nps = get_nps(get_case(lexicalisation_parameters[0], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3]) |
19 | + return (nps, get_verb(base, get_number(attributes[1], subj), subj)) | ||
19 | elif lexicalisation_type == 'prepnp': #prepnp(prep, case), number, nouns, atr | 20 | elif lexicalisation_type == 'prepnp': #prepnp(prep, case), number, nouns, atr |
20 | - prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1]), get_number(attributes[1]), get_words(attributes[2]), attributes[3]) | ||
21 | - return prepnps | 21 | + prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3]) |
22 | + return (prepnps, []) | ||
22 | else: | 23 | else: |
23 | return [] | 24 | return [] |
24 | return [] | 25 | return [] |
25 | 26 | ||
27 | +def is_subj(categories): | ||
28 | + for cat in categories: | ||
29 | + if cat.category == u'subj': | ||
30 | + return True | ||
31 | + return False | ||
32 | + | ||
26 | def get_preposition(attribute): | 33 | def get_preposition(attribute): |
27 | return attribute.values.all()[0].parameter.type.name | 34 | return attribute.values.all()[0].parameter.type.name |
28 | 35 | ||
@@ -30,14 +37,20 @@ def get_words(attribute): | @@ -30,14 +37,20 @@ def get_words(attribute): | ||
30 | words = [word.text[1:-1] for word in attribute.values.all()] | 37 | words = [word.text[1:-1] for word in attribute.values.all()] |
31 | return words | 38 | return words |
32 | 39 | ||
33 | -def get_case(attribute): | 40 | +def get_case(attribute, is_subj): |
34 | case = attribute.values.all()[0].parameter.type.name | 41 | case = attribute.values.all()[0].parameter.type.name |
35 | if case == u'str': | 42 | if case == u'str': |
36 | - case = u'acc' | 43 | + if is_subj: |
44 | + case = u'nom' | ||
45 | + else: | ||
46 | + case = u'acc' | ||
37 | return case | 47 | return case |
38 | 48 | ||
39 | -def get_number(attribute): | 49 | +def get_number(attribute, is_subj): |
40 | number = attribute.values.all()[0].parameter.type.name | 50 | number = attribute.values.all()[0].parameter.type.name |
51 | + if number == u'_': | ||
52 | + if is_subj: | ||
53 | + number = u'sg' | ||
41 | return number | 54 | return number |
42 | 55 | ||
43 | def get_nps(case, number, nouns, _atr): | 56 | def get_nps(case, number, nouns, _atr): |
@@ -65,3 +78,15 @@ def get_prepnps(prep, case, number, nouns, _atr): | @@ -65,3 +78,15 @@ def get_prepnps(prep, case, number, nouns, _atr): | ||
65 | nps = get_nps(case, number, nouns, _atr) | 78 | nps = get_nps(case, number, nouns, _atr) |
66 | return [prep + ' ' + np for np in nps] | 79 | return [prep + ' ' + np for np in nps] |
67 | 80 | ||
81 | +def get_verb(inf, number, is_subj): | ||
82 | + if not is_subj: | ||
83 | + return None | ||
84 | + else: | ||
85 | + options = [(interp.orth, interp.getTag(MORFEUSZ2)) for interp in MORFEUSZ2.generate(inf.encode('utf8'))] | ||
86 | + filtered = [] | ||
87 | + for option in options: | ||
88 | + (orth, tag) = option | ||
89 | + if u'fin' in tag and u'sg' in tag and u'ter' in tag: | ||
90 | + filtered.append(option) | ||
91 | + options = filtered | ||
92 | + return [orth for orth, _ in options] |
semantics/static/js/semantics_lexical_units.js
@@ -220,20 +220,40 @@ function getMeaningsSelectionForFrame(frame_id) { | @@ -220,20 +220,40 @@ function getMeaningsSelectionForFrame(frame_id) { | ||
220 | var j; | 220 | var j; |
221 | for (j = 0; j < rows.length; j++) { | 221 | for (j = 0; j < rows.length; j++) { |
222 | var options = []; | 222 | var options = []; |
223 | + var vrb = []; | ||
224 | + var pre = []; | ||
223 | sid_alt = rows[j].split('_'); | 225 | sid_alt = rows[j].split('_'); |
224 | var sch = "schema_" + sid_alt[0] + "_"; | 226 | var sch = "schema_" + sid_alt[0] + "_"; |
225 | var k; | 227 | var k; |
226 | for (k = 0; k < schemas_content[sch].display.arguments[0].length; k++) { | 228 | for (k = 0; k < schemas_content[sch].display.arguments[0].length; k++) { |
227 | var proper = schemas_content[sch].display.arguments[0][k].csv_id + "alt_" + sid_alt[1] + "_"; | 229 | var proper = schemas_content[sch].display.arguments[0][k].csv_id + "alt_" + sid_alt[1] + "_"; |
228 | if (connected[lem].indexOf(proper) != -1) { | 230 | if (connected[lem].indexOf(proper) != -1) { |
229 | - options.push(schemas_content[sch].display.arguments[0][k].lex); | 231 | + if (schemas_content[sch].display.arguments[0][k].vrb.length > 0) { |
232 | + pre.push(schemas_content[sch].display.arguments[0][k].lex); | ||
233 | + vrb = schemas_content[sch].display.arguments[0][k].vrb; | ||
234 | + } else { | ||
235 | + options.push(schemas_content[sch].display.arguments[0][k].lex); | ||
236 | + } | ||
230 | } | 237 | } |
231 | } | 238 | } |
232 | - var lex = {lemma: base, args: options}; | ||
233 | - if (hasRefl(sch)) { | ||
234 | - lex.lemma = base + " się"; | ||
235 | - } | ||
236 | - lexicalisation.push(lex); | 239 | + if (vrb.length == 0) { |
240 | + var lex = {lemma: [base], pre: pre, args: options}; | ||
241 | + if (hasRefl(sch)) { | ||
242 | + lex.lemma = [base + " się"]; | ||
243 | + } | ||
244 | + lexicalisation.push(lex); | ||
245 | + } else { | ||
246 | + var lex = {lemma: vrb, pre: pre, args: options}; | ||
247 | + if (hasRefl(sch)) { | ||
248 | + var l = []; | ||
249 | + var k; | ||
250 | + for (k=0; k < vrb.length; k++) { | ||
251 | + l.push(vrb[k] + " się"); | ||
252 | + } | ||
253 | + lex.lemma = l; | ||
254 | + } | ||
255 | + lexicalisation.push(lex); | ||
256 | + } | ||
237 | } | 257 | } |
238 | } | 258 | } |
239 | 259 | ||
@@ -248,12 +268,11 @@ function getFormForLexicalisation(lexicalisation) { | @@ -248,12 +268,11 @@ function getFormForLexicalisation(lexicalisation) { | ||
248 | var result = ""; | 268 | var result = ""; |
249 | var i; | 269 | var i; |
250 | for (i = 0; i < lexicalisation.length; i++) { | 270 | for (i = 0; i < lexicalisation.length; i++) { |
251 | - var perms = permute(lexicalisation[i].args); | ||
252 | - var j; | ||
253 | - for (j = 0; j < perms.length; j++) { | ||
254 | - result += lexicalisationForm(lexicalisation[i].lemma, cartesian(perms[j])) | ||
255 | - } | ||
256 | - result += '<br\>'; | 271 | + var perms = permute(lexicalisation[i].args); |
272 | + var j; | ||
273 | + for (j = 0; j < perms.length; j++) { | ||
274 | + result += lexicalisationForm(lexicalisation[i].lemma, lexicalisation[i].pre, cartesian(perms[j])); | ||
275 | + } | ||
257 | } | 276 | } |
258 | return result; | 277 | return result; |
259 | } | 278 | } |
@@ -302,26 +321,79 @@ function cartesian(llist) { | @@ -302,26 +321,79 @@ function cartesian(llist) { | ||
302 | return result; | 321 | return result; |
303 | } | 322 | } |
304 | 323 | ||
305 | -function lexicalisationForm(lemma, tokenised) { | 324 | +function lexicalisationForm(lemma, pre, tokenised) { |
325 | + var list; | ||
326 | + if (pre.length == 0) { | ||
327 | + list = noSubjUnits(lemma, tokenised); | ||
328 | + } else { | ||
329 | + list = subjUnits(pre, lemma, tokenised); | ||
330 | + } | ||
331 | + | ||
306 | var display = ""; | 332 | var display = ""; |
307 | - var i; | ||
308 | - for (i = 0; i < tokenised.length; i++) { | ||
309 | - if (tokenised[i].length == 0) { | ||
310 | - display += "<br\>"; | ||
311 | - } else { | ||
312 | - var j; | ||
313 | - for (j = 0; j < lexical_units.length; j++) { | ||
314 | - if (lemma + " " + tokenised[i].join(" ") == lexical_units[j].base) { | ||
315 | - return ""; | ||
316 | - } | ||
317 | - } | ||
318 | - display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + lemma + " " + tokenised[i].join(" ") + "\">"; // TODO: unikalne wartości, wartość => dodanie odpowiedniej jednostki (nazwa jednostki w wartości?) | ||
319 | - display += lemma + " " + tokenised[i].join(" ") + "<br\>"; | 333 | + var i, j; |
334 | + for (i = 0; i < list.length; i++) { | ||
335 | + var included = false; | ||
336 | + for (j = 0; j < lexical_units.length; j++) { | ||
337 | + if (list[i] == lexical_units[j].base) { | ||
338 | + included = true; | ||
339 | + } | ||
340 | + } | ||
341 | + if (!included) { | ||
342 | + display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + list[i] + "\">" + list[i] + "<br\>"; | ||
320 | } | 343 | } |
321 | } | 344 | } |
322 | return display; | 345 | return display; |
323 | } | 346 | } |
324 | 347 | ||
348 | +function noSubjUnits(lemmata, dependants) { | ||
349 | + var result = []; | ||
350 | + var i, j; | ||
351 | + for (i = 0; i < lemmata.length; i++) { | ||
352 | + if (dependants.length == 0) { | ||
353 | + result.push(lemmata[i]); | ||
354 | + } else { | ||
355 | + for (j = 0; j < dependants.length; j++) { | ||
356 | + result.push(lemmata[i] + dependants[j].join(" ")); | ||
357 | + } | ||
358 | + } | ||
359 | + } | ||
360 | + return result; | ||
361 | +} | ||
362 | + | ||
363 | +function subjUnits(pre, lemmata, dependants) { | ||
364 | + var result = []; | ||
365 | + var i, j; | ||
366 | + var temp = noSubjUnits(lemmata, dependants); | ||
367 | + i = decapitate(dependants); | ||
368 | + var pre2 = i.heads; | ||
369 | + var temp2 = noSubjUnits(lemmata, i.bodies); | ||
370 | + for (i = 0; i < pre.length; i++) { | ||
371 | + for (j = 0; j < temp.length; j++) { | ||
372 | + result.push(pre[i] + " " + temp[j]); | ||
373 | + } | ||
374 | + for (j = 0; j < pre2.length; j++) { | ||
375 | + result.push(pre[i] + " " + pre2[j] + " " + temp2[j]); | ||
376 | + } | ||
377 | + } | ||
378 | + return result; | ||
379 | +} | ||
380 | + | ||
381 | +function decapitate(llist) { | ||
382 | + var heads = []; | ||
383 | + var bodies = []; | ||
384 | + var i; | ||
385 | + for (i = 0; i < llist.length; i++) { | ||
386 | + if (llist[i].length > 0) { | ||
387 | + var body = llist[i].slice(); | ||
388 | + var head = body[0]; | ||
389 | + body.splice(0, 1); | ||
390 | + heads.push(head); | ||
391 | + bodies.push(body); | ||
392 | + } | ||
393 | + } | ||
394 | + return {heads: heads, bodies: bodies}; | ||
395 | +} | ||
396 | + | ||
325 | 397 | ||
326 | // get readable form of lexical unit | 398 | // get readable form of lexical unit |
327 | function getLexicalUnit(luid) { | 399 | function getLexicalUnit(luid) { |
semantics/static/js/semantics_schemas.js
@@ -130,10 +130,11 @@ function schemaBody(schema, alternation, lex){ | @@ -130,10 +130,11 @@ function schemaBody(schema, alternation, lex){ | ||
130 | for (l = 0; l < display.arguments[k].length; l++) { | 130 | for (l = 0; l < display.arguments[k].length; l++) { |
131 | schema_body += '<td id="' + display.arguments[k][l].csv_id + 'alt_' + alternation + '_" class="' + display.arguments[k][l].csv_class + 'alt_' + alternation + '_" onclick="schemaClick(\'' + display.arguments[k][l].csv_id + 'alt_' + alternation +'_\', '; | 131 | schema_body += '<td id="' + display.arguments[k][l].csv_id + 'alt_' + alternation + '_" class="' + display.arguments[k][l].csv_class + 'alt_' + alternation + '_" onclick="schemaClick(\'' + display.arguments[k][l].csv_id + 'alt_' + alternation +'_\', '; |
132 | if (display.arguments[k][l].lex.length != 0) { | 132 | if (display.arguments[k][l].lex.length != 0) { |
133 | - schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\'])">'; | 133 | + schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\']'; |
134 | } else { | 134 | } else { |
135 | - schema_body += '[])">'; | 135 | + schema_body += '[]'; |
136 | } | 136 | } |
137 | + schema_body += ')">'; | ||
137 | schema_body += display.arguments[k][l].argument; | 138 | schema_body += display.arguments[k][l].argument; |
138 | schema_body += '</td>'; | 139 | schema_body += '</td>'; |
139 | if (parseInt(display.arguments[k][l].csv_id.split('_')[5]) >= 0) { | 140 | if (parseInt(display.arguments[k][l].csv_id.split('_')[5]) >= 0) { |
semantics/views.py
@@ -450,13 +450,13 @@ def ajax_schemas(request, lemma_id): | @@ -450,13 +450,13 @@ def ajax_schemas(request, lemma_id): | ||
450 | # identifier, class, argument | 450 | # identifier, class, argument |
451 | arg = [] | 451 | arg = [] |
452 | #ma["ala"] = kot | 452 | #ma["ala"] = kot |
453 | - for i, c, a in zip(idents, schema_ids, row): | 453 | + for i, c, a, p in zip(idents, schema_ids, row, ordered_positions): |
454 | astr, aobj = a | 454 | astr, aobj = a |
455 | if aobj is not None and aobj.is_phraseologic(): | 455 | if aobj is not None and aobj.is_phraseologic(): |
456 | - lex = lexicalisation(aobj) | 456 | + lex, vrb = lexicalisation(aobj, p.categories.all(), lemma.entry_obj.name) |
457 | else: | 457 | else: |
458 | - lex = [] | ||
459 | - arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex}) | 458 | + lex, vrb = ([], []) |
459 | + arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex, "vrb": vrb}) | ||
460 | display["arguments"].append(arg) | 460 | display["arguments"].append(arg) |
461 | 461 | ||
462 | schema_display["schemas"].append({"schema_id": str(schema.id), "grade": lemma.get_schema_opinion(schema), "colspan": str(max(len(schema_categories), 1)), "rowspan": str(schema_arguments_rowspan), "display": display, "phraseologic": schema.phraseologic}) | 462 | schema_display["schemas"].append({"schema_id": str(schema.id), "grade": lemma.get_schema_opinion(schema), "colspan": str(max(len(schema_categories), 1)), "rowspan": str(schema_arguments_rowspan), "display": display, "phraseologic": schema.phraseologic}) |