Commit 1e5542913a5001225fd08dba7f8a8034b396fd2b
1 parent: f8298e06
generator frazeologii - podmioty; przygotowanie do synchronizacji ze Słowosiecią
Showing 4 changed files with 138 additions and 40 deletions
semantics/phraseology_generator.py
@@ -3,7 +3,8 @@ | @@ -3,7 +3,8 @@ | ||
3 | from dictionary.models import sort_arguments, sort_positions, sortatributes | 3 | from dictionary.models import sort_arguments, sort_positions, sortatributes |
4 | from settings import MORFEUSZ2 | 4 | from settings import MORFEUSZ2 |
5 | 5 | ||
6 | -def lexicalisation(argument): | 6 | +def lexicalisation(argument, categories, base): |
7 | + subj = is_subj(categories) | ||
7 | b = argument.type | 8 | b = argument.type |
8 | if b == 'fixed': | 9 | if b == 'fixed': |
9 | return get_words(sortatributes(argument)[-1]) | 10 | return get_words(sortatributes(argument)[-1]) |
@@ -14,15 +15,21 @@ def lexicalisation(argument): | @@ -14,15 +15,21 @@ def lexicalisation(argument): | ||
14 | lexicalisation_type = lexicalisation_parameters[0].values.all()[0].argument.type | 15 | lexicalisation_type = lexicalisation_parameters[0].values.all()[0].argument.type |
15 | lexicalisation_parameters = sortatributes(lexicalisation_parameters[0].values.all()[0].argument) | 16 | lexicalisation_parameters = sortatributes(lexicalisation_parameters[0].values.all()[0].argument) |
16 | if lexicalisation_type == 'np': # np(case), number, nouns, atr | 17 | if lexicalisation_type == 'np': # np(case), number, nouns, atr |
17 | - nps = get_nps(get_case(lexicalisation_parameters[0]), get_number(attributes[1]), get_words(attributes[2]), attributes[3]) | ||
18 | - return nps | 18 | + nps = get_nps(get_case(lexicalisation_parameters[0], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3]) |
19 | + return (nps, get_verb(base, get_number(attributes[1], subj), subj)) | ||
19 | elif lexicalisation_type == 'prepnp': #prepnp(prep, case), number, nouns, atr | 20 | elif lexicalisation_type == 'prepnp': #prepnp(prep, case), number, nouns, atr |
20 | - prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1]), get_number(attributes[1]), get_words(attributes[2]), attributes[3]) | ||
21 | - return prepnps | 21 | + prepnps = get_prepnps(get_preposition(lexicalisation_parameters[0]), get_case(lexicalisation_parameters[1], subj), get_number(attributes[1], subj), get_words(attributes[2]), attributes[3]) |
22 | + return (prepnps, []) | ||
22 | else: | 23 | else: |
23 | return [] | 24 | return [] |
24 | return [] | 25 | return [] |
25 | 26 | ||
27 | +def is_subj(categories): | ||
28 | + for cat in categories: | ||
29 | + if cat.category == u'subj': | ||
30 | + return True | ||
31 | + return False | ||
32 | + | ||
26 | def get_preposition(attribute): | 33 | def get_preposition(attribute): |
27 | return attribute.values.all()[0].parameter.type.name | 34 | return attribute.values.all()[0].parameter.type.name |
28 | 35 | ||
@@ -30,14 +37,20 @@ def get_words(attribute): | @@ -30,14 +37,20 @@ def get_words(attribute): | ||
30 | words = [word.text[1:-1] for word in attribute.values.all()] | 37 | words = [word.text[1:-1] for word in attribute.values.all()] |
31 | return words | 38 | return words |
32 | 39 | ||
33 | -def get_case(attribute): | 40 | +def get_case(attribute, is_subj): |
34 | case = attribute.values.all()[0].parameter.type.name | 41 | case = attribute.values.all()[0].parameter.type.name |
35 | if case == u'str': | 42 | if case == u'str': |
36 | - case = u'acc' | 43 | + if is_subj: |
44 | + case = u'nom' | ||
45 | + else: | ||
46 | + case = u'acc' | ||
37 | return case | 47 | return case |
38 | 48 | ||
39 | -def get_number(attribute): | 49 | +def get_number(attribute, is_subj): |
40 | number = attribute.values.all()[0].parameter.type.name | 50 | number = attribute.values.all()[0].parameter.type.name |
51 | + if number == u'_': | ||
52 | + if is_subj: | ||
53 | + number = u'sg' | ||
41 | return number | 54 | return number |
42 | 55 | ||
43 | def get_nps(case, number, nouns, _atr): | 56 | def get_nps(case, number, nouns, _atr): |
@@ -65,3 +78,15 @@ def get_prepnps(prep, case, number, nouns, _atr): | @@ -65,3 +78,15 @@ def get_prepnps(prep, case, number, nouns, _atr): | ||
65 | nps = get_nps(case, number, nouns, _atr) | 78 | nps = get_nps(case, number, nouns, _atr) |
66 | return [prep + ' ' + np for np in nps] | 79 | return [prep + ' ' + np for np in nps] |
67 | 80 | ||
81 | +def get_verb(inf, number, is_subj): | ||
82 | + if not is_subj: | ||
83 | + return None | ||
84 | + else: | ||
85 | + options = [(interp.orth, interp.getTag(MORFEUSZ2)) for interp in MORFEUSZ2.generate(inf.encode('utf8'))] | ||
86 | + filtered = [] | ||
87 | + for option in options: | ||
88 | + (orth, tag) = option | ||
89 | + if u'fin' in tag and u'sg' in tag and u'ter' in tag: | ||
90 | + filtered.append(option) | ||
91 | + options = filtered | ||
92 | + return [orth for orth, _ in options] |
semantics/static/js/semantics_lexical_units.js
@@ -220,20 +220,40 @@ function getMeaningsSelectionForFrame(frame_id) { | @@ -220,20 +220,40 @@ function getMeaningsSelectionForFrame(frame_id) { | ||
220 | var j; | 220 | var j; |
221 | for (j = 0; j < rows.length; j++) { | 221 | for (j = 0; j < rows.length; j++) { |
222 | var options = []; | 222 | var options = []; |
223 | + var vrb = []; | ||
224 | + var pre = []; | ||
223 | sid_alt = rows[j].split('_'); | 225 | sid_alt = rows[j].split('_'); |
224 | var sch = "schema_" + sid_alt[0] + "_"; | 226 | var sch = "schema_" + sid_alt[0] + "_"; |
225 | var k; | 227 | var k; |
226 | for (k = 0; k < schemas_content[sch].display.arguments[0].length; k++) { | 228 | for (k = 0; k < schemas_content[sch].display.arguments[0].length; k++) { |
227 | var proper = schemas_content[sch].display.arguments[0][k].csv_id + "alt_" + sid_alt[1] + "_"; | 229 | var proper = schemas_content[sch].display.arguments[0][k].csv_id + "alt_" + sid_alt[1] + "_"; |
228 | if (connected[lem].indexOf(proper) != -1) { | 230 | if (connected[lem].indexOf(proper) != -1) { |
229 | - options.push(schemas_content[sch].display.arguments[0][k].lex); | 231 | + if (schemas_content[sch].display.arguments[0][k].vrb.length > 0) { |
232 | + pre.push(schemas_content[sch].display.arguments[0][k].lex); | ||
233 | + vrb = schemas_content[sch].display.arguments[0][k].vrb; | ||
234 | + } else { | ||
235 | + options.push(schemas_content[sch].display.arguments[0][k].lex); | ||
236 | + } | ||
230 | } | 237 | } |
231 | } | 238 | } |
232 | - var lex = {lemma: base, args: options}; | ||
233 | - if (hasRefl(sch)) { | ||
234 | - lex.lemma = base + " się"; | ||
235 | - } | ||
236 | - lexicalisation.push(lex); | 239 | + if (vrb.length == 0) { |
240 | + var lex = {lemma: [base], pre: pre, args: options}; | ||
241 | + if (hasRefl(sch)) { | ||
242 | + lex.lemma = [base + " się"]; | ||
243 | + } | ||
244 | + lexicalisation.push(lex); | ||
245 | + } else { | ||
246 | + var lex = {lemma: vrb, pre: pre, args: options}; | ||
247 | + if (hasRefl(sch)) { | ||
248 | + var l = []; | ||
249 | + var k; | ||
250 | + for (k=0; k < vrb.length; k++) { | ||
251 | + l.push(vrb[k] + " się"); | ||
252 | + } | ||
253 | + lex.lemma = l; | ||
254 | + } | ||
255 | + lexicalisation.push(lex); | ||
256 | + } | ||
237 | } | 257 | } |
238 | } | 258 | } |
239 | 259 | ||
@@ -248,12 +268,11 @@ function getFormForLexicalisation(lexicalisation) { | @@ -248,12 +268,11 @@ function getFormForLexicalisation(lexicalisation) { | ||
248 | var result = ""; | 268 | var result = ""; |
249 | var i; | 269 | var i; |
250 | for (i = 0; i < lexicalisation.length; i++) { | 270 | for (i = 0; i < lexicalisation.length; i++) { |
251 | - var perms = permute(lexicalisation[i].args); | ||
252 | - var j; | ||
253 | - for (j = 0; j < perms.length; j++) { | ||
254 | - result += lexicalisationForm(lexicalisation[i].lemma, cartesian(perms[j])) | ||
255 | - } | ||
256 | - result += '<br\>'; | 271 | + var perms = permute(lexicalisation[i].args); |
272 | + var j; | ||
273 | + for (j = 0; j < perms.length; j++) { | ||
274 | + result += lexicalisationForm(lexicalisation[i].lemma, lexicalisation[i].pre, cartesian(perms[j])); | ||
275 | + } | ||
257 | } | 276 | } |
258 | return result; | 277 | return result; |
259 | } | 278 | } |
@@ -302,26 +321,79 @@ function cartesian(llist) { | @@ -302,26 +321,79 @@ function cartesian(llist) { | ||
302 | return result; | 321 | return result; |
303 | } | 322 | } |
304 | 323 | ||
305 | -function lexicalisationForm(lemma, tokenised) { | 324 | +function lexicalisationForm(lemma, pre, tokenised) { |
325 | + var list; | ||
326 | + if (pre.length == 0) { | ||
327 | + list = noSubjUnits(lemma, tokenised); | ||
328 | + } else { | ||
329 | + list = subjUnits(pre, lemma, tokenised); | ||
330 | + } | ||
331 | + | ||
306 | var display = ""; | 332 | var display = ""; |
307 | - var i; | ||
308 | - for (i = 0; i < tokenised.length; i++) { | ||
309 | - if (tokenised[i].length == 0) { | ||
310 | - display += "<br\>"; | ||
311 | - } else { | ||
312 | - var j; | ||
313 | - for (j = 0; j < lexical_units.length; j++) { | ||
314 | - if (lemma + " " + tokenised[i].join(" ") == lexical_units[j].base) { | ||
315 | - return ""; | ||
316 | - } | ||
317 | - } | ||
318 | - display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + lemma + " " + tokenised[i].join(" ") + "\">"; // TODO: unikalne wartości, wartość => dodanie odpowiedniej jednostki (nazwa jednostki w wartości?) | ||
319 | - display += lemma + " " + tokenised[i].join(" ") + "<br\>"; | 333 | + var i, j; |
334 | + for (i = 0; i < list.length; i++) { | ||
335 | + var included = false; | ||
336 | + for (j = 0; j < lexical_units.length; j++) { | ||
337 | + if (list[i] == lexical_units[j].base) { | ||
338 | + included = true; | ||
339 | + } | ||
340 | + } | ||
341 | + if (!included) { | ||
342 | + display += "<input type = \"checkbox\" name = \"mwe\" value = \"" + list[i] + "\">" + list[i] + "<br\>"; | ||
320 | } | 343 | } |
321 | } | 344 | } |
322 | return display; | 345 | return display; |
323 | } | 346 | } |
324 | 347 | ||
348 | +function noSubjUnits(lemmata, dependants) { | ||
349 | + var result = []; | ||
350 | + var i, j; | ||
351 | + for (i = 0; i < lemmata.length; i++) { | ||
352 | + if (dependants.length == 0) { | ||
353 | + result.push(lemmata[i]); | ||
354 | + } else { | ||
355 | + for (j = 0; j < dependants.length; j++) { | ||
356 | + result.push(lemmata[i] + dependants[j].join(" ")); | ||
357 | + } | ||
358 | + } | ||
359 | + } | ||
360 | + return result; | ||
361 | +} | ||
362 | + | ||
363 | +function subjUnits(pre, lemmata, dependants) { | ||
364 | + var result = []; | ||
365 | + var i, j; | ||
366 | + var temp = noSubjUnits(lemmata, dependants); | ||
367 | + i = decapitate(dependants); | ||
368 | + var pre2 = i.heads; | ||
369 | + var temp2 = noSubjUnits(lemmata, i.bodies); | ||
370 | + for (i = 0; i < pre.length; i++) { | ||
371 | + for (j = 0; j < temp.length; j++) { | ||
372 | + result.push(pre[i] + " " + temp[j]); | ||
373 | + } | ||
374 | + for (j = 0; j < pre2.length; j++) { | ||
375 | + result.push(pre[i] + " " + pre2[j] + " " + temp2[j]); | ||
376 | + } | ||
377 | + } | ||
378 | + return result; | ||
379 | +} | ||
380 | + | ||
381 | +function decapitate(llist) { | ||
382 | + var heads = []; | ||
383 | + var bodies = []; | ||
384 | + var i; | ||
385 | + for (i = 0; i < llist.length; i++) { | ||
386 | + if (llist[i].length > 0) { | ||
387 | + var body = llist[i].slice(); | ||
388 | + var head = body[0]; | ||
389 | + body.splice(0, 1); | ||
390 | + heads.push(head); | ||
391 | + bodies.push(body); | ||
392 | + } | ||
393 | + } | ||
394 | + return {heads: heads, bodies: bodies}; | ||
395 | +} | ||
396 | + | ||
325 | 397 | ||
326 | // get readable form of lexical unit | 398 | // get readable form of lexical unit |
327 | function getLexicalUnit(luid) { | 399 | function getLexicalUnit(luid) { |
semantics/static/js/semantics_schemas.js
@@ -130,10 +130,11 @@ function schemaBody(schema, alternation, lex){ | @@ -130,10 +130,11 @@ function schemaBody(schema, alternation, lex){ | ||
130 | for (l = 0; l < display.arguments[k].length; l++) { | 130 | for (l = 0; l < display.arguments[k].length; l++) { |
131 | schema_body += '<td id="' + display.arguments[k][l].csv_id + 'alt_' + alternation + '_" class="' + display.arguments[k][l].csv_class + 'alt_' + alternation + '_" onclick="schemaClick(\'' + display.arguments[k][l].csv_id + 'alt_' + alternation +'_\', '; | 131 | schema_body += '<td id="' + display.arguments[k][l].csv_id + 'alt_' + alternation + '_" class="' + display.arguments[k][l].csv_class + 'alt_' + alternation + '_" onclick="schemaClick(\'' + display.arguments[k][l].csv_id + 'alt_' + alternation +'_\', '; |
132 | if (display.arguments[k][l].lex.length != 0) { | 132 | if (display.arguments[k][l].lex.length != 0) { |
133 | - schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\'])">'; | 133 | + schema_body += '[\'' + display.arguments[k][l].lex.join('\', \'') + '\']'; |
134 | } else { | 134 | } else { |
135 | - schema_body += '[])">'; | 135 | + schema_body += '[]'; |
136 | } | 136 | } |
137 | + schema_body += ')">'; | ||
137 | schema_body += display.arguments[k][l].argument; | 138 | schema_body += display.arguments[k][l].argument; |
138 | schema_body += '</td>'; | 139 | schema_body += '</td>'; |
139 | if (parseInt(display.arguments[k][l].csv_id.split('_')[5]) >= 0) { | 140 | if (parseInt(display.arguments[k][l].csv_id.split('_')[5]) >= 0) { |
semantics/views.py
@@ -450,13 +450,13 @@ def ajax_schemas(request, lemma_id): | @@ -450,13 +450,13 @@ def ajax_schemas(request, lemma_id): | ||
450 | # identifier, class, argument | 450 | # identifier, class, argument |
451 | arg = [] | 451 | arg = [] |
452 | #ma["ala"] = kot | 452 | #ma["ala"] = kot |
453 | - for i, c, a in zip(idents, schema_ids, row): | 453 | + for i, c, a, p in zip(idents, schema_ids, row, ordered_positions): |
454 | astr, aobj = a | 454 | astr, aobj = a |
455 | if aobj is not None and aobj.is_phraseologic(): | 455 | if aobj is not None and aobj.is_phraseologic(): |
456 | - lex = lexicalisation(aobj) | 456 | + lex, vrb = lexicalisation(aobj, p.categories.all(), lemma.entry_obj.name) |
457 | else: | 457 | else: |
458 | - lex = [] | ||
459 | - arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex}) | 458 | + lex, vrb = ([], []) |
459 | + arg.append({"csv_id": i, "csv_class": c, "argument": astr, "lex": lex, "vrb": vrb}) | ||
460 | display["arguments"].append(arg) | 460 | display["arguments"].append(arg) |
461 | 461 | ||
462 | schema_display["schemas"].append({"schema_id": str(schema.id), "grade": lemma.get_schema_opinion(schema), "colspan": str(max(len(schema_categories), 1)), "rowspan": str(schema_arguments_rowspan), "display": display, "phraseologic": schema.phraseologic}) | 462 | schema_display["schemas"].append({"schema_id": str(schema.id), "grade": lemma.get_schema_opinion(schema), "colspan": str(max(len(schema_categories), 1)), "rowspan": str(schema_arguments_rowspan), "display": display, "phraseologic": schema.phraseologic}) |