semantics/management/commands/import_frames.py

#! /usr/bin/python
# -*- coding: utf-8 -*-
import sys, os, codecs
from django.core.management.base import BaseCommand
from dictionary.models import Lemma
from semantics.models import SemanticRole, FramePosition, Complement, LexicalUnit, SemanticFrame
from settings import PROJECT_PATH
class Command(BaseCommand):
    """Management command whose only action is to run ``import_frames()``."""

    help = ''
    args = 'none'

    def handle(self, **options):
        # Previously imported rows are kept: clear_import_data() is
        # deliberately not invoked before the import.
        import_frames()
def clear_import_data():
    """Delete all FramePosition, Complement and SemanticFrame rows, in that order."""
    for model in (FramePosition, Complement, SemanticFrame):
        model.objects.all().delete()
def import_frames():
    """Import semantic frames from ``data/Semantics/plWN_verbs.csv``.

    The file is read as UTF-8, tab-separated text and its first line is
    skipped as a header.  For every remaining row:

    * the cells from column 22 ('Agent') onwards are parsed into per-role
      lists of alternative realizations (see ``_parse_role_cells``); a row
      with a comma in any of those cells is skipped entirely;
    * one frame is built per alternative (see ``_build_frames``);
    * every frame is stored once for each lexical unit listed,
      comma-separated, in column 8 ('synset') (see ``_import_lexical_unit``).

    Rows too short to contain the 'synset' column (for example a blank
    trailing line) are skipped instead of raising ``IndexError``.
    """
    verbs_file_path = os.path.join(PROJECT_PATH, 'data', 'Semantics', 'plWN_verbs.csv')
    columns = ['id', 'od', 'do', 'range', 'level', 'ile', 'art', '2ls', 'synset', '4ls', 'xls', 'fnf', 'light', 'alt', 'conv', 'dev', 'senses', 'typ', 'change', 'causephase', 'laspect', 'maspect', 'Agent', 'Manipulator', 'Effector', 'Cognizer', 'Protagonist', 'Benefactor', 'Cause', 'Stimulus', 'Communicator', 'Path', 'Instrument', 'Patient', 'Theme', 'Experiencer', 'Resultee', 'Beneficiary', 'Object', 'Asset', 'Product', 'Content', 'Source', 'Material', 'Goal', 'Entity', 'Part', 'Collection', 'Attribute', 'Event', 'Phase', 'State-of-Affairs', 'Scenario', 'Background', 'Focus', 'Instance', 'Type', 'Location', 'Time']
    synset_column = 8        # index of 'synset' in `columns`
    first_role_column = 22   # index of 'Agent' in `columns`
    with codecs.open(verbs_file_path, encoding='utf_8', mode='r') as infile:
        for line_number, line in enumerate(infile):
            if line_number == 0:
                continue  # header row
            cells = line.split('\t')
            if len(cells) <= synset_column:
                # No lexical units can be read from such a row.
                continue
            if len(cells) < len(columns):
                # Report short rows; they are still processed below.
                print('%s %s %s' % (cells[synset_column], len(cells), len(columns)))
            # NOTE(review): names are paired with cells from the left, while
            # the start of the role columns is shifted by the length
            # difference -- confirm this matches how irregular rows are laid out.
            start = first_role_column + len(cells) - len(columns)
            data = _parse_role_cells(list(zip(columns, cells))[start:])
            if data is None:
                continue
            for frame in _build_frames(data):
                for unit_text in cells[synset_column].split(','):
                    _import_lexical_unit(unit_text.strip(), frame)


def _translate_part(part):
    """Translate one stripped, non-empty token of a role cell.

    Returns the phrase-type text for the token, ``''`` for the blank
    markers ``b``/``zero``, or ``None`` when the token is ignored.
    """
    if part in ('b', 'zero'):
        return ''
    if part in ('i', 'o', 're', 'neg') or part.startswith('\''):
        return None
    if part in ('abl', 'adl'):
        return 'xp(' + part + ')'
    if '+' in part:
        pieces = part.split('+')
        if len(pieces) != 2:
            # More than one '+': report the token and ignore it.
            print(part)
            return None
        prep, case = pieces
        if ' ' in prep:
            return 'comprepnp(' + prep + ')'
        return 'prepnp(' + prep + ',' + case + ')'
    if part in ('xp{locat}', 'xp{locat)'):
        return 'xp(locat)'
    if part == u'że':
        return u'cp(' + part + ')'
    if part == 'inf':
        return 'infp(_)'
    if part in ('bf', 'sg'):
        # Markers resolved later by _merge_shared_roles.
        return part
    return 'np(' + part + ')'


def _parse_role_cells(role_cells):
    """Parse (column name, cell text) pairs into ``{column: [realization, ...]}``.

    A cell holds '|'-separated alternatives, each made of ':'-separated
    tokens.  Returns ``None`` as soon as a cell containing a comma is met
    (the caller skips such rows).  The 'Phase' and 'Scenario' columns are
    ignored; the comparison is case-insensitive because the column names
    are capitalized.  A column that used a blank marker and ended up with
    only empty alternatives is left out of the result.
    """
    data = {}
    for column, cell in role_cells:
        if column.lower() in ('phase', 'scenario'):
            continue
        cell = cell.strip()
        if ',' in cell:
            return None
        if cell == '':
            continue
        blank_marker_seen = False
        alternatives = []
        for item in cell.split('|'):
            phrases = []
            for part in item.split(':'):
                # Strip before the emptiness test so that a whitespace-only
                # token is skipped rather than indexed.
                part = part.strip()
                if part == '':
                    continue
                phrase = _translate_part(part)
                if phrase is None:
                    continue
                if phrase == '':
                    blank_marker_seen = True
                phrases.append(phrase)
            alternatives.append(':'.join(phrases))
        if blank_marker_seen and not any(alternatives):
            continue
        data[column] = alternatives
    return data


def _merge_shared_roles(realizations, marker, anchors):
    """Move the roles stored under `marker` to realizations of the anchor roles.

    `realizations` maps realization text to a list of role names and is
    modified in place: the `marker` entry is removed and its roles are
    appended to every entry that contains one of `anchors` (lowercase
    names, matched case-insensitively against the capitalized role names).
    """
    if marker not in realizations:
        return
    shared = realizations.pop(marker)
    for roles in realizations.values():
        if any(role.lower() in anchors for role in roles):
            roles.extend(shared)


def _build_frames(data):
    """Yield one ``{'Role1, Role2': realization}`` dict per alternative.

    The number of alternatives is the length of the longest list in `data`;
    shorter lists are cycled.  Empty realizations are left out and empty
    frames are not yielded.
    """
    if not data:
        return
    alternative_count = max(len(entries) for entries in data.values())
    for index in range(alternative_count):
        realizations = {}
        for role in data:
            entries = data[role]
            realization = entries[index % len(entries)]
            if realization != '':
                realizations.setdefault(realization, []).append(role)
        # background + focus
        _merge_shared_roles(realizations, 'bf', ('background', 'focus'))
        # source + goal
        _merge_shared_roles(realizations, 'sg', ('source', 'goal'))
        frame = {', '.join(roles): realization for realization, roles in realizations.items()}
        if frame:
            yield frame


def _import_lexical_unit(lu, frame):
    """Store `frame` for the lexical unit described by the text `lu`.

    `lu` is expected to look like ``"<base> <sense>"`` or
    ``"<base> się <sense>"``; any other shape is ignored, as is a unit whose
    first word does not match exactly one non-old Lemma.
    """
    # NOTE(review): only units starting with 'k' are imported -- confirm
    # whether this restriction is still intended.  startswith() also copes
    # with an empty unit (e.g. after a trailing comma).
    if not lu.startswith(u'k'):
        return
    words = lu.split(' ')
    lemmas = Lemma.objects.filter(entry=words[0], old=False)
    if len(lemmas) != 1:
        return
    all_schemas = lemmas[0].frames.all()
    if len(words) == 2:
        reflexive = False
        base = words[0]
    elif len(words) == 3 and words[1] == u'się':
        reflexive = True
        base = ' '.join(words[0:2])
    else:
        return
    unit = LexicalUnit.objects.get(base=base, sense=int(words[-1]))
    # create empty frame
    semantic_frame = SemanticFrame()
    semantic_frame.save()
    semantic_frame.lexical_units.add(unit)
    schemas = [schema for schema in all_schemas
               if _schema_matches_reflexivity(schema, reflexive)]
    # create unconnected roles
    complements = _create_complements(semantic_frame, frame)
    # connect to EVERY schema where ALL realizations can be found
    for schema in schemas:
        connections = _match_schema(schema, frame, complements)
        if connections is None:
            continue
        for complement, position, argument in connections:
            _link_realization(complement, schema, position, argument)


def _schema_matches_reflexivity(schema, reflexive):
    """Tell whether `schema` fits a reflexive or non-reflexive lexical unit.

    Non-reflexive units need an empty reflexivity characteristic value.
    Reflexive units need the value u'się' or a position holding an argument
    whose text_rep is u'refl'.
    """
    value = schema.characteristics.get(type=u'ZWROTNOŚĆ').value.value
    if not reflexive:
        return value == u''
    if value == u'się':
        return True
    return any(len(position.arguments.filter(text_rep=u'refl')) > 0
               for position in schema.positions.all())


def _create_complements(semantic_frame, frame):
    """Create one saved Complement per realization and attach its roles.

    Returns a dict mapping realization text to the Complement created.
    """
    complements = {}
    for roles, realization in frame.items():
        complement = Complement(frame=semantic_frame)
        complement.save()
        complements[realization] = complement
        for name in roles.split(','):
            role = name.strip()
            print(role)
            complement.roles.add(SemanticRole.objects.get(role=role))
    return complements


def _match_schema(schema, frame, complements):
    """Find where every realization of `frame` occurs in `schema`.

    Returns a list of ``(complement, position, argument)`` tuples, or
    ``None`` when some realization has no matching position.  Besides an
    exact text_rep match, u'np(nom)' also matches u'np(str)' in a position
    of category u'subj', and u'np(acc)' matches u'np(str)' in a position of
    category u'obj'.
    """
    structural_category = {u'np(nom)': u'subj', u'np(acc)': u'obj'}
    positions = schema.positions.all()
    connections = []
    for realization in frame.values():
        complement = complements[realization]
        category = structural_category.get(realization)
        found = False
        for position in positions:
            exact = list(position.arguments.filter(text_rep=realization))
            if exact:
                found = True
                connections.append((complement, position, exact[0]))
            if category is None:
                continue
            structural = list(position.arguments.filter(text_rep=u'np(str)'))
            if structural and len(position.categories.filter(category=category)) > 0:
                found = True
                connections.append((complement, position, structural[0]))
        if not found:
            # One unmatched realization rules the whole schema out.
            return None
    return connections


def _link_realization(complement, schema, position, argument):
    """Attach the FramePosition for (schema, position, argument) to `complement`.

    The first existing FramePosition is reused; a new one is saved only
    when none exists.
    """
    existing = list(FramePosition.objects.filter(frame=schema, position=position, argument=argument))
    if existing:
        realization = existing[0]
    else:
        realization = FramePosition(frame=schema, position=position, argument=argument)
        realization.save()
    complement.realizations.add(realization)