# load_skladnica_frames.py 14.7 KB
#-*- coding:utf-8 -*-
# author: B.Niton

import codecs
import operator
import re

from django.core.management.base import BaseCommand

from dictionary.models import *
from common.util import triple_arg_poss

from django.db.models import Q

# Input file with the frames to import; read line by line in load_frames().
SKL_FRAMES_PATH = 'data/ramki_skl_v_130925.txt'
# Output file: input lines whose lemma has no matching non-old Lemma object.
NLEMMA_PATH = 'data/brak_frames.txt'
# Output file: input lines whose frame field is the placeholder '_'.
UNDERLINE_PATH = 'data/underline_frames.txt'

class Command(BaseCommand):
    """Management command that imports frames from the frames data file."""

    help = 'Creates składnica frame models from składnica frames.'

    def handle(self, file_path=SKL_FRAMES_PATH, **options):
        """Run the import and hand any error message back to Django.

        ``file_path`` is accepted for backward compatibility only:
        ``load_frames()`` takes no arguments and always reads
        ``SKL_FRAMES_PATH``.

        Returns whatever ``load_frames()`` returns: ``None`` on success or an
        error string when a file could not be processed. Returning it (instead
        of dropping it) lets the management framework print the message.
        """
        return load_frames()

def update_arg(arg):
    """Rewrite an argument string using the dictionary's attribute names.

    Two passes of plain substring replacement are applied, in order:

    1. a fixed table of abbreviation renames (e.g. ``mian`` -> ``nom``);
    2. for every ``Atribute_Value`` flagged ``related=True``, each occurrence
       of its value as a whole attribute (delimited by ``(``/``,`` on the left
       and ``)``/``,`` on the right) is replaced with the value of each of its
       ``main_attr_values``.

    Returns the rewritten string.
    """
    # Order matters: the replacements are applied sequentially on the result
    # of the previous one.
    renames = (
        (u'mian', u'nom'),
        (u'bier', u'acc'),
        (u'cel', u'dat'),
        (u'dop', u'gen'),
        (u'miej', u'loc'),
        (u'narz', u'inst'),
        (u'pop', u'postp'),
    )
    for old_name, new_name in renames:
        arg = arg.replace(old_name, new_name)

    # The four delimiter contexts in which a value counts as a full attribute.
    contexts = (u'(%s)', u'(%s,', u',%s)', u',%s,')
    for related_value in Atribute_Value.objects.filter(related=True):
        for main_value in related_value.main_attr_values.all():
            for template in contexts:
                arg = arg.replace(template % related_value.value,
                                  template % main_value.value)

    return arg

def possible_args(arg, pos, preps):
    """Collect the dictionary arguments a frame-file argument may map to.

    Parameters:
        arg:   argument string as it appears in the frames file
               (e.g. ``'subj'``, ``'np(bier)'``, ``'advp'``, ``''``).
        pos:   part-of-speech tag of the entry; currently unused (it only
               appeared in conditions that have been disabled).
        preps: list of dicts describing the prepositional arguments of the
               same line; only the ``'poss_args'`` key is read here, and only
               for ``arg == 'advp'``.

    Returns a list built from the results of ``triple_arg_poss`` (and, for
    ``advp``, from the entries of ``preps[...]['poss_args']``).
    """
    possibilities = []
    if arg == 'subj':
        possibilities.extend(triple_arg_poss('np(str)', ['subj']))
    elif arg == 'np(bier)':
        possibilities.extend(triple_arg_poss('np(str)', ['', 'obj']))
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss('np(part)', ['', 'obj', 'subj']))
    elif arg == 'np(dop)':
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss('np(part)', ['', 'obj', 'subj']))
        # Formerly guarded by "pos == 'ger'" (original note: "also shifts to
        # accusative"); the guard is disabled, so this applies to every pos.
        possibilities.extend(triple_arg_poss('np(str)', ['', 'obj']))
        possibilities.extend(triple_arg_poss('np(acc)', ['', 'obj', 'subj']))
    elif (arg.startswith('prepnp(jak,') or arg.startswith('prepnp(jako,') or
          arg.startswith(u'prepnp(niż,')):
        # Split "prepnp(<preposition>,<case>)" into its two attributes.
        prepnp_atr_ls = arg.replace('prepnp(', '').replace(')', '').split(',')
        preposition = prepnp_atr_ls[0]
        case = prepnp_atr_ls[1]
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        if case == 'mian' or case == 'bier':
            possibilities.extend(triple_arg_poss('prepnp(' + preposition + ',str)', ['', 'obj', 'subj']))
    elif arg == u'prepnp(na temat,dop)':
        # Original note (translated): "does not find this among the extracted ones".
        possibilities.extend(triple_arg_poss(u'comprepnp(na temat)', ['', 'obj', 'subj']))
    elif arg == u"prepnp(w sprawie,dop)":
        possibilities.extend(triple_arg_poss(u'comprepnp(w sprawie)', ['', 'obj', 'subj']))
    elif arg == u"prepnp(z powodu,dop)":
        possibilities.extend(triple_arg_poss(u'comprepnp(z powodu)', ['', 'obj', 'subj']))
    elif arg == u"adjp(mian)":
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss('adjp(pred)', ['', 'obj', 'subj']))
    elif arg == u"adjp(narz)":
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss('adjp(pred)', ['', 'obj', 'subj']))
    elif arg.startswith(u'sentp'):
        # Count commas to determine the number of attributes in 'sentp';
        # that number selects the target phrase type (cp / ncp / prepncp).
        number_of_commas = arg.count(u',')
        conv_arg = arg.replace(u'pz', u'int')
        if number_of_commas == 0:
            possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'cp')), ['', 'obj', 'subj']))
        elif number_of_commas == 1:
            possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'ncp')), ['', 'obj', 'subj']))
        elif number_of_commas == 2:
            possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'prepncp')), ['', 'obj', 'subj']))
        # More than two commas: no possibilities are produced.
    elif arg == u'advp':
        possibilities.extend(triple_arg_poss(u'xp(_)', ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss(u'advp(pron)', ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss(u'advp(misc)', ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss(u'advp(pred)', ['', 'obj', 'subj']))
        # The advp subtypes locat, abl, adl, perl, temp, dur and mod used to
        # be listed here as well but are disabled.
        for prepnp in preps:
            q_prepnps = []
            possibilities.extend(prepnp['poss_args'])
            for prep_arg in prepnp['poss_args']:
                q_prepnps.append(Q(realizations=prep_arg))
            if q_prepnps:
                # Any 'xp' argument realised by one of the prepositional
                # possibilities is a candidate too.
                xps = Argument.objects.filter(type='xp').filter(reduce(operator.or_, q_prepnps)).all()
                for xp in xps:
                    possibilities.extend(triple_arg_poss(xp.text_rep, ['', 'obj', 'subj']))
    elif arg == u'prepnp(przez,bier)':
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))
        # Formerly guarded by "pos == 'ger' or pos == 'ppas'"; now unconditional.
        possibilities.extend(triple_arg_poss('np(str)', ['subj']))
    elif arg.startswith('infp('):
        # str.replace returns a new string, so no explicit copy is needed
        # (the previous copy.deepcopy call relied on a module this file never
        # imports).
        conv_arg = arg.replace('(nd)', '(imperf)')
        conv_arg = conv_arg.replace('(dk)', '(perf)')
        possibilities.extend(triple_arg_poss(update_arg(conv_arg), ['', 'obj', 'subj']))
        possibilities.extend(triple_arg_poss('infp(_)', ['', 'obj', 'subj']))
    elif arg == u'':
        # Empty argument string (e.g. produced from an empty frame "[]").
        possibilities.extend(triple_arg_poss('E', ['subj']))
        possibilities.extend(triple_arg_poss('np(str)', ['subj']))
    else:
        possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj']))

    return possibilities


def mark_sub_frames(lemma):
    """Recompute ``sub_frames`` links and ``visible`` flags of a lemma's frames.

    Works on ``lemma.skladnica_frames``. First every frame is reset (its
    ``sub_frames`` cleared, ``visible`` set to True). Then, for each frame,
    the other frames of the lemma that have the same ``reflex`` value and
    contain every argument of this frame (matched by ``text_id``) are treated
    as "bigger" frames: the frame is registered as their sub-frame and hidden.

    All changes are saved to the database; nothing is returned. The lemma and
    each processed frame are printed as progress output.
    """
    print smart_str(lemma)
    # Pass 1: reset state left over from any previous run.
    for frame in lemma.skladnica_frames.all():
        frame.sub_frames.clear()
        frame.visible = True
        frame.save()
    # Pass 2: link frames to the frames that contain them and set visibility.
    for frame in lemma.skladnica_frames.all(): 
        print frame
        # Candidates: other frames of this lemma with the same reflex flag...
        bigger_frames = lemma.skladnica_frames.all()
        bigger_frames = bigger_frames.filter(reflex=frame.reflex).exclude(pk=frame.pk)
        # ...narrowed down to those containing every argument of `frame`.
        for arg in frame.arguments.all():
            bigger_frames = bigger_frames.filter(arguments__text_id=arg.text_id)
        # A frame contained in some other frame is hidden.
        if bigger_frames.count() > 0:
            frame.visible = False
            frame.save()
        for bigger_frame in bigger_frames.all():  
            bigger_frame.sub_frames.add(frame)   
        # A containing frame with the same number of arguments is handled
        # specially: it is hidden and `frame` is shown again, provided the
        # query below finds no frame with more arguments.
        # NOTE(review): the query goes through `frame.skladnica_frames`, while
        # everywhere else the relation is read from `lemma` - confirm that
        # frames really expose this relation. `Count` is also not imported
        # explicitly in this file; presumably it comes from the star import.
        for bigger_frame in bigger_frames.all():    
            if(bigger_frame.arguments.count() == frame.arguments.count()):
                if not frame.skladnica_frames.annotate(arguments_count=Count('arguments')).filter(arguments_count__gt=
                                                                                              frame.arguments.count()).count() > 0: 
                    bigger_frame.visible = False
                    bigger_frame.save()
                    frame.visible = True
                    frame.save()
            
        # Sub-frames already attached to `frame` that have fewer arguments
        # than it are hidden.
        for sub_frame in frame.sub_frames.all():
            if sub_frame.arguments.count() < frame.arguments.count():
                sub_frame.visible = False
                sub_frame.save()
        # A frame with an empty old_frame_value is hidden unless all frames
        # of the lemma have an empty old_frame_value.
        if(frame.old_frame_value == '' and 
           lemma.skladnica_frames.count() != lemma.skladnica_frames.filter(old_frame_value='').count()):
            frame.visible = False
            frame.save()
        

def load_frames():
    print 'Be patient, it can take a while.'
#    for lemma in Lemma.objects.filter(old=False): ### UWAGA NA TO, POTEM WYWALIC
#        for frame in lemma.skladnica_frames.all():
#            frame.delete()
    try:                                                               
        f = codecs.open(SKL_FRAMES_PATH, "rt", 'utf-8')  
        nlemmafile = codecs.open(NLEMMA_PATH, 'wt', 'utf-8') 
        underfile = codecs.open(UNDERLINE_PATH, 'wt', 'utf-8') 
        example_source_obj = NKJP_Source.objects.get(source=u'korpus ręcznie anotowany (1.2M segmentów)')  
        example_opinion_obj = NKJP_Opinion.objects.get(opinion=u'dobry')                         
        try:
            for line in f:
                line_pattern = re.compile(ur"^([^\s]+)[\s]*([^\s]+)[\s]*((\[[^\]]*\])|(_))[\s]*(\[[^\]]*\])[\s]*\[sent_id='([^']*)'\][\s]*\[text=([^\]]*)\][\s]*$")

                m = line_pattern.match(line)
                if not m:
                    print smart_str(line)
                    continue
                if m:
                    lemma_str = m.group(1).strip()
                    pos = m.group(2).strip()
                    frame_str = m.group(3).strip()
                    prep_str = m.group(6).strip()
                    sent_id_str = m.group(7).strip()
                    example_str = m.group(8).strip().strip("'")
                    print smart_str(lemma_str)
                    
                    lemmas = Lemma.objects.filter(entry_obj__name=lemma_str, old=False) # TODO powinno objac tez pewnie lematy ze starych wersji w kontroli zmian
                    if lemmas.count() == 0:
                        nlemmafile.write(line.strip() + u'\n')
                        continue
                    
                    if frame_str == '_':
                        underfile.write(line.strip() + u'\n')
                        frame_str = '[]'
                    tokens = frame_str.replace('[', '').replace(']', '').split(',')
                    preps = []
                    prep_obj = None
                    if not prep_str == '[]':
                        for prep in prep_str.replace('[', '').replace(']', '').split(';'):
                            if not prep.startswith('cat='):
                                preps.append({'arg': prep,
                                              'poss_args': possible_args(prep, pos, preps),
                                              'poss_positions': []})
                        if preps:
                            prep_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=preps[0]['arg'],
                                                                                         text_id=preps[0]['arg'],
                                                                                         prepnp=None)
                            prep_obj.possible_args.add(*preps[0]['poss_args'])
                    
                    args_ls = []
                    arg_str = ''
                    for tok in tokens:
                        arg_str += tok.strip() + ','
                        if (('(' in arg_str and ')' in arg_str) or
                            (not ('(' in arg_str) and not (')' in arg_str))):
                            args_ls.append(arg_str.strip().rstrip(','))
                            arg_str = ''
                    
                    skladnica_args_ls = []  
                    args_ls.sort()
                    for arg in args_ls:
                        possibilities = []
                        if arg != 'sie':
                            if prep_obj:
                                prep_text_rep = prep_obj.text_rep
                            else:
                                prep_text_rep = ''
                            possibilities = possible_args(arg, pos, preps)  
                            if arg == 'advp' and prep_text_rep:
                                arg_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=arg,
                                                                                            text_id=arg+'_'+prep_text_rep,
                                                                                            prepnp=prep_obj)
                            else:
                                arg_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=arg,
                                                                                            text_id=arg,
                                                                                            prepnp=None)  
                            arg_obj.possible_args.add(*possibilities)
                            skladnica_args_ls.append(arg_obj)                                                           #prepnp=prep_obj)
                    if 'subj' in args_ls:
                        property_obj = Old_Frame_Property.objects.get(name='V')
                    else:
                        property_obj = Old_Frame_Property.objects.get(name='Q')
                    if 'sie' in args_ls:
                        reflex = True
                    else:
                        reflex = False
                    
                    nkjp_example_obj = NKJP_Example(sentence=example_str, source=example_source_obj,
                                                          comment='', opinion=example_opinion_obj) # nowododany warunek wyszukiwania pole wyszukiwania
                    nkjp_example_obj.save()
                                                                               
                    skladnica_frame_obj = Old_Frame(property=property_obj, 
                                                    reflex=reflex, 
                                                    example=nkjp_example_obj,
                                                    pos_tag=pos, 
                                                    old_frame_value='+'.join(args_ls),
                                                    sent_id=sent_id_str)
                    skladnica_frame_obj.save()
                    skladnica_frame_obj.arguments.add(*skladnica_args_ls)
                    for lemma in lemmas.all():
                        lemma.skladnica_frames.add(skladnica_frame_obj)

            for lemma in Lemma.objects.filter(old=False, entry_obj__name=u'być'):
                mark_sub_frames(lemma)
                for old_lemma in Lemma.objects.filter(entry_obj__name=lemma.entry_obj.name, old=True):
                    old_lemma.skladnica_frames.clear()
                    old_lemma.skladnica_frames.add(*lemma.skladnica_frames.all())
        finally:                                                       
            f.close()
            nlemmafile.close()
            underfile.close()
  
    except IOError:                                                    
        return 'Error: Can not work on file %s, check if it exists!' % SKL_FRAMES_PATH