# load_initial_nverb_frames.py
#-*- coding:utf-8 -*-

#Copyright (c) 2014, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided 
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and 
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import itertools
from operator import itemgetter

from django.core.management.base import BaseCommand

#from dictionary.common_func import arg_data_to_arg, args_to_position, \
#                                   positions_to_frame
from dictionary.models import Argument, Argument_Model, Frame_Opinion, \
                              Frame_Opinion_Value, Lemma, positions_to_frame, \
                              get_or_create_position
                              

# Data files for the noun entries.  When passed to add_initial_frames:
#   *_FRAMES_PATH - input file with one tab-separated entry per line,
#   *_ADDED_PATH  - output file receiving the lines whose frames were added,
#   *_ERROR_PATH  - output file receiving the lines whose frames failed.
NOUNS_ADDED_PATH = 'data/nverbs/nouns/added-merged_nouns_val.txt'
NOUNS_ERROR_PATH = 'data/nverbs/nouns/error-merged_nouns_val.txt'
NOUNS_FRAMES_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt'

# The same three files for the adjective entries.
ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val.txt'
ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val.txt'
ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'

class Command(BaseCommand):
    """Management command that loads the initial adjective frames."""

    args = 'none'
    help = """
    Adds initial nverb frames.
    """

    def handle(self, **options):
        """Run the adjective import; the command options are not used."""
        # Only adjectives are loaded here.  The noun import is the same call
        # with NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH and the
        # 'noun' part-of-speech tag.
        add_initial_frames(frames_path=ADJS_FRAMES_PATH,
                           added_path=ADJS_ADDED_PATH,
                           error_path=ADJS_ERROR_PATH,
                           pos_tag='adj')
        
def add_initial_frames(frames_path, added_path, error_path, pos_tag):
    """Add the frames listed in a frames file to the matching lemmas.

    Every line of ``frames_path`` that is neither blank nor a ``%`` comment
    is parsed with ``get_frames_info``.  When a current (``old=False``) lemma
    with status ``u'do obróbki'`` and part-of-speech tag ``pos_tag`` exists
    for the entry, its frames are parsed and attached to it.

    Arguments:
        frames_path -- UTF-8 input file with the frame descriptions.
        added_path  -- UTF-8 output file; receives each line whose frames
                       were added successfully.
        error_path  -- UTF-8 output file; receives each line for which
                       ``parse_and_add_frames`` raised an exception.
        pos_tag     -- part-of-speech tag used to select the lemma.

    Entries without a matching lemma are skipped silently.  Both output
    files are truncated on every run.
    """
    # ``with`` closes exactly the files that were opened; the previous
    # try/finally raised NameError (hiding the real error) whenever one of
    # the open() calls failed.
    with codecs.open(added_path, "wt", 'utf-8') as added_file, \
            codecs.open(error_path, "wt", 'utf-8') as error_file, \
            codecs.open(frames_path, "rt", 'utf-8') as frames_file:
        for line in frames_file:
            line = line.strip()
            # Blank lines carry no entry and would make get_frames_info
            # fail; lines starting with '%' are comments.
            if not line or line.startswith('%'):
                continue
            lemma, frames_str, pred_val = get_frames_info(line)
            try:
                lemma_obj = Lemma.objects.get(entry=lemma, old=False,
                                              status__status=u'do obróbki',
                                              entry_obj__pos__tag=pos_tag)
            except Lemma.DoesNotExist:
                continue
            print(lemma_obj)
            try:
                parse_and_add_frames(lemma_obj, frames_str, pred_val)
            except Exception:
                # Best effort: record the failing line and carry on with the
                # next one.  KeyboardInterrupt/SystemExit are not swallowed.
                error_file.write(u'%s\n' % line)
            else:
                added_file.write(u'%s\n' % line)
        
def add_initial_frames_by_entries(entries, frames_path, added_path, error_path, pos_tag):
    """Add frames from a frames file, restricted to the given entries.

    Every line of ``frames_path`` that is neither blank nor a ``%`` comment
    is parsed with ``get_frames_info``; lines whose entry is not contained
    in ``entries`` are ignored.  For the remaining lines, when a current
    (``old=False``) lemma with status ``u'do obróbki'`` and part-of-speech
    tag ``pos_tag`` exists, its frames are parsed and attached to it.

    Arguments:
        entries     -- container of entry names to process (tested with
                       ``in``).
        frames_path -- UTF-8 input file with the frame descriptions.
        added_path  -- UTF-8 output file; receives each line whose frames
                       were added successfully.
        error_path  -- UTF-8 output file; receives each line for which
                       ``parse_and_add_frames`` raised an exception.
        pos_tag     -- part-of-speech tag used to select the lemma.

    Entries without a matching lemma are skipped silently.  Both output
    files are truncated on every run.
    """
    print('Adding initial frames!')
    # ``with`` closes exactly the files that were opened; the previous
    # try/finally raised NameError (hiding the real error) whenever one of
    # the open() calls failed.
    with codecs.open(added_path, "wt", 'utf-8') as added_file, \
            codecs.open(error_path, "wt", 'utf-8') as error_file, \
            codecs.open(frames_path, "rt", 'utf-8') as frames_file:
        for line in frames_file:
            line = line.strip()
            # Blank lines carry no entry and would make get_frames_info
            # fail; lines starting with '%' are comments.
            if not line or line.startswith('%'):
                continue
            lemma, frames_str, pred_val = get_frames_info(line)
            if lemma not in entries:
                continue
            try:
                lemma_obj = Lemma.objects.get(entry=lemma, old=False,
                                              status__status=u'do obróbki',
                                              entry_obj__pos__tag=pos_tag)
            except Lemma.DoesNotExist:
                continue
            print(lemma_obj)
            try:
                parse_and_add_frames(lemma_obj, frames_str, pred_val)
            except Exception:
                # Best effort: record the failing line and carry on with the
                # next one.  KeyboardInterrupt/SystemExit are not swallowed.
                error_file.write(u'%s\n' % line)
            else:
                added_file.write(u'%s\n' % line)
        
def get_frames_info(line):
    """Split one tab-separated line of a frames file into its fields.

    Returns a ``(lemma, frames_str, predicativity_val)`` tuple.  The first
    two columns are returned stripped of surrounding whitespace.  The
    predicativity value is ``'pred'`` only when the line has exactly three
    columns and the third one is exactly ``'PRED'``; otherwise it is ``''``.

    Raises IndexError when the line has fewer than two columns.
    """
    columns = line.split('\t')
    entry, valence = columns[0].strip(), columns[1].strip()
    is_predicative = len(columns) == 3 and columns[2] == 'PRED'
    return entry, valence, ('pred' if is_predicative else '')

def parse_and_add_frames(lemma_obj, frames_str, predicativity_val):
    """Create every frame described by ``frames_str`` and attach it to the lemma.

    ``frames_str`` lists positions separated by '+'; each position lists
    alternative arguments separated by '/'.  The alternatives of every
    position are coordinated with ``coordinate_arguments`` and one frame is
    created for each combination (cartesian product) of the results.  Each
    frame and its opinion are added to ``lemma_obj.frames`` and
    ``lemma_obj.frame_opinions``.
    """
    alternatives_per_position = [
        coordinate_arguments(position.strip().split('/'))
        for position in frames_str.split('+')
    ]
    for combination in itertools.product(*alternatives_per_position):
        # Drop repeated entries: without this, odd positions holding the same
        # argument several times were being created.
        unique_args = list(set(combination))
        frame_obj, frame_opinion_obj = create_frame(unique_args,
                                                    predicativity_val)
        lemma_obj.frames.add(frame_obj)
        lemma_obj.frame_opinions.add(frame_opinion_obj)
        
def coordinate_arguments(arguments):
    """Group alternative arguments that share case and preposition.

    Each argument with a non-empty case is appended (separated by ';') to
    the first earlier group with the same case and preposition; otherwise it
    starts a new group.  An argument of type 'ncp' additionally contributes
    ``np(<case>)`` to its group and one of type 'prepncp' contributes
    ``prepnp(<preposition>,<case>)``.

    Returns the list of group strings in order of first appearance.
    """
    groups = []
    for text_rep in arguments:
        # NOTE(review): the type is read from the raw text, while case and
        # preposition come from the converted text -- confirm that marker
        # characters never precede the type name.
        arg_type = arg_from_text_rep(text_rep)[0]
        case, preposition = get_arg_case_and_preposition(text_rep)

        # Arguments without a case are never merged into an existing group.
        target = None
        if case:
            for group in groups:
                if (group['case'] == case
                        and group['preposition'] == preposition):
                    target = group
                    break

        if target is None:
            target = {'argument': text_rep,
                      'case': case,
                      'preposition': preposition}
            groups.append(target)
        else:
            target['argument'] += ';%s' % text_rep

        extra = None
        if arg_type == 'ncp':
            extra = u'np(%s)' % case
        elif arg_type == 'prepncp':
            extra = u'prepnp(%s,%s)' % (preposition, case)
        if extra is not None:
            target['argument'] += ';%s' % extra

    return [group['argument'] for group in groups]

def arg_from_text_rep(argument):
    """Split an argument's text form into its type and attribute list.

    The type is the text before the first '('.  The attributes are taken
    from the text between the first and the second '(' (or the end of the
    string): trailing ')' characters and all apostrophes are removed and the
    remainder is split on ','.  Without any '(' the attribute list is empty.

    Returns an ``(arg_type, attributes)`` tuple.
    """
    pieces = argument.split('(')
    if len(pieces) == 1:
        return pieces[0], []
    raw_attributes = pieces[1].rstrip(')')
    raw_attributes = raw_attributes.replace("'", "")
    return pieces[0], raw_attributes.split(',')

def get_arg_case_and_preposition(argument):
    """Return the ``(case, preposition)`` pair of an argument's text form.

    The text is normalised with ``arg_conversion`` and split with
    ``arg_from_text_rep``.  The attribute values are then paired, in
    'priority' order, with the attribute models of the matching
    ``Argument_Model``.  The value paired with u'PRZYPADEK' is the case and
    the one paired with u'PRZYIMEK' is the preposition; either is ``''``
    when no such attribute is present.
    """
    arg_type, attributes = arg_from_text_rep(arg_conversion(argument))
    model = Argument_Model.objects.get(arg_model_name=arg_type)
    found = {u'PRZYPADEK': '', u'PRZYIMEK': ''}
    ordered_attr_models = model.atribute_models.order_by('priority')
    for attr_model, value in zip(ordered_attr_models, attributes):
        name = attr_model.atr_model_name
        if name in found:
            found[name] = value
    return found[u'PRZYPADEK'], found[u'PRZYIMEK']
        
def arg_conversion(arg_text_rep):
    """Normalise an argument's text form taken from a frames file.

    All '!', '*' and '?' characters are removed.  After that, a bare 'advp'
    becomes u'xp(_)', and a text starting with 'comprepnp' loses its
    apostrophes and every ',gen' substring.  Any other text is returned
    with only the marker characters removed.
    """
    cleaned = arg_text_rep
    for marker in ('!', '*', '?'):
        cleaned = cleaned.replace(marker, '')
    if cleaned == 'advp':
        return u'xp(_)'
    if cleaned.startswith('comprepnp'):
        return cleaned.replace("'", "").replace(',gen', '')
    return cleaned

def create_frame(frame_args, predicativity_val):
    """Build the frame for the given position strings together with its opinion.

    The positions and the opinion value come from ``get_positions``.  The
    frame is built with ``positions_to_frame`` using empty reflex,
    negativity and aspect values and the given predicativity.  The
    ``Frame_Opinion`` linking the frame to the opinion value is fetched or
    created.

    Returns a ``(frame_obj, frame_opinion_obj)`` tuple.
    """
    positions, opinion_value = get_positions(frame_args)
    frame = positions_to_frame(positions,
                               reflex='',
                               negativity='',
                               predicativity=predicativity_val,
                               aspect='')
    # get_or_create returns (object, created); only the object is needed.
    opinion = Frame_Opinion.objects.get_or_create(frame=frame,
                                                  value=opinion_value)[0]
    return frame, opinion

def get_positions(args_strs):
    """Create the positions of a frame and select the frame's opinion value.

    Every string in ``args_strs`` is turned into a position with
    ``create_position`` and rated with ``possible_frame_opinion``.  The
    rating with the lowest 'priority' decides the opinion of the whole
    frame.

    Returns a ``(positions, frame_opinion_value)`` tuple.  Raises IndexError
    when ``args_strs`` is empty.
    """
    positions = []
    opinions = []
    for position_text in args_strs:
        opinions.append(possible_frame_opinion(position_text))
        positions.append(create_position(position_text))
    # Stable ascending sort: the first element is the lowest priority.
    opinions.sort(key=itemgetter('priority'))
    deciding = opinions[0]
    opinion_value = Frame_Opinion_Value.objects.get(value=deciding['opinion'])
    return positions, opinion_value

def possible_frame_opinion(arg_str):
    """Rate a position string by the marker characters it contains.

    Returns a dict with the keys 'opinion' and 'priority'.  The markers are
    checked in the order '!', '?', '*' and the first one found decides the
    result; a string without any marker is rated 'pewny' with priority '4'.
    """
    marked_opinions = (
        ('!', u'zły', '1'),
        ('?', u'wątpliwy', '2'),
        ('*', u'archaiczny', '3'),
    )
    for marker, opinion, priority in marked_opinions:
        if marker in arg_str:
            return {'opinion': opinion,
                    'priority': priority}
    return {'opinion': 'pewny',
            'priority': '4'}

def create_position(args_str):
    """Return the position made of the ';'-separated arguments in ``args_str``.

    Each part is normalised with ``arg_conversion`` and looked up by its
    ``text_rep`` among the existing ``Argument`` objects.  The position is
    then obtained from ``get_or_create_position`` with no categories.
    """
    # Arguments missing from the database are not created here, so the
    # lookup error from Argument.objects.get propagates to the caller.
    # TODO: the fallback that built them with arg_data_to_arg is switched
    # off for the initial loading of entries.
    arguments = [Argument.objects.get(text_rep=arg_conversion(part))
                 for part in args_str.split(';')]
    return get_or_create_position(categories=[], arguments=arguments)