# match_frames.py 8.47 KB
#! /usr/bin/python3

from frame import Frame, SelectionalPreference
from transform_frame import TransformationRules
from hungarian import hungarian_algorithm
from collections import defaultdict
import numpy as np
from copy import copy

def gen_array(l1, l2, s1, s2, selprefs_table):
    """Build the cost matrix for matching the preferences in l1 against l2.

    The entry for a pair (preference from l1, preference from l2) is
    -log(similarity), or a large sentinel cost when the similarity is 0.
    After the real rows, one sentinel-only row is appended for every value
    in range(s1, s2).

    Parameters:
        l1: preferences that become the leading rows.
        l2: preferences that become the columns.
        s1, s2: bounds of the padding-row range (the caller in this file
            passes len(l1) and len(l2)).
        selprefs_table: handed unchanged to SelectionalPreference.similarity.

    Returns:
        A numpy array built from the list of rows.
    """
    penalty = 10.0**6

    def cost(first, second):
        similarity = SelectionalPreference.similarity(first, second, selprefs_table)
        # log(0) is undefined, so a zero similarity gets the sentinel cost
        return penalty if similarity == 0 else -np.log(similarity)

    rows = [[cost(first, second) for second in l2] for first in l1]
    # padding rows: every column costs the sentinel
    rows.extend([penalty for _ in l2] for _ in range(s1, s2))
    return np.array(rows)
            

# Per-label values of -log(x): x is 0.8 for the labels listed below, 1.0 for
# 'Lemma', and 0.4 for any label that is not listed (defaultdict fallback).
# NOTE(review): not referenced in the visible part of this file -- presumably
# a per-label similarity cut-off used elsewhere; confirm.
cut_value = defaultdict(
    lambda: -np.log(0.4),
    [
        (u'Time', -np.log(0.8)),
        (u'Location', -np.log(0.8)),
        (u'Path', -np.log(0.8)),
        (u'Attribute', -np.log(0.8)),
        (u'Measure', -np.log(0.8)),
        (u'Lemma', -np.log(1.0)),
    ],
)

# Rank of a role label: 1 for the labels listed below, 0 for every other
# label (defaultdict fallback). find_max_arg_matching_value uses the rank as
# the index of the "missing arguments" counter to increment.
arg_rank = defaultdict(
    int,
    [
        (u'Time', 1),
        (u'Location', 1),
        (u'Path', 1),
        (u'Attribute', 1),
        (u'Measure', 1),
        (u'Lemma', 1),
    ],
)


def find_max_arg_matching_value(label, arglist1, arglist2, selprefs_table):
    """Match two argument lists of one role label and total the matched cost.

    The shorter list supplies the rows of the cost matrix built by gen_array
    and the longer one the columns; gen_array pads the matrix with extra rows
    so that it is square. The matrix is handed to hungarian_algorithm
    (presumably a minimum-cost assignment -- confirm against the hungarian
    module), which yields (row, column) pairs.

    Parameters:
        label: role label; selects which "missing" counter is incremented.
        arglist1, arglist2: the selectional preferences to match.
        selprefs_table: passed through to gen_array.

    Returns:
        (cost, missing): the sum of matrix entries over pairs whose row is a
        real (non-padding) row, and a 3-tuple of counters in which the slot
        arg_rank[label] counts the pairs that landed on a padding row.
    """
    shorter, longer = arglist1, arglist2
    if len(arglist1) > len(arglist2):
        shorter, longer = arglist2, arglist1
    real_rows = len(shorter)

    costs = gen_array(shorter, longer, real_rows, len(longer), selprefs_table)
    # the solver gets a copy so the costs can still be read back afterwards
    assignment = hungarian_algorithm(costs.copy())

    total = 0
    unmatched = [0, 0, 0]
    for row, column in assignment:
        if row >= real_rows:
            # padding row: this column has no real partner
            unmatched[arg_rank[label]] += 1
            continue
        total += costs[row][column]
    return total, tuple(unmatched)

def match_undefined_preferences_and_max_match_the_rest(label, arglist1, arglist2, selprefs_table):
    """Pair off flagged preferences, then max-match whatever is left.

    A preference is "flagged" when its ``_content[0]`` is truthy (presumably
    this marks an undefined / match-anything preference -- confirm against
    SelectionalPreference). Flagged preferences of the two lists are paired
    one-to-one, starting from the highest indexes, and removed from working
    copies of the lists; the input lists themselves are not modified. The
    remaining preferences are passed to find_max_arg_matching_value.

    Parameters:
        label: role label, forwarded to find_max_arg_matching_value.
        arglist1, arglist2: lists of selectional preferences.
        selprefs_table: forwarded to find_max_arg_matching_value.

    Returns:
        (cost, missing) in the same form as find_max_arg_matching_value.
        When nothing is left after pairing, the result is (0.0, (0, 0, 0)).
        The cost is summed by the caller as a -log value, and pairs removed
        here add nothing in the partial case either, so a complete pairing
        must cost 0.0 (it used to return 1.0, which penalised it by e^-1).
    """
    remaining1 = copy(arglist1)
    remaining2 = copy(arglist2)
    flagged1 = [index for index, sp in enumerate(remaining1) if sp._content[0]]
    flagged2 = [index for index, sp in enumerate(remaining2) if sp._content[0]]

    # Delete from the highest index down so that earlier deletions do not
    # shift the positions that are still to be removed. zip stops at the
    # shorter list, so surplus flagged preferences stay in their list.
    for i, j in zip(reversed(flagged1), reversed(flagged2)):
        del remaining1[i]
        del remaining2[j]

    if not remaining1 and not remaining2:
        return 0.0, (0, 0, 0)
    return find_max_arg_matching_value(label, remaining1, remaining2, selprefs_table)


# Penalty, expressed as -log of a factor, looked up by the weighted count of
# missing arguments that find_matching_value computes. Counts that are not
# listed fall back to -log(0.1).
misses_coefficient = defaultdict(
    lambda: -np.log(0.1),
    [
        (0.0, -np.log(1.0)),
        (1.0/3, -np.log(0.99)),
        (2.0/3, -np.log(0.97)),
        (1.0, -np.log(0.95)),
        (4.0/3, -np.log(0.92)),
        (5.0/3, -np.log(0.9)),
        (2.0, -np.log(0.5)),
    ],
)

def find_matching_value(frame1, frame2, selprefs_table):
    """Return the similarity of two frames as exp(-total cost).

    For every role label that occurs in either frame (visited in sorted
    order) the arguments of both frames are matched with
    match_undefined_preferences_and_max_match_the_rest. The per-label costs
    are summed, a penalty from misses_coefficient is added for the missing
    arguments, and the total is mapped back through exp(-x).

    Parameters:
        frame1, frame2: objects providing get_role_labels() and
            get_arguments(label).
        selprefs_table: forwarded to the per-label matching.

    Returns:
        np.exp(-(sum of label costs + missing-argument penalty)).
    """
    labels = set(frame1.get_role_labels()) | set(frame2.get_role_labels())
    total_cost = 0
    missing0 = 0
    missing1 = 0
    for label in sorted(labels):
        # the third counter is returned by the matcher but not used here
        cost, (m0, m1, _) = match_undefined_preferences_and_max_match_the_rest(
            label,
            frame1.get_arguments(label),
            frame2.get_arguments(label),
            selprefs_table,
        )
        total_cost += cost
        missing0 += m0
        missing1 += m1
    # A rank-0 miss weighs 1 and a rank-1 miss weighs 1/3. Count in whole
    # thirds and divide once, so the key is exactly k / 3.0 like the keys of
    # misses_coefficient. Adding the floats instead breaks the lookup:
    # 1 + 2.0/3 == 1.6666666666666665, which is not 5.0/3, so that case fell
    # through to the default penalty.
    weighted_missing = (3 * missing0 + missing1) / 3.0
    return np.exp(-(total_cost + misses_coefficient[weighted_missing]))

def match_transformed_frames(frame1, frame2, rule, selprefs_table):
    """Return the similarity of two transformed frames, scaled by the rule.

    The value of find_matching_value(frame1, frame2, selprefs_table) is
    multiplied by TransformationRules.get_weight(rule).
    """
    similarity = find_matching_value(frame1, frame2, selprefs_table)
    return similarity * TransformationRules.get_weight(rule)

def match_frames(frames_list_1, frames_list_2, session, TT_dict, verbose=False, fake=False):
    """Compare every frame of frames_list_1 with every frame of frames_list_2.

    For each rule from TransformationRules.get_rules(), both lists are
    transformed with rule.apply, every pair of frame ids is passed to
    calculate_and_store, and the collected objects are saved through
    `session` unless `fake` is set.

    Parameters:
        frames_list_1, frames_list_2: frames to compare (each has `_id`).
        session: object providing bulk_save_objects() and commit() (looks
            like a SQLAlchemy session -- confirm).
        TT_dict: maps str(rule) to the class used to build result objects.
        verbose: forwarded to calculate_and_store.
        fake: when true, nothing is saved or committed.
    """
    rules = TransformationRules.get_rules()

    # cache of selectional-preference similarities, shared by all comparisons
    selprefs_table = {}

    # header similarities are computed once, before any rule is applied
    headers_similarity_table = headers_similarity(frames_list_1, frames_list_2)

    for rule in rules:
        TT_objects = []
        TransformationTable = TT_dict[str(rule)]

        # frame id -> {signature: transformed frame}
        transformed_1 = {
            frame._id: {signature: variant for signature, variant in rule.apply(frame)}
            for frame in frames_list_1
        }
        transformed_2 = {
            frame._id: {signature: variant for signature, variant in rule.apply(frame)}
            for frame in frames_list_2
        }

        for frame_id_1 in transformed_1:
            for frame_id_2 in transformed_2:
                calculate_and_store(
                    rule, frame_id_1, transformed_1, frame_id_2, transformed_2,
                    headers_similarity_table, selprefs_table,
                    TransformationTable, TT_objects, verbose)

        if fake:
            continue
        session.bulk_save_objects(TT_objects)
        session.commit()
                
                        
def match_frames_diagonal(frames_list_1, session, TT_dict, verbose=False, fake=False):
    """Compare the frames of one list with each other.

    For each rule from TransformationRules.get_rules(), every frame is
    transformed with rule.apply, and calculate_and_store is called for each
    index pair (i, j) with j < i, so a frame is never paired with itself and
    each unordered pair is visited once. The collected objects are saved
    through `session` unless `fake` is set.

    Parameters:
        frames_list_1: indexable sequence of frames (each has `_id`).
        session: object providing bulk_save_objects() and commit() (looks
            like a SQLAlchemy session -- confirm).
        TT_dict: maps str(rule) to the class used to build result objects.
        verbose: forwarded to calculate_and_store.
        fake: when true, nothing is saved or committed.
    """
    rules = TransformationRules.get_rules()

    # cache of selectional-preference similarities, shared by all comparisons
    selprefs_table = {}

    # header similarities are computed once, before any rule is applied
    headers_similarity_table = headers_similarity(frames_list_1, frames_list_1)

    for rule in rules:
        TT_objects = []
        TransformationTable = TT_dict[str(rule)]

        # two separate maps with equal content: frame id -> {signature: frame}
        transformed_1 = {}
        transformed_2 = {}
        for frame in frames_list_1:
            by_signature = {signature: variant for signature, variant in rule.apply(frame)}
            transformed_1[frame._id] = by_signature
            transformed_2[frame._id] = dict(by_signature)

        count = len(frames_list_1)
        # i == 0 has no j < i, so the outer loop can start at 1
        for i in range(1, count):
            frame_id_1 = frames_list_1[i]._id
            for j in range(i):
                calculate_and_store(
                    rule, frame_id_1, transformed_1,
                    frames_list_1[j]._id, transformed_2,
                    headers_similarity_table, selprefs_table,
                    TransformationTable, TT_objects, verbose)

        if fake:
            continue
        session.bulk_save_objects(TT_objects)
        session.commit()
    

def headers_similarity(frames_list_1, frames_list_2):
    """Fill and return the table of header similarities for all frame pairs.

    For every pair (frame from frames_list_1, frame from frames_list_2) that
    compares unequal, the first frame's lexical_closeness method is called
    with the second frame and the table (presumably it records its result in
    the table -- confirm in Frame). For a pair that compares equal, the entry
    keyed by (frame, frame) is set to 0.0.
    """
    table = {}
    for left in frames_list_1:
        for right in frames_list_2:
            if left != right:
                left.lexical_closeness(right, table)
                continue
            table[(left, left)] = 0.0
    return table
                
    
def calculate_and_store(rule, frame_id_1, transformed_frames_list_1, frame_id_2, transformed_frames_list_2, headers_similarity_table, selprefs_table, TransformationTable, TT_objects, verbose):
    """Score all transformed variants of two frames and collect good matches.

    Nothing is done when Frame.far(frame_id_1, frame_id_2,
    headers_similarity_table) is true. Otherwise every variant of the first
    frame is compared with every variant of the second one using
    match_transformed_frames, and for each similarity of at least 0.3 a
    TransformationTable object is appended to TT_objects.

    Parameters:
        rule: transformation rule, forwarded to match_transformed_frames.
        frame_id_1, frame_id_2: ids of the frames to compare.
        transformed_frames_list_1, transformed_frames_list_2: mappings
            frame id -> {signature: transformed frame}.
        headers_similarity_table: forwarded to Frame.far.
        selprefs_table: forwarded to match_transformed_frames.
        TransformationTable: callable building a result object from
            (frame 1 id, signature 1, frame 2 id, signature 2, similarity).
        TT_objects: list that receives the result objects (mutated in place).
        verbose: when true, the frames, signatures and similarity are printed.
    """
    if Frame.far(frame_id_1, frame_id_2, headers_similarity_table):
        return
    for signature_1, frame_1 in transformed_frames_list_1[frame_id_1].items():
        for signature_2, frame_2 in transformed_frames_list_2[frame_id_2].items():
            # print(x) with one argument works on Python 2 and 3 alike; the
            # former print statements were a SyntaxError under the python3
            # shebang and kept the whole module from importing.
            if verbose:
                print(frame_1)
                print(signature_1)
                print(frame_2)
                print(signature_2)
            sim = match_transformed_frames(frame_1, frame_2, rule, selprefs_table)
            if verbose:
                print(sim)
                print("=" * 30)
            # only matches at or above the 0.3 threshold are kept
            if sim >= .3:
                tt = TransformationTable(frame_1._id, signature_1, frame_2._id, signature_2, sim)
                TT_objects.append(tt)
                

                            
# Script entry point.
# NOTE(review): no `test` function is defined or imported in the visible part
# of this file; unless it is provided elsewhere, running the module as a
# script raises NameError here -- confirm.
if __name__ == '__main__':
    test()