# tools.py
import random
import numpy
from keras.preprocessing import sequence as seq


def shuffle(lol, seed):
    '''
    Shuffle every list in *lol* in place, applying the identical
    permutation to each of them.

    lol :: list of lists to shuffle
    seed :: value used to seed the RNG before each shuffle
    '''
    for current in lol:
        # reseed before every shuffle so all lists get the same permutation
        random.seed(seed)
        random.shuffle(current)

def minibatch(l, bs):
    '''
    Build one "history" minibatch per element of a sequence.

    l :: list of word idxs
    bs :: maximum minibatch (context window) size

    Returns a list with exactly len(l) minibatches: the i-th batch holds
    the up-to-*bs* elements of *l* ending at position i (shorter at the
    start of the sequence).
    eg: [0,1,2,3] and bs = 3
    will output:
    [[0],[0,1],[0,1,2],[1,2,3]]
    '''
    # growing prefixes until the window is full ...
    # (range instead of Python-2-only xrange: works on both 2 and 3)
    out = [l[:i] for i in range(1, min(bs, len(l) + 1))]
    # ... then a sliding window of exactly bs elements
    out += [l[i - bs:i] for i in range(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out



def words_in_from_down_to_top_order(sentence_tree):
    '''
    Order the words of a dependency tree bottom-up (deepest words first).

    sentence_tree :: (n_words, max_children) array; row w holds the word
                     indexes of w's children, padded with -1.

    Returns (ordered_words, order) where ordered_words lists word indexes
    deepest-first and order maps word index -> its position in
    ordered_words.  Returns (None, None) for malformed structures
    (zero or several roots, cycles, unreachable words, multiple parents),
    so callers can drop such sentences.
    '''
    n = len(sentence_tree)
    # the root is the one word that is nobody's child
    roots = numpy.setdiff1d(range(n), numpy.unique(sentence_tree))
    if len(roots) != 1:
        # zero roots or several roots: not a single tree
        return None, None
    levels = roots.tolist()

    # breadth-first traversal from the root; setdiff1d(..., -1) drops padding
    for i in range(n):
        if i >= len(levels):
            # ran out of reachable words: cycle / disconnected -> malformed
            # (the original code raised IndexError here instead of signalling)
            return None, None
        levels.extend(numpy.setdiff1d(sentence_tree[levels[i]], -1))
    if len(levels) != n:
        # some word was reached through two parents: not a tree
        return None, None

    # reverse so the deepest words come first; levels never contains -1
    # (setdiff1d already removed the padding), so no filtering is needed
    ordered_words = numpy.array(levels)[::-1]

    order = numpy.zeros(n, dtype='int')
    for pos, w in enumerate(ordered_words):
        order[w] = pos

    return ordered_words, order





def load_conll_data(conll_format_data, words2ids):
    '''
    Load a dependency-parsed corpus from a CoNLL-style file.

    conll_format_data :: path to the file.  Sentences are separated by
        blank lines; each token line is space-separated with the word form
        in column 1, the 1-based head index in column 6 (0 marks the root)
        and a sentiment marker ('S') in the last column.
    words2ids :: dict mapping word forms to integer ids.

    Returns a list of sentences, each being
        [word ids in bottom-up tree order,
         word ids of every word's children (-1 padded),
         bottom-up positions of every word's children (-1 padded),
         y — sentiment label of the root word]
    Sentences whose dependency structure is not a single-rooted tree are
    skipped.  NOTE(review): a final sentence not followed by a blank line
    at EOF is silently dropped — confirm the data files end with one.
    '''
    sentences = []

    k = 0        # 0-based index of the current token within the sentence
    y = None     # sentence label; set when the root token (head == 0) is seen
    with open(conll_format_data) as fr:
        s = []   # per-token records: [token idx, head idx, sentiment, form]
        for line in fr:
            if len(line) < 2:            # blank line -> end of sentence
                if not s:                # tolerate consecutive blank lines
                    k = 0
                    y = None
                    continue
                # append to each token the list of its children's indexes
                for i in range(len(s)):
                    children = []
                    for j in range(len(s)):
                        if s[j][1] == i + 1:   # heads are 1-based
                            children.append(s[j][0])
                    s[i].append(children)

                children = seq.pad_sequences([x[4] for x in s],
                                             padding='post', value=-1)
                tokens = [x[3] for x in s]
                if len(s) == 1:
                    # single-token sentence: trivial tree structure
                    sentences.append([
                        numpy.array([words2ids[tokens[0]]]),
                        numpy.array([-1], ndmin=2),
                        numpy.array([-1], ndmin=2),
                        y,
                    ])
                else:
                    ordered_words, order = words_in_from_down_to_top_order(children)
                    if ordered_words is None:
                        # 0 or >1 roots: malformed sentence, drop it
                        s = []
                        k = 0
                        y = None
                        continue

                    sentences.append([
                        numpy.array([words2ids[x] for x in tokens])[ordered_words],
                        numpy.array([[words2ids[tokens[w]] if w >= 0 else -1
                                      for w in x] for x in children[ordered_words]]),
                        numpy.array([[order[w] if w >= 0 else -1
                                      for w in x] for x in children[ordered_words]]),
                        y,
                    ])
                s = []
                k = 0
                y = None   # reset so a rootless sentence can't reuse a stale label

            else:
                toks = line.split(' ')
                token = toks[1]
                if isinstance(token, bytes):   # Python 2 raw str -> unicode
                    token = token.decode('utf8')
                parent = int(toks[6])
                sentiment = int(toks[-1] == 'S' or toks[-1] == 'S\n')
                if parent == 0:   # head 0 marks the root of the sentence
                    y = sentiment
                s.append([k, parent, sentiment, token])
                k = k + 1
    return sentences