tools.py 24.1 KB

Edit Raw Blame History

import random
import numpy
from keras.preprocessing import sequence as seq
import theano

import pickle

def shuffle(lol, seed):
    '''
    Shuffle every list of ``lol`` in place, applying the same seed to each.

    lol :: list of lists to shuffle
    seed :: value used to re-seed the module-level ``random`` generator
            right before each individual list is shuffled

    Because the generator is re-seeded before every shuffle, lists of equal
    length end up permuted in the same order. The global ``random`` state is
    overwritten as a side effect.
    '''
    reseed, permute = random.seed, random.shuffle
    for items in lol:
        reseed(seed)
        permute(items)

def minibatch(l, bs):
    '''
    Build one minibatch per element of ``l``.

    l :: list of word idxs
    bs :: maximum minibatch size, must be >= 1

    Returns a list with ``len(l)`` minibatches. The i-th minibatch ends at
    element i and holds at most ``bs`` elements, so the first ones are
    shorter prefixes, e.g. [0,1,2,3] with bs = 3 gives:
    [[0],[0,1],[0,1,2],[1,2,3]]

    Raises ValueError when ``bs`` is smaller than 1.
    '''
    if bs < 1:
        # A non-positive size would silently produce empty/garbled windows.
        raise ValueError("bs must be a positive integer, got %r" % (bs,))

    n = len(l)
    # Growing prefixes, used until a full window of size ``bs`` is available.
    out = [l[:end] for end in range(1, min(bs, n + 1))]
    # Full sliding windows of exactly ``bs`` elements.
    out += [l[end - bs:end] for end in range(bs, n + 1)]
    return out


def filter_embeddings(datasets, embedding_path, destination):
    '''
    Reduce a set of embeddings to the words that occur in our data.

    The result is pickled to ``destination`` as a dict with two entries:
    "vectors"   - numpy matrix with one row per kept word; the last row of
                  the matrix is an additional all-zero vector,
    "words2ids" - dict mapping each kept word to its row index.

    datasets - list of paths to the data files that will be used in the
               analysis; every whitespace-separated token counts as a word
    embedding_path - text file with all embeddings, one per line: the word
                     followed by its space-separated float components
    destination - path of the pickle file to write

    Raises ValueError when no word of the datasets has an embedding, because
    the width of the zero vector cannot be determined in that case.
    '''

    words = set()
    for dataset in datasets:
        with open(dataset) as f:
            words.update(f.read().split())

    words2ids = {}
    vectors = []
    with open(embedding_path, "r") as f:
        for line in f:
            toks = line.strip("\n").split(" ")
            word = toks[0]
            if word in words:
                # The row index of a word is the number of rows stored so far.
                words2ids[word] = len(vectors)
                # Skip empty fields produced by trailing/double spaces.
                vectors.append([float(t) for t in toks[1:] if t])

    if not vectors:
        raise ValueError("no word from the datasets was found in %r" % (embedding_path,))

    vectors.append(numpy.zeros(len(vectors[0])))
    vectors = numpy.array(vectors)
    print(vectors.shape)

    # Pickles are binary data, so the file has to be opened in binary mode.
    with open(destination, "wb") as out:
        pickle.dump({"vectors": vectors, "words2ids": words2ids}, out)


def words_in_from_down_to_top_order(sentence_tree):
    '''
    Order the words of a tree so that every word comes after its children.

    sentence_tree - children matrix: row i lists the indices of the children
                    of word i, padded with -1

    Returns ``(ordered_words, order)``:
    ordered_words - array of word indices, deepest words first and the
                    root(s) last
    order - array where ``order[w]`` is the position of word ``w`` inside
            ``ordered_words``

    Returns ``(None, None)`` when the tree is unusable, i.e. when it has no
    root at all or when some words cannot be reached from any root (for
    example because of a cycle). Callers use this to drop the sentence.
    Several roots are allowed because batches merge several sentences.
    '''
    n_words = len(sentence_tree)

    # Words that are nobody's child are the root(s) of the phrase(s).
    roots = numpy.setdiff1d(numpy.arange(n_words), numpy.unique(sentence_tree))
    if len(roots) == 0:
        return None, None

    # Breadth-first walk: visit the i-th collected word and queue its children.
    levels = roots.tolist()
    for i in range(n_words):
        if i >= len(levels):
            # Fewer reachable words than rows: part of the tree is detached
            # from every root, so the sentence is malformed.
            return None, None
        levels.extend(numpy.setdiff1d(sentence_tree[levels[i]], -1).tolist())

    # Reverse the walk so that the deepest words come first.
    ordered_words = numpy.array(levels)
    ordered_words = ordered_words[ordered_words != -1][::-1]

    order = numpy.zeros(n_words, dtype='int')
    for i in range(n_words):
        order[ordered_words[i]] = i

    return ordered_words, order


def _conll_build_example(s, y, words2ids):
    # Turn one parsed sentence (list of [position, parent, sentiment, token])
    # into an example, or return None when the sentence has to be skipped.
    if not s or y is None:
        # Nothing collected (e.g. two blank lines in a row) or no root token
        # that could provide the sentence label.
        return None

    tokens = [entry[3] for entry in s]

    if len(s) == 1:
        return [
            numpy.array([words2ids[tokens[0]]]),
            numpy.array([-1], ndmin=2),
            numpy.array([-1], ndmin=2),
            y,
        ]

    # Parents are 1-based; a parent outside 1..len(s) leaves the word a root.
    child_lists = [[] for _ in s]
    for position, parent, _sentiment, _token in s:
        if 1 <= parent <= len(s):
            child_lists[parent - 1].append(position)
    children = seq.pad_sequences(child_lists, padding='post', value=-1)

    ordered_words, order = words_in_from_down_to_top_order(children)
    if ordered_words is None:
        # The tree is considered invalid, so the sentence is left out.
        return None

    reordered = children[ordered_words]
    return [
        numpy.array([words2ids[x] for x in tokens])[ordered_words],
        numpy.array([[words2ids[tokens[w]] if w >= 0 else -1 for w in row] for row in reordered]),
        numpy.array([[order[w] if w >= 0 else -1 for w in row] for row in reordered]),
        y,
    ]


def load_conll_data(conll_format_data, words2ids):
    '''
    Load sentences from a space-separated CoNLL-style file.

    Every non-blank line describes one token and blank lines separate the
    sentences. Of each line the following fields are used: field 1 (0-based)
    is the token, field 6 is the 1-based index of the parent word (0 for the
    root of the phrase) and the last field is "S" when the token carries
    sentiment. The label of a sentence is the sentiment flag of its root.

    conll_format_data - path to the input file
    words2ids - dict mapping a token to its word id; every token of the file
                must be present, otherwise KeyError is raised

    Returns a list with one element per sentence, each being a list of:
    0. vector of word ids,
    1. children matrix holding word ids, padded with -1,
    2. children matrix holding positions, padded with -1,
    3. sentence label (0 or 1).
    Words are reordered so that children always precede their parent.
    Sentences without a root token or with an invalid tree are skipped. The
    last sentence is kept even if the file does not end with a blank line.
    '''
    sentences = []
    s = []
    y = None  # label of the sentence being read; set by its root token

    with open(conll_format_data) as fr:
        for line in fr:
            if len(line) < 2 or not line.strip():
                # Sentence boundary: emit what was collected and start over.
                example = _conll_build_example(s, y, words2ids)
                if example is not None:
                    sentences.append(example)
                s = []
                y = None
                continue

            toks = line.split(' ')
            token = toks[1]
            if isinstance(token, bytes):
                # Python 2 reads byte strings; Python 3 already yields text.
                token = token.decode('utf8')
            parent = int(toks[6])
            sentiment = int(toks[-1] in ('S', 'S\n'))
            if parent == 0:  # this word is the root of the phrase
                y = sentiment
            s.append([len(s), parent, sentiment, token])

    # Flush the final sentence when the file has no trailing blank line.
    example = _conll_build_example(s, y, words2ids)
    if example is not None:
        sentences.append(example)

    return sentences


def extract_phrases_from_sentence(tokens_ids,childres_ids,children_positions,phrase_labels):
    '''
    Extract the sub-phrase rooted at every word except the last one.

    tokens_ids - numpy vector of word ids
    childres_ids - unused, kept only for the existing call signature
    children_positions - numpy matrix; row i holds the positions of the
                         children of word i, padded with -1
    phrase_labels - sequence indexed by word position, giving the label of
                    the phrase rooted at that word

    The phrase starting at the last word is the whole sentence, which the
    caller adds on its own, so it is skipped here.

    Returns a list with one element per phrase, each being a list of:
    0. vector of word ids of the phrase,
    1. children matrix holding word ids, padded with -1,
    2. children matrix holding positions inside the phrase, padded with -1,
    3. label of the phrase.
    Inside a phrase the words are ordered from the leaves up to the root.
    '''
    n_words = children_positions.shape[0]
    all_phrases = []

    for root in range(len(tokens_ids) - 1):

        # Walk the subtree below 'root' breadth-first.
        nodes = [root]
        for i in range(n_words):
            if i >= len(nodes):
                break  # every collected node has been expanded
            row = children_positions[nodes[i]]
            nodes.extend(row[row >= 0].tolist())

        # Reverse so that the phrase is later traversed from the leaves to
        # the root, which is what the recursive network needs.
        nodes = nodes[::-1]

        # Words changed position relative to the input sentence, so the tree
        # structure has to be re-expressed in phrase-local positions.
        new_positions = numpy.zeros(n_words, dtype=int) - 1
        for i, node in enumerate(nodes):
            new_positions[node] = i

        if len(nodes) == 1:  # the phrase is a single leaf
            children = numpy.array([[-1]])
        else:
            children = children_positions[nodes]
            # Drop the columns that contain nothing but padding.
            children = children[:, numpy.max(children, 0) >= 0]
            children = numpy.where(children == -1, -1, new_positions[children]).astype(children.dtype)

        phrase_tokens_ids = tokens_ids[numpy.array(nodes)]
        phrase_children_positions = children

        # Same matrix, but with token ids in place of positions.
        phrase_children_ids = numpy.where(children == -1, -1, phrase_tokens_ids[children]).astype(children.dtype)

        all_phrases.append([phrase_tokens_ids,
                            phrase_children_ids,
                            phrase_children_positions,
                            phrase_labels[root]])

    return all_phrases


def load_stanford_data(labels, parents, tokens, words2ids):

    '''
    Load data given as dependency trees and return every phrase in it.

    labels - path to the label file; one line is the label vector of one
             sentence, the i-th label belonging to the sub-phrase rooted at
             the i-th word
    parents - path to the file with the tree structures; one line is one
              sentence and the consecutive numbers are the 1-based index of
              the parent of each word (a value matching no word, such as 0,
              makes the word a root)
    tokens - path to the token file; one line is one sentence, tokens are
             separated by whitespace
    words2ids - dict: key is a token, value is the word id, i.e. its index
                in the embedding matrix; unknown tokens get the id -1

    Raw labels are mapped to three classes: negative values -> 0,
    "#" or 0 -> 1, positive values -> 2.

    Returns all phrases that exist in the given sentences (whole sentences
    included) as a list - one element is one phrase. An element consists of:
    0. vector of word ids;
    1. children matrix with word ids - the i-th row holds the ids of the
       children of the i-th word, padded with -1;
    2. children matrix with positions - as above, but holding the index of
       the child in the token sequence instead of its id;
    3. label of the phrase.

    Words are sorted so that a recursive network walking the tree can simply
    go from left to right: children always precede their parent, so the
    values needed by the recursion are already computed at every step.
    Empty lines and sentences with an invalid tree are skipped.
    '''

    def transform_labels(x):
        if x == '#' or int(x) == 0:
            return 1
        return 0 if int(x) < 0 else 2

    with open(labels, "r") as l:
        all_labels = [[transform_labels(y) for y in x.split()] for x in l]

    with open(parents, "r") as p:
        all_parents = [[int(y) for y in x.split()] for x in p]

    with open(tokens, "r") as t:
        all_tokens = [x.split() for x in t]

    sentences = []

    for labels_i, parents_i, tokens_i in zip(all_labels, all_parents, all_tokens):

        n_words = len(tokens_i)
        if n_words == 0:
            continue  # blank line, nothing to build a tree from

        if n_words == 1:  # the phrase consists of a single token
            sentences.append([
                numpy.array([words2ids.get(tokens_i[0], -1)]),
                numpy.array([-1], ndmin=2),
                numpy.array([-1], ndmin=2),
                numpy.array(labels_i[0]),
            ])
            continue

        # child_lists[i] collects the positions of the children of word i.
        child_lists = [[] for _ in range(n_words)]
        for i in range(n_words):
            parent = parents_i[i]
            if 1 <= parent <= n_words:
                child_lists[parent - 1].append(i)
        children = seq.pad_sequences(child_lists, padding='post', value=-1)

        ordered_words, order = words_in_from_down_to_top_order(children)
        if ordered_words is None:
            # The tree is considered invalid, so the sentence is left out.
            continue

        # Everything below is expressed in the reordered word positions so
        # that children positions stay consistent with the word vector.
        reordered_children = children[ordered_words]
        reordered_labels = numpy.array(labels_i)[ordered_words]

        current_sentence = [
            numpy.array([words2ids.get(x, -1) for x in tokens_i])[ordered_words],
            numpy.array([[words2ids.get(tokens_i[w], -1) if w >= 0 else -1 for w in x]
                         for x in reordered_children]),
            numpy.array([[order[w] if w >= 0 else -1 for w in x] for x in reordered_children]),
            reordered_labels[-1],  # the root is the last word after reordering
        ]
        sentences.append(current_sentence)

        # Add all sub-phrases of the sentence. The labels are passed in the
        # reordered positions because the phrases are indexed that way.
        sentences.extend(extract_phrases_from_sentence(current_sentence[0],
                                                       current_sentence[1],
                                                       current_sentence[2],
                                                       reordered_labels))

    return sentences


def load_stanford_data2(labels, parents, tokens, words2ids, train, batch_size, nb_classes):

    '''
    Load data given as dependency trees, optionally merged into batches.

    labels - path to the label file; one line is the label vector of one
             sentence, the i-th label belonging to the sub-phrase rooted at
             the i-th word
    parents - path to the file with the tree structures; one line is one
              sentence and the consecutive numbers are the 1-based index of
              the parent of each word (0 for a root)
    tokens - path to the token file; one line is one sentence, tokens are
             separated by whitespace
    words2ids - dict: key is a token, value is the word id, i.e. its index
                in the embedding matrix; unknown tokens get the id -1
    train - bool; when true, every ``batch_size`` consecutive sentences are
            merged into one forest that is returned as a single element
            (a trailing incomplete batch is returned as well)
    batch_size - number of sentences per element when ``train`` is true
    nb_classes - 3 (negative -> 0, "#" or 0 -> 1, positive -> 2) or
                 5 ("#" -> 2, otherwise value + 2); for any other value the
                 labels become None

    Returns a list - one element is one sentence (or one batch). An element
    consists of:
    0. vector of word ids;
    1. children matrix with word ids - the i-th row holds the ids of the
       children of the i-th word, padded with -1;
    2. children matrix with positions - as above, but holding the index of
       the child in the token sequence instead of its id;
    3. vector with the label of every word (a single label when the element
       holds just one word).

    Words are sorted so that a recursive network walking the tree can simply
    go from left to right: children always precede their parent.
    Elements with an invalid tree are skipped.
    '''

    def transform_labels(x, nb_classes):
        if nb_classes == 3:
            if x == '#' or int(x) == 0:
                return 1
            return 0 if int(x) < 0 else 2
        elif nb_classes == 5:
            return 2 if x == '#' else int(x) + 2

    with open(labels, "r") as l:
        all_labels = [[transform_labels(y, nb_classes) for y in x.split()] for x in l]

    with open(parents, "r") as p:
        all_parents = [[int(y) for y in x.split()] for x in p]

    with open(tokens, "r") as t:
        all_tokens = [x.split() for x in t]

    sentences = []

    def add_example(s):
        # Convert a list of [position, parent, label, token] entries (one
        # sentence or one merged batch) into an element of ``sentences``.
        if not s:
            return

        if len(s) == 1:  # the phrase consists of a single token
            sentences.append([
                numpy.array([words2ids.get(s[0][3], -1)]),
                numpy.array([-1], ndmin=2),
                numpy.array([-1], ndmin=2),
                numpy.array(s[0][2]),
            ])
            return

        # child_lists[i] collects the positions of the children of word i.
        child_lists = [[] for _ in s]
        for position, parent, _label, _token in s:
            if 1 <= parent <= len(s):
                child_lists[parent - 1].append(position)
        children = seq.pad_sequences(child_lists, padding='post', value=-1)

        ordered_words, order = words_in_from_down_to_top_order(children)
        if ordered_words is None:
            # The tree is considered invalid, so the element is left out.
            return

        words = [entry[3] for entry in s]
        labels_in_batch = [entry[2] for entry in s]
        # Children rows are taken in the new word order so that positions
        # stay consistent after the reordering.
        reordered_children = children[ordered_words]

        sentences.append([
            numpy.array([words2ids.get(x, -1) for x in words])[ordered_words],
            numpy.array([[words2ids.get(words[w], -1) if w >= 0 else -1 for w in x]
                         for x in reordered_children]),
            numpy.array([[order[w] if w >= 0 else -1 for w in x] for x in reordered_children]),
            numpy.array(labels_in_batch)[ordered_words],
        ])

    if train:
        s = []
        k = 0
        for labels_i, parents_i, tokens_i in zip(all_labels, all_parents, all_tokens):
            offset = len(s)  # number of words already placed in this batch
            for i in range(len(tokens_i)):
                parent = int(parents_i[i])
                # A root (parent 0) must stay a root: shifting it as well
                # would attach it to the last word of the previous sentence.
                if parent > 0:
                    parent += offset
                s.append([i + offset, parent, labels_i[i], tokens_i[i]])
            k = k + 1

            if k % batch_size == 0:
                add_example(s)
                s = []

        # The number of sentences need not be a multiple of the batch size,
        # so the remaining sentences form one last, smaller batch.
        add_example(s)
    else:
        for labels_i, parents_i, tokens_i in zip(all_labels, all_parents, all_tokens):
            add_example([[i, int(parents_i[i]), labels_i[i], tokens_i[i]]
                         for i in range(len(tokens_i))])

    return sentences


def load_stanford_data3(labels, parents, tokens, words2ids, use_batch, batch_size, nb_classes):

    '''
    Load data given as dependency trees, optionally grouped into batches.

    labels - path to the label file; one line is the label vector of one
             sentence, the i-th label belonging to the sub-phrase rooted at
             the i-th word
    parents - path to the file with the tree structures; one line is one
              sentence and the consecutive numbers are the 1-based index of
              the parent of each word (a value matching no word, such as 0,
              makes the word a root)
    tokens - path to the token file; one line is one sentence, tokens are
             separated by whitespace
    words2ids - dict: key is a token, value is the word id, i.e. its index
                in the embedding matrix; unknown tokens get the id -1
    use_batch - bool; when true, every ``batch_size`` consecutive valid
                sentences are concatenated into one element, and a trailing
                incomplete batch is returned as well
    batch_size - number of sentences per element when ``use_batch`` is true
    nb_classes - 3 (negative -> 0, "#" or 0 -> 1, positive -> 2) or
                 5 ("#" -> 2, otherwise value + 2); for any other value the
                 labels become None

    Returns a list - one element is one sentence (or one batch). An element
    consists of:
    0. vector of word ids;
    1. children matrix with word ids, padded with -1;
    2. children matrix with positions inside the element, padded with -1;
    3. vector with the label of every word (a single label for a one-word
       sentence when batching is off).
    Within each sentence the words are ordered so that children always
    precede their parent. Empty lines and sentences with an invalid tree
    are skipped.
    '''

    def transform_labels(x, nb_classes):
        if nb_classes == 3:
            if x == '#' or int(x) == 0:
                return 1
            return 0 if int(x) < 0 else 2
        elif nb_classes == 5:
            return 2 if x == '#' else int(x) + 2

    def merge_batch(batch):
        # Concatenate the sentences of one batch into a single element.
        # Child positions are shifted by the number of words that precede
        # the sentence inside the batch; padding (-1) is left untouched.
        batch_tokens, batch_children_ids, batch_children_positions, batch_labels = [], [], [], []
        shift = 0
        for sentence, length in batch:
            word_ids, children_ids, children_positions, word_labels = sentence
            for tok in range(len(word_ids)):
                batch_tokens.append(word_ids[tok])
                batch_children_ids.append(children_ids[tok])
                batch_children_positions.append(
                    [chd + shift if chd >= 0 else -1 for chd in children_positions[tok]])
                batch_labels.append(word_labels[tok])
            shift += length

        # Sentences may have different numbers of child columns, so the rows
        # are padded to a common width.
        return [
            numpy.array(batch_tokens),
            numpy.array(seq.pad_sequences(batch_children_ids, padding='post', value=-1)),
            numpy.array(seq.pad_sequences(batch_children_positions, padding='post', value=-1)),
            numpy.array(batch_labels),
        ]

    with open(labels, "r") as l:
        all_labels = [[transform_labels(y, nb_classes) for y in x.split()] for x in l]

    with open(parents, "r") as p:
        all_parents = [[int(y) for y in x.split()] for x in p]

    with open(tokens, "r") as t:
        all_tokens = [x.split() for x in t]

    sentences = []
    current_batch = []

    for labels_i, parents_i, tokens_i in zip(all_labels, all_parents, all_tokens):

        n_words = len(tokens_i)
        if n_words == 0:
            continue  # blank line, nothing to build a tree from

        if n_words == 1 and not use_batch:  # the phrase is a single token
            sentences.append([
                numpy.array([words2ids.get(tokens_i[0], -1)]),
                numpy.array([-1], ndmin=2),
                numpy.array([-1], ndmin=2),
                numpy.array(labels_i[0]),
            ])
            continue

        # child_lists[i] collects the positions of the children of word i.
        child_lists = [[] for _ in range(n_words)]
        for i in range(n_words):
            parent = int(parents_i[i])
            if 1 <= parent <= n_words:
                child_lists[parent - 1].append(i)
        children = seq.pad_sequences(child_lists, padding='post', value=-1)

        ordered_words, order = words_in_from_down_to_top_order(children)
        if ordered_words is None:
            continue

        reordered_children = children[ordered_words]
        current_sentence = [
            numpy.array([words2ids.get(x, -1) for x in tokens_i])[ordered_words],
            numpy.array([[words2ids.get(tokens_i[w], -1) if w >= 0 else -1 for w in x]
                         for x in reordered_children]),
            numpy.array([[order[w] if w >= 0 else -1 for w in x] for x in reordered_children]),
            numpy.array(labels_i[:n_words])[ordered_words],
        ]

        if not use_batch:
            sentences.append(current_sentence)
            continue

        # The sentence length is stored to shift positions when merging.
        current_batch.append((current_sentence, len(current_sentence[0])))
        if len(current_batch) % batch_size == 0:
            sentences.append(merge_batch(current_batch))
            current_batch = []

    # When the number of sentences is not a multiple of the batch size, the
    # remaining sentences are added as one last, smaller batch.
    if use_batch and current_batch:
        sentences.append(merge_batch(current_batch))

    return sentences