# main_current.py (24.3 KB)

# Repository web-viewer residue, kept as a comment: Edit Raw Blame History

import numpy as np
import time
import sys
import subprocess
import os
import random

#from modules.data import load
#from modules.rnn.many_models import *
#from modules.metrics.accuracy import conlleval
#from modules.utils.tools import *

import theano.tensor as T
import theano

import itertools

import os.path
import pickle


from theano import tensor as T, printing
from collections import OrderedDict
from theano.ifelse import ifelse

from keras.preprocessing import sequence as seq

dataType = 'int64'


def shuffle(lol, seed):
    """Shuffle every list contained in ``lol`` in place.

    lol :: list of lists to shuffle
    seed :: value handed to ``random.seed`` before each individual shuffle

    The module-level ``random`` generator is re-seeded with the same value
    ahead of every list, so each shuffle starts from an identical generator
    state; lists of equal length therefore end up permuted the same way and
    stay aligned with each other.  Nothing is returned.
    """
    for sequence in lol:
        # Re-seed per list: this is what keeps parallel lists in step.
        random.seed(seed)
        random.shuffle(sequence)

def words_in_from_down_to_top_order(sentence_tree):
    """Order the words of a tree from the deepest ones up to the root(s).

    sentence_tree :: 2-D integer array with one row per word; row ``w`` holds
                     the indexes of the children of word ``w`` and is padded
                     with -1

    Returns a pair ``(ordered_words, order)``:

    ordered_words :: word indexes obtained by walking the tree breadth-first
                     from the roots and reversing the result, so the deepest
                     words come first and, in a tree, every word appears
                     after all of its children
    order :: inverse mapping; ``order[w]`` is the position of word ``w``
             inside ``ordered_words``

    ``(None, None)`` is returned when the structure cannot be ordered: either
    no word is a root (every word is somebody's child), or some word cannot be
    reached from any root (for example because of a cycle).  Callers use this
    to recognise and drop such sentences.
    """
    n_words = len(sentence_tree)

    # Words that are nobody's child should be the root(s) of the phrase(s).
    # More than one root is accepted on purpose: once sentences are merged
    # into batches there have to be several roots.
    roots = np.setdiff1d(range(n_words), np.unique(sentence_tree))
    if len(roots) == 0:
        return None, None

    # Breadth-first walk: ``levels`` grows while it is being scanned.
    levels = roots.tolist()
    for i in range(n_words):
        if i >= len(levels):
            # The walk ran dry before visiting every word, so the remaining
            # words are unreachable from the roots.  Previously this ended in
            # an IndexError; treat it like the "no root" case instead.
            return None, None
        # setdiff1d strips the -1 padding, so ``levels`` never contains -1.
        levels.extend(np.setdiff1d(sentence_tree[levels[i]], -1))

    # Reverse the walk so that the deepest words come first.
    ordered_words = np.array(levels)[::-1]

    order = np.zeros(n_words, dtype='int')
    for position in range(n_words):
        order[ordered_words[position]] = position

    return ordered_words, order


def _transform_label(x, nb_classes):
    """Map one raw token of the labels file onto a class index.

    x :: raw token, either '#' or an integer literal
    nb_classes :: 3 or 5

    With 3 classes: negative values -> 0, '#' and 0 -> 1, positive values -> 2.
    With 5 classes: '#' -> 2, every other value is shifted by +2.
    For any other ``nb_classes`` no mapping is defined and None is returned.
    """
    if nb_classes == 3:
        if x == '#' or int(x) == 0:
            return 1
        return 0 if int(x) < 0 else 2
    if nb_classes == 5:
        # For the time being '#' is treated as the middle class.
        return 2 if x == '#' else int(x) + 2
    return None


def _read_token_lines(path):
    """Return the whitespace-separated tokens of every line of ``path``."""
    with open(path, "r") as handle:
        return [line.split() for line in handle]


def _merge_batch(batch):
    """Concatenate the sentences of one batch into a single 4-array entry.

    batch :: list of ``(sentence, length)`` pairs as collected by
             ``load_stanford_data4``; ``length`` is currently not used
    """
    batch_tokens, batch_children_positions = [], []
    batch_labels, batch_words = [], []

    # NOTE(review): ``shift`` is never advanced (the code that accumulated the
    # lengths of the preceding sentences was disabled by the author), so the
    # child positions of the 2nd and later sentences are not offset.  With
    # batch_size > 1 they would presumably point at rows of the first
    # sentence - confirm the intent before relying on larger batches.
    shift = 0

    for sent_no, (sentence, _length) in enumerate(batch):
        word_ids, children_positions, sent_labels, word_indexes = sentence
        for tok in range(len(word_ids)):
            if sent_no == 0:
                batch_children_positions.append(children_positions[tok])
            else:
                batch_children_positions.append(
                    [chd + shift if chd >= 0 else -1 for chd in children_positions[tok]])
            batch_tokens.append(word_ids[tok])
            batch_labels.append(sent_labels[tok])
            batch_words.append(word_indexes[tok])

    # Sentences may have different maximal numbers of children per word.
    batch_children_positions = seq.pad_sequences(batch_children_positions, padding='post', value=-1)

    return [
        np.array(batch_tokens),
        np.array(batch_children_positions),
        np.array(batch_labels),
        np.array(batch_words),
    ]


def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_size, nb_classes):
    """Load sentences stored as three parallel files and prepare model inputs.

    labels :: path to the labels file; every token is '#' or an integer
    parents :: path to the parents file; token ``j`` of a line is the 1-based
               index of the parent of word ``j`` (a value that matches no word,
               e.g. 0, leaves the word without a parent)
    tokens :: path to the tokens file
    words2ids :: mapping word -> id; unknown words get id -1
    use_batch :: bool; when true, sentences are merged ``batch_size`` at a
                 time (a final, smaller batch collects the leftovers)
    batch_size :: number of sentences per batch; only read when ``use_batch``
    nb_classes :: 3 or 5, selects the label mapping

    Line ``n`` of each file describes sentence ``n``; tokens on a line are
    separated by whitespace.

    Returns a list of entries, each a list of four arrays:
      [0] word ids, ordered from the deepest words of the tree up to the root
      [1] for every word (same order) the positions of its children within
          that ordering, padded with -1
      [2] class labels, same order
      [3] word indexes 0..n-1

    Empty lines are skipped, and so are sentences for which
    ``words_in_from_down_to_top_order`` reports that no ordering exists.

    A phrase made of a single token and loaded without batching is stored as
    ``[[id], [[-1]], [label], [0]]``.  It used to look its word up through the
    wrong variable and to carry only three elements with a 0-d label, which
    did not match the layout of every other entry.
    """
    all_labels = [[_transform_label(y, nb_classes) for y in row]
                  for row in _read_token_lines(labels)]
    all_parents = [[int(y) for y in row] for row in _read_token_lines(parents)]
    all_tokens = _read_token_lines(tokens)

    sentences = []
    current_batch = []

    for labels_i, parents_i, tokens_i in zip(all_labels, all_parents, all_tokens):
        n_tokens = len(tokens_i)
        if n_tokens == 0:
            # Nothing can be built from an empty line.
            continue

        if n_tokens == 1 and not use_batch:
            # Phrase made of one token: no tree has to be ordered.
            sentences.append([
                np.array([words2ids.get(tokens_i[0], -1)]),
                np.array([-1], ndmin=2),
                np.array([labels_i[0]]),
                np.array([0]),
            ])
            continue

        # Collect the children of every word in a single pass; children end up
        # in increasing word order.
        children_lists = [[] for _ in range(n_tokens)]
        for j in range(n_tokens):
            parent = parents_i[j]
            if 1 <= parent <= n_tokens:
                children_lists[parent - 1].append(j)
        children = seq.pad_sequences(children_lists, padding='post', value=-1)

        ordered_words, order = words_in_from_down_to_top_order(children)
        if ordered_words is None:
            continue

        current_sentence = [
            np.array([words2ids.get(x, -1) for x in tokens_i])[ordered_words],
            # Child word indexes are translated into positions in the new order.
            np.array([[order[w] if w >= 0 else -1 for w in row]
                      for row in children[ordered_words]]),
            np.array(labels_i[:n_tokens])[ordered_words],
            np.arange(n_tokens),
        ]

        if not use_batch:
            sentences.append(current_sentence)
            continue

        # The stored length is not used anywhere at the moment.
        current_batch.append((current_sentence, len(current_sentence[0])))
        if len(current_batch) % batch_size == 0:
            sentences.append(_merge_batch(current_batch))
            current_batch = []

    # When the number of sentences is not a multiple of the batch size the
    # remaining sentences form one last, smaller batch.
    if use_batch and current_batch:
        sentences.append(_merge_batch(current_batch))

    return sentences


class model51(object):
    """Tree-structured classifier compiled into two Theano functions.

    Words of a phrase are processed one by one with ``theano.scan``.  For every
    word the hidden state is the concatenation of the projected word embedding
    and the projected average of the hidden states of its children; a sigmoid
    layer followed by a softmax turns it into class probabilities.

    After construction the instance offers:

    train(words, children_positions, y, words_indexes, learning_rate)
        one gradient-descent step on the summed cross-entropy of all words;
        returns nothing
    classify(words, children_positions, words_indexes)
        predicted class (argmax of the softmax) for every word
    save(folder, e, i)
        dump the bundled parameters as ``.npy`` files
    """

    def __init__(self, nh, nc, w2v_model_path, max_phrase_length):
        '''
        nh :: dimension of hidden state (the stored per-word state has 2*nh
              components)
        nc :: number of classes
        w2v_model_path :: path to a pickle holding a dict with the keys
                          "vectors" (embedding matrix, one row per word) and
                          "words2ids"
        max_phrase_length :: the scan works on a hidden-state buffer with
                             max_phrase_length + 1 rows
        '''
        self.max_phrase_length = max_phrase_length

        # NOTE: unpickling runs arbitrary code stored in the file - only load
        # embedding files from a trusted source.
        # Binary mode is what pickle expects, and the handle is closed again.
        with open(w2v_model_path, "rb") as w2v_file:
            w2vecs = pickle.load(w2v_file)
        self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX))
        self.words2ids = w2vecs["words2ids"]

        de = w2vecs["vectors"].shape[1]  # dimension of the word embeddings
        del w2vecs

        float_x = theano.config.floatX
        r = 0.05  # weights start uniformly in [-r, r]

        def init_shared(*shape):
            # Fresh shared variable filled with r * U(-1, 1).
            return theano.shared(
                r * np.random.uniform(-1.0, 1.0, shape).astype(float_x))

        # Creation order is kept so that a seeded numpy generator produces the
        # same initial values as before.
        self.W_e_h = init_shared(de, nh)
        self.W_h2_y = init_shared(2 * nh, nc)
        self.W_h_h2 = init_shared(2 * nh, 2 * nh)
        self.W_sh_h = init_shared(2 * nh, nh)
        self.W_h_y = init_shared(2 * nh, nc)   # not part of the graphs below

        self.b_h = init_shared(2 * nh)         # not part of the graphs below
        self.b_y = init_shared(nc)

        # bundle (this is what ``save`` writes).  W_h2_y and W_h_h2 are trained
        # and needed by ``classify`` but used to be missing here, so a saved
        # model could not be restored; they are appended, earlier entries and
        # their file names stay as they were.
        self.params = [self.W_h_y, self.W_e_h, self.W_sh_h, self.emb, self.b_y,
                       self.W_h2_y, self.W_h_h2]
        self.names = ['W_h_y', 'W_eh', 'W_shsh', "embeddings", "b_y",
                      'W_h2_y', 'W_h_h2']

        def forward(word_id, word_children_positions, i, hidden_states):
            # Forward pass for one word, shared by training and classification.
            # Returns the class probabilities and the hidden-state buffer with
            # row ``i`` replaced by the state of this word.

            # The last row serves as the starting (zero) vector; it stays zero
            # as long as no word is ever written to that row.
            schh = hidden_states[-1]

            # NOTE(review): T.neq builds a symbolic expression, so this Python
            # ``if`` is presumably decided once while the graph is built and
            # not per word - confirm.  Words without children still work
            # inside the branch: the sum below is then taken over no rows and
            # the divisor is clamped to 1.
            if T.neq(word_children_positions[0], -1):
                tmp = word_children_positions >= 0.0
                idx_tmp = tmp.nonzero()  # real children, i.e. entries other than -1

                # sum of the hidden states of the children
                schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0)
                number_of_children = tmp.sum(dtype=theano.config.floatX)
                number_of_children = ifelse(T.gt(number_of_children, 1.0), number_of_children, 1.0)
                schh = schh / number_of_children

            h = T.concatenate([T.dot(self.emb[word_id], self.W_e_h), T.dot(schh, self.W_sh_h)])

            hidden_states_new = T.set_subtensor(hidden_states[i], h)

            h2 = T.nnet.sigmoid(T.dot(h, self.W_h_h2))
            y_prob = T.nnet.softmax(T.dot(h2, self.W_h2_y) + self.b_y)[0]

            return y_prob, hidden_states_new

        def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate):
            # Training step: cross-entropy of the true class of this word.
            # ``learning_rate`` arrives as a non-sequence of the scan and is
            # not used inside the step.
            y_prob, hidden_states_new = forward(word_id, word_children_positions, i, hidden_states)
            cross_entropy = -T.log(y_prob[y_true])
            return cross_entropy, hidden_states_new

        y = T.vector('y', dtype=dataType)
        learning_rate = T.scalar('lr', dtype=theano.config.floatX)
        words = T.vector(dtype=dataType)
        children_positions = T.matrix(dtype=dataType)
        words_indexes = T.vector(dtype=dataType)

        cross_entropy_vector, _ = theano.scan(
            fn=one_step,
            sequences=[words, children_positions, y, words_indexes],
            outputs_info=[None, theano.shared(np.zeros((self.max_phrase_length + 1, 2 * nh), dtype=theano.config.floatX))],
            non_sequences=learning_rate,
            n_steps=words.shape[0])

        # Element 0 of the scan result is the per-word cross-entropy.
        cost = T.sum(cross_entropy_vector[0])

        # Plain gradient descent on every parameter that takes part in the graph.
        trained = [self.W_h2_y, self.W_h_h2, self.W_e_h, self.W_sh_h, self.emb, self.b_y]
        updates = OrderedDict(
            [(param, param - learning_rate * T.grad(cost, param)) for param in trained])

        self.train = theano.function(inputs=[words, children_positions, y, words_indexes, learning_rate],
                                     outputs=[],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     mode='FAST_RUN'
                                     )

        [y_probs_classify, hidden_states], _ = theano.scan(
            fn=forward,
            sequences=[words, children_positions, words_indexes],
            outputs_info=[None, theano.shared(np.zeros((self.max_phrase_length + 1, 2 * nh), dtype=theano.config.floatX))])

        predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]),
                                     sequences=[words_indexes])

        # theano functions
        self.classify = theano.function(inputs=[words, children_positions, words_indexes],
                                        outputs=predictions,
                                        allow_input_downcast=True,
                                        mode='FAST_RUN'
                                        )

    def save(self, folder, e, i):
        """Write every bundled parameter to ``<folder>/<name><e>_<i>.npy``."""
        for param, name in zip(self.params, self.names):
            np.save(os.path.join(folder, name + str(e) + "_" + str(i) + '.npy'), param.get_value())


def _confusion_counts(model, data, nb_classes):
    """Classify every entry of ``data`` and count predictions against labels.

    model :: object with a ``classify(words, children_positions, word_indexes)``
             method
    data :: entries as produced by ``load_stanford_data4``
    nb_classes :: number of classes (size of the returned square matrices)

    Returns ``(counts, counts_root)``; cell ``[predicted, true]`` of ``counts``
    covers every word, ``counts_root`` only the last word of each entry
    (presumably the root, as words are ordered bottom-up - confirm).
    """
    counts = np.zeros((nb_classes, nb_classes), dtype='int')
    counts_root = np.zeros((nb_classes, nb_classes), dtype='int')
    for sample in data:
        pred = model.classify(sample[0], sample[1], sample[3])
        for j in range(len(pred)):
            counts[pred[j], sample[2][j]] += 1
        counts_root[pred[-1], sample[2][-1]] += 1
    return counts, counts_root


def _accuracy(counts):
    """Share of the diagonal (correct predictions) in a count matrix."""
    return np.diag(counts).sum() / float(counts.sum())


if __name__ == '__main__':

    file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl"
    if not os.path.exists(file_with_filtered_embeddings):
        print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.")
        # NOTE(review): filter_embeddings is neither defined nor imported in
        # the visible part of this file (the ``modules.utils.tools`` import is
        # commented out) - confirm where it is supposed to come from.
        filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks", "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt", "data/sst/dev/sents.toks", "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt", "data/sst/test/sents.toks"],
                          "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt",
                          file_with_filtered_embeddings)

    # Notes on the learning rate from earlier runs:
    #   0.03 (with r=0.05) reached a fit of 0.9 on 500 observations after 15
    #   epochs; up to epoch 40 it stayed at about 0.9 and then collapsed to a
    #   constant prediction of 0, although the training set did not contain a
    #   single observation labelled 0 !!!
    #   0.03 (r=0.05) on 5000 observations stayed at about 75% until epoch 15
    #   and then collapsed to a constant prediction of 0
    #   0.05 - rest as above: learned nothing, and at iteration 10 the
    #   prediction collapsed to the constant 0
    s = {'lr': 0.01,
         'nepochs': 30,
         'seed': 345,
         'nc': 5         # number of y classes
         }

    batch_size = 1

    # Values tried so far: [0.1, 0.07, 0.03, 0.01, 0.005, 0.001, 0.0005, 0.0001, 0.00005]
    for learning_rate in [0.005]:

        np.random.seed(s['seed'])
        random.seed(s['seed'])

        h_dim = 150

        # instantiate the model
        rnn = model51(nh=h_dim,
                      nc=s['nc'],
                      w2v_model_path=file_with_filtered_embeddings,  # path to the file with embeddings
                      max_phrase_length=60)

        train_data = load_stanford_data4("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks", rnn.words2ids, True, batch_size, s['nc'])

        # The same training sentences, unbatched, for measuring the fit.
        train_data_check = load_stanford_data4("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks", rnn.words2ids, False, batch_size, s['nc'])

        dev_data = load_stanford_data4("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt", "data/sst/dev/sents.toks", rnn.words2ids, False, 0, s['nc'])

        test_data = load_stanford_data4("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt", "data/sst/test/sents.toks", rnn.words2ids, False, 0, s['nc'])

        n_train = len(train_data)

        # Single-argument print calls behave the same under the print
        # statement this script was written with.
        print("")
        print("model51: h_dim =  %s  learning rate =  %s" % (h_dim, learning_rate))
        print("")

        best_prediction = 0
        early_stop = 0  # epochs in a row without improvement on the dev set

        s['clr'] = learning_rate

        for e in range(s['nepochs']):

            # Stop after 5 epochs in a row without a better dev accuracy.
            if early_stop == 5:
                break

            # shuffle
            shuffle([train_data], s['seed'])

            tic = time.time()
            for i in range(n_train):
                rnn.train(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3], s['clr'])

            # Dev:
            counts_dev, counts_dev_root = _confusion_counts(rnn, dev_data, s['nc'])

            dev_accuracy = _accuracy(counts_dev)
            if dev_accuracy > best_prediction:
                best_prediction = dev_accuracy
                early_stop = 0
            else:
                early_stop = early_stop + 1

            # Test:
            counts_test, counts_test_root = _confusion_counts(rnn, test_data, s['nc'])

            # Train (the fit is measured on the whole unbatched training set):
            counts, counts_root = _confusion_counts(rnn, train_data_check, s['nc'])

            print("epoch: ", e,
                  "V all: ", dev_accuracy,
                  "   Test all: ", _accuracy(counts_test),
                  "V root: ", _accuracy(counts_dev_root),
                  "   Test root: ", _accuracy(counts_test_root),
                  "   Train: ", _accuracy(counts),
                  "   Train root: ", _accuracy(counts_root)
                  )

            print(time.time() - tic)