# tmp.py (19.4 KB)
#
# Repository web-viewer toolbar residue from the export: Edit / Raw / Blame / History

import theano
import numpy as np
import os
import pickle

from theano import tensor as T, printing
from collections import OrderedDict
from theano.ifelse import ifelse

# Set Theano's default float dtype (a process-wide config value); the models
# below cast every weight matrix to theano.config.floatX.
theano.config.floatX = 'float64'
# dtype string used for the integer symbolic inputs of the models below
# (word ids, child ids/positions and class labels).
dataType = 'int64'

class model(object):
    '''
    Tree-structured phrase classifier expressed as a Theano graph.

    Words are processed one at a time by ``theano.scan``.  For each word a
    state vector ``h`` is built from its embedding and from the averaged
    states of its children, stored in a per-phrase buffer, and mapped to
    class probabilities.  The probabilities of the LAST word are used as the
    prediction for the whole phrase.

    This variant starts from pre-trained weights stored as ``.npy`` files in
    ``saved_models4/``.

    Public attributes set by ``__init__``:
      words2ids :: mapping taken from the pickled word2vec bundle
      params / names :: trainable shared variables and the file-name stems
                        used for them by ``save``
      classify(words, children_ids, children_positions) -> predicted class
      train(words, children_ids, children_positions, y, lr) -> negative
          log-likelihood; applies one SGD step to ``params``
    '''

    def __init__(self, nh, nc, ds, w2v_model_path, max_phrase_length):
        '''
        Load the vocabulary and weights, build the graph and compile
        ``self.classify`` and ``self.train``.

        nh :: dimension of hidden state (the per-word state buffer has
              2*nh columns)
        nc :: number of classes
        ds :: dimension of the sentiment state (only sizes W_sh and W_ssy,
              which are not part of the compiled graph)
        w2v_model_path :: path to a pickled dict; its "words2ids" entry is
              kept as ``self.words2ids``
        max_phrase_length :: the state buffer gets max_phrase_length+1 rows

        The embedding matrix and W_e_h, W_h2_y, W_h_h2, W_sh_h are read from
        hard-coded files under ``saved_models4/`` (relative to the current
        working directory).
        '''
        self.max_phrase_length = max_phrase_length

        # SECURITY: pickle can execute arbitrary code while loading; only
        # point w2v_model_path at files you trust.
        # Binary mode is required by pickle on Python 3, and the ``with``
        # block closes the file handle deterministically.
        with open(w2v_model_path, "rb") as w2v_file:
            w2vecs = pickle.load(w2v_file)

        # Embeddings come from a previously saved model, not from the pickle.
        self.emb = theano.shared(
            np.load("saved_models4/embeddings.npy").astype(theano.config.floatX))
        self.words2ids = w2vecs["words2ids"]

        # Release the (potentially large) bundle as soon as possible.
        del w2vecs

        # TODO: handle words from the data that have no embedding in the model.
        # Open questions left by the author: (1) use a zero vector for unseen
        # words, or (2) borrow the vector of the most similar training word;
        # whichever is chosen must also be applied at prediction time.

        # Scale of the uniform(-1, 1) random initialisation.
        r = 0.05

        # Pre-trained weights.  To train from scratch the author initialised
        # these with r * uniform(-1, 1) of shapes
        #   W_e_h (de, nh), W_h2_y (2*nh, nc), W_h_h2 (2*nh, 2*nh),
        #   W_sh_h (2*nh, nh)
        # where de is the embedding dimension.
        self.W_e_h = theano.shared(
            np.load("saved_models4/W_eh25.npy").astype(theano.config.floatX))

        self.W_sh = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (ds, nh)).astype(theano.config.floatX))

        # NOTE: W_h2_y is stored under the stem "W_hh2" and W_h_h2 under
        # "W_h2y".  ``self.names`` below uses the same pairing, so saving and
        # loading round-trip correctly; do not "fix" one side only.
        self.W_h2_y = theano.shared(
            np.load("saved_models4/W_hh225.npy").astype(theano.config.floatX))

        self.W_h_h2 = theano.shared(
            np.load("saved_models4/W_h2y25.npy").astype(theano.config.floatX))

        self.W_ssy = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (ds, nc)).astype(theano.config.floatX))

        self.W_sh_h = theano.shared(
            np.load("saved_models4/W_shsh25.npy").astype(theano.config.floatX))

        # Biases are created but currently neither used in the graph nor trained.
        self.bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))
        self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX))

        # bundle: trainable parameters and the file-name stems used by save()
        self.params = [self.W_h2_y, self.W_h_h2, self.W_e_h, self.W_sh_h, self.emb]
        self.names = ["W_hh2", 'W_h2y', 'W_eh', 'W_shsh', "embeddings"]

        def one_step(word_id, word_children_ids, word_children_positions, i, hidden_states):
            '''
            Compute the state and class probabilities of the current word.

            word_id :: id of the current word (row of self.emb)
            word_children_ids :: ids of the children of the current word
                                 (not used by the computation)
            word_children_positions :: positions of those children in the
                                 phrase; negative entries mean "no child"
            i :: index of the current word in the phrase
            hidden_states :: buffer with the states of the words seen so far

            Returns (i + 1, updated buffer, class probabilities).
            '''
            # Count the entries that refer to a real child (position >= 0).
            idx_tmp = (word_children_positions >= 0).nonzero()
            tmp = T.zeros_like(word_children_positions)
            tmp2 = T.set_subtensor(tmp[idx_tmp], 1)
            number_of_children = tmp2.sum()

            # Mean of the children's states.  The small epsilon avoids an
            # explicit branch for leaves: with no children the sum is zero
            # anyway, so the division does not matter.
            # Idea to try: use a vector of 0.5 instead of zeros for "no child".
            schh = hidden_states[word_children_positions].sum(axis=0) / (number_of_children + 0.000001)

            # No bias and no sigmoid here.
            h = T.concatenate([T.dot(self.emb[word_id], self.W_e_h),
                               T.dot(schh, self.W_sh_h)])

            # Write the state of the current word into row i of the buffer.
            zeros_subtensor = hidden_states[i]
            hidden_states_new = T.set_subtensor(zeros_subtensor, h)

            h2 = T.dot(h, self.W_h_h2)

            y_prob = T.nnet.softmax(T.dot(h2, self.W_h2_y))

            # NOTE: a position of -1 selects the last row of the buffer, and
            # that row is added once per -1 entry.  This is harmless as long
            # as that row stays zero, i.e. as long as the phrase has at most
            # max_phrase_length words so row max_phrase_length is never
            # written.

            return i + 1, hidden_states_new, y_prob

        words = T.vector(dtype=dataType)
        children_ids = T.matrix(dtype=dataType)
        children_positions = T.matrix(dtype=dataType)

        # Recurrent outputs: the word counter and the state buffer (one extra
        # all-zero row for "no child"); the probabilities are not fed back.
        y_probs, _ = theano.scan(
            fn=one_step,
            sequences=[words, children_ids, children_positions],
            outputs_info=[
                theano.shared(0),
                theano.shared(np.zeros((self.max_phrase_length + 1, 2 * nh),
                                       dtype=theano.config.floatX)),
                None,
            ],
            n_steps=words.shape[0])

        # y_probs[-1] is the sequence of softmax outputs, [-1] its last step
        # and [0] the single softmax row: the vector [P(y=0), P(y=1), ...].
        estimated_probs = y_probs[-1][-1][0]

        # The words are ordered so that the root of the tree comes last, so
        # classifying the last word classifies the whole phrase.
        y_pred = T.argmax(estimated_probs)

        y = T.scalar('y', dtype=dataType)

        # cost and gradients and learning rate
        lr = T.scalar('lr', dtype=theano.config.floatX)

        # Negative log-likelihood of the true class (equivalent to the
        # categorical cross-entropy against a one-hot target).
        nll = -T.log(estimated_probs)[y]

        gradients = T.grad(nll, self.params)
        updates = OrderedDict((p, p - lr * g) for p, g in zip(self.params, gradients))

        # theano functions
        self.classify = theano.function(
            inputs=[words, children_ids, children_positions],
            outputs=y_pred,
            allow_input_downcast=True,
            mode='FAST_RUN')

        self.train = theano.function(
            inputs=[words, children_ids, children_positions, y, lr],
            outputs=nll,
            updates=updates,
            allow_input_downcast=True,
            mode='FAST_RUN')

    def save(self, folder, e):
        '''
        Write every parameter in ``self.params`` to ``folder`` as
        ``<name><e>.npy``, where ``<name>`` is the matching entry of
        ``self.names``.

        folder :: existing directory to write into
        e :: suffix appended to each name via ``str(e)``
        '''
        for param, name in zip(self.params, self.names):
            np.save(os.path.join(folder, name + str(e) + '.npy'), param.get_value())


class model2(object):
    '''
    Tree-structured classifier trained with one label per word.

    The per-word state ``h`` is built from the word embedding and the
    averaged states of the word's children, and a single softmax layer
    (``W_h_y``) maps it to class probabilities.  Training applies an SGD
    update at every word of the phrase, using that word's own label;
    ``classify`` returns one predicted class per word.

    Weights are randomly initialised; the embeddings come from the pickled
    word2vec bundle.

    Public attributes set by ``__init__``:
      words2ids :: mapping taken from the pickled word2vec bundle
      params / names :: trainable shared variables and the file-name stems
                        used for them by ``save``
      classify(words, children_ids, children_positions) -> vector with the
          predicted class of every word
      train(words, children_ids, children_positions, y, lr) -> [] ; updates
          ``params`` in place
    '''

    def __init__(self, nh, nc, ds, w2v_model_path, max_phrase_length):
        '''
        Load the embeddings, build the graph and compile ``self.classify``
        and ``self.train``.

        nh :: dimension of hidden state (the per-word state buffer has
              2*nh columns)
        nc :: number of classes
        ds :: dimension of the sentiment state (only sizes W_sh and W_ssy,
              which are not part of the compiled graph)
        w2v_model_path :: path to a pickled dict with the entries "vectors"
              (embedding matrix) and "words2ids"
        max_phrase_length :: the state buffer gets max_phrase_length+1 rows
        '''
        self.max_phrase_length = max_phrase_length

        # SECURITY: pickle can execute arbitrary code while loading; only
        # point w2v_model_path at files you trust.
        # Binary mode is required by pickle on Python 3, and the ``with``
        # block closes the file handle deterministically.
        with open(w2v_model_path, "rb") as w2v_file:
            w2vecs = pickle.load(w2v_file)

        self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX))
        self.words2ids = w2vecs["words2ids"]

        # de :: dimension of the word embeddings
        de = w2vecs["vectors"].shape[1]

        # Release the (potentially large) bundle as soon as possible.
        del w2vecs

        # Open questions left by the author about unseen/rare words: (1) use a
        # zero vector, or (2) borrow the vector of the most similar training
        # word; whichever is chosen must also be applied at prediction time.

        # Scale of the uniform(-1, 1) random initialisation.
        r = 0.05

        # NOTE: the order of the random draws below is kept as in the
        # original so that a fixed numpy seed reproduces the same weights.
        self.W_e_h = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (de, nh)).astype(theano.config.floatX))

        self.W_sh = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (ds, nh)).astype(theano.config.floatX))

        # W_h2_y and W_h_h2 belong to a two-layer output head that is
        # currently disabled; they are created but not trained or used.
        self.W_h2_y = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (2 * nh, nc)).astype(theano.config.floatX))

        self.W_h_h2 = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (2 * nh, 2 * nh)).astype(theano.config.floatX))

        self.W_ssy = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (ds, nc)).astype(theano.config.floatX))

        self.W_sh_h = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (2 * nh, nh)).astype(theano.config.floatX))

        self.W_h_y = theano.shared(
            r * np.random.uniform(-1.0, 1.0, (2 * nh, nc)).astype(theano.config.floatX))

        # Biases are created but currently neither used in the graph nor trained.
        self.bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX))
        self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX))

        # bundle: trainable parameters and the file-name stems used by save()
        self.params = [self.W_h_y, self.W_e_h, self.W_sh_h, self.emb]
        self.names = ["W_h_y", 'W_eh', 'W_shsh', "embeddings"]

        # Integer constants used to replace a child count of 0 by 1.
        shared_zero = theano.shared(0)
        shared_one = theano.shared(1)

        def one_step(word_id, word_children_ids, word_children_positions, y_true, i, hidden_states, learning_rate):
            '''
            Training step for the current word.

            word_id :: id of the current word (row of self.emb)
            word_children_ids :: ids of the children of the current word
                                 (not used by the computation)
            word_children_positions :: positions of those children in the
                                 phrase; negative entries mean "no child"
            y_true :: class label of the current word
            i :: index of the current word in the phrase
            hidden_states :: buffer with the states of the words seen so far
            learning_rate :: SGD step size

            Returns ((i + 1, updated buffer, class probabilities), updates)
            where ``updates`` holds one SGD step for every trained parameter.
            '''
            # Debug output: prints the child positions whenever the graph runs.
            p = printing.Print('word_children_positions: ')
            word_children_positions = p(word_children_positions)

            # Count the entries that refer to a real child (position >= 0).
            idx_tmp = (word_children_positions >= 0).nonzero()
            tmp = T.zeros_like(word_children_positions)
            tmp2 = T.set_subtensor(tmp[idx_tmp], 1)
            number_of_children = tmp2.sum(dtype=dataType)

            # Leaves have no children: divide by 1 instead of 0 (the sum of
            # the children's states is zero in that case anyway).
            number_of_children = ifelse(T.eq(number_of_children, shared_zero), shared_one, number_of_children)
            # Idea to try: use a vector of 0.5 instead of zeros for "no child".

            # Debug output: prints the effective child count.
            hello_world_op = printing.Print('number_of_children: ')
            number_of_children = hello_world_op(number_of_children)

            # Mean of the children's states.
            schh = hidden_states[word_children_positions].sum(axis=0) / number_of_children

            # No bias and no sigmoid here.
            h = T.concatenate([T.dot(self.emb[word_id], self.W_e_h),
                               T.dot(schh, self.W_sh_h)])

            # Write the state of the current word into row i of the buffer.
            zeros_subtensor = hidden_states[i]
            hidden_states_new = T.set_subtensor(zeros_subtensor, h)

            y_prob = T.nnet.softmax(T.dot(h, self.W_h_y))

            # Cross-entropy of the current word against its own label.
            cce = -T.log(y_prob[0][y_true])

            # NOTE(review): the gradients are taken inside the step, with
            # hidden_states as a step input; presumably this means they do
            # not flow back through earlier words - confirm.
            updates = OrderedDict([
                (self.W_h_y, self.W_h_y - learning_rate * T.grad(cce, self.W_h_y)),
                (self.W_e_h, self.W_e_h - learning_rate * T.grad(cce, self.W_e_h)),
                (self.W_sh_h, self.W_sh_h - learning_rate * T.grad(cce, self.W_sh_h)),
                (self.emb, self.emb - learning_rate * T.grad(cce, self.emb)),
            ])

            return (i + 1, hidden_states_new, y_prob), updates

        y = T.vector('y', dtype=dataType)

        lr = T.scalar('lr', dtype=theano.config.floatX)

        words = T.vector(dtype=dataType)
        children_ids = T.matrix(dtype=dataType)
        children_positions = T.matrix(dtype=dataType)

        # Recurrent outputs: the word counter and the state buffer (one extra
        # all-zero row for "no child"); the probabilities are not fed back.
        y_probs, upd = theano.scan(
            fn=one_step,
            sequences=[words, children_ids, children_positions, y],
            outputs_info=[
                theano.shared(0),
                theano.shared(np.zeros((self.max_phrase_length + 1, 2 * nh),
                                       dtype=theano.config.floatX)),
                None,
            ],
            non_sequences=lr,
            n_steps=words.shape[0])

        def one_step_classify(word_id, word_children_ids, word_children_positions, i, hidden_states):
            '''
            Prediction step: same forward computation as ``one_step`` but
            without labels, debug printing or parameter updates.

            Returns (i + 1, updated buffer, class probabilities).
            '''
            # Count the entries that refer to a real child (position >= 0).
            idx_tmp = (word_children_positions >= 0).nonzero()
            tmp = T.zeros_like(word_children_positions)
            tmp2 = T.set_subtensor(tmp[idx_tmp], 1)
            number_of_children = tmp2.sum()

            # Mean of the children's states; divide by 1 for leaves.
            schh = hidden_states[word_children_positions].sum(axis=0) / ifelse(T.eq(number_of_children, shared_zero), shared_one, number_of_children)

            # No bias and no sigmoid here.
            h = T.concatenate([T.dot(self.emb[word_id], self.W_e_h),
                               T.dot(schh, self.W_sh_h)])

            # Write the state of the current word into row i of the buffer.
            zeros_subtensor = hidden_states[i]
            hidden_states_new = T.set_subtensor(zeros_subtensor, h)

            y_prob = T.nnet.softmax(T.dot(h, self.W_h_y))

            return i + 1, hidden_states_new, y_prob

        y_probs_classify, _ = theano.scan(
            fn=one_step_classify,
            sequences=[words, children_ids, children_positions],
            outputs_info=[
                theano.shared(0),
                theano.shared(np.zeros((self.max_phrase_length + 1, 2 * nh),
                                       dtype=theano.config.floatX)),
                None,
            ],
            n_steps=words.shape[0])

        # Second scan: walk over the per-word softmax outputs and take the
        # argmax of each one.  predictions[1] is the vector of classes.
        predictions, _ = theano.scan(
            lambda i: (i + 1, T.argmax(y_probs_classify[2][i][0])),
            outputs_info=[theano.shared(0), None],
            n_steps=y_probs_classify[2].shape[0])

        # theano functions
        self.classify = theano.function(
            inputs=[words, children_ids, children_positions],
            outputs=predictions[1],
            allow_input_downcast=True,
            mode='FAST_RUN')

        # No value is returned; the function is run for its parameter updates.
        self.train = theano.function(
            inputs=[words, children_ids, children_positions, y, lr],
            outputs=[],
            updates=upd,
            allow_input_downcast=True,
            mode='FAST_RUN')

    def save(self, folder, e, i):
        '''
        Write every parameter in ``self.params`` to ``folder`` as
        ``<name><e>_<i>.npy``, where ``<name>`` is the matching entry of
        ``self.names``.

        folder :: existing directory to write into
        e :: first suffix, appended via ``str(e)``
        i :: second suffix, appended after an underscore via ``str(i)``
        '''
        for param, name in zip(self.params, self.names):
            np.save(os.path.join(folder, name + str(e) + "_" + str(i) + '.npy'), param.get_value())