# LSTM_models.py
import os
import sys
import time
import random
import subprocess
import itertools
import pickle
from collections import Counter, OrderedDict

import numpy as np

import theano
import theano.tensor as T
from theano import pp, printing
from theano.ifelse import ifelse
from theano.sandbox.rng_mrg import MRG_RandomStreams

from keras.preprocessing import sequence as seq

#from modules.data import load
#from modules.rnn.many_models import *
#from modules.metrics.accuracy import conlleval
from modules.utils.tools import load_stanford_data4

dataType = 'int64'

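# LSTM_1 is a Child-Sum style Tree-LSTM (cf. Tai et al., 2015): each node of a
# parsed phrase sums the hidden states of its children, keeps one forget gate
# per child, and feeds its hidden state through a softmax layer to predict a
# class for every node.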
class LSTM_1(object):  
    def __init__(self, h_dim, nc, w2v_model_path, max_phrase_length): 

        '''
        h_dim             :: dimension of the hidden state
        nc                :: number of classes
        w2v_model_path    :: path to a pickled dict with keys "vectors"
                             (embedding matrix) and "words2ids"
        max_phrase_length :: maximum number of nodes in a single phrase
        '''

        self.max_phrase_length = max_phrase_length
        with open(w2v_model_path, "rb") as f:           # pickled data has to be opened in binary mode
            w2vecs = pickle.load(f)
        self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX))
        self.words2ids = w2vecs["words2ids"]

        emb_dim = w2vecs["vectors"].shape[1]
        del w2vecs

        r = 0.05

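        # Each gate -- input (i), forget (f), output (o) and candidate (u) --
        # has an input-to-hidden matrix W_*, a hidden-to-hidden matrix U_*
        # applied to the aggregated child states, and a bias b_*, all
        # initialized uniformly in (-r, r).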
        self.W_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim)).astype(theano.config.floatX))
        self.U_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim)).astype(theano.config.floatX))
        self.b_i = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim).astype(theano.config.floatX))

        self.W_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim)).astype(theano.config.floatX))
        self.U_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim)).astype(theano.config.floatX))
        self.b_f = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim).astype(theano.config.floatX))

        self.W_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim)).astype(theano.config.floatX))
        self.U_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim)).astype(theano.config.floatX))
        self.b_o = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim).astype(theano.config.floatX))

        self.W_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim)).astype(theano.config.floatX))
        self.U_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim)).astype(theano.config.floatX))
        self.b_u = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim).astype(theano.config.floatX))

        self.W_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, nc)).astype(theano.config.floatX))
        self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX))

    
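        # One recurrence step for node k of the current phrase. hidden_states and
        # cell_states are (max_phrase_length + 1) x h_dim buffers; row k of each
        # is overwritten with the node's new state, and the last row stays zero
        # so that leaves (child position -1) read a zero hidden/cell state.
        # learning_rate is threaded through scan as a non-sequence and is unused here.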
        def one_step(word_id, word_children_positions, y_true, k, hidden_states, cell_states, learning_rate):

            x = self.emb[word_id]

            # children are marked by non-negative positions; -1 entries are padding
            tmp = word_children_positions >= 0
            number_of_children = tmp.sum(dtype=theano.config.floatX)
            idx_tmp = tmp.nonzero()                     # indices of the real children, i.e. those without -1

            # sum of the children's hidden states; a leaf reads the zero vector
            # kept in the last buffer row (note from the original: check "+ 0.5")
            h_aggregated = ifelse(T.gt(number_of_children, 0.0),
                                  hidden_states[word_children_positions[idx_tmp]].sum(axis=0),
                                  hidden_states[-1])

            #number_of_children = ifelse(T.gt(number_of_children, 1.0), number_of_children, 1.0)
            #h_aggregated = h_aggregated / number_of_children           # averaging the children's hidden states

            i = T.nnet.sigmoid(T.dot(x, self.W_i) + T.dot(h_aggregated, self.U_i) + self.b_i)
            o = T.nnet.sigmoid(T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o)
            u = T.tanh(T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u)

            # one forget gate per child; the gated child cell states are summed
            f_c = ifelse(T.gt(number_of_children, 0.0),
                         (T.nnet.sigmoid(T.dot(x, self.W_f) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f) + self.b_f)
                          * cell_states[word_children_positions[idx_tmp]]).sum(axis=0),
                         T.nnet.sigmoid(T.dot(x, self.W_f) + T.dot(hidden_states[-1], self.U_f) + self.b_f) * cell_states[-1])

            c = i * u + f_c
            h = o * T.tanh(c)

            # write the node's new cell and hidden state into row k of the buffers
            cell_states_new = T.set_subtensor(cell_states[k], c)
            hidden_states_new = T.set_subtensor(hidden_states[k], h)

            y_prob = T.nnet.softmax(T.dot(h, self.W_y) + self.b_y)[0]

            cross_entropy = -T.log(y_prob[y_true])      # + norm_coefficient * l2_norm

            return cross_entropy, hidden_states_new, cell_states_new


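        # Symbolic inputs for a single phrase: per-node word ids, child positions
        # (one row per node, padded with -1), per-node labels, the buffer row for
        # each node, and the SGD step size. scan applies one_step to the nodes in
        # the order given (assumed child-before-parent) and yields one
        # cross-entropy term per node.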
        y = T.vector('y', dtype=dataType)
        learning_rate = T.scalar('lr', dtype=theano.config.floatX)
        words = T.vector(dtype=dataType)
        children_positions = T.matrix(dtype=dataType)
        words_indexes = T.vector(dtype=dataType)

        [cross_entropy_vector, _, _], _ = theano.scan(
                fn=one_step,
                sequences=[words, children_positions, y, words_indexes],
                outputs_info=[None,
                              theano.shared(np.zeros((self.max_phrase_length + 1, h_dim), dtype=theano.config.floatX)),
                              theano.shared(np.zeros((self.max_phrase_length + 1, h_dim), dtype=theano.config.floatX))],
                non_sequences=learning_rate,
                n_steps=words.shape[0])
        cost = T.sum(cross_entropy_vector)

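        # One plain SGD step for every trainable parameter:
        #     p  <-  p - learning_rate * d(cost)/d(p)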
        params = [self.W_i, self.U_i, self.b_i,
                  self.W_f, self.U_f, self.b_f,
                  self.W_o, self.U_o, self.b_o,
                  self.W_u, self.U_u, self.b_u,
                  self.W_y, self.b_y, self.emb]
        updates = OrderedDict((p, p - learning_rate * T.grad(cost, p)) for p in params)

        self.train = theano.function(inputs=[words, children_positions, y, words_indexes, learning_rate],
                                     outputs=[],
                                     updates=updates,
                                     allow_input_downcast=True,
                                     mode='FAST_RUN')

 
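        # The classification pass mirrors one_step exactly, but returns the node's
        # class probabilities instead of a cross-entropy term and needs no label
        # or learning rate.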
        def one_step_classify(word_id, word_children_positions, k, hidden_states, cell_states):

            x = self.emb[word_id]

            tmp = word_children_positions >= 0
            number_of_children = tmp.sum(dtype=theano.config.floatX)
            idx_tmp = tmp.nonzero()                     # indices of the real children, i.e. those without -1

            # sum of the children's hidden states; a leaf reads the zero vector
            # kept in the last buffer row (note from the original: check "+ 0.5")
            h_aggregated = ifelse(T.gt(number_of_children, 0.0),
                                  hidden_states[word_children_positions[idx_tmp]].sum(axis=0),
                                  hidden_states[-1])

            #number_of_children = ifelse(T.gt(number_of_children, 1.0), number_of_children, 1.0)
            #h_aggregated = h_aggregated / number_of_children           # averaging the children's hidden states

            i = T.nnet.sigmoid(T.dot(x, self.W_i) + T.dot(h_aggregated, self.U_i) + self.b_i)
            o = T.nnet.sigmoid(T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o)
            u = T.tanh(T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u)

            # one forget gate per child; the gated child cell states are summed
            f_c = ifelse(T.gt(number_of_children, 0.0),
                         (T.nnet.sigmoid(T.dot(x, self.W_f) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f) + self.b_f)
                          * cell_states[word_children_positions[idx_tmp]]).sum(axis=0),
                         T.nnet.sigmoid(T.dot(x, self.W_f) + T.dot(hidden_states[-1], self.U_f) + self.b_f) * cell_states[-1])

            c = i * u + f_c
            h = o * T.tanh(c)

            cell_states_new = T.set_subtensor(cell_states[k], c)
            hidden_states_new = T.set_subtensor(hidden_states[k], h)

            y_prob = T.nnet.softmax(T.dot(h, self.W_y) + self.b_y)[0]

            return y_prob, hidden_states_new, cell_states_new
        

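        # Replay the recurrence over the whole phrase without a loss, then take
        # the argmax of every node's softmax output to get one predicted class
        # per node.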
        [y_probs_classify, _, _], _ = theano.scan(
                fn=one_step_classify,
                sequences=[words, children_positions, words_indexes],
                outputs_info=[None,
                              theano.shared(np.zeros((self.max_phrase_length + 1, h_dim), dtype=theano.config.floatX)),
                              theano.shared(np.zeros((self.max_phrase_length + 1, h_dim), dtype=theano.config.floatX))])

        predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]),
                                     sequences=[words_indexes])

        self.classify = theano.function(inputs=[words, children_positions, words_indexes],
                                        outputs=predictions,
                                        allow_input_downcast=True,
                                        mode='FAST_RUN')
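

# A minimal usage sketch (hypothetical toy data and file names, assuming a
# Python 2 / Theano environment): pickle a tiny embedding table, train on a
# single three-node tree in which node 2 is the parent of nodes 0 and 1
# (-1 marks a missing child), then classify the same phrase.
if __name__ == "__main__":
    toy = {"vectors": np.random.uniform(-1.0, 1.0, (10, 4)).astype("float32"),
           "words2ids": {"a": 0, "b": 1, "c": 2}}
    with open("toy_w2v.pkl", "wb") as f:
        pickle.dump(toy, f)

    model = LSTM_1(h_dim=8, nc=3, w2v_model_path="toy_w2v.pkl", max_phrase_length=5)

    words = [0, 1, 2]                          # word id of each node
    children = [[-1, -1], [-1, -1], [0, 1]]    # children listed per node, padded with -1
    labels = [1, 0, 2]                         # one class per node
    indexes = [0, 1, 2]                        # buffer row written by each node

    for epoch in range(5):
        model.train(words, children, labels, indexes, 0.01)
    print(model.classify(words, children, indexes))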