Commit 5ee737bf2b2951378f2adeeffd7f1abc7b7670b5
1 parent
6060efac
Comparison of the MLP model on Polish and English data with the tree-LSTM. First version and experiments with the network parameterized by edges.
Showing 24 changed files with 5,173 additions and 479 deletions.
Too many changes to show: to preserve performance, only 19 of the 24 files are displayed.
main_for_experiments_on_polish_data_LSTM.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +from modules.rnn.LSTM_models import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data4, shuffle | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | +if __name__ == '__main__': | |
38 | + | |
39 | + | |
40 | + | |
41 | + w2v_DIM = "300" | |
42 | + | |
43 | + | |
44 | + | |
45 | + file_with_filtered_embeddings = "embeddings/embedding_and_words2ids_dim"+w2v_DIM+"_polish.pkl" | |
46 | + if not os.path.exists(file_with_filtered_embeddings): | |
47 | + print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.") | |
48 | + filter_embeddings(["data/dane_polskie/train/train_labels.txt", "data/dane_polskie/train/train_parents.txt","data/dane_polskie/train/train_sentence.txt", | |
49 | + "data/dane_polskie/dev/dev_labels.txt", "data/dane_polskie/dev/dev_parents.txt","data/dane_polskie/dev/dev_sentence.txt", | |
50 | + "data/dane_polskie/test/test_labels.txt", "data/dane_polskie/test/test_parents.txt","data/dane_polskie/test/test_sentence.txt"], | |
51 | + | |
52 | + "/home/norbert/Doktorat/clarin2sent/deeptagger/embeddings/w2v_allwiki_nkjpfull_"+w2v_DIM+".txt", | |
53 | + file_with_filtered_embeddings) | |
54 | + | |
55 | + | |
56 | + s = {'lr':0.002, | |
57 | + 'nepochs':40, | |
58 | + 'seed':345, | |
59 | + 'nc':3 # number of y classes | |
60 | + } | |
61 | + batch_size = 1 | |
62 | + | |
63 | + | |
64 | + for h_dim in [100, 150]: | |
65 | + | |
66 | + np.random.seed(s['seed']) | |
67 | + random.seed(s['seed']) | |
68 | + | |
69 | + | |
70 | + rnn = LSTM_1( h_dim, | |
71 | + nc = s['nc'], | |
72 | + w2v_model_path = file_with_filtered_embeddings, # path to the file with the filtered embeddings | |
73 | + max_phrase_length = 60 ) | |
74 | + | |
75 | + | |
76 | + train_data = load_stanford_data4("data/dane_polskie/train/train_labels.txt", "data/dane_polskie/train/train_parents.txt","data/dane_polskie/train/train_sentence.txt",rnn.words2ids,True,batch_size,s['nc']) | |
77 | + train_data_check = train_data | |
78 | + dev_data = load_stanford_data4("data/dane_polskie/dev/dev_labels.txt", "data/dane_polskie/dev/dev_parents.txt","data/dane_polskie/dev/dev_sentence.txt",rnn.words2ids,False,0,s['nc']) | |
79 | + test_data = load_stanford_data4("data/dane_polskie/test/test_labels.txt", "data/dane_polskie/test/test_parents.txt","data/dane_polskie/test/test_sentence.txt",rnn.words2ids,False,0,s['nc']) | |
80 | + | |
81 | + n_train = len(train_data) | |
82 | + n_dev = len(dev_data) | |
83 | + n_test = len(test_data) | |
84 | + | |
85 | + print "" | |
86 | + #print "model 56 : h_dim = ", h_dim, "h2_dim = ", h2_dim, "h3_dim = ", h3_dim, " learning rate = ", s['lr']#, "dropout rate: ", dropout_rate | |
87 | + print "model LSTM_1 : " , "h_dim = ", h_dim | |
88 | + print "" | |
89 | + | |
90 | + best_prediction_valid_all = 0 | |
91 | + best_prediction_test_all = 0 | |
92 | + best_prediction_test_root = 0 | |
93 | + early_stop = 0 | |
94 | + | |
95 | + | |
96 | + tic = time.time() | |
97 | + | |
98 | + for e in xrange(s['nepochs']): | |
99 | + | |
100 | + #if e >= 1: | |
101 | + # s['lr'] = 0.8 * s['lr'] | |
102 | + | |
103 | + if early_stop == 10: | |
104 | + break | |
105 | + | |
106 | + | |
107 | + # shuffle | |
108 | + shuffle([train_data], s['seed']) | |
109 | + | |
110 | + for i in range(n_train): | |
111 | + rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], s['lr']) | |
112 | + | |
113 | + | |
114 | + | |
115 | + # Dev: | |
116 | + counts_dev = np.zeros((s['nc'],s['nc']),dtype='int') | |
117 | + counts_dev_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
118 | + for ii in range(n_dev): | |
119 | + pred = rnn.classify(dev_data[ii][0],dev_data[ii][1], dev_data[ii][3]) | |
120 | + for j in range(len(pred)): | |
121 | + counts_dev[pred[j], dev_data[ii][2][j]] += 1 | |
122 | + counts_dev_root[pred[-1], dev_data[ii][2][-1]] += 1 | |
123 | + | |
124 | + | |
125 | + # Test: | |
126 | + counts_test = np.zeros((s['nc'],s['nc']),dtype='int') | |
127 | + counts_test_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
128 | + for i in range(n_test): | |
129 | + pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][3]) | |
130 | + for j in range(len(pred)): | |
131 | + counts_test[pred[j], test_data[i][2][j]] += 1 | |
132 | + counts_test_root[pred[-1], test_data[i][2][-1]] += 1 | |
133 | + | |
134 | + # Train | |
135 | + counts = np.zeros((s['nc'],s['nc']),dtype='int') | |
136 | + counts_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
137 | + for i in range(len(train_data_check)): | |
138 | + | |
139 | + if i % 1 == 0: # check the fit on the whole training set (raise the modulus to subsample and save time) | |
140 | + pred = rnn.classify(train_data_check[i][0],train_data_check[i][1], train_data_check[i][3]) | |
141 | + for j in range(len(pred)): | |
142 | + counts[pred[j], train_data_check[i][2][j]] += 1 | |
143 | + counts_root[pred[-1], train_data_check[i][2][-1]] += 1 | |
144 | + | |
145 | + print("epoch: ", e, | |
146 | + "V all: ", "%0.2f" % (100 * np.diag(counts_dev).sum()/float(counts_dev.sum())), | |
147 | + " Test all: ", "%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())), | |
148 | + "V root: ", "%0.2f" % (100 * np.diag(counts_dev_root).sum()/float(counts_dev_root.sum())), | |
149 | + " Test root: ", "%0.2f" % (100 * np.diag(counts_test_root).sum()/float(counts_test_root.sum())), | |
150 | + " Train: ", "%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())), | |
151 | + " Train root: ", "%0.2f" % (100 * np.diag(counts_root).sum()/float(counts_root.sum())) | |
152 | + ) | |
153 | + | |
154 | + | |
155 | + if np.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction_valid_all: | |
156 | + best_prediction_valid_all = np.diag(counts_dev).sum()/float(counts_dev.sum()) | |
157 | + best_prediction_test_all = np.diag(counts_test).sum()/float(counts_test.sum()) | |
158 | + best_prediction_test_root = np.diag(counts_test_root).sum()/float(counts_test_root.sum()) | |
159 | + | |
160 | + early_stop = 0 | |
161 | + else: | |
162 | + early_stop = early_stop + 1 | |
163 | + | |
164 | + | |
165 | + print("Best valid: ", "%0.2f" % (100 * best_prediction_valid_all)," Test all: ","%0.2f" % (100 * best_prediction_test_all),"Test root: ","%0.2f" % (100 * best_prediction_test_root), " time: ", time.time()-tic) | |
166 | + | |
... | ... |
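Note on the evaluation used in this script (and in the other main_* scripts below): each epoch fills an nc x nc confusion matrix per split, with rows indexed by the predicted class and columns by the gold class, plus a second matrix restricted to the root node of each tree; accuracy is the trace divided by the total count. A minimal NumPy sketch of that metric with toy predictions and labels (the helper name is mine, not part of the repository):

import numpy as np

def accuracy_from_counts(pred, gold, nc):
    # Confusion-matrix accuracy as computed in the training loops:
    # counts[p, g] is incremented for every node, accuracy = trace / total.
    counts = np.zeros((nc, nc), dtype='int')
    for p, g in zip(pred, gold):
        counts[p, g] += 1
    return np.diag(counts).sum() / float(counts.sum())

# toy example: 3 sentiment classes, 6 tree nodes, 5 correct predictions
pred = [0, 2, 1, 1, 2, 0]
gold = [0, 2, 2, 1, 2, 0]
print("%0.2f" % (100 * accuracy_from_counts(pred, gold, nc=3)))  # 83.33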
main_for_experiments_on_polish_data_MLP2.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +from modules.rnn.models_with_relations import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data6, shuffle | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | +if __name__ == '__main__': | |
38 | + | |
39 | + | |
40 | + | |
41 | + w2v_DIM = "300" | |
42 | + | |
43 | + | |
44 | + | |
45 | + file_with_filtered_embeddings = "embeddings/embedding_and_words2ids_dim"+w2v_DIM+"_polish.pkl" | |
46 | + if not os.path.exists(file_with_filtered_embeddings): | |
47 | + print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.") | |
48 | + filter_embeddings(["data/dane_polskie/train/train_labels.txt", "data/dane_polskie/train/train_parents.txt","data/dane_polskie/train/train_sentence.txt", | |
49 | + "data/dane_polskie/dev/dev_labels.txt", "data/dane_polskie/dev/dev_parents.txt","data/dane_polskie/dev/dev_sentence.txt", | |
50 | + "data/dane_polskie/test/test_labels.txt", "data/dane_polskie/test/test_parents.txt","data/dane_polskie/test/test_sentence.txt"], | |
51 | + | |
52 | + "/home/norbert/Doktorat/clarin2sent/deeptagger/embeddings/w2v_allwiki_nkjpfull_"+w2v_DIM+".txt", | |
53 | + file_with_filtered_embeddings) | |
54 | + | |
55 | + | |
56 | + s = {'lr':0.002, | |
57 | + 'nepochs':40, | |
58 | + 'seed':345, | |
59 | + 'nc':3 # number of y classes | |
60 | + } | |
61 | + batch_size = 1 | |
62 | + | |
63 | + | |
64 | + for h_dim in [50]: | |
65 | + | |
66 | + np.random.seed(s['seed']) | |
67 | + random.seed(s['seed']) | |
68 | + | |
69 | + | |
70 | + rnn = MLP_2_1( h_dim, h_dim, | |
71 | + nc = s['nc'], | |
72 | + w2v_model_path = file_with_filtered_embeddings, # path to the file with the filtered embeddings | |
73 | + max_phrase_length = 60 ) | |
74 | + | |
75 | + | |
76 | + train_data = load_stanford_data4("data/dane_polskie/train/train_labels.txt", | |
77 | + "data/dane_polskie/train/train_parents.txt", | |
78 | + "data/dane_polskie/train/train_sentence.txt", | |
79 | + | |
80 | +############################################################################################# | |
81 | + | |
82 | + # THERE ARE NO RELATIONS FOR THE POLISH DATA | |
83 | + | |
84 | +############################################################################################# | |
85 | + | |
86 | + rnn.words2ids,True,batch_size,s['nc']) | |
87 | + train_data_check = train_data | |
88 | + dev_data = load_stanford_data4("data/dane_polskie/dev/dev_labels.txt", "data/dane_polskie/dev/dev_parents.txt","data/dane_polskie/dev/dev_sentence.txt",rnn.words2ids,False,0,s['nc']) | |
89 | + test_data = load_stanford_data4("data/dane_polskie/test/test_labels.txt", "data/dane_polskie/test/test_parents.txt","data/dane_polskie/test/test_sentence.txt",rnn.words2ids,False,0,s['nc']) | |
90 | + | |
91 | + n_train = len(train_data) | |
92 | + n_dev = len(dev_data) | |
93 | + n_test = len(test_data) | |
94 | + | |
95 | + print "" | |
96 | + #print "model 56 : h_dim = ", h_dim, "h2_dim = ", h2_dim, "h3_dim = ", h3_dim, " learning rate = ", s['lr']#, "dropout rate: ", dropout_rate | |
97 | + print "model MLP_2_1 : " , "h_dim = ", h_dim | |
98 | + print "" | |
99 | + | |
100 | + best_prediction_valid_all = 0 | |
101 | + best_prediction_test_all = 0 | |
102 | + best_prediction_test_root = 0 | |
103 | + early_stop = 0 | |
104 | + | |
105 | + | |
106 | + tic = time.time() | |
107 | + | |
108 | + for e in xrange(s['nepochs']): | |
109 | + | |
110 | + #if e >= 1: | |
111 | + # s['lr'] = 0.8 * s['lr'] | |
112 | + | |
113 | + if early_stop == 10: | |
114 | + break | |
115 | + | |
116 | + | |
117 | + # shuffle | |
118 | + shuffle([train_data], s['seed']) | |
119 | + | |
120 | + for i in range(n_train): | |
121 | + rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], s['lr']) | |
122 | + | |
123 | + | |
124 | + | |
125 | + # Dev: | |
126 | + counts_dev = np.zeros((s['nc'],s['nc']),dtype='int') | |
127 | + counts_dev_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
128 | + for ii in range(n_dev): | |
129 | + pred = rnn.classify(dev_data[ii][0],dev_data[ii][1], dev_data[ii][3]) | |
130 | + for j in range(len(pred)): | |
131 | + counts_dev[pred[j], dev_data[ii][2][j]] += 1 | |
132 | + counts_dev_root[pred[-1], dev_data[ii][2][-1]] += 1 | |
133 | + | |
134 | + | |
135 | + # Test: | |
136 | + counts_test = np.zeros((s['nc'],s['nc']),dtype='int') | |
137 | + counts_test_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
138 | + for i in range(n_test): | |
139 | + pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][3]) | |
140 | + for j in range(len(pred)): | |
141 | + counts_test[pred[j], test_data[i][2][j]] += 1 | |
142 | + counts_test_root[pred[-1], test_data[i][2][-1]] += 1 | |
143 | + | |
144 | + # Train | |
145 | + counts = np.zeros((s['nc'],s['nc']),dtype='int') | |
146 | + counts_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
147 | + for i in range(len(train_data_check)): | |
148 | + | |
149 | + if i % 1 == 0: # check the fit on the whole training set (raise the modulus to subsample and save time) | |
150 | + pred = rnn.classify(train_data_check[i][0],train_data_check[i][1], train_data_check[i][3]) | |
151 | + for j in range(len(pred)): | |
152 | + counts[pred[j], train_data_check[i][2][j]] += 1 | |
153 | + counts_root[pred[-1], train_data_check[i][2][-1]] += 1 | |
154 | + | |
155 | + print("epoch: ", e, | |
156 | + "V all: ", "%0.2f" % (100 * np.diag(counts_dev).sum()/float(counts_dev.sum())), | |
157 | + " Test all: ", "%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())), | |
158 | + "V root: ", "%0.2f" % (100 * np.diag(counts_dev_root).sum()/float(counts_dev_root.sum())), | |
159 | + " Test root: ", "%0.2f" % (100 * np.diag(counts_test_root).sum()/float(counts_test_root.sum())), | |
160 | + " Train: ", "%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())), | |
161 | + " Train root: ", "%0.2f" % (100 * np.diag(counts_root).sum()/float(counts_root.sum())) | |
162 | + ) | |
163 | + | |
164 | + | |
165 | + if np.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction_valid_all: | |
166 | + best_prediction_valid_all = np.diag(counts_dev).sum()/float(counts_dev.sum()) | |
167 | + best_prediction_test_all = np.diag(counts_test).sum()/float(counts_test.sum()) | |
168 | + best_prediction_test_root = np.diag(counts_test_root).sum()/float(counts_test_root.sum()) | |
169 | + | |
170 | + early_stop = 0 | |
171 | + else: | |
172 | + early_stop = early_stop + 1 | |
173 | + | |
174 | + | |
175 | + print("Best valid: ", "%0.2f" % (100 * best_prediction_valid_all)," Test all: ","%0.2f" % (100 * best_prediction_test_all),"Test root: ","%0.2f" % (100 * best_prediction_test_root), " time: ", time.time()-tic) | |
176 | + | |
... | ... |
main_for_experiments_on_sst_MLP2.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +from modules.rnn.models_with_relations import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data6, shuffle | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | + | |
38 | +if __name__ == '__main__': | |
39 | + | |
40 | + #theano.config.floatX = 'float64' | |
41 | + | |
42 | + file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl" | |
43 | + if not os.path.exists(file_with_filtered_embeddings): | |
44 | + print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.") | |
45 | + filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks", "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks", "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks"], | |
46 | + | |
47 | + "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt", | |
48 | + file_with_filtered_embeddings) | |
49 | + | |
50 | + | |
51 | + batch_size = 1 | |
52 | + | |
53 | + s = {'lr':0.002, | |
54 | + 'nepochs':30, | |
55 | + 'seed':345, | |
56 | + 'nc':5 # number of y classes | |
57 | + } | |
58 | + | |
59 | + | |
60 | + batch_size = 1 | |
61 | + | |
62 | + | |
63 | + | |
64 | + for ne_dim, nchd_dim, nh2_dim, number_of_relations in [(50,50, 50, 5),(100,100, 100, 5),(50,50, 50, 10),(100,100, 100, 10),(200,200, 100, 5)]: | |
65 | + | |
66 | + np.random.seed(s['seed']) | |
67 | + random.seed(s['seed']) | |
68 | + | |
69 | + | |
70 | + rnn = MLP_2_2( ne = ne_dim, nchd = nchd_dim, nh2 = nh2_dim, | |
71 | + nc = s['nc'], | |
72 | + w2v_model_path = file_with_filtered_embeddings, # path to the file with the filtered embeddings | |
73 | + max_phrase_length = 60, | |
74 | + number_of_relations = number_of_relations ) | |
75 | + | |
76 | + train_data = load_stanford_data6("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks","data/sst/train/rels.txt",rnn.words2ids,True,batch_size,s['nc'], k_most_common_relations = number_of_relations) | |
77 | + | |
78 | + dev_data = load_stanford_data6("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks","data/sst/dev/rels.txt",rnn.words2ids,False,0,s['nc'], k_most_common_relations = number_of_relations) | |
79 | + | |
80 | + test_data = load_stanford_data6("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks","data/sst/test/rels.txt",rnn.words2ids,False,0,s['nc'], k_most_common_relations = number_of_relations) | |
81 | + | |
82 | + n_train = len(train_data) | |
83 | + n_dev = len(dev_data) | |
84 | + n_test = len(test_data) | |
85 | + | |
86 | + print "" | |
87 | + print "lr = ", s['lr'], "number_of_relations = ", number_of_relations | |
88 | + print "model MLP_2_2 : ", "nchd_dim = ", nchd_dim ,"ne_dim = ", ne_dim , "nh2 =", nh2_dim | |
89 | + print "" | |
90 | + | |
91 | + best_prediction_valid_all = 0 | |
92 | + best_prediction_test_all = 0 | |
93 | + best_prediction_test_root = 0 | |
94 | + early_stop = 0 | |
95 | + | |
96 | + | |
97 | + tic = time.time() | |
98 | + | |
99 | + for e in xrange(s['nepochs']): | |
100 | + | |
101 | + #if e >= 1: | |
102 | + # s['lr'] = 0.8 * s['lr'] | |
103 | + | |
104 | + if early_stop == 5: | |
105 | + break | |
106 | + | |
107 | + | |
108 | + # shuffle | |
109 | + shuffle([train_data], s['seed']) | |
110 | + | |
111 | + for i in range(n_train): | |
112 | + rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], train_data[i][4],s['lr']) | |
113 | + | |
114 | + | |
115 | + | |
116 | + # Dev: | |
117 | + counts_dev = np.zeros((s['nc'],s['nc']),dtype='int') | |
118 | + counts_dev_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
119 | + for ii in range(n_dev): | |
120 | + pred = rnn.classify(dev_data[ii][0],dev_data[ii][1], dev_data[ii][3], dev_data[ii][4]) | |
121 | + for j in range(len(pred)): | |
122 | + counts_dev[pred[j], dev_data[ii][2][j]] += 1 | |
123 | + counts_dev_root[pred[-1], dev_data[ii][2][-1]] += 1 | |
124 | + | |
125 | + | |
126 | + # Test: | |
127 | + counts_test = np.zeros((s['nc'],s['nc']),dtype='int') | |
128 | + counts_test_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
129 | + for i in range(n_test): | |
130 | + pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][3], test_data[i][4]) | |
131 | + for j in range(len(pred)): | |
132 | + counts_test[pred[j], test_data[i][2][j]] += 1 | |
133 | + counts_test_root[pred[-1], test_data[i][2][-1]] += 1 | |
134 | + | |
135 | + # Train | |
136 | + counts = np.zeros((s['nc'],s['nc']),dtype='int') | |
137 | + counts_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
138 | + for i in range(len(train_data)): | |
139 | + | |
140 | + if i % 10 == 0: # check the fit on 1/10 of the training set to save time | |
141 | + pred = rnn.classify(train_data[i][0],train_data[i][1], train_data[i][3], train_data[i][4]) | |
142 | + for j in range(len(pred)): | |
143 | + counts[pred[j], train_data[i][2][j]] += 1 | |
144 | + counts_root[pred[-1], train_data[i][2][-1]] += 1 | |
145 | + | |
146 | + | |
147 | + | |
148 | + print("Valid: ", "%0.2f" % (100 * np.diag(counts_dev).sum()/float(counts_dev.sum())), | |
149 | + "Valid root: ","%0.2f" % (100 * np.diag(counts_dev_root).sum()/float(counts_dev_root.sum())), | |
150 | + " Test all: ","%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())), | |
151 | + "Test root: ","%0.2f" % (100 * np.diag(counts_test_root).sum()/float(counts_test_root.sum())), | |
152 | + " Train all: ","%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())), | |
153 | + "Train root: ","%0.2f" % (100 * np.diag(counts_root).sum()/float(counts_root.sum()))," time: ", time.time()-tic) | |
154 | + | |
155 | + if np.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction_valid_all: | |
156 | + best_prediction_valid_all = np.diag(counts_dev).sum()/float(counts_dev.sum()) | |
157 | + best_prediction_test_all = np.diag(counts_test).sum()/float(counts_test.sum()) | |
158 | + best_prediction_test_root = np.diag(counts_test_root).sum()/float(counts_test_root.sum()) | |
159 | + | |
160 | + early_stop = 0 | |
161 | + else: | |
162 | + early_stop = early_stop + 1 | |
163 | + | |
164 | + | |
165 | + print("Best valid: ", "%0.2f" % (100 * best_prediction_valid_all)," Test all: ","%0.2f" % (100 * best_prediction_test_all),"Test root: ","%0.2f" % (100 * best_prediction_test_root), " time: ", time.time()-tic) | |
166 | + | |
167 | + | |
168 | + | |
169 | + | |
170 | + | |
... | ... |
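This script exercises the edge-parameterized variant mentioned in the commit message: MLP_2_2 comes from modules/rnn/models_with_relations.py (one of the five files not displayed here), takes a number_of_relations argument, and load_stanford_data6 additionally returns a per-node vector of dependency-relation ids (train_data[i][4]) restricted to the k most common relations. The class itself is not shown in this diff, so the following is only a generic illustration of what parameterizing edges by relation id can look like, not the actual MLP_2_2 implementation:

import numpy as np

def relation_parameterized_child_sum(child_h, child_rel_ids, U_rel):
    # Generic illustration only: aggregate the children's hidden states with a
    # projection matrix chosen by each edge's dependency-relation id.
    # child_h: (n_children, h_dim); child_rel_ids: ints in [0, R); U_rel: (R, h_dim, h_dim).
    return sum(np.dot(child_h[k], U_rel[child_rel_ids[k]]) for k in range(child_h.shape[0]))

# toy usage with R = 5 relations and h_dim = 4
R, h_dim = 5, 4
U_rel = 0.05 * np.random.uniform(-1.0, 1.0, (R, h_dim, h_dim))
child_h = np.random.randn(3, h_dim)
child_rel_ids = np.array([0, 2, 2])
print(relation_parameterized_child_sum(child_h, child_rel_ids, U_rel).shape)  # (4,)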
main_for_experiments_on_stanford_data.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +from modules.rnn.models import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data4, shuffle | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | + | |
38 | +if __name__ == '__main__': | |
39 | + | |
40 | + #theano.config.floatX = 'float64' | |
41 | + | |
42 | + file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl" | |
43 | + if not os.path.exists(file_with_filtered_embeddings): | |
44 | + print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.") | |
45 | + filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks", "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks", "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks"], | |
46 | + | |
47 | + "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt", | |
48 | + file_with_filtered_embeddings) | |
49 | + | |
50 | + | |
51 | + batch_size = 1 | |
52 | + | |
53 | + | |
54 | + # TODO: examine models 7, 8, 9, 1, 2, 3, 10, 11, 5, 6 | |
55 | + | |
56 | + | |
57 | + | |
58 | + s = {'lr':0.002, | |
59 | + 'nepochs':30, | |
60 | + 'seed':345, | |
61 | + 'nc':5 # number of y classes | |
62 | + } | |
63 | + | |
64 | + | |
65 | + batch_size = 1 | |
66 | + | |
67 | + | |
68 | + | |
69 | + | |
70 | + | |
71 | + for ne_dim, nchd_dim in [(100,100)]:#,(200,200, 200,100),(200,200, 300,100),(100,100, 200,100)]: | |
72 | + | |
73 | + np.random.seed(s['seed']) | |
74 | + random.seed(s['seed']) | |
75 | + | |
76 | + | |
77 | + rnn = model55_pf1( ne = ne_dim, nchd = nchd_dim,# nh2 = nh2_dim, | |
78 | + nc = s['nc'], | |
79 | + w2v_model_path = file_with_filtered_embeddings, # path to the file with the filtered embeddings | |
80 | + max_phrase_length = 60 ) | |
81 | + | |
82 | + train_data = load_stanford_data4("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks",rnn.words2ids,True,batch_size,s['nc']) | |
83 | + | |
84 | + dev_data = load_stanford_data4("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks",rnn.words2ids,False,0,s['nc']) | |
85 | + | |
86 | + test_data = load_stanford_data4("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks",rnn.words2ids,False,0,s['nc']) | |
87 | + | |
88 | + n_train = len(train_data) | |
89 | + n_dev = len(dev_data) | |
90 | + n_test = len(test_data) | |
91 | + | |
92 | + print "" | |
93 | + print "lr = ", s['lr'] | |
94 | + print "model 55_pf1 : ", "nchd_dim = ", nchd_dim ,"ne_dim = ", ne_dim #, "nh2_dim = ", nh2_dim | |
95 | + print "" | |
96 | + | |
97 | + best_prediction_valid_all = 0 | |
98 | + best_prediction_test_all = 0 | |
99 | + best_prediction_test_root = 0 | |
100 | + early_stop = 0 | |
101 | + | |
102 | + | |
103 | + tic = time.time() | |
104 | + | |
105 | + for e in xrange(s['nepochs']): | |
106 | + | |
107 | + #if e >= 1: | |
108 | + # s['lr'] = 0.8 * s['lr'] | |
109 | + | |
110 | + if early_stop == 5: | |
111 | + break | |
112 | + | |
113 | + | |
114 | + # shuffle | |
115 | + shuffle([train_data], s['seed']) | |
116 | + | |
117 | + for i in range(n_train): | |
118 | + rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], s['lr']) | |
119 | + | |
120 | + | |
121 | + | |
122 | + # Dev: | |
123 | + counts_dev = np.zeros((s['nc'],s['nc']),dtype='int') | |
124 | + counts_dev_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
125 | + for ii in range(n_dev): | |
126 | + pred = rnn.classify(dev_data[ii][0],dev_data[ii][1], dev_data[ii][3]) | |
127 | + for j in range(len(pred)): | |
128 | + counts_dev[pred[j], dev_data[ii][2][j]] += 1 | |
129 | + counts_dev_root[pred[-1], dev_data[ii][2][-1]] += 1 | |
130 | + | |
131 | + | |
132 | + # Test: | |
133 | + counts_test = np.zeros((s['nc'],s['nc']),dtype='int') | |
134 | + counts_test_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
135 | + for i in range(n_test): | |
136 | + pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][3]) | |
137 | + for j in range(len(pred)): | |
138 | + counts_test[pred[j], test_data[i][2][j]] += 1 | |
139 | + counts_test_root[pred[-1], test_data[i][2][-1]] += 1 | |
140 | + | |
141 | + # Train | |
142 | + counts = np.zeros((s['nc'],s['nc']),dtype='int') | |
143 | + counts_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
144 | + for i in range(len(train_data)): | |
145 | + | |
146 | + if i % 10 == 0: # check the fit on 1/10 of the training set to save time | |
147 | + pred = rnn.classify(train_data[i][0],train_data[i][1], train_data[i][3]) | |
148 | + for j in range(len(pred)): | |
149 | + counts[pred[j], train_data[i][2][j]] += 1 | |
150 | + counts_root[pred[-1], train_data[i][2][-1]] += 1 | |
151 | + | |
152 | + | |
153 | + | |
154 | + print("Valid: ", "%0.2f" % (100 * np.diag(counts_dev).sum()/float(counts_dev.sum())), | |
155 | + "Valid root: ","%0.2f" % (100 * np.diag(counts_dev_root).sum()/float(counts_dev_root.sum())), | |
156 | + " Test all: ","%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())), | |
157 | + "Test root: ","%0.2f" % (100 * np.diag(counts_test_root).sum()/float(counts_test_root.sum())), | |
158 | + " Train all: ","%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())), | |
159 | + "Train root: ","%0.2f" % (100 * np.diag(counts_root).sum()/float(counts_root.sum()))," time: ", time.time()-tic) | |
160 | + | |
161 | + if np.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction_valid_all: | |
162 | + best_prediction_valid_all = np.diag(counts_dev).sum()/float(counts_dev.sum()) | |
163 | + best_prediction_test_all = np.diag(counts_test).sum()/float(counts_test.sum()) | |
164 | + best_prediction_test_root = np.diag(counts_test_root).sum()/float(counts_test_root.sum()) | |
165 | + | |
166 | + early_stop = 0 | |
167 | + else: | |
168 | + early_stop = early_stop + 1 | |
169 | + | |
170 | + | |
171 | + print("Best valid: ", "%0.2f" % (100 * best_prediction_valid_all)," Test all: ","%0.2f" % (100 * best_prediction_test_all),"Test root: ","%0.2f" % (100 * best_prediction_test_root), " time: ", time.time()-tic) | |
172 | + | |
173 | + | |
174 | + | |
175 | + | |
176 | +### 9 , 2,3 , 5,6 | |
177 | + | |
178 | + | |
179 | + | |
... | ... |
main_for_sst_LSTM.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +from modules.rnn.LSTM_models import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data4, shuffle | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | +if __name__ == '__main__': | |
38 | + | |
39 | + | |
40 | + sys.setrecursionlimit(10000) | |
41 | + | |
42 | + #w2v_DIM = "300" | |
43 | + | |
44 | + | |
45 | + | |
46 | + file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl" | |
47 | + #if not os.path.exists(file_with_filtered_embeddings): | |
48 | + # print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.") | |
49 | + # filter_embeddings(["data/dane_polskie/train/train_labels.txt", "data/dane_polskie/train/train_parents.txt","data/dane_polskie/train/train_sentence.txt", | |
50 | + # "data/dane_polskie/dev/dev_labels.txt", "data/dane_polskie/dev/dev_parents.txt","data/dane_polskie/dev/dev_sentence.txt", | |
51 | + # "data/dane_polskie/test/test_labels.txt", "data/dane_polskie/test/test_parents.txt","data/dane_polskie/test/test_sentence.txt"], | |
52 | + | |
53 | + #"/home/norbert/Doktorat/clarin2sent/deeptagger/embeddings/w2v_allwiki_nkjpfull_"+w2v_DIM+".txt", | |
54 | + #file_with_filtered_embeddings) | |
55 | + | |
56 | + | |
57 | + s = {'lr':0.002, | |
58 | + 'nepochs':40, | |
59 | + 'seed':345, | |
60 | + 'nc':5 # number of y classes | |
61 | + } | |
62 | + batch_size = 1 | |
63 | + | |
64 | + | |
65 | + for h_dim in [100]: #100, 150, 200 | |
66 | + | |
67 | + np.random.seed(s['seed']) | |
68 | + random.seed(s['seed']) | |
69 | + | |
70 | + | |
71 | + rnn = LSTM_1( h_dim, | |
72 | + nc = s['nc'], | |
73 | + w2v_model_path = file_with_filtered_embeddings, # path to the file with the filtered embeddings | |
74 | + max_phrase_length = 60 ) | |
75 | + | |
76 | + | |
77 | + train_data = load_stanford_data4("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks",rnn.words2ids,True,batch_size,s['nc']) | |
78 | + dev_data = load_stanford_data4("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks",rnn.words2ids,False,0,s['nc']) | |
79 | + test_data = load_stanford_data4("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks",rnn.words2ids,False,0,s['nc']) | |
80 | + | |
81 | + n_train = len(train_data) | |
82 | + n_dev = len(dev_data) | |
83 | + n_test = len(test_data) | |
84 | + | |
85 | + print "" | |
86 | + print "learning rate: ", s['lr'] | |
87 | + print "model LSTM_1 : " , "h_dim = ", h_dim | |
88 | + print "" | |
89 | + | |
90 | + best_prediction_valid_all = 0 | |
91 | + best_prediction_test_all = 0 | |
92 | + best_prediction_test_root = 0 | |
93 | + early_stop = 0 | |
94 | + | |
95 | + | |
96 | + tic = time.time() | |
97 | + | |
98 | + for e in xrange(s['nepochs']): | |
99 | + | |
100 | + #if e >= 1: | |
101 | + # s['lr'] = 0.8 * s['lr'] | |
102 | + | |
103 | + if early_stop == 5: | |
104 | + break | |
105 | + | |
106 | + | |
107 | + # shuffle | |
108 | + shuffle([train_data], s['seed']) | |
109 | + | |
110 | + for i in range(n_train): | |
111 | + rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], s['lr']) | |
112 | + | |
113 | + pickle.dump(rnn, open("model" + str(e) + ".pkl",'wb')) | |
114 | + | |
115 | + # Dev: | |
116 | + counts_dev = np.zeros((s['nc'],s['nc']),dtype='int') | |
117 | + counts_dev_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
118 | + for ii in range(n_dev): | |
119 | + pred = rnn.classify(dev_data[ii][0],dev_data[ii][1], dev_data[ii][3]) | |
120 | + for j in range(len(pred)): | |
121 | + counts_dev[pred[j], dev_data[ii][2][j]] += 1 | |
122 | + counts_dev_root[pred[-1], dev_data[ii][2][-1]] += 1 | |
123 | + | |
124 | + | |
125 | + # Test: | |
126 | + counts_test = np.zeros((s['nc'],s['nc']),dtype='int') | |
127 | + counts_test_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
128 | + for i in range(n_test): | |
129 | + pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][3]) | |
130 | + for j in range(len(pred)): | |
131 | + counts_test[pred[j], test_data[i][2][j]] += 1 | |
132 | + counts_test_root[pred[-1], test_data[i][2][-1]] += 1 | |
133 | + | |
134 | + # Train | |
135 | + counts = np.zeros((s['nc'],s['nc']),dtype='int') | |
136 | + counts_root = np.zeros((s['nc'],s['nc']),dtype='int') | |
137 | + for i in range(len(train_data)): | |
138 | + | |
139 | + if i % 5 == 0: # check the fit on 1/5 of the training set to save time | |
140 | + pred = rnn.classify(train_data[i][0],train_data[i][1], train_data[i][3]) | |
141 | + for j in range(len(pred)): | |
142 | + counts[pred[j], train_data[i][2][j]] += 1 | |
143 | + counts_root[pred[-1], train_data[i][2][-1]] += 1 | |
144 | + | |
145 | + print("epoch: ", e, | |
146 | + "V all: ", "%0.2f" % (100 * np.diag(counts_dev).sum()/float(counts_dev.sum())), | |
147 | + " Test all: ", "%0.2f" % (100 * np.diag(counts_test).sum()/float(counts_test.sum())), | |
148 | + "V root: ", "%0.2f" % (100 * np.diag(counts_dev_root).sum()/float(counts_dev_root.sum())), | |
149 | + " Test root: ", "%0.2f" % (100 * np.diag(counts_test_root).sum()/float(counts_test_root.sum())), | |
150 | + " Train: ", "%0.2f" % (100 * np.diag(counts).sum()/float(counts.sum())), | |
151 | + " Train root: ", "%0.2f" % (100 * np.diag(counts_root).sum()/float(counts_root.sum())) | |
152 | + ) | |
153 | + | |
154 | + | |
155 | + if np.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction_valid_all: | |
156 | + best_prediction_valid_all = np.diag(counts_dev).sum()/float(counts_dev.sum()) | |
157 | + best_prediction_test_all = np.diag(counts_test).sum()/float(counts_test.sum()) | |
158 | + best_prediction_test_root = np.diag(counts_test_root).sum()/float(counts_test_root.sum()) | |
159 | + | |
160 | + early_stop = 0 | |
161 | + else: | |
162 | + early_stop = early_stop + 1 | |
163 | + | |
164 | + | |
165 | + print("Best valid: ", "%0.2f" % (100 * best_prediction_valid_all)," Test all: ","%0.2f" % (100 * best_prediction_test_all),"Test root: ","%0.2f" % (100 * best_prediction_test_root), " time: ", time.time()-tic) | |
166 | + | |
... | ... |
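Side note on the per-epoch pickle.dump(rnn, ...) above: the whole LSTM_1 object is serialized, including the compiled Theano functions and shared variables, whose graphs can exceed Python's default recursion limit; that is presumably why sys.setrecursionlimit(10000) is raised at the top of this script. A minimal sketch of reloading such a checkpoint (the filename model0.pkl is just the naming pattern used in the loop above, not a file guaranteed to exist):

import pickle
import sys

sys.setrecursionlimit(10000)  # deep Theano graphs can hit the default limit during (un)pickling

with open("model0.pkl", "rb") as f:  # epoch-0 checkpoint written by the training loop above
    rnn = pickle.load(f)

# the restored object exposes the same compiled functions as the original:
# rnn.classify(words, children_positions, words_indexes)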
modules/rnn/LSTM_models.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +#from modules.rnn.many_models import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data4 | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | + | |
38 | + | |
39 | +class LSTM_1(object): | |
40 | + def __init__(self, h_dim, nc, w2v_model_path, max_phrase_length): | |
41 | + | |
42 | + ''' | |
43 | + h_dim :: dimension of the hidden state | |
44 | + nc :: number of classes | |
45 | + ''' | |
46 | + | |
47 | + self.max_phrase_length = max_phrase_length | |
48 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
49 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
50 | + self.words2ids = w2vecs["words2ids"] | |
51 | + | |
52 | + emb_dim = w2vecs["vectors"].shape[1] | |
53 | + del w2vecs | |
54 | + | |
55 | + r = 0.05 | |
56 | + | |
57 | + self.W_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim) ).astype(theano.config.floatX)) | |
58 | + self.U_i = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX)) | |
59 | + self.b_i = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX)) | |
60 | + | |
61 | + self.W_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim) ).astype(theano.config.floatX)) | |
62 | + self.U_f = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX)) | |
63 | + self.b_f = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX)) | |
64 | + | |
65 | + self.W_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim) ).astype(theano.config.floatX)) | |
66 | + self.U_o = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX)) | |
67 | + self.b_o = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX)) | |
68 | + | |
69 | + self.W_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (emb_dim, h_dim) ).astype(theano.config.floatX)) | |
70 | + self.U_u = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, h_dim) ).astype(theano.config.floatX)) | |
71 | + self.b_u = theano.shared(r * np.random.uniform(-1.0, 1.0, h_dim ).astype(theano.config.floatX)) | |
72 | + | |
73 | + self.W_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (h_dim, nc)).astype(theano.config.floatX)) | |
74 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
75 | + | |
76 | + | |
77 | + def one_step(word_id, word_children_positions, y_true, k, hidden_states, cell_states, learning_rate): | |
78 | + | |
79 | + x = self.emb[word_id] | |
80 | + # hidden_states[-1] below is the zero vector (used for leaf nodes) # TODO: check +0.5 | |
81 | + tmp = word_children_positions>=0.0 | |
82 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
83 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. positions that are not -1 | |
84 | + | |
85 | + h_aggregated = ifelse(T.gt(number_of_children, 0.0), hidden_states[word_children_positions[idx_tmp]].sum(axis=0), hidden_states[-1]) | |
86 | + | |
87 | + #number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
88 | + #h_aggregated = h_aggregated/number_of_children # averaging of the children's hidden states | |
89 | + | |
90 | + | |
91 | + i = T.nnet.sigmoid( T.dot(x, self.W_i) + T.dot(h_aggregated, self.U_i) + self.b_i) | |
92 | + | |
93 | + o = T.nnet.sigmoid( T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o) | |
94 | + | |
95 | + u = T.tanh( T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u) | |
96 | + | |
97 | + f_c = ifelse(T.gt(number_of_children, 0.0), | |
98 | + (T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f) + self.b_f )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0), | |
99 | + T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[-1], self.U_f) + self.b_f ) * cell_states[-1] | |
100 | + ) | |
101 | + | |
102 | + c = i*u + f_c | |
103 | + | |
104 | + h = o * T.tanh(c) | |
105 | + | |
106 | + current_cell_state = cell_states[k] | |
107 | + cell_states_new = T.set_subtensor(current_cell_state, c) | |
108 | + | |
109 | + current_hidden_state = hidden_states[k] | |
110 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
111 | + | |
112 | + | |
113 | + y_prob = T.nnet.softmax(T.dot(h,self.W_y) + self.b_y)[0] | |
114 | + | |
115 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
116 | + | |
117 | + return cross_entropy, hidden_states_new, cell_states_new | |
118 | + | |
119 | + | |
120 | + y = T.vector('y',dtype=dataType) | |
121 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
122 | + words = T.vector(dtype=dataType) | |
123 | + children_positions = T.matrix(dtype=dataType) | |
124 | + words_indexes = T.vector(dtype=dataType) | |
125 | + | |
126 | + [cross_entropy_vector, _, _] , _ = theano.scan(fn=one_step, \ | |
127 | + sequences = [words, children_positions,y,words_indexes], | |
128 | + outputs_info = [None, | |
129 | + theano.shared(np.zeros((self.max_phrase_length+1,h_dim), dtype = theano.config.floatX)), | |
130 | + theano.shared(np.zeros((self.max_phrase_length+1,h_dim), dtype = theano.config.floatX))], | |
131 | + non_sequences = learning_rate, | |
132 | + n_steps = words.shape[0]) | |
133 | + cost = T.sum(cross_entropy_vector) | |
134 | + | |
135 | + updates = OrderedDict([ # note: the recurrent matrices U_i, U_f, U_o, U_u are not included in these updates | |
136 | + (self.W_i, self.W_i-learning_rate*T.grad(cost, self.W_i)), | |
137 | + (self.W_f, self.W_f-learning_rate*T.grad(cost, self.W_f)), | |
138 | + (self.W_o, self.W_o-learning_rate*T.grad(cost, self.W_o)), | |
139 | + (self.W_u, self.W_u-learning_rate*T.grad(cost, self.W_u)), | |
140 | + (self.W_y, self.W_y-learning_rate*T.grad(cost, self.W_y)), | |
141 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
142 | + (self.b_i, self.b_i-learning_rate*T.grad(cost,self.b_i)), | |
143 | + (self.b_f, self.b_f-learning_rate*T.grad(cost,self.b_f)), | |
144 | + (self.b_o, self.b_o-learning_rate*T.grad(cost,self.b_o)), | |
145 | + (self.b_u, self.b_u-learning_rate*T.grad(cost,self.b_u)), | |
146 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
147 | + ]) | |
148 | + | |
149 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
150 | + outputs = [], | |
151 | + updates = updates, | |
152 | + allow_input_downcast=True, | |
153 | + mode='FAST_RUN' | |
154 | + ) | |
155 | + | |
156 | + | |
157 | + def one_step_classify(word_id, word_children_positions, k, hidden_states, cell_states): | |
158 | + | |
159 | + x = self.emb[word_id] | |
160 | + # hidden_states[-1] below is the zero vector (used for leaf nodes) # TODO: check +0.5 | |
161 | + tmp = word_children_positions>=0.0 | |
162 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
163 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. positions that are not -1 | |
164 | + | |
165 | + h_aggregated = ifelse(T.gt(number_of_children, 0.0), hidden_states[word_children_positions[idx_tmp]].sum(axis=0), hidden_states[-1]) | |
166 | + | |
167 | + #number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
168 | + #h_aggregated = h_aggregated/number_of_children # averaging of the children's hidden states | |
169 | + | |
170 | + | |
171 | + i = T.nnet.sigmoid( T.dot(x, self.W_i) + T.dot(h_aggregated, self.U_i) + self.b_i) | |
172 | + | |
173 | + o = T.nnet.sigmoid( T.dot(x, self.W_o) + T.dot(h_aggregated, self.U_o) + self.b_o) | |
174 | + | |
175 | + u = T.tanh( T.dot(x, self.W_u) + T.dot(h_aggregated, self.U_u) + self.b_u) | |
176 | + | |
177 | + f_c = ifelse(T.gt(number_of_children, 0.0), | |
178 | + (T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[word_children_positions[idx_tmp]], self.U_f) + self.b_f )*cell_states[word_children_positions[idx_tmp]]).sum(axis=0), | |
179 | + T.nnet.sigmoid( T.dot(x, self.W_f ) + T.dot(hidden_states[-1], self.U_f) + self.b_f ) * cell_states[-1] | |
180 | + ) | |
181 | + | |
182 | + c = i*u + f_c | |
183 | + | |
184 | + h = o * T.tanh(c) | |
185 | + | |
186 | + current_cell_state = cell_states[k] | |
187 | + cell_states_new = T.set_subtensor(current_cell_state, c) | |
188 | + | |
189 | + current_hidden_state = hidden_states[k] | |
190 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
191 | + | |
192 | + | |
193 | + y_prob = T.nnet.softmax(T.dot(h,self.W_y) + self.b_y)[0] | |
194 | + | |
195 | + return y_prob, hidden_states_new, cell_states_new | |
196 | + | |
197 | + | |
198 | + [y_probs_classify, _, _ ], _ = theano.scan( | |
199 | + fn=one_step_classify, | |
200 | + sequences = [words, children_positions, words_indexes], | |
201 | + outputs_info = [None, | |
202 | + theano.shared(np.zeros((self.max_phrase_length+1,h_dim), dtype = theano.config.floatX)), | |
203 | + theano.shared(np.zeros((self.max_phrase_length+1,h_dim), dtype = theano.config.floatX))]) | |
204 | + | |
205 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
206 | + sequences = [words_indexes]) | |
207 | + | |
208 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
209 | + outputs=predictions, | |
210 | + allow_input_downcast=True, | |
211 | + mode='FAST_RUN' | |
212 | + ) | |
213 | + | |
214 | + | |
... | ... |
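For reference, the node update computed inside one_step / one_step_classify above is a child-sum style update: the input, output and candidate gates are driven by the sum of the children's hidden states, while a separate forget gate is computed for each child and applied to that child's cell state (a leaf reads hidden_states[-1] / cell_states[-1], which stay zero). A minimal NumPy sketch of one node, reusing the same parameter names (the helper itself is mine, not part of the module):

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def tree_lstm_node(x, child_h, child_c, P):
    # x: (emb_dim,) word embedding; child_h, child_c: (n_children, h_dim),
    # or a single zero row for a leaf; P maps parameter names (W_*, U_*, b_*) to arrays.
    h_sum = child_h.sum(axis=0)                                              # aggregate the children
    i = sigmoid(np.dot(x, P['W_i']) + np.dot(h_sum, P['U_i']) + P['b_i'])    # input gate
    o = sigmoid(np.dot(x, P['W_o']) + np.dot(h_sum, P['U_o']) + P['b_o'])    # output gate
    u = np.tanh(np.dot(x, P['W_u']) + np.dot(h_sum, P['U_u']) + P['b_u'])    # candidate cell value
    f = sigmoid(np.dot(x, P['W_f']) + np.dot(child_h, P['U_f']) + P['b_f'])  # one forget gate per child
    c = i * u + (f * child_c).sum(axis=0)                                    # new cell state
    h = o * np.tanh(c)                                                       # new hidden state
    return h, c

The per-node class distribution is then softmax(np.dot(h, W_y) + b_y), exactly as in the Theano graph above.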
modules/rnn/LSTM_models.pyc
0 → 100644
No preview for this file type
modules/rnn/models.py
0 → 100644
1 | +import numpy as np | |
2 | +import time | |
3 | +import sys | |
4 | +import subprocess | |
5 | +import os | |
6 | +import random | |
7 | + | |
8 | +#from modules.data import load | |
9 | +#from modules.rnn.many_models import * | |
10 | +#from modules.metrics.accuracy import conlleval | |
11 | +from modules.utils.tools import load_stanford_data4 | |
12 | + | |
13 | +from theano import pp | |
14 | + | |
15 | +import theano.tensor as T | |
16 | +import theano | |
17 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
18 | + | |
19 | +import itertools | |
20 | + | |
21 | +import os.path | |
22 | +import pickle | |
23 | + | |
24 | +from collections import Counter | |
25 | + | |
26 | + | |
27 | + | |
28 | +from theano import tensor as T, printing | |
29 | +from collections import OrderedDict | |
30 | +from theano.ifelse import ifelse | |
31 | + | |
32 | +from keras.preprocessing import sequence as seq | |
33 | + | |
34 | +dataType = 'int64' | |
35 | + | |
36 | + | |
37 | + | |
38 | +# NOTE: "ne" is NOT what the docstrings below say - it is the dimension of the layer directly above the embedding | |
39 | + | |
40 | + | |
41 | +class model55_pf1(object): | |
42 | + def __init__(self, ne, nchd, nc, w2v_model_path, max_phrase_length): | |
43 | + ''' | |
44 | + nh :: dimension of hidden state | |
45 | + nc :: number of classes | |
46 | + ne :: number of word embeddings in the vocabulary | |
47 | + de :: dimension of the word embeddings | |
48 | + ds :: dimension of the sentiment state | |
49 | + ''' | |
50 | + self.max_phrase_length = max_phrase_length | |
51 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
52 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
53 | + self.words2ids = w2vecs["words2ids"] | |
54 | + | |
55 | + #ne = len(w2vecs["words2ids"]) | |
56 | + de = w2vecs["vectors"].shape[1] | |
57 | + del w2vecs | |
58 | + | |
59 | + r = 0.05 | |
60 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
61 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
62 | + | |
63 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
64 | + | |
65 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
66 | + | |
67 | + | |
68 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
69 | + | |
70 | + schh = hidden_states[-1] #+ 0.5 # i.e. the zero vector | |
71 | + | |
72 | + tmp = word_children_positions>=0.0 | |
73 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. positions that are not -1 | |
74 | + schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
75 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
76 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
77 | + schh = schh/number_of_children | |
78 | + | |
79 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
80 | + | |
81 | + current_hidden_state = hidden_states[i] | |
82 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
83 | + | |
84 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
85 | + | |
86 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
87 | + | |
88 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
89 | + | |
90 | + return cross_entropy, hidden_states_new | |
91 | + | |
92 | + | |
93 | + y = T.vector('y',dtype=dataType) | |
94 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
95 | + words = T.vector(dtype=dataType) | |
96 | + children_positions = T.matrix(dtype=dataType) | |
97 | + words_indexes = T.vector(dtype=dataType) | |
98 | + | |
99 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
100 | + sequences = [words, children_positions,y,words_indexes], | |
101 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
102 | + non_sequences = learning_rate, | |
103 | + n_steps = words.shape[0]) | |
104 | + cost = T.sum(cross_entropy_vector[0]) | |
105 | + | |
106 | + updates = OrderedDict([ | |
107 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
108 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
109 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
110 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
111 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
112 | + ]) | |
113 | + | |
114 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
115 | + outputs = [], | |
116 | + updates = updates, | |
117 | + allow_input_downcast=True, | |
118 | + mode='FAST_RUN' | |
119 | + ) | |
120 | + | |
121 | + | |
122 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
123 | + | |
124 | + schh = hidden_states[-1] #+ 0.5 # i.e. the zero vector | |
125 | + | |
126 | + tmp = word_children_positions>=0.0 | |
127 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. positions that are not -1 | |
128 | + schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
129 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
130 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
131 | + schh = schh/number_of_children | |
132 | + | |
133 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) ) | |
134 | + | |
135 | + current_hidden_state = hidden_states[i] | |
136 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
137 | + | |
138 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
139 | + | |
140 | + return y_prob, hidden_states_new | |
141 | + | |
142 | + | |
143 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
144 | + fn=one_step_classify, | |
145 | + sequences = [words, children_positions,words_indexes], | |
146 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
147 | + | |
148 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
149 | + sequences = [words_indexes]) | |
150 | + | |
151 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
152 | + outputs=predictions, | |
153 | + allow_input_downcast=True, | |
154 | + mode='FAST_RUN' | |
155 | + ) | |
156 | + | |
157 | + | |
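The model55_pf1 class above (and the model55_pf2 / model55_pf3 variants below) uses a simpler recursion over the dependency tree: a node's hidden state is the tanh of its projected embedding concatenated with a projection of the averaged hidden states of its children, and the class distribution is a softmax on top of that state. A minimal NumPy sketch of this node update with the same parameter names (the helper is mine, for illustration only):

import numpy as np

def mlp_tree_node(x, child_h, P):
    # x: (de,) word embedding; child_h: (n_children, ne+nchd) children's hidden
    # states, or a single zero row for a leaf; P maps parameter names to arrays.
    schh = child_h.sum(axis=0) / max(child_h.shape[0], 1)     # average of the children's hidden states
    h = np.tanh(np.concatenate([np.dot(x, P['W_e_h']),        # (ne,)   projected embedding
                                np.dot(schh, P['W_sh_h'])]))  # (nchd,) projected child summary
    logits = np.dot(h, P['W_h_y']) + P['b_y']
    y_prob = np.exp(logits - logits.max())                    # softmax over the nc classes
    return h, y_prob / y_prob.sum()

model55_pf2 differs only in inserting an extra tanh layer (W_h_h2) between h and the softmax; judging by its constructor arguments (nh2, nh3), model55_pf3 presumably stacks one more.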
158 | + | |
159 | +class model55_pf2(object): | |
160 | + def __init__(self, ne, nchd, nh2, nc, w2v_model_path, max_phrase_length): | |
161 | + ''' | |
162 | + nh :: dimension of hidden state | |
163 | + nc :: number of classes | |
164 | + ne :: number of word embeddings in the vocabulary | |
165 | + de :: dimension of the word embeddings | |
166 | + ds :: dimension of the sentiment state | |
167 | + ''' | |
168 | + self.max_phrase_length = max_phrase_length | |
169 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
170 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
171 | + self.words2ids = w2vecs["words2ids"] | |
172 | + | |
173 | + #ne = len(w2vecs["words2ids"]) | |
174 | + de = w2vecs["vectors"].shape[1] | |
175 | + del w2vecs | |
176 | + | |
177 | + r = 0.05 | |
178 | + | |
179 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
180 | + | |
181 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
182 | + | |
183 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
184 | + self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nc)).astype(theano.config.floatX)) | |
185 | + | |
186 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
187 | + | |
188 | + | |
189 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
190 | + | |
191 | + schh = hidden_states[-1] #+ 0.5 # i.e. the zero vector | |
192 | + | |
193 | + tmp = word_children_positions>=0.0 | |
194 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. positions that are not -1 | |
195 | + schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
196 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
197 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
198 | + schh = schh/number_of_children | |
199 | + | |
200 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
201 | + #h = T.nnet.sigmoid(T.dot(self.emb[word_id],self.W_eh) + T.dot(schh,self.W_shsh) + self.bh) | |
202 | + | |
203 | + current_hidden_state = hidden_states[i] | |
204 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
205 | + | |
206 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
207 | + | |
208 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
209 | + | |
210 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
211 | + | |
212 | + return cross_entropy, hidden_states_new | |
213 | + | |
214 | + | |
215 | + y = T.vector('y',dtype=dataType) | |
216 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
217 | + words = T.vector(dtype=dataType) | |
218 | + children_positions = T.matrix(dtype=dataType) | |
219 | + words_indexes = T.vector(dtype=dataType) | |
220 | + | |
221 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
222 | + sequences = [words, children_positions,y,words_indexes], | |
223 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
224 | + non_sequences = learning_rate, | |
225 | + n_steps = words.shape[0]) | |
226 | + cost = T.sum(cross_entropy_vector[0]) | |
227 | + | |
228 | + updates = OrderedDict([ | |
229 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
230 | + (self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cost, self.W_h2_y)), | |
231 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
232 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
233 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
234 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
235 | + ]) | |
236 | + | |
237 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
238 | + outputs = [], | |
239 | + updates = updates, | |
240 | + allow_input_downcast=True, | |
241 | + mode='FAST_RUN' | |
242 | + ) | |
243 | + | |
244 | + | |
245 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
246 | + | |
247 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
248 | + | |
249 | +            tmp = word_children_positions>=0.0 | |
250 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
251 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
252 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
253 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
254 | + schh = schh/number_of_children | |
255 | + | |
256 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
257 | + | |
258 | + current_hidden_state = hidden_states[i] | |
259 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
260 | + | |
261 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
262 | + | |
263 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
264 | + | |
265 | + return y_prob, hidden_states_new | |
266 | + | |
267 | + | |
268 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
269 | + fn=one_step_classify, | |
270 | + sequences = [words, children_positions,words_indexes], | |
271 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
272 | + | |
273 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
274 | + sequences = [words_indexes]) | |
275 | + | |
276 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
277 | + outputs=predictions, | |
278 | + allow_input_downcast=True, | |
279 | + mode='FAST_RUN' | |
280 | + ) | |
281 | + | |
282 | + | |
283 | + | |
284 | + | |
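# A minimal usage sketch for the model55_pf* classes above; the hyper-parameter
# values, the embedding pickle path and the toy three-token tree are illustrative
# assumptions, not values fixed by this file.
def _demo_model55_pf2():
    model = model55_pf2(ne=100, nchd=100, nh2=50, nc=3,
                        w2v_model_path="embeddings/embedding_and_words2ids_dim300_polish.pkl",
                        max_phrase_length=60)
    # One dependency tree: each row of children_positions lists a node's children,
    # padded with -1, and children always appear before their parent in scan order.
    words = [model.words2ids.get(w, 0) for w in [u"to", u"dobry", u"film"]]  # id 0 assumed as a fallback for unknown words
    children_positions = [[-1, -1],   # node 0: a leaf
                          [-1, -1],   # node 1: a leaf
                          [0, 1]]     # node 2 (the root) governs nodes 0 and 1
    labels = [1, 2, 2]                # one sentiment class per node
    words_indexes = [0, 1, 2]
    model.train(words, children_positions, labels, words_indexes, 0.002)
    return model.classify(words, children_positions, words_indexes)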
285 | +class model55_pf3(object): | |
286 | + def __init__(self, ne, nchd, nh2, nh3, nc, w2v_model_path, max_phrase_length): | |
287 | +        ''' | |
288 | +        ne :: size of the embedding projection in the node state | |
289 | +        nchd :: size of the projected average of the children's states (the node state has ne+nchd units) | |
290 | +        nh2, nh3 :: sizes of the two hidden layers between the node state and the softmax; nc :: number of sentiment classes | |
291 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
292 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
293 | +        ''' | |
294 | + self.max_phrase_length = max_phrase_length | |
295 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
296 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
297 | + self.words2ids = w2vecs["words2ids"] | |
298 | + | |
299 | + #ne = len(w2vecs["words2ids"]) | |
300 | + de = w2vecs["vectors"].shape[1] | |
301 | + del w2vecs | |
302 | + | |
303 | + r = 0.05 | |
304 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
305 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
306 | + | |
307 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
308 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
309 | + self.W_h3_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nc)).astype(theano.config.floatX)) | |
310 | + | |
311 | + | |
312 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
313 | + | |
314 | + | |
315 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
316 | + | |
317 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
318 | + | |
319 | +            tmp = word_children_positions>=0.0 | |
320 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
321 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
322 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
323 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
324 | + schh = schh/number_of_children | |
325 | + | |
326 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
327 | + #h = T.nnet.sigmoid(T.dot(self.emb[word_id],self.W_eh) + T.dot(schh,self.W_shsh) + self.bh) | |
328 | + | |
329 | + current_hidden_state = hidden_states[i] | |
330 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
331 | + | |
332 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
333 | + | |
334 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
335 | + | |
336 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
337 | + | |
338 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
339 | + | |
340 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
341 | + | |
342 | + return cross_entropy, hidden_states_new | |
343 | + | |
344 | + | |
345 | + y = T.vector('y',dtype=dataType) | |
346 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
347 | + words = T.vector(dtype=dataType) | |
348 | + children_positions = T.matrix(dtype=dataType) | |
349 | + words_indexes = T.vector(dtype=dataType) | |
350 | + | |
351 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
352 | + sequences = [words, children_positions,y,words_indexes], | |
353 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
354 | + non_sequences = learning_rate, | |
355 | + n_steps = words.shape[0]) | |
356 | + cost = T.sum(cross_entropy_vector[0]) | |
357 | + | |
358 | + updates = OrderedDict([ | |
359 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
360 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
361 | + (self.W_h3_y, self.W_h3_y-learning_rate*T.grad(cost, self.W_h3_y)), | |
362 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
363 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
364 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
365 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
366 | + ]) | |
367 | + | |
368 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
369 | + outputs = [], | |
370 | + updates = updates, | |
371 | + allow_input_downcast=True, | |
372 | + mode='FAST_RUN' | |
373 | + ) | |
374 | + | |
375 | + | |
376 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
377 | + | |
378 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
379 | + | |
380 | +            tmp = word_children_positions>=0.0 | |
381 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
382 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
383 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
384 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
385 | + schh = schh/number_of_children | |
386 | + | |
387 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) ) | |
388 | + | |
389 | + current_hidden_state = hidden_states[i] | |
390 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
391 | + | |
392 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
393 | + | |
394 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
395 | + | |
396 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
397 | + | |
398 | + return y_prob, hidden_states_new | |
399 | + | |
400 | + | |
401 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
402 | + fn=one_step_classify, | |
403 | + sequences = [words, children_positions,words_indexes], | |
404 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
405 | + | |
406 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
407 | + sequences = [words_indexes]) | |
408 | + | |
409 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
410 | + outputs=predictions, | |
411 | + allow_input_downcast=True, | |
412 | + mode='FAST_RUN' | |
413 | + ) | |
414 | + | |
415 | + | |
416 | + | |
417 | + | |
418 | + | |
419 | + | |
420 | +class model55_pf4(object): | |
421 | + def __init__(self, neh, ne, nchd, nc, w2v_model_path, max_phrase_length): | |
422 | +        ''' | |
423 | +        neh :: size of the intermediate projection of the word embedding (eh) | |
424 | +        ne :: size of the embedding part of the node state; nchd :: size of the children part (node state has ne+nchd units) | |
425 | +        nc :: number of sentiment classes | |
426 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
427 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
428 | +        ''' | |
429 | + self.max_phrase_length = max_phrase_length | |
430 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
431 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
432 | + self.words2ids = w2vecs["words2ids"] | |
433 | + | |
434 | + de = w2vecs["vectors"].shape[1] | |
435 | + del w2vecs | |
436 | + | |
437 | + r = 0.05 | |
438 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
439 | + | |
440 | + self.W_eh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, ne)).astype(theano.config.floatX)) | |
441 | + | |
442 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
443 | + | |
444 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
445 | + | |
446 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
447 | + | |
448 | + | |
449 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
450 | + | |
451 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
452 | + | |
453 | +            tmp = word_children_positions>=0.0 | |
454 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
455 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
456 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
457 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
458 | + schh = schh/number_of_children | |
459 | + | |
460 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
461 | + | |
462 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(schh,self.W_sh_h)])) | |
463 | + | |
464 | + current_hidden_state = hidden_states[i] | |
465 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
466 | + | |
467 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
468 | + | |
469 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
470 | + | |
471 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
472 | + | |
473 | + return cross_entropy, hidden_states_new | |
474 | + | |
475 | + | |
476 | + y = T.vector('y',dtype=dataType) | |
477 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
478 | + words = T.vector(dtype=dataType) | |
479 | + children_positions = T.matrix(dtype=dataType) | |
480 | + words_indexes = T.vector(dtype=dataType) | |
481 | + | |
482 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
483 | + sequences = [words, children_positions,y,words_indexes], | |
484 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
485 | + non_sequences = learning_rate, | |
486 | + n_steps = words.shape[0]) | |
487 | + cost = T.sum(cross_entropy_vector[0]) | |
488 | + | |
489 | + updates = OrderedDict([ | |
490 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
491 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
492 | + (self.W_eh_h, self.W_eh_h-learning_rate*T.grad(cost, self.W_eh_h)), | |
493 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
494 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
495 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
496 | + ]) | |
497 | + | |
498 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
499 | + outputs = [], | |
500 | + updates = updates, | |
501 | + allow_input_downcast=True, | |
502 | + mode='FAST_RUN' | |
503 | + ) | |
504 | + | |
505 | + | |
506 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
507 | + | |
508 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
509 | + | |
510 | +            tmp = word_children_positions>=0.0 | |
511 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
512 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
513 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
514 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
515 | + schh = schh/number_of_children | |
516 | + | |
517 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
518 | + | |
519 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(schh,self.W_sh_h)])) | |
520 | + | |
521 | + current_hidden_state = hidden_states[i] | |
522 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
523 | + | |
524 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
525 | + | |
526 | + return y_prob, hidden_states_new | |
527 | + | |
528 | + | |
529 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
530 | + fn=one_step_classify, | |
531 | + sequences = [words, children_positions,words_indexes], | |
532 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
533 | + | |
534 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
535 | + sequences = [words_indexes]) | |
536 | + | |
537 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
538 | + outputs=predictions, | |
539 | + allow_input_downcast=True, | |
540 | + mode='FAST_RUN' | |
541 | + ) | |
542 | + | |
543 | + | |
544 | + | |
545 | + | |
546 | + | |
547 | + | |
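# NumPy sketch (both arguments are NumPy arrays) of the child-state averaging that
# every one_step / one_step_classify above repeats: hidden_states carries one extra
# all-zero row at the end, which hidden_states[-1] selects so that leaves fall back
# to the zero vector.
def _average_children_states(hidden_states, word_children_positions):
    schh = hidden_states[-1].copy()        # start from the zero vector
    real = word_children_positions >= 0    # padding entries are -1
    schh += hidden_states[word_children_positions[real]].sum(axis=0)
    return schh / max(real.sum(), 1)       # mean over children, or zeros for a leaf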
548 | +class model55_pf5(object): | |
549 | + def __init__(self, ne, nshh, nchd, nc, w2v_model_path, max_phrase_length): | |
550 | +        ''' | |
551 | +        ne :: size of the embedding projection in the node state | |
552 | +        nshh :: size of the intermediate projection of the children average (shh) | |
553 | +        nchd :: size of the children part of the node state (node state has ne+nchd units); nc :: number of sentiment classes | |
554 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
555 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
556 | +        ''' | |
557 | + self.max_phrase_length = max_phrase_length | |
558 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
559 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
560 | + self.words2ids = w2vecs["words2ids"] | |
561 | + | |
562 | + #ne = len(w2vecs["words2ids"]) | |
563 | + de = w2vecs["vectors"].shape[1] | |
564 | + del w2vecs | |
565 | + | |
566 | + r = 0.05 | |
567 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
568 | + | |
569 | + | |
570 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
571 | + | |
572 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
573 | + | |
574 | + self.W_shh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nchd)).astype(theano.config.floatX)) | |
575 | + | |
576 | + | |
577 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
578 | + | |
579 | + | |
580 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
581 | + | |
582 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
583 | + | |
584 | +            tmp = word_children_positions>=0.0 | |
585 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
586 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
587 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
588 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
589 | + schh = schh/number_of_children | |
590 | + | |
591 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
592 | + | |
593 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id], self.W_e_h), T.dot(shh,self.W_shh_h)])) | |
594 | + | |
595 | + current_hidden_state = hidden_states[i] | |
596 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
597 | + | |
598 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
599 | + | |
600 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
601 | + | |
602 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
603 | + | |
604 | + return cross_entropy, hidden_states_new | |
605 | + | |
606 | + | |
607 | + y = T.vector('y',dtype=dataType) | |
608 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
609 | + words = T.vector(dtype=dataType) | |
610 | + children_positions = T.matrix(dtype=dataType) | |
611 | + words_indexes = T.vector(dtype=dataType) | |
612 | + | |
613 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
614 | + sequences = [words, children_positions,y,words_indexes], | |
615 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
616 | + non_sequences = learning_rate, | |
617 | + n_steps = words.shape[0]) | |
618 | + cost = T.sum(cross_entropy_vector[0]) | |
619 | + | |
620 | + updates = OrderedDict([ | |
621 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
622 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
623 | + (self.W_shh_h, self.W_shh_h-learning_rate*T.grad(cost, self.W_shh_h)), | |
624 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
625 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
626 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
627 | + ]) | |
628 | + | |
629 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
630 | + outputs = [], | |
631 | + updates = updates, | |
632 | + allow_input_downcast=True, | |
633 | + mode='FAST_RUN' | |
634 | + ) | |
635 | + | |
636 | + | |
637 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
638 | + | |
639 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
640 | + | |
641 | +            tmp = word_children_positions>=0.0 | |
642 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
643 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
644 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
645 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
646 | + schh = schh/number_of_children | |
647 | + | |
648 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
649 | + | |
650 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id], self.W_e_h), T.dot(shh,self.W_shh_h)])) | |
651 | + | |
652 | + | |
653 | + current_hidden_state = hidden_states[i] | |
654 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
655 | + | |
656 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
657 | + | |
658 | + return y_prob, hidden_states_new | |
659 | + | |
660 | + | |
661 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
662 | + fn=one_step_classify, | |
663 | + sequences = [words, children_positions,words_indexes], | |
664 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
665 | + | |
666 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
667 | + sequences = [words_indexes]) | |
668 | + | |
669 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
670 | + outputs=predictions, | |
671 | + allow_input_downcast=True, | |
672 | + mode='FAST_RUN' | |
673 | + ) | |
674 | + | |
675 | + | |
676 | + | |
677 | + | |
678 | +class model55_pf6(object): | |
679 | + def __init__(self, neh, ne, nshh, nchd, nc, w2v_model_path, max_phrase_length): | |
680 | +        ''' | |
681 | +        neh :: size of the intermediate projection of the word embedding (eh); ne :: embedding part of the node state | |
682 | +        nshh :: size of the intermediate projection of the children average (shh) | |
683 | +        nchd :: size of the children part of the node state (node state has ne+nchd units); nc :: number of sentiment classes | |
684 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
685 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
686 | +        ''' | |
687 | + self.max_phrase_length = max_phrase_length | |
688 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
689 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
690 | + self.words2ids = w2vecs["words2ids"] | |
691 | + | |
692 | + #ne = len(w2vecs["words2ids"]) | |
693 | + de = w2vecs["vectors"].shape[1] | |
694 | + del w2vecs | |
695 | + | |
696 | + r = 0.05 | |
697 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
698 | + | |
699 | + self.W_eh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, ne)).astype(theano.config.floatX)) | |
700 | + | |
701 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
702 | + | |
703 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
704 | + | |
705 | + self.W_shh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nchd)).astype(theano.config.floatX)) | |
706 | + | |
707 | + | |
708 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
709 | + | |
710 | + | |
711 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
712 | + | |
713 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
714 | + | |
715 | +            tmp = word_children_positions>=0.0 | |
716 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
717 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
718 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
719 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
720 | + schh = schh/number_of_children | |
721 | + | |
722 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
723 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
724 | + | |
725 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
726 | + | |
727 | + current_hidden_state = hidden_states[i] | |
728 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
729 | + | |
730 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
731 | + | |
732 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
733 | + | |
734 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
735 | + | |
736 | + return cross_entropy, hidden_states_new | |
737 | + | |
738 | + | |
739 | + y = T.vector('y',dtype=dataType) | |
740 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
741 | + words = T.vector(dtype=dataType) | |
742 | + children_positions = T.matrix(dtype=dataType) | |
743 | + words_indexes = T.vector(dtype=dataType) | |
744 | + | |
745 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
746 | + sequences = [words, children_positions,y,words_indexes], | |
747 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
748 | + non_sequences = learning_rate, | |
749 | + n_steps = words.shape[0]) | |
750 | + cost = T.sum(cross_entropy_vector[0]) | |
751 | + | |
752 | + updates = OrderedDict([ | |
753 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
754 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
755 | + (self.W_eh_h, self.W_eh_h-learning_rate*T.grad(cost, self.W_eh_h)), | |
756 | + (self.W_shh_h, self.W_shh_h-learning_rate*T.grad(cost, self.W_shh_h)), | |
757 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
758 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
759 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
760 | + ]) | |
761 | + | |
762 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
763 | + outputs = [], | |
764 | + updates = updates, | |
765 | + allow_input_downcast=True, | |
766 | + mode='FAST_RUN' | |
767 | + ) | |
768 | + | |
769 | + | |
770 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
771 | + | |
772 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
773 | + | |
774 | +            tmp = word_children_positions>=0.0 | |
775 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
776 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
777 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
778 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
779 | + schh = schh/number_of_children | |
780 | + | |
781 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
782 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
783 | + | |
784 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
785 | + | |
786 | + | |
787 | + current_hidden_state = hidden_states[i] | |
788 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
789 | + | |
790 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
791 | + | |
792 | + return y_prob, hidden_states_new | |
793 | + | |
794 | + | |
795 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
796 | + fn=one_step_classify, | |
797 | + sequences = [words, children_positions,words_indexes], | |
798 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
799 | + | |
800 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
801 | + sequences = [words_indexes]) | |
802 | + | |
803 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
804 | + outputs=predictions, | |
805 | + allow_input_downcast=True, | |
806 | + mode='FAST_RUN' | |
807 | + ) | |
808 | + | |
809 | + | |
810 | + | |
811 | + | |
812 | + | |
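# Every updates OrderedDict above spells out plain per-tree SGD,
# param <- param - lr * d(cost)/d(param). An equivalent helper (a sketch over the
# same Theano graph variables) could build those pairs from a parameter list:
def _sgd_updates(cost, params, learning_rate):
    return OrderedDict((p, p - learning_rate * T.grad(cost, p)) for p in params)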
813 | +class model55_pf7(object): | |
814 | + def __init__(self, neh, ne, nshh, nchd, nh2, nc, w2v_model_path, max_phrase_length): | |
815 | +        ''' | |
816 | +        neh :: size of the intermediate projection of the word embedding (eh); ne :: embedding part of the node state | |
817 | +        nshh :: size of the intermediate projection of the children average (shh); nchd :: children part of the node state | |
818 | +        nh2 :: size of the hidden layer between the node state and the softmax; nc :: number of sentiment classes | |
819 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
820 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
821 | +        ''' | |
822 | + self.max_phrase_length = max_phrase_length | |
823 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
824 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
825 | + self.words2ids = w2vecs["words2ids"] | |
826 | + | |
827 | + #ne = len(w2vecs["words2ids"]) | |
828 | + de = w2vecs["vectors"].shape[1] | |
829 | + del w2vecs | |
830 | + | |
831 | + r = 0.05 | |
832 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
833 | + | |
834 | + self.W_eh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, ne)).astype(theano.config.floatX)) | |
835 | + | |
836 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
837 | + self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nc)).astype(theano.config.floatX)) | |
838 | + | |
839 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
840 | + | |
841 | + self.W_shh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nchd)).astype(theano.config.floatX)) | |
842 | + | |
843 | + | |
844 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
845 | + | |
846 | + | |
847 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
848 | + | |
849 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
850 | + | |
851 | +            tmp = word_children_positions>=0.0 | |
852 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
853 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
854 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
855 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
856 | + schh = schh/number_of_children | |
857 | + | |
858 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
859 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
860 | + | |
861 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
862 | + | |
863 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
864 | + | |
865 | + current_hidden_state = hidden_states[i] | |
866 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
867 | + | |
868 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
869 | + | |
870 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
871 | + | |
872 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
873 | + | |
874 | + return cross_entropy, hidden_states_new | |
875 | + | |
876 | + | |
877 | + y = T.vector('y',dtype=dataType) | |
878 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
879 | + words = T.vector(dtype=dataType) | |
880 | + children_positions = T.matrix(dtype=dataType) | |
881 | + words_indexes = T.vector(dtype=dataType) | |
882 | + | |
883 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
884 | + sequences = [words, children_positions,y,words_indexes], | |
885 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
886 | + non_sequences = learning_rate, | |
887 | + n_steps = words.shape[0]) | |
888 | + | |
889 | + cost = T.sum(cross_entropy_vector[0]) | |
890 | + | |
891 | + updates = OrderedDict([ | |
892 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
893 | + (self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cost, self.W_h2_y)), | |
894 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
895 | + (self.W_eh_h, self.W_eh_h-learning_rate*T.grad(cost, self.W_eh_h)), | |
896 | + (self.W_shh_h, self.W_shh_h-learning_rate*T.grad(cost, self.W_shh_h)), | |
897 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
898 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
899 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
900 | + ]) | |
901 | + | |
902 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
903 | + outputs = [], | |
904 | + updates = updates, | |
905 | + allow_input_downcast=True, | |
906 | + mode='FAST_RUN' | |
907 | + ) | |
908 | + | |
909 | + | |
910 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
911 | + | |
912 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
913 | + | |
914 | +            tmp = word_children_positions>=0.0 | |
915 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
916 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
917 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
918 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
919 | + schh = schh/number_of_children | |
920 | + | |
921 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
922 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
923 | + | |
924 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
925 | + | |
926 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
927 | + | |
928 | + current_hidden_state = hidden_states[i] | |
929 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
930 | + | |
931 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
932 | + | |
933 | + return y_prob, hidden_states_new | |
934 | + | |
935 | + | |
936 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
937 | + fn=one_step_classify, | |
938 | + sequences = [words, children_positions,words_indexes], | |
939 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
940 | + | |
941 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
942 | + sequences = [words_indexes]) | |
943 | + | |
944 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
945 | + outputs=predictions, | |
946 | + allow_input_downcast=True, | |
947 | + mode='FAST_RUN' | |
948 | + ) | |
949 | + | |
950 | + | |
951 | + | |
952 | + | |
953 | + | |
954 | +class model55_pf8(object): | |
955 | + def __init__(self, neh, ne, nshh, nchd, nh2, nh3, nc, w2v_model_path, max_phrase_length): | |
956 | +        ''' | |
957 | +        neh :: size of the intermediate projection of the word embedding (eh); ne :: embedding part of the node state | |
958 | +        nshh :: size of the intermediate projection of the children average (shh); nchd :: children part of the node state | |
959 | +        nh2, nh3 :: sizes of the two hidden layers between the node state and the softmax; nc :: number of sentiment classes | |
960 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
961 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
962 | +        ''' | |
963 | + self.max_phrase_length = max_phrase_length | |
964 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
965 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
966 | + self.words2ids = w2vecs["words2ids"] | |
967 | + | |
968 | + #ne = len(w2vecs["words2ids"]) | |
969 | + de = w2vecs["vectors"].shape[1] | |
970 | + del w2vecs | |
971 | + | |
972 | + r = 0.05 | |
973 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
974 | + | |
975 | + self.W_eh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, ne)).astype(theano.config.floatX)) | |
976 | + | |
977 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
978 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
979 | + self.W_h3_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nc)).astype(theano.config.floatX)) | |
980 | + | |
981 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
982 | + | |
983 | + self.W_shh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nchd)).astype(theano.config.floatX)) | |
984 | + | |
985 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
986 | + | |
987 | + | |
988 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
989 | + | |
990 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
991 | + | |
992 | +            tmp = word_children_positions>=0.0 | |
993 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
994 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
995 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
996 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
997 | + schh = schh/number_of_children | |
998 | + | |
999 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1000 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1001 | + | |
1002 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
1003 | + | |
1004 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1005 | + | |
1006 | + | |
1007 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1008 | + | |
1009 | + current_hidden_state = hidden_states[i] | |
1010 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1011 | + | |
1012 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1013 | + | |
1014 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1015 | + | |
1016 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1017 | + | |
1018 | + return cross_entropy, hidden_states_new | |
1019 | + | |
1020 | + | |
1021 | + y = T.vector('y',dtype=dataType) | |
1022 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1023 | + words = T.vector(dtype=dataType) | |
1024 | + children_positions = T.matrix(dtype=dataType) | |
1025 | + words_indexes = T.vector(dtype=dataType) | |
1026 | + | |
1027 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1028 | + sequences = [words, children_positions,y,words_indexes], | |
1029 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
1030 | + non_sequences = learning_rate, | |
1031 | + n_steps = words.shape[0]) | |
1032 | + | |
1033 | + cost = T.sum(cross_entropy_vector[0]) | |
1034 | + | |
1035 | + updates = OrderedDict([ | |
1036 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1037 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
1038 | + (self.W_h3_y, self.W_h3_y-learning_rate*T.grad(cost, self.W_h3_y)), | |
1039 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
1040 | + (self.W_eh_h, self.W_eh_h-learning_rate*T.grad(cost, self.W_eh_h)), | |
1041 | + (self.W_shh_h, self.W_shh_h-learning_rate*T.grad(cost, self.W_shh_h)), | |
1042 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
1043 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1044 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1045 | + ]) | |
1046 | + | |
1047 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1048 | + outputs = [], | |
1049 | + updates = updates, | |
1050 | + allow_input_downcast=True, | |
1051 | + mode='FAST_RUN' | |
1052 | + ) | |
1053 | + | |
1054 | + | |
1055 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1056 | + | |
1057 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
1058 | + | |
1059 | +            tmp = word_children_positions>=0.0 | |
1060 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
1061 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1062 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1063 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1064 | + schh = schh/number_of_children | |
1065 | + | |
1066 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1067 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1068 | + | |
1069 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
1070 | + | |
1071 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1072 | + | |
1073 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1074 | + | |
1075 | + current_hidden_state = hidden_states[i] | |
1076 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1077 | + | |
1078 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1079 | + | |
1080 | + return y_prob, hidden_states_new | |
1081 | + | |
1082 | + | |
1083 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1084 | + fn=one_step_classify, | |
1085 | + sequences = [words, children_positions,words_indexes], | |
1086 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
1087 | + | |
1088 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1089 | + sequences = [words_indexes]) | |
1090 | + | |
1091 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1092 | + outputs=predictions, | |
1093 | + allow_input_downcast=True, | |
1094 | + mode='FAST_RUN' | |
1095 | + ) | |
1096 | + | |
1097 | + | |
1098 | +class model55_pf9(object): | |
1099 | + def __init__(self, ne, nchd, nh2, nh3, nc, w2v_model_path, max_phrase_length): | |
1100 | +        ''' | |
1101 | +        ne :: size of the embedding projection in the node state | |
1102 | +        nchd :: size of the projected average of the children's states (the node state has ne+nchd units) | |
1103 | +        nh2, nh3 :: sizes of the two hidden layers between the node state and the softmax; nc :: number of sentiment classes | |
1104 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
1105 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
1106 | +        ''' | |
1107 | + self.max_phrase_length = max_phrase_length | |
1108 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1109 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1110 | + self.words2ids = w2vecs["words2ids"] | |
1111 | + | |
1112 | + #ne = len(w2vecs["words2ids"]) | |
1113 | + de = w2vecs["vectors"].shape[1] | |
1114 | + del w2vecs | |
1115 | + | |
1116 | + r = 0.05 | |
1117 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
1118 | + | |
1119 | + | |
1120 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
1121 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
1122 | + self.W_h3_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nc)).astype(theano.config.floatX)) | |
1123 | + | |
1124 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
1125 | + | |
1126 | + | |
1127 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1128 | + | |
1129 | + | |
1130 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1131 | + | |
1132 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
1133 | + | |
1134 | +            tmp = word_children_positions>=0.0 | |
1135 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
1136 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1137 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1138 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1139 | + schh = schh/number_of_children | |
1140 | + | |
1141 | + | |
1142 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
1143 | + | |
1144 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1145 | + | |
1146 | + | |
1147 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1148 | + | |
1149 | + current_hidden_state = hidden_states[i] | |
1150 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1151 | + | |
1152 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1153 | + | |
1154 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1155 | + | |
1156 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1157 | + | |
1158 | + return cross_entropy, hidden_states_new | |
1159 | + | |
1160 | + | |
1161 | + y = T.vector('y',dtype=dataType) | |
1162 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1163 | + words = T.vector(dtype=dataType) | |
1164 | + children_positions = T.matrix(dtype=dataType) | |
1165 | + words_indexes = T.vector(dtype=dataType) | |
1166 | + | |
1167 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1168 | + sequences = [words, children_positions,y,words_indexes], | |
1169 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
1170 | + non_sequences = learning_rate, | |
1171 | + n_steps = words.shape[0]) | |
1172 | + | |
1173 | + cost = T.sum(cross_entropy_vector[0]) | |
1174 | + | |
1175 | + updates = OrderedDict([ | |
1176 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1177 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
1178 | + (self.W_h3_y, self.W_h3_y-learning_rate*T.grad(cost, self.W_h3_y)), | |
1179 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
1180 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
1181 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1182 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1183 | + ]) | |
1184 | + | |
1185 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1186 | + outputs = [], | |
1187 | + updates = updates, | |
1188 | + allow_input_downcast=True, | |
1189 | + mode='FAST_RUN' | |
1190 | + ) | |
1191 | + | |
1192 | + | |
1193 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1194 | + | |
1195 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
1196 | + | |
1197 | +            tmp = word_children_positions>=0.0 | |
1198 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
1199 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1200 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1201 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1202 | + schh = schh/number_of_children | |
1203 | + | |
1204 | + | |
1205 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
1206 | + | |
1207 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1208 | + | |
1209 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1210 | + | |
1211 | + current_hidden_state = hidden_states[i] | |
1212 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1213 | + | |
1214 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1215 | + | |
1216 | + return y_prob, hidden_states_new | |
1217 | + | |
1218 | + | |
1219 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1220 | + fn=one_step_classify, | |
1221 | + sequences = [words, children_positions,words_indexes], | |
1222 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
1223 | + | |
1224 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1225 | + sequences = [words_indexes]) | |
1226 | + | |
1227 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1228 | + outputs=predictions, | |
1229 | + allow_input_downcast=True, | |
1230 | + mode='FAST_RUN' | |
1231 | + ) | |
1232 | + | |
1233 | + | |
1234 | + | |
1235 | +class model55_pf10(object): | |
1236 | + def __init__(self, nchd, nh2, nc, w2v_model_path, max_phrase_length): | |
1237 | +        ''' | |
1238 | +        nchd :: size of the projected children average (the node state is the raw word embedding concatenated with it) | |
1239 | +        nh2 :: size of the hidden layer between the node state and the softmax | |
1240 | +        nc :: number of sentiment classes | |
1241 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
1242 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
1243 | +        ''' | |
1244 | + self.max_phrase_length = max_phrase_length | |
1245 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1246 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1247 | + self.words2ids = w2vecs["words2ids"] | |
1248 | + | |
1249 | + #ne = len(w2vecs["words2ids"]) | |
1250 | + de = w2vecs["vectors"].shape[1] | |
1251 | + del w2vecs | |
1252 | + | |
1253 | + r = 0.05 | |
1254 | + | |
1255 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nh2)).astype(theano.config.floatX)) | |
1256 | + self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nc)).astype(theano.config.floatX)) | |
1257 | + | |
1258 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nchd)).astype(theano.config.floatX)) | |
1259 | + | |
1260 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1261 | + | |
1262 | + | |
1263 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1264 | + | |
1265 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
1266 | + | |
1267 | +            tmp = word_children_positions>=0.0 | |
1268 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
1269 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1270 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1271 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1272 | + schh = schh/number_of_children | |
1273 | + | |
1274 | + | |
1275 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
1276 | + | |
1277 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1278 | + | |
1279 | + | |
1280 | + current_hidden_state = hidden_states[i] | |
1281 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1282 | + | |
1283 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
1284 | + | |
1285 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1286 | + | |
1287 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1288 | + | |
1289 | + return cross_entropy, hidden_states_new | |
1290 | + | |
1291 | + | |
1292 | + y = T.vector('y',dtype=dataType) | |
1293 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1294 | + words = T.vector(dtype=dataType) | |
1295 | + children_positions = T.matrix(dtype=dataType) | |
1296 | + words_indexes = T.vector(dtype=dataType) | |
1297 | + | |
1298 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1299 | + sequences = [words, children_positions,y,words_indexes], | |
1300 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))], | |
1301 | + non_sequences = learning_rate, | |
1302 | + n_steps = words.shape[0]) | |
1303 | + | |
1304 | + cost = T.sum(cross_entropy_vector[0]) | |
1305 | + | |
1306 | + updates = OrderedDict([ | |
1307 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1308 | + (self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cost, self.W_h2_y)), | |
1309 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
1310 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1311 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1312 | + ]) | |
1313 | + | |
1314 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1315 | + outputs = [], | |
1316 | + updates = updates, | |
1317 | + allow_input_downcast=True, | |
1318 | + mode='FAST_RUN' | |
1319 | + ) | |
1320 | + | |
1321 | + | |
1322 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1323 | + | |
1324 | +            schh = hidden_states[-1] #+ 0.5  # i.e. the zero vector | |
1325 | + | |
1326 | +            tmp = word_children_positions>=0.0 | |
1327 | +            idx_tmp = tmp.nonzero() # indices of the actual children, i.e. entries that are not -1 | |
1328 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1329 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1330 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1331 | + schh = schh/number_of_children | |
1332 | + | |
1333 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
1334 | + | |
1335 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1336 | + | |
1337 | + | |
1338 | + current_hidden_state = hidden_states[i] | |
1339 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1340 | + | |
1341 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
1342 | + | |
1343 | + return y_prob, hidden_states_new | |
1344 | + | |
1345 | + | |
1346 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1347 | + fn=one_step_classify, | |
1348 | + sequences = [words, children_positions,words_indexes], | |
1349 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))]) | |
1350 | + | |
1351 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1352 | + sequences = [words_indexes]) | |
1353 | + | |
1354 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1355 | + outputs=predictions, | |
1356 | + allow_input_downcast=True, | |
1357 | + mode='FAST_RUN' | |
1358 | + ) | |
1359 | + | |
1360 | + | |
1361 | + | |
1362 | + | |
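# The classify functions return one label per node. If a single sentence-level label
# is needed, one natural choice (an assumption, not something this file prescribes)
# is the prediction at the root, which is the last node processed when children
# precede their parents:
def _root_prediction(model, words, children_positions, words_indexes):
    per_node = model.classify(words, children_positions, words_indexes)
    return per_node[-1]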
1363 | +class model55_pf11(object): | |
1364 | + def __init__(self, nchd, nh2, nh3, nc, w2v_model_path, max_phrase_length): | |
1365 | +        ''' | |
1366 | +        nchd :: size of the projected children average (the node state is the raw word embedding concatenated with it) | |
1367 | +        nh2, nh3 :: sizes of the two hidden layers between the node state and the softmax | |
1368 | +        nc :: number of sentiment classes | |
1369 | +        w2v_model_path :: path to the pickled word embeddings and word-to-id mapping | |
1370 | +        max_phrase_length :: maximum number of nodes in a single dependency tree | |
1371 | +        ''' | |
1372 | + self.max_phrase_length = max_phrase_length | |
1373 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1374 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1375 | + self.words2ids = w2vecs["words2ids"] | |
1376 | + | |
1377 | + #ne = len(w2vecs["words2ids"]) | |
1378 | + de = w2vecs["vectors"].shape[1] | |
1379 | + del w2vecs | |
1380 | + | |
1381 | + r = 0.05 | |
1382 | + | |
1383 | + | |
1384 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nh2)).astype(theano.config.floatX)) | |
1385 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
1386 | + self.W_h3_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nc)).astype(theano.config.floatX)) | |
1387 | + | |
1388 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nchd)).astype(theano.config.floatX)) | |
1389 | + | |
1390 | + | |
1391 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1392 | + | |
1393 | + | |
1394 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1395 | + | |
1396 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1397 | + | |
1398 | +            tmp = word_children_positions>=0.0 | |
1399 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1400 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1401 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1402 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1403 | + schh = schh/number_of_children | |
1404 | + | |
1405 | + | |
1406 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
1407 | + | |
1408 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1409 | + | |
1410 | + | |
1411 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1412 | + | |
1413 | + current_hidden_state = hidden_states[i] | |
1414 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1415 | + | |
1416 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1417 | + | |
1418 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1419 | + | |
1420 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1421 | + | |
1422 | + return cross_entropy, hidden_states_new | |
1423 | + | |
1424 | + | |
1425 | + y = T.vector('y',dtype=dataType) | |
1426 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1427 | + words = T.vector(dtype=dataType) | |
1428 | + children_positions = T.matrix(dtype=dataType) | |
1429 | + words_indexes = T.vector(dtype=dataType) | |
1430 | + | |
1431 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1432 | + sequences = [words, children_positions,y,words_indexes], | |
1433 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))], | |
1434 | + non_sequences = learning_rate, | |
1435 | + n_steps = words.shape[0]) | |
1436 | + | |
1437 | + cost = T.sum(cross_entropy_vector[0]) | |
1438 | + | |
1439 | + updates = OrderedDict([ | |
1440 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1441 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
1442 | + (self.W_h3_y, self.W_h3_y-learning_rate*T.grad(cost, self.W_h3_y)), | |
1443 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
1444 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1445 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1446 | + ]) | |
1447 | + | |
1448 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1449 | + outputs = [], | |
1450 | + updates = updates, | |
1451 | + allow_input_downcast=True, | |
1452 | + mode='FAST_RUN' | |
1453 | + ) | |
1454 | + | |
1455 | + | |
1456 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1457 | + | |
1458 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1459 | + | |
1460 | +            tmp = word_children_positions>=0.0 | |
1461 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1462 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1463 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1464 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1465 | + schh = schh/number_of_children | |
1466 | + | |
1467 | + | |
1468 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
1469 | + | |
1470 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1471 | + | |
1472 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1473 | + | |
1474 | + current_hidden_state = hidden_states[i] | |
1475 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1476 | + | |
1477 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
1478 | + | |
1479 | + return y_prob, hidden_states_new | |
1480 | + | |
1481 | + | |
1482 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1483 | + fn=one_step_classify, | |
1484 | + sequences = [words, children_positions,words_indexes], | |
1485 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))]) | |
1486 | + | |
1487 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1488 | + sequences = [words_indexes]) | |
1489 | + | |
1490 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1491 | + outputs=predictions, | |
1492 | + allow_input_downcast=True, | |
1493 | + mode='FAST_RUN' | |
1494 | + ) | |
1495 | + | |
1496 | + | |
1497 | + | |
1498 | + | |
1499 | + | |
1500 | +class model55_pf12(object): | |
1501 | + def __init__(self, neh, ne, nshh, nchd, nh2, nh3, nh4, nc, w2v_model_path, max_phrase_length): | |
1502 | +        ''' | |
1503 | +        neh, ne :: hidden and output sizes of the word-embedding projection | |
1504 | +        nshh, nchd :: hidden and output sizes of the children-state projection | |
1505 | +        nh2, nh3, nh4 :: dimensions of the classifier hidden layers | |
1506 | +        nc :: number of classes | |
1507 | +        w2v_model_path, max_phrase_length :: pickled embeddings; maximum nodes per sentence | |
1508 | +        ''' | |
1509 | + self.max_phrase_length = max_phrase_length | |
1510 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1511 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1512 | + self.words2ids = w2vecs["words2ids"] | |
1513 | + | |
1514 | + #ne = len(w2vecs["words2ids"]) | |
1515 | + de = w2vecs["vectors"].shape[1] | |
1516 | + del w2vecs | |
1517 | + | |
1518 | + r = 0.05 | |
1519 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
1520 | + | |
1521 | + self.W_eh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, ne)).astype(theano.config.floatX)) | |
1522 | + | |
1523 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
1524 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
1525 | + self.W_h3_h4 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nh4)).astype(theano.config.floatX)) | |
1526 | + self.W_h4_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh4, nc)).astype(theano.config.floatX)) | |
1527 | + | |
1528 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
1529 | + | |
1530 | + self.W_shh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nchd)).astype(theano.config.floatX)) | |
1531 | + | |
1532 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1533 | + | |
1534 | + | |
1535 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1536 | + | |
1537 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1538 | + | |
1539 | +            tmp = word_children_positions>=0.0 | |
1540 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1541 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1542 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1543 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1544 | + schh = schh/number_of_children | |
1545 | + | |
1546 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1547 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1548 | + | |
1549 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
1550 | + | |
1551 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1552 | + | |
1553 | + | |
1554 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1555 | + h4 = T.tanh(T.dot(h3, self.W_h3_h4)) | |
1556 | + | |
1557 | + current_hidden_state = hidden_states[i] | |
1558 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1559 | + | |
1560 | + y_prob = T.nnet.softmax(T.dot(h4,self.W_h4_y) + self.b_y)[0] | |
1561 | + | |
1562 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1563 | + | |
1564 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1565 | + | |
1566 | + return cross_entropy, hidden_states_new | |
1567 | + | |
1568 | + | |
1569 | + y = T.vector('y',dtype=dataType) | |
1570 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1571 | + words = T.vector(dtype=dataType) | |
1572 | + children_positions = T.matrix(dtype=dataType) | |
1573 | + words_indexes = T.vector(dtype=dataType) | |
1574 | + | |
1575 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1576 | + sequences = [words, children_positions,y,words_indexes], | |
1577 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
1578 | + non_sequences = learning_rate, | |
1579 | + n_steps = words.shape[0]) | |
1580 | + | |
1581 | + cost = T.sum(cross_entropy_vector[0]) | |
1582 | + | |
1583 | + updates = OrderedDict([ | |
1584 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1585 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
1586 | + (self.W_h3_h4, self.W_h3_h4-learning_rate*T.grad(cost, self.W_h3_h4)), | |
1587 | + (self.W_h4_y, self.W_h4_y-learning_rate*T.grad(cost, self.W_h4_y)), | |
1588 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
1589 | + (self.W_eh_h, self.W_eh_h-learning_rate*T.grad(cost, self.W_eh_h)), | |
1590 | + (self.W_shh_h, self.W_shh_h-learning_rate*T.grad(cost, self.W_shh_h)), | |
1591 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
1592 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1593 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1594 | + ]) | |
1595 | + | |
1596 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1597 | + outputs = [], | |
1598 | + updates = updates, | |
1599 | + allow_input_downcast=True, | |
1600 | + mode='FAST_RUN' | |
1601 | + ) | |
1602 | + | |
1603 | + | |
1604 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1605 | + | |
1606 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1607 | + | |
1608 | +            tmp = word_children_positions>=0.0 | |
1609 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1610 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1611 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1612 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1613 | + schh = schh/number_of_children | |
1614 | + | |
1615 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1616 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1617 | + | |
1618 | + h = T.tanh(T.concatenate([T.dot(eh, self.W_eh_h), T.dot(shh,self.W_shh_h)])) | |
1619 | + | |
1620 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1621 | + | |
1622 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
1623 | + h4 = T.tanh(T.dot(h3, self.W_h3_h4)) | |
1624 | + | |
1625 | + current_hidden_state = hidden_states[i] | |
1626 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1627 | + | |
1628 | + y_prob = T.nnet.softmax(T.dot(h4,self.W_h4_y) + self.b_y)[0] | |
1629 | + | |
1630 | + return y_prob, hidden_states_new | |
1631 | + | |
1632 | + | |
1633 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1634 | + fn=one_step_classify, | |
1635 | + sequences = [words, children_positions,words_indexes], | |
1636 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
1637 | + | |
1638 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1639 | + sequences = [words_indexes]) | |
1640 | + | |
1641 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1642 | + outputs=predictions, | |
1643 | + allow_input_downcast=True, | |
1644 | + mode='FAST_RUN' | |
1645 | + ) | |
1646 | + | |
1647 | + | |
1648 | + | |
1649 | +class model55_pf13(object): | |
1650 | + def __init__(self, neh, neh2, ne, nshh, nshh2, nchd, nc, w2v_model_path, max_phrase_length): | |
1651 | +        ''' | |
1652 | +        neh, neh2, ne :: layer sizes of the two-layer word-embedding projection | |
1653 | +        nshh, nshh2, nchd :: layer sizes of the two-layer children-state projection | |
1654 | +        nc :: number of classes (the softmax reads the node state directly) | |
1655 | +        w2v_model_path :: path to the pickled embeddings ("vectors" and "words2ids") | |
1656 | +        max_phrase_length :: maximum number of nodes (words) per sentence | |
1657 | +        ''' | |
1658 | + self.max_phrase_length = max_phrase_length | |
1659 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1660 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1661 | + self.words2ids = w2vecs["words2ids"] | |
1662 | + | |
1663 | + #ne = len(w2vecs["words2ids"]) | |
1664 | + de = w2vecs["vectors"].shape[1] | |
1665 | + del w2vecs | |
1666 | + | |
1667 | + r = 0.05 | |
1668 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
1669 | + self.W_eh_eh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, neh2)).astype(theano.config.floatX)) | |
1670 | + self.W_eh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh2, ne)).astype(theano.config.floatX)) | |
1671 | + | |
1672 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
1673 | + self.W_shh_shh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nshh2)).astype(theano.config.floatX)) | |
1674 | + self.W_shh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh2, nchd)).astype(theano.config.floatX)) | |
1675 | + | |
1676 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
1677 | + | |
1678 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1679 | + | |
1680 | + | |
1681 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1682 | + | |
1683 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1684 | + | |
1685 | +            tmp = word_children_positions>=0.0 | |
1686 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1687 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1688 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1689 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1690 | + schh = schh/number_of_children | |
1691 | + | |
1692 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1693 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
1694 | + | |
1695 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1696 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
1697 | + | |
1698 | + | |
1699 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
1700 | + | |
1701 | + | |
1702 | + current_hidden_state = hidden_states[i] | |
1703 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1704 | + | |
1705 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
1706 | + | |
1707 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1708 | + | |
1709 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1710 | + | |
1711 | + return cross_entropy, hidden_states_new | |
1712 | + | |
1713 | + | |
1714 | + y = T.vector('y',dtype=dataType) | |
1715 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1716 | + words = T.vector(dtype=dataType) | |
1717 | + children_positions = T.matrix(dtype=dataType) | |
1718 | + words_indexes = T.vector(dtype=dataType) | |
1719 | + | |
1720 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1721 | + sequences = [words, children_positions,y,words_indexes], | |
1722 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
1723 | + non_sequences = learning_rate, | |
1724 | + n_steps = words.shape[0]) | |
1725 | + | |
1726 | + cost = T.sum(cross_entropy_vector[0]) | |
1727 | + | |
1728 | + updates = OrderedDict([ | |
1729 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
1730 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
1731 | + (self.W_eh_eh2, self.W_eh_eh2-learning_rate*T.grad(cost, self.W_eh_eh2)), | |
1732 | + (self.W_eh2_h, self.W_eh2_h-learning_rate*T.grad(cost, self.W_eh2_h)), | |
1733 | + (self.W_shh2_h, self.W_shh2_h-learning_rate*T.grad(cost, self.W_shh2_h)), | |
1734 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
1735 | + (self.W_shh_shh2, self.W_shh_shh2-learning_rate*T.grad(cost, self.W_shh_shh2)), | |
1736 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1737 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1738 | + ]) | |
1739 | + | |
1740 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1741 | + outputs = [], | |
1742 | + updates = updates, | |
1743 | + allow_input_downcast=True, | |
1744 | + mode='FAST_RUN' | |
1745 | + ) | |
1746 | + | |
1747 | + | |
1748 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1749 | + | |
1750 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1751 | + | |
1752 | +            tmp = word_children_positions>=0.0 | |
1753 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1754 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1755 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1756 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1757 | + schh = schh/number_of_children | |
1758 | + | |
1759 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1760 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
1761 | + | |
1762 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1763 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
1764 | + | |
1765 | + | |
1766 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
1767 | + | |
1768 | + | |
1769 | + current_hidden_state = hidden_states[i] | |
1770 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1771 | + | |
1772 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
1773 | + | |
1774 | + | |
1775 | + return y_prob, hidden_states_new | |
1776 | + | |
1777 | + | |
1778 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1779 | + fn=one_step_classify, | |
1780 | + sequences = [words, children_positions,words_indexes], | |
1781 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
1782 | + | |
1783 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1784 | + sequences = [words_indexes]) | |
1785 | + | |
1786 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1787 | + outputs=predictions, | |
1788 | + allow_input_downcast=True, | |
1789 | + mode='FAST_RUN' | |
1790 | + ) | |
1791 | + | |
1792 | + | |
1793 | + | |
1794 | +class model55_pf14(object): | |
1795 | + def __init__(self, neh, neh2, ne, nshh, nshh2, nchd, nh2, nc, w2v_model_path, max_phrase_length): | |
1796 | +        ''' | |
1797 | +        neh, neh2, ne :: layer sizes of the two-layer word-embedding projection | |
1798 | +        nshh, nshh2, nchd :: layer sizes of the two-layer children-state projection | |
1799 | +        nh2 :: dimension of the classifier hidden layer | |
1800 | +        nc :: number of classes | |
1801 | +        w2v_model_path, max_phrase_length :: pickled embeddings; maximum nodes per sentence | |
1802 | +        ''' | |
1803 | + self.max_phrase_length = max_phrase_length | |
1804 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1805 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1806 | + self.words2ids = w2vecs["words2ids"] | |
1807 | + | |
1808 | + #ne = len(w2vecs["words2ids"]) | |
1809 | + de = w2vecs["vectors"].shape[1] | |
1810 | + del w2vecs | |
1811 | + | |
1812 | + r = 0.05 | |
1813 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
1814 | + self.W_eh_eh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, neh2)).astype(theano.config.floatX)) | |
1815 | + self.W_eh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh2, ne)).astype(theano.config.floatX)) | |
1816 | + | |
1817 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
1818 | + self.W_shh_shh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nshh2)).astype(theano.config.floatX)) | |
1819 | + self.W_shh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh2, nchd)).astype(theano.config.floatX)) | |
1820 | + | |
1821 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
1822 | + self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nc)).astype(theano.config.floatX)) | |
1823 | + | |
1824 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1825 | + | |
1826 | + | |
1827 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1828 | + | |
1829 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1830 | + | |
1831 | +            tmp = word_children_positions>=0.0 | |
1832 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1833 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1834 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1835 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1836 | + schh = schh/number_of_children | |
1837 | + | |
1838 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1839 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
1840 | + | |
1841 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1842 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
1843 | + | |
1844 | + | |
1845 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
1846 | + | |
1847 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1848 | + | |
1849 | + current_hidden_state = hidden_states[i] | |
1850 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1851 | + | |
1852 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
1853 | + | |
1854 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
1855 | + | |
1856 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
1857 | + | |
1858 | + return cross_entropy, hidden_states_new | |
1859 | + | |
1860 | + | |
1861 | + y = T.vector('y',dtype=dataType) | |
1862 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
1863 | + words = T.vector(dtype=dataType) | |
1864 | + children_positions = T.matrix(dtype=dataType) | |
1865 | + words_indexes = T.vector(dtype=dataType) | |
1866 | + | |
1867 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
1868 | + sequences = [words, children_positions,y,words_indexes], | |
1869 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
1870 | + non_sequences = learning_rate, | |
1871 | + n_steps = words.shape[0]) | |
1872 | + | |
1873 | + cost = T.sum(cross_entropy_vector[0]) | |
1874 | + | |
1875 | + updates = OrderedDict([ | |
1876 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
1877 | + (self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cost, self.W_h2_y)), | |
1878 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
1879 | + (self.W_eh_eh2, self.W_eh_eh2-learning_rate*T.grad(cost, self.W_eh_eh2)), | |
1880 | + (self.W_eh2_h, self.W_eh2_h-learning_rate*T.grad(cost, self.W_eh2_h)), | |
1881 | + (self.W_shh2_h, self.W_shh2_h-learning_rate*T.grad(cost, self.W_shh2_h)), | |
1882 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
1883 | + (self.W_shh_shh2, self.W_shh_shh2-learning_rate*T.grad(cost, self.W_shh_shh2)), | |
1884 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
1885 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
1886 | + ]) | |
1887 | + | |
1888 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
1889 | + outputs = [], | |
1890 | + updates = updates, | |
1891 | + allow_input_downcast=True, | |
1892 | + mode='FAST_RUN' | |
1893 | + ) | |
1894 | + | |
1895 | + | |
1896 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
1897 | + | |
1898 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1899 | + | |
1900 | +            tmp = word_children_positions>=0.0 | |
1901 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1902 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1903 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1904 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1905 | + schh = schh/number_of_children | |
1906 | + | |
1907 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1908 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
1909 | + | |
1910 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1911 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
1912 | + | |
1913 | + | |
1914 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
1915 | + | |
1916 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1917 | + | |
1918 | + current_hidden_state = hidden_states[i] | |
1919 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
1920 | + | |
1921 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
1922 | + | |
1923 | + | |
1924 | + return y_prob, hidden_states_new | |
1925 | + | |
1926 | + | |
1927 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
1928 | + fn=one_step_classify, | |
1929 | + sequences = [words, children_positions,words_indexes], | |
1930 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
1931 | + | |
1932 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
1933 | + sequences = [words_indexes]) | |
1934 | + | |
1935 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
1936 | + outputs=predictions, | |
1937 | + allow_input_downcast=True, | |
1938 | + mode='FAST_RUN' | |
1939 | + ) | |
1940 | + | |
1941 | + | |
1942 | + | |
1943 | + | |
1944 | +class model55_pf15(object): | |
1945 | + def __init__(self, neh, neh2, ne, nshh, nshh2, nchd, nh2, nh3, nc, w2v_model_path, max_phrase_length): | |
1946 | +        ''' | |
1947 | +        neh, neh2, ne :: layer sizes of the two-layer word-embedding projection | |
1948 | +        nshh, nshh2, nchd :: layer sizes of the two-layer children-state projection | |
1949 | +        nh2, nh3 :: dimensions of the classifier hidden layers | |
1950 | +        nc :: number of classes | |
1951 | +        w2v_model_path, max_phrase_length :: pickled embeddings; maximum nodes per sentence | |
1952 | +        ''' | |
1953 | + self.max_phrase_length = max_phrase_length | |
1954 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
1955 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
1956 | + self.words2ids = w2vecs["words2ids"] | |
1957 | + | |
1958 | + #ne = len(w2vecs["words2ids"]) | |
1959 | + de = w2vecs["vectors"].shape[1] | |
1960 | + del w2vecs | |
1961 | + | |
1962 | + r = 0.05 | |
1963 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
1964 | + self.W_eh_eh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, neh2)).astype(theano.config.floatX)) | |
1965 | + self.W_eh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh2, ne)).astype(theano.config.floatX)) | |
1966 | + | |
1967 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
1968 | + self.W_shh_shh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nshh2)).astype(theano.config.floatX)) | |
1969 | + self.W_shh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh2, nchd)).astype(theano.config.floatX)) | |
1970 | + | |
1971 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
1972 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
1973 | + self.W_h3_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nc)).astype(theano.config.floatX)) | |
1974 | + | |
1975 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
1976 | + | |
1977 | + | |
1978 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
1979 | + | |
1980 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
1981 | + | |
1982 | +            tmp = word_children_positions>=0.0 | |
1983 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
1984 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
1985 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
1986 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
1987 | + schh = schh/number_of_children | |
1988 | + | |
1989 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
1990 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
1991 | + | |
1992 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
1993 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
1994 | + | |
1995 | + | |
1996 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
1997 | + | |
1998 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
1999 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
2000 | + | |
2001 | + current_hidden_state = hidden_states[i] | |
2002 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2003 | + | |
2004 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
2005 | + | |
2006 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
2007 | + | |
2008 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
2009 | + | |
2010 | + return cross_entropy, hidden_states_new | |
2011 | + | |
2012 | + | |
2013 | + y = T.vector('y',dtype=dataType) | |
2014 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
2015 | + words = T.vector(dtype=dataType) | |
2016 | + children_positions = T.matrix(dtype=dataType) | |
2017 | + words_indexes = T.vector(dtype=dataType) | |
2018 | + | |
2019 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
2020 | + sequences = [words, children_positions,y,words_indexes], | |
2021 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
2022 | + non_sequences = learning_rate, | |
2023 | + n_steps = words.shape[0]) | |
2024 | + | |
2025 | + cost = T.sum(cross_entropy_vector[0]) | |
2026 | + | |
2027 | + updates = OrderedDict([ | |
2028 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
2029 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
2030 | + (self.W_h3_y, self.W_h3_y-learning_rate*T.grad(cost, self.W_h3_y)), | |
2031 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
2032 | + (self.W_eh_eh2, self.W_eh_eh2-learning_rate*T.grad(cost, self.W_eh_eh2)), | |
2033 | + (self.W_eh2_h, self.W_eh2_h-learning_rate*T.grad(cost, self.W_eh2_h)), | |
2034 | + (self.W_shh2_h, self.W_shh2_h-learning_rate*T.grad(cost, self.W_shh2_h)), | |
2035 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
2036 | + (self.W_shh_shh2, self.W_shh_shh2-learning_rate*T.grad(cost, self.W_shh_shh2)), | |
2037 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
2038 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
2039 | + ]) | |
2040 | + | |
2041 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
2042 | + outputs = [], | |
2043 | + updates = updates, | |
2044 | + allow_input_downcast=True, | |
2045 | + mode='FAST_RUN' | |
2046 | + ) | |
2047 | + | |
2048 | + | |
2049 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
2050 | + | |
2051 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
2052 | + | |
2053 | +            tmp = word_children_positions>=0.0 | |
2054 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
2055 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
2056 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
2057 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
2058 | + schh = schh/number_of_children | |
2059 | + | |
2060 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
2061 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
2062 | + | |
2063 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
2064 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
2065 | + | |
2066 | + | |
2067 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
2068 | + | |
2069 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
2070 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
2071 | + | |
2072 | + current_hidden_state = hidden_states[i] | |
2073 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2074 | + | |
2075 | + y_prob = T.nnet.softmax(T.dot(h3,self.W_h3_y) + self.b_y)[0] | |
2076 | + | |
2077 | + | |
2078 | + return y_prob, hidden_states_new | |
2079 | + | |
2080 | + | |
2081 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
2082 | + fn=one_step_classify, | |
2083 | + sequences = [words, children_positions,words_indexes], | |
2084 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
2085 | + | |
2086 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
2087 | + sequences = [words_indexes]) | |
2088 | + | |
2089 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
2090 | + outputs=predictions, | |
2091 | + allow_input_downcast=True, | |
2092 | + mode='FAST_RUN' | |
2093 | + ) | |
2094 | + | |
2095 | + | |
2096 | + | |
2097 | + | |
2098 | +class model55_pf16(object): | |
2099 | + def __init__(self, nchd, nc, w2v_model_path, max_phrase_length): | |
2100 | +        ''' | |
2101 | +        nchd :: dimension of the children-state part of each node vector | |
2102 | +        nc :: number of classes (the softmax reads the node state directly) | |
2103 | +        w2v_model_path :: path to the pickled embeddings ("vectors" and "words2ids") | |
2104 | +        max_phrase_length :: maximum number of nodes (words) per sentence | |
2105 | +        (the word embedding dimension de is read from the embedding file) | |
2106 | +        ''' | |
2107 | + self.max_phrase_length = max_phrase_length | |
2108 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
2109 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
2110 | + self.words2ids = w2vecs["words2ids"] | |
2111 | + | |
2112 | + #ne = len(w2vecs["words2ids"]) | |
2113 | + de = w2vecs["vectors"].shape[1] | |
2114 | + del w2vecs | |
2115 | + | |
2116 | + r = 0.05 | |
2117 | + | |
2118 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nc)).astype(theano.config.floatX)) | |
2119 | + | |
2120 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de+nchd, nchd)).astype(theano.config.floatX)) | |
2121 | + | |
2122 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
2123 | + | |
2124 | + | |
2125 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
2126 | + | |
2127 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
2128 | + | |
2129 | +            tmp = word_children_positions>=0.0 | |
2130 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
2131 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
2132 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
2133 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
2134 | + schh = schh/number_of_children | |
2135 | + | |
2136 | + | |
2137 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
2138 | + | |
2139 | + | |
2140 | + | |
2141 | + current_hidden_state = hidden_states[i] | |
2142 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2143 | + | |
2144 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
2145 | + | |
2146 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
2147 | + | |
2148 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
2149 | + | |
2150 | + return cross_entropy, hidden_states_new | |
2151 | + | |
2152 | + | |
2153 | + y = T.vector('y',dtype=dataType) | |
2154 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
2155 | + words = T.vector(dtype=dataType) | |
2156 | + children_positions = T.matrix(dtype=dataType) | |
2157 | + words_indexes = T.vector(dtype=dataType) | |
2158 | + | |
2159 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
2160 | + sequences = [words, children_positions,y,words_indexes], | |
2161 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))], | |
2162 | + non_sequences = learning_rate, | |
2163 | + n_steps = words.shape[0]) | |
2164 | + | |
2165 | + cost = T.sum(cross_entropy_vector[0]) | |
2166 | + | |
2167 | + updates = OrderedDict([ | |
2168 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
2169 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
2170 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
2171 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
2172 | + ]) | |
2173 | + | |
2174 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
2175 | + outputs = [], | |
2176 | + updates = updates, | |
2177 | + allow_input_downcast=True, | |
2178 | + mode='FAST_RUN' | |
2179 | + ) | |
2180 | + | |
2181 | + | |
2182 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
2183 | + | |
2184 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
2185 | + | |
2186 | +            tmp = word_children_positions>=0.0 | |
2187 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
2188 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
2189 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
2190 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
2191 | + schh = schh/number_of_children | |
2192 | + | |
2193 | + h = T.tanh(T.concatenate([self.emb[word_id], T.dot(schh,self.W_sh_h)])) | |
2194 | + | |
2195 | + current_hidden_state = hidden_states[i] | |
2196 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2197 | + | |
2198 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
2199 | + | |
2200 | + return y_prob, hidden_states_new | |
2201 | + | |
2202 | + | |
2203 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
2204 | + fn=one_step_classify, | |
2205 | + sequences = [words, children_positions,words_indexes], | |
2206 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,de+nchd), dtype = theano.config.floatX))]) | |
2207 | + | |
2208 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
2209 | + sequences = [words_indexes]) | |
2210 | + | |
2211 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
2212 | + outputs=predictions, | |
2213 | + allow_input_downcast=True, | |
2214 | + mode='FAST_RUN' | |
2215 | + ) | |
2216 | + | |
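model55_pf16 above is the smallest member of the family: the node state is the raw word embedding concatenated with a projected average of the children's states, and the softmax reads that state directly. A rough numpy re-statement of its forward pass (a sketch with made-up sizes, not code from this commit; the Theano scan above additionally keeps a zero row at index -1 for the -1 padding):

    import numpy as np

    def pf16_forward(word_ids, children, emb, W_sh_h, W_h_y, b_y):
        """children[i] lists the positions of node i's children; children must precede parents."""
        de, nchd = emb.shape[1], W_sh_h.shape[1]
        h = np.zeros((len(word_ids), de + nchd))        # hidden-state buffer, one row per node
        probs = []
        for i, w in enumerate(word_ids):
            kids = children[i]
            schh = h[kids].mean(axis=0) if len(kids) else np.zeros(de + nchd)
            h[i] = np.tanh(np.concatenate([emb[w], schh.dot(W_sh_h)]))
            scores = h[i].dot(W_h_y) + b_y
            probs.append(np.exp(scores) / np.exp(scores).sum())   # softmax over the nc classes
        return np.array(probs)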
2217 | + | |
2218 | + | |
2219 | +class model55_pf17(object): | |
2220 | + def __init__(self, neh, neh2, ne, nshh, nshh2, nchd, nh2, nh3, nh4, nc, w2v_model_path, max_phrase_length): | |
2221 | +        ''' | |
2222 | +        neh, neh2, ne :: layer sizes of the two-layer word-embedding projection | |
2223 | +        nshh, nshh2, nchd :: layer sizes of the two-layer children-state projection | |
2224 | +        nh2, nh3, nh4 :: dimensions of the classifier hidden layers | |
2225 | +        nc :: number of classes | |
2226 | +        w2v_model_path, max_phrase_length :: pickled embeddings; maximum nodes per sentence | |
2227 | +        ''' | |
2228 | + self.max_phrase_length = max_phrase_length | |
2229 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
2230 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
2231 | + self.words2ids = w2vecs["words2ids"] | |
2232 | + | |
2233 | + #ne = len(w2vecs["words2ids"]) | |
2234 | + de = w2vecs["vectors"].shape[1] | |
2235 | + del w2vecs | |
2236 | + | |
2237 | + r = 0.05 | |
2238 | + self.W_e_eh = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, neh)).astype(theano.config.floatX)) | |
2239 | + self.W_eh_eh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh, neh2)).astype(theano.config.floatX)) | |
2240 | + self.W_eh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (neh2, ne)).astype(theano.config.floatX)) | |
2241 | + | |
2242 | + self.W_sh_shh = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nshh)).astype(theano.config.floatX)) | |
2243 | + self.W_shh_shh2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh, nshh2)).astype(theano.config.floatX)) | |
2244 | + self.W_shh2_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (nshh2, nchd)).astype(theano.config.floatX)) | |
2245 | + | |
2246 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
2247 | + self.W_h2_h3 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nh3)).astype(theano.config.floatX)) | |
2248 | + self.W_h3_h4 = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh3, nh4)).astype(theano.config.floatX)) | |
2249 | + self.W_h4_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh4, nc)).astype(theano.config.floatX)) | |
2250 | + | |
2251 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
2252 | + | |
2253 | + | |
2254 | + def one_step(word_id, word_children_positions, y_true, i, hidden_states, learning_rate): | |
2255 | + | |
2256 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
2257 | + | |
2258 | +            tmp = word_children_positions>=0.0 | |
2259 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
2260 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
2261 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
2262 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
2263 | + schh = schh/number_of_children | |
2264 | + | |
2265 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
2266 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
2267 | + | |
2268 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
2269 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
2270 | + | |
2271 | + | |
2272 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
2273 | + | |
2274 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
2275 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
2276 | + h4 = T.tanh(T.dot(h3, self.W_h3_h4)) | |
2277 | + | |
2278 | + current_hidden_state = hidden_states[i] | |
2279 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2280 | + | |
2281 | + y_prob = T.nnet.softmax(T.dot(h4,self.W_h4_y) + self.b_y)[0] | |
2282 | + | |
2283 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
2284 | + | |
2285 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
2286 | + | |
2287 | + return cross_entropy, hidden_states_new | |
2288 | + | |
2289 | + | |
2290 | + y = T.vector('y',dtype=dataType) | |
2291 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
2292 | + words = T.vector(dtype=dataType) | |
2293 | + children_positions = T.matrix(dtype=dataType) | |
2294 | + words_indexes = T.vector(dtype=dataType) | |
2295 | + | |
2296 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
2297 | + sequences = [words, children_positions,y,words_indexes], | |
2298 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
2299 | + non_sequences = learning_rate, | |
2300 | + n_steps = words.shape[0]) | |
2301 | + | |
2302 | + cost = T.sum(cross_entropy_vector[0]) | |
2303 | + | |
2304 | + updates = OrderedDict([ | |
2305 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
2306 | + (self.W_h2_h3, self.W_h2_h3-learning_rate*T.grad(cost, self.W_h2_h3)), | |
2307 | + (self.W_h3_h4, self.W_h3_h4-learning_rate*T.grad(cost, self.W_h3_h4)), | |
2308 | + (self.W_h4_y, self.W_h4_y-learning_rate*T.grad(cost, self.W_h4_y)), | |
2309 | + (self.W_e_eh, self.W_e_eh-learning_rate*T.grad(cost, self.W_e_eh)), | |
2310 | + (self.W_eh_eh2, self.W_eh_eh2-learning_rate*T.grad(cost, self.W_eh_eh2)), | |
2311 | + (self.W_eh2_h, self.W_eh2_h-learning_rate*T.grad(cost, self.W_eh2_h)), | |
2312 | + (self.W_shh2_h, self.W_shh2_h-learning_rate*T.grad(cost, self.W_shh2_h)), | |
2313 | + (self.W_sh_shh, self.W_sh_shh-learning_rate*T.grad(cost, self.W_sh_shh)), | |
2314 | + (self.W_shh_shh2, self.W_shh_shh2-learning_rate*T.grad(cost, self.W_shh_shh2)), | |
2315 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
2316 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
2317 | + ]) | |
2318 | + | |
2319 | + self.train = theano.function( inputs = [words, children_positions, y, words_indexes, learning_rate], | |
2320 | + outputs = [], | |
2321 | + updates = updates, | |
2322 | + allow_input_downcast=True, | |
2323 | + mode='FAST_RUN' | |
2324 | + ) | |
2325 | + | |
2326 | + | |
2327 | + def one_step_classify(word_id, word_children_positions, i, hidden_states): | |
2328 | + | |
2329 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
2330 | + | |
2331 | +            tmp = word_children_positions>=0.0 | |
2332 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
2333 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
2334 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
2335 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
2336 | + schh = schh/number_of_children | |
2337 | + | |
2338 | + eh = T.tanh(T.dot(self.emb[word_id],self.W_e_eh)) | |
2339 | + eh2 = T.tanh(T.dot(eh,self.W_eh_eh2)) | |
2340 | + | |
2341 | + shh = T.tanh(T.dot(schh,self.W_sh_shh)) | |
2342 | + shh2 = T.tanh(T.dot(shh,self.W_shh_shh2)) | |
2343 | + | |
2344 | + | |
2345 | + h = T.tanh(T.concatenate([T.dot(eh2, self.W_eh2_h), T.dot(shh2,self.W_shh2_h)])) | |
2346 | + | |
2347 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
2348 | + h3 = T.tanh(T.dot(h2, self.W_h2_h3)) | |
2349 | + h4 = T.tanh(T.dot(h3, self.W_h3_h4)) | |
2350 | + | |
2351 | + current_hidden_state = hidden_states[i] | |
2352 | + hidden_states_new = T.set_subtensor(current_hidden_state, h) | |
2353 | + | |
2354 | + y_prob = T.nnet.softmax(T.dot(h4,self.W_h4_y) + self.b_y)[0] | |
2355 | + | |
2356 | + | |
2357 | + return y_prob, hidden_states_new | |
2358 | + | |
2359 | + | |
2360 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
2361 | + fn=one_step_classify, | |
2362 | + sequences = [words, children_positions,words_indexes], | |
2363 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
2364 | + | |
2365 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
2366 | + sequences = [words_indexes]) | |
2367 | + | |
2368 | + self.classify = theano.function(inputs=[words,children_positions,words_indexes], | |
2369 | + outputs=predictions, | |
2370 | + allow_input_downcast=True, | |
2371 | + mode='FAST_RUN' | |
2372 | + ) | |
2373 | + | |
2374 | + | |
... | ... |
modules/rnn/models.pyc
0 → 100644
No preview for this file type
modules/rnn/models_with_relations.py
0 → 100644
1 | + | |
2 | +import numpy as np | |
3 | +import time | |
4 | +import sys | |
5 | +import subprocess | |
6 | +import os | |
7 | +import random | |
8 | + | |
9 | +#from modules.data import load | |
10 | +#from modules.rnn.many_models import * | |
11 | +#from modules.metrics.accuracy import conlleval | |
12 | +from modules.utils.tools import load_stanford_data4 | |
13 | + | |
14 | +from theano import pp | |
15 | + | |
16 | +import theano.tensor as T | |
17 | +import theano | |
18 | +from theano.sandbox.rng_mrg import MRG_RandomStreams #as MRG_RandomStreams | |
19 | + | |
20 | +import itertools | |
21 | + | |
22 | +import os.path | |
23 | +import pickle | |
24 | + | |
25 | +from collections import Counter | |
26 | + | |
27 | + | |
28 | + | |
29 | +from theano import tensor as T, printing | |
30 | +from collections import OrderedDict | |
31 | +from theano.ifelse import ifelse | |
32 | + | |
33 | +from keras.preprocessing import sequence as seq | |
34 | + | |
35 | +dataType = 'int64' | |
36 | + | |
37 | + | |
38 | + | |
39 | + | |
40 | +class MLP_2_1(object): | |
41 | + | |
42 | +    # starting point for this model: model55_pf1 | |
43 | + | |
44 | + def __init__(self, ne, nchd, nc, w2v_model_path, max_phrase_length, number_of_relations): | |
45 | +        ''' | |
46 | +        ne, nchd :: sizes of the embedding part and the children-state part of each node vector | |
47 | +        nc :: number of classes | |
48 | +        w2v_model_path, max_phrase_length :: pickled embeddings; maximum nodes per sentence | |
49 | +        number_of_relations :: number of distinct dependency-relation (edge) labels | |
50 | +        ''' | |
51 | + self.max_phrase_length = max_phrase_length | |
52 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
53 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
54 | + self.words2ids = w2vecs["words2ids"] | |
55 | + | |
56 | + #ne = len(w2vecs["words2ids"]) | |
57 | + de = w2vecs["vectors"].shape[1] | |
58 | + del w2vecs | |
59 | + | |
60 | + r = 0.05 | |
61 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
62 | + self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nc)).astype(theano.config.floatX)) | |
63 | + | |
64 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
65 | + | |
66 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
67 | + | |
68 | + self.relations_weights = theano.shared(r * np.random.uniform(-1.0, 1.0, (number_of_relations+1, ne+nchd, ne+nchd)).astype(theano.config.floatX)) | |
69 | + | |
70 | + def one_step(word_id, word_children_positions, y_true, relation, i, hidden_states, learning_rate): | |
71 | + | |
72 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
73 | + | |
74 | +            tmp = word_children_positions>=0.0 | |
75 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
76 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
77 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
78 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
79 | + schh = schh/number_of_children | |
80 | + | |
81 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
82 | + | |
83 | + current_hidden_state = hidden_states[i] | |
84 | + hidden_states_new = T.set_subtensor(current_hidden_state, T.dot(h, self.relations_weights[relation,:,:])) | |
85 | + | |
86 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
87 | + | |
88 | + #l2_norm = T.sum(self.W_h_h2**2) + T.sum(self.W_h2_y**2) + T.sum(self.W_e_h**2) + T.sum(self.W_sh_h**2) + T.sum(self.emb**2) + T.sum(self.b_h**2) + T.sum(self.b_y**2) | |
89 | + | |
90 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
91 | + | |
92 | + return cross_entropy, hidden_states_new | |
93 | + | |
94 | + | |
95 | + y = T.vector('y',dtype=dataType) | |
96 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
97 | + words = T.vector(dtype=dataType) | |
98 | + children_positions = T.matrix(dtype=dataType) | |
99 | + | |
100 | + relations = T.vector(dtype=dataType) | |
101 | + | |
102 | + words_indexes = T.vector(dtype=dataType) | |
103 | + | |
104 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
105 | + sequences = [words, children_positions,y, relations, words_indexes], | |
106 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
107 | + non_sequences = learning_rate, | |
108 | + n_steps = words.shape[0]) | |
109 | + cost = T.sum(cross_entropy_vector[0]) | |
110 | + | |
111 | + updates = OrderedDict([ | |
112 | + (self.W_h_y, self.W_h_y-learning_rate*T.grad(cost, self.W_h_y)), | |
113 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
114 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
115 | + (self.relations_weights, self.relations_weights - learning_rate*T.grad(cost,self.relations_weights)), | |
116 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
117 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
118 | + ]) | |
119 | + | |
120 | + self.train = theano.function( inputs = [words, children_positions, y, relations, words_indexes, learning_rate], | |
121 | + outputs = [], | |
122 | + updates = updates, | |
123 | + allow_input_downcast=True, | |
124 | + mode='FAST_RUN' | |
125 | + ) | |
126 | + | |
127 | + | |
128 | + def one_step_classify(word_id, word_children_positions, relation, i, hidden_states): | |
129 | + | |
130 | +            schh = hidden_states[-1] #+ 0.5  # i.e., the zero vector | |
131 | + | |
132 | +            tmp = word_children_positions>=0.0 | |
133 | +            idx_tmp = tmp.nonzero() # indices of the real children, i.e., entries that are not -1 | |
134 | +            schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
135 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
136 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
137 | + schh = schh/number_of_children | |
138 | + | |
139 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) ) | |
140 | + | |
141 | + current_hidden_state = hidden_states[i] | |
142 | + hidden_states_new = T.set_subtensor(current_hidden_state, T.dot(h, self.relations_weights[relation,:,:])) | |
143 | + | |
144 | + y_prob = T.nnet.softmax(T.dot(h,self.W_h_y) + self.b_y)[0] | |
145 | + | |
146 | + return y_prob, hidden_states_new | |
147 | + | |
148 | + | |
149 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
150 | + fn=one_step_classify, | |
151 | + sequences = [words, children_positions, relations, words_indexes], | |
152 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
153 | + | |
154 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
155 | + sequences = [words_indexes]) | |
156 | + | |
157 | + self.classify = theano.function(inputs=[words,children_positions,relations, words_indexes], | |
158 | + outputs=predictions, | |
159 | + allow_input_downcast=True, | |
160 | + mode='FAST_RUN' | |
161 | + ) | |
162 | + | |
163 | + | |
164 | + | |
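A minimal NumPy sketch of the per-node computation carried out inside one_step / one_step_classify may help when reading the scan code above and the MLP_2_2 class below; the helper name, the plain-array arguments and the shapes are illustrative assumptions only:

import numpy as np

def node_step(word_vec, child_positions, relation, i, hidden_states,
              W_e_h, W_sh_h, relations_weights, W_h_y, b_y):
    # hidden_states[-1] is kept as a zero row, so nodes without children average to zero
    children = [c for c in child_positions if c >= 0]
    schh = hidden_states[-1] + sum(hidden_states[c] for c in children)
    schh = schh / max(len(children), 1)
    # hidden state: projected word embedding concatenated with the projected children summary
    h = np.tanh(np.concatenate([word_vec @ W_e_h, schh @ W_sh_h]))
    # the state stored for the parent is additionally transformed by the relation-specific
    # matrix, which is how the dependency-edge label parameterizes the network
    hidden_states[i] = h @ relations_weights[relation]
    # per-node class probabilities (softmax over h, as in the model above)
    scores = h @ W_h_y + b_y
    y_prob = np.exp(scores - scores.max())
    return y_prob / y_prob.sum(), hidden_states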
165 | +class MLP_2_2(object): | |
166 | + | |
167 | + # starting point of this model: model55_pf2 | |
168 | + | |
169 | + def __init__(self, ne, nchd, nh2, nc, w2v_model_path, max_phrase_length, number_of_relations): | |
170 | + ''' | |
171 | + ne :: dimension of the embedding part of the hidden state | |
172 | + nchd :: dimension of the children-summary part of the hidden state | |
173 | + nh2 :: dimension of the second hidden layer | |
174 | + nc :: number of classes; number_of_relations :: number of distinct edge labels (one extra matrix is kept for rare relations) | |
175 | + ''' | |
176 | + self.max_phrase_length = max_phrase_length | |
177 | + w2vecs = pickle.load(open(w2v_model_path,"r")) | |
178 | + self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
179 | + self.words2ids = w2vecs["words2ids"] | |
180 | + | |
181 | + de = w2vecs["vectors"].shape[1] | |
182 | + del w2vecs | |
183 | + | |
184 | + r = 0.05 | |
185 | + | |
186 | + self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (de, ne)).astype(theano.config.floatX)) | |
187 | + | |
188 | + self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nchd)).astype(theano.config.floatX)) | |
189 | + | |
190 | + self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0, (ne+nchd, nh2)).astype(theano.config.floatX)) | |
191 | + self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0, (nh2, nc)).astype(theano.config.floatX)) | |
192 | + | |
193 | + self.b_y = theano.shared(r * np.random.uniform(-1.0, 1.0, nc).astype(theano.config.floatX)) | |
194 | + | |
195 | + self.relations_weights = theano.shared(r * np.random.uniform(-1.0, 1.0, (number_of_relations+1, ne+nchd, ne+nchd)).astype(theano.config.floatX)) | |
196 | + | |
197 | + def one_step(word_id, word_children_positions, y_true, relation, i, hidden_states, learning_rate): | |
198 | + | |
199 | + schh = hidden_states[-1] #+ 0.5 # i.e., the zero vector | |
200 | + | |
201 | + tmp = word_children_positions>=0.0 | |
202 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. entries that are not -1 | |
203 | + schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
204 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
205 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
206 | + schh = schh/number_of_children | |
207 | + | |
208 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
209 | + #h = T.nnet.sigmoid(T.dot(self.emb[word_id],self.W_eh) + T.dot(schh,self.W_shsh) + self.bh) | |
210 | + | |
211 | + current_hidden_state = hidden_states[i] | |
212 | + hidden_states_new = T.set_subtensor(current_hidden_state, T.dot(h, self.relations_weights[relation,:,:])) | |
213 | + | |
214 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
215 | + | |
216 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
217 | + | |
218 | + cross_entropy = -T.log(y_prob[y_true]) # + norm_coefficient * l2_norm | |
219 | + | |
220 | + return cross_entropy, hidden_states_new | |
221 | + | |
222 | + | |
223 | + y = T.vector('y',dtype=dataType) | |
224 | + learning_rate = T.scalar('lr',dtype=theano.config.floatX) | |
225 | + words = T.vector(dtype=dataType) | |
226 | + children_positions = T.matrix(dtype=dataType) | |
227 | + relations = T.vector(dtype=dataType) | |
228 | + words_indexes = T.vector(dtype=dataType) | |
229 | + | |
230 | + cross_entropy_vector, _ = theano.scan(fn=one_step, \ | |
231 | + sequences = [words, children_positions,y,relations,words_indexes], | |
232 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))], | |
233 | + non_sequences = learning_rate, | |
234 | + n_steps = words.shape[0]) | |
235 | + cost = T.sum(cross_entropy_vector[0]) | |
236 | + | |
237 | + updates = OrderedDict([ | |
238 | + (self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cost, self.W_h_h2)), | |
239 | + (self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cost, self.W_h2_y)), | |
240 | + (self.W_e_h, self.W_e_h-learning_rate*T.grad(cost, self.W_e_h)), | |
241 | + (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cost, self.W_sh_h)), | |
242 | + (self.relations_weights, self.relations_weights - learning_rate*T.grad(cost,self.relations_weights)), | |
243 | + (self.emb, self.emb-learning_rate*T.grad(cost, self.emb)), #updated_current_emb), # | |
244 | + (self.b_y, self.b_y-learning_rate*T.grad(cost,self.b_y)) | |
245 | + ]) | |
246 | + | |
247 | + self.train = theano.function( inputs = [words, children_positions, y, relations, words_indexes, learning_rate], | |
248 | + outputs = [], | |
249 | + updates = updates, | |
250 | + allow_input_downcast=True, | |
251 | + mode='FAST_RUN' | |
252 | + ) | |
253 | + | |
254 | + | |
255 | + def one_step_classify(word_id, word_children_positions, relation, i, hidden_states): | |
256 | + | |
257 | + schh = hidden_states[-1] #+ 0.5 # i.e., the zero vector | |
258 | + | |
259 | + tmp = word_children_positions>=0.0 | |
260 | + idx_tmp = tmp.nonzero() # indices of the real children, i.e. entries that are not -1 | |
261 | + schh = schh + hidden_states[word_children_positions[idx_tmp]].sum(axis=0) # sum of the children's hidden states | |
262 | + number_of_children = tmp.sum(dtype = theano.config.floatX) | |
263 | + number_of_children = ifelse( T.gt(number_of_children, 1.0),number_of_children, 1.0) | |
264 | + schh = schh/number_of_children | |
265 | + | |
266 | + h = T.tanh(T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)])) | |
267 | + | |
268 | + current_hidden_state = hidden_states[i] | |
269 | + hidden_states_new = T.set_subtensor(current_hidden_state, T.dot(h, self.relations_weights[relation,:,:])) | |
270 | + | |
271 | + h2 = T.tanh(T.dot(h, self.W_h_h2)) | |
272 | + | |
273 | + y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y) + self.b_y)[0] | |
274 | + | |
275 | + return y_prob, hidden_states_new | |
276 | + | |
277 | + | |
278 | + [y_probs_classify, hidden_states ], _ = theano.scan( | |
279 | + fn=one_step_classify, | |
280 | + sequences = [words, children_positions, relations, words_indexes], | |
281 | + outputs_info = [None, theano.shared(np.zeros((self.max_phrase_length+1,ne+nchd), dtype = theano.config.floatX))]) | |
282 | + | |
283 | + predictions, _ = theano.scan(lambda i: T.argmax(y_probs_classify[i]), | |
284 | + sequences = [words_indexes]) | |
285 | + | |
286 | + self.classify = theano.function(inputs=[words,children_positions,relations,words_indexes], | |
287 | + outputs=predictions, | |
288 | + allow_input_downcast=True, | |
289 | + mode='FAST_RUN' | |
290 | + ) | |
291 | + | |
292 | + | |
293 | + | |
294 | + | |
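A hypothetical end-to-end sketch of how MLP_2_2 could be combined with load_stanford_data6 from modules/utils/tools.py; the module name, the *_relations.txt file, the dimensions and the hyper-parameter values are assumptions in the style of the experiment scripts, not values taken from this commit:

from modules.rnn.models_with_relations import MLP_2_2   # assumed module name, see the .pyc entry below
from modules.utils.tools import load_stanford_data6

nc, k_rel = 3, 20            # number_of_relations is expected to match k_most_common_relations
rnn = MLP_2_2(ne=100, nchd=100, nh2=80, nc=nc,
              w2v_model_path="embeddings/embedding_and_words2ids_dim300_polish.pkl",
              max_phrase_length=60, number_of_relations=k_rel)

train_data = load_stanford_data6("data/dane_polskie/train/train_labels.txt",
                                 "data/dane_polskie/train/train_parents.txt",
                                 "data/dane_polskie/train/train_sentence.txt",
                                 "data/dane_polskie/train/train_relations.txt",   # assumed file name
                                 rnn.words2ids, True, 1, nc, k_rel)

# each batch is [tokens, children_positions, labels, relations, word_indexes]
for words, children_positions, labels, relations, words_indexes in train_data:
    rnn.train(words, children_positions, labels, relations, words_indexes, 0.002)
predictions = rnn.classify(words, children_positions, relations, words_indexes)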
... | ... |
modules/rnn/models_with_relations.pyc
0 → 100644
No preview for this file type
modules/rnn/tmp.py deleted
1 | -import theano | |
2 | -import numpy as np | |
3 | -import os | |
4 | -import pickle | |
5 | - | |
6 | -from theano import tensor as T, printing | |
7 | -from collections import OrderedDict | |
8 | -from theano.ifelse import ifelse | |
9 | - | |
10 | -theano.config.floatX = 'float64' | |
11 | -dataType = 'int64' | |
12 | - | |
13 | -class model(object): | |
14 | - | |
15 | - def __init__(self, nh, nc, ds, w2v_model_path, max_phrase_length): | |
16 | - ''' | |
17 | - nh :: dimension of hidden state | |
18 | - nc :: number of classes | |
19 | - ne :: number of word embeddings in the vocabulary | |
20 | - de :: dimension of the word embeddings | |
21 | - | |
22 | - ds :: dimension of the sentiment state | |
23 | - ''' | |
24 | - | |
25 | - | |
26 | - | |
27 | - self.max_phrase_length = max_phrase_length | |
28 | - | |
29 | - ###ne = len(model.index2word) | |
30 | - ###de = model.vector_size | |
31 | - | |
32 | - ###vectors = np.zeros((ne,de)) | |
33 | - ###self.words2ids = {} | |
34 | - ###for i in range(len(model.index2word)): | |
35 | - ### self.words2ids[model.index2word[i]] = i | |
36 | - ### vectors[i] = model[model.index2word[i]] | |
37 | - | |
38 | - w2vecs = pickle.load(open(w2v_model_path,"r")) | |
39 | - #self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
40 | - self.emb = theano.shared(np.load("saved_models4/embeddings.npy").astype(theano.config.floatX)) | |
41 | - self.words2ids = w2vecs["words2ids"] | |
42 | - | |
43 | - ne = len(w2vecs["words2ids"]) | |
44 | - de = w2vecs["vectors"].shape[1] | |
45 | - | |
46 | - del w2vecs | |
47 | - | |
48 | - #self.words2ids = {} | |
49 | - #vectors = [] | |
50 | - #i = 0 | |
51 | - #for line in open(w2v_model_path,"r"): | |
52 | - # toks = line.strip("\n").split(" ") | |
53 | - # word = toks[0] | |
54 | - # v = map(float, toks[1:]) | |
55 | - # vectors.append(v) | |
56 | - # self.words2ids[word] = i | |
57 | - # i = i + 1 | |
58 | - #vectors.append(np.zeros((len(vectors[0])))) | |
59 | - #vectors = np.array(vectors) | |
60 | - #print(vectors.shape) | |
61 | - #self.emb = theano.shared(vectors.astype(theano.config.floatX)) | |
62 | - | |
63 | - #ne = i | |
64 | - #de = len(vectors[0]) | |
65 | - | |
66 | - # we will need to handle the case where a word in the data has no embedding in the model | |
67 | - | |
68 | - ###del model | |
69 | - #del vectors | |
70 | - | |
71 | - #self.sent_states = theano.shared(0.2 * np.concatenate(( | |
72 | - # np.random.uniform(-1.0, 1.0,(ne, ds)),np.zeros((1,ds))),axis=0).astype(theano.config.floatX)) | |
73 | - # we add one zero vector, needed to compute the sum of | |
74 | - # children for leaves (i.e. a leaf symbolically has a null child, and that child has zero sentiment) | |
75 | - # also use it to represent rare words in the training set? | |
76 | - # compare: 1) using the zero vector for new words in the training set; 2) taking the values of the most embedding-similar word occurring in the training set | |
77 | - # this will have to be taken into account when applying the network | |
78 | - | |
79 | - r = 0.05 | |
80 | - | |
81 | - | |
82 | - #self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
83 | - # (de, nh)).astype(theano.config.floatX)) | |
84 | - self.W_e_h = theano.shared(np.load("saved_models4/W_eh25.npy").astype(theano.config.floatX)) | |
85 | - | |
86 | - self.W_sh = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
87 | - (ds, nh)).astype(theano.config.floatX)) | |
88 | - | |
89 | - #self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
90 | - # (2*nh, nc)).astype(theano.config.floatX)) | |
91 | - self.W_h2_y = theano.shared(np.load("saved_models4/W_hh225.npy").astype(theano.config.floatX)) | |
92 | - | |
93 | - #self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
94 | - # (2*nh, 2*nh)).astype(theano.config.floatX)) | |
95 | - self.W_h_h2 = theano.shared(np.load("saved_models4/W_h2y25.npy").astype(theano.config.floatX)) | |
96 | - | |
97 | - self.W_ssy = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
98 | - (ds, nc)).astype(theano.config.floatX)) | |
99 | - | |
100 | - #self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
101 | - # (2*nh, nh)).astype(theano.config.floatX)) | |
102 | - self.W_sh_h = theano.shared(np.load("saved_models4/W_shsh25.npy").astype(theano.config.floatX)) | |
103 | - | |
104 | - self.bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) | |
105 | - self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX)) | |
106 | - | |
107 | - | |
108 | - # bundle | |
109 | - self.params = [ self.W_h2_y, self.W_h_h2, self.W_e_h, self.W_sh_h,self.emb]#, self.bh, self.b ] | |
110 | - self.names = [ "W_hh2", 'W_h2y', 'W_eh', 'W_shsh', "embeddings"]#, 'bh', 'b']#, 'h0'] | |
111 | - | |
112 | - | |
113 | - # computes the sentiment of the current word / the prediction | |
114 | - # word_id = the current word | |
115 | - # i = index of word_id in the sentence | |
116 | - # word_children_ids = ids of the current word's children | |
117 | - # word_children_positions = positions of word_children_ids | |
118 | - def one_step(word_id, word_children_ids, word_children_positions, i, hidden_states): | |
119 | - | |
120 | - | |
121 | - | |
122 | - idx_tmp = (word_children_positions>=0).nonzero() | |
123 | - tmp = T.zeros_like(word_children_positions) | |
124 | - tmp2 = T.set_subtensor(tmp[idx_tmp], 1) | |
125 | - number_of_children = tmp2.sum() | |
126 | - | |
127 | - #pnoc = theano.printing.Print('Number of children: ') | |
128 | - #printed_number_of_children = pnoc(number_of_children) | |
129 | - | |
130 | - | |
131 | - # try inserting a vector of 0.5 values instead of zeros | |
132 | - schh = hidden_states[word_children_positions].sum(axis=0) /( number_of_children + 0.000001) # the 0.000001 avoids ifs when there are no children (the sum is zero anyway, so the division does not matter) | |
133 | - h = T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) # no bias and no sigmoid | |
134 | - | |
135 | - #h = T.nnet.sigmoid(T.dot(self.emb[word_id],self.W_eh) + T.dot(schh,self.W_shsh) + self.bh) | |
136 | - | |
137 | - #h_s = T.zeros_like(hidden_states) | |
138 | - #zeros_subtensor = h_s[i] | |
139 | - #new_h_s = T.set_subtensor(zeros_subtensor, h) | |
140 | - | |
141 | - zeros_subtensor = hidden_states[i] | |
142 | - hidden_states_new = T.set_subtensor(zeros_subtensor, h) | |
143 | - | |
144 | - h2 = T.dot(h, self.W_h_h2) | |
145 | - | |
146 | - y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y))# + self.b) | |
147 | - | |
148 | - | |
149 | - # the above is not done very sensibly: if there are several "-1" entries among the children, that vector is added that many times | |
150 | - # could this be turned into a loop so that the -1 vector is not added several times? | |
151 | - # for now it does not matter, because that vector is constantly zero - it does not change during training | |
152 | - | |
153 | - return i+1, hidden_states_new, y_prob | |
154 | - | |
155 | - | |
156 | - | |
157 | - words = T.vector(dtype=dataType) | |
158 | - children_ids = T.matrix(dtype=dataType) | |
159 | - children_positions = T.matrix(dtype=dataType) | |
160 | - | |
161 | - y_probs, _ = theano.scan(fn=one_step, \ | |
162 | - sequences = [words, children_ids, children_positions], | |
163 | - outputs_info = [theano.shared(0), | |
164 | - theano.shared(np.zeros((self.max_phrase_length+1,2*nh), dtype = theano.config.floatX)), | |
165 | - None], | |
166 | - n_steps = words.shape[0]) | |
167 | - | |
168 | - | |
169 | - estimated_probs = y_probs[-1][-1][0] | |
170 | - | |
171 | - y_pred = T.argmax(estimated_probs) # y_probs[-1][-1][0] returns the vector [P(y=0), P(y=1)] -> argmax gives the predicted class | |
172 | - # we take the prediction for the last word, and the classification of the last word corresponds to the classification of the phrase, | |
173 | - # because the words are ordered so that the root is the last word | |
174 | - | |
175 | - | |
176 | - y = T.scalar('y',dtype=dataType) | |
177 | - | |
178 | - # cost and gradients and learning rate | |
179 | - lr = T.scalar('lr',dtype=theano.config.floatX) | |
180 | - | |
181 | - nll = -T.log(estimated_probs)[y] # the same as (verified): | |
182 | - #nll = T.nnet.nnet.categorical_crossentropy(estimated_probs,T.extra_ops.to_one_hot(y.dimshuffle('x'), 5)[0]) | |
183 | - | |
184 | - gradients = T.grad( nll, self.params ) | |
185 | - updates = OrderedDict(( p, p-lr*g ) for p, g in zip( self.params , gradients)) | |
186 | - | |
187 | - # note: the last row of the sent_states matrix - the vector for a child that does not exist - is constantly equal to zero | |
188 | - | |
189 | - | |
190 | - # theano functions | |
191 | - self.classify = theano.function(inputs=[words,children_ids,children_positions], outputs=y_pred, | |
192 | - allow_input_downcast=True, | |
193 | - mode='FAST_RUN' ) | |
194 | - | |
195 | - self.train = theano.function( inputs = [words,children_ids, children_positions, y, lr], | |
196 | - outputs = nll, | |
197 | - updates = updates, | |
198 | - allow_input_downcast=True, | |
199 | - mode='FAST_RUN' ) | |
200 | - | |
201 | - | |
202 | - #self.normalize = theano.function( inputs = [], # beware of division by 0 - the last row of sent_states is zero | |
203 | - # updates = {self.sent_states:\ | |
204 | - # self.sent_states/T.sqrt((self.sent_states**2).sum(axis=1))})#.dimshuffle(0,'x')}) | |
205 | - | |
206 | - def save(self, folder, e): | |
207 | - for param, name in zip(self.params, self.names): | |
208 | - np.save(os.path.join(folder, name + str(e) + '.npy'), param.get_value()) | |
209 | - | |
210 | - | |
211 | - | |
212 | - | |
213 | - | |
214 | - | |
215 | -class model2(object): | |
216 | - | |
217 | - ''' | |
218 | - | |
219 | - ''' | |
220 | - | |
221 | - | |
222 | - def __init__(self, nh, nc, ds, w2v_model_path, max_phrase_length): | |
223 | - ''' | |
224 | - nh :: dimension of hidden state | |
225 | - nc :: number of classes | |
226 | - ne :: number of word embeddings in the vocabulary | |
227 | - de :: dimension of the word embeddings | |
228 | - | |
229 | - ds :: dimension of the sentiment state | |
230 | - ''' | |
231 | - | |
232 | - self.max_phrase_length = max_phrase_length | |
233 | - | |
234 | - w2vecs = pickle.load(open(w2v_model_path,"r")) | |
235 | - | |
236 | - self.emb = theano.shared(w2vecs["vectors"].astype(theano.config.floatX)) | |
237 | - #self.emb = theano.shared(np.load("saved_models_final1/embeddings"+str(e)+"_200.npy").astype(theano.config.floatX)) | |
238 | - | |
239 | - self.words2ids = w2vecs["words2ids"] | |
240 | - | |
241 | - ne = len(w2vecs["words2ids"]) | |
242 | - de = w2vecs["vectors"].shape[1] | |
243 | - | |
244 | - del w2vecs | |
245 | - | |
246 | - #self.sent_states = theano.shared(0.2 * np.concatenate(( | |
247 | - # np.random.uniform(-1.0, 1.0,(ne, ds)),np.zeros((1,ds))),axis=0).astype(theano.config.floatX)) | |
248 | - # we add one zero vector, needed to compute the sum of | |
249 | - # children for leaves (i.e. a leaf symbolically has a null child, and that child has zero sentiment) | |
250 | - # also use it to represent rare words in the training set? | |
251 | - # compare: 1) using the zero vector for new words in the training set; 2) taking the values of the most embedding-similar word occurring in the training set | |
252 | - # this will have to be taken into account when applying the network | |
253 | - | |
254 | - r = 0.05 | |
255 | - | |
256 | - | |
257 | - self.W_e_h = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
258 | - (de, nh)).astype(theano.config.floatX)) | |
259 | - #self.W_e_h = theano.shared(np.load("saved_models_final1/W_eh"+str(e)+"_200.npy").astype(theano.config.floatX)) | |
260 | - | |
261 | - self.W_sh = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
262 | - (ds, nh)).astype(theano.config.floatX)) | |
263 | - | |
264 | - self.W_h2_y = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
265 | - (2*nh, nc)).astype(theano.config.floatX)) | |
266 | - #self.W_h2_y = theano.shared(np.load("saved_models_final1/W_h2y"+str(e)+"_200.npy").astype(theano.config.floatX)) | |
267 | - | |
268 | - self.W_h_h2 = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
269 | - (2*nh, 2*nh)).astype(theano.config.floatX)) | |
270 | - #self.W_h_h2 = theano.shared(np.load("saved_models_final1/W_hh2"+str(e)+"_200.npy").astype(theano.config.floatX)) | |
271 | - | |
272 | - self.W_ssy = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
273 | - (ds, nc)).astype(theano.config.floatX)) | |
274 | - | |
275 | - self.W_sh_h = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
276 | - (2*nh, nh)).astype(theano.config.floatX)) | |
277 | - #self.W_sh_h = theano.shared(np.load("saved_models_final1/W_shsh"+str(e)+"_200.npy").astype(theano.config.floatX)) | |
278 | - | |
279 | - | |
280 | - self.W_h_y = theano.shared(r * np.random.uniform(-1.0, 1.0,\ | |
281 | - (2*nh, nc)).astype(theano.config.floatX)) | |
282 | - | |
283 | - self.bh = theano.shared(np.zeros(nh, dtype=theano.config.floatX)) | |
284 | - self.b = theano.shared(np.zeros(nc, dtype=theano.config.floatX)) | |
285 | - | |
286 | - | |
287 | - # bundle | |
288 | - self.params = [ self.W_h_y, self.W_e_h, self.W_sh_h, self.emb]# self.W_h2_y, self.W_h_h2, | |
289 | - self.names = [ "W_h_y", 'W_eh', 'W_shsh', "embeddings"]# 'W_h2y', "W_hh2", | |
290 | - | |
291 | - | |
292 | - shared_zero = theano.shared(0) | |
293 | - shared_one = theano.shared(1) | |
294 | - | |
295 | - # computes the sentiment of the current word / the prediction | |
296 | - # word_id = the current word | |
297 | - # i = index of word_id in the sentence | |
298 | - # word_children_ids = ids of the current word's children | |
299 | - # word_children_positions = positions of word_children_ids | |
300 | - def one_step(word_id, word_children_ids, word_children_positions, y_true, i, hidden_states, learning_rate): | |
301 | - | |
302 | - p = printing.Print('word_children_positions: ') | |
303 | - word_children_positions = p(word_children_positions) | |
304 | - | |
305 | - | |
306 | - idx_tmp = (word_children_positions>=0).nonzero() | |
307 | - tmp = T.zeros_like(word_children_positions) | |
308 | - tmp2 = T.set_subtensor(tmp[idx_tmp], 1) | |
309 | - number_of_children = tmp2.sum(dtype = dataType) | |
310 | - | |
311 | - number_of_children = ifelse(T.eq(number_of_children, shared_zero), shared_one, number_of_children) | |
312 | - # try inserting a vector of 0.5 values instead of zeros | |
313 | - | |
314 | - hello_world_op = printing.Print('number_of_children: ') | |
315 | - number_of_children = hello_world_op(number_of_children) | |
316 | - | |
317 | - | |
318 | - schh = hidden_states[word_children_positions].sum(axis=0) / number_of_children#( number_of_children + 0.000001) | |
319 | -# the 0.000001 was added to avoid ifs when there are no children (the sum is zero anyway, so the division does not matter) | |
320 | - h = T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) # no bias and no sigmoid | |
321 | - | |
322 | - #h = T.nnet.sigmoid(T.dot(self.emb[word_id],self.W_eh) + T.dot(schh,self.W_shsh) + self.bh) | |
323 | - | |
324 | - #h_s = T.zeros_like(hidden_states) | |
325 | - #zeros_subtensor = h_s[i] | |
326 | - #new_h_s = T.set_subtensor(zeros_subtensor, h) | |
327 | - | |
328 | - zeros_subtensor = hidden_states[i] | |
329 | - hidden_states_new = T.set_subtensor(zeros_subtensor, h) | |
330 | - | |
331 | - #h2 = T.dot(h, self.W_h_h2) | |
332 | - | |
333 | - #y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y))# + self.b) | |
334 | - | |
335 | - y_prob = T.nnet.softmax(T.dot(h,self.W_h_y))# + self.b) | |
336 | - | |
337 | - cce = -T.log(y_prob[0][y_true]) | |
338 | - | |
339 | - #learning_rate = 0.01 | |
340 | - | |
341 | - updates = OrderedDict([#(self.W_h2_y, self.W_h2_y-learning_rate*T.grad(cce, self.W_h2_y)), | |
342 | - (self.W_h_y, self.W_h_y-learning_rate*T.grad(cce, self.W_h_y)), | |
343 | - #(self.W_h_h2, self.W_h_h2-learning_rate*T.grad(cce, self.W_h_h2)), | |
344 | - (self.W_e_h, self.W_e_h-learning_rate*T.grad(cce, self.W_e_h)), | |
345 | - (self.W_sh_h, self.W_sh_h-learning_rate*T.grad(cce, self.W_sh_h)), | |
346 | - (self.emb, self.emb-learning_rate*T.grad(cce, self.emb)) | |
347 | - ]) | |
348 | - | |
349 | - | |
350 | - return (i+1,hidden_states_new, y_prob), updates | |
351 | - | |
352 | - | |
353 | - | |
354 | - | |
355 | - y = T.vector('y',dtype=dataType) | |
356 | - | |
357 | - lr = T.scalar('lr',dtype=theano.config.floatX) | |
358 | - | |
359 | - words = T.vector(dtype=dataType) | |
360 | - children_ids = T.matrix(dtype=dataType) | |
361 | - children_positions = T.matrix(dtype=dataType) | |
362 | - #words_indexes = T.vector(dtype=dataType) | |
363 | - | |
364 | - y_probs, upd = theano.scan(fn=one_step, \ | |
365 | - sequences = [words, children_ids, children_positions,y],#,words_indexes], | |
366 | - outputs_info = [theano.shared(0), | |
367 | - theano.shared(np.zeros((self.max_phrase_length+1,2*nh), dtype = theano.config.floatX)), | |
368 | - None], | |
369 | - non_sequences = lr, | |
370 | - n_steps = words.shape[0]) | |
371 | - | |
372 | - | |
373 | - def one_step_classify(word_id, word_children_ids, word_children_positions, i, hidden_states): | |
374 | - | |
375 | - | |
376 | - idx_tmp = (word_children_positions>=0).nonzero() | |
377 | - tmp = T.zeros_like(word_children_positions) | |
378 | - tmp2 = T.set_subtensor(tmp[idx_tmp], 1) | |
379 | - number_of_children = tmp2.sum() | |
380 | - | |
381 | - schh = hidden_states[word_children_positions].sum(axis=0) / ifelse(T.eq(number_of_children, shared_zero), shared_one, number_of_children) | |
382 | - h = T.concatenate([T.dot(self.emb[word_id],self.W_e_h), T.dot(schh,self.W_sh_h)]) # bez biasa i sigmoida | |
383 | - | |
384 | - zeros_subtensor = hidden_states[i] | |
385 | - hidden_states_new = T.set_subtensor(zeros_subtensor, h) | |
386 | - | |
387 | - #h2 = T.dot(h, self.W_h_h2) | |
388 | - #y_prob = T.nnet.softmax(T.dot(h2,self.W_h2_y))# + self.b) | |
389 | - y_prob = T.nnet.softmax(T.dot(h,self.W_h_y)) | |
390 | - | |
391 | - return i+1, hidden_states_new, y_prob | |
392 | - | |
393 | - | |
394 | - | |
395 | - y_probs_classify, _ = theano.scan(fn=one_step_classify, \ | |
396 | - sequences = [words, children_ids, children_positions], | |
397 | - outputs_info = [theano.shared(0), | |
398 | - theano.shared(np.zeros((self.max_phrase_length+1,2*nh), dtype = theano.config.floatX)), | |
399 | - None], | |
400 | - n_steps = words.shape[0]) | |
401 | - | |
402 | - | |
403 | - | |
404 | - | |
405 | - predictions, _ = theano.scan(lambda i: (i+1, T.argmax(y_probs_classify[2][i][0])), outputs_info = [theano.shared(0), None], n_steps = y_probs_classify[2].shape[0]) | |
406 | - | |
407 | - #res2 , _ = theano.scan(lambda x,i : (i+1, T.argmax(x)), | |
408 | - # sequences = [estimated_probs[1]], | |
409 | - # outputs_info = [theano.shared(0), None] | |
410 | - # ) | |
411 | - | |
412 | - | |
413 | -# minus_log_true_class_prob = res[1] | |
414 | - #prediction_class = res2[1] | |
415 | - | |
416 | - | |
417 | -# nll = minus_log_true_class_prob.sum() | |
418 | - | |
419 | - #y_pred = T.argmax(estimated_probs) # y_probs[-1][-1][0] returns the vector [P(y=0), P(y=1), ...] -> argmax gives the predicted class | |
420 | - # we take the prediction for the last word, and the classification of the last word corresponds to the classification of the phrase, | |
421 | - # because the words are ordered so that the root is the last word | |
422 | - | |
423 | - | |
424 | - | |
425 | - | |
426 | - # cost and gradients and learning rate | |
427 | - #nll = -T.log(estimated_probs[1])[y] # the same as (verified): | |
428 | - #nll = T.nnet.nnet.categorical_crossentropy(estimated_probs,T.extra_ops.to_one_hot(y.dimshuffle('x'), 5)[0]) | |
429 | - | |
430 | -# gradients = T.grad( nll, self.params ) | |
431 | -# updates = OrderedDict(( p, p-lr*g ) for p, g in zip( self.params , gradients)) | |
432 | - | |
433 | - # note: the last row of the sent_states matrix - the vector for a child that does not exist - is constantly equal to zero | |
434 | - | |
435 | - | |
436 | - # theano functions | |
437 | - self.classify = theano.function(inputs=[words,children_ids,children_positions], outputs=predictions[1], | |
438 | - allow_input_downcast=True, | |
439 | - mode='FAST_RUN' ) | |
440 | - | |
441 | - self.train = theano.function( inputs = [words,children_ids, children_positions, y, lr],#, words_indexes | |
442 | - outputs = [],#nll, | |
443 | - updates = upd,#updates, | |
444 | - allow_input_downcast=True, | |
445 | - mode='FAST_RUN' ) | |
446 | - | |
447 | - | |
448 | - #self.normalize = theano.function( inputs = [], # beware of division by 0 - the last row of sent_states is zero | |
449 | - # updates = {self.sent_states:\ | |
450 | - # self.sent_states/T.sqrt((self.sent_states**2).sum(axis=1))})#.dimshuffle(0,'x')}) | |
451 | - | |
452 | - def save(self, folder, e, i): | |
453 | - for param, name in zip(self.params, self.names): | |
454 | - np.save(os.path.join(folder, name + str(e) + "_" + str(i) + '.npy'), param.get_value()) |
modules/rnn/tmp.pyc deleted
No preview for this file type
modules/utils/tools.py
... | ... | @@ -3,8 +3,12 @@ import numpy |
3 | 3 | from keras.preprocessing import sequence as seq |
4 | 4 | import theano |
5 | 5 | |
6 | +from collections import Counter | |
7 | + | |
6 | 8 | import pickle |
7 | 9 | |
10 | + | |
11 | + | |
8 | 12 | def shuffle(lol, seed): |
9 | 13 | ''' |
10 | 14 | lol :: list of list as input |
... | ... | @@ -70,7 +74,6 @@ def filter_embeddings(datasets, embedding_path, destination): |
70 | 74 | |
71 | 75 | |
72 | 76 | |
73 | - | |
74 | 77 | def words_in_from_down_to_top_order(sentence_tree): |
75 | 78 | #print sentence_tree |
76 | 79 | levels = numpy.setdiff1d(range(len(sentence_tree)),numpy.unique(sentence_tree)) # returns the word(s) that are nobody's child - i.e. they should be the root(s) of the phrase(s)
... | ... | @@ -81,7 +84,8 @@ def words_in_from_down_to_top_order(sentence_tree): |
81 | 84 | for i in range(len(sentence_tree)): |
82 | 85 | #print i |
83 | 86 | #print levels[i] |
84 | - levels.extend(numpy.setdiff1d(sentence_tree[levels[i]],-1)) | |
87 | + x = numpy.setdiff1d(sentence_tree[levels[i]],-1) | |
88 | + levels.extend(x[x<len(sentence_tree)]) | |
85 | 89 | |
86 | 90 | ordered_words = numpy.array(levels)[levels != numpy.array(-1)][::-1] # we reverse the order so that the deepest words come first
87 | 91 | |
... | ... | @@ -94,7 +98,6 @@ def words_in_from_down_to_top_order(sentence_tree): |
94 | 98 | |
95 | 99 | |
96 | 100 | |
97 | - | |
98 | 101 | def load_conll_data(conll_format_data, words2ids): |
99 | 102 | |
100 | 103 | |
... | ... | @@ -633,12 +636,7 @@ def load_stanford_data3(labels, parents, tokens, words2ids, use_batch, batch_siz |
633 | 636 | |
634 | 637 | def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_size, nb_classes): |
635 | 638 | |
636 | - | |
637 | - | |
638 | - | |
639 | 639 | def transform_labels(x, nb_classes): |
640 | - | |
641 | - | |
642 | 640 | if nb_classes == 3: |
643 | 641 | if x =='#' or int(x) == 0: |
644 | 642 | return 1 |
... | ... | @@ -685,19 +683,12 @@ def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_siz |
685 | 683 | |
686 | 684 | for labels_i,parents_i,tokens_i in zip(labels,parents,tokens): |
687 | 685 | |
688 | - | |
689 | - | |
690 | 686 | k = k + 1 |
691 | - | |
692 | - | |
687 | + | |
693 | 688 | s = [] |
694 | 689 | for i in range(len(tokens_i)): |
695 | 690 | s.append([i,int(parents_i[i]),labels_i[i],tokens_i[i]]) |
696 | 691 | |
697 | - | |
698 | - | |
699 | - | |
700 | - | |
701 | 692 | if len(s) == 1 and use_batch == False: # the case where the phrase consists of a single token
702 | 693 | |
703 | 694 | #if nb_classes == 2: |
... | ... | @@ -743,7 +734,6 @@ def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_siz |
743 | 734 | # if current_sentence[3][-1] <0: |
744 | 735 | # continue |
745 | 736 | |
746 | - | |
747 | 737 | if use_batch == True: |
748 | 738 | |
749 | 739 | # at the moment len(current_sentence[0]) is not used anywhere
... | ... | @@ -770,8 +760,7 @@ def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_siz |
770 | 760 | # we drop the children-id matrix: batch_children_ids.append(current_batch[sent][0][1][tok])
771 | 761 | batch_labels.append(current_batch[sent][0][2][tok]) |
772 | 762 | batch_words.append(current_batch[sent][0][3][tok]) |
773 | - | |
774 | - | |
763 | + | |
775 | 764 | #wyrzucamy macierz id dzieci batch_children_ids = seq.pad_sequences(batch_children_ids, padding='post', value = -1) |
776 | 765 | batch_children_positions = seq.pad_sequences(batch_children_positions, padding='post', value = -1) |
777 | 766 | |
... | ... | @@ -785,8 +774,7 @@ def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_siz |
785 | 774 | |
786 | 775 | current_batch, batch_tokens, batch_children_positions, batch_labels = [], [], [], [] |
787 | 776 | batch_words = [] |
788 | - | |
789 | - | |
777 | + | |
790 | 778 | else: |
791 | 779 | |
792 | 780 | sentences.append(current_sentence) |
... | ... | @@ -826,15 +814,13 @@ def load_stanford_data4(labels, parents, tokens, words2ids, use_batch, batch_siz |
826 | 814 | numpy.array(batch_labels) |
827 | 815 | ,numpy.array(batch_words) |
828 | 816 | ]) |
829 | - | |
830 | - | |
831 | - | |
832 | - | |
817 | + | |
833 | 818 | return sentences |
834 | 819 | |
835 | 820 | |
836 | 821 | |
837 | 822 | |
823 | + | |
838 | 824 | def load_stanford_data5(labels, parents, tokens, words2ids, use_batch, batch_size, nb_classes): |
839 | 825 | |
840 | 826 | |
... | ... | @@ -1033,5 +1019,200 @@ def load_stanford_data5(labels, parents, tokens, words2ids, use_batch, batch_siz |
1033 | 1019 | |
1034 | 1020 | |
1035 | 1021 | |
1022 | +def load_stanford_data6(labels, parents, tokens, relations, words2ids, use_batch, batch_size, nb_classes, k_most_common_relations): | |
1023 | + | |
1024 | + def transform_labels(x, nb_classes): | |
1025 | + if nb_classes == 3: | |
1026 | + if x =='#' or int(x) == 0: | |
1027 | + return 1 | |
1028 | + elif int(x) < 0: | |
1029 | + return 0 | |
1030 | + else: | |
1031 | + return 2 | |
1032 | + elif nb_classes == 5: | |
1033 | + if x =='#': | |
1034 | + return 2 | |
1035 | + else: | |
1036 | + return int(x)+2 | |
1037 | + # elif nb_classes == 2: # if we want two classes, we drop the neutral examples from the dataset, | |
1038 | + # if x =='#' or int(x) == 0: | |
1039 | + # return -1 | |
1040 | + # elif int(x) < 0: | |
1041 | + # return 0 | |
1042 | + # else: | |
1043 | + # return 1 | |
1044 | + | |
1045 | + sentences = [] | |
1046 | + | |
1047 | + l = open(labels, "r") | |
1048 | + # 5 classes: labels = [[2 if y=='#' else int(y)+2 for y in x.split()] for x in l.readlines()] | |
1049 | + | |
1050 | + # For now we substitute the value 2 for "#" | |
1051 | + | |
1052 | + labels = [[transform_labels(y,nb_classes) for y in x.split()] for x in l.readlines()] | |
1053 | + l.close() | |
1054 | + | |
1055 | + p = open(parents,"r") | |
1056 | + parents = [[int(y) for y in x.split()] for x in p.readlines()] | |
1057 | + p.close() | |
1058 | + | |
1059 | + t = open(tokens,"r") | |
1060 | + tokens = [x.split() for x in t.readlines()] | |
1061 | + t.close() | |
1062 | + | |
1063 | + | |
1064 | + rels = open(relations,"r") | |
1065 | + relations = [[y for y in x.split()] for x in rels.readlines()] | |
1066 | + rels.close() | |
1067 | + most_common_rels = [x[0] for x in Counter(numpy.concatenate(relations)).most_common(k_most_common_relations)] | |
1068 | + transform_rels = dict(zip(most_common_rels,range(len(most_common_rels)))) | |
1069 | + relations = [[transform_rels.get(x, k_most_common_relations) for x in sent] for sent in relations] | |
1070 | + | |
1071 | + | |
1072 | + k = 0 | |
1073 | + sentence_length = 0 | |
1074 | + current_batch, batch_tokens, batch_children_ids, batch_children_positions, batch_labels, batch_relations = [], [], [], [], [], [] | |
1075 | + batch_words = [] | |
1076 | + | |
1077 | + for labels_i, parents_i, tokens_i, relations_i in zip(labels,parents,tokens,relations): | |
1078 | + | |
1079 | + k = k + 1 | |
1080 | + | |
1081 | + s = [] | |
1082 | + for i in range(len(tokens_i)): | |
1083 | + s.append([i,int(parents_i[i]),labels_i[i],tokens_i[i],relations_i[i]]) | |
1084 | + | |
1085 | + if len(s) == 1 and use_batch == False: # the case where the phrase consists of a single token | |
1086 | + | |
1087 | + #if nb_classes == 2: | |
1088 | + # if s[0][-1] < 0: | |
1089 | + # continue | |
1090 | + | |
1091 | + sentences.append([\ | |
1092 | + numpy.array([words2ids.get(tokens_i[0], -1)]),\ | |
1093 | + # we drop the children-id matrix: numpy.array([-1], ndmin=2),\ | |
1094 | + numpy.array([-1], ndmin=2), \ | |
1095 | + numpy.array(labels_i[0]), \ | |
1096 | + numpy.array(relations_i[0]) | |
1097 | + ]) | |
1098 | + | |
1099 | + else: | |
1100 | + | |
1101 | + for i in range(len(s)): # not sure whether this breaks for a phrase of length 1 | |
1102 | + children = [] | |
1103 | + for j in range(len(s)): | |
1104 | + if s[j][1] == i+1: | |
1105 | + children.append(s[j][0]) | |
1106 | + s[i].append(children) | |
1107 | + | |
1108 | + words = [x[0] for x in s] | |
1109 | + children = seq.pad_sequences([x[-1] for x in s], padding='post', value = -1) | |
1110 | + tokens = [x[3] for x in s] | |
1111 | + labels_in_batch = [x[2] for x in s] | |
1112 | + relations = [x[4] for x in s] | |
1113 | + | |
1114 | + ordered_words, order = words_in_from_down_to_top_order(children) | |
1115 | + | |
1116 | + if ordered_words is None: | |
1117 | + continue | |
1118 | + | |
1119 | + current_sentence = [ | |
1120 | + numpy.array([words2ids.get(x,-1) for x in tokens])[ordered_words], | |
1121 | + # we drop the children-id matrix: numpy.array([[words2ids.get(tokens[w],-1) if w>=0 else -1 for w in x] | |
1122 | + # for x in children[ordered_words]]), | |
1123 | + numpy.array([[order[w] if w>= 0 else -1 for w in x] for x in children[ordered_words]]), | |
1124 | + numpy.array(labels_in_batch)[ordered_words], | |
1125 | + numpy.array(relations)[ordered_words] , | |
1126 | + numpy.array(words) | |
1127 | + ] | |
1128 | + #if nb_classes == 2: | |
1129 | + # if current_sentence[3][-1] <0: | |
1130 | + # continue | |
1131 | + | |
1132 | + if use_batch == True: | |
1133 | + | |
1134 | + # at the moment len(current_sentence[0]) is not used anywhere | |
1135 | + current_batch.append((current_sentence, len(current_sentence[0]))) | |
1136 | + | |
1137 | + if len(current_batch) % batch_size == 0: | |
1138 | + | |
1139 | + shift = 0 | |
1140 | + | |
1141 | + for sent in range(batch_size): | |
1142 | + | |
1143 | + ##if sent > 0: | |
1144 | + ## shift = shift + current_batch[sent-1][1] | |
1145 | + | |
1146 | + for tok in range(len(current_batch[sent][0][0])): | |
1147 | + | |
1148 | + if sent == 0: | |
1149 | + batch_children_positions.append(current_batch[sent][0][1][tok]) | |
1150 | + else: | |
1151 | + batch_children_positions.append([chd+shift if chd>=0 else -1 for chd in current_batch[sent][0][1][tok]]) | |
1152 | + #batch_children_positions.append(current_batch[sent][0][2][tok]) | |
1036 | 1153 | |
1154 | + batch_tokens.append(current_batch[sent][0][0][tok]) | |
1155 | + # we drop the children-id matrix: batch_children_ids.append(current_batch[sent][0][1][tok]) | |
1156 | + batch_labels.append(current_batch[sent][0][2][tok]) | |
1157 | + batch_relations.append(current_batch[sent][0][3][tok]) | |
1158 | + batch_words.append(current_batch[sent][0][4][tok]) | |
1159 | + | |
1160 | + # we drop the children-id matrix: batch_children_ids = seq.pad_sequences(batch_children_ids, padding='post', value = -1) | |
1161 | + batch_children_positions = seq.pad_sequences(batch_children_positions, padding='post', value = -1) | |
1162 | + | |
1163 | + sentences.append([ | |
1164 | + numpy.array(batch_tokens), | |
1165 | + # we drop the children-id matrix: numpy.array(batch_children_ids), | |
1166 | + numpy.array(batch_children_positions), | |
1167 | + numpy.array(batch_labels), | |
1168 | + numpy.array(batch_relations) | |
1169 | + ,numpy.array(batch_words) | |
1170 | + ]) | |
1171 | + | |
1172 | + current_batch, batch_tokens, batch_children_positions, batch_labels, batch_relations = [], [], [], [], [] | |
1173 | + batch_words = [] | |
1174 | + | |
1175 | + else: | |
1176 | + | |
1177 | + sentences.append(current_sentence) | |
1178 | + | |
1179 | + | |
1180 | + # when the number of sentences is not a multiple of the batch size, the remaining sentences have to be appended at the end: | |
1181 | + if use_batch == True and len(current_batch) > 0: | |
1182 | + | |
1183 | + shift = 0 | |
1184 | + | |
1185 | + for sent in range(len(current_batch)): | |
1186 | + | |
1187 | + #if sent > 0: | |
1188 | + # shift = shift + current_batch[sent-1][1] | |
1189 | + | |
1190 | + for tok in range(len(current_batch[sent][0][0])): | |
1191 | + | |
1192 | + if sent == 0: | |
1193 | + batch_children_positions.append(current_batch[sent][0][1][tok]) | |
1194 | + else: | |
1195 | + batch_children_positions.append([chd+shift if chd>=0 else -1 for chd in current_batch[sent][0][1][tok]]) | |
1196 | + #batch_children_positions.append(current_batch[sent][0][2][tok]) | |
1197 | + | |
1198 | + batch_tokens.append(current_batch[sent][0][0][tok]) | |
1199 | + # we drop the children-id matrix: batch_children_ids.append(current_batch[sent][0][1][tok]) | |
1200 | + batch_labels.append(current_batch[sent][0][2][tok]) | |
1201 | + batch_relations.append(current_batch[sent][0][3][tok]) | |
1202 | + batch_words.append(current_batch[sent][0][4][tok]) | |
1203 | + | |
1204 | + | |
1205 | + # we drop the children-id matrix: batch_children_ids = seq.pad_sequences(batch_children_ids, padding='post', value = -1) | |
1206 | + batch_children_positions = seq.pad_sequences(batch_children_positions, padding='post', value = -1) | |
1207 | + | |
1208 | + sentences.append([ | |
1209 | + numpy.array(batch_tokens), | |
1210 | + # we drop the children-id matrix: numpy.array(batch_children_ids), | |
1211 | + numpy.array(batch_children_positions), | |
1212 | + numpy.array(batch_labels), | |
1213 | + numpy.array(batch_relations) | |
1214 | + ,numpy.array(batch_words) | |
1215 | + ]) | |
1216 | + | |
1217 | + return sentences | |
1037 | 1218 | |
... | ... |
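The relation-id mapping built at the top of load_stanford_data6 can be illustrated in isolation; the relation labels below are invented, only the mapping logic is taken from the function:

from collections import Counter
import numpy

relations = [["subj", "obj", "root"], ["subj", "punct", "root"]]
k_most_common_relations = 2
most_common_rels = [x[0] for x in Counter(numpy.concatenate(relations)).most_common(k_most_common_relations)]
transform_rels = dict(zip(most_common_rels, range(len(most_common_rels))))
ids = [[transform_rels.get(x, k_most_common_relations) for x in sent] for sent in relations]
# ids == [[0, 2, 1], [0, 2, 1]]: the k most frequent labels get ids 0..k-1,
# every rarer label shares the extra id k, matching the extra matrix in relations_weights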
modules/utils/tools.pyc
No preview for this file type
modules/rnn/many_models.py renamed to nieaktualne/many_models.py
modules/rnn/many_models.pyc renamed to nieaktualne/many_models.pyc
No preview for this file type
modules/rnn/nnet_for_dependency_trees.py renamed to nieaktualne/nnet_for_dependency_trees.py
modules/rnn/nnet_for_dependency_trees.pyc renamed to nieaktualne/nnet_for_dependency_trees.pyc
No preview for this file type