# 2_main_stanford.py
import numpy
import time
import sys
import subprocess
import os
import random

from modules.data import load
from modules.rnn.nnet_for_dependency_trees import model2
#from modules.metrics.accuracy import conlleval
from modules.utils.tools import shuffle, words_in_from_down_to_top_order, load_conll_data, load_stanford_data2,load_stanford_data3, filter_embeddings

import theano.tensor as T
import theano

import itertools

import os.path
import pickle

if __name__ == '__main__':

    theano.config.floatX = 'float64'

    file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl"
    if not os.path.exists(file_with_filtered_embeddings):
        print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.")
        filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks", "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks", "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks"],

         "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt",
         file_with_filtered_embeddings)
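
    # The pickle written by filter_embeddings is assumed (from its file name and from
    # rnn.words2ids being used below) to contain the GloVe vectors restricted to the
    # SST vocabulary together with a word -> id mapping.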
        
        


    s = {'lr':0.01,   # 0.03 (with r=0.05) reached a fit of 0.9 on 500 observations after 15 epochs;
                      # by epoch 40 it was around 0.9, then it dropped to a constant prediction of 0,
                      # even though the training set did not contain a single observation with label 0 !!!

                      # 0.03 (r=0.05) on 5000 observations stayed at about 75% until epoch 15, then dropped to a constant prediction of 0

                      # 0.05 - everything else as above: it learned nothing, and by iteration 10 the prediction dropped to a constant 0
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'nepochs':50,
         'seed':345,
         'nh':300, # dimension of hidden state
         'nc':3 , # number of y classes
         'ds':30}  # dimension of sentiment state

    


    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model2(    nh = s['nh'],
                    nc = s['nc'],
                    ds = s['ds'],
                    w2v_model_path = file_with_filtered_embeddings, # path to the file with the pre-filtered embeddings
                    max_phrase_length = 220 # it would be nicer to derive this from the data, but it is not necessary;
                                            # what matters is that it is no smaller than the length of the longest sentence in the data
                    )
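
    # model2 is assumed (from its use below) to expose rnn.words2ids (the
    # word -> id mapping), rnn.train(...), rnn.classify(...) and rnn.save(...);
    # see modules/rnn/nnet_for_dependency_trees.py for the actual interface.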



    #train_size = 500

    for train_size in [0]:

        best_prediction = 0
        early_stop = 0

        #if train_size > 100:
        #    time.sleep(600)

        train_data = load_stanford_data3("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks", rnn.words2ids, True, 5, s['nc'])

        #train_data = [train_data[idx] for idx in numpy.array(random.sample(range(len(train_data)),train_size))]
        #train_data = [train_data[0]]

        dev_data = load_stanford_data2("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt", "data/sst/dev/sents.toks", rnn.words2ids, False, 0, s['nc'])

        test_data = load_stanford_data2("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt", "data/sst/test/sents.toks", rnn.words2ids, False, 0, s['nc'])

        n_train = len(train_data)
        n_dev = len(dev_data)
        n_test = len(test_data)
        #to do: training with early stopping on validation set

        #train_data = theano.shared(train_data)
        #test_data = theano.shared(test_data)

        print("Number of training phrases: ", n_train*5)
        s['clr'] = s['lr']
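
        # Each loaded example is assumed (from the rnn.train / rnn.classify calls
        # below) to be a tuple whose first three elements are the network inputs
        # (token ids plus the dependency-tree structure from dparents.txt) and
        # whose fourth element holds the gold sentiment labels per node.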
        for e in range(s['nepochs']):

            if early_stop == 5:
                break

            #if e < 4:
            #    continue

            #if e>0 and e % 4 == 0:
            #    time.sleep(900)

            print("epoch: ", e)

            # shuffle
            shuffle([train_data], s['seed'])

            tic = time.time()
            for i in range(n_train):
                rnn.train(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3], s['clr'])
                #rnn.normalize()
                #if i == n_train-1: #(i>0 and i % 600 == 0) or i == n_train-1:

            rnn.save("saved_models_final4", train_size*5, e)
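
            # The evaluation blocks below build confusion matrices indexed as
            # [predicted class, gold class]; accuracy is then numpy.diag(counts).sum()
            # divided by counts.sum(). The *_root matrices count only the last node
            # of each tree, assumed to be the sentence root.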

            # Dev:
            counts_dev = numpy.zeros((s['nc'],s['nc']),dtype='int')
            counts_dev_root = numpy.zeros((s['nc'],s['nc']),dtype='int')
            for ii in range(n_dev):
                pred = rnn.classify(dev_data[ii][0], dev_data[ii][1], dev_data[ii][2])
                for j in range(len(pred)):
                    counts_dev[pred[j], dev_data[ii][3][j]] += 1
                counts_dev_root[pred[-1], dev_data[ii][3][-1]] += 1

            # Early stopping: keep the best dev accuracy seen so far; after 5 epochs
            # without improvement the epoch loop breaks (see the check at its top).
            if numpy.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction:
                best_prediction = numpy.diag(counts_dev).sum()/float(counts_dev.sum())
                early_stop = 0
            else:
                early_stop = early_stop + 1

            # Test:
            counts_test = numpy.zeros((s['nc'],s['nc']),dtype='int')
            counts_test_root = numpy.zeros((s['nc'],s['nc']),dtype='int')
            for i in range(n_test):
                pred = rnn.classify(test_data[i][0], test_data[i][1], test_data[i][2])
                for j in range(len(pred)):
                    counts_test[pred[j], test_data[i][3][j]] += 1
                counts_test_root[pred[-1], test_data[i][3][-1]] += 1


            # Train
            counts = numpy.zeros((s['nc'],s['nc']),dtype='int')
            counts_root = numpy.zeros((s['nc'],s['nc']),dtype='int')
            for i in range(n_train):
                if i % 100 == 0: # evaluate the fit on 1/100 of the training set to save time
                    pred = rnn.classify(train_data[i][0], train_data[i][1], train_data[i][2])
                    for j in range(len(pred)):
                        counts[pred[j], train_data[i][3][j]] += 1
                    counts_root[pred[-1], train_data[i][3][-1]] += 1

            print("Validation all: ", numpy.diag(counts_dev).sum()/float(counts_dev.sum()),
                  "   Test all: ", numpy.diag(counts_test).sum()/float(counts_test.sum()),
                  "   Validation root: ", numpy.diag(counts_dev_root).sum()/float(counts_dev_root.sum()),
                  "   Test root: ", numpy.diag(counts_test_root).sum()/float(counts_test_root.sum()),
                  "   Train: ", numpy.diag(counts).sum()/float(counts.sum()),
                  "   Train root: ", numpy.diag(counts_root).sum()/float(counts_root.sum())
                 )

            print(time.time()-tic)