# 2_main_stanford.py
import numpy
import time
import sys
import subprocess
import os
import random

from modules.data import load
from modules.rnn.nnet_for_dependency_trees import model2
#from modules.metrics.accuracy import conlleval
from modules.utils.tools import shuffle, words_in_from_down_to_top_order, load_conll_data, load_stanford_data2, filter_embeddings

import theano.tensor as T
import theano

import itertools

import os.path
import pickle

if __name__ == '__main__':

    theano.config.floatX = 'float64'

    file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl"
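    # The pickle presumably stores the filtered embedding matrix together with a word-to-index
    # mapping (later exposed as rnn.words2ids); this is an assumption based on the file name
    # and on how the file is used below.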
    if not os.path.exists(file_with_filtered_embeddings):
        print("Cannot find file with only needed embeddings. We use 'filter_embeddings' in order to create it.")
        filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks", "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt","data/sst/dev/sents.toks", "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks"],

         "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt",
         file_with_filtered_embeddings)
        
        


    s = {'lr':0.01,   # 0.03 (with r=0.05) reached a fit of 0.9 on 500 observations after 15 epochs;
                      # up to epoch 40 it stayed around 0.9, then dropped to a constant prediction of 0,
                      # even though the training set did not contain a single observation with label 0 !!!

                      # 0.03 (r=0.05) on 5000 observations stayed around 75% until epoch 15, then dropped to a constant prediction of 0

                      # 0.05 - rest as above: it learned nothing, and by iteration 10 the prediction dropped to a constant 0
         'verbose':1,
         'decay':False, # decay on the learning rate if improvement stops
         'nepochs':200,
         'seed':345,
         'nh':300, # dimension of hidden state
         'nc':3 , # number of y classes
         'ds':30}  # dimension of sentiment state
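    # Note: 'nc' = 3 matches the 3x3 confusion matrices computed below; presumably these are the
    # three sentiment classes (negative / neutral / positive) of the SST data, which is an
    # assumption since the label set is not defined in this script.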

    


    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model2(    nh = s['nh'],
                    nc = s['nc'],
                    ds = s['ds'],
                    w2v_model_path = file_with_filtered_embeddings, # path to the file with the embeddings
                    max_phrase_length = 60 # it would be better to derive this from the data, but it is
                                           # not necessary; what matters is that it is not smaller than
                                           # the length of the longest sentence in the data
                    )



    train_data = load_stanford_data2("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks",rnn.words2ids,False)[::8][0:1000]

    #tmp = load_stanford_data2("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt","data/sst/train/sents.toks",rnn.words2ids,False)[::8][0:1000]

    #print(train_data[0][2])
    #print([x[2] for x in tmp[0:3]])
    #2/0
    test_data = load_stanford_data2("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt","data/sst/test/sents.toks",rnn.words2ids,False)[::2][0:300]
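    # Each loaded example appears to be a tuple whose items [0], [1] and [2] are fed to
    # rnn.classify / rnn.train (the word ids, dependency parents and related tree information
    # produced by load_stanford_data2), while item [3] holds the per-node gold labels; this
    # layout is inferred from the indexing below, not from the loader itself.
    # A possible way to derive max_phrase_length from the data instead of hard-coding 60
    # (assuming item [0] is the per-sentence sequence of word ids):
    # max_phrase_length = max(len(x[0]) for x in train_data + test_data)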

    n_train = len(train_data)
    n_test = len(test_data)
    #to do: training with early stopping on validation set
   

    #train_data = theano.shared(train_data)
    #test_data = theano.shared(test_data)

    print("Number of training phrases: ", n_train)
    s['clr'] = s['lr']
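    # 'clr' is the learning rate actually passed to rnn.train below; since 'decay' is False and
    # no decay logic appears in this script, it simply stays equal to the initial 'lr'.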
    for e in range(s['nepochs']):


        # skip epochs 0 and 1 entirely; from epoch 3 onwards, pause 15 minutes before each epoch
        if e < 2:
            continue

        if e > 2:
            time.sleep(900)

        print("epoch: ", e)

        # shuffle
        #shuffle([train_data], s['seed'])
        
        tic = time.time()
        for i in range(n_train):
            
            if i > 0 and i % 100 == 0:
                print(i)
                print(time.time()-tic)

                # periodic evaluation on the test set:
                # counts is a confusion matrix (rows = predicted class, columns = gold label)
                counts = numpy.zeros((3,3),dtype='int')
                for z in range(n_test):
                    pred = rnn.classify(test_data[z][0],test_data[z][1], test_data[z][2])
                    for j in range(len(pred)):
                        counts[pred[j], test_data[z][3][j]] += 1
                print("On test set:")
                print(counts)
                # accuracy = sum of the diagonal (correct predictions) / total number of nodes
                print(numpy.diag(counts).sum()/float(counts.sum()))
     

            #if i % 3 == 0:

            #print rnn.print_ep(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3])
            rnn.train(train_data[i][0],train_data[i][1], train_data[i][2], train_data[i][3], s['clr'])
            
            #rnn.normalize()

            #if s['verbose']:
            #    print ('[learning] epoch %i >> %2.2f%%'%(e,(i+1)*100./n_train),'completed in %.2f (sec) <<\r'%(time.time()-tic))
            #    sys.stdout.flush()

        
        print(time.time()-tic)

        rnn.save("saved_models3", e)

        tic = time.time()
        # Train
        counts = numpy.zeros((3,3),dtype='int')
        for i in range(n_train):
            
            if i % 4 == 0: # evaluate the fit on 1/4 of the training set to save time
                pred  = rnn.classify(train_data[i][0],train_data[i][1], train_data[i][2])
                for j in range(len(pred)):
                    counts[pred[j], train_data[i][3][j]] += 1
        print("On train set:")
        print counts
        print numpy.diag(counts).sum()/float(counts.sum())

        # Test:
        counts = numpy.zeros((3,3),dtype='int')
        for i in range(n_test):
            #if i % 5 == 0:
            pred = rnn.classify(test_data[i][0],test_data[i][1], test_data[i][2])
            for j in range(len(pred)):
                counts[pred[j], test_data[i][3][j]] += 1
        print("On test set:")
        print counts
        print numpy.diag(counts).sum()/float(counts.sum())
     
        # note: this branch is unreachable while the 'continue' for e < 2 at the top of the loop is in place
        if e < 2:
            print(time.time()-tic)