main_stanford.py
import numpy
import time
import sys
import subprocess
import os
import random
from modules.data import load
from modules.rnn.nnet_for_dependency_trees import model
#from modules.metrics.accuracy import conlleval
from modules.utils.tools import shuffle, words_in_from_down_to_top_order, load_conll_data, load_stanford_data, filter_embeddings
import theano.tensor as T
import theano
import itertools
import os.path
import pickle
if __name__ == '__main__':
    theano.config.floatX = 'float64'
    file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl"
    if not os.path.exists(file_with_filtered_embeddings):
        print("Cannot find the file with the filtered embeddings; building it with 'filter_embeddings'.")
        filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks",
                           "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt", "data/sst/dev/sents.toks",
                           "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt", "data/sst/test/sents.toks"],
                          "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt",
                          file_with_filtered_embeddings)
    s = {'lr': 0.01,  # 0.03 (with r=0.05) reached a fit of 0.9 on 500 observations after 15 epochs;
                      # up to epoch 40 it stayed around 0.9, then collapsed to a constant prediction of 0,
                      # even though the training set did not contain a single observation with label 0 !!!
                      # 0.03 (r=0.05) on 5000 observations held at about 75% until epoch 15, then dropped to a constant prediction of 0
                      # 0.05 - rest as above: it learned nothing, and by iteration 10 the prediction dropped to a constant 0
         'verbose': 1,
         'decay': False,  # decay the learning rate if improvement stops
         'nepochs': 200,
         'seed': 345,
         'nh': 300,  # dimension of hidden state
         'nc': 5,    # number of y classes
         'ds': 30}   # dimension of sentiment state
    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(nh=s['nh'],
                nc=s['nc'],
                ds=s['ds'],
                w2v_model_path=file_with_filtered_embeddings,  # path to the embeddings file
                max_phrase_length=60  # ideally this would be derived from the data, but it is not necessary;
                                      # it just has to be no smaller than the length of the longest sentence in the data
                )
    # take every 3rd training phrase and every 5th test phrase to keep runtime manageable
    train_data = load_stanford_data("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt",
                                    "data/sst/train/sents.toks", rnn.words2ids)[::3]
    test_data = load_stanford_data("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt",
                                   "data/sst/test/sents.toks", rnn.words2ids)[::5]
    n_train = len(train_data)
    n_test = len(test_data)
    # to do: training with early stopping on a validation set (a commented sketch follows below)
    #train_data = theano.shared(train_data)
    #test_data = theano.shared(test_data)
    print("Number of training phrases: ", n_train)
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        if e < 18:
            continue  # skip the first 18 epochs, presumably resuming a previous run
        #if e > 15 and e % 3 == 0:
        #    time.sleep(300)
        print("epoch: ", e)
        # shuffle
        #shuffle([train_data], s['seed'])
        #tic = time.time()
        for i in range(n_train):
            if i % 10000 == 0:
                print(i)
            #if i % 3 == 0:
            rnn.train(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3], s['clr'])
            #rnn.normalize()
            #if s['verbose']:
            #    print('[learning] epoch %i >> %2.2f%%' % (e, (i+1)*100./n_train), 'completed in %.2f (sec) <<\r' % (time.time()-tic))
            #    sys.stdout.flush()
        #print(time.time()-tic)
        rnn.save("saved_models", e)
        # Train
        counts = numpy.zeros((5, 5), dtype='int')
        for i in range(n_train):
            if i % 2 == 0:  # evaluate on every second training phrase to save time
                pred = rnn.classify(train_data[i][0], train_data[i][1], train_data[i][2])
                counts[pred, train_data[i][3]] += 1
        print("On train set:")
        print(counts)
        print(numpy.diag(counts).sum() / float(counts.sum()))  # accuracy = trace of the confusion matrix / total count
        # Test:
        counts = numpy.zeros((5, 5), dtype='int')
        for i in range(n_test):
            #if i % 5 == 0:
            pred = rnn.classify(test_data[i][0], test_data[i][1], test_data[i][2])
            counts[pred, test_data[i][3]] += 1
        print("On test set:")
        print(counts)
        print(numpy.diag(counts).sum() / float(counts.sum()))