2_main_stanford.py
import numpy
import time
import sys
import subprocess
import os
import random
from modules.data import load
from modules.rnn.nnet_for_dependency_trees import model2
#from modules.metrics.accuracy import conlleval
from modules.utils.tools import shuffle, words_in_from_down_to_top_order, load_conll_data, load_stanford_data2, load_stanford_data3, filter_embeddings
import theano.tensor as T
import theano
import itertools
import os.path
import pickle
if __name__ == '__main__':

    theano.config.floatX = 'float64'

    file_with_filtered_embeddings = "embeddings/embedding_and_words2ids.pkl"
    if not os.path.exists(file_with_filtered_embeddings):
        print("Cannot find the file with the filtered embeddings. Creating it with 'filter_embeddings'.")
        filter_embeddings(["data/sst/train/dlabels.txt", "data/sst/train/dparents.txt", "data/sst/train/sents.toks",
                           "data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt", "data/sst/dev/sents.toks",
                           "data/sst/test/dlabels.txt", "data/sst/test/dparents.txt", "data/sst/test/sents.toks"],
                          "/home/norbert/Doktorat/clarin2sent/treelstm/data/glove/glove.840B.300d.txt",
                          file_with_filtered_embeddings)
    s = {'lr': 0.01,      # 0.03 (with r=0.05) reached a fit of 0.9 on 500 observations after 15 epochs;
                          # around epoch 40 it was still ~0.9, then it collapsed to a constant prediction of 0,
                          # even though the training set contained not a single observation with label 0!
                          # 0.03 (r=0.05) on 5000 observations stayed at ~75% until epoch 15, then collapsed to a constant prediction of 0.
                          # 0.05 (rest as above): it learned nothing, and after 10 iterations the prediction collapsed to a constant 0.
         'verbose': 1,
         'decay': False,  # decay the learning rate if improvement stops
         'nepochs': 50,
         'seed': 345,
         'nh': 300,       # dimension of the hidden state
         'nc': 3,         # number of y classes
         'ds': 30}        # dimension of the sentiment state
    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model2(nh=s['nh'],
                 nc=s['nc'],
                 ds=s['ds'],
                 w2v_model_path=file_with_filtered_embeddings,  # path to the file with the embeddings
                 max_phrase_length=220  # it would be better to derive this from the data, but it is not necessary;
                                        # what matters is that it is no smaller than the longest sentence in the data
                 )
    #train_size = 500
    for train_size in [0]:
        best_prediction = 0
        early_stop = 0
        #if train_size > 100:
        #    time.sleep(600)
        train_data = load_stanford_data3("data/sst/train/dlabels.txt", "data/sst/train/dparents.txt",
                                         "data/sst/train/sents.toks", rnn.words2ids, True, 5, s['nc'])
        #train_data = [train_data[idx] for idx in numpy.array(random.sample(range(len(train_data)), train_size))]
        #train_data = [train_data[0]]
        dev_data = load_stanford_data2("data/sst/dev/dlabels.txt", "data/sst/dev/dparents.txt",
                                       "data/sst/dev/sents.toks", rnn.words2ids, False, 0, s['nc'])
        test_data = load_stanford_data2("data/sst/test/dlabels.txt", "data/sst/test/dparents.txt",
                                        "data/sst/test/sents.toks", rnn.words2ids, False, 0, s['nc'])
        n_train = len(train_data)
        n_dev = len(dev_data)
        n_test = len(test_data)
        # to do: training with early stopping on the validation set
        #train_data = theano.shared(train_data)
        #test_data = theano.shared(test_data)
        print("Number of training phrases: ", n_train*5)

        s['clr'] = s['lr']
        for e in range(s['nepochs']):
            if early_stop == 5:
                break
            #if e < 4:
            #    continue
            #if e > 0 and e % 4 == 0:
            #    time.sleep(900)
            print("epoch: ", e)
            # shuffle the training data
            shuffle([train_data], s['seed'])
            tic = time.time()
            for i in range(n_train):
                rnn.train(train_data[i][0], train_data[i][1], train_data[i][2], train_data[i][3], s['clr'])
                #rnn.normalize()
            # save the model once per epoch
            #if i == n_train-1:  #(i > 0 and i % 600 == 0) or i == n_train-1:
            rnn.save("saved_models_final4", train_size*5, e)
            # Dev: confusion counts over all nodes and over root nodes only
            counts_dev = numpy.zeros((s['nc'], s['nc']), dtype='int')
            counts_dev_root = numpy.zeros((s['nc'], s['nc']), dtype='int')
            for ii in range(n_dev):
                pred = rnn.classify(dev_data[ii][0], dev_data[ii][1], dev_data[ii][2])
                for j in range(len(pred)):
                    counts_dev[pred[j], dev_data[ii][3][j]] += 1
                counts_dev_root[pred[-1], dev_data[ii][3][-1]] += 1
            # early stopping: count epochs without improvement in dev accuracy
            if numpy.diag(counts_dev).sum()/float(counts_dev.sum()) > best_prediction:
                best_prediction = numpy.diag(counts_dev).sum()/float(counts_dev.sum())
                early_stop = 0
            else:
                early_stop = early_stop + 1
            # Test:
            counts_test = numpy.zeros((s['nc'], s['nc']), dtype='int')
            counts_test_root = numpy.zeros((s['nc'], s['nc']), dtype='int')
            for i in range(n_test):
                pred = rnn.classify(test_data[i][0], test_data[i][1], test_data[i][2])
                for j in range(len(pred)):
                    counts_test[pred[j], test_data[i][3][j]] += 1
                counts_test_root[pred[-1], test_data[i][3][-1]] += 1
            # Train:
            counts = numpy.zeros((s['nc'], s['nc']), dtype='int')
            counts_root = numpy.zeros((s['nc'], s['nc']), dtype='int')
            for i in range(n_train):
                if i % 100 == 0:  # check the fit on 1/100 of the training set to save time
                    pred = rnn.classify(train_data[i][0], train_data[i][1], train_data[i][2])
                    for j in range(len(pred)):
                        counts[pred[j], train_data[i][3][j]] += 1
                    counts_root[pred[-1], train_data[i][3][-1]] += 1
print("Validattion all: ", numpy.diag(counts_dev).sum()/float(counts_dev.sum()),
" Test all: ", numpy.diag(counts_test).sum()/float(counts_test.sum()),
"Validattion root: ", numpy.diag(counts_dev_root).sum()/float(counts_dev_root.sum()),
" Test root: ", numpy.diag(counts_test_root).sum()/float(counts_test_root.sum()),
" Train: ", numpy.diag(counts).sum()/float(counts.sum()),
" Train root: ", numpy.diag(counts_root).sum()/float(counts_root.sum())
)
print(time.time()-tic)