import time
import sys
import random

import numpy
from keras.preprocessing import sequence as seq

from is13.rnn.nnet_for_dependency_trees import model
from is13.utils.tools import shuffle
def load_conll_data(conll_format_data):
    # TODO: use word ids into the embedding matrix instead of positions in the sentence
    label_trans = {'_\n': 0, 'A\n': 1, 'A': 1, 'T': 1, 'T\n': 1}  # currently unused
    sentences = []
    k = 0
    y = 0
    with open(conll_format_data, encoding='utf8') as fr:
        s = []
        for line in fr:
            if len(line) < 2:
                # blank line: the sentence is complete, so collect for every
                # token the positions of its children
                for i in range(len(s)):
                    children = []
                    for j in range(len(s)):
                        if s[j][1] == i + 1:
                            children.append(s[j][0])
                    s[i].append(children)
                words = [x[0] for x in s]  # positions stand in for word ids, see TODO
                children = seq.pad_sequences([x[3] for x in s], padding='post', value=-1)
                if len(s) > 2:
                    sentences.append((words, children, y))
                s = []
                k = 0
            else:
                toks = line.split(' ')
                word = toks[1]
                parent = int(toks[6])
                sentiment = int(toks[-4] == 'S')
                if parent == 0:  # the word is the root of the phrase
                    y = sentiment
                s.append([k + 1, parent, sentiment])
                k = k + 1
    return sentences
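
# Sketch of the returned structure (an illustration, not output from the
# original data): the parser above reads space-separated CoNLL rows where
# toks[1] is the word form, toks[6] the head position (0 marks the root) and
# toks[-4] is 'S' for sentiment-bearing tokens. For a 3-token sentence whose
# tokens 2 and 3 attach to token 1, with a sentiment-bearing root,
# load_conll_data would yield
#
#     ([1, 2, 3], array([[ 2,  3],
#                        [-1, -1],
#                        [-1, -1]]), 1)
#
# i.e. (token positions standing in for word ids, post-padded child-position
# lists, sentiment label of the root).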
if __name__ == '__main__':
    s = {'lr': 0.0627142536696559,
         'verbose': 1,
         'decay': False,  # decay the learning rate if improvement stops
         'nepochs': 2,
         'seed': 345,
         'de': 10,        # dimension of the word embeddings
         'nh': 10,        # dimension of the hidden state
         'nc': 2,         # number of y classes
         'ne': 50,        # vocabulary size
         'ds': 10}        # dimension of the sentiment state

    conll_format_data = '/home/rexamine/Doktorat/Opinion_Targets/opta-tagger/train_data/conll-format/train.conll'
    data = load_conll_data(conll_format_data)[0:2]
    nsentences = len(data)

    # instantiate the model
    numpy.random.seed(s['seed'])
    random.seed(s['seed'])
    rnn = model(nh=s['nh'],
                nc=s['nc'],
                ne=s['ne'],
                de=s['de'],
                ds=s['ds'])

    # train  # TODO: early stopping on a validation set
    s['clr'] = s['lr']
    for e in range(s['nepochs']):
        # shuffle the training data (same seed, so the order is reproducible)
        shuffle([data], s['seed'])
        tic = time.time()
        for i in range(nsentences):
            rnn.train(data[i][0], data[i][1], data[i][2], s['clr'])
            rnn.normalize()
            if s['verbose']:
                print('[learning] epoch %i >> %2.2f%% completed in %.2f (sec) <<\r'
                      % (e, (i + 1) * 100. / nsentences, time.time() - tic), end='')
                sys.stdout.flush()
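
    # NOTE (assumption): s['decay'] and s['clr'] mirror the is13 training
    # scripts, where the current learning rate s['clr'] is halved once
    # validation performance stops improving; that decay step is not
    # implemented here.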