load.py
2.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import gzip
import cPickle
import urllib
import os
import random
import pickle
from os.path import isfile
# Optional prefix prepended (by plain string concatenation) to the data file
# names used by atisfull() and atisfold(); taken from the ATISDATA environment
# variable and empty when that variable is unset.
PREFIX = os.getenv('ATISDATA', '')
def download(origin):
    '''
    Download the file at URL `origin` into the current working directory.

    The local file name is the last '/'-separated component of `origin`.
    Historically used for the ATIS files hosted under
    http://www-etud.iro.umontreal.ca/~mesnilgr/atis/

    Returns None; errors raised by urlretrieve propagate to the caller.
    '''
    # urlretrieve lives in urllib on Python 2 and in urllib.request on
    # Python 3; resolve it locally so this function works on both.
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    print('Downloading data from %s' % origin)
    name = origin.split('/')[-1]
    urlretrieve(origin, name)
def download_dropbox():
    '''
    Download atis.pkl from Dropbox into the current working directory.

    Runs the external `wget` program and writes its output to ./atis.pkl.

    Raises:
        OSError: if `wget` cannot be started.
        subprocess.CalledProcessError: if `wget` exits with a non-zero status.
    '''
    import subprocess

    url = 'https://www.dropbox.com/s/3lxl9jsbw0j7h8a/atis.pkl?dl=0'
    target = 'atis.pkl'
    print('Downloading data from %s' % url)
    try:
        # Argument list instead of a shell string: no shell is involved, so
        # the '?' in the URL cannot be glob-expanded, and a failing wget is
        # reported instead of being silently ignored.
        subprocess.check_call(['wget', '-O', target, url])
    except (OSError, subprocess.CalledProcessError):
        # A failed transfer can leave a partial/empty output file behind;
        # remove it so a later isfile() check triggers a fresh download.
        if isfile(target):
            os.remove(target)
        raise
def load_dropbox(filename):
    '''
    Return `filename` opened for binary reading, downloading it first if
    it does not exist.

    The caller is responsible for closing the returned file object.
    '''
    if not isfile(filename):
        download_dropbox()
        # download_dropbox() always writes ./atis.pkl; when `filename` points
        # somewhere else (e.g. a non-empty PREFIX), move the file there so
        # the open() below finds it.
        fetched = 'atis.pkl'
        if os.path.abspath(fetched) != os.path.abspath(filename):
            import shutil
            shutil.move(fetched, filename)
    return open(filename, 'rb')
def load_udem(filename):
    '''
    Return the gzip file `filename` opened for binary reading, downloading
    it from the UdeM server first if it does not exist.

    The caller is responsible for closing the returned file object.
    '''
    if not isfile(filename):
        # Only the bare file name belongs in the URL; any directory part of
        # `filename` (e.g. coming from PREFIX) is a local path.
        basename = os.path.basename(filename)
        download('http://lisaweb.iro.umontreal.ca/transfert/lisa/users/mesnilgr/atis/' + basename)
        # download() saves into the current directory under `basename`;
        # move it to the requested location if that is somewhere else.
        if os.path.abspath(basename) != os.path.abspath(filename):
            import shutil
            shutil.move(basename, filename)
    return gzip.open(filename, 'rb')
def atisfull():
    '''
    Load the full ATIS dataset from PREFIX + 'atis.pkl'.

    The file is fetched via load_dropbox() when missing.

    Returns:
        (train_set, test_set, dicts): the three objects stored in the pickle.
    '''
    f = load_dropbox(PREFIX + 'atis.pkl')
    try:
        # NOTE(security): unpickling can execute arbitrary code and this file
        # may have just been downloaded -- only use with a trusted source.
        train_set, test_set, dicts = cPickle.load(f)
    finally:
        # Close the handle even when unpickling fails.
        f.close()
    return train_set, test_set, dicts
def atisfold(fold):
    '''
    Load one cross-validation fold of the ATIS dataset.

    Args:
        fold: fold index, one of 0, 1, 2, 3, 4.

    The data is read from PREFIX + 'atis.fold<fold>.pkl.gz', fetched via
    load_udem() when missing.

    Returns:
        (train_set, valid_set, test_set, dicts): the four objects stored in
        the pickle.
    '''
    assert fold in range(5)
    f = load_udem(PREFIX + 'atis.fold' + str(fold) + '.pkl.gz')
    try:
        # NOTE(security): unpickling can execute arbitrary code and this file
        # may have just been downloaded -- only use with a trusted source.
        train_set, valid_set, test_set, dicts = cPickle.load(f)
    finally:
        # Close the gzip handle even when unpickling fails.
        f.close()
    return train_set, valid_set, test_set, dicts
if __name__ == '__main__':
    # Visualize a few sentences: dump the training sentences as words to a
    # pickle file, then print every sentence word by word next to its label.
    import pdb

    w2ne, w2la = {}, {}
    # NOTE(review): the result of atisfull() is overwritten by the next line;
    # the call is kept so that the script still fetches/loads atis.pkl as before.
    train, test, dic = atisfull()
    train, _, test, dic = atisfold(1)

    w2idx, ne2idx, labels2idx = dic['words2idx'], dic['tables2idx'], dic['labels2idx']

    # Inverse mappings (index -> string). items() works on Python 2 and 3.
    idx2w = dict((v, k) for k, v in w2idx.items())
    idx2ne = dict((v, k) for k, v in ne2idx.items())
    idx2la = dict((v, k) for k, v in labels2idx.items())

    test_x, test_ne, test_label = test
    train_x, train_ne, train_label = train

    sentences_from_train_x = [[idx2w[wx] for wx in sent] for sent in train_x]

    # NOTE(review): machine-specific absolute path; adjust before running elsewhere.
    sentences_path = '/home/rexamine/doktorat/RNN/is13/data/sentences_from_train_x'
    with open(sentences_path, 'wb') as output:
        pickle.dump(sentences_from_train_x, output)

    # Read the dump back and show the first sentence as a sanity check.
    with open(sentences_path, 'rb') as output:
        c = pickle.load(output)
    print(c[0])

    wlength = 35

    # Explicit lookup table instead of eval() on constructed variable names.
    splits = {
        'train': (train_x, train_ne, train_label),
        'test': (test_x, test_ne, test_label),
    }
    for e in ['train', 'test']:
        for sw, se, sl in zip(*splits[e]):
            # Build one string per line: print(a, b) would print a tuple on
            # Python 2, where print is a statement.
            print('WORD'.rjust(wlength) + ' ' + 'LABEL'.rjust(wlength))
            for wx, la in zip(sw, sl):
                print(idx2w[wx].rjust(wlength) + ' ' + idx2la[la].rjust(wlength))
            print('\n' + '**' * 30 + '\n')
            # Pause after each sentence. NOTE(review): the original nesting of
            # this call was lost with the indentation; per-sentence is assumed.
            pdb.set_trace()