tools.py
import random
import numpy
from keras.preprocessing import sequence as seq
def shuffle(lol, seed):
    '''
    lol :: list of lists
    seed :: seed for the shuffling
    Shuffle each list in place, all in the same order (the same seed is
    re-applied before shuffling each list).
    '''
    for l in lol:
        random.seed(seed)
        random.shuffle(l)

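# A minimal usage sketch for shuffle(): lists of equal length receive the same
# permutation under the same seed, so parallel lists stay aligned. The data
# below is made up for illustration.
if __name__ == '__main__':
    xs = ['a', 'b', 'c', 'd']
    ys = [1, 2, 3, 4]
    shuffle([xs, ys], seed=345)
    print(xs, ys)  # same permutation for both, so xs[i] still pairs with ys[i]
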
def minibatch(l, bs):
    '''
    l :: list of word indexes
    Return a list of minibatches of indexes, each of size at most bs.
    Border cases are treated as follows:
    e.g. [0,1,2,3] and bs = 3
    will output:
    [[0],[0,1],[0,1,2],[1,2,3]]
    '''
    out = [l[:i] for i in range(1, min(bs, len(l) + 1))]
    out += [l[i - bs:i] for i in range(bs, len(l) + 1)]
    assert len(l) == len(out)
    return out

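# A quick check of the border-case behaviour documented above; the call and
# the expected output mirror the docstring example.
if __name__ == '__main__':
    print(minibatch([0, 1, 2, 3], 3))  # [[0], [0, 1], [0, 1, 2], [1, 2, 3]]
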
def words_in_from_down_to_top_order(sentence_tree):
    # The root is the only word index that never appears as anybody's child.
    levels = numpy.setdiff1d(range(len(sentence_tree)), numpy.unique(sentence_tree))
    if len(levels) != 1:
        # More than one root (or none at all): return None so the caller can
        # recognise such a sentence later and throw it away.
        return None, None
    levels = levels.tolist()
    # Breadth-first traversal from the root; -1 entries are padding.
    for i in range(len(sentence_tree)):
        levels.extend(numpy.setdiff1d(sentence_tree[levels[i]], -1))
    levels = numpy.array(levels)
    # Reverse the order so that the deepest words come first.
    ordered_words = levels[levels != -1][::-1]
    # order[w] gives the position of word w in the bottom-up ordering.
    order = numpy.zeros(len(sentence_tree), dtype='int')
    for i in range(len(sentence_tree)):
        order[ordered_words[i]] = i
    return ordered_words, order

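# A small illustration of the bottom-up ordering on a hypothetical 4-word
# sentence: word 2 is the root, its children are words 0 and 3, and word 0
# has child 1; rows are padded with -1, as pad_sequences produces below.
if __name__ == '__main__':
    tree = numpy.array([[1, -1],
                        [-1, -1],
                        [0, 3],
                        [-1, -1]])
    ordered_words, order = words_in_from_down_to_top_order(tree)
    print(ordered_words)  # [1 3 0 2] -- deepest words first, root last
    print(order)          # [2 0 3 1] -- position of each word in that ordering
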
def load_conll_data(conll_format_data, words2ids):
    label_trans = {'_\n': 0, 'A\n': 1, 'A': 1, 'T': 1, 'T\n': 1}  # label mapping (defined but not used below)
    sentences = []
    k = 0
    with open(conll_format_data, encoding='utf-8') as fr:
        s = []
        for line in fr:
            if len(line) < 2:
                # Blank line: the sentence is complete, build its tree.
                for i in range(len(s)):
                    children = []
                    for j in range(len(s)):
                        if s[j][1] == i + 1:
                            children.append(s[j][0])
                    s[i].append(children)
                words = [x[0] for x in s]
                children = seq.pad_sequences([x[4] for x in s], padding='post', value=-1)
                tokens = [x[3] for x in s]
                if len(s) == 1:
                    sentences.append([
                        numpy.array([words2ids[tokens[0]]]),
                        numpy.array([-1], ndmin=2),
                        numpy.array([-1], ndmin=2),
                        y
                    ])
                else:
                    ordered_words, order = words_in_from_down_to_top_order(children)
                    if ordered_words is None:
                        # Two or more roots (or none at all): treat the sentence
                        # as malformed and leave it out of the data.
                        s = []
                        k = 0
                        continue
                    sentences.append([
                        numpy.array([words2ids[x] for x in tokens])[ordered_words],
                        numpy.array([[words2ids[tokens[w]] if w >= 0 else -1 for w in x]
                                     for x in children[ordered_words]]),
                        numpy.array([[order[w] if w >= 0 else -1 for w in x]
                                     for x in children[ordered_words]]),
                        y
                    ])
                s = []
                k = 0
            else:
                toks = line.split(' ')
                token = toks[1]
                parent = int(toks[6])
                sentiment = int(toks[-1] == 'S' or toks[-1] == 'S\n')
                if parent == 0:
                    # A head index of 0 means this word is the root of the phrase.
                    y = sentiment
                s.append([k, parent, sentiment, token])
                k = k + 1
    return sentences

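# A minimal end-to-end sketch of load_conll_data() on a made-up two-word
# sentence written in the space-separated layout this parser expects
# (column 2 = token, column 7 = head, last column = 'S' for a sentiment-bearing
# word); the temporary file path and words2ids vocabulary are hypothetical.
if __name__ == '__main__':
    import os
    import tempfile
    toy = ("1 very very ADV _ _ 2 advmod _ _\n"
           "2 good good ADJ _ _ 0 root _ S\n"
           "\n")
    words2ids = {'very': 0, 'good': 1}
    with tempfile.NamedTemporaryFile('w', suffix='.conll',
                                     delete=False, encoding='utf-8') as f:
        f.write(toy)
        path = f.name
    sentences = load_conll_data(path, words2ids)
    os.remove(path)
    # Each entry holds: word ids in bottom-up order, children as word ids,
    # children as positions in that order, and the root sentiment label y.
    print(sentences[0])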