reduce_cluster.py
1.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#! /usr/bin/python3
import numpy as np
TRESHOLD_FACTOR = 0.9
STEP = 0.01
def perform_reduce(matrix, keys, elements):
    """Remove the given indices from a square matrix and its key list.

    For every index in ``elements`` the matching row and column are
    dropped from ``matrix`` and the matching entry is dropped from
    ``keys``.

    Args:
        matrix: 2-D NumPy array indexed the same way on both axes.
        keys: Sequence/array of labels, one per row of ``matrix``.
        elements: Iterable of integer indices to remove. Duplicates are
            ignored (each index is removed at most once).

    Returns:
        Tuple ``(new_matrix, new_keys)``. When ``elements`` is empty the
        original ``matrix`` and ``keys`` objects are returned unchanged.
    """
    # Deduplicate first: deleting the same index twice one-at-a-time would
    # silently remove a neighbouring row/column that happened to shift into
    # that position.
    drop = np.unique(np.asarray(list(elements), dtype=np.intp))
    if drop.size == 0:
        return (matrix, keys)

    # One np.delete call per axis removes all indices relative to the
    # original layout, so no index shifting has to be accounted for.
    new_matrix = np.delete(np.delete(matrix, drop, axis=0), drop, axis=1)
    new_keys = np.delete(keys, drop, axis=0)
    return (new_matrix, new_keys)
def calculate_average_similarities(matrix):
    """Return, for each row, the mean of its off-diagonal entries.

    For row ``r`` the entries left and right of the diagonal position
    ``matrix[r, r]`` are summed and divided by ``rows - 1``, where ``rows``
    is ``matrix.shape[0]``.

    Args:
        matrix: 2-D NumPy array; presumably a square similarity matrix
            (TODO confirm against callers).

    Returns:
        A list with one average per row of ``matrix``.
    """
    size = matrix.shape[0]
    others = size - 1  # number of entries in a row excluding the diagonal
    return [
        ((np.sum(matrix[r, :r]) + np.sum(matrix[r, (r + 1):])) * 1.0) / others
        for r in range(size)
    ]
def reduce_cluster_iteration(matrix, keys, normalised_treshold):
    """Run one pruning pass over the cluster.

    The per-row average similarities are computed. If either the smallest
    of them is below ``TRESHOLD_FACTOR * normalised_treshold`` or their
    overall average is below ``normalised_treshold``, every row whose
    average is less than ``smallest + STEP`` is removed from ``matrix``
    and ``keys``.

    Args:
        matrix: 2-D NumPy similarity matrix.
        keys: Labels corresponding to the rows of ``matrix``.
        normalised_treshold: Similarity threshold used by both checks.

    Returns:
        ``(True, new_matrix, new_keys)`` when elements were removed,
        otherwise ``(False, matrix, keys)`` with the inputs untouched.
    """
    row_averages = calculate_average_similarities(matrix)
    overall_average = np.average(row_averages)
    lowest_average = np.min(row_averages)

    weakest_too_low = lowest_average < (TRESHOLD_FACTOR * normalised_treshold)
    overall_too_low = overall_average < normalised_treshold
    if not (weakest_too_low or overall_too_low):
        return (False, matrix, keys)

    # Drop the weakest row together with every row within STEP of it.
    cutoff = lowest_average + STEP
    to_remove = [idx for idx, value in enumerate(row_averages) if value < cutoff]
    reduced_matrix, reduced_keys = perform_reduce(matrix, keys, to_remove)
    return (True, reduced_matrix, reduced_keys)
def reduce_cluster(matrix, keys, normalised_treshold, min_size):
    """Repeatedly prune a cluster until it is stable or too small.

    ``reduce_cluster_iteration`` is applied over and over. Before every
    pass the number of remaining keys is compared with ``min_size``.

    Args:
        matrix: 2-D NumPy similarity matrix.
        keys: Labels corresponding to the rows of ``matrix``.
        normalised_treshold: Threshold forwarded to each pruning pass.
        min_size: If the number of remaining keys is less than or equal
            to this value, the cluster is discarded.

    Returns:
        An empty list when the cluster size is at or below ``min_size``
        before a pass; otherwise the keys remaining after the first pass
        that removes nothing.
    """
    current_matrix, current_keys = matrix, keys
    while True:
        if len(current_keys) <= min_size:
            return []
        changed, current_matrix, current_keys = reduce_cluster_iteration(
            current_matrix, current_keys, normalised_treshold
        )
        if not changed:
            return current_keys