Commit cf3852f0d13df6810dca9858fb87f599310567fe

Authored by Bartłomiej Nitoń
1 parent cdcb85d4

Added siamese neural model.
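The commit introduces a second neural architecture, a Siamese network, alongside the existing feed-forward ('simple') model: conf.py gains a NEURAL_MODEL_ARCHITECTURE switch and drops the fixed THRESHOLD constant, main.py exposes the threshold as a command-line option and passes it down to the resolvers, and a new 'siamese' resolver and model initializer are added in resolve.py and utils.py.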

... ... @@ -6,13 +6,15 @@ from gensim.models.word2vec import Word2Vec
6 6  
7 7  
8 8 CONTEXT = 5
9   -THRESHOLD = 0.95
  9 +# THRESHOLD = 0.001
10 10 RANDOM_WORD_VECTORS = True
11 11 W2V_SIZE = 50
12 12 W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model'
13 13  
14   -NUMBER_OF_FEATURES = 1190
15   -NEURAL_MODEL_NAME = 'model_1190_features.h5'
  14 +# simple or siamese
  15 +NEURAL_MODEL_ARCHITECTURE = 'siamese'
  16 +NUMBER_OF_FEATURES = 625
  17 +NEURAL_MODEL_NAME = 'weights_siamese_model.h5'
16 18  
17 19 FREQ_LIST_NAME = 'base.lst'
18 20 LEMMA2SYNONYMS_NAME = 'lemma2synonyms.map'
... ... @@ -28,7 +30,7 @@ W2V_MODEL_PATH = os.path.join(MAIN_PATH, 'models', W2V_MODEL_NAME)
28 30 W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH)
29 31  
30 32 NEURAL_MODEL_PATH = os.path.join(MAIN_PATH, 'models', NEURAL_MODEL_NAME)
31   -NEURAL_MODEL = utils.initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH)
  33 +NEURAL_MODEL = utils.initialize_neural_model(NEURAL_MODEL_ARCHITECTURE, NUMBER_OF_FEATURES, NEURAL_MODEL_PATH)
32 34  
33 35 FREQ_LIST_PATH = os.path.join(MAIN_PATH, 'freq', FREQ_LIST_NAME)
34 36 FREQ_LIST = utils.load_freq_list(FREQ_LIST_PATH)
... ...
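THRESHOLD is commented out rather than removed because the cut-off now arrives per run through the new --threshold option in main.py (below). The commented value also hints at a change in semantics: the Siamese model outputs a distance rather than a similarity, so the suggested default drops from 0.95 to 0.001.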
corneferencer/main.py
... ... @@ -6,6 +6,7 @@ from natsort import natsorted
6 6  
7 7 sys.path.append(os.path.abspath(os.path.join('..')))
8 8  
  9 +import conf
9 10 from inout import mmax
10 11 from inout.constants import INPUT_FORMATS
11 12 from resolvers import resolve
... ... @@ -22,7 +23,11 @@ def main():
22 23 elif args.format not in INPUT_FORMATS:
23 24 eprint("Error: Unknown input file format!")
24 25 else:
25   - process_texts(args.input, args.output, args.format, args.resolver)
  26 + resolver = args.resolver
  27 + if conf.NEURAL_MODEL_ARCHITECTURE == 'siamese':
  28 + resolver = conf.NEURAL_MODEL_ARCHITECTURE
  29 + eprint("Warning: Using %s resolver because of the selected neural model architecture!" % conf.NEURAL_MODEL_ARCHITECTURE)
  30 + process_texts(args.input, args.output, args.format, resolver, args.threshold)
26 31  
27 32  
28 33 def parse_arguments():
... ... @@ -40,21 +45,24 @@ def parse_arguments():
40 45 dest='resolver', default='incremental',
41 46 help='resolve algorithm; default: incremental; possibilities: %s'
42 47 % ', '.join(RESOLVERS))
  48 + parser.add_argument('-t', '--threshold', type=float, action='store',
  49 + dest='threshold', default=0.001,
  50 + help='threshold; default: 0.001')
43 51  
44 52 args = parser.parse_args()
45 53 return args
46 54  
47 55  
48   -def process_texts(inpath, outpath, informat, resolver):
  56 +def process_texts(inpath, outpath, informat, resolver, threshold):
49 57 if os.path.isdir(inpath):
50   - process_directory(inpath, outpath, informat, resolver)
  58 + process_directory(inpath, outpath, informat, resolver, threshold)
51 59 elif os.path.isfile(inpath):
52   - process_file(inpath, outpath, informat, resolver)
  60 + process_file(inpath, outpath, informat, resolver, threshold)
53 61 else:
54 62 eprint("Error: Specified input does not exist!")
55 63  
56 64  
57   -def process_directory(inpath, outpath, informat, resolver):
  65 +def process_directory(inpath, outpath, informat, resolver, threshold):
58 66 inpath = os.path.abspath(inpath)
59 67 outpath = os.path.abspath(outpath)
60 68  
... ... @@ -65,18 +73,20 @@ def process_directory(inpath, outpath, informat, resolver):
65 73 textname = os.path.splitext(os.path.basename(filename))[0]
66 74 textoutput = os.path.join(outpath, textname)
67 75 textinput = os.path.join(inpath, filename)
68   - process_file(textinput, textoutput, informat, resolver)
  76 + process_file(textinput, textoutput, informat, resolver, threshold)
69 77  
70 78  
71   -def process_file(inpath, outpath, informat, resolver):
  79 +def process_file(inpath, outpath, informat, resolver, threshold):
72 80 basename = os.path.basename(inpath)
73 81 if informat == 'mmax' and basename.endswith('.mmax'):
74 82 print (basename)
75 83 text = mmax.read(inpath)
76 84 if resolver == 'incremental':
77   - resolve.incremental(text)
  85 + resolve.incremental(text, threshold)
78 86 elif resolver == 'entity_based':
79   - resolve.entity_based(text)
  87 + resolve.entity_based(text, threshold)
  88 + elif resolver == 'siamese':
  89 + resolve.siamese(text, threshold)
80 90 mmax.write(inpath, outpath, text)
81 91  
82 92  
... ...
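A hypothetical invocation with the new option (only -t/--threshold is spelled out in this diff; the -i, -o, -f and -r flags for args.input, args.output, args.format and args.resolver are assumptions for illustration):

    python corneferencer/main.py -i texts/ -o out/ -f mmax -r incremental -t 0.85

When conf.NEURAL_MODEL_ARCHITECTURE is 'siamese', the requested resolver is overridden and a warning is printed, since the two-input Siamese model cannot be driven through the single-vector incremental or entity_based paths.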
corneferencer/resolvers/constants.py
1 1 # -*- coding: utf-8 -*-
2 2  
3   -RESOLVERS = ['entity_based', 'incremental']
  3 +RESOLVERS = ['entity_based', 'incremental', 'siamese']
4 4  
5 5 NOUN_TAGS = ['subst', 'ger', 'depr']
6 6 PPRON_TAGS = ['ppron12', 'ppron3']
... ...
corneferencer/resolvers/resolve.py
1 1 import numpy
2 2  
3   -from conf import NEURAL_MODEL, THRESHOLD
  3 +from conf import NEURAL_MODEL  # , THRESHOLD
4 4 from corneferencer.resolvers import features
5   -from corneferencer.resolvers.vectors import get_pair_vector
  5 +from corneferencer.resolvers.vectors import get_pair_features, get_pair_vector
  6 +
  7 +
  8 +# siamese resolve algorithm (earlier best-match variant, kept commented for reference)
  9 +# def siamese(text):
  10 +# last_set_id = 0
  11 +# for i, ana in enumerate(text.mentions):
  12 +# if i > 0:
  13 +# best_prediction = 20.0
  14 +# best_ante = None
  15 +# for ante in text.mentions[:i]:
  16 +# if not features.pair_intersect(ante, ana):
  17 +# pair_features = get_pair_features(ante, ana)
  18 +#
  19 +# ante_vec = []
  20 +# ante_vec.extend(ante.features)
  21 +# ante_vec.extend(pair_features)
  22 +# ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32)
  23 +#
  24 +# ana_vec = []
  25 +# ana_vec.extend(ana.features)
  26 +# ana_vec.extend(pair_features)
  27 +# ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32)
  28 +#
  29 +# prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0]
  30 +#
  31 +# print (ante.text, '--->', ana.text, '>>', prediction)
  32 +#
  33 +# if prediction < THRESHOLD and prediction < best_prediction:
  34 +# best_prediction = prediction
  35 +# best_ante = ante
  36 +# if best_ante is not None:
  37 +# if best_ante.set:
  38 +# ana.set = best_ante.set
  39 +# else:
  40 +# str_set_id = 'set_%d' % last_set_id
  41 +# best_ante.set = str_set_id
  42 +# ana.set = str_set_id
  43 +# last_set_id += 1
  44 +
  45 +
  46 +def siamese(text, threshold):
  47 + last_set_id = 0
  48 + for i, ana in enumerate(text.mentions):
  49 + if i > 0:
  50 + for ante in reversed(text.mentions[:i]):
  51 + if not features.pair_intersect(ante, ana):
  52 + pair_features = get_pair_features(ante, ana)
  53 +
  54 + ante_vec = []
  55 + ante_vec.extend(ante.features)
  56 + ante_vec.extend(pair_features)
  57 + ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32)
  58 +
  59 + ana_vec = []
  60 + ana_vec.extend(ana.features)
  61 + ana_vec.extend(pair_features)
  62 + ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32)
  63 +
  64 + prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0]
  65 +
  66 + if prediction < threshold:
  67 + if ante.set:
  68 + ana.set = ante.set
  69 + else:
  70 + str_set_id = 'set_%d' % last_set_id
  71 + ante.set = str_set_id
  72 + ana.set = str_set_id
  73 + last_set_id += 1
  74 + break
6 75  
7 76  
8 77 # incremental resolve algorithm
9   -def incremental(text):
  78 +def incremental(text, threshold):
10 79 last_set_id = 0
11 80 for i, ana in enumerate(text.mentions):
12 81 if i > 0:
... ... @@ -17,13 +86,10 @@ def incremental(text):
17 86 pair_vec = get_pair_vector(ante, ana)
18 87 sample = numpy.asarray([pair_vec], dtype=numpy.float32)
19 88 prediction = NEURAL_MODEL.predict(sample)[0]
20   - if prediction > THRESHOLD and prediction >= best_prediction:
  89 + if prediction > threshold and prediction >= best_prediction:
21 90 best_prediction = prediction
22 91 best_ante = ante
23 92 if best_ante is not None:
24   - # print ('wynik')
25   - # print(best_ante.text, best_prediction, ana.text)
26   - # print (best_ante.set, ana.set)
27 93 if best_ante.set:
28 94 ana.set = best_ante.set
29 95 else:
... ... @@ -34,13 +100,12 @@ def incremental(text):
34 100  
35 101  
36 102 # entity based resolve algorithm
37   -def entity_based(text):
  103 +def entity_based(text, threshold):
38 104 sets = []
39 105 last_set_id = 0
40 106 for i, ana in enumerate(text.mentions):
41 107 if i > 0:
42   - # print ('!!!!!!!!!!%s!!!!!!!!!!!!' % ana.text)
43   - best_fit = get_best_set(sets, ana)
  108 + best_fit = get_best_set(sets, ana, threshold)
44 109 if best_fit is not None:
45 110 ana.set = best_fit['set_id']
46 111 best_fit['mentions'].append(ana)
... ... @@ -56,22 +121,16 @@ def entity_based(text):
56 121 'mentions': [ana]})
57 122 ana.set = str_set_id
58 123 last_set_id += 1
59   - # print (ana.set)
60   - # for ss in sets:
61   - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']]))
62 124  
63 125 remove_singletons(sets)
64   - # print (';'.join([ss['set_id'] for ss in sets]))
65   - # for ss in sets:
66   - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']]))
67 126  
68 127  
69   -def get_best_set(sets, ana):
  128 +def get_best_set(sets, ana, threshold):
70 129 best_prediction = 0.0
71 130 best_set = None
72 131 for s in sets:
73 132 accuracy = predict_set(s['mentions'], ana)
74   - if accuracy > THRESHOLD and accuracy >= best_prediction:
  133 + if accuracy > threshold and accuracy >= best_prediction:
75 134 best_prediction = accuracy
76 135 best_set = s
77 136 return best_set
... ... @@ -86,7 +145,6 @@ def predict_set(mentions, ana):
86 145 sample = numpy.asarray([pair_vec], dtype=numpy.float32)
87 146 prediction = NEURAL_MODEL.predict(sample)[0]
88 147 prediction_sum += prediction
89   - # print(mnt.text, prediction, ana.text)
90 148 return prediction_sum / float(len(mentions))
91 149  
92 150  
... ...
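Two details distinguish the new siamese() resolver from incremental(): the model's output is a Euclidean distance, so lower means better and the test is prediction < threshold (the other resolvers keep prediction > threshold); and antecedent candidates are scanned via reversed(...) with a break on the first hit, linking the anaphor to the nearest acceptable antecedent rather than the best-scoring one, as the commented-out best-match variant above did.

A minimal sketch of a single scoring step, assuming NUMBER_OF_FEATURES = 625 and toy zero vectors in place of the real mention and pair features:

    import numpy
    # toy inputs; the real vectors concatenate ante.features/ana.features
    # with get_pair_features(ante, ana)
    ante_sample = numpy.zeros((1, 625), dtype=numpy.float32)
    ana_sample = numpy.zeros((1, 625), dtype=numpy.float32)
    # the Siamese model returns a distance: lower = more likely coreferent
    distance = NEURAL_MODEL.predict([ante_sample, ana_sample])[0]
    if distance < 0.001:  # default threshold from main.py
        pass  # link ana into ante's coreference set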
corneferencer/utils.py
... ... @@ -5,15 +5,26 @@ import sys
5 5  
6 6 import javaobj
7 7  
8   -from keras.models import Model
9   -from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
  8 +from keras.models import Sequential, Model
  9 +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda
  10 +from keras.optimizers import RMSprop, Adam
  11 +from keras import backend as K
10 12  
11 13  
12 14 def eprint(*args, **kwargs):
13 15 print(*args, file=sys.stderr, **kwargs)
14 16  
15 17  
16   -def initialize_neural_model(number_of_features, path_to_model):
  18 +def initialize_neural_model(architecture, number_of_features, path_to_model):
  19 + model = None
  20 + if architecture == 'simple':
  21 + model = initialize_simple_model(number_of_features, path_to_model)
  22 + elif architecture == 'siamese':
  23 + model = initialize_siamese_model(number_of_features, path_to_model)
  24 + return model
  25 +
  26 +
  27 +def initialize_simple_model(number_of_features, path_to_model):
17 28 inputs = Input(shape=(number_of_features,))
18 29  
19 30 output_from_1st_layer = Dense(1000, activation='relu')(inputs)
... ... @@ -37,6 +48,60 @@ def initialize_neural_model(number_of_features, path_to_model):
37 48 return model
38 49  
39 50  
  51 +def initialize_siamese_model(number_of_features, path_to_model):
  52 + input_dim = number_of_features
  53 +
  54 + base_network = create_base_network(input_dim)
  55 +
  56 + input_a = Input(shape=(input_dim,))
  57 + input_b = Input(shape=(input_dim,))
  58 +
  59 + processed_a = base_network(input_a)
  60 + processed_b = base_network(input_b)
  61 +
  62 + distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b])
  63 +
  64 + model = Model([input_a, input_b], distance)
  65 + model.compile(loss=contrastive_loss, optimizer='Adam')
  66 + model.load_weights(path_to_model)
  67 +
  68 + return model
  69 +
  70 +
  71 +def create_base_network(input_dim):
  72 + '''Base network to be shared'''
  73 + seq = Sequential()
  74 +
  75 + seq.add(Dense(1000, input_shape=(input_dim,), activation='relu'))
  76 + seq.add(Dropout(0.2))
  77 + seq.add(BatchNormalization())
  78 +
  79 + seq.add(Dense(500, activation='relu'))
  80 + seq.add(Dropout(0.2))
  81 + seq.add(BatchNormalization())
  82 +
  83 + seq.add(Dense(300, activation='relu'))
  84 + return seq
  85 +
  86 +
  87 +def euclidean_distance(vects):
  88 + x, y = vects
  89 + return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon()))
  90 +
  91 +
  92 +def eucl_dist_output_shape(shapes):
  93 + shape1, shape2 = shapes
  94 + return (shape1[0], 1)
  95 +
  96 +
  97 +def contrastive_loss(y_true, y_pred):
  98 + '''Contrastive loss from Hadsell-et-al.'06
  99 + http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
  100 + '''
  101 + margin = 1
  102 + return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0)))
  103 +
  104 +
40 105 def load_freq_list(freq_path):
41 106 freq_list = {}
42 107 with codecs.open(freq_path, 'r', 'utf-8') as freq_file:
... ...
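The Siamese initializer runs one shared base network (1000 -> 500 -> 300 units with dropout and batch normalization) over both inputs and wires the two 300-dimensional embeddings into a Lambda layer computing their Euclidean distance. The model is compiled with the contrastive loss of Hadsell et al. (2006), which pulls coreferent pairs (y_true = 1) toward distance 0 and pushes non-coreferent pairs (y_true = 0) out to the margin of 1.

A quick numeric check of contrastive_loss for a single pair at distance 0.2, following the formula above with margin = 1:

    margin = 1.0
    d = 0.2
    # coreferent pair (y_true = 1): only the d^2 term survives
    print(1 * d**2 + (1 - 1) * max(margin - d, 0)**2)  # 0.04
    # non-coreferent pair (y_true = 0): only the hinge term survives
    print(0 * d**2 + (1 - 0) * max(margin - d, 0)**2)  # 0.64

Training therefore drives coreferent pairs toward distance 0 and non-coreferent pairs beyond distance 1, which is why the resolver treats a small predicted distance as a positive link.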