Commit cf3852f0d13df6810dca9858fb87f599310567fe
1 parent cdcb85d4
Added siamese neural model.
Showing 5 changed files with 171 additions and 36 deletions
conf.py
... | ... | @@ -6,13 +6,15 @@ from gensim.models.word2vec import Word2Vec |
6 | 6 | |
7 | 7 | |
8 | 8 | CONTEXT = 5 |
9 | -THRESHOLD = 0.95 | |
9 | +# THRESHOLD = 0.001 | |
10 | 10 | RANDOM_WORD_VECTORS = True |
11 | 11 | W2V_SIZE = 50 |
12 | 12 | W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model' |
13 | 13 | |
14 | -NUMBER_OF_FEATURES = 1190 | |
15 | -NEURAL_MODEL_NAME = 'model_1190_features.h5' | |
14 | +# simple or siamese | |
15 | +NEURAL_MODEL_ARCHITECTURE = 'siamese' | |
16 | +NUMBER_OF_FEATURES = 625 | |
17 | +NEURAL_MODEL_NAME = 'weights_siamese_model.h5' | |
16 | 18 | |
17 | 19 | FREQ_LIST_NAME = 'base.lst' |
18 | 20 | LEMMA2SYNONYMS_NAME = 'lemma2synonyms.map' |
... | ... | @@ -28,7 +30,7 @@ W2V_MODEL_PATH = os.path.join(MAIN_PATH, 'models', W2V_MODEL_NAME) |
28 | 30 | W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH) |
29 | 31 | |
30 | 32 | NEURAL_MODEL_PATH = os.path.join(MAIN_PATH, 'models', NEURAL_MODEL_NAME) |
31 | -NEURAL_MODEL = utils.initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
33 | +NEURAL_MODEL = utils.initialize_neural_model(NEURAL_MODEL_ARCHITECTURE, NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
32 | 34 | |
33 | 35 | FREQ_LIST_PATH = os.path.join(MAIN_PATH, 'freq', FREQ_LIST_NAME) |
34 | 36 | FREQ_LIST = utils.load_freq_list(FREQ_LIST_PATH) |
... | ... |
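Note: the hard-coded THRESHOLD is commented out above because the cut-off is now passed in at runtime (see the new --threshold option in main.py below). For orientation, the two configurations the new NEURAL_MODEL_ARCHITECTURE switch selects between, reconstructed from the values visible in this diff (a sketch, not additional commit content):

    # simple pair classifier - one concatenated pair vector
    NEURAL_MODEL_ARCHITECTURE = 'simple'
    NUMBER_OF_FEATURES = 1190
    NEURAL_MODEL_NAME = 'model_1190_features.h5'

    # siamese twin network - one feature vector per mention
    NEURAL_MODEL_ARCHITECTURE = 'siamese'
    NUMBER_OF_FEATURES = 625
    NEURAL_MODEL_NAME = 'weights_siamese_model.h5'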
corneferencer/main.py
... | ... | @@ -6,6 +6,7 @@ from natsort import natsorted |
6 | 6 | |
7 | 7 | sys.path.append(os.path.abspath(os.path.join('..'))) |
8 | 8 | |
9 | +import conf | |
9 | 10 | from inout import mmax |
10 | 11 | from inout.constants import INPUT_FORMATS |
11 | 12 | from resolvers import resolve |
... | ... | @@ -22,7 +23,11 @@ def main(): |
22 | 23 | elif args.format not in INPUT_FORMATS: |
23 | 24 | eprint("Error: Unknown input file format!") |
24 | 25 | else: |
25 | - process_texts(args.input, args.output, args.format, args.resolver) | |
26 | + resolver = args.resolver | |
27 | + if conf.NEURAL_MODEL_ARCHITECTURE == 'siamese': | |
28 | + resolver = conf.NEURAL_MODEL_ARCHITECTURE | |
29 | + eprint ("Warning: Using %s resolver because of selected neural model architecture!" % conf.NEURAL_MODEL_ARCHITECTURE) | |
30 | + process_texts(args.input, args.output, args.format, resolver, args.threshold) | |
26 | 31 | |
27 | 32 | |
28 | 33 | def parse_arguments(): |
... | ... | @@ -40,21 +45,24 @@ def parse_arguments(): |
40 | 45 | dest='resolver', default='incremental', |
41 | 46 | help='resolve algorithm; default: incremental; possibilities: %s' |
42 | 47 | % ', '.join(RESOLVERS)) |
48 | + parser.add_argument('-t', '--threshold', type=float, action='store', | |
49 | + dest='threshold', default=0.001, | |
50 | + help='decision threshold; similarity floor for incremental/entity_based, distance ceiling for siamese; default: 0.001') | |
43 | 51 | |
44 | 52 | args = parser.parse_args() |
45 | 53 | return args |
46 | 54 | |
47 | 55 | |
48 | -def process_texts(inpath, outpath, informat, resolver): | |
56 | +def process_texts(inpath, outpath, informat, resolver, threshold): | |
49 | 57 | if os.path.isdir(inpath): |
50 | - process_directory(inpath, outpath, informat, resolver) | |
58 | + process_directory(inpath, outpath, informat, resolver, threshold) | |
51 | 59 | elif os.path.isfile(inpath): |
52 | - process_file(inpath, outpath, informat, resolver) | |
60 | + process_file(inpath, outpath, informat, resolver, threshold) | |
53 | 61 | else: |
54 | 62 | eprint("Error: Specified input does not exist!") |
55 | 63 | |
56 | 64 | |
57 | -def process_directory(inpath, outpath, informat, resolver): | |
65 | +def process_directory(inpath, outpath, informat, resolver, threshold): | |
58 | 66 | inpath = os.path.abspath(inpath) |
59 | 67 | outpath = os.path.abspath(outpath) |
60 | 68 | |
... | ... | @@ -65,18 +73,20 @@ def process_directory(inpath, outpath, informat, resolver): |
65 | 73 | textname = os.path.splitext(os.path.basename(filename))[0] |
66 | 74 | textoutput = os.path.join(outpath, textname) |
67 | 75 | textinput = os.path.join(inpath, filename) |
68 | - process_file(textinput, textoutput, informat, resolver) | |
76 | + process_file(textinput, textoutput, informat, resolver, threshold) | |
69 | 77 | |
70 | 78 | |
71 | -def process_file(inpath, outpath, informat, resolver): | |
79 | +def process_file(inpath, outpath, informat, resolver, threshold): | |
72 | 80 | basename = os.path.basename(inpath) |
73 | 81 | if informat == 'mmax' and basename.endswith('.mmax'): |
74 | 82 | print (basename) |
75 | 83 | text = mmax.read(inpath) |
76 | 84 | if resolver == 'incremental': |
77 | - resolve.incremental(text) | |
85 | + resolve.incremental(text, threshold) | |
78 | 86 | elif resolver == 'entity_based': |
79 | - resolve.entity_based(text) | |
87 | + resolve.entity_based(text, threshold) | |
88 | + elif resolver == 'siamese': | |
89 | + resolve.siamese(text, threshold) | |
80 | 90 | mmax.write(inpath, outpath, text) |
81 | 91 | |
82 | 92 | |
... | ... |
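The threshold that used to be fixed in conf.py is now a command-line argument, and the resolver choice is overridden whenever the configured model is siamese, presumably because the single-input pair resolvers cannot feed a two-input siamese model. A hypothetical invocation (only -t/--threshold is visible in this hunk; the -i, -o, -f and -r flag names are assumptions inferred from the argparse dest names):

    python corneferencer/main.py -i texts_in -o texts_out -f mmax -r incremental -t 0.85

Keep in mind that the threshold's meaning depends on the resolver: incremental and entity_based treat it as a minimum similarity (prediction > threshold), while siamese treats it as a maximum distance (prediction < threshold), which is why the default drops from the old 0.95 to 0.001.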
corneferencer/resolvers/constants.py
corneferencer/resolvers/resolve.py
1 | 1 | import numpy |
2 | 2 | |
3 | -from conf import NEURAL_MODEL, THRESHOLD | |
3 | +from conf import NEURAL_MODEL  # , THRESHOLD | |
4 | 4 | from corneferencer.resolvers import features |
5 | -from corneferencer.resolvers.vectors import get_pair_vector | |
5 | +from corneferencer.resolvers.vectors import get_pair_features, get_pair_vector | |
6 | + | |
7 | + | |
8 | +# siamese resolve algorithm | |
9 | +# def siamese(text): | |
10 | +# last_set_id = 0 | |
11 | +# for i, ana in enumerate(text.mentions): | |
12 | +# if i > 0: | |
13 | +# best_prediction = 20.0 | |
14 | +# best_ante = None | |
15 | +# for ante in text.mentions[:i]: | |
16 | +# if not features.pair_intersect(ante, ana): | |
17 | +# pair_features = get_pair_features(ante, ana) | |
18 | +# | |
19 | +# ante_vec = [] | |
20 | +# ante_vec.extend(ante.features) | |
21 | +# ante_vec.extend(pair_features) | |
22 | +# ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32) | |
23 | +# | |
24 | +# ana_vec = [] | |
25 | +# ana_vec.extend(ana.features) | |
26 | +# ana_vec.extend(pair_features) | |
27 | +# ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32) | |
28 | +# | |
29 | +# prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0] | |
30 | +# | |
31 | +# print (ante.text, '--->', ana.text, '>>', prediction) | |
32 | +# | |
33 | +# if prediction < THRESHOLD and prediction < best_prediction: | |
34 | +# best_prediction = prediction | |
35 | +# best_ante = ante | |
36 | +# if best_ante is not None: | |
37 | +# if best_ante.set: | |
38 | +# ana.set = best_ante.set | |
39 | +# else: | |
40 | +# str_set_id = 'set_%d' % last_set_id | |
41 | +# best_ante.set = str_set_id | |
42 | +# ana.set = str_set_id | |
43 | +# last_set_id += 1 | |
44 | + | |
45 | + | |
46 | +def siamese(text, threshold): | |
47 | + last_set_id = 0 | |
48 | + for i, ana in enumerate(text.mentions): | |
49 | + if i > 0: | |
50 | + for ante in reversed(text.mentions[:i]): | |
51 | + if not features.pair_intersect(ante, ana): | |
52 | + pair_features = get_pair_features(ante, ana) | |
53 | + | |
54 | + ante_vec = [] | |
55 | + ante_vec.extend(ante.features) | |
56 | + ante_vec.extend(pair_features) | |
57 | + ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32) | |
58 | + | |
59 | + ana_vec = [] | |
60 | + ana_vec.extend(ana.features) | |
61 | + ana_vec.extend(pair_features) | |
62 | + ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32) | |
63 | + | |
64 | + prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0] | |
65 | + | |
66 | + if prediction < threshold: | |
67 | + if ante.set: | |
68 | + ana.set = ante.set | |
69 | + else: | |
70 | + str_set_id = 'set_%d' % last_set_id | |
71 | + ante.set = str_set_id | |
72 | + ana.set = str_set_id | |
73 | + last_set_id += 1 | |
74 | + break | |
6 | 75 | |
7 | 76 | |
8 | 77 | # incremental resolve algorithm |
9 | -def incremental(text): | |
78 | +def incremental(text, threshold): | |
10 | 79 | last_set_id = 0 |
11 | 80 | for i, ana in enumerate(text.mentions): |
12 | 81 | if i > 0: |
... | ... | @@ -17,13 +86,10 @@ def incremental(text): |
17 | 86 | pair_vec = get_pair_vector(ante, ana) |
18 | 87 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
19 | 88 | prediction = NEURAL_MODEL.predict(sample)[0] |
20 | - if prediction > THRESHOLD and prediction >= best_prediction: | |
89 | + if prediction > threshold and prediction >= best_prediction: | |
21 | 90 | best_prediction = prediction |
22 | 91 | best_ante = ante |
23 | 92 | if best_ante is not None: |
24 | - # print ('wynik') | |
25 | - # print(best_ante.text, best_prediction, ana.text) | |
26 | - # print (best_ante.set, ana.set) | |
27 | 93 | if best_ante.set: |
28 | 94 | ana.set = best_ante.set |
29 | 95 | else: |
... | ... | @@ -34,13 +100,12 @@ def incremental(text): |
34 | 100 | |
35 | 101 | |
36 | 102 | # entity based resolve algorithm |
37 | -def entity_based(text): | |
103 | +def entity_based(text, threshold): | |
38 | 104 | sets = [] |
39 | 105 | last_set_id = 0 |
40 | 106 | for i, ana in enumerate(text.mentions): |
41 | 107 | if i > 0: |
42 | - # print ('!!!!!!!!!!%s!!!!!!!!!!!!' % ana.text) | |
43 | - best_fit = get_best_set(sets, ana) | |
108 | + best_fit = get_best_set(sets, ana, threshold) | |
44 | 109 | if best_fit is not None: |
45 | 110 | ana.set = best_fit['set_id'] |
46 | 111 | best_fit['mentions'].append(ana) |
... | ... | @@ -56,22 +121,16 @@ def entity_based(text): |
56 | 121 | 'mentions': [ana]}) |
57 | 122 | ana.set = str_set_id |
58 | 123 | last_set_id += 1 |
59 | - # print (ana.set) | |
60 | - # for ss in sets: | |
61 | - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
62 | 124 | |
63 | 125 | remove_singletons(sets) |
64 | - # print (';'.join([ss['set_id'] for ss in sets])) | |
65 | - # for ss in sets: | |
66 | - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
67 | 126 | |
68 | 127 | |
69 | -def get_best_set(sets, ana): | |
128 | +def get_best_set(sets, ana, threshold): | |
70 | 129 | best_prediction = 0.0 |
71 | 130 | best_set = None |
72 | 131 | for s in sets: |
73 | 132 | accuracy = predict_set(s['mentions'], ana) |
74 | - if accuracy > THRESHOLD and accuracy >= best_prediction: | |
133 | + if accuracy > threshold and accuracy >= best_prediction: | |
75 | 134 | best_prediction = accuracy |
76 | 135 | best_set = s |
77 | 136 | return best_set |
... | ... | @@ -86,7 +145,6 @@ def predict_set(mentions, ana): |
86 | 145 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
87 | 146 | prediction = NEURAL_MODEL.predict(sample)[0] |
88 | 147 | prediction_sum += prediction |
89 | - # print(mnt.text, prediction, ana.text) | |
90 | 148 | return prediction_sum / float(len(mentions)) |
91 | 149 | |
92 | 150 | |
... | ... |
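The active siamese resolver differs from the commented-out draft above it in search strategy: the draft scanned all antecedents and kept the one with the globally lowest distance, while the committed version walks antecedents nearest-first (reversed) and links to the first one whose distance falls below the threshold, then breaks. A toy illustration of the difference, with made-up distances (not model output):

    # mentions preceding the anaphor, nearest last
    candidates = [('Maria', 0.0004), ('ona', 0.0008), ('dom', 0.0400)]
    threshold = 0.001

    # committed rule: first match wins when scanning nearest-first
    for mention, distance in reversed(candidates):
        if distance < threshold:
            print('link to', mention)  # -> 'ona', although 'Maria' is closer (0.0004)
            break

The first-match rule favours recency and lets the loop stop early instead of scoring every preceding mention.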
corneferencer/utils.py
... | ... | @@ -5,15 +5,26 @@ import sys |
5 | 5 | |
6 | 6 | import javaobj |
7 | 7 | |
8 | -from keras.models import Model | |
9 | -from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization | |
8 | +from keras.models import Sequential, Model | |
9 | +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda | |
10 | +from keras.optimizers import RMSprop, Adam | |
11 | +from keras import backend as K | |
10 | 12 | |
11 | 13 | |
12 | 14 | def eprint(*args, **kwargs): |
13 | 15 | print(*args, file=sys.stderr, **kwargs) |
14 | 16 | |
15 | 17 | |
16 | -def initialize_neural_model(number_of_features, path_to_model): | |
18 | +def initialize_neural_model(architecture, number_of_features, path_to_model): | |
19 | + model = None | |
20 | + if architecture == 'simple': | |
21 | + model = initialize_simple_model(number_of_features, path_to_model) | |
22 | + elif architecture == 'siamese': | |
23 | + model = initialize_siamese_model(number_of_features, path_to_model) | |
24 | + return model | |
25 | + | |
26 | + | |
27 | +def initialize_simple_model(number_of_features, path_to_model): | |
17 | 28 | inputs = Input(shape=(number_of_features,)) |
18 | 29 | |
19 | 30 | output_from_1st_layer = Dense(1000, activation='relu')(inputs) |
... | ... | @@ -37,6 +48,60 @@ def initialize_neural_model(number_of_features, path_to_model): |
37 | 48 | return model |
38 | 49 | |
39 | 50 | |
51 | +def initialize_siamese_model(number_of_features, path_to_model): | |
52 | + input_dim = number_of_features | |
53 | + | |
54 | + base_network = create_base_network(input_dim) | |
55 | + | |
56 | + input_a = Input(shape=(input_dim,)) | |
57 | + input_b = Input(shape=(input_dim,)) | |
58 | + | |
59 | + processed_a = base_network(input_a) | |
60 | + processed_b = base_network(input_b) | |
61 | + | |
62 | + distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b]) | |
63 | + | |
64 | + model = Model([input_a, input_b], distance) | |
65 | + model.compile(loss=contrastive_loss, optimizer='Adam') | |
66 | + model.load_weights(path_to_model) | |
67 | + | |
68 | + return model | |
69 | + | |
70 | + | |
71 | +def create_base_network(input_dim): | |
72 | + '''Base network to be shared''' | |
73 | + seq = Sequential() | |
74 | + | |
75 | + seq.add(Dense(1000, input_shape=(input_dim,), activation='relu')) | |
76 | + seq.add(Dropout(0.2)) | |
77 | + seq.add(BatchNormalization()) | |
78 | + | |
79 | + seq.add(Dense(500, activation='relu')) | |
80 | + seq.add(Dropout(0.2)) | |
81 | + seq.add(BatchNormalization()) | |
82 | + | |
83 | + seq.add(Dense(300, activation='relu')) | |
84 | + return seq | |
85 | + | |
86 | + | |
87 | +def euclidean_distance(vects): | |
88 | + x, y = vects | |
89 | + return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) | |
90 | + | |
91 | + | |
92 | +def eucl_dist_output_shape(shapes): | |
93 | + shape1, shape2 = shapes | |
94 | + return (shape1[0], 1) | |
95 | + | |
96 | + | |
97 | +def contrastive_loss(y_true, y_pred): | |
98 | + '''Contrastive loss from Hadsell-et-al.'06 | |
99 | + http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf | |
100 | + ''' | |
101 | + margin = 1 | |
102 | + return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) | |
103 | + | |
104 | + | |
40 | 105 | def load_freq_list(freq_path): |
41 | 106 | freq_list = {} |
42 | 107 | with codecs.open(freq_path, 'r', 'utf-8') as freq_file: |
... | ... |
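For reference, contrastive_loss rewards small distances for coreferent pairs (y_true = 1) and penalises non-coreferent pairs (y_true = 0) only when they come closer than the margin. A standalone numpy restatement with made-up distances (illustrative, not part of the commit):

    import numpy as np

    def contrastive_loss_np(y_true, d, margin=1.0):
        # same formula as contrastive_loss above, outside of Keras
        return np.mean(y_true * d ** 2 + (1 - y_true) * np.maximum(margin - d, 0) ** 2)

    print(contrastive_loss_np(np.array([1.0]), np.array([0.9])))   # coreferent but far apart: 0.81
    print(contrastive_loss_np(np.array([1.0]), np.array([0.05])))  # coreferent and close: 0.0025
    print(contrastive_loss_np(np.array([0.0]), np.array([0.2])))   # non-coreferent and close: 0.64

Note that RMSprop and Adam are imported but unused in the shown code: compile() receives the string 'Adam', which Keras resolves by optimizer name.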