Commit cf3852f0d13df6810dca9858fb87f599310567fe
1 parent cdcb85d4
Added siamese neural model.
Showing 5 changed files with 171 additions and 36 deletions
conf.py
... | ... | @@ -6,13 +6,15 @@ from gensim.models.word2vec import Word2Vec |
6 | 6 | |
7 | 7 | |
8 | 8 | CONTEXT = 5 |
9 | -THRESHOLD = 0.95 | |
9 | +# THRESHOLD = 0.001 | |
10 | 10 | RANDOM_WORD_VECTORS = True |
11 | 11 | W2V_SIZE = 50 |
12 | 12 | W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model' |
13 | 13 | |
14 | -NUMBER_OF_FEATURES = 1190 | |
15 | -NEURAL_MODEL_NAME = 'model_1190_features.h5' | |
14 | +# simple or siamese | |
15 | +NEURAL_MODEL_ARCHITECTURE = 'siamese' | |
16 | +NUMBER_OF_FEATURES = 625 | |
17 | +NEURAL_MODEL_NAME = 'weights_siamese_model.h5' | |
16 | 18 | |
17 | 19 | FREQ_LIST_NAME = 'base.lst' |
18 | 20 | LEMMA2SYNONYMS_NAME = 'lemma2synonyms.map' |
... | ... | @@ -28,7 +30,7 @@ W2V_MODEL_PATH = os.path.join(MAIN_PATH, 'models', W2V_MODEL_NAME) |
28 | 30 | W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH) |
29 | 31 | |
30 | 32 | NEURAL_MODEL_PATH = os.path.join(MAIN_PATH, 'models', NEURAL_MODEL_NAME) |
31 | -NEURAL_MODEL = utils.initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
33 | +NEURAL_MODEL = utils.initialize_neural_model(NEURAL_MODEL_ARCHITECTURE, NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
32 | 34 | |
33 | 35 | FREQ_LIST_PATH = os.path.join(MAIN_PATH, 'freq', FREQ_LIST_NAME) |
34 | 36 | FREQ_LIST = utils.load_freq_list(FREQ_LIST_PATH) |
... | ... |
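Note: the hard-coded THRESHOLD is commented out above because the cut-off is now passed in at runtime (see the new --threshold option in main.py below). For orientation, the two configurations the new NEURAL_MODEL_ARCHITECTURE switch selects between, reconstructed from the values visible in this diff (a sketch, not additional commit content):

    # simple pair classifier - one concatenated pair vector
    NEURAL_MODEL_ARCHITECTURE = 'simple'
    NUMBER_OF_FEATURES = 1190
    NEURAL_MODEL_NAME = 'model_1190_features.h5'

    # siamese twin network - one feature vector per mention
    NEURAL_MODEL_ARCHITECTURE = 'siamese'
    NUMBER_OF_FEATURES = 625
    NEURAL_MODEL_NAME = 'weights_siamese_model.h5'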
corneferencer/main.py
... | ... | @@ -6,6 +6,7 @@ from natsort import natsorted |
6 | 6 | |
7 | 7 | sys.path.append(os.path.abspath(os.path.join('..'))) |
8 | 8 | |
9 | +import conf | |
9 | 10 | from inout import mmax |
10 | 11 | from inout.constants import INPUT_FORMATS |
11 | 12 | from resolvers import resolve |
... | ... | @@ -22,7 +23,11 @@ def main(): |
22 | 23 | elif args.format not in INPUT_FORMATS: |
23 | 24 | eprint("Error: Unknown input file format!") |
24 | 25 | else: |
25 | - process_texts(args.input, args.output, args.format, args.resolver) | |
26 | + resolver = args.resolver | |
27 | + if conf.NEURAL_MODEL_ARCHITECTURE == 'siamese': | |
28 | + resolver = conf.NEURAL_MODEL_ARCHITECTURE | |
29 | + eprint ("Warning: Using %s resolver because of selected neural model architecture!" % conf.NEURAL_MODEL_ARCHITECTURE) | |
30 | + process_texts(args.input, args.output, args.format, resolver, args.threshold) | |
26 | 31 | |
27 | 32 | |
28 | 33 | def parse_arguments(): |
... | ... | @@ -40,21 +45,24 @@ def parse_arguments(): |
40 | 45 | dest='resolver', default='incremental', |
41 | 46 | help='resolve algorithm; default: incremental; possibilities: %s' |
42 | 47 | % ', '.join(RESOLVERS)) |
48 | + parser.add_argument('-t', '--threshold', type=float, action='store', | |
49 | + dest='threshold', default=0.001, | |
50 | + help='decision threshold; similarity floor for incremental/entity_based, distance ceiling for siamese; default: 0.001') | |
43 | 51 | |
44 | 52 | args = parser.parse_args() |
45 | 53 | return args |
46 | 54 | |
47 | 55 | |
48 | -def process_texts(inpath, outpath, informat, resolver): | |
56 | +def process_texts(inpath, outpath, informat, resolver, threshold): | |
49 | 57 | if os.path.isdir(inpath): |
50 | - process_directory(inpath, outpath, informat, resolver) | |
58 | + process_directory(inpath, outpath, informat, resolver, threshold) | |
51 | 59 | elif os.path.isfile(inpath): |
52 | - process_file(inpath, outpath, informat, resolver) | |
60 | + process_file(inpath, outpath, informat, resolver, threshold) | |
53 | 61 | else: |
54 | 62 | eprint("Error: Specified input does not exist!") |
55 | 63 | |
56 | 64 | |
57 | -def process_directory(inpath, outpath, informat, resolver): | |
65 | +def process_directory(inpath, outpath, informat, resolver, threshold): | |
58 | 66 | inpath = os.path.abspath(inpath) |
59 | 67 | outpath = os.path.abspath(outpath) |
60 | 68 | |
... | ... | @@ -65,18 +73,20 @@ def process_directory(inpath, outpath, informat, resolver): |
65 | 73 | textname = os.path.splitext(os.path.basename(filename))[0] |
66 | 74 | textoutput = os.path.join(outpath, textname) |
67 | 75 | textinput = os.path.join(inpath, filename) |
68 | - process_file(textinput, textoutput, informat, resolver) | |
76 | + process_file(textinput, textoutput, informat, resolver, threshold) | |
69 | 77 | |
70 | 78 | |
71 | -def process_file(inpath, outpath, informat, resolver): | |
79 | +def process_file(inpath, outpath, informat, resolver, threshold): | |
72 | 80 | basename = os.path.basename(inpath) |
73 | 81 | if informat == 'mmax' and basename.endswith('.mmax'): |
74 | 82 | print (basename) |
75 | 83 | text = mmax.read(inpath) |
76 | 84 | if resolver == 'incremental': |
77 | - resolve.incremental(text) | |
85 | + resolve.incremental(text, threshold) | |
78 | 86 | elif resolver == 'entity_based': |
79 | - resolve.entity_based(text) | |
87 | + resolve.entity_based(text, threshold) | |
88 | + elif resolver == 'siamese': | |
89 | + resolve.siamese(text, threshold) | |
80 | 90 | mmax.write(inpath, outpath, text) |
81 | 91 | |
82 | 92 | |
... | ... |
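The threshold that used to be fixed in conf.py is now a command-line argument, and the resolver choice is overridden whenever the configured model is siamese, presumably because the single-input pair resolvers cannot feed a two-input siamese model. A hypothetical invocation (only -t/--threshold is visible in this hunk; the -i, -o, -f and -r flag names are assumptions inferred from the argparse dest names):

    python corneferencer/main.py -i texts_in -o texts_out -f mmax -r incremental -t 0.85

Keep in mind that the threshold's meaning depends on the resolver: incremental and entity_based treat it as a minimum similarity (prediction > threshold), while siamese treats it as a maximum distance (prediction < threshold), which is why the default drops from the old 0.95 to 0.001.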
corneferencer/resolvers/constants.py
corneferencer/resolvers/resolve.py
1 | 1 | import numpy |
2 | 2 | |
3 | -from conf import NEURAL_MODEL, THRESHOLD | |
3 | +from conf import NEURAL_MODEL  # , THRESHOLD | |
4 | 4 | from corneferencer.resolvers import features |
5 | -from corneferencer.resolvers.vectors import get_pair_vector | |
5 | +from corneferencer.resolvers.vectors import get_pair_features, get_pair_vector | |
6 | + | |
7 | + | |
8 | +# siamese resolve algorithm | |
9 | +# def siamese(text): | |
10 | +# last_set_id = 0 | |
11 | +# for i, ana in enumerate(text.mentions): | |
12 | +# if i > 0: | |
13 | +# best_prediction = 20.0 | |
14 | +# best_ante = None | |
15 | +# for ante in text.mentions[:i]: | |
16 | +# if not features.pair_intersect(ante, ana): | |
17 | +# pair_features = get_pair_features(ante, ana) | |
18 | +# | |
19 | +# ante_vec = [] | |
20 | +# ante_vec.extend(ante.features) | |
21 | +# ante_vec.extend(pair_features) | |
22 | +# ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32) | |
23 | +# | |
24 | +# ana_vec = [] | |
25 | +# ana_vec.extend(ana.features) | |
26 | +# ana_vec.extend(pair_features) | |
27 | +# ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32) | |
28 | +# | |
29 | +# prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0] | |
30 | +# | |
31 | +# print (ante.text, '--->', ana.text, '>>', prediction) | |
32 | +# | |
33 | +# if prediction < THRESHOLD and prediction < best_prediction: | |
34 | +# best_prediction = prediction | |
35 | +# best_ante = ante | |
36 | +# if best_ante is not None: | |
37 | +# if best_ante.set: | |
38 | +# ana.set = best_ante.set | |
39 | +# else: | |
40 | +# str_set_id = 'set_%d' % last_set_id | |
41 | +# best_ante.set = str_set_id | |
42 | +# ana.set = str_set_id | |
43 | +# last_set_id += 1 | |
44 | + | |
45 | + | |
46 | +def siamese(text, threshold): | |
47 | + last_set_id = 0 | |
48 | + for i, ana in enumerate(text.mentions): | |
49 | + if i > 0: | |
50 | + for ante in reversed(text.mentions[:i]): | |
51 | + if not features.pair_intersect(ante, ana): | |
52 | + pair_features = get_pair_features(ante, ana) | |
53 | + | |
54 | + ante_vec = [] | |
55 | + ante_vec.extend(ante.features) | |
56 | + ante_vec.extend(pair_features) | |
57 | + ante_sample = numpy.asarray([ante_vec], dtype=numpy.float32) | |
58 | + | |
59 | + ana_vec = [] | |
60 | + ana_vec.extend(ana.features) | |
61 | + ana_vec.extend(pair_features) | |
62 | + ana_sample = numpy.asarray([ana_vec], dtype=numpy.float32) | |
63 | + | |
64 | + prediction = NEURAL_MODEL.predict([ante_sample, ana_sample])[0] | |
65 | + | |
66 | + if prediction < threshold: | |
67 | + if ante.set: | |
68 | + ana.set = ante.set | |
69 | + else: | |
70 | + str_set_id = 'set_%d' % last_set_id | |
71 | + ante.set = str_set_id | |
72 | + ana.set = str_set_id | |
73 | + last_set_id += 1 | |
74 | + break | |
6 | 75 | |
7 | 76 | |
8 | 77 | # incremental resolve algorithm |
9 | -def incremental(text): | |
78 | +def incremental(text, threshold): | |
10 | 79 | last_set_id = 0 |
11 | 80 | for i, ana in enumerate(text.mentions): |
12 | 81 | if i > 0: |
... | ... | @@ -17,13 +86,10 @@ def incremental(text): |
17 | 86 | pair_vec = get_pair_vector(ante, ana) |
18 | 87 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
19 | 88 | prediction = NEURAL_MODEL.predict(sample)[0] |
20 | - if prediction > THRESHOLD and prediction >= best_prediction: | |
89 | + if prediction > threshold and prediction >= best_prediction: | |
21 | 90 | best_prediction = prediction |
22 | 91 | best_ante = ante |
23 | 92 | if best_ante is not None: |
24 | - # print ('wynik') | |
25 | - # print(best_ante.text, best_prediction, ana.text) | |
26 | - # print (best_ante.set, ana.set) | |
27 | 93 | if best_ante.set: |
28 | 94 | ana.set = best_ante.set |
29 | 95 | else: |
... | ... | @@ -34,13 +100,12 @@ def incremental(text): |
34 | 100 | |
35 | 101 | |
36 | 102 | # entity based resolve algorithm |
37 | -def entity_based(text): | |
103 | +def entity_based(text, threshold): | |
38 | 104 | sets = [] |
39 | 105 | last_set_id = 0 |
40 | 106 | for i, ana in enumerate(text.mentions): |
41 | 107 | if i > 0: |
42 | - # print ('!!!!!!!!!!%s!!!!!!!!!!!!' % ana.text) | |
43 | - best_fit = get_best_set(sets, ana) | |
108 | + best_fit = get_best_set(sets, ana, threshold) | |
44 | 109 | if best_fit is not None: |
45 | 110 | ana.set = best_fit['set_id'] |
46 | 111 | best_fit['mentions'].append(ana) |
... | ... | @@ -56,22 +121,16 @@ def entity_based(text): |
56 | 121 | 'mentions': [ana]}) |
57 | 122 | ana.set = str_set_id |
58 | 123 | last_set_id += 1 |
59 | - # print (ana.set) | |
60 | - # for ss in sets: | |
61 | - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
62 | 124 | |
63 | 125 | remove_singletons(sets) |
64 | - # print (';'.join([ss['set_id'] for ss in sets])) | |
65 | - # for ss in sets: | |
66 | - # print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
67 | 126 | |
68 | 127 | |
69 | -def get_best_set(sets, ana): | |
128 | +def get_best_set(sets, ana, threshold): | |
70 | 129 | best_prediction = 0.0 |
71 | 130 | best_set = None |
72 | 131 | for s in sets: |
73 | 132 | accuracy = predict_set(s['mentions'], ana) |
74 | - if accuracy > THRESHOLD and accuracy >= best_prediction: | |
133 | + if accuracy > threshold and accuracy >= best_prediction: | |
75 | 134 | best_prediction = accuracy |
76 | 135 | best_set = s |
77 | 136 | return best_set |
... | ... | @@ -86,7 +145,6 @@ def predict_set(mentions, ana): |
86 | 145 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
87 | 146 | prediction = NEURAL_MODEL.predict(sample)[0] |
88 | 147 | prediction_sum += prediction |
89 | - # print(mnt.text, prediction, ana.text) | |
90 | 148 | return prediction_sum / float(len(mentions)) |
91 | 149 | |
92 | 150 | |
... | ... |
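The active siamese resolver differs from the commented-out draft above it in search strategy: the draft scanned all antecedents and kept the one with the globally lowest distance, while the committed version walks antecedents nearest-first (reversed) and links to the first one whose distance falls below the threshold, then breaks. A toy illustration of the difference, with made-up distances (not model output):

    # mentions preceding the anaphor, nearest last
    candidates = [('Maria', 0.0004), ('ona', 0.0008), ('dom', 0.0400)]
    threshold = 0.001

    # committed rule: first match wins when scanning nearest-first
    for mention, distance in reversed(candidates):
        if distance < threshold:
            print('link to', mention)  # -> 'ona', although 'Maria' is closer (0.0004)
            break

The first-match rule favours recency and lets the loop stop early instead of scoring every preceding mention.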
corneferencer/utils.py
... | ... | @@ -5,15 +5,26 @@ import sys |
5 | 5 | |
6 | 6 | import javaobj |
7 | 7 | |
8 | -from keras.models import Model | |
9 | -from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization | |
8 | +from keras.models import Sequential, Model | |
9 | +from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization, Lambda | |
10 | +from keras.optimizers import RMSprop, Adam | |
11 | +from keras import backend as K | |
10 | 12 | |
11 | 13 | |
12 | 14 | def eprint(*args, **kwargs): |
13 | 15 | print(*args, file=sys.stderr, **kwargs) |
14 | 16 | |
15 | 17 | |
16 | -def initialize_neural_model(number_of_features, path_to_model): | |
18 | +def initialize_neural_model(architecture, number_of_features, path_to_model): | |
19 | + model = None | |
20 | + if architecture == 'simple': | |
21 | + model = initialize_simple_model(number_of_features, path_to_model) | |
22 | + elif architecture == 'siamese': | |
23 | + model = initialize_siamese_model(number_of_features, path_to_model) | |
24 | + return model | |
25 | + | |
26 | + | |
27 | +def initialize_simple_model(number_of_features, path_to_model): | |
17 | 28 | inputs = Input(shape=(number_of_features,)) |
18 | 29 | |
19 | 30 | output_from_1st_layer = Dense(1000, activation='relu')(inputs) |
... | ... | @@ -37,6 +48,60 @@ def initialize_neural_model(number_of_features, path_to_model): |
37 | 48 | return model |
38 | 49 | |
39 | 50 | |
51 | +def initialize_siamese_model(number_of_features, path_to_model): | |
52 | + input_dim = number_of_features | |
53 | + | |
54 | + base_network = create_base_network(input_dim) | |
55 | + | |
56 | + input_a = Input(shape=(input_dim,)) | |
57 | + input_b = Input(shape=(input_dim,)) | |
58 | + | |
59 | + processed_a = base_network(input_a) | |
60 | + processed_b = base_network(input_b) | |
61 | + | |
62 | + distance = Lambda(euclidean_distance, output_shape=eucl_dist_output_shape)([processed_a, processed_b]) | |
63 | + | |
64 | + model = Model([input_a, input_b], distance) | |
65 | + model.compile(loss=contrastive_loss, optimizer='Adam') | |
66 | + model.load_weights(path_to_model) | |
67 | + | |
68 | + return model | |
69 | + | |
70 | + | |
71 | +def create_base_network(input_dim): | |
72 | + '''Base network to be shared''' | |
73 | + seq = Sequential() | |
74 | + | |
75 | + seq.add(Dense(1000, input_shape=(input_dim,), activation='relu')) | |
76 | + seq.add(Dropout(0.2)) | |
77 | + seq.add(BatchNormalization()) | |
78 | + | |
79 | + seq.add(Dense(500, activation='relu')) | |
80 | + seq.add(Dropout(0.2)) | |
81 | + seq.add(BatchNormalization()) | |
82 | + | |
83 | + seq.add(Dense(300, activation='relu')) | |
84 | + return seq | |
85 | + | |
86 | + | |
87 | +def euclidean_distance(vects): | |
88 | + x, y = vects | |
89 | + return K.sqrt(K.maximum(K.sum(K.square(x - y), axis=1, keepdims=True), K.epsilon())) | |
90 | + | |
91 | + | |
92 | +def eucl_dist_output_shape(shapes): | |
93 | + shape1, shape2 = shapes | |
94 | + return (shape1[0], 1) | |
95 | + | |
96 | + | |
97 | +def contrastive_loss(y_true, y_pred): | |
98 | + '''Contrastive loss from Hadsell-et-al.'06 | |
99 | + http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf | |
100 | + ''' | |
101 | + margin = 1 | |
102 | + return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) | |
103 | + | |
104 | + | |
40 | 105 | def load_freq_list(freq_path): |
41 | 106 | freq_list = {} |
42 | 107 | with codecs.open(freq_path, 'r', 'utf-8') as freq_file: |
... | ... |
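For reference, contrastive_loss rewards small distances for coreferent pairs (y_true = 1) and penalises non-coreferent pairs (y_true = 0) only when they come closer than the margin. A standalone numpy restatement with made-up distances (illustrative, not part of the commit):

    import numpy as np

    def contrastive_loss_np(y_true, d, margin=1.0):
        # same formula as contrastive_loss above, outside of Keras
        return np.mean(y_true * d ** 2 + (1 - y_true) * np.maximum(margin - d, 0) ** 2)

    print(contrastive_loss_np(np.array([1.0]), np.array([0.9])))   # coreferent but far apart: 0.81
    print(contrastive_loss_np(np.array([1.0]), np.array([0.05])))  # coreferent and close: 0.0025
    print(contrastive_loss_np(np.array([0.0]), np.array([0.2])))   # non-coreferent and close: 0.64

Note that RMSprop and Adam are imported but unused in the shown code: compile() receives the string 'Adam', which Keras resolves by optimizer name.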