Commit 44d0a5f290b4b8ecc9f182d3ebdd389905ca8a8c
1 parent ba27568f
Added 21 new features.
Showing 8 changed files with 212 additions and 19 deletions
conf.py
... | ... | @@ -2,7 +2,7 @@ import os |
2 | 2 | |
3 | 3 | from gensim.models.word2vec import Word2Vec |
4 | 4 | |
5 | -from corneferencer.utils import initialize_neural_model | |
5 | +from corneferencer.utils import initialize_neural_model, load_freq_list | |
6 | 6 | |
7 | 7 | |
8 | 8 | CONTEXT = 5 |
... | ... | @@ -11,9 +11,10 @@ RANDOM_WORD_VECTORS = True |
11 | 11 | W2V_SIZE = 50 |
12 | 12 | W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model' |
13 | 13 | |
14 | -NUMBER_OF_FEATURES = 1126 | |
15 | -NEURAL_MODEL_NAME = 'weights_2017_05_10.h5' | |
14 | +NUMBER_OF_FEATURES = 1147 | |
15 | +NEURAL_MODEL_NAME = 'model_1147_features.h5' | |
16 | 16 | |
17 | +FREQ_LIST_NAME = 'base.lst' | |
17 | 18 | |
18 | 19 | # do not change that |
19 | 20 | W2V_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', W2V_MODEL_NAME) |
... | ... | @@ -21,3 +22,6 @@ W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH) |
21 | 22 | |
22 | 23 | NEURAL_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', NEURAL_MODEL_NAME) |
23 | 24 | NEURAL_MODEL = initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) |
25 | + | |
26 | +FREQ_LIST_PATH = os.path.join(os.path.dirname(__file__), 'freq', FREQ_LIST_NAME) | |
27 | +FREQ_LIST = load_freq_list(FREQ_LIST_PATH) | |
... | ... |
corneferencer/entities.py
... | ... | @@ -17,9 +17,9 @@ class Text: |
17 | 17 | class Mention: |
18 | 18 | |
19 | 19 | def __init__(self, mnt_id, text, lemmatized_text, words, span, |
20 | - head_orth, head_base, dominant, node, prec_context, | |
20 | + head_orth, head, dominant, node, prec_context, | |
21 | 21 | follow_context, sentence, position_in_mentions, |
22 | - start_in_words, end_in_words): | |
22 | + start_in_words, end_in_words, rarest, paragraph_id, sentence_id): | |
23 | 23 | self.id = mnt_id |
24 | 24 | self.set = '' |
25 | 25 | self.old_set = '' |
... | ... | @@ -28,7 +28,7 @@ class Mention: |
28 | 28 | self.words = words |
29 | 29 | self.span = span |
30 | 30 | self.head_orth = head_orth |
31 | - self.head_base = head_base | |
31 | + self.head = head | |
32 | 32 | self.dominant = dominant |
33 | 33 | self.node = node |
34 | 34 | self.prec_context = prec_context |
... | ... | @@ -38,3 +38,6 @@ class Mention: |
38 | 38 | self.start_in_words = start_in_words |
39 | 39 | self.end_in_words = end_in_words |
40 | 40 | self.features = get_mention_features(self) |
41 | + self.rarest = rarest | |
42 | + self.paragraph_id = paragraph_id | |
43 | + self.sentence_id = sentence_id | |
... | ... |
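With this change a mention carries the whole head word dict instead of just its base form, so downstream code can reach any morphological field. A minimal sketch (mention stands for a hypothetical Mention instance):

    base = mention.head['base']      # previously mention.head_base
    ctag = mention.head['ctag']      # used by the new mention_type() feature
    gender = mention.head['gender']  # filled in by get_words() in mmax.py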
corneferencer/inout/mmax.py
... | ... | @@ -3,7 +3,7 @@ import shutil |
3 | 3 | |
4 | 4 | from lxml import etree |
5 | 5 | |
6 | -from conf import CONTEXT | |
6 | +from conf import CONTEXT, FREQ_LIST | |
7 | 7 | from corneferencer.entities import Mention, Text |
8 | 8 | |
9 | 9 | |
... | ... | @@ -38,16 +38,17 @@ def read_mentions(mentions_path, words_path): |
38 | 38 | mention_words = span_to_words(span, words) |
39 | 39 | |
40 | 40 | (prec_context, follow_context, sentence, |
41 | - mnt_start_position, mnt_end_position) = get_context(mention_words, words) | |
41 | + mnt_start_position, mnt_end_position, | |
42 | + paragraph_id, sentence_id) = get_context(mention_words, words) | |
42 | 43 | |
43 | - head_base = get_head_base(head_orth, mention_words) | |
44 | + head = get_head(head_orth, mention_words) | |
44 | 45 | mention = Mention(mnt_id=markable.attrib['id'], |
45 | 46 | text=span_to_text(span, words, 'orth'), |
46 | 47 | lemmatized_text=span_to_text(span, words, 'base'), |
47 | 48 | words=mention_words, |
48 | 49 | span=span, |
49 | 50 | head_orth=head_orth, |
50 | - head_base=head_base, | |
51 | + head=head, | |
51 | 52 | dominant=dominant, |
52 | 53 | node=markable, |
53 | 54 | prec_context=prec_context, |
... | ... | @@ -55,7 +56,10 @@ def read_mentions(mentions_path, words_path): |
55 | 56 | sentence=sentence, |
56 | 57 | position_in_mentions=idx, |
57 | 58 | start_in_words=mnt_start_position, |
58 | - end_in_words=mnt_end_position) | |
59 | + end_in_words=mnt_end_position, | |
60 | + rarest=get_rarest_word(mention_words), | |
61 | + paragraph_id=paragraph_id, | |
62 | + sentence_id=sentence_id) | |
59 | 63 | mentions.append(mention) |
60 | 64 | |
61 | 65 | return mentions |
... | ... | @@ -71,12 +75,20 @@ def get_words(filepath): |
71 | 75 | lastinsent = False |
72 | 76 | if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true': |
73 | 77 | lastinsent = True |
78 | + lastinpar = False | |
79 | + if 'lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true': | |
80 | + lastinpar = True | |
74 | 81 | words.append({'id': word.attrib['id'], |
75 | 82 | 'orth': word.text, |
76 | 83 | 'base': word.attrib['base'], |
77 | 84 | 'hasnps': hasnps, |
78 | 85 | 'lastinsent': lastinsent, |
79 | - 'ctag': word.attrib['ctag']}) | |
86 | + 'lastinpar': lastinpar, | |
87 | + 'ctag': word.attrib['ctag'], | |
88 | + 'msd': word.attrib['msd'], | |
89 | + 'gender': get_gender(word.attrib['msd']), | |
90 | + 'person': get_person(word.attrib['msd']), | |
91 | + 'number': get_number(word.attrib['msd'])}) | |
80 | 92 | return words |
81 | 93 | |
82 | 94 | |
... | ... | @@ -130,6 +142,8 @@ def word_to_ignore(word): |
130 | 142 | |
131 | 143 | |
132 | 144 | def get_context(mention_words, words): |
145 | + paragraph_id = 0 | |
146 | + sentence_id = 0 | |
133 | 147 | prec_context = [] |
134 | 148 | follow_context = [] |
135 | 149 | sentence = [] |
... | ... | @@ -146,7 +160,11 @@ def get_context(mention_words, words): |
146 | 160 | sentence = get_sentence(idx, words) |
147 | 161 | mnt_end_position = get_mention_end(last_word, words) |
148 | 162 | break |
149 | - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position | |
163 | + if word['lastinsent']: | |
164 | + sentence_id += 1 | |
165 | + if word['lastinpar']: | |
166 | + paragraph_id += 1 | |
167 | + return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id | |
150 | 168 | |
151 | 169 | |
152 | 170 | def get_prec_context(mention_start, words): |
... | ... | @@ -218,10 +236,10 @@ def get_sentence_end(words, word_idx): |
218 | 236 | return len(words) - 1 |
219 | 237 | |
220 | 238 | |
221 | -def get_head_base(head_orth, words): | |
239 | +def get_head(head_orth, words): | |
222 | 240 | for word in words: |
223 | 241 | if word['orth'].lower() == head_orth.lower() or word['orth'] == head_orth: |
224 | - return word['base'] | |
242 | + return word | |
225 | 243 | return None |
226 | 244 | |
227 | 245 | |
... | ... | @@ -272,6 +290,58 @@ def get_one_word_text(word_id, words, form): |
272 | 290 | return this_word[form] |
273 | 291 | |
274 | 292 | |
293 | +def get_gender(msd): | |
294 | + tags = msd.split(':') | |
295 | + if 'm1' in tags: | |
296 | + return 'm1' | |
297 | + elif 'm2' in tags: | |
298 | + return 'm2' | |
299 | + elif 'm3' in tags: | |
300 | + return 'm3' | |
301 | + elif 'f' in tags: | |
302 | + return 'f' | |
303 | + elif 'n' in tags: | |
304 | + return 'n' | |
305 | + else: | |
306 | + return 'unk' | |
307 | + | |
308 | + | |
309 | +def get_person(msd): | |
310 | + tags = msd.split(':') | |
311 | + if 'pri' in tags: | |
312 | + return 'pri' | |
313 | + elif 'sec' in tags: | |
314 | + return 'sec' | |
315 | + elif 'ter' in tags: | |
316 | + return 'ter' | |
317 | + else: | |
318 | + return 'unk' | |
319 | + | |
320 | + | |
321 | +def get_number(msd): | |
322 | + tags = msd.split(':') | |
323 | + if 'sg' in tags: | |
324 | + return 'sg' | |
325 | + elif 'pl' in tags: | |
326 | + return 'pl' | |
327 | + else: | |
328 | + return 'unk' | |
329 | + | |
330 | + | |
331 | +def get_rarest_word(words): | |
332 | + min_freq = 0 | |
333 | + rarest_word = words[0] | |
334 | + for i, word in enumerate(words): | |
335 | + word_freq = 0 | |
336 | + if word['base'] in FREQ_LIST: | |
337 | + word_freq = FREQ_LIST[word['base']] | |
338 | + | |
339 | + if i == 0 or word_freq < min_freq: | |
340 | + min_freq = word_freq | |
341 | + rarest_word = word | |
342 | + return rarest_word | |
343 | + | |
344 | + | |
275 | 345 | def write(inpath, outpath, text): |
276 | 346 | textname = os.path.splitext(os.path.basename(inpath))[0] |
277 | 347 | intextdir = os.path.dirname(inpath) |
... | ... |
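For reference, a hypothetical word entry as get_words() now builds it; the field names come from the code above, the msd value and its decomposition follow get_gender/get_person/get_number, and the concrete values are invented:

    word = {
        'id': 'word_42',
        'orth': 'kolejarze',
        'base': 'kolejarz',
        'hasnps': False,
        'lastinsent': True,
        'lastinpar': False,
        'ctag': 'subst',
        'msd': 'pl:nom:m1',
        'gender': 'm1',    # get_gender('pl:nom:m1')
        'person': 'unk',   # no pri/sec/ter tag on nouns
        'number': 'pl',    # get_number('pl:nom:m1')
    }

get_rarest_word() then picks, among a mention's words, the entry whose base form has the lowest count in FREQ_LIST, treating unseen base forms as frequency 0.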
corneferencer/resolvers/constants.py
corneferencer/resolvers/features.py
... | ... | @@ -2,11 +2,12 @@ import numpy |
2 | 2 | import random |
3 | 3 | |
4 | 4 | from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE |
5 | +from corneferencer.resolvers import constants | |
5 | 6 | |
6 | 7 | |
7 | 8 | # mention features |
8 | 9 | def head_vec(mention): |
9 | - return list(get_wv(W2V_MODEL, mention.head_base)) | |
10 | + return list(get_wv(W2V_MODEL, mention.head['base'])) | |
10 | 11 | |
11 | 12 | |
12 | 13 | def first_word_vec(mention): |
... | ... | @@ -65,6 +66,19 @@ def sentence_vec(mention): |
65 | 66 | return list(get_context_vec(mention.sentence, W2V_MODEL)) |
66 | 67 | |
67 | 68 | |
69 | +def mention_type(mention): | |
70 | + type_vec = [0] * 4 | |
71 | + if mention.head['ctag'] in constants.NOUN_TAGS: | |
72 | + type_vec[0] = 1 | |
73 | + elif mention.head['ctag'] in constants.PPRON_TAGS: | |
74 | + type_vec[1] = 1 | |
75 | + elif mention.head['ctag'] in constants.ZERO_TAGS: | |
76 | + type_vec[2] = 1 | |
77 | + else: | |
78 | + type_vec[3] = 1 | |
79 | + return type_vec | |
80 | + | |
81 | + | |
68 | 82 | # pair features |
69 | 83 | def distances_vec(ante, ana): |
70 | 84 | vec = [] |
... | ... | @@ -118,6 +132,45 @@ def base_match(ante, ana): |
118 | 132 | return 0 |
119 | 133 | |
120 | 134 | |
135 | +def ante_contains_rarest_from_ana(ante, ana): | |
136 | + ana_rarest = ana.rarest | |
137 | + for word in ante.words: | |
138 | + if word['base'] == ana_rarest['base']: | |
139 | + return 1 | |
140 | + return 0 | |
141 | + | |
142 | + | |
143 | +def agreement(ante, ana, tag_name): | |
144 | + agr_vec = [0] * 3 | |
145 | + if ante.head[tag_name] == 'unk' or ana.head[tag_name] == 'unk': | |
146 | + agr_vec[2] = 1 | |
147 | + elif ante.head[tag_name] == ana.head[tag_name]: | |
148 | + agr_vec[0] = 1 | |
149 | + else: | |
150 | + agr_vec[1] = 1 | |
151 | + return agr_vec | |
152 | + | |
153 | + | |
154 | +def is_acronym(ante, ana): | |
155 | + if ana.text.upper() == ana.text: | |
156 | + return check_one_way_acronym(ana.text, ante.text) | |
157 | + if ante.text.upper() == ante.text: | |
158 | + return check_one_way_acronym(ante.text, ana.text); | |
159 | + return 0; | |
160 | + | |
161 | + | |
162 | +def same_sentence(ante, ana): | |
163 | + if ante.sentence_id == ana.sentence_id: | |
164 | + return 1 | |
165 | + return 0 | |
166 | + | |
167 | + | |
168 | +def same_paragraph(ante, ana): | |
169 | + if ante.paragraph_id == ana.paragraph_id: | |
170 | + return 1 | |
171 | + return 0 | |
172 | + | |
173 | + | |
121 | 174 | # supporting functions |
122 | 175 | def get_wv(model, lemma, use_random_vec=True): |
123 | 176 | vec = None |
... | ... | @@ -168,3 +221,15 @@ def get_distance_bucket(distance): |
168 | 221 | elif distance >= 64: |
169 | 222 | return 9 |
170 | 223 | return 10 |
224 | + | |
225 | + | |
226 | +def check_one_way_acronym(acronym, expression): | |
227 | + initials = u'' | |
228 | + for expr1 in expression.split('-'): | |
229 | + for expr2 in expr1.split(): | |
230 | + expr2 = expr2.strip() | |
231 | + if expr2: | |
232 | + initials += unicode(expr2[0]).upper() | |
233 | + if acronym == initials: | |
234 | + return 1; | |
235 | + return 0; | |
... | ... |
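The acronym test compares one mention, written entirely in upper case, against the uppercased initials of the other mention's hyphen- or whitespace-separated tokens. A standalone sketch of that initials logic, kept free of package imports so it runs without loading the Word2Vec and Keras models pulled in by conf.py (the example strings are invented):

    def initials_of(expression):
        # mirrors check_one_way_acronym(): first letter of every token,
        # split on hyphens and whitespace, uppercased and concatenated
        initials = u''
        for chunk in expression.split('-'):
            for token in chunk.split():
                token = token.strip()
                if token:
                    initials += token[0].upper()
        return initials

    print(initials_of(u'Polskie Koleje Panstwowe'))  # PKP

so is_acronym() would return 1 for the pair 'PKP' / 'Polskie Koleje Panstwowe'. The other new helpers are fixed-width: agreement() yields a one-hot 3-vector (match / mismatch / unknown) per grammatical category, and mention_type() a one-hot 4-vector over the noun, personal-pronoun, zero and other tag sets defined in constants.py.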
corneferencer/resolvers/resolve.py
... | ... | @@ -19,6 +19,9 @@ def incremental(text): |
19 | 19 | best_prediction = prediction |
20 | 20 | best_ante = ante |
21 | 21 | if best_ante is not None: |
22 | + # print ('wynik') | |
23 | + # print(best_ante.text, best_prediction, ana.text) | |
24 | + # print (best_ante.set, ana.set) | |
22 | 25 | if best_ante.set: |
23 | 26 | ana.set = best_ante.set |
24 | 27 | else: |
... | ... | @@ -34,6 +37,7 @@ def entity_based(text): |
34 | 37 | last_set_id = 0 |
35 | 38 | for i, ana in enumerate(text.mentions): |
36 | 39 | if i > 0: |
40 | + print ('!!!!!!!!!!%s!!!!!!!!!!!!' % ana.text) | |
37 | 41 | best_fit = get_best_set(sets, ana) |
38 | 42 | if best_fit is not None: |
39 | 43 | ana.set = best_fit['set_id'] |
... | ... | @@ -50,8 +54,14 @@ def entity_based(text): |
50 | 54 | 'mentions': [ana]}) |
51 | 55 | ana.set = str_set_id |
52 | 56 | last_set_id += 1 |
57 | + print (ana.set) | |
58 | + for ss in sets: | |
59 | + print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
53 | 60 | |
54 | 61 | remove_singletons(sets) |
62 | + print (';'.join([ss['set_id'] for ss in sets])) | |
63 | + for ss in sets: | |
64 | + print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']])) | |
55 | 65 | |
56 | 66 | |
57 | 67 | def get_best_set(sets, ana): |
... | ... | @@ -72,6 +82,7 @@ def predict_set(mentions, ana): |
72 | 82 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
73 | 83 | prediction = NEURAL_MODEL.predict(sample)[0] |
74 | 84 | prediction_sum += prediction |
85 | + print(mnt.text, prediction, ana.text) | |
75 | 86 | return prediction_sum / float(len(mentions)) |
76 | 87 | |
77 | 88 | |
... | ... |
corneferencer/resolvers/vectors.py
... | ... | @@ -23,6 +23,10 @@ def get_mention_features(mention): |
23 | 23 | vec.extend(features.following_context_vec(mention)) |
24 | 24 | vec.extend(features.mention_vec(mention)) |
25 | 25 | vec.extend(features.sentence_vec(mention)) |
26 | + | |
27 | + # cechy uzupelniajace | |
28 | + vec.extend(features.mention_type(mention)) | |
29 | + | |
26 | 30 | return vec |
27 | 31 | |
28 | 32 | |
... | ... | @@ -32,4 +36,14 @@ def get_pair_features(ante, ana): |
32 | 36 | vec.append(features.head_match(ante, ana)) |
33 | 37 | vec.append(features.exact_match(ante, ana)) |
34 | 38 | vec.append(features.base_match(ante, ana)) |
39 | + | |
40 | + # cechy uzupelniajace | |
41 | + vec.append(features.ante_contains_rarest_from_ana(ante, ana)) | |
42 | + vec.extend(features.agreement(ante, ana, 'gender')) | |
43 | + vec.extend(features.agreement(ante, ana, 'number')) | |
44 | + vec.extend(features.agreement(ante, ana, 'person')) | |
45 | + vec.append(features.is_acronym(ante, ana)) | |
46 | + vec.append(features.same_sentence(ante, ana)) | |
47 | + vec.append(features.same_paragraph(ante, ana)) | |
48 | + | |
35 | 49 | return vec |
... | ... |
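A quick accounting of where the 21 new input dimensions (1147 - 1126 in conf.py) plausibly come from, assuming the pair vector concatenates both mentions' feature vectors ahead of the pair features listed here (that concatenation is outside the shown hunks):

    new_mention_dims = 4      # mention_type() one-hot, added once per mention
    new_pair_dims = (1        # ante_contains_rarest_from_ana
                     + 3 * 3  # gender / number / person agreement one-hots
                     + 1      # is_acronym
                     + 1      # same_sentence
                     + 1)     # same_paragraph
    print(2 * new_mention_dims + new_pair_dims)  # 21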
corneferencer/utils.py
1 | 1 | from __future__ import print_function |
2 | 2 | |
3 | +import codecs | |
3 | 4 | import sys |
4 | 5 | |
5 | 6 | from keras.models import Model |
... | ... | @@ -12,15 +13,36 @@ def eprint(*args, **kwargs): |
12 | 13 | |
13 | 14 | def initialize_neural_model(number_of_features, path_to_model): |
14 | 15 | inputs = Input(shape=(number_of_features,)) |
16 | + | |
15 | 17 | output_from_1st_layer = Dense(1000, activation='relu')(inputs) |
16 | - output_from_1st_layer = Dropout(0.5)(output_from_1st_layer) | |
18 | + output_from_1st_layer = Dropout(0.2)(output_from_1st_layer) | |
17 | 19 | output_from_1st_layer = BatchNormalization()(output_from_1st_layer) |
20 | + | |
18 | 21 | output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer) |
19 | - output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer) | |
22 | + output_from_2nd_layer = Dropout(0.2)(output_from_2nd_layer) | |
20 | 23 | output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer) |
21 | - output = Dense(1, activation='sigmoid')(output_from_2nd_layer) | |
24 | + | |
25 | + output_from_3rd_layer = Dense(300, activation='relu')(output_from_2nd_layer) | |
26 | + output_from_3rd_layer = Dropout(0.2)(output_from_3rd_layer) | |
27 | + output_from_3rd_layer = BatchNormalization()(output_from_3rd_layer) | |
28 | + | |
29 | + output = Dense(1, activation='sigmoid')(output_from_3rd_layer) | |
22 | 30 | |
23 | 31 | model = Model(inputs, output) |
24 | 32 | model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy']) |
25 | 33 | model.load_weights(path_to_model) |
34 | + | |
26 | 35 | return model |
36 | + | |
37 | + | |
38 | +def load_freq_list(freq_path): | |
39 | + freq_list = {} | |
40 | + with codecs.open(freq_path, 'r', 'utf-8') as freq_file: | |
41 | + lines = freq_file.readlines() | |
42 | + for line in lines: | |
43 | + line_parts = line.split() | |
44 | + freq = int(line_parts[0]) | |
45 | + base = line_parts[1] | |
46 | + if base not in freq_list: | |
47 | + freq_list[base] = freq | |
48 | + return freq_list | |
... | ... |
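Judging from the parsing code, load_freq_list() expects one whitespace-separated "<count> <base form>" entry per line and keeps only the first occurrence of each base form. A small usage sketch with an invented file (importing corneferencer.utils assumes Keras is installed, since the module loads it):

    import codecs

    with codecs.open('base.lst', 'w', 'utf-8') as sample:
        sample.write(u'1203456 byc\n845123 rok\n127 kolejarz\n')

    from corneferencer.utils import load_freq_list
    print(load_freq_list('base.lst'))
    # -> {'byc': 1203456, 'rok': 845123, 'kolejarz': 127}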