Commit 94c509a1097b7a480243539c9be92756fcd90ebf
1 parent 0f6eeffb
Added 43 new features.
Showing 7 changed files with 468 additions and 34 deletions
conf.py
1 | 1 | import os |
2 | 2 | |
3 | -from gensim.models.word2vec import Word2Vec | |
3 | +import utils | |
4 | 4 | |
5 | -from corneferencer.utils import initialize_neural_model, load_freq_list | |
5 | +from gensim.models.word2vec import Word2Vec | |
6 | 6 | |
7 | 7 | |
8 | 8 | CONTEXT = 5 |
9 | -THRESHOLD = 0.5 | |
9 | +THRESHOLD = 0.95 | |
10 | 10 | RANDOM_WORD_VECTORS = True |
11 | 11 | W2V_SIZE = 50 |
12 | 12 | W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model' |
13 | 13 | |
14 | -NUMBER_OF_FEATURES = 1147 | |
15 | -NEURAL_MODEL_NAME = 'model_1147_features.h5' | |
14 | +NUMBER_OF_FEATURES = 1190 | |
15 | +NEURAL_MODEL_NAME = 'model_1190_features.h5' | |
16 | 16 | |
17 | 17 | FREQ_LIST_NAME = 'base.lst' |
18 | +LEMMA2SYNONYMS_NAME = 'lemma2synonyms.map' | |
19 | +LEMMA2HYPERNYMS_NAME = 'lemma2hypernyms.map' | |
20 | +TITLE2LINKS_NAME = 'link.map' | |
21 | +TITLE2REDIRECT_NAME = 'redirect.map' | |
22 | + | |
18 | 23 | |
19 | 24 | # do not change that |
20 | -W2V_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', W2V_MODEL_NAME) | |
25 | +MAIN_PATH = os.path.dirname(__file__) | |
26 | + | |
27 | +W2V_MODEL_PATH = os.path.join(MAIN_PATH, 'models', W2V_MODEL_NAME) | |
21 | 28 | W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH) |
22 | 29 | |
23 | -NEURAL_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', NEURAL_MODEL_NAME) | |
24 | -NEURAL_MODEL = initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
30 | +NEURAL_MODEL_PATH = os.path.join(MAIN_PATH, 'models', NEURAL_MODEL_NAME) | |
31 | +NEURAL_MODEL = utils.initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH) | |
32 | + | |
33 | +FREQ_LIST_PATH = os.path.join(MAIN_PATH, 'freq', FREQ_LIST_NAME) | |
34 | +FREQ_LIST = utils.load_freq_list(FREQ_LIST_PATH) | |
35 | + | |
36 | +LEMMA2SYNONYMS_PATH = os.path.join(MAIN_PATH, 'wordnet', LEMMA2SYNONYMS_NAME) | |
37 | +LEMMA2SYNONYMS = utils.load_one2many_map(LEMMA2SYNONYMS_PATH) | |
38 | + | |
39 | +LEMMA2HYPERNYMS_PATH = os.path.join(MAIN_PATH, 'wordnet', LEMMA2HYPERNYMS_NAME) | |
40 | +LEMMA2HYPERNYMS = utils.load_one2many_map(LEMMA2HYPERNYMS_PATH) | |
41 | + | |
42 | +TITLE2LINKS_PATH = os.path.join(MAIN_PATH, 'wikipedia', TITLE2LINKS_NAME) | |
43 | +TITLE2LINKS = utils.load_one2many_map(TITLE2LINKS_PATH) | |
25 | 44 | |
26 | -FREQ_LIST_PATH = os.path.join(os.path.dirname(__file__), 'freq', FREQ_LIST_NAME) | |
27 | -FREQ_LIST = load_freq_list(FREQ_LIST_PATH) | |
45 | +TITLE2REDIRECT_PATH = os.path.join(MAIN_PATH, 'wikipedia', TITLE2REDIRECT_NAME) | |
46 | +TITLE2REDIRECT = utils.load_one2one_map(TITLE2REDIRECT_PATH) | |
... | ... |
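
Note on conf.py: THRESHOLD rises from 0.5 to 0.95, all resource paths now hang off a single MAIN_PATH, and four Java-serialized maps are loaded at import time alongside the word2vec and neural models. The switch from from-imports to module-level imports (import utils here, import conf in features.py below) looks like a circular-import guard: module attributes are then resolved lazily at call time rather than at import time. A minimal usage sketch of the new maps (hypothetical lemma and title strings; assumes the maps hold lowercase Polish lemmas/titles, as the wordnet/ and wikipedia/ directories suggest):

import conf

synonyms = conf.LEMMA2SYNONYMS.get(u'auto', set())    # one-to-many: lemma -> set of synonym lemmas
links = conf.TITLE2LINKS.get(u'warszawa', set())      # one-to-many: title -> set of linked article titles
redirect = conf.TITLE2REDIRECT.get(u'pzpr')           # one-to-one: title -> redirect target, or None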
corneferencer/entities.py
... | ... | @@ -19,7 +19,8 @@ class Mention: |
19 | 19 | def __init__(self, mnt_id, text, lemmatized_text, words, span, |
20 | 20 | head_orth, head, dominant, node, prec_context, |
21 | 21 | follow_context, sentence, position_in_mentions, |
22 | - start_in_words, end_in_words, rarest, paragraph_id, sentence_id): | |
22 | + start_in_words, end_in_words, rarest, paragraph_id, sentence_id, | |
23 | + first_in_sentence, first_in_paragraph): | |
23 | 24 | self.id = mnt_id |
24 | 25 | self.set = '' |
25 | 26 | self.old_set = '' |
... | ... | @@ -37,7 +38,9 @@ class Mention: |
37 | 38 | self.position_in_mentions = position_in_mentions |
38 | 39 | self.start_in_words = start_in_words |
39 | 40 | self.end_in_words = end_in_words |
40 | - self.features = get_mention_features(self) | |
41 | 41 | self.rarest = rarest |
42 | 42 | self.paragraph_id = paragraph_id |
43 | 43 | self.sentence_id = sentence_id |
44 | + self.first_in_sentence = first_in_sentence | |
45 | + self.first_in_paragraph = first_in_paragraph | |
46 | + self.features = get_mention_features(self) | |
... | ... |
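
Note on entities.py: moving self.features = get_mention_features(self) to the end of __init__ is load-bearing, not cosmetic. The new single-mention features read first_in_sentence (and downstream pair features read first_in_paragraph), so the feature vector can only be computed after those attributes are assigned.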
corneferencer/inout/mmax.py
... | ... | @@ -39,7 +39,8 @@ def read_mentions(mentions_path, words_path): |
39 | 39 | |
40 | 40 | (prec_context, follow_context, sentence, |
41 | 41 | mnt_start_position, mnt_end_position, |
42 | - paragraph_id, sentence_id) = get_context(mention_words, words) | |
42 | + paragraph_id, sentence_id, | |
43 | + first_in_sentence, first_in_paragraph) = get_context(mention_words, words) | |
43 | 44 | |
44 | 45 | head = get_head(head_orth, mention_words) |
45 | 46 | mention = Mention(mnt_id=markable.attrib['id'], |
... | ... | @@ -59,7 +60,9 @@ def read_mentions(mentions_path, words_path): |
59 | 60 | end_in_words=mnt_end_position, |
60 | 61 | rarest=get_rarest_word(mention_words), |
61 | 62 | paragraph_id=paragraph_id, |
62 | - sentence_id=sentence_id) | |
63 | + sentence_id=sentence_id, | |
64 | + first_in_sentence=first_in_sentence, | |
65 | + first_in_paragraph=first_in_paragraph) | |
63 | 66 | mentions.append(mention) |
64 | 67 | |
65 | 68 | return mentions |
... | ... | @@ -151,10 +154,16 @@ def get_context(mention_words, words): |
151 | 154 | mnt_end_position = -1 |
152 | 155 | first_word = mention_words[0] |
153 | 156 | last_word = mention_words[-1] |
157 | + first_in_sentence = False | |
158 | + first_in_paragraph = False | |
154 | 159 | for idx, word in enumerate(words): |
155 | 160 | if word['id'] == first_word['id']: |
156 | 161 | prec_context = get_prec_context(idx, words) |
157 | 162 | mnt_start_position = get_mention_start(first_word, words) |
163 | + if idx == 0 or words[idx-1]['lastinsent']: | |
164 | + first_in_sentence = True | |
165 | + if idx == 0 or words[idx-1]['lastinpar']: | |
166 | + first_in_paragraph = True | |
158 | 167 | if word['id'] == last_word['id']: |
159 | 168 | follow_context = get_follow_context(idx, words) |
160 | 169 | sentence = get_sentence(idx, words) |
... | ... | @@ -164,7 +173,8 @@ def get_context(mention_words, words): |
164 | 173 | sentence_id += 1 |
165 | 174 | if word['lastinpar']: |
166 | 175 | paragraph_id += 1 |
167 | - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id | |
176 | + return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, | |
177 | + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) | |
168 | 178 | |
169 | 179 | |
170 | 180 | def get_prec_context(mention_start, words): |
... | ... |
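
Note on mmax.py: the new flags derive purely from the previous word's end-of-sentence/end-of-paragraph markers. A toy walk-through (hypothetical word dicts; assumes 'lastinsent'/'lastinpar' are truthy flags on the MMAX word layer, as get_context already requires):

words = [
    {'id': 'w1', 'lastinsent': False, 'lastinpar': False},
    {'id': 'w2', 'lastinsent': True,  'lastinpar': False},  # sentence boundary after w2
    {'id': 'w3', 'lastinsent': False, 'lastinpar': False},
]
# A mention starting at w1 gets first_in_sentence = first_in_paragraph = True (idx == 0).
# A mention starting at w3 gets first_in_sentence = True (words[1]['lastinsent'] holds),
# while first_in_paragraph stays False.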
corneferencer/resolvers/constants.py
... | ... | @@ -3,3 +3,10 @@ RESOLVERS = ['entity_based', 'incremental'] |
3 | 3 | NOUN_TAGS = ['subst', 'ger', 'depr'] |
4 | 4 | PPRON_TAGS = ['ppron12', 'ppron3'] |
5 | 5 | ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt'] |
6 | +SIEBIE_TAGS = ['siebie'] | |
7 | +MASCULINE_TAGS = ['m1', 'm2', 'm3'] | |
8 | + | |
9 | +FIRST_SECOND_PERSON = ['pri', 'sec'] | |
10 | +INDICATIVE_PRONS_BASES = [u'ten', u'ta', u'to', u'ci', u'te', u'tamten', u'tamta', | |
11 | + u'tamto', u'tamci', u'tamte', u'ów', u'owa', u'owo', u'owi', | |
12 | + u'owe'] | |
... | ... |
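
Note on constants.py, for readers outside the NKJP tagset: MASCULINE_TAGS groups the three Polish masculine subgenders (m1 personal, m2 animate, m3 inanimate) so that flat_gender_agreement below can score, say, an m1 head against an m3 head as agreement rather than a mismatch. INDICATIVE_PRONS_BASES lists demonstrative pronoun lemmas ('ten', 'tamten', 'ów' and their gender/number variants) consumed by is_demonstrative.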
corneferencer/resolvers/features.py
1 | +import math | |
1 | 2 | import numpy |
2 | 3 | import random |
4 | +import re | |
3 | 5 | |
4 | -from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE | |
6 | +import conf | |
5 | 7 | from corneferencer.resolvers import constants |
6 | 8 | |
7 | 9 | |
... | ... | @@ -10,63 +12,63 @@ def head_vec(mention): |
10 | 12 | head_base = mention.head_orth |
11 | 13 | if mention.head is not None: |
12 | 14 | head_base = mention.head['base'] |
13 | - return list(get_wv(W2V_MODEL, head_base)) | |
15 | + return list(get_wv(conf.W2V_MODEL, head_base)) | |
14 | 16 | |
15 | 17 | |
16 | 18 | def first_word_vec(mention): |
17 | - return list(get_wv(W2V_MODEL, mention.words[0]['base'])) | |
19 | + return list(get_wv(conf.W2V_MODEL, mention.words[0]['base'])) | |
18 | 20 | |
19 | 21 | |
20 | 22 | def last_word_vec(mention): |
21 | - return list(get_wv(W2V_MODEL, mention.words[-1]['base'])) | |
23 | + return list(get_wv(conf.W2V_MODEL, mention.words[-1]['base'])) | |
22 | 24 | |
23 | 25 | |
24 | 26 | def first_after_vec(mention): |
25 | 27 | if len(mention.follow_context) > 0: |
26 | - vec = list(get_wv(W2V_MODEL, mention.follow_context[0]['base'])) | |
28 | + vec = list(get_wv(conf.W2V_MODEL, mention.follow_context[0]['base'])) | |
27 | 29 | else: |
28 | - vec = [0.0] * W2V_SIZE | |
30 | + vec = [0.0] * conf.W2V_SIZE | |
29 | 31 | return vec |
30 | 32 | |
31 | 33 | |
32 | 34 | def second_after_vec(mention): |
33 | 35 | if len(mention.follow_context) > 1: |
34 | - vec = list(get_wv(W2V_MODEL, mention.follow_context[1]['base'])) | |
36 | + vec = list(get_wv(conf.W2V_MODEL, mention.follow_context[1]['base'])) | |
35 | 37 | else: |
36 | - vec = [0.0] * W2V_SIZE | |
38 | + vec = [0.0] * conf.W2V_SIZE | |
37 | 39 | return vec |
38 | 40 | |
39 | 41 | |
40 | 42 | def first_before_vec(mention): |
41 | 43 | if len(mention.prec_context) > 0: |
42 | - vec = list(get_wv(W2V_MODEL, mention.prec_context[-1]['base'])) | |
44 | + vec = list(get_wv(conf.W2V_MODEL, mention.prec_context[-1]['base'])) | |
43 | 45 | else: |
44 | - vec = [0.0] * W2V_SIZE | |
46 | + vec = [0.0] * conf.W2V_SIZE | |
45 | 47 | return vec |
46 | 48 | |
47 | 49 | |
48 | 50 | def second_before_vec(mention): |
49 | 51 | if len(mention.prec_context) > 1: |
50 | - vec = list(get_wv(W2V_MODEL, mention.prec_context[-2]['base'])) | |
52 | + vec = list(get_wv(conf.W2V_MODEL, mention.prec_context[-2]['base'])) | |
51 | 53 | else: |
52 | - vec = [0.0] * W2V_SIZE | |
54 | + vec = [0.0] * conf.W2V_SIZE | |
53 | 55 | return vec |
54 | 56 | |
55 | 57 | |
56 | 58 | def preceding_context_vec(mention): |
57 | - return list(get_context_vec(mention.prec_context, W2V_MODEL)) | |
59 | + return list(get_context_vec(mention.prec_context, conf.W2V_MODEL)) | |
58 | 60 | |
59 | 61 | |
60 | 62 | def following_context_vec(mention): |
61 | - return list(get_context_vec(mention.follow_context, W2V_MODEL)) | |
63 | + return list(get_context_vec(mention.follow_context, conf.W2V_MODEL)) | |
62 | 64 | |
63 | 65 | |
64 | 66 | def mention_vec(mention): |
65 | - return list(get_context_vec(mention.words, W2V_MODEL)) | |
67 | + return list(get_context_vec(mention.words, conf.W2V_MODEL)) | |
66 | 68 | |
67 | 69 | |
68 | 70 | def sentence_vec(mention): |
69 | - return list(get_context_vec(mention.sentence, W2V_MODEL)) | |
71 | + return list(get_context_vec(mention.sentence, conf.W2V_MODEL)) | |
70 | 72 | |
71 | 73 | |
72 | 74 | def mention_type(mention): |
... | ... | @@ -84,6 +86,75 @@ def mention_type(mention): |
84 | 86 | return type_vec |
85 | 87 | |
86 | 88 | |
89 | +def is_first_second_person(mention): | |
90 | + if mention.head['person'] in constants.FIRST_SECOND_PERSON: | |
91 | + return 1 | |
92 | + return 0 | |
93 | + | |
94 | + | |
95 | +def is_demonstrative(mention): | |
96 | + if mention.words[0]['base'].lower() in constants.INDICATIVE_PRONS_BASES: | |
97 | + return 1 | |
98 | + return 0 | |
99 | + | |
100 | + | |
101 | +def is_demonstrative_nominal(mention): | |
102 | + if is_demonstrative(mention) and mention.head['ctag'] in constants.NOUN_TAGS: | |
103 | + return 1 | |
104 | + return 0 | |
105 | + | |
106 | + | |
107 | +def is_demonstrative_pronoun(mention): | |
108 | + if (is_demonstrative(mention) and | |
109 | + (mention.head['ctag'] in constants.PPRON_TAGS or mention.head['ctag'] in constants.ZERO_TAGS)): | |
110 | + return 1 | |
111 | + return 0 | |
112 | + | |
113 | + | |
114 | +def is_refl_pronoun(mention): | |
115 | + if mention.head['ctag'] in constants.SIEBIE_TAGS: | |
116 | + return 1 | |
117 | + return 0 | |
118 | + | |
119 | + | |
120 | +def is_first_in_sentence(mention): | |
121 | + if mention.first_in_sentence: | |
122 | + return 1 | |
123 | + return 0 | |
124 | + | |
125 | + | |
126 | +def is_zero_or_pronoun(mention): | |
127 | + if mention.head['ctag'] in constants.PPRON_TAGS or mention.head['ctag'] in constants.ZERO_TAGS: | |
128 | + return 1 | |
129 | + return 0 | |
130 | + | |
131 | + | |
132 | +def head_contains_digit(mention): | |
133 | +    _digits = re.compile(r'\d') | |
134 | + if _digits.search(mention.head_orth): | |
135 | + return 1 | |
136 | + return 0 | |
137 | + | |
138 | + | |
139 | +def mention_contains_digit(mention): | |
140 | +    _digits = re.compile(r'\d') | |
141 | + if _digits.search(mention.text): | |
142 | + return 1 | |
143 | + return 0 | |
144 | + | |
145 | + | |
146 | +def contains_letter(mention): | |
147 | + if any(c.isalpha() for c in mention.text): | |
148 | + return 1 | |
149 | + return 0 | |
150 | + | |
151 | + | |
152 | +def post_modified(mention): | |
153 | + if mention.head['orth'] != mention.words[-1]['orth']: | |
154 | + return 1 | |
155 | + return 0 | |
156 | + | |
157 | + | |
87 | 158 | # pair features |
88 | 159 | def distances_vec(ante, ana): |
89 | 160 | vec = [] |
... | ... | @@ -171,12 +242,207 @@ def same_sentence(ante, ana): |
171 | 242 | return 0 |
172 | 243 | |
173 | 244 | |
245 | +def neighbouring_sentence(ante, ana): | |
246 | + if ana.sentence_id - ante.sentence_id == 1: | |
247 | + return 1 | |
248 | + return 0 | |
249 | + | |
250 | + | |
251 | +def cousin_sentence(ante, ana): | |
252 | + if ana.sentence_id - ante.sentence_id == 2: | |
253 | + return 1 | |
254 | + return 0 | |
255 | + | |
256 | + | |
257 | +def distant_sentence(ante, ana): | |
258 | + if ana.sentence_id - ante.sentence_id > 2: | |
259 | + return 1 | |
260 | + return 0 | |
261 | + | |
262 | + | |
174 | 263 | def same_paragraph(ante, ana): |
175 | 264 | if ante.paragraph_id == ana.paragraph_id: |
176 | 265 | return 1 |
177 | 266 | return 0 |
178 | 267 | |
179 | 268 | |
269 | +def flat_gender_agreement(ante, ana): | |
270 | + agr_vec = [0] * 3 | |
271 | + if ante.head['gender'] == 'unk' or ana.head['gender'] == 'unk': | |
272 | + agr_vec[2] = 1 | |
273 | + elif (ante.head['gender'] == ana.head['gender'] or | |
274 | + (ante.head['gender'] in constants.MASCULINE_TAGS and ana.head['gender'] in constants.MASCULINE_TAGS)): | |
275 | + agr_vec[0] = 1 | |
276 | + else: | |
277 | + agr_vec[1] = 1 | |
278 | + return agr_vec | |
279 | + | |
280 | + | |
281 | +def left_match(ante, ana): | |
282 | + if (ante.text.lower().startswith(ana.text.lower()) or | |
283 | + ana.text.lower().startswith(ante.text.lower())): | |
284 | + return 1 | |
285 | + return 0 | |
286 | + | |
287 | + | |
288 | +def right_match(ante, ana): | |
289 | + if (ante.text.lower().endswith(ana.text.lower()) or | |
290 | + ana.text.lower().endswith(ante.text.lower())): | |
291 | + return 1 | |
292 | + return 0 | |
293 | + | |
294 | + | |
295 | +def abbrev2(ante, ana): | |
296 | + ante_abbrev = get_abbrev(ante) | |
297 | + ana_abbrev = get_abbrev(ana) | |
298 | + if ante.head_orth == ana_abbrev or ana.head_orth == ante_abbrev: | |
299 | + return 1 | |
300 | + return 0 | |
301 | + | |
302 | + | |
303 | +def string_kernel(ante, ana): | |
304 | + s1 = ante.text | |
305 | + s2 = ana.text | |
306 | + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
307 | + | |
308 | + | |
309 | +def head_string_kernel(ante, ana): | |
310 | + s1 = ante.head_orth | |
311 | + s2 = ana.head_orth | |
312 | + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
313 | + | |
314 | + | |
315 | +def wordnet_synonyms(ante, ana): | |
316 | + ante_synonyms = set() | |
317 | + if ante.head['base'] in conf.LEMMA2SYNONYMS: | |
318 | + ante_synonyms = conf.LEMMA2SYNONYMS[ante.head['base']] | |
319 | + | |
320 | + ana_synonyms = set() | |
321 | + if ana.head['base'] in conf.LEMMA2SYNONYMS: | |
322 | + ana_synonyms = conf.LEMMA2SYNONYMS[ana.head['base']] | |
323 | + | |
324 | + if ana.head['base'] in ante_synonyms or ante.head['base'] in ana_synonyms: | |
325 | + return 1 | |
326 | + return 0 | |
327 | + | |
328 | + | |
329 | +def wordnet_ana_is_hypernym(ante, ana): | |
330 | + ante_hypernyms = set() | |
331 | + if ante.head['base'] in conf.LEMMA2HYPERNYMS: | |
332 | + ante_hypernyms = conf.LEMMA2HYPERNYMS[ante.head['base']] | |
333 | + | |
334 | + ana_hypernyms = set() | |
335 | + if ana.head['base'] in conf.LEMMA2HYPERNYMS: | |
336 | + ana_hypernyms = conf.LEMMA2HYPERNYMS[ana.head['base']] | |
337 | + | |
338 | + if not ante_hypernyms or not ana_hypernyms: | |
339 | + return 0 | |
340 | + | |
341 | + if ana.head['base'] in ante_hypernyms: | |
342 | + return 1 | |
343 | + return 0 | |
344 | + | |
345 | + | |
346 | +def wordnet_ante_is_hypernym(ante, ana): | |
347 | + ana_hypernyms = set() | |
348 | + if ana.head['base'] in conf.LEMMA2HYPERNYMS: | |
349 | + ana_hypernyms = conf.LEMMA2HYPERNYMS[ana.head['base']] | |
350 | + | |
351 | + ante_hypernyms = set() | |
352 | + if ante.head['base'] in conf.LEMMA2HYPERNYMS: | |
353 | + ante_hypernyms = conf.LEMMA2HYPERNYMS[ante.head['base']] | |
354 | + | |
355 | + if not ante_hypernyms or not ana_hypernyms: | |
356 | + return 0 | |
357 | + | |
358 | + if ante.head['base'] in ana_hypernyms: | |
359 | + return 1 | |
360 | + return 0 | |
361 | + | |
362 | + | |
363 | +def wikipedia_link(ante, ana): | |
364 | + ante_base = ante.lemmatized_text.lower() | |
365 | + ana_base = ana.lemmatized_text.lower() | |
366 | + if ante_base == ana_base: | |
367 | + return 1 | |
368 | + | |
369 | + ante_links = set() | |
370 | + if ante_base in conf.TITLE2LINKS: | |
371 | + ante_links = conf.TITLE2LINKS[ante_base] | |
372 | + | |
373 | + ana_links = set() | |
374 | + if ana_base in conf.TITLE2LINKS: | |
375 | + ana_links = conf.TITLE2LINKS[ana_base] | |
376 | + | |
377 | + if ana_base in ante_links or ante_base in ana_links: | |
378 | + return 1 | |
379 | + | |
380 | + return 0 | |
381 | + | |
382 | + | |
383 | +def wikipedia_mutual_link(ante, ana): | |
384 | + ante_base = ante.lemmatized_text.lower() | |
385 | + ana_base = ana.lemmatized_text.lower() | |
386 | + if ante_base == ana_base: | |
387 | + return 1 | |
388 | + | |
389 | + ante_links = set() | |
390 | + if ante_base in conf.TITLE2LINKS: | |
391 | + ante_links = conf.TITLE2LINKS[ante_base] | |
392 | + | |
393 | + ana_links = set() | |
394 | + if ana_base in conf.TITLE2LINKS: | |
395 | + ana_links = conf.TITLE2LINKS[ana_base] | |
396 | + | |
397 | + if ana_base in ante_links and ante_base in ana_links: | |
398 | + return 1 | |
399 | + | |
400 | + return 0 | |
401 | + | |
402 | + | |
403 | +def wikipedia_redirect(ante, ana): | |
404 | + ante_base = ante.lemmatized_text.lower() | |
405 | + ana_base = ana.lemmatized_text.lower() | |
406 | + if ante_base == ana_base: | |
407 | + return 1 | |
408 | + | |
409 | + if ante_base in conf.TITLE2REDIRECT and conf.TITLE2REDIRECT[ante_base] == ana_base: | |
410 | + return 1 | |
411 | + | |
412 | + if ana_base in conf.TITLE2REDIRECT and conf.TITLE2REDIRECT[ana_base] == ante_base: | |
413 | + return 1 | |
414 | + | |
415 | + return 0 | |
416 | + | |
417 | + | |
418 | +def samesent_anapron_antefirstinpar(ante, ana): | |
419 | + if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante.first_in_paragraph: | |
420 | + return 1 | |
421 | + return 0 | |
422 | + | |
423 | + | |
424 | +def samesent_antefirstinpar_personnumbermatch(ante, ana): | |
425 | + if (same_sentence(ante, ana) and ante.first_in_paragraph | |
426 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
427 | + return 1 | |
428 | + return 0 | |
429 | + | |
430 | + | |
431 | +def adjsent_anapron_adjmen_personnumbermatch(ante, ana): | |
432 | + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) | |
433 | + and ana.position_in_mentions - ante.position_in_mentions == 1 | |
434 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
435 | + return 1 | |
436 | + return 0 | |
437 | + | |
438 | + | |
439 | +def adjsent_anapron_adjmen(ante, ana): | |
440 | + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) | |
441 | + and ana.position_in_mentions - ante.position_in_mentions == 1): | |
442 | + return 1 | |
443 | + return 0 | |
444 | + | |
445 | + | |
180 | 446 | # supporting functions |
181 | 447 | def get_wv(model, lemma, use_random_vec=True): |
182 | 448 | vec = None |
... | ... | @@ -192,15 +458,15 @@ def get_wv(model, lemma, use_random_vec=True): |
192 | 458 | |
193 | 459 | |
194 | 460 | def random_vec(): |
195 | - return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, W2V_SIZE)], dtype=numpy.float32) | |
461 | + return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, conf.W2V_SIZE)], dtype=numpy.float32) | |
196 | 462 | |
197 | 463 | |
198 | 464 | def get_context_vec(words, model): |
199 | - vec = numpy.zeros(W2V_SIZE, dtype=numpy.float32) | |
465 | + vec = numpy.zeros(conf.W2V_SIZE, dtype=numpy.float32) | |
200 | 466 | unknown_count = 0 |
201 | 467 | if len(words) != 0: |
202 | 468 | for word in words: |
203 | - word_vec = get_wv(model, word['base'], RANDOM_WORD_VECTORS) | |
469 | + word_vec = get_wv(model, word['base'], conf.RANDOM_WORD_VECTORS) | |
204 | 470 | if word_vec is None: |
205 | 471 | unknown_count += 1 |
206 | 472 | else: |
... | ... | @@ -239,3 +505,65 @@ def check_one_way_acronym(acronym, expression): |
239 | 505 | if acronym == initials: |
240 | 506 | return 1 |
241 | 507 | return 0 |
508 | + | |
509 | + | |
510 | +def get_abbrev(mention): | |
511 | + abbrev = u'' | |
512 | + for word in mention.words: | |
513 | + if word['orth'][0].isupper(): | |
514 | + abbrev += word['orth'][0] | |
515 | + return abbrev | |
516 | + | |
517 | + | |
518 | +def SK(s1, s2): | |
519 | + LAMBDA = 0.4 | |
520 | + | |
521 | + p = len(s1) | |
522 | + if len(s2) < len(s1): | |
523 | + p = len(s2) | |
524 | + | |
525 | + h, w = len(s1)+1, len(s2)+1 | |
526 | + DPS = [[0.0] * w for i in range(h)] | |
527 | + DP = [[0.0] * w for i in range(h)] | |
528 | + | |
529 | + kernel_mat = [0.0] * (len(s1) + 1) | |
530 | + | |
531 | + for i in range(len(s1)+1): | |
532 | + if i == 0: | |
533 | + continue | |
534 | + for j in range(len(s2)+1): | |
535 | + if j == 0: | |
536 | + continue | |
537 | + if s1[i-1] == s2[j-1]: | |
538 | + DPS[i][j] = LAMBDA * LAMBDA | |
539 | + kernel_mat[0] += DPS[i][j] | |
540 | + else: | |
541 | + DPS[i][j] = 0.0 | |
542 | + | |
543 | + for l in range(p): | |
544 | + if l == 0: | |
545 | + continue | |
546 | + | |
547 | + kernel_mat[l] = 0.0 | |
548 | + for j in range(len(s2)+1): | |
549 | + DP[l-1][j] = 0.0 | |
550 | + | |
551 | + for i in range(len(s1)+1): | |
552 | + DP[i][l-1] = 0.0 | |
553 | + | |
554 | + for i in range(len(s1)+1): | |
555 | + if i < l: | |
556 | + continue | |
557 | + for j in range(len(s2)+1): | |
558 | + if j < l: | |
559 | + continue | |
560 | + DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
561 | + | |
562 | + if s1[i-1] == s2[j-1]: | |
563 | + DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
564 | + kernel_mat[l] += DPS[i][j] | |
565 | + | |
566 | + K = 0.0 | |
567 | + for l in range(p): | |
568 | + K += kernel_mat[l] | |
569 | + return K | |
... | ... |
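
Note on features.py: SK is a subsequence string kernel with decay LAMBDA = 0.4 in the style of Lodhi et al., and string_kernel/head_string_kernel normalize it so that identical strings score exactly 1.0. A standalone sanity check (mirrors the normalization used above):

import math
from corneferencer.resolvers.features import SK

s1, s2 = u'Warszawa', u'Warszawie'
sim = SK(s1, s2) / math.sqrt(SK(s1, s1) * SK(s2, s2))
assert 0.0 < sim <= 1.0   # shared subsequences push the score toward 1.0

One caveat worth flagging: SK(s, s) is 0.0 for an empty string, so string_kernel would raise ZeroDivisionError on a mention with empty text; mention text is presumably never empty, but there is no guard.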
corneferencer/resolvers/vectors.py
... | ... | @@ -27,6 +27,19 @@ def get_mention_features(mention): |
27 | 27 | # supplementary features |
28 | 28 | vec.extend(features.mention_type(mention)) |
29 | 29 | |
30 | +    # supplementary features 2 | |
31 | + vec.append(features.is_first_second_person(mention)) | |
32 | + vec.append(features.is_demonstrative(mention)) | |
33 | + vec.append(features.is_demonstrative_nominal(mention)) | |
34 | + vec.append(features.is_demonstrative_pronoun(mention)) | |
35 | + vec.append(features.is_refl_pronoun(mention)) | |
36 | + vec.append(features.is_first_in_sentence(mention)) | |
37 | + vec.append(features.is_zero_or_pronoun(mention)) | |
38 | + vec.append(features.head_contains_digit(mention)) | |
39 | + vec.append(features.mention_contains_digit(mention)) | |
40 | + vec.append(features.contains_letter(mention)) | |
41 | + vec.append(features.post_modified(mention)) | |
42 | + | |
30 | 43 | return vec |
31 | 44 | |
32 | 45 | |
... | ... | @@ -46,4 +59,30 @@ def get_pair_features(ante, ana): |
46 | 59 | vec.append(features.same_sentence(ante, ana)) |
47 | 60 | vec.append(features.same_paragraph(ante, ana)) |
48 | 61 | |
62 | +    # supplementary features 2 | |
63 | + vec.append(features.neighbouring_sentence(ante, ana)) | |
64 | + vec.append(features.cousin_sentence(ante, ana)) | |
65 | + vec.append(features.distant_sentence(ante, ana)) | |
66 | + vec.extend(features.flat_gender_agreement(ante, ana)) | |
67 | + vec.append(features.left_match(ante, ana)) | |
68 | + vec.append(features.right_match(ante, ana)) | |
69 | + vec.append(features.abbrev2(ante, ana)) | |
70 | + | |
71 | + vec.append(features.string_kernel(ante, ana)) | |
72 | + vec.append(features.head_string_kernel(ante, ana)) | |
73 | + | |
74 | + vec.append(features.wordnet_synonyms(ante, ana)) | |
75 | + vec.append(features.wordnet_ana_is_hypernym(ante, ana)) | |
76 | + vec.append(features.wordnet_ante_is_hypernym(ante, ana)) | |
77 | + | |
78 | + vec.append(features.wikipedia_link(ante, ana)) | |
79 | + vec.append(features.wikipedia_mutual_link(ante, ana)) | |
80 | + vec.append(features.wikipedia_redirect(ante, ana)) | |
81 | + | |
82 | + # combined | |
83 | + vec.append(features.samesent_anapron_antefirstinpar(ante, ana)) | |
84 | + vec.append(features.samesent_antefirstinpar_personnumbermatch(ante, ana)) | |
85 | + vec.append(features.adjsent_anapron_adjmen_personnumbermatch(ante, ana)) | |
86 | + vec.append(features.adjsent_anapron_adjmen(ante, ana)) | |
87 | + | |
49 | 88 | return vec |
... | ... |
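
Note on vectors.py: the counts reconcile with the commit message. The 11 new single-mention features are counted twice per pair (once for the antecedent, once for the anaphor, assuming get_pair_features concatenates both mentions' feature vectors as before), and the new pair features add 21 dimensions (18 scalars plus the 3-dimensional flat_gender_agreement), so 2 × 11 + 21 = 43, exactly the jump of NUMBER_OF_FEATURES from 1147 to 1190.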
corneferencer/utils.py
... | ... | @@ -3,6 +3,8 @@ from __future__ import print_function |
3 | 3 | import codecs |
4 | 4 | import sys |
5 | 5 | |
6 | +import javaobj | |
7 | + | |
6 | 8 | from keras.models import Model |
7 | 9 | from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization |
8 | 10 | |
... | ... | @@ -46,3 +48,29 @@ def load_freq_list(freq_path): |
46 | 48 | if base not in freq_list: |
47 | 49 | freq_list[base] = freq |
48 | 50 | return freq_list |
51 | + | |
52 | + | |
53 | +def load_one2many_map(map_path): | |
54 | + this_map = {} | |
55 | + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) | |
56 | + pobj = marshaller.readObject() | |
57 | + jmap_annotations = pobj.__dict__['annotations'] | |
58 | + jmap_annotations_count = len(jmap_annotations) | |
59 | + for i in range(jmap_annotations_count): | |
60 | + if i%2 == 1: | |
61 | + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations']) | |
62 | + this_map[jmap_annotations[i]] = mapped_elements | |
63 | + return this_map | |
64 | + | |
65 | + | |
66 | +def load_one2one_map(map_path): | |
67 | + this_map = {} | |
68 | + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) | |
69 | + pobj = marshaller.readObject() | |
70 | + jmap_annotations = pobj.__dict__['annotations'] | |
71 | + jmap_annotations_count = len(jmap_annotations) | |
72 | + for i in range(jmap_annotations_count): | |
73 | + if i%2 == 1: | |
74 | + element = jmap_annotations[i+1] | |
75 | + this_map[jmap_annotations[i]] = element | |
76 | + return this_map | |
... | ... |
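
Note on utils.py: both loaders rely on javaobj deserializing a Java HashMap whose payload lands in pobj.annotations as a flat sequence, with each key at an odd index and its value immediately after it; the i % 2 == 1 test encodes exactly that layout assumption. A minimal usage sketch (hypothetical path; assumes a .map file in that format):

from corneferencer import utils

lemma2synonyms = utils.load_one2many_map('lemma2synonyms.map')  # hypothetical local path
for lemma, synonyms in list(lemma2synonyms.items())[:3]:
    print(lemma, sorted(synonyms))

A small nit: open(map_path, 'rb') is never closed in either loader; wrapping it in a with-block would be tidier.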