Commit 94c509a1097b7a480243539c9be92756fcd90ebf

Authored by Bartłomiej Nitoń
1 parent 0f6eeffb

Added 43 new features.
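
For orientation, the network input width grows from 1147 to 1190, i.e. by exactly 43. This appears to decompose as 2 × 11 new single-mention features (each mention's feature vector enters the pair input twice, once for the antecedent and once for the anaphor) plus 21 new pair dimensions (18 scalar pair features and the 3-element flat_gender_agreement vector). All new features are binary indicators except string_kernel and head_string_kernel, which are real-valued similarities in [0, 1].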

1 1 import os
2 2  
3   -from gensim.models.word2vec import Word2Vec
  3 +import utils
4 4  
5   -from corneferencer.utils import initialize_neural_model, load_freq_list
  5 +from gensim.models.word2vec import Word2Vec
6 6  
7 7  
8 8 CONTEXT = 5
9   -THRESHOLD = 0.5
  9 +THRESHOLD = 0.95
10 10 RANDOM_WORD_VECTORS = True
11 11 W2V_SIZE = 50
12 12 W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model'
13 13  
14   -NUMBER_OF_FEATURES = 1147
15   -NEURAL_MODEL_NAME = 'model_1147_features.h5'
  14 +NUMBER_OF_FEATURES = 1190
  15 +NEURAL_MODEL_NAME = 'model_1190_features.h5'
16 16  
17 17 FREQ_LIST_NAME = 'base.lst'
  18 +LEMMA2SYNONYMS_NAME = 'lemma2synonyms.map'
  19 +LEMMA2HYPERNYMS_NAME = 'lemma2hypernyms.map'
  20 +TITLE2LINKS_NAME = 'link.map'
  21 +TITLE2REDIRECT_NAME = 'redirect.map'
  22 +
18 23  
19 24 # do not change this
20   -W2V_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', W2V_MODEL_NAME)
  25 +MAIN_PATH = os.path.dirname(__file__)
  26 +
  27 +W2V_MODEL_PATH = os.path.join(MAIN_PATH, 'models', W2V_MODEL_NAME)
21 28 W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH)
22 29  
23   -NEURAL_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', NEURAL_MODEL_NAME)
24   -NEURAL_MODEL = initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH)
  30 +NEURAL_MODEL_PATH = os.path.join(MAIN_PATH, 'models', NEURAL_MODEL_NAME)
  31 +NEURAL_MODEL = utils.initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH)
  32 +
  33 +FREQ_LIST_PATH = os.path.join(MAIN_PATH, 'freq', FREQ_LIST_NAME)
  34 +FREQ_LIST = utils.load_freq_list(FREQ_LIST_PATH)
  35 +
  36 +LEMMA2SYNONYMS_PATH = os.path.join(MAIN_PATH, 'wordnet', LEMMA2SYNONYMS_NAME)
  37 +LEMMA2SYNONYMS = utils.load_one2many_map(LEMMA2SYNONYMS_PATH)
  38 +
  39 +LEMMA2HYPERNYMS_PATH = os.path.join(MAIN_PATH, 'wordnet', LEMMA2HYPERNYMS_NAME)
  40 +LEMMA2HYPERNYMS = utils.load_one2many_map(LEMMA2HYPERNYMS_PATH)
  41 +
  42 +TITLE2LINKS_PATH = os.path.join(MAIN_PATH, 'wikipedia', TITLE2LINKS_NAME)
  43 +TITLE2LINKS = utils.load_one2many_map(TITLE2LINKS_PATH)
25 44  
26   -FREQ_LIST_PATH = os.path.join(os.path.dirname(__file__), 'freq', FREQ_LIST_NAME)
27   -FREQ_LIST = load_freq_list(FREQ_LIST_PATH)
  45 +TITLE2REDIRECT_PATH = os.path.join(MAIN_PATH, 'wikipedia', TITLE2REDIRECT_NAME)
  46 +TITLE2REDIRECT = utils.load_one2one_map(TITLE2REDIRECT_PATH)
... ...
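Note on conf.py: every resource (word2vec model, neural net, frequency list, two wordnet maps, two Wikipedia maps) is now loaded eagerly at import time, and the switch to a bare import utils assumes the package directory itself is on sys.path when conf is imported. A minimal sketch of how the new lookup tables are consumed downstream; the lemma and title strings are hypothetical:

import conf

# one-to-many wordnet maps: lemma -> set of related lemmas
synonyms = conf.LEMMA2SYNONYMS.get(u'auto', set())    # hypothetical lemma
hypernyms = conf.LEMMA2HYPERNYMS.get(u'auto', set())

# Wikipedia maps: lowercased lemmatized title -> linked titles / redirect target
links = conf.TITLE2LINKS.get(u'polska', set())        # hypothetical title
redirect = conf.TITLE2REDIRECT.get(u'polska')         # None if there is no redirect entry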
corneferencer/entities.py
... ... @@ -19,7 +19,8 @@ class Mention:
19 19 def __init__(self, mnt_id, text, lemmatized_text, words, span,
20 20 head_orth, head, dominant, node, prec_context,
21 21 follow_context, sentence, position_in_mentions,
22   - start_in_words, end_in_words, rarest, paragraph_id, sentence_id):
  22 + start_in_words, end_in_words, rarest, paragraph_id, sentence_id,
  23 + first_in_sentence, first_in_paragraph):
23 24 self.id = mnt_id
24 25 self.set = ''
25 26 self.old_set = ''
... ... @@ -37,7 +38,9 @@ class Mention:
37 38 self.position_in_mentions = position_in_mentions
38 39 self.start_in_words = start_in_words
39 40 self.end_in_words = end_in_words
40   - self.features = get_mention_features(self)
41 41 self.rarest = rarest
42 42 self.paragraph_id = paragraph_id
43 43 self.sentence_id = sentence_id
  44 + self.first_in_sentence = first_in_sentence
  45 + self.first_in_paragraph = first_in_paragraph
  46 + self.features = get_mention_features(self)
... ...
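Note on entities.py: the get_mention_features(self) call moves to the end of __init__ because the feature vector now depends on the new first_in_sentence attribute (see is_first_in_sentence below), so every attribute has to be assigned before the features are computed.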
corneferencer/inout/mmax.py
... ... @@ -39,7 +39,8 @@ def read_mentions(mentions_path, words_path):
39 39  
40 40 (prec_context, follow_context, sentence,
41 41 mnt_start_position, mnt_end_position,
42   - paragraph_id, sentence_id) = get_context(mention_words, words)
  42 + paragraph_id, sentence_id,
  43 + first_in_sentence, first_in_paragraph) = get_context(mention_words, words)
43 44  
44 45 head = get_head(head_orth, mention_words)
45 46 mention = Mention(mnt_id=markable.attrib['id'],
... ... @@ -59,7 +60,9 @@ def read_mentions(mentions_path, words_path):
59 60 end_in_words=mnt_end_position,
60 61 rarest=get_rarest_word(mention_words),
61 62 paragraph_id=paragraph_id,
62   - sentence_id=sentence_id)
  63 + sentence_id=sentence_id,
  64 + first_in_sentence=first_in_sentence,
  65 + first_in_paragraph=first_in_paragraph)
63 66 mentions.append(mention)
64 67  
65 68 return mentions
... ... @@ -151,10 +154,16 @@ def get_context(mention_words, words):
151 154 mnt_end_position = -1
152 155 first_word = mention_words[0]
153 156 last_word = mention_words[-1]
  157 + first_in_sentence = False
  158 + first_in_paragraph = False
154 159 for idx, word in enumerate(words):
155 160 if word['id'] == first_word['id']:
156 161 prec_context = get_prec_context(idx, words)
157 162 mnt_start_position = get_mention_start(first_word, words)
  163 + if idx == 0 or words[idx-1]['lastinsent']:
  164 + first_in_sentence = True
  165 + if idx == 0 or words[idx-1]['lastinpar']:
  166 + first_in_paragraph = True
158 167 if word['id'] == last_word['id']:
159 168 follow_context = get_follow_context(idx, words)
160 169 sentence = get_sentence(idx, words)
... ... @@ -164,7 +173,8 @@ def get_context(mention_words, words):
164 173 sentence_id += 1
165 174 if word['lastinpar']:
166 175 paragraph_id += 1
167   - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
  176 + return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
  177 + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph)
168 178  
169 179  
170 180 def get_prec_context(mention_start, words):
... ...
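Note on mmax.py: the boundary flags fall out of the existing word scan; a mention is first_in_sentence when its first word opens the document (idx == 0) or directly follows a word flagged lastinsent, and analogously for first_in_paragraph with lastinpar. A toy illustration with hypothetical word dicts, assuming the flags are already booleans at this point:

words = [{'id': 'w1', 'lastinsent': True,  'lastinpar': False},
         {'id': 'w2', 'lastinsent': False, 'lastinpar': False}]
# a mention starting at 'w1' is first_in_sentence and first_in_paragraph (idx == 0);
# one starting at 'w2' is first_in_sentence ('w1' closes a sentence) but not first_in_paragraph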
corneferencer/resolvers/constants.py
... ... @@ -3,3 +3,10 @@ RESOLVERS = ['entity_based', 'incremental']
3 3 NOUN_TAGS = ['subst', 'ger', 'depr']
4 4 PPRON_TAGS = ['ppron12', 'ppron3']
5 5 ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
  6 +SIEBIE_TAGS = ['siebie']
  7 +MASCULINE_TAGS = ['m1', 'm2', 'm3']
  8 +
  9 +FIRST_SECOND_PERSON = ['pri', 'sec']
  10 +INDICATIVE_PRONS_BASES = [u'ten', u'ta', u'to', u'ci', u'te', u'tamten', u'tamta',
  11 + u'tamto', u'tamci', u'tamte', u'ów', u'owa', u'owo', u'owi',
  12 + u'owe']
... ...
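Note on constants.py: m1, m2 and m3 are the three masculine subgenders of the NKJP tagset (personal, animate and inanimate masculine), which is why flat_gender_agreement treats any masculine-masculine pair as agreeing. INDICATIVE_PRONS_BASES, despite its name, lists the base forms of Polish demonstrative pronouns ('ten', 'tamten', 'ów' and their inflected series).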
corneferencer/resolvers/features.py
  1 +import math
1 2 import numpy
2 3 import random
  4 +import re
3 5  
4   -from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE
  6 +import conf
5 7 from corneferencer.resolvers import constants
6 8  
7 9  
... ... @@ -10,63 +12,63 @@ def head_vec(mention):
10 12 head_base = mention.head_orth
11 13 if mention.head is not None:
12 14 head_base = mention.head['base']
13   - return list(get_wv(W2V_MODEL, head_base))
  15 + return list(get_wv(conf.W2V_MODEL, head_base))
14 16  
15 17  
16 18 def first_word_vec(mention):
17   - return list(get_wv(W2V_MODEL, mention.words[0]['base']))
  19 + return list(get_wv(conf.W2V_MODEL, mention.words[0]['base']))
18 20  
19 21  
20 22 def last_word_vec(mention):
21   - return list(get_wv(W2V_MODEL, mention.words[-1]['base']))
  23 + return list(get_wv(conf.W2V_MODEL, mention.words[-1]['base']))
22 24  
23 25  
24 26 def first_after_vec(mention):
25 27 if len(mention.follow_context) > 0:
26   - vec = list(get_wv(W2V_MODEL, mention.follow_context[0]['base']))
  28 + vec = list(get_wv(conf.W2V_MODEL, mention.follow_context[0]['base']))
27 29 else:
28   - vec = [0.0] * W2V_SIZE
  30 + vec = [0.0] * conf.W2V_SIZE
29 31 return vec
30 32  
31 33  
32 34 def second_after_vec(mention):
33 35 if len(mention.follow_context) > 1:
34   - vec = list(get_wv(W2V_MODEL, mention.follow_context[1]['base']))
  36 + vec = list(get_wv(conf.W2V_MODEL, mention.follow_context[1]['base']))
35 37 else:
36   - vec = [0.0] * W2V_SIZE
  38 + vec = [0.0] * conf.W2V_SIZE
37 39 return vec
38 40  
39 41  
40 42 def first_before_vec(mention):
41 43 if len(mention.prec_context) > 0:
42   - vec = list(get_wv(W2V_MODEL, mention.prec_context[-1]['base']))
  44 + vec = list(get_wv(conf.W2V_MODEL, mention.prec_context[-1]['base']))
43 45 else:
44   - vec = [0.0] * W2V_SIZE
  46 + vec = [0.0] * conf.W2V_SIZE
45 47 return vec
46 48  
47 49  
48 50 def second_before_vec(mention):
49 51 if len(mention.prec_context) > 1:
50   - vec = list(get_wv(W2V_MODEL, mention.prec_context[-2]['base']))
  52 + vec = list(get_wv(conf.W2V_MODEL, mention.prec_context[-2]['base']))
51 53 else:
52   - vec = [0.0] * W2V_SIZE
  54 + vec = [0.0] * conf.W2V_SIZE
53 55 return vec
54 56  
55 57  
56 58 def preceding_context_vec(mention):
57   - return list(get_context_vec(mention.prec_context, W2V_MODEL))
  59 + return list(get_context_vec(mention.prec_context, conf.W2V_MODEL))
58 60  
59 61  
60 62 def following_context_vec(mention):
61   - return list(get_context_vec(mention.follow_context, W2V_MODEL))
  63 + return list(get_context_vec(mention.follow_context, conf.W2V_MODEL))
62 64  
63 65  
64 66 def mention_vec(mention):
65   - return list(get_context_vec(mention.words, W2V_MODEL))
  67 + return list(get_context_vec(mention.words, conf.W2V_MODEL))
66 68  
67 69  
68 70 def sentence_vec(mention):
69   - return list(get_context_vec(mention.sentence, W2V_MODEL))
  71 + return list(get_context_vec(mention.sentence, conf.W2V_MODEL))
70 72  
71 73  
72 74 def mention_type(mention):
... ... @@ -84,6 +86,75 @@ def mention_type(mention):
84 86 return type_vec
85 87  
86 88  
  89 +def is_first_second_person(mention):
  90 + if mention.head['person'] in constants.FIRST_SECOND_PERSON:
  91 + return 1
  92 + return 0
  93 +
  94 +
  95 +def is_demonstrative(mention):
  96 + if mention.words[0]['base'].lower() in constants.INDICATIVE_PRONS_BASES:
  97 + return 1
  98 + return 0
  99 +
  100 +
  101 +def is_demonstrative_nominal(mention):
  102 + if is_demonstrative(mention) and mention.head['ctag'] in constants.NOUN_TAGS:
  103 + return 1
  104 + return 0
  105 +
  106 +
  107 +def is_demonstrative_pronoun(mention):
  108 + if (is_demonstrative(mention) and
  109 + (mention.head['ctag'] in constants.PPRON_TAGS or mention.head['ctag'] in constants.ZERO_TAGS)):
  110 + return 1
  111 + return 0
  112 +
  113 +
  114 +def is_refl_pronoun(mention):
  115 + if mention.head['ctag'] in constants.SIEBIE_TAGS:
  116 + return 1
  117 + return 0
  118 +
  119 +
  120 +def is_first_in_sentence(mention):
  121 + if mention.first_in_sentence:
  122 + return 1
  123 + return 0
  124 +
  125 +
  126 +def is_zero_or_pronoun(mention):
  127 + if mention.head['ctag'] in constants.PPRON_TAGS or mention.head['ctag'] in constants.ZERO_TAGS:
  128 + return 1
  129 + return 0
  130 +
  131 +
  132 +def head_contains_digit(mention):
  133 + _digits = re.compile(r'\d')
  134 + if _digits.search(mention.head_orth):
  135 + return 1
  136 + return 0
  137 +
  138 +
  139 +def mention_contains_digit(mention):
  140 + _digits = re.compile(r'\d')
  141 + if _digits.search(mention.text):
  142 + return 1
  143 + return 0
  144 +
  145 +
  146 +def contains_letter(mention):
  147 + if any(c.isalpha() for c in mention.text):
  148 + return 1
  149 + return 0
  150 +
  151 +
  152 +def post_modified(mention):
  153 + if mention.head['orth'] != mention.words[-1]['orth']:
  154 + return 1
  155 + return 0
  156 +
  157 +
87 158 # pair features
88 159 def distances_vec(ante, ana):
89 160 vec = []
... ... @@ -171,12 +242,207 @@ def same_sentence(ante, ana):
171 242 return 0
172 243  
173 244  
  245 +def neighbouring_sentence(ante, ana):
  246 + if ana.sentence_id - ante.sentence_id == 1:
  247 + return 1
  248 + return 0
  249 +
  250 +
  251 +def cousin_sentence(ante, ana):
  252 + if ana.sentence_id - ante.sentence_id == 2:
  253 + return 1
  254 + return 0
  255 +
  256 +
  257 +def distant_sentence(ante, ana):
  258 + if ana.sentence_id - ante.sentence_id > 2:
  259 + return 1
  260 + return 0
  261 +
  262 +
174 263 def same_paragraph(ante, ana):
175 264 if ante.paragraph_id == ana.paragraph_id:
176 265 return 1
177 266 return 0
178 267  
179 268  
  269 +def flat_gender_agreement(ante, ana):
  270 + agr_vec = [0] * 3
  271 + if ante.head['gender'] == 'unk' or ana.head['gender'] == 'unk':
  272 + agr_vec[2] = 1
  273 + elif (ante.head['gender'] == ana.head['gender'] or
  274 + (ante.head['gender'] in constants.MASCULINE_TAGS and ana.head['gender'] in constants.MASCULINE_TAGS)):
  275 + agr_vec[0] = 1
  276 + else:
  277 + agr_vec[1] = 1
  278 + return agr_vec
  279 +
  280 +
  281 +def left_match(ante, ana):
  282 + if (ante.text.lower().startswith(ana.text.lower()) or
  283 + ana.text.lower().startswith(ante.text.lower())):
  284 + return 1
  285 + return 0
  286 +
  287 +
  288 +def right_match(ante, ana):
  289 + if (ante.text.lower().endswith(ana.text.lower()) or
  290 + ana.text.lower().endswith(ante.text.lower())):
  291 + return 1
  292 + return 0
  293 +
  294 +
  295 +def abbrev2(ante, ana):
  296 + ante_abbrev = get_abbrev(ante)
  297 + ana_abbrev = get_abbrev(ana)
  298 + if ante.head_orth == ana_abbrev or ana.head_orth == ante_abbrev:
  299 + return 1
  300 + return 0
  301 +
  302 +
  303 +def string_kernel(ante, ana):
  304 + s1 = ante.text
  305 + s2 = ana.text
  306 + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
  307 +
  308 +
  309 +def head_string_kernel(ante, ana):
  310 + s1 = ante.head_orth
  311 + s2 = ana.head_orth
  312 + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
  313 +
  314 +
  315 +def wordnet_synonyms(ante, ana):
  316 + ante_synonyms = set()
  317 + if ante.head['base'] in conf.LEMMA2SYNONYMS:
  318 + ante_synonyms = conf.LEMMA2SYNONYMS[ante.head['base']]
  319 +
  320 + ana_synonyms = set()
  321 + if ana.head['base'] in conf.LEMMA2SYNONYMS:
  322 + ana_synonyms = conf.LEMMA2SYNONYMS[ana.head['base']]
  323 +
  324 + if ana.head['base'] in ante_synonyms or ante.head['base'] in ana_synonyms:
  325 + return 1
  326 + return 0
  327 +
  328 +
  329 +def wordnet_ana_is_hypernym(ante, ana):
  330 + ante_hypernyms = set()
  331 + if ante.head['base'] in conf.LEMMA2HYPERNYMS:
  332 + ante_hypernyms = conf.LEMMA2HYPERNYMS[ante.head['base']]
  333 +
  334 + ana_hypernyms = set()
  335 + if ana.head['base'] in conf.LEMMA2HYPERNYMS:
  336 + ana_hypernyms = conf.LEMMA2HYPERNYMS[ana.head['base']]
  337 +
  338 + if not ante_hypernyms or not ana_hypernyms:
  339 + return 0
  340 +
  341 + if ana.head['base'] in ante_hypernyms:
  342 + return 1
  343 + return 0
  344 +
  345 +
  346 +def wordnet_ante_is_hypernym(ante, ana):
  347 + ana_hypernyms = set()
  348 + if ana.head['base'] in conf.LEMMA2HYPERNYMS:
  349 + ana_hypernyms = conf.LEMMA2HYPERNYMS[ana.head['base']]
  350 +
  351 + ante_hypernyms = set()
  352 + if ante.head['base'] in conf.LEMMA2HYPERNYMS:
  353 + ante_hypernyms = conf.LEMMA2HYPERNYMS[ante.head['base']]
  354 +
  355 + if not ante_hypernyms or not ana_hypernyms:
  356 + return 0
  357 +
  358 + if ante.head['base'] in ana_hypernyms:
  359 + return 1
  360 + return 0
  361 +
  362 +
  363 +def wikipedia_link(ante, ana):
  364 + ante_base = ante.lemmatized_text.lower()
  365 + ana_base = ana.lemmatized_text.lower()
  366 + if ante_base == ana_base:
  367 + return 1
  368 +
  369 + ante_links = set()
  370 + if ante_base in conf.TITLE2LINKS:
  371 + ante_links = conf.TITLE2LINKS[ante_base]
  372 +
  373 + ana_links = set()
  374 + if ana_base in conf.TITLE2LINKS:
  375 + ana_links = conf.TITLE2LINKS[ana_base]
  376 +
  377 + if ana_base in ante_links or ante_base in ana_links:
  378 + return 1
  379 +
  380 + return 0
  381 +
  382 +
  383 +def wikipedia_mutual_link(ante, ana):
  384 + ante_base = ante.lemmatized_text.lower()
  385 + ana_base = ana.lemmatized_text.lower()
  386 + if ante_base == ana_base:
  387 + return 1
  388 +
  389 + ante_links = set()
  390 + if ante_base in conf.TITLE2LINKS:
  391 + ante_links = conf.TITLE2LINKS[ante_base]
  392 +
  393 + ana_links = set()
  394 + if ana_base in conf.TITLE2LINKS:
  395 + ana_links = conf.TITLE2LINKS[ana_base]
  396 +
  397 + if ana_base in ante_links and ante_base in ana_links:
  398 + return 1
  399 +
  400 + return 0
  401 +
  402 +
  403 +def wikipedia_redirect(ante, ana):
  404 + ante_base = ante.lemmatized_text.lower()
  405 + ana_base = ana.lemmatized_text.lower()
  406 + if ante_base == ana_base:
  407 + return 1
  408 +
  409 + if ante_base in conf.TITLE2REDIRECT and conf.TITLE2REDIRECT[ante_base] == ana_base:
  410 + return 1
  411 +
  412 + if ana_base in conf.TITLE2REDIRECT and conf.TITLE2REDIRECT[ana_base] == ante_base:
  413 + return 1
  414 +
  415 + return 0
  416 +
  417 +
  418 +def samesent_anapron_antefirstinpar(ante, ana):
  419 + if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante.first_in_paragraph:
  420 + return 1
  421 + return 0
  422 +
  423 +
  424 +def samesent_antefirstinpar_personnumbermatch(ante, ana):
  425 + if (same_sentence(ante, ana) and ante.first_in_paragraph
  426 + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
  427 + return 1
  428 + return 0
  429 +
  430 +
  431 +def adjsent_anapron_adjmen_personnumbermatch(ante, ana):
  432 + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
  433 + and ana.position_in_mentions - ante.position_in_mentions == 1
  434 + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
  435 + return 1
  436 + return 0
  437 +
  438 +
  439 +def adjsent_anapron_adjmen(ante, ana):
  440 + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
  441 + and ana.position_in_mentions - ante.position_in_mentions == 1):
  442 + return 1
  443 + return 0
  444 +
  445 +
180 446 # supporting functions
181 447 def get_wv(model, lemma, use_random_vec=True):
182 448 vec = None
... ... @@ -192,15 +458,15 @@ def get_wv(model, lemma, use_random_vec=True):
192 458  
193 459  
194 460 def random_vec():
195   - return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, W2V_SIZE)], dtype=numpy.float32)
  461 + return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, conf.W2V_SIZE)], dtype=numpy.float32)
196 462  
197 463  
198 464 def get_context_vec(words, model):
199   - vec = numpy.zeros(W2V_SIZE, dtype=numpy.float32)
  465 + vec = numpy.zeros(conf.W2V_SIZE, dtype=numpy.float32)
200 466 unknown_count = 0
201 467 if len(words) != 0:
202 468 for word in words:
203   - word_vec = get_wv(model, word['base'], RANDOM_WORD_VECTORS)
  469 + word_vec = get_wv(model, word['base'], conf.RANDOM_WORD_VECTORS)
204 470 if word_vec is None:
205 471 unknown_count += 1
206 472 else:
... ... @@ -239,3 +505,65 @@ def check_one_way_acronym(acronym, expression):
239 505 if acronym == initials:
240 506 return 1
241 507 return 0
  508 +
  509 +
  510 +def get_abbrev(mention):
  511 + abbrev = u''
  512 + for word in mention.words:
  513 + if word['orth'][0].isupper():
  514 + abbrev += word['orth'][0]
  515 + return abbrev
  516 +
  517 +
  518 +def SK(s1, s2):
  519 + LAMBDA = 0.4
  520 +
  521 + p = len(s1)
  522 + if len(s2) < len(s1):
  523 + p = len(s2)
  524 +
  525 + h, w = len(s1)+1, len(s2)+1
  526 + DPS = [[0.0] * w for i in range(h)]
  527 + DP = [[0.0] * w for i in range(h)]
  528 +
  529 + kernel_mat = [0.0] * (len(s1) + 1)
  530 +
  531 + for i in range(len(s1)+1):
  532 + if i == 0:
  533 + continue
  534 + for j in range(len(s2)+1):
  535 + if j == 0:
  536 + continue
  537 + if s1[i-1] == s2[j-1]:
  538 + DPS[i][j] = LAMBDA * LAMBDA
  539 + kernel_mat[0] += DPS[i][j]
  540 + else:
  541 + DPS[i][j] = 0.0
  542 +
  543 + for l in range(p):
  544 + if l == 0:
  545 + continue
  546 +
  547 + kernel_mat[l] = 0.0
  548 + for j in range(len(s2)+1):
  549 + DP[l-1][j] = 0.0
  550 +
  551 + for i in range(len(s1)+1):
  552 + DP[i][l-1] = 0.0
  553 +
  554 + for i in range(len(s1)+1):
  555 + if i < l:
  556 + continue
  557 + for j in range(len(s2)+1):
  558 + if j < l:
  559 + continue
  560 + DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1]
  561 +
  562 + if s1[i-1] == s2[j-1]:
  563 + DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1]
  564 + kernel_mat[l] += DPS[i][j]
  565 +
  566 + K = 0.0
  567 + for l in range(p):
  568 + K += kernel_mat[l]
  569 + return K
... ...
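Two notes on features.py. First, unlike head_vec, the new predicates index mention.head directly and so assume the head is never None. Second, SK is a fixed-decay subsequence string kernel (LAMBDA = 0.4), and string_kernel/head_string_kernel return its cosine-style normalization; a minimal sanity check with hypothetical mention texts (the normalization divides by zero on empty strings, so non-empty texts are assumed):

import math
from corneferencer.resolvers.features import SK

s1, s2 = u'Warszawa', u'Warszawie'   # hypothetical surface strings
similarity = SK(s1, s2) / math.sqrt(SK(s1, s1) * SK(s2, s2))
# identical strings give 1.0; strings sharing no characters give 0.0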
corneferencer/resolvers/vectors.py
... ... @@ -27,6 +27,19 @@ def get_mention_features(mention):
27 27 # supplementary features
28 28 vec.extend(features.mention_type(mention))
29 29  
  30 + # supplementary features 2
  31 + vec.append(features.is_first_second_person(mention))
  32 + vec.append(features.is_demonstrative(mention))
  33 + vec.append(features.is_demonstrative_nominal(mention))
  34 + vec.append(features.is_demonstrative_pronoun(mention))
  35 + vec.append(features.is_refl_pronoun(mention))
  36 + vec.append(features.is_first_in_sentence(mention))
  37 + vec.append(features.is_zero_or_pronoun(mention))
  38 + vec.append(features.head_contains_digit(mention))
  39 + vec.append(features.mention_contains_digit(mention))
  40 + vec.append(features.contains_letter(mention))
  41 + vec.append(features.post_modified(mention))
  42 +
30 43 return vec
31 44  
32 45  
... ... @@ -46,4 +59,30 @@ def get_pair_features(ante, ana):
46 59 vec.append(features.same_sentence(ante, ana))
47 60 vec.append(features.same_paragraph(ante, ana))
48 61  
  62 + # supplementary features 2
  63 + vec.append(features.neighbouring_sentence(ante, ana))
  64 + vec.append(features.cousin_sentence(ante, ana))
  65 + vec.append(features.distant_sentence(ante, ana))
  66 + vec.extend(features.flat_gender_agreement(ante, ana))
  67 + vec.append(features.left_match(ante, ana))
  68 + vec.append(features.right_match(ante, ana))
  69 + vec.append(features.abbrev2(ante, ana))
  70 +
  71 + vec.append(features.string_kernel(ante, ana))
  72 + vec.append(features.head_string_kernel(ante, ana))
  73 +
  74 + vec.append(features.wordnet_synonyms(ante, ana))
  75 + vec.append(features.wordnet_ana_is_hypernym(ante, ana))
  76 + vec.append(features.wordnet_ante_is_hypernym(ante, ana))
  77 +
  78 + vec.append(features.wikipedia_link(ante, ana))
  79 + vec.append(features.wikipedia_mutual_link(ante, ana))
  80 + vec.append(features.wikipedia_redirect(ante, ana))
  81 +
  82 + # combined
  83 + vec.append(features.samesent_anapron_antefirstinpar(ante, ana))
  84 + vec.append(features.samesent_antefirstinpar_personnumbermatch(ante, ana))
  85 + vec.append(features.adjsent_anapron_adjmen_personnumbermatch(ante, ana))
  86 + vec.append(features.adjsent_anapron_adjmen(ante, ana))
  87 +
49 88 return vec
... ...
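Note on vectors.py: the trained network identifies features purely by position, so the append/extend order here must match the order used when model_1190_features.h5 was trained; reordering or dropping a single line silently misaligns every subsequent input dimension.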
corneferencer/utils.py
... ... @@ -3,6 +3,8 @@ from __future__ import print_function
3 3 import codecs
4 4 import sys
5 5  
  6 +import javaobj
  7 +
6 8 from keras.models import Model
7 9 from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
8 10  
... ... @@ -46,3 +48,29 @@ def load_freq_list(freq_path):
46 48 if base not in freq_list:
47 49 freq_list[base] = freq
48 50 return freq_list
  51 +
  52 +
  53 +def load_one2many_map(map_path):
  54 + this_map = {}
  55 + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
  56 + pobj = marshaller.readObject()
  57 + jmap_annotations = pobj.__dict__['annotations']
  58 + jmap_annotations_count = len(jmap_annotations)
  59 + for i in range(jmap_annotations_count):
  60 + if i%2 == 1:
  61 + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations'])
  62 + this_map[jmap_annotations[i]] = mapped_elements
  63 + return this_map
  64 +
  65 +
  66 +def load_one2one_map(map_path):
  67 + this_map = {}
  68 + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
  69 + pobj = marshaller.readObject()
  70 + jmap_annotations = pobj.__dict__['annotations']
  71 + jmap_annotations_count = len(jmap_annotations)
  72 + for i in range(jmap_annotations_count):
  73 + if i%2 == 1:
  74 + element = jmap_annotations[i+1]
  75 + this_map[jmap_annotations[i]] = element
  76 + return this_map
... ...
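Note on utils.py: the two new loaders deserialize Java-serialized HashMaps via the javaobj library, assuming the unmarshalled object's annotations list interleaves keys and values (keys at odd indices, each value immediately after). Both also leave the file handle to the garbage collector; a with-block variant of the one-to-one loader, as a sketch under the same assumptions:

import javaobj

def load_one2one_map(map_path):
    this_map = {}
    # close the handle deterministically instead of leaking it
    with open(map_path, 'rb') as map_file:
        pobj = javaobj.JavaObjectUnmarshaller(map_file).readObject()
    annotations = pobj.__dict__['annotations']
    for i in range(len(annotations)):
        if i % 2 == 1:  # keys at odd indices, values directly after
            this_map[annotations[i]] = annotations[i + 1]
    return this_map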