Commit 7d27f9e9ae10935e1a182e7860943dcbb416e473

Authored by Bartłomiej Nitoń
1 parent 04c45e2d

Added Bartek-3 features to preparator script.

data/wikipedia/link.map 0 → 100755
No preview for this file type
data/wikipedia/redirect.map 0 → 100755
No preview for this file type
data/wordnet/lemma2hypernyms.map 0 → 100755
No preview for this file type
data/wordnet/lemma2synonyms.map 0 → 100755
No preview for this file type
for_investigation.ipynb
... ... @@ -114,7 +114,7 @@
114 114 },
115 115 "outputs": [],
116 116 "source": [
117   - " predictions = model.predict(test_set)"
  117 + "predictions = model.predict(test_set)"
118 118 ]
119 119 },
120 120 {
... ... @@ -141,7 +141,7 @@
141 141 }
142 142 ],
143 143 "source": [
144   - " true_positives = 0.0\n",
  144 + "true_positives = 0.0\n",
145 145 " false_positives = 0.0\n",
146 146 " true_negatives = 0.0\n",
147 147 " false_negatives = 0.0\n",
... ... @@ -173,7 +173,7 @@
173 173 "language_info": {
174 174 "codemirror_mode": {
175 175 "name": "ipython",
176   - "version": 2
  176 + "version": 2.0
177 177 },
178 178 "file_extension": ".py",
179 179 "mimetype": "text/x-python",
... ... @@ -184,5 +184,5 @@
184 184 }
185 185 },
186 186 "nbformat": 4,
187   - "nbformat_minor": 2
188   -}
  187 + "nbformat_minor": 0
  188 +}
189 189 \ No newline at end of file
... ...
mention-pair-classifier.ipynb
... ... @@ -78,7 +78,7 @@
78 78 "number_of_features = 1126\n",
79 79 "\n",
80 80 "X = data[:,0:1126]\n",
81   - "Y = data[:,1126] #last column consists of labels\n"
  81 + "Y = data[:,1126] #last column consists of labels"
82 82 ]
83 83 },
84 84 {
... ... @@ -270,7 +270,7 @@
270 270 "language_info": {
271 271 "codemirror_mode": {
272 272 "name": "ipython",
273   - "version": 2
  273 + "version": 2.0
274 274 },
275 275 "file_extension": ".py",
276 276 "mimetype": "text/x-python",
... ... @@ -281,5 +281,5 @@
281 281 }
282 282 },
283 283 "nbformat": 4,
284   - "nbformat_minor": 2
285   -}
  284 + "nbformat_minor": 0
  285 +}
286 286 \ No newline at end of file
... ...
preparator.py
1 1 # -*- coding: utf-8 -*-
2 2  
3 3 import codecs
  4 +import math
4 5 import numpy
5 6 import os
6 7 import random
  8 +import re
  9 +
  10 +import javaobj
7 11  
8 12 from lxml import etree
9 13 from itertools import combinations
... ... @@ -12,25 +16,39 @@ from natsort import natsorted
12 16 from gensim.models.word2vec import Word2Vec
13 17  
14 18  
15   -TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared'))
16   -TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared'))
17   -FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst'))
  19 +MAIN_PATH = os.path.dirname(__file__)
  20 +TEST_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'test-prepared'))
  21 +TRAIN_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'train-prepared'))
  22 +FREQ_300M_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'freq', 'base.lst'))
  23 +
  24 +LEMMA2SYNONYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2synonyms.map'))
  25 +LEMMA2HYPERNYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2hypernyms.map'))
  26 +
  27 +TITLE2LINKS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'link.map'))
  28 +TITLE2REDIRECT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'redirect.map'))
18 29  
19 30 ANNO_PATH = TEST_PATH
20   -OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
21   - 'test-20170627.csv'))
  31 +OUT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data',
  32 + 'test-20170720.csv'))
22 33 EACH_TEXT_SEPARATELLY = False
23 34  
24 35 CONTEXT = 5
25 36 W2V_SIZE = 50
26   -MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models',
  37 +MODEL = os.path.abspath(os.path.join(MAIN_PATH, 'models',
27 38 '%d' % W2V_SIZE,
28 39 'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE))
29 40  
  41 +FIRST_SECOND_PERSON = ['pri', 'sec']
  42 +INDICATIVE_PRONS_BASES = ["ten", "ta", "to", "ci", "te", "tamten", "tamta",
  43 + "tamto", "tamci", "tamte", "ów", "owa", "owo", "owi", "owe"]
  44 +SIEBIE_TAGS = ['siebie']
  45 +MASCULINE_TAGS = ['m1', 'm2', 'm3']
  46 +
30 47 NOUN_TAGS = ['subst', 'ger', 'depr']
31 48 PPRON_TAGS = ['ppron12', 'ppron3']
32 49 ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
33 50 POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-']
  51 +HYPHEN_SIGNS = ['-', '#']
34 52  
35 53 NEG_PROPORTION = 1
36 54 RANDOM_VECTORS = True
... ... @@ -45,13 +63,18 @@ UNKNONW_WORDS = 0
45 63 def main():
46 64 model = Word2Vec.load(MODEL)
47 65 freq_list = load_freq_list(FREQ_300M_PATH)
  66 + lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH)
  67 + lemma2hypernyms = load_one2many_map(LEMMA2HYPERNYMS_PATH)
  68 + title2links = load_one2many_map(TITLE2LINKS_PATH)
  69 + title2redirect = load_one2one_map(TITLE2REDIRECT_PATH)
48 70 try:
49   - create_data_vectors(model, freq_list)
  71 + create_data_vectors(model, freq_list, lemma2synonyms,
  72 + lemma2hypernyms, title2links, title2redirect)
50 73 finally:
51   - print 'Unknown words: ', UNKNONW_WORDS
52   - print 'All words: ', ALL_WORDS
53   - print 'Positives: ', POS_COUNT
54   - print 'Negatives: ', NEG_COUNT
  74 + print ('Unknown words: ', UNKNONW_WORDS)
  75 + print ('All words: ', ALL_WORDS)
  76 + print ('Positives: ', POS_COUNT)
  77 + print ('Negatives: ', NEG_COUNT)
55 78  
56 79  
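main() now also loads the WordNet and Wikipedia maps before building the data vectors. Assuming the usual entry-point guard (it lies outside the hunks shown here), the script is still run directly:

    if __name__ == '__main__':
        main()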
57 80 def load_freq_list(freq_path):
... ... @@ -67,16 +90,43 @@ def load_freq_list(freq_path):
67 90 return freq_list
68 91  
69 92  
70   -def create_data_vectors(model, freq_list):
  93 +def load_one2many_map(map_path):
  94 + this_map = {}
  95 + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
  96 + pobj = marshaller.readObject()
  97 + jmap_annotations = pobj.__dict__['annotations']
  98 + jmap_annotations_count = len(jmap_annotations)
  99 + for i in range(jmap_annotations_count):
  100 + if i%2 == 1:
  101 + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations'])
  102 + this_map[jmap_annotations[i]] = mapped_elements
  103 + return this_map
  104 +
  105 +
  106 +def load_one2one_map(map_path):
  107 + this_map = {}
  108 + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
  109 + pobj = marshaller.readObject()
  110 + jmap_annotations = pobj.__dict__['annotations']
  111 + jmap_annotations_count = len(jmap_annotations)
  112 + for i in range(jmap_annotations_count):
  113 + if i%2 == 1:
  114 + element = jmap_annotations[i+1]
  115 + this_map[jmap_annotations[i]] = element
  116 + return this_map
  117 +
  118 +
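Both loaders assume that javaobj exposes the entries of a serialized Java map through the object's 'annotations' list, with keys at odd indices immediately followed by their values, which is why only every second index is taken as a key. A small, purely illustrative sanity check (the lemma is hypothetical):

    lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH)
    print(len(lemma2synonyms))                 # number of lemmas in the map
    print(lemma2synonyms.get(u'dom', set()))   # synonyms of a hypothetical lemma, empty set if absent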
  119 +def create_data_vectors(model, freq_list, lemma2synonyms,
  120 + lemma2hypernyms, title2links, title2redirect):
71 121 features_file = None
72 122 if not EACH_TEXT_SEPARATELLY:
73   - features_file = codecs.open(OUT_PATH, 'wt', 'utf-8')
  123 + features_file = codecs.open(OUT_PATH, 'w', 'utf-8')
74 124  
75 125 anno_files = os.listdir(ANNO_PATH)
76 126 anno_files = natsorted(anno_files)
77 127 for filename in anno_files:
78 128 if filename.endswith('.mmax'):
79   - print '=======> ', filename
  129 + print ('=======> ', filename)
80 130 textname = filename.replace('.mmax', '')
81 131  
82 132 mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
... ... @@ -85,19 +135,18 @@ def create_data_vectors(model, freq_list):
85 135 positives, negatives = diff_mentions(mentions)
86 136  
87 137 if DEBUG:
88   - print 'Positives:'
89   - print len(positives)
90   -
91   - print 'Negatives:'
92   - print len(negatives)
  138 + print ('Positives:', len(positives))
  139 + print ('Negatives:', len(negatives))
93 140  
94 141 words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
95 142 mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list)
96 143  
97 144 if EACH_TEXT_SEPARATELLY:
98 145 text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname)
99   - features_file = codecs.open(text_features_path, 'wt', 'utf-8')
100   - write_features(features_file, positives, negatives, mentions_dict, model, textname)
  146 + features_file = codecs.open(text_features_path, 'w', 'utf-8')
  147 + write_features(features_file, positives, negatives, mentions_dict,
  148 + model, textname, lemma2synonyms,
  149 + lemma2hypernyms, title2links, title2redirect)
101 150  
102 151 if not EACH_TEXT_SEPARATELLY:
103 152 features_file.close()
... ... @@ -108,7 +157,7 @@ def diff_mentions(mentions):
108 157 positives = get_positives(sets)
109 158 positives, negatives = get_negatives_and_update_positives(clustered_mensions, positives)
110 159 if len(negatives) != len(positives) and NEG_PROPORTION == 1:
111   - print u'Niezgodna liczba przypadków pozytywnych i negatywnych!'
  160 + print (u'Niezgodna liczba przypadków pozytywnych i negatywnych!')
112 161 return positives, negatives
113 162  
114 163  
... ... @@ -126,18 +175,18 @@ def get_sets(mentions):
126 175 sets[set_id].append(mention.attrib['span'])
127 176 clustered_mensions.append(mention.attrib['span'])
128 177 else:
129   - print u'Coś poszło nie tak przy wyszukiwaniu klastrów!'
  178 + print (u'Coś poszło nie tak przy wyszukiwaniu klastrów!')
130 179  
131 180 sets_to_remove = []
132 181 for set_id in sets:
133 182 if len(sets[set_id]) < 2:
134 183 sets_to_remove.append(set_id)
135 184 if len(sets[set_id]) == 1:
136   - print u'Removing clustered mention: ', sets[set_id][0]
  185 + print (u'Removing clustered mention: ', sets[set_id][0])
137 186 clustered_mensions.remove(sets[set_id][0])
138 187  
139 188 for set_id in sets_to_remove:
140   - print u'Removing set: ', set_id
  189 + print (u'Removing set: ', set_id)
141 190 sets.pop(set_id)
142 191  
143 192 return sets, clustered_mensions
... ... @@ -160,21 +209,24 @@ def get_negatives_and_update_positives(clustered_mensions, positives):
160 209 samples_count = len(negatives)
161 210 if NEG_PROPORTION == 1:
162 211 positives = random.sample(set(positives), samples_count)
163   - print u'Więcej przypadków pozytywnych niż negatywnych!'
  212 + print (u'Więcej przypadków pozytywnych niż negatywnych!')
164 213 negatives = random.sample(set(negatives), samples_count)
165 214 return positives, negatives
166 215  
167 216  
168   -def write_features(features_file, positives, negatives, mentions_dict, model, textname):
  217 +def write_features(features_file, positives, negatives, mentions_dict,
  218 + model, textname, lemma2synonyms,
  219 + lemma2hypernyms, title2links, title2redirect):
169 220 global POS_COUNT
170 221 POS_COUNT += len(positives)
171 222 for pair in positives:
172 223 pair_features = []
173 224 if DEBUG:
174 225 pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
175   - pair_features.extend(get_features(pair, mentions_dict, model))
  226 + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
  227 + lemma2hypernyms, title2links, title2redirect))
176 228 pair_features.append(1)
177   - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
  229 + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
178 230  
179 231 global NEG_COUNT
180 232 NEG_COUNT += len(negatives)
... ... @@ -182,12 +234,14 @@ def write_features(features_file, positives, negatives, mentions_dict, model, te
182 234 pair_features = []
183 235 if DEBUG:
184 236 pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
185   - pair_features.extend(get_features(pair, mentions_dict, model))
  237 + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
  238 + lemma2hypernyms, title2links, title2redirect))
186 239 pair_features.append(0)
187   - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
  240 + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
188 241  
189 242  
190   -def get_features(pair, mentions_dict, model):
  243 +def get_features(pair, mentions_dict, model, lemma2synonyms,
  244 + lemma2hypernyms, title2links, title2redirect):
191 245 features = []
192 246 ante = pair[0]
193 247 ana = pair[1]
... ... @@ -195,7 +249,8 @@ def get_features(pair, mentions_dict, model):
195 249 features.extend(ante_features)
196 250 ana_features = get_mention_features(ana, mentions_dict, model)
197 251 features.extend(ana_features)
198   - pair_features = get_pair_features(pair, mentions_dict)
  252 + pair_features = get_pair_features(pair, mentions_dict, lemma2synonyms,
  253 + lemma2hypernyms, title2links, title2redirect)
199 254 features.extend(pair_features)
200 255 return features
201 256  
... ... @@ -280,6 +335,19 @@ def get_mention_features(mention_span, mentions_dict, model):
280 335 # cechy uzupelniajace
281 336 features.extend(mention_type(mention))
282 337  
  338 + # cechy uzupelniajace 2
  339 + features.append(is_first_second_person(mention))
  340 + features.append(is_demonstrative(mention))
  341 + features.append(is_demonstrative_nominal(mention))
  342 + features.append(is_demonstrative_pronoun(mention))
  343 + features.append(is_refl_pronoun(mention))
  344 + features.append(is_first_in_sentence(mention))
  345 + features.append(is_zero_or_pronoun(mention))
  346 + features.append(contains_digit(mention, 'head_orth'))
  347 + features.append(contains_digit(mention, 'text'))
  348 + features.append(contains_letter(mention))
  349 + features.append(post_modified(mention))
  350 +
283 351 return features
284 352  
285 353  
... ... @@ -296,6 +364,68 @@ def mention_type(mention):
296 364 return type_vec
297 365  
298 366  
  367 +def is_first_second_person(mention):
  368 + if mention['head']['person'] in FIRST_SECOND_PERSON:
  369 + return 1
  370 + return 0
  371 +
  372 +
  373 +def is_demonstrative(mention):
  374 + if mention['words'][0]['base'].lower() in INDICATIVE_PRONS_BASES:
  375 + return 1
  376 + return 0
  377 +
  378 +
  379 +def is_demonstrative_nominal(mention):
  380 + if is_demonstrative(mention) and mention['head']['ctag'] in NOUN_TAGS:
  381 + return 1
  382 + return 0
  383 +
  384 +
  385 +def is_demonstrative_pronoun(mention):
  386 + if (is_demonstrative(mention) and
  387 + (mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS)):
  388 + return 1
  389 + return 0
  390 +
  391 +
  392 +def is_refl_pronoun(mention):
  393 + if mention['head']['ctag'] in SIEBIE_TAGS:
  394 + return 1
  395 + return 0
  396 +
  397 +
  398 +def is_first_in_sentence(mention):
  399 + if mention['first_in_sentence']:
  400 + return 1
  401 + return 0
  402 +
  403 +
  404 +def is_zero_or_pronoun(mention):
  405 + if mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS:
  406 + return 1
  407 + return 0
  408 +
  409 +
  410 +def contains_digit(mention, attr_name):
  411 + _digits = re.compile('\d')
  412 + if _digits.search(mention[attr_name]):
  413 + return 1
  414 + return 0
  415 +
  416 +
  417 +def contains_letter(mention):
  418 + if any(c.isalpha() for c in mention['text']):
  419 + return 1
  420 + return 0
  421 +
  422 +
  423 +def post_modified(mention):
  424 + if mention['head']['orth'] != mention['words'][-1]['orth']:
  425 + return 1
  426 + return 0
  427 +
  428 +
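The new mention-level features are all binary (0/1) and read only a handful of fields of the mention dictionary built by markables_level_2_dict. A toy, purely illustrative mention (field values are made up):

    mention = {'head': {'person': 'pri', 'ctag': 'ppron12', 'orth': u'ja', 'base': u'ja'},
               'words': [{'orth': u'ja', 'base': u'ja'}],
               'text': u'ja', 'head_orth': u'ja', 'first_in_sentence': True}
    print(is_first_second_person(mention))   # 1: 'pri' is a first-person tag
    print(is_zero_or_pronoun(mention))       # 1: 'ppron12' is a pronoun tag
    print(is_demonstrative(mention))         # 0
    print(contains_digit(mention, 'text'))   # 0
    print(post_modified(mention))            # 0: the head is the last (and only) word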
299 429 def get_wv(model, lemma, random=True):
300 430 global ALL_WORDS
301 431 global UNKNONW_WORDS
... ... @@ -332,7 +462,8 @@ def get_context_vec(words, model):
332 462 return vec
333 463  
334 464  
335   -def get_pair_features(pair, mentions_dict):
  465 +def get_pair_features(pair, mentions_dict, lemma2synonyms,
  466 + lemma2hypernyms, title2links, title2redirect):
336 467 ante = get_mention_by_attr(mentions_dict, 'span', pair[0])
337 468 ana = get_mention_by_attr(mentions_dict, 'span', pair[1])
338 469  
... ... @@ -375,6 +506,32 @@ def get_pair_features(pair, mentions_dict):
375 506 features.append(same_sentence(ante, ana))
376 507 features.append(same_paragraph(ante, ana))
377 508  
  509 + # cechy uzupelniajace 2
  510 + features.append(neighbouring_sentence(ante, ana))
  511 + features.append(cousin_sentence(ante, ana))
  512 + features.append(distant_sentence(ante, ana))
  513 + features.append(flat_gender_agreement(ante, ana))
  514 + features.append(left_match(ante, ana))
  515 + features.append(right_match(ante, ana))
  516 + features.append(abbrev2(ante, ana))
  517 +
  518 + features.append(string_kernel(ante, ana))
  519 + features.append(head_string_kernel(ante, ana))
  520 +
  521 + features.append(wordnet_synonyms(ante, ana, lemma2synonyms))
  522 + features.append(wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms))
  523 + features.append(wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms))
  524 +
  525 + features.append(wikipedia_link(ante, ana, title2links))
  526 + features.append(wikipedia_mutual_link(ante, ana, title2links))
  527 + features.append(wikipedia_redirect(ante, ana, title2redirect))
  528 +
  529 + # combined
  530 + features.append(samesent_anapron_antefirstinpar(ante, ana))
  531 + features.append(samesent_antefirstinpar_personnumbermatch(ante, ana))
  532 + features.append(adjsent_anapron_adjmen_personnumbermatch(ante, ana))
  533 + features.append(adjsent_anapron_adjmen(ante, ana))
  534 +
378 535 return features
379 536  
380 537  
... ... @@ -392,7 +549,7 @@ def get_distance_bucket(distance):
392 549 elif distance >= 64:
393 550 return 9
394 551 else:
395   - print u'Coś poszło nie tak przy kubełkowaniu!!'
  552 + print (u'Coś poszło nie tak przy kubełkowaniu!!')
396 553 return 10
397 554  
398 555  
... ... @@ -445,8 +602,8 @@ def is_acronym(ante, ana):
445 602 if ana['text'].upper() == ana['text']:
446 603 return check_one_way_acronym(ana['text'], ante['text'])
447 604 if ante['text'].upper() == ante['text']:
448   - return check_one_way_acronym(ante['text'], ana['text']);
449   - return 0;
  605 + return check_one_way_acronym(ante['text'], ana['text'])
  606 + return 0
450 607  
451 608  
452 609 def check_one_way_acronym(acronym, expression):
... ... @@ -455,10 +612,10 @@ def check_one_way_acronym(acronym, expression):
455 612 for expr2 in expr1.split():
456 613 expr2 = expr2.strip()
457 614 if expr2:
458   - initials += unicode(expr2[0]).upper()
  615 + initials += str(expr2[0]).upper()
459 616 if acronym == initials:
460   - return 1;
461   - return 0;
  617 + return 1
  618 + return 0
462 619  
463 620  
464 621 def same_sentence(ante, ana):
... ... @@ -467,12 +624,290 @@ def same_sentence(ante, ana):
467 624 return 0
468 625  
469 626  
  627 +def neighbouring_sentence(ante, ana):
  628 + if ana['sentence_id'] - ante['sentence_id'] == 1:
  629 + return 1
  630 + return 0
  631 +
  632 +
  633 +def cousin_sentence(ante, ana):
  634 + if ana['sentence_id'] - ante['sentence_id'] == 2:
  635 + return 1
  636 + return 0
  637 +
  638 +
  639 +def distant_sentence(ante, ana):
  640 + if ana['sentence_id'] - ante['sentence_id'] > 2:
  641 + return 1
  642 + return 0
  643 +
  644 +
470 645 def same_paragraph(ante, ana):
471 646 if ante['paragraph_id'] == ana['paragraph_id']:
472 647 return 1
473 648 return 0
474 649  
475 650  
  651 +def flat_gender_agreement(ante, ana):
  652 + agr_vec = [0] * 3
  653 + if ante['head']['gender'] == 'unk' or ana['head']['gender'] == 'unk':
  654 + agr_vec[2] = 1
  655 + elif (ante['head']['gender'] == ana['head']['gender'] or
  656 + (ante['head']['gender'] in MASCULINE_TAGS and ana['head']['gender'] in MASCULINE_TAGS)):
  657 + agr_vec[0] = 1
  658 + else:
  659 + agr_vec[1] = 1
  660 + return agr_vec
  661 +
  662 +
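flat_gender_agreement yields a one-hot vector [agree, disagree, unknown] and collapses the masculine sub-genders m1/m2/m3 into one class; illustrative toy dicts with only the field the function reads:

    print(flat_gender_agreement({'head': {'gender': 'm1'}}, {'head': {'gender': 'm3'}}))    # [1, 0, 0]
    print(flat_gender_agreement({'head': {'gender': 'f'}}, {'head': {'gender': 'n'}}))      # [0, 1, 0]
    print(flat_gender_agreement({'head': {'gender': 'unk'}}, {'head': {'gender': 'f'}}))    # [0, 0, 1]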
  663 +def string_kernel(ante, ana):
  664 + s1 = ante['text']
  665 + s2 = ana['text']
  666 + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
  667 +
  668 +
  669 +def head_string_kernel(ante, ana):
  670 + s1 = ante['head_orth']
  671 + s2 = ana['head_orth']
  672 + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
  673 +
  674 +
  675 +def SK(s1, s2):
  676 + LAMBDA = 0.4
  677 +
  678 + p = len(s1)
  679 + if len(s2) < len(s1):
  680 + p = len(s2)
  681 +
  682 + h, w = len(s1)+1, len(s2)+1
  683 + DPS = [[0.0] * w for i in range(h)]
  684 + DP = [[0.0] * w for i in range(h)]
  685 +
  686 + kernel_mat = [0.0] * (len(s1) + 1)
  687 +
  688 + for i in range(len(s1)+1):
  689 + if i == 0:
  690 + continue
  691 + for j in range(len(s2)+1):
  692 + if j == 0:
  693 + continue
  694 + if s1[i-1] == s2[j-1]:
  695 + DPS[i][j] = LAMBDA * LAMBDA
  696 + kernel_mat[0] += DPS[i][j]
  697 + else:
  698 + DPS[i][j] = 0.0
  699 +
  700 + for l in range(p):
  701 + if l == 0:
  702 + continue
  703 +
  704 + kernel_mat[l] = 0.0
  705 + for j in range(len(s2)+1):
  706 + DP[l-1][j] = 0.0
  707 +
  708 + for i in range(len(s1)+1):
  709 + DP[i][l-1] = 0.0
  710 +
  711 + for i in range(len(s1)+1):
  712 + if i < l:
  713 + continue
  714 + for j in range(len(s2)+1):
  715 + if j < l:
  716 + continue
  717 + DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1]
  718 +
  719 + if s1[i-1] == s2[j-1]:
  720 + DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1]
  721 + kernel_mat[l] += DPS[i][j]
  722 +
  723 + K = 0.0
  724 + for l in range(p):
  725 + K += kernel_mat[l]
  726 + return K
  727 +
  728 +
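string_kernel and head_string_kernel normalise the subsequence kernel SK to SK(s1, s2) / sqrt(SK(s1, s1) * SK(s2, s2)), so identical strings score exactly 1.0 and strings with no characters in common score 0.0. A quick illustrative check (toy dicts with only the fields these functions read):

    print(string_kernel({'text': u'kot'}, {'text': u'kot'}))    # 1.0, identical texts
    print(string_kernel({'text': u'kot'}, {'text': u'pies'}))   # 0.0, no common characters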
  729 +def left_match(ante, ana):
  730 + if (ante['text'].lower().startswith(ana['text'].lower()) or
  731 + ana['text'].lower().startswith(ante['text'].lower())):
  732 + return 1
  733 + return 0
  734 +
  735 +
  736 +def right_match(ante, ana):
  737 + if (ante['text'].lower().endswith(ana['text'].lower()) or
  738 + ana['text'].lower().endswith(ante['text'].lower())):
  739 + return 1
  740 + return 0
  741 +
  742 +# def string_match_no_hyphenation(ante, ana):
  743 +# ante_no_hyphen = remove_hyphen_signs(ante['text'])
  744 +# ana_no_hyphen = remove_hyphen_signs(ana['text'])
  745 +# if ante_no_hyphen == ana_no_hyphen:
  746 +# return 1
  747 +# return 0
  748 +#
  749 +#
  750 +# def string_match_no_hyphenation_lowercase(ante, ana):
  751 +# ante_no_hyphen = remove_hyphen_signs(ante['text']).lower()
  752 +# ana_no_hyphen = remove_hyphen_signs(ana['text']).lower()
  753 +# if ante_no_hyphen == ana_no_hyphen:
  754 +# return 1
  755 +# return 0
  756 +
  757 +
  758 +def remove_hyphen_signs(text):
  759 + for sign in HYPHEN_SIGNS:
  760 + text = text.replace(sign, '')
  761 + return text
  762 +
  763 +
  764 +def samesent_anapron_antefirstinpar(ante, ana):
  765 + if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante['first_in_paragraph']:
  766 + return 1
  767 + return 0
  768 +
  769 +
  770 +def samesent_antefirstinpar_personnumbermatch(ante, ana):
  771 + if (same_sentence(ante, ana) and ante['first_in_paragraph']
  772 + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
  773 + return 1
  774 + return 0
  775 +
  776 +
  777 +def adjsent_anapron_adjmen_personnumbermatch(ante, ana):
  778 + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
  779 + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1
  780 + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
  781 + return 1
  782 + return 0
  783 +
  784 +
  785 +def adjsent_anapron_adjmen(ante, ana):
  786 + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
  787 + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1):
  788 + return 1
  789 + return 0
  790 +
  791 +
  792 +def abbrev2(ante, ana):
  793 + ante_abbrev = get_abbrev(ante)
  794 + ana_abbrev = get_abbrev(ana)
  795 + if ante['head_orth'] == ana_abbrev or ana['head_orth'] == ante_abbrev:
  796 + return 1
  797 + return 0
  798 +
  799 +
  800 +def get_abbrev(mention):
  801 + abbrev = u''
  802 + for word in mention['words']:
  803 + if word['orth'][0].isupper():
  804 + abbrev += word['orth'][0]
  805 + return abbrev
  806 +
  807 +
  808 +def wordnet_synonyms(ante, ana, lemma2synonyms):
  809 + ante_synonyms = set()
  810 + if ante['head']['base'] in lemma2synonyms:
  811 + ante_synonyms = lemma2synonyms[ante['head']['base']]
  812 +
  813 + ana_synonyms = set()
  814 + if ana['head']['base'] in lemma2synonyms:
  815 + ana_synonyms = lemma2synonyms[ana['head']['base']]
  816 +
  817 + if ana['head']['base'] in ante_synonyms or ante['head']['base'] in ana_synonyms:
  818 + return 1
  819 + return 0
  820 +
  821 +
  822 +def wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms):
  823 + ante_hypernyms = set()
  824 + if ante['head']['base'] in lemma2hypernyms:
  825 + ante_hypernyms = lemma2hypernyms[ante['head']['base']]
  826 +
  827 + ana_hypernyms = set()
  828 + if ana['head']['base'] in lemma2hypernyms:
  829 + ana_hypernyms = lemma2hypernyms[ana['head']['base']]
  830 +
  831 + if not ante_hypernyms or not ana_hypernyms:
  832 + return 0
  833 +
  834 + if ana['head']['base'] in ante_hypernyms:
  835 + return 1
  836 + return 0
  837 +
  838 +
  839 +def wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms):
  840 + ana_hypernyms = set()
  841 + if ana['head']['base'] in lemma2hypernyms:
  842 + ana_hypernyms = lemma2hypernyms[ana['head']['base']]
  843 +
  844 + ante_hypernyms = set()
  845 + if ante['head']['base'] in lemma2hypernyms:
  846 + ante_hypernyms = lemma2hypernyms[ante['head']['base']]
  847 +
  848 + if not ante_hypernyms or not ana_hypernyms:
  849 + return 0
  850 +
  851 + if ante['head']['base'] in ana_hypernyms:
  852 + return 1
  853 + return 0
  854 +
  855 +
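The three WordNet features compare only the heads' base forms against the loaded maps. A made-up example (map contents are hypothetical; real values come from lemma2synonyms.map and lemma2hypernyms.map):

    lemma2synonyms = {u'auto': set([u'samochód'])}
    lemma2hypernyms = {u'pies': set([u'zwierzę']), u'zwierzę': set([u'istota'])}
    print(wordnet_synonyms({'head': {'base': u'auto'}},
                           {'head': {'base': u'samochód'}}, lemma2synonyms))          # 1
    print(wordnet_ana_is_hypernym({'head': {'base': u'pies'}},
                                  {'head': {'base': u'zwierzę'}}, lemma2hypernyms))   # 1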
  856 +def wikipedia_link(ante, ana, title2links):
  857 + ante_base = ante['lemmatized_text'].lower()
  858 + ana_base = ana['lemmatized_text'].lower()
  859 + if ante_base == ana_base:
  860 + return 1
  861 +
  862 + ante_links = set()
  863 + if ante_base in title2links:
  864 + ante_links = title2links[ante_base]
  865 +
  866 + ana_links = set()
  867 + if ana_base in title2links:
  868 + ana_links = title2links[ana_base]
  869 +
  870 + if ana_base in ante_links or ante_base in ana_links:
  871 + return 1
  872 +
  873 + return 0
  874 +
  875 +
  876 +def wikipedia_mutual_link(ante, ana, title2links):
  877 + ante_base = ante['lemmatized_text'].lower()
  878 + ana_base = ana['lemmatized_text'].lower()
  879 + if ante_base == ana_base:
  880 + return 1
  881 +
  882 + ante_links = set()
  883 + if ante_base in title2links:
  884 + ante_links = title2links[ante_base]
  885 +
  886 + ana_links = set()
  887 + if ana_base in title2links:
  888 + ana_links = title2links[ana_base]
  889 +
  890 + if ana_base in ante_links and ante_base in ana_links:
  891 + return 1
  892 +
  893 + return 0
  894 +
  895 +
  896 +def wikipedia_redirect(ante, ana, title2redirect):
  897 + ante_base = ante['lemmatized_text'].lower()
  898 + ana_base = ana['lemmatized_text'].lower()
  899 + if ante_base == ana_base:
  900 + return 1
  901 +
  902 + if ante_base in title2redirect and title2redirect[ante_base] == ana_base:
  903 + return 1
  904 +
  905 + if ana_base in title2redirect and title2redirect[ana_base] == ante_base:
  906 + return 1
  907 +
  908 + return 0
  909 +
  910 +
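All three Wikipedia features key into the maps by the lowercased lemmatized text of each mention, so the result depends entirely on what link.map and redirect.map contain. Purely illustrative:

    title2redirect = {u'ue': u'unia europejska'}
    ante = {'lemmatized_text': u'Unia Europejska'}
    ana = {'lemmatized_text': u'UE'}
    print(wikipedia_redirect(ante, ana, title2redirect))   # 1: u'ue' redirects to the antecedent's title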
476 911 def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'):
477 912 markables_dicts = []
478 913 markables_tree = etree.parse(markables_path)
... ... @@ -492,7 +927,8 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www
492 927 if head_orth not in POSSIBLE_HEADS:
493 928 mention_words = span_to_words(span, words)
494 929  
495   - prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words)
  930 + (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
  931 + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) = get_context(mention_words, words)
496 932  
497 933 head = get_head(head_orth, mention_words)
498 934 markables_dicts.append({'id': markable.attrib['id'],
... ... @@ -513,9 +949,11 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace=&#39;www
513 949 'end_in_words': mnt_end_position,
514 950 'rarest': get_rarest_word(mention_words, freq_list),
515 951 'paragraph_id': paragraph_id,
516   - 'sentence_id': sentence_id})
  952 + 'sentence_id': sentence_id,
  953 + 'first_in_sentence': first_in_sentence,
  954 + 'first_in_paragraph': first_in_paragraph})
517 955 else:
518   - print 'Zduplikowana wzmianka: %s' % span
  956 + print ('Zduplikowana wzmianka: %s' % span)
519 957  
520 958 return markables_dicts
521 959  
... ... @@ -529,10 +967,16 @@ def get_context(mention_words, words):
529 967 mnt_start_position = -1
530 968 first_word = mention_words[0]
531 969 last_word = mention_words[-1]
  970 + first_in_sentence = False
  971 + first_in_paragraph = False
532 972 for idx, word in enumerate(words):
533 973 if word['id'] == first_word['id']:
534 974 prec_context = get_prec_context(idx, words)
535 975 mnt_start_position = get_mention_start(first_word, words)
  976 + if idx == 0 or words[idx-1]['lastinsent']:
  977 + first_in_sentence = True
  978 + if idx == 0 or words[idx-1]['lastinpar']:
  979 + first_in_paragraph = True
536 980 if word['id'] == last_word['id']:
537 981 follow_context = get_follow_context(idx, words)
538 982 sentence = get_sentence(idx, words)
... ... @@ -542,7 +986,8 @@ def get_context(mention_words, words):
542 986 sentence_id += 1
543 987 if word['lastinpar']:
544 988 paragraph_id += 1
545   - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
  989 + return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
  990 + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph)
546 991  
547 992  
548 993 def get_prec_context(mention_start, words):
... ... @@ -743,9 +1188,9 @@ def to_text(words, form):
743 1188  
744 1189  
745 1190 def get_one_word_text(word_id, words, form):
746   - this_word = (word for word in words if word['id'] == word_id).next()
  1191 + this_word = next(word for word in words if word['id'] == word_id)
747 1192 if word_to_ignore(this_word):
748   - print this_word
  1193 + print (this_word)
749 1194 return this_word[form]
750 1195  
751 1196  
... ...