Commit 7d27f9e9ae10935e1a182e7860943dcbb416e473
1 parent: 04c45e2d
Added Bartek-3 features to preparator script.
Showing 7 changed files with 500 additions and 55 deletions.
data/wikipedia/link.map            0 → 100755   (binary, no preview)
data/wikipedia/redirect.map        0 → 100755   (binary, no preview)
data/wordnet/lemma2hypernyms.map   0 → 100755   (binary, no preview)
data/wordnet/lemma2synonyms.map    0 → 100755   (binary, no preview)
for_investigation.ipynb
... | ... | @@ -114,7 +114,7 @@ |
114 | 114 | }, |
115 | 115 | "outputs": [], |
116 | 116 | "source": [ |
117 | - " predictions = model.predict(test_set)" | |
117 | + "predictions = model.predict(test_set)" | |
118 | 118 | ] |
119 | 119 | }, |
120 | 120 | { |
... | ... | @@ -141,7 +141,7 @@ |
141 | 141 | } |
142 | 142 | ], |
143 | 143 | "source": [ |
144 | - " true_positives = 0.0\n", | |
144 | + "true_positives = 0.0\n", | |
145 | 145 | " false_positives = 0.0\n", |
146 | 146 | " true_negatives = 0.0\n", |
147 | 147 | " false_negatives = 0.0\n", |
... | ... | @@ -173,7 +173,7 @@ |
173 | 173 | "language_info": { |
174 | 174 | "codemirror_mode": { |
175 | 175 | "name": "ipython", |
176 | - "version": 2 | |
176 | + "version": 2.0 | |
177 | 177 | }, |
178 | 178 | "file_extension": ".py", |
179 | 179 | "mimetype": "text/x-python", |
... | ... | @@ -184,5 +184,5 @@ |
184 | 184 | } |
185 | 185 | }, |
186 | 186 | "nbformat": 4, |
187 | - "nbformat_minor": 2 | |
188 | -} | |
187 | + "nbformat_minor": 0 | |
188 | +} | |
189 | 189 | \ No newline at end of file |
... | ... |
mention-pair-classifier.ipynb
... | ... | @@ -78,7 +78,7 @@ |
78 | 78 | "number_of_features = 1126\n", |
79 | 79 | "\n", |
80 | 80 | "X = data[:,0:1126]\n", |
81 | - "Y = data[:,1126] #last column consists of labels\n" | |
81 | + "Y = data[:,1126] #last column consists of labels" | |
82 | 82 | ] |
83 | 83 | }, |
84 | 84 | { |
... | ... | @@ -270,7 +270,7 @@ |
270 | 270 | "language_info": { |
271 | 271 | "codemirror_mode": { |
272 | 272 | "name": "ipython", |
273 | - "version": 2 | |
273 | + "version": 2.0 | |
274 | 274 | }, |
275 | 275 | "file_extension": ".py", |
276 | 276 | "mimetype": "text/x-python", |
... | ... | @@ -281,5 +281,5 @@ |
281 | 281 | } |
282 | 282 | }, |
283 | 283 | "nbformat": 4, |
284 | - "nbformat_minor": 2 | |
285 | -} | |
284 | + "nbformat_minor": 0 | |
285 | +} | |
286 | 286 | \ No newline at end of file |
... | ... |
preparator.py
1 | 1 | # -*- coding: utf-8 -*- |
2 | 2 | |
3 | 3 | import codecs |
4 | +import math | |
4 | 5 | import numpy |
5 | 6 | import os |
6 | 7 | import random |
8 | +import re | |
9 | + | |
10 | +import javaobj | |
7 | 11 | |
8 | 12 | from lxml import etree |
9 | 13 | from itertools import combinations |
... | ... | @@ -12,25 +16,39 @@ from natsort import natsorted |
12 | 16 | from gensim.models.word2vec import Word2Vec |
13 | 17 | |
14 | 18 | |
15 | -TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared')) | |
16 | -TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared')) | |
17 | -FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst')) | |
19 | +MAIN_PATH = os.path.dirname(__file__) | |
20 | +TEST_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'test-prepared')) | |
21 | +TRAIN_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'train-prepared')) | |
22 | +FREQ_300M_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'freq', 'base.lst')) | |
23 | + | |
24 | +LEMMA2SYNONYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2synonyms.map')) | |
25 | +LEMMA2HYPERNYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2hypernyms.map')) | |
26 | + | |
27 | +TITLE2LINKS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'link.map')) | |
28 | +TITLE2REDIRECT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'redirect.map')) | |
18 | 29 | |
19 | 30 | ANNO_PATH = TEST_PATH |
20 | -OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', | |
21 | - 'test-20170627.csv')) | |
31 | +OUT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', | |
32 | + 'test-20170720.csv')) | |
22 | 33 | EACH_TEXT_SEPARATELLY = False |
23 | 34 | |
24 | 35 | CONTEXT = 5 |
25 | 36 | W2V_SIZE = 50 |
26 | -MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models', | |
37 | +MODEL = os.path.abspath(os.path.join(MAIN_PATH, 'models', | |
27 | 38 | '%d' % W2V_SIZE, |
28 | 39 | 'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE)) |
29 | 40 | |
41 | +FIRST_SECOND_PERSON = ['pri', 'sec'] | |
42 | +INDICATIVE_PRONS_BASES = ["ten", "ta", "to", "ci", "te", "tamten", "tamta", | |
43 | + "tamto", "tamci", "tamte", "ów", "owa", "owo", "owi", "owe"] | |
44 | +SIEBIE_TAGS = ['siebie'] | |
45 | +MASCULINE_TAGS = ['m1', 'm2', 'm3'] | |
46 | + | |
30 | 47 | NOUN_TAGS = ['subst', 'ger', 'depr'] |
31 | 48 | PPRON_TAGS = ['ppron12', 'ppron3'] |
32 | 49 | ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt'] |
33 | 50 | POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-'] |
51 | +HYPHEN_SIGNS = ['-', '#'] | |
34 | 52 | |
35 | 53 | NEG_PROPORTION = 1 |
36 | 54 | RANDOM_VECTORS = True |
... | ... | @@ -45,13 +63,18 @@ UNKNONW_WORDS = 0 |
45 | 63 | def main(): |
46 | 64 | model = Word2Vec.load(MODEL) |
47 | 65 | freq_list = load_freq_list(FREQ_300M_PATH) |
66 | + lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH) | |
67 | + lemma2hypernyms = load_one2many_map(LEMMA2HYPERNYMS_PATH) | |
68 | + title2links = load_one2many_map(TITLE2LINKS_PATH) | |
69 | + title2redirect = load_one2one_map(TITLE2REDIRECT_PATH) | |
48 | 70 | try: |
49 | - create_data_vectors(model, freq_list) | |
71 | + create_data_vectors(model, freq_list, lemma2synonyms, | |
72 | + lemma2hypernyms, title2links, title2redirect) | |
50 | 73 | finally: |
51 | - print 'Unknown words: ', UNKNONW_WORDS | |
52 | - print 'All words: ', ALL_WORDS | |
53 | - print 'Positives: ', POS_COUNT | |
54 | - print 'Negatives: ', NEG_COUNT | |
74 | + print ('Unknown words: ', UNKNONW_WORDS) | |
75 | + print ('All words: ', ALL_WORDS) | |
76 | + print ('Positives: ', POS_COUNT) | |
77 | + print ('Negatives: ', NEG_COUNT) | |
55 | 78 | |
56 | 79 | |
57 | 80 | def load_freq_list(freq_path): |
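A note on the print migration in this hunk: under Python 2, a parenthesized call with several arguments, e.g. print ('Positives: ', POS_COUNT), prints a tuple rather than a plain string. A minimal sketch (not part of the commit) of the future import that makes these lines behave identically under both interpreters:

    # With the __future__ import, print(...) is a real function call on
    # Python 2 as well; without it, Python 2 renders the output as a
    # tuple, e.g. ('Positives: ', 4).
    from __future__ import print_function

    POS_COUNT = 4
    print('Positives: ', POS_COUNT)   # -> Positives:  4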
... | ... | @@ -67,16 +90,43 @@ def load_freq_list(freq_path): |
67 | 90 | return freq_list |
68 | 91 | |
69 | 92 | |
70 | -def create_data_vectors(model, freq_list): | |
93 | +def load_one2many_map(map_path): | |
94 | + this_map = {} | |
95 | + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) | |
96 | + pobj = marshaller.readObject() | |
97 | + jmap_annotations = pobj.__dict__['annotations'] | |
98 | + jmap_annotations_count = len(jmap_annotations) | |
99 | + for i in range(jmap_annotations_count): | |
100 | + if i%2 == 1: | |
101 | + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations']) | |
102 | + this_map[jmap_annotations[i]] = mapped_elements | |
103 | + return this_map | |
104 | + | |
105 | + | |
106 | +def load_one2one_map(map_path): | |
107 | + this_map = {} | |
108 | + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) | |
109 | + pobj = marshaller.readObject() | |
110 | + jmap_annotations = pobj.__dict__['annotations'] | |
111 | + jmap_annotations_count = len(jmap_annotations) | |
112 | + for i in range(jmap_annotations_count): | |
113 | + if i%2 == 1: | |
114 | + element = jmap_annotations[i+1] | |
115 | + this_map[jmap_annotations[i]] = element | |
116 | + return this_map | |
117 | + | |
118 | + | |
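load_one2many_map and load_one2one_map walk the annotation stream of a Java-serialized HashMap, in which, by the convention these loaders rely on, odd indices hold keys and the following slot holds the value. A condensed sketch of the same idea, using the javaobj API the script imports (the helper name and path are illustrative):

    import javaobj

    def load_java_map(map_path):
        # Deserialize a java.util.HashMap written by ObjectOutputStream;
        # javaobj exposes the stream's entries in the 'annotations' list.
        with open(map_path, 'rb') as map_file:
            pobj = javaobj.JavaObjectUnmarshaller(map_file).readObject()
        annotations = pobj.__dict__['annotations']
        # Odd indices hold keys, the next slot the value (as in the loaders above).
        return {annotations[i]: annotations[i + 1]
                for i in range(len(annotations)) if i % 2 == 1}

    # Hypothetical usage, mirroring main():
    # lemma2synonyms = load_java_map('data/wordnet/lemma2synonyms.map')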
119 | +def create_data_vectors(model, freq_list, lemma2synonyms, | |
120 | + lemma2hypernyms, title2links, title2redirect): | |
71 | 121 | features_file = None |
72 | 122 | if not EACH_TEXT_SEPARATELLY: |
73 | - features_file = codecs.open(OUT_PATH, 'wt', 'utf-8') | |
123 | + features_file = codecs.open(OUT_PATH, 'w', 'utf-8') | |
74 | 124 | |
75 | 125 | anno_files = os.listdir(ANNO_PATH) |
76 | 126 | anno_files = natsorted(anno_files) |
77 | 127 | for filename in anno_files: |
78 | 128 | if filename.endswith('.mmax'): |
79 | - print '=======> ', filename | |
129 | + print ('=======> ', filename) | |
80 | 130 | textname = filename.replace('.mmax', '') |
81 | 131 | |
82 | 132 | mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname) |
... | ... | @@ -85,19 +135,18 @@ def create_data_vectors(model, freq_list): |
85 | 135 | positives, negatives = diff_mentions(mentions) |
86 | 136 | |
87 | 137 | if DEBUG: |
88 | - print 'Positives:' | |
89 | - print len(positives) | |
90 | - | |
91 | - print 'Negatives:' | |
92 | - print len(negatives) | |
138 | + print ('Positives:', len(positives)) | |
139 | + print ('Negatives:', len(negatives)) | |
93 | 140 | |
94 | 141 | words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname) |
95 | 142 | mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list) |
96 | 143 | |
97 | 144 | if EACH_TEXT_SEPARATELLY: |
98 | 145 | text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname) |
99 | - features_file = codecs.open(text_features_path, 'wt', 'utf-8') | |
100 | - write_features(features_file, positives, negatives, mentions_dict, model, textname) | |
146 | + features_file = codecs.open(text_features_path, 'w', 'utf-8') | |
147 | + write_features(features_file, positives, negatives, mentions_dict, | |
148 | + model, textname, lemma2synonyms, | |
149 | + lemma2hypernyms, title2links, title2redirect) | |
101 | 150 | |
102 | 151 | if not EACH_TEXT_SEPARATELLY: |
103 | 152 | features_file.close() |
... | ... | @@ -108,7 +157,7 @@ def diff_mentions(mentions): |
108 | 157 | positives = get_positives(sets) |
109 | 158 | positives, negatives = get_negatives_and_update_positives(clustered_mensions, positives) |
110 | 159 | if len(negatives) != len(positives) and NEG_PROPORTION == 1: |
111 | - print u'Mismatched number of positive and negative examples!' | |
160 | + print (u'Mismatched number of positive and negative examples!') | |
112 | 161 | return positives, negatives |
113 | 162 | |
114 | 163 | |
... | ... | @@ -126,18 +175,18 @@ def get_sets(mentions): |
126 | 175 | sets[set_id].append(mention.attrib['span']) |
127 | 176 | clustered_mensions.append(mention.attrib['span']) |
128 | 177 | else: |
129 | - print u'Something went wrong while searching for clusters!' | |
178 | + print (u'Something went wrong while searching for clusters!') | |
130 | 179 | |
131 | 180 | sets_to_remove = [] |
132 | 181 | for set_id in sets: |
133 | 182 | if len(sets[set_id]) < 2: |
134 | 183 | sets_to_remove.append(set_id) |
135 | 184 | if len(sets[set_id]) == 1: |
136 | - print u'Removing clustered mention: ', sets[set_id][0] | |
185 | + print (u'Removing clustered mention: ', sets[set_id][0]) | |
137 | 186 | clustered_mensions.remove(sets[set_id][0]) |
138 | 187 | |
139 | 188 | for set_id in sets_to_remove: |
140 | - print u'Removing set: ', set_id | |
189 | + print (u'Removing set: ', set_id) | |
141 | 190 | sets.pop(set_id) |
142 | 191 | |
143 | 192 | return sets, clustered_mensions |
... | ... | @@ -160,21 +209,24 @@ def get_negatives_and_update_positives(clustered_mensions, positives): |
160 | 209 | samples_count = len(negatives) |
161 | 210 | if NEG_PROPORTION == 1: |
162 | 211 | positives = random.sample(set(positives), samples_count) |
163 | - print u'More positive examples than negative ones!' | |
212 | + print (u'More positive examples than negative ones!') | |
164 | 213 | negatives = random.sample(set(negatives), samples_count) |
165 | 214 | return positives, negatives |
166 | 215 | |
167 | 216 | |
168 | -def write_features(features_file, positives, negatives, mentions_dict, model, textname): | |
217 | +def write_features(features_file, positives, negatives, mentions_dict, | |
218 | + model, textname, lemma2synonyms, | |
219 | + lemma2hypernyms, title2links, title2redirect): | |
169 | 220 | global POS_COUNT |
170 | 221 | POS_COUNT += len(positives) |
171 | 222 | for pair in positives: |
172 | 223 | pair_features = [] |
173 | 224 | if DEBUG: |
174 | 225 | pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])] |
175 | - pair_features.extend(get_features(pair, mentions_dict, model)) | |
226 | + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms, | |
227 | + lemma2hypernyms, title2links, title2redirect)) | |
176 | 228 | pair_features.append(1) |
177 | - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features])) | |
229 | + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features])) | |
178 | 230 | |
179 | 231 | global NEG_COUNT |
180 | 232 | NEG_COUNT += len(negatives) |
... | ... | @@ -182,12 +234,14 @@ def write_features(features_file, positives, negatives, mentions_dict, model, te |
182 | 234 | pair_features = [] |
183 | 235 | if DEBUG: |
184 | 236 | pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])] |
185 | - pair_features.extend(get_features(pair, mentions_dict, model)) | |
237 | + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms, | |
238 | + lemma2hypernyms, title2links, title2redirect)) | |
186 | 239 | pair_features.append(0) |
187 | - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features])) | |
240 | + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features])) | |
188 | 241 | |
189 | 242 | |
190 | -def get_features(pair, mentions_dict, model): | |
243 | +def get_features(pair, mentions_dict, model, lemma2synonyms, | |
244 | + lemma2hypernyms, title2links, title2redirect): | |
191 | 245 | features = [] |
192 | 246 | ante = pair[0] |
193 | 247 | ana = pair[1] |
... | ... | @@ -195,7 +249,8 @@ def get_features(pair, mentions_dict, model): |
195 | 249 | features.extend(ante_features) |
196 | 250 | ana_features = get_mention_features(ana, mentions_dict, model) |
197 | 251 | features.extend(ana_features) |
198 | - pair_features = get_pair_features(pair, mentions_dict) | |
252 | + pair_features = get_pair_features(pair, mentions_dict, lemma2synonyms, | |
253 | + lemma2hypernyms, title2links, title2redirect) | |
199 | 254 | features.extend(pair_features) |
200 | 255 | return features |
201 | 256 | |
... | ... | @@ -280,6 +335,19 @@ def get_mention_features(mention_span, mentions_dict, model): |
280 | 335 | # supplementary features |
281 | 336 | features.extend(mention_type(mention)) |
282 | 337 | |
338 | + # supplementary features 2 | |
339 | + features.append(is_first_second_person(mention)) | |
340 | + features.append(is_demonstrative(mention)) | |
341 | + features.append(is_demonstrative_nominal(mention)) | |
342 | + features.append(is_demonstrative_pronoun(mention)) | |
343 | + features.append(is_refl_pronoun(mention)) | |
344 | + features.append(is_first_in_sentence(mention)) | |
345 | + features.append(is_zero_or_pronoun(mention)) | |
346 | + features.append(contains_digit(mention, 'head_orth')) | |
347 | + features.append(contains_digit(mention, 'text')) | |
348 | + features.append(contains_letter(mention)) | |
349 | + features.append(post_modified(mention)) | |
350 | + | |
283 | 351 | return features |
284 | 352 | |
285 | 353 | |
... | ... | @@ -296,6 +364,68 @@ def mention_type(mention): |
296 | 364 | return type_vec |
297 | 365 | |
298 | 366 | |
367 | +def is_first_second_person(mention): | |
368 | + if mention['head']['person'] in FIRST_SECOND_PERSON: | |
369 | + return 1 | |
370 | + return 0 | |
371 | + | |
372 | + | |
373 | +def is_demonstrative(mention): | |
374 | + if mention['words'][0]['base'].lower() in INDICATIVE_PRONS_BASES: | |
375 | + return 1 | |
376 | + return 0 | |
377 | + | |
378 | + | |
379 | +def is_demonstrative_nominal(mention): | |
380 | + if is_demonstrative(mention) and mention['head']['ctag'] in NOUN_TAGS: | |
381 | + return 1 | |
382 | + return 0 | |
383 | + | |
384 | + | |
385 | +def is_demonstrative_pronoun(mention): | |
386 | + if (is_demonstrative(mention) and | |
387 | + (mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS)): | |
388 | + return 1 | |
389 | + return 0 | |
390 | + | |
391 | + | |
392 | +def is_refl_pronoun(mention): | |
393 | + if mention['head']['ctag'] in SIEBIE_TAGS: | |
394 | + return 1 | |
395 | + return 0 | |
396 | + | |
397 | + | |
398 | +def is_first_in_sentence(mention): | |
399 | + if mention['first_in_sentence']: | |
400 | + return 1 | |
401 | + return 0 | |
402 | + | |
403 | + | |
404 | +def is_zero_or_pronoun(mention): | |
405 | + if mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS: | |
406 | + return 1 | |
407 | + return 0 | |
408 | + | |
409 | + | |
410 | +def contains_digit(mention, attr_name): | |
411 | + _digits = re.compile('\d') | |
412 | + if _digits.search(mention[attr_name]): | |
413 | + return 1 | |
414 | + return 0 | |
415 | + | |
416 | + | |
417 | +def contains_letter(mention): | |
418 | + if any(c.isalpha() for c in mention['text']): | |
419 | + return 1 | |
420 | + return 0 | |
421 | + | |
422 | + | |
423 | +def post_modified(mention): | |
424 | + if mention['head']['orth'] != mention['words'][-1]['orth']: | |
425 | + return 1 | |
426 | + return 0 | |
427 | + | |
428 | + | |
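The new unary predicates above read only a handful of keys from the mention dictionaries built by markables_level_2_dict further down. A toy mention (field values are illustrative) shows the expected shape:

    mention = {
        'text': u'ten samochód',
        'head_orth': u'samochód',
        'first_in_sentence': True,
        'head': {'person': 'ter', 'ctag': 'subst', 'gender': 'm3',
                 'base': u'samochód', 'orth': u'samochód'},
        'words': [{'base': u'ten', 'orth': u'ten'},
                  {'base': u'samochód', 'orth': u'samochód'}],
    }

    assert is_demonstrative(mention) == 1          # first base is "ten"
    assert is_demonstrative_nominal(mention) == 1  # ...and the head is a noun
    assert is_first_second_person(mention) == 0    # 'ter' = third person
    assert post_modified(mention) == 0             # head is the last word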
299 | 429 | def get_wv(model, lemma, random=True): |
300 | 430 | global ALL_WORDS |
301 | 431 | global UNKNONW_WORDS |
... | ... | @@ -332,7 +462,8 @@ def get_context_vec(words, model): |
332 | 462 | return vec |
333 | 463 | |
334 | 464 | |
335 | -def get_pair_features(pair, mentions_dict): | |
465 | +def get_pair_features(pair, mentions_dict, lemma2synonyms, | |
466 | + lemma2hypernyms, title2links, title2redirect): | |
336 | 467 | ante = get_mention_by_attr(mentions_dict, 'span', pair[0]) |
337 | 468 | ana = get_mention_by_attr(mentions_dict, 'span', pair[1]) |
338 | 469 | |
... | ... | @@ -375,6 +506,32 @@ def get_pair_features(pair, mentions_dict): |
375 | 506 | features.append(same_sentence(ante, ana)) |
376 | 507 | features.append(same_paragraph(ante, ana)) |
377 | 508 | |
509 | + # supplementary features 2 | |
510 | + features.append(neighbouring_sentence(ante, ana)) | |
511 | + features.append(cousin_sentence(ante, ana)) | |
512 | + features.append(distant_sentence(ante, ana)) | |
513 | + features.append(flat_gender_agreement(ante, ana)) | |
514 | + features.append(left_match(ante, ana)) | |
515 | + features.append(right_match(ante, ana)) | |
516 | + features.append(abbrev2(ante, ana)) | |
517 | + | |
518 | + features.append(string_kernel(ante, ana)) | |
519 | + features.append(head_string_kernel(ante, ana)) | |
520 | + | |
521 | + features.append(wordnet_synonyms(ante, ana, lemma2synonyms)) | |
522 | + features.append(wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms)) | |
523 | + features.append(wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms)) | |
524 | + | |
525 | + features.append(wikipedia_link(ante, ana, title2links)) | |
526 | + features.append(wikipedia_mutual_link(ante, ana, title2links)) | |
527 | + features.append(wikipedia_redirect(ante, ana, title2redirect)) | |
528 | + | |
529 | + # combined | |
530 | + features.append(samesent_anapron_antefirstinpar(ante, ana)) | |
531 | + features.append(samesent_antefirstinpar_personnumbermatch(ante, ana)) | |
532 | + features.append(adjsent_anapron_adjmen_personnumbermatch(ante, ana)) | |
533 | + features.append(adjsent_anapron_adjmen(ante, ana)) | |
534 | + | |
378 | 535 | return features |
379 | 536 | |
380 | 537 | |
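One detail worth flagging in the hunk above: features.append(flat_gender_agreement(ante, ana)) stores the whole 3-element agreement vector as a single list-valued entry, so the str(feature) join in write_features serializes it as one tab-separated column; extend would flatten it into three columns, as is done for mention_type in get_mention_features. A minimal illustration:

    features = []
    features.append([1, 0, 0])   # one column:    "[1, 0, 0]"
    features.extend([1, 0, 0])   # three columns: "1", "0", "0"
    print('\t'.join(str(f) for f in features))
    # -> [1, 0, 0]	1	0	0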
... | ... | @@ -392,7 +549,7 @@ def get_distance_bucket(distance): |
392 | 549 | elif distance >= 64: |
393 | 550 | return 9 |
394 | 551 | else: |
395 | - print u'Something went wrong during bucketing!!' | |
552 | + print (u'Something went wrong during bucketing!!') | |
396 | 553 | return 10 |
397 | 554 | |
398 | 555 | |
... | ... | @@ -445,8 +602,8 @@ def is_acronym(ante, ana): |
445 | 602 | if ana['text'].upper() == ana['text']: |
446 | 603 | return check_one_way_acronym(ana['text'], ante['text']) |
447 | 604 | if ante['text'].upper() == ante['text']: |
448 | - return check_one_way_acronym(ante['text'], ana['text']); | |
449 | - return 0; | |
605 | + return check_one_way_acronym(ante['text'], ana['text']) | |
606 | + return 0 | |
450 | 607 | |
451 | 608 | |
452 | 609 | def check_one_way_acronym(acronym, expression): |
... | ... | @@ -455,10 +612,10 @@ def check_one_way_acronym(acronym, expression): |
455 | 612 | for expr2 in expr1.split(): |
456 | 613 | expr2 = expr2.strip() |
457 | 614 | if expr2: |
458 | - initials += unicode(expr2[0]).upper() | |
615 | + initials += str(expr2[0]).upper() | |
459 | 616 | if acronym == initials: |
460 | - return 1; | |
461 | - return 0; | |
617 | + return 1 | |
618 | + return 0 | |
462 | 619 | |
463 | 620 | |
464 | 621 | def same_sentence(ante, ana): |
... | ... | @@ -467,12 +624,290 @@ def same_sentence(ante, ana): |
467 | 624 | return 0 |
468 | 625 | |
469 | 626 | |
627 | +def neighbouring_sentence(ante, ana): | |
628 | + if ana['sentence_id'] - ante['sentence_id'] == 1: | |
629 | + return 1 | |
630 | + return 0 | |
631 | + | |
632 | + | |
633 | +def cousin_sentence(ante, ana): | |
634 | + if ana['sentence_id'] - ante['sentence_id'] == 2: | |
635 | + return 1 | |
636 | + return 0 | |
637 | + | |
638 | + | |
639 | +def distant_sentence(ante, ana): | |
640 | + if ana['sentence_id'] - ante['sentence_id'] > 2: | |
641 | + return 1 | |
642 | + return 0 | |
643 | + | |
644 | + | |
470 | 645 | def same_paragraph(ante, ana): |
471 | 646 | if ante['paragraph_id'] == ana['paragraph_id']: |
472 | 647 | return 1 |
473 | 648 | return 0 |
474 | 649 | |
475 | 650 | |
651 | +def flat_gender_agreement(ante, ana): | |
652 | + agr_vec = [0] * 3 | |
653 | + if ante['head']['gender'] == 'unk' or ana['head']['gender'] == 'unk': | |
654 | + agr_vec[2] = 1 | |
655 | + elif (ante['head']['gender'] == ana['head']['gender'] or | |
656 | + (ante['head']['gender'] in MASCULINE_TAGS and ana['head']['gender'] in MASCULINE_TAGS)): | |
657 | + agr_vec[0] = 1 | |
658 | + else: | |
659 | + agr_vec[1] = 1 | |
660 | + return agr_vec | |
661 | + | |
662 | + | |
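flat_gender_agreement returns a 3-way one-hot: index 0 for agreement (with the masculine subgenders m1/m2/m3 collapsed into one class), index 1 for disagreement, and index 2 when either gender is unknown. Illustrative inputs (only the gender field matters here):

    ante = {'head': {'gender': 'm1'}}
    ana  = {'head': {'gender': 'm3'}}
    assert flat_gender_agreement(ante, ana) == [1, 0, 0]  # m1 vs m3 still agrees

    ana = {'head': {'gender': 'f'}}
    assert flat_gender_agreement(ante, ana) == [0, 1, 0]  # masculine vs feminine

    ana = {'head': {'gender': 'unk'}}
    assert flat_gender_agreement(ante, ana) == [0, 0, 1]  # unknown gender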
663 | +def string_kernel(ante, ana): | |
664 | + s1 = ante['text'] | |
665 | + s2 = ana['text'] | |
666 | + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
667 | + | |
668 | + | |
669 | +def head_string_kernel(ante, ana): | |
670 | + s1 = ante['head_orth'] | |
671 | + s2 = ana['head_orth'] | |
672 | + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
673 | + | |
674 | + | |
675 | +def SK(s1, s2): | |
676 | + LAMBDA = 0.4 | |
677 | + | |
678 | + p = len(s1) | |
679 | + if len(s2) < len(s1): | |
680 | + p = len(s2) | |
681 | + | |
682 | + h, w = len(s1)+1, len(s2)+1 | |
683 | + DPS = [[0.0] * w for i in range(h)] | |
684 | + DP = [[0.0] * w for i in range(h)] | |
685 | + | |
686 | + kernel_mat = [0.0] * (len(s1) + 1) | |
687 | + | |
688 | + for i in range(len(s1)+1): | |
689 | + if i == 0: | |
690 | + continue | |
691 | + for j in range(len(s2)+1): | |
692 | + if j == 0: | |
693 | + continue | |
694 | + if s1[i-1] == s2[j-1]: | |
695 | + DPS[i][j] = LAMBDA * LAMBDA | |
696 | + kernel_mat[0] += DPS[i][j] | |
697 | + else: | |
698 | + DPS[i][j] = 0.0 | |
699 | + | |
700 | + for l in range(p): | |
701 | + if l == 0: | |
702 | + continue | |
703 | + | |
704 | + kernel_mat[l] = 0.0 | |
705 | + for j in range(len(s2)+1): | |
706 | + DP[l-1][j] = 0.0 | |
707 | + | |
708 | + for i in range(len(s1)+1): | |
709 | + DP[i][l-1] = 0.0 | |
710 | + | |
711 | + for i in range(len(s1)+1): | |
712 | + if i < l: | |
713 | + continue | |
714 | + for j in range(len(s2)+1): | |
715 | + if j < l: | |
716 | + continue | |
717 | + DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
718 | + | |
719 | + if s1[i-1] == s2[j-1]: | |
720 | + DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
721 | + kernel_mat[l] += DPS[i][j] | |
722 | + | |
723 | + K = 0.0 | |
724 | + for l in range(p): | |
725 | + K += kernel_mat[l] | |
726 | + return K | |
727 | + | |
728 | + | |
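SK is a dynamic-programming subsequence string kernel with decay LAMBDA = 0.4, accumulating matches over subsequence lengths up to the length of the shorter string; string_kernel and head_string_kernel normalize it so that identical strings score exactly 1.0. A usage sketch (the normalization divides by SK(s, s), so it assumes non-empty mention texts):

    ante = {'text': u'Unia Europejska', 'head_orth': u'Unia'}
    ana  = {'text': u'Unia',            'head_orth': u'Unia'}

    print(head_string_kernel(ante, ana))  # identical heads -> 1.0
    print(string_kernel(ante, ana))       # partial overlap -> value in (0, 1)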
729 | +def left_match(ante, ana): | |
730 | + if (ante['text'].lower().startswith(ana['text'].lower()) or | |
731 | + ana['text'].lower().startswith(ante['text'].lower())): | |
732 | + return 1 | |
733 | + return 0 | |
734 | + | |
735 | + | |
736 | +def right_match(ante, ana): | |
737 | + if (ante['text'].lower().endswith(ana['text'].lower()) or | |
738 | + ana['text'].lower().endswith(ante['text'].lower())): | |
739 | + return 1 | |
740 | + return 0 | |
741 | + | |
742 | +# def string_match_no_hyphenation(ante, ana): | |
743 | +# ante_no_hyphen = remove_hyphen_signs(ante['text']) | |
744 | +# ana_no_hyphen = remove_hyphen_signs(ana['text']) | |
745 | +# if ante_no_hyphen == ana_no_hyphen: | |
746 | +# return 1 | |
747 | +# return 0 | |
748 | +# | |
749 | +# | |
750 | +# def string_match_no_hyphenation_lowercase(ante, ana): | |
751 | +# ante_no_hyphen = remove_hyphen_signs(ante['text']).lower() | |
752 | +# ana_no_hyphen = remove_hyphen_signs(ana['text']).lower() | |
753 | +# if ante_no_hyphen == ana_no_hyphen: | |
754 | +# return 1 | |
755 | +# return 0 | |
756 | + | |
757 | + | |
758 | +def remove_hyphen_signs(text): | |
759 | + for sign in HYPHEN_SIGNS: | |
760 | + text = text.replace(sign, '') | |
761 | + return text | |
762 | + | |
763 | + | |
764 | +def samesent_anapron_antefirstinpar(ante, ana): | |
765 | + if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante['first_in_paragraph']: | |
766 | + return 1 | |
767 | + return 0 | |
768 | + | |
769 | + | |
770 | +def samesent_antefirstinpar_personnumbermatch(ante, ana): | |
771 | + if (same_sentence(ante, ana) and ante['first_in_paragraph'] | |
772 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
773 | + return 1 | |
774 | + return 0 | |
775 | + | |
776 | + | |
777 | +def adjsent_anapron_adjmen_personnumbermatch(ante, ana): | |
778 | + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) | |
779 | + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1 | |
780 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
781 | + return 1 | |
782 | + return 0 | |
783 | + | |
784 | + | |
785 | +def adjsent_anapron_adjmen(ante, ana): | |
786 | + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) | |
787 | + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1): | |
788 | + return 1 | |
789 | + return 0 | |
790 | + | |
791 | + | |
792 | +def abbrev2(ante, ana): | |
793 | + ante_abbrev = get_abbrev(ante) | |
794 | + ana_abbrev = get_abbrev(ana) | |
795 | + if ante['head_orth'] == ana_abbrev or ana['head_orth'] == ante_abbrev: | |
796 | + return 1 | |
797 | + return 0 | |
798 | + | |
799 | + | |
800 | +def get_abbrev(mention): | |
801 | + abbrev = u'' | |
802 | + for word in mention['words']: | |
803 | + if word['orth'][0].isupper(): | |
804 | + abbrev += word['orth'][0] | |
805 | + return abbrev | |
806 | + | |
807 | + | |
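get_abbrev concatenates the initial letters of a mention's capitalized words, so abbrev2 fires when one mention's head_orth equals the other mention's initials. Illustrative values:

    expansion = {'head_orth': u'Koleje',
                 'words': [{'orth': u'Polskie'}, {'orth': u'Koleje'},
                           {'orth': u'Państwowe'}]}
    acronym   = {'head_orth': u'PKP', 'words': [{'orth': u'PKP'}]}

    assert get_abbrev(expansion) == u'PKP'
    assert abbrev2(expansion, acronym) == 1  # 'PKP' == initials of the expansion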
808 | +def wordnet_synonyms(ante, ana, lemma2synonyms): | |
809 | + ante_synonyms = set() | |
810 | + if ante['head']['base'] in lemma2synonyms: | |
811 | + ante_synonyms = lemma2synonyms[ante['head']['base']] | |
812 | + | |
813 | + ana_synonyms = set() | |
814 | + if ana['head']['base'] in lemma2synonyms: | |
815 | + ana_synonyms = lemma2synonyms[ana['head']['base']] | |
816 | + | |
817 | + if ana['head']['base'] in ante_synonyms or ante['head']['base'] in ana_synonyms: | |
818 | + return 1 | |
819 | + return 0 | |
820 | + | |
821 | + | |
822 | +def wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms): | |
823 | + ante_hypernyms = set() | |
824 | + if ante['head']['base'] in lemma2hypernyms: | |
825 | + ante_hypernyms = lemma2hypernyms[ante['head']['base']] | |
826 | + | |
827 | + ana_hypernyms = set() | |
828 | + if ana['head']['base'] in lemma2hypernyms: | |
829 | + ana_hypernyms = lemma2hypernyms[ana['head']['base']] | |
830 | + | |
831 | + if not ante_hypernyms or not ana_hypernyms: | |
832 | + return 0 | |
833 | + | |
834 | + if ana['head']['base'] in ante_hypernyms: | |
835 | + return 1 | |
836 | + return 0 | |
837 | + | |
838 | + | |
839 | +def wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms): | |
840 | + ana_hypernyms = set() | |
841 | + if ana['head']['base'] in lemma2hypernyms: | |
842 | + ana_hypernyms = lemma2hypernyms[ana['head']['base']] | |
843 | + | |
844 | + ante_hypernyms = set() | |
845 | + if ante['head']['base'] in lemma2hypernyms: | |
846 | + ante_hypernyms = lemma2hypernyms[ante['head']['base']] | |
847 | + | |
848 | + if not ante_hypernyms or not ana_hypernyms: | |
849 | + return 0 | |
850 | + | |
851 | + if ante['head']['base'] in ana_hypernyms: | |
852 | + return 1 | |
853 | + return 0 | |
854 | + | |
855 | + | |
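The three WordNet features consult the maps loaded from data/wordnet: a hit in either direction of lemma2synonyms counts as synonymy, while the hypernym checks are directional and, as written, require both heads to have a hypernym entry before they can fire. A sketch with toy maps (the lemmas are illustrative):

    lemma2synonyms  = {u'auto': set([u'samochód'])}
    lemma2hypernyms = {u'samochód': set([u'pojazd']), u'pojazd': set([u'rzecz'])}

    ante = {'head': {'base': u'samochód'}}
    ana  = {'head': {'base': u'auto'}}
    assert wordnet_synonyms(ante, ana, lemma2synonyms) == 1

    ana = {'head': {'base': u'pojazd'}}
    assert wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms) == 1
    assert wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms) == 0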
856 | +def wikipedia_link(ante, ana, title2links): | |
857 | + ante_base = ante['lemmatized_text'].lower() | |
858 | + ana_base = ana['lemmatized_text'].lower() | |
859 | + if ante_base == ana_base: | |
860 | + return 1 | |
861 | + | |
862 | + ante_links = set() | |
863 | + if ante_base in title2links: | |
864 | + ante_links = title2links[ante_base] | |
865 | + | |
866 | + ana_links = set() | |
867 | + if ana_base in title2links: | |
868 | + ana_links = title2links[ana_base] | |
869 | + | |
870 | + if ana_base in ante_links or ante_base in ana_links: | |
871 | + return 1 | |
872 | + | |
873 | + return 0 | |
874 | + | |
875 | + | |
876 | +def wikipedia_mutual_link(ante, ana, title2links): | |
877 | + ante_base = ante['lemmatized_text'].lower() | |
878 | + ana_base = ana['lemmatized_text'].lower() | |
879 | + if ante_base == ana_base: | |
880 | + return 1 | |
881 | + | |
882 | + ante_links = set() | |
883 | + if ante_base in title2links: | |
884 | + ante_links = title2links[ante_base] | |
885 | + | |
886 | + ana_links = set() | |
887 | + if ana_base in title2links: | |
888 | + ana_links = title2links[ana_base] | |
889 | + | |
890 | + if ana_base in ante_links and ante_base in ana_links: | |
891 | + return 1 | |
892 | + | |
893 | + return 0 | |
894 | + | |
895 | + | |
896 | +def wikipedia_redirect(ante, ana, title2redirect): | |
897 | + ante_base = ante['lemmatized_text'].lower() | |
898 | + ana_base = ana['lemmatized_text'].lower() | |
899 | + if ante_base == ana_base: | |
900 | + return 1 | |
901 | + | |
902 | + if ante_base in title2redirect and title2redirect[ante_base] == ana_base: | |
903 | + return 1 | |
904 | + | |
905 | + if ana_base in title2redirect and title2redirect[ana_base] == ante_base: | |
906 | + return 1 | |
907 | + | |
908 | + return 0 | |
909 | + | |
910 | + | |
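The Wikipedia features compare the lowercased lemmatized mention texts against the link and redirect maps from data/wikipedia: wikipedia_link fires on a link in either direction, wikipedia_mutual_link only when both pages link to each other, and wikipedia_redirect when one title redirects to the other; all three short-circuit to 1 on identical texts. A sketch with toy maps:

    title2links    = {u'kraków': set([u'polska']), u'polska': set([u'kraków'])}
    title2redirect = {u'krakow': u'kraków'}

    ante = {'lemmatized_text': u'Kraków'}
    ana  = {'lemmatized_text': u'Polska'}
    assert wikipedia_link(ante, ana, title2links) == 1
    assert wikipedia_mutual_link(ante, ana, title2links) == 1

    ana = {'lemmatized_text': u'Krakow'}
    assert wikipedia_redirect(ante, ana, title2redirect) == 1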
476 | 911 | def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'): |
477 | 912 | markables_dicts = [] |
478 | 913 | markables_tree = etree.parse(markables_path) |
... | ... | @@ -492,7 +927,8 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www |
492 | 927 | if head_orth not in POSSIBLE_HEADS: |
493 | 928 | mention_words = span_to_words(span, words) |
494 | 929 | |
495 | - prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words) | |
930 | + (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, | |
931 | + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) = get_context(mention_words, words) | |
496 | 932 | |
497 | 933 | head = get_head(head_orth, mention_words) |
498 | 934 | markables_dicts.append({'id': markable.attrib['id'], |
... | ... | @@ -513,9 +949,11 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www |
513 | 949 | 'end_in_words': mnt_end_position, |
514 | 950 | 'rarest': get_rarest_word(mention_words, freq_list), |
515 | 951 | 'paragraph_id': paragraph_id, |
516 | - 'sentence_id': sentence_id}) | |
952 | + 'sentence_id': sentence_id, | |
953 | + 'first_in_sentence': first_in_sentence, | |
954 | + 'first_in_paragraph': first_in_paragraph}) | |
517 | 955 | else: |
518 | - print 'Duplicated mention: %s' % span | |
956 | + print ('Duplicated mention: %s' % span) | |
519 | 957 | |
520 | 958 | return markables_dicts |
521 | 959 | |
... | ... | @@ -529,10 +967,16 @@ def get_context(mention_words, words): |
529 | 967 | mnt_start_position = -1 |
530 | 968 | first_word = mention_words[0] |
531 | 969 | last_word = mention_words[-1] |
970 | + first_in_sentence = False | |
971 | + first_in_paragraph = False | |
532 | 972 | for idx, word in enumerate(words): |
533 | 973 | if word['id'] == first_word['id']: |
534 | 974 | prec_context = get_prec_context(idx, words) |
535 | 975 | mnt_start_position = get_mention_start(first_word, words) |
976 | + if idx == 0 or words[idx-1]['lastinsent']: | |
977 | + first_in_sentence = True | |
978 | + if idx == 0 or words[idx-1]['lastinpar']: | |
979 | + first_in_paragraph = True | |
536 | 980 | if word['id'] == last_word['id']: |
537 | 981 | follow_context = get_follow_context(idx, words) |
538 | 982 | sentence = get_sentence(idx, words) |
... | ... | @@ -542,7 +986,8 @@ def get_context(mention_words, words): |
542 | 986 | sentence_id += 1 |
543 | 987 | if word['lastinpar']: |
544 | 988 | paragraph_id += 1 |
545 | - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id | |
989 | + return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, | |
990 | + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) | |
546 | 991 | |
547 | 992 | |
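The new first_in_sentence / first_in_paragraph flags come from the previous word's lastinsent / lastinpar markers (a mention that opens the text counts as both). The rule in isolation, on a toy word stream with only the fields get_context reads:

    words = [{'id': 'w1', 'lastinsent': True,  'lastinpar': False},
             {'id': 'w2', 'lastinsent': False, 'lastinpar': False}]

    idx = 1  # mention starts at words[1]
    first_in_sentence  = idx == 0 or words[idx - 1]['lastinsent']   # True
    first_in_paragraph = idx == 0 or words[idx - 1]['lastinpar']    # False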
548 | 993 | def get_prec_context(mention_start, words): |
... | ... | @@ -743,9 +1188,9 @@ def to_text(words, form): |
743 | 1188 | |
744 | 1189 | |
745 | 1190 | def get_one_word_text(word_id, words, form): |
746 | - this_word = (word for word in words if word['id'] == word_id).next() | |
1191 | + this_word = next(word for word in words if word['id'] == word_id) | |
747 | 1192 | if word_to_ignore(this_word): |
748 | - print this_word | |
1193 | + print (this_word) | |
749 | 1194 | return this_word[form] |
750 | 1195 | |
751 | 1196 | |
... | ... |