diff --git a/data/wikipedia/link.map b/data/wikipedia/link.map new file mode 100755 index 0000000..a14081b --- /dev/null +++ b/data/wikipedia/link.map diff --git a/data/wikipedia/redirect.map b/data/wikipedia/redirect.map new file mode 100755 index 0000000..3b614e4 --- /dev/null +++ b/data/wikipedia/redirect.map diff --git a/data/wordnet/lemma2hypernyms.map b/data/wordnet/lemma2hypernyms.map new file mode 100755 index 0000000..6fd662e --- /dev/null +++ b/data/wordnet/lemma2hypernyms.map diff --git a/data/wordnet/lemma2synonyms.map b/data/wordnet/lemma2synonyms.map new file mode 100755 index 0000000..caa12b2 --- /dev/null +++ b/data/wordnet/lemma2synonyms.map diff --git a/for_investigation.ipynb b/for_investigation.ipynb index 3e3e95e..e164840 100644 --- a/for_investigation.ipynb +++ b/for_investigation.ipynb @@ -114,7 +114,7 @@ }, "outputs": [], "source": [ - " predictions = model.predict(test_set)" + "predictions = model.predict(test_set)" ] }, { @@ -141,7 +141,7 @@ } ], "source": [ - " true_positives = 0.0\n", + "true_positives = 0.0\n", " false_positives = 0.0\n", " true_negatives = 0.0\n", " false_negatives = 0.0\n", @@ -173,7 +173,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 2.0 }, "file_extension": ".py", "mimetype": "text/x-python", @@ -184,5 +184,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/mention-pair-classifier.ipynb b/mention-pair-classifier.ipynb index 7b9e01b..8bdb4ef 100644 --- a/mention-pair-classifier.ipynb +++ b/mention-pair-classifier.ipynb @@ -78,7 +78,7 @@ "number_of_features = 1126\n", "\n", "X = data[:,0:1126]\n", - "Y = data[:,1126] #last column consists of labels\n" + "Y = data[:,1126] #last column consists of labels" ] }, { @@ -270,7 +270,7 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 2.0 }, "file_extension": ".py", "mimetype": "text/x-python", @@ -281,5 +281,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/preparator.py b/preparator.py index 2cfdcbb..ab6ac04 100644 --- a/preparator.py +++ b/preparator.py @@ -1,9 +1,13 @@ # -*- coding: utf-8 -*- import codecs +import math import numpy import os import random +import re + +import javaobj from lxml import etree from itertools import combinations @@ -12,25 +16,39 @@ from natsort import natsorted from gensim.models.word2vec import Word2Vec -TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared')) -TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared')) -FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst')) +MAIN_PATH = os.path.dirname(__file__) +TEST_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'test-prepared')) +TRAIN_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'train-prepared')) +FREQ_300M_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'freq', 'base.lst')) + +LEMMA2SYNONYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2synonyms.map')) +LEMMA2HYPERNYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2hypernyms.map')) + +TITLE2LINKS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'link.map')) +TITLE2REDIRECT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'redirect.map')) ANNO_PATH = TEST_PATH -OUT_PATH = 
os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', - 'test-20170627.csv')) +OUT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', + 'test-20170720.csv')) EACH_TEXT_SEPARATELLY = False CONTEXT = 5 W2V_SIZE = 50 -MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models', +MODEL = os.path.abspath(os.path.join(MAIN_PATH, 'models', '%d' % W2V_SIZE, 'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE)) +FIRST_SECOND_PERSON = ['pri', 'sec'] +INDICATIVE_PRONS_BASES = ["ten", "ta", "to", "ci", "te", "tamten", "tamta", + "tamto", "tamci", "tamte", "ów", "owa", "owo", "owi", "owe"] +SIEBIE_TAGS = ['siebie'] +MASCULINE_TAGS = ['m1', 'm2', 'm3'] + NOUN_TAGS = ['subst', 'ger', 'depr'] PPRON_TAGS = ['ppron12', 'ppron3'] ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt'] POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-'] +HYPHEN_SIGNS = ['-', '#'] NEG_PROPORTION = 1 RANDOM_VECTORS = True @@ -45,13 +63,18 @@ UNKNONW_WORDS = 0 def main(): model = Word2Vec.load(MODEL) freq_list = load_freq_list(FREQ_300M_PATH) + lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH) + lemma2hypernyms = load_one2many_map(LEMMA2HYPERNYMS_PATH) + title2links = load_one2many_map(TITLE2LINKS_PATH) + title2redirect = load_one2one_map(TITLE2REDIRECT_PATH) try: - create_data_vectors(model, freq_list) + create_data_vectors(model, freq_list, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect) finally: - print 'Unknown words: ', UNKNONW_WORDS - print 'All words: ', ALL_WORDS - print 'Positives: ', POS_COUNT - print 'Negatives: ', NEG_COUNT + print ('Unknown words: ', UNKNONW_WORDS) + print ('All words: ', ALL_WORDS) + print ('Positives: ', POS_COUNT) + print ('Negatives: ', NEG_COUNT) def load_freq_list(freq_path): @@ -67,16 +90,43 @@ def load_freq_list(freq_path): return freq_list -def create_data_vectors(model, freq_list): +def load_one2many_map(map_path): + this_map = {} + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) + pobj = marshaller.readObject() + jmap_annotations = pobj.__dict__['annotations'] + jmap_annotations_count = len(jmap_annotations) + for i in range(jmap_annotations_count): + if i%2 == 1: + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations']) + this_map[jmap_annotations[i]] = mapped_elements + return this_map + + +def load_one2one_map(map_path): + this_map = {} + marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb')) + pobj = marshaller.readObject() + jmap_annotations = pobj.__dict__['annotations'] + jmap_annotations_count = len(jmap_annotations) + for i in range(jmap_annotations_count): + if i%2 == 1: + element = jmap_annotations[i+1] + this_map[jmap_annotations[i]] = element + return this_map + + +def create_data_vectors(model, freq_list, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect): features_file = None if not EACH_TEXT_SEPARATELLY: - features_file = codecs.open(OUT_PATH, 'wt', 'utf-8') + features_file = codecs.open(OUT_PATH, 'w', 'utf-8') anno_files = os.listdir(ANNO_PATH) anno_files = natsorted(anno_files) for filename in anno_files: if filename.endswith('.mmax'): - print '=======> ', filename + print ('=======> ', filename) textname = filename.replace('.mmax', '') mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname) @@ -85,19 +135,18 @@ def create_data_vectors(model, freq_list): positives, negatives = diff_mentions(mentions) if DEBUG: - print 'Positives:' - print len(positives) - - print 'Negatives:' - print len(negatives) + print ('Positives:', len(positives)) + print 
('Negatives:', len(negatives)) words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname) mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list) if EACH_TEXT_SEPARATELLY: text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname) - features_file = codecs.open(text_features_path, 'wt', 'utf-8') - write_features(features_file, positives, negatives, mentions_dict, model, textname) + features_file = codecs.open(text_features_path, 'w', 'utf-8') + write_features(features_file, positives, negatives, mentions_dict, + model, textname, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect) if not EACH_TEXT_SEPARATELLY: features_file.close() @@ -108,7 +157,7 @@ def diff_mentions(mentions): positives = get_positives(sets) positives, negatives = get_negatives_and_update_positives(clustered_mensions, positives) if len(negatives) != len(positives) and NEG_PROPORTION == 1: - print u'Niezgodna liczba przypadków pozytywnych i negatywnych!' + print (u'Niezgodna liczba przypadków pozytywnych i negatywnych!') return positives, negatives @@ -126,18 +175,18 @@ def get_sets(mentions): sets[set_id].append(mention.attrib['span']) clustered_mensions.append(mention.attrib['span']) else: - print u'Coś poszło nie tak przy wyszukiwaniu klastrów!' + print (u'Coś poszło nie tak przy wyszukiwaniu klastrów!') sets_to_remove = [] for set_id in sets: if len(sets[set_id]) < 2: sets_to_remove.append(set_id) if len(sets[set_id]) == 1: - print u'Removing clustered mention: ', sets[set_id][0] + print (u'Removing clustered mention: ', sets[set_id][0]) clustered_mensions.remove(sets[set_id][0]) for set_id in sets_to_remove: - print u'Removing set: ', set_id + print (u'Removing set: ', set_id) sets.pop(set_id) return sets, clustered_mensions @@ -160,21 +209,24 @@ def get_negatives_and_update_positives(clustered_mensions, positives): samples_count = len(negatives) if NEG_PROPORTION == 1: positives = random.sample(set(positives), samples_count) - print u'Więcej przypadków pozytywnych niż negatywnych!' 
+ print (u'Więcej przypadków pozytywnych niż negatywnych!') negatives = random.sample(set(negatives), samples_count) return positives, negatives -def write_features(features_file, positives, negatives, mentions_dict, model, textname): +def write_features(features_file, positives, negatives, mentions_dict, + model, textname, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect): global POS_COUNT POS_COUNT += len(positives) for pair in positives: pair_features = [] if DEBUG: pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])] - pair_features.extend(get_features(pair, mentions_dict, model)) + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect)) pair_features.append(1) - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features])) + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features])) global NEG_COUNT NEG_COUNT += len(negatives) @@ -182,12 +234,14 @@ def write_features(features_file, positives, negatives, mentions_dict, model, te pair_features = [] if DEBUG: pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])] - pair_features.extend(get_features(pair, mentions_dict, model)) + pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect)) pair_features.append(0) - features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features])) + features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features])) -def get_features(pair, mentions_dict, model): +def get_features(pair, mentions_dict, model, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect): features = [] ante = pair[0] ana = pair[1] @@ -195,7 +249,8 @@ def get_features(pair, mentions_dict, model): features.extend(ante_features) ana_features = get_mention_features(ana, mentions_dict, model) features.extend(ana_features) - pair_features = get_pair_features(pair, mentions_dict) + pair_features = get_pair_features(pair, mentions_dict, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect) features.extend(pair_features) return features @@ -280,6 +335,19 @@ def get_mention_features(mention_span, mentions_dict, model): # cechy uzupelniajace features.extend(mention_type(mention)) + # cechy uzupelniajace 2 + features.append(is_first_second_person(mention)) + features.append(is_demonstrative(mention)) + features.append(is_demonstrative_nominal(mention)) + features.append(is_demonstrative_pronoun(mention)) + features.append(is_refl_pronoun(mention)) + features.append(is_first_in_sentence(mention)) + features.append(is_zero_or_pronoun(mention)) + features.append(contains_digit(mention, 'head_orth')) + features.append(contains_digit(mention, 'text')) + features.append(contains_letter(mention)) + features.append(post_modified(mention)) + return features @@ -296,6 +364,68 @@ def mention_type(mention): return type_vec +def is_first_second_person(mention): + if mention['head']['person'] in FIRST_SECOND_PERSON: + return 1 + return 0 + + +def is_demonstrative(mention): + if mention['words'][0]['base'].lower() in INDICATIVE_PRONS_BASES: + return 1 + return 0 + + +def is_demonstrative_nominal(mention): + if is_demonstrative(mention) and mention['head']['ctag'] in NOUN_TAGS: + return 1 + return 0 + + +def is_demonstrative_pronoun(mention): + if (is_demonstrative(mention) and + (mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS)): + return 1 + return 0 
+ + +def is_refl_pronoun(mention): + if mention['head']['ctag'] in SIEBIE_TAGS: + return 1 + return 0 + + +def is_first_in_sentence(mention): + if mention['first_in_sentence']: + return 1 + return 0 + + +def is_zero_or_pronoun(mention): + if mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS: + return 1 + return 0 + + +def contains_digit(mention, attr_name): + _digits = re.compile('\d') + if _digits.search(mention[attr_name]): + return 1 + return 0 + + +def contains_letter(mention): + if any(c.isalpha() for c in mention['text']): + return 1 + return 0 + + +def post_modified(mention): + if mention['head']['orth'] != mention['words'][-1]['orth']: + return 1 + return 0 + + def get_wv(model, lemma, random=True): global ALL_WORDS global UNKNONW_WORDS @@ -332,7 +462,8 @@ def get_context_vec(words, model): return vec -def get_pair_features(pair, mentions_dict): +def get_pair_features(pair, mentions_dict, lemma2synonyms, + lemma2hypernyms, title2links, title2redirect): ante = get_mention_by_attr(mentions_dict, 'span', pair[0]) ana = get_mention_by_attr(mentions_dict, 'span', pair[1]) @@ -375,6 +506,32 @@ def get_pair_features(pair, mentions_dict): features.append(same_sentence(ante, ana)) features.append(same_paragraph(ante, ana)) + # cechy uzupelniajace 2 + features.append(neighbouring_sentence(ante, ana)) + features.append(cousin_sentence(ante, ana)) + features.append(distant_sentence(ante, ana)) + features.append(flat_gender_agreement(ante, ana)) + features.append(left_match(ante, ana)) + features.append(right_match(ante, ana)) + features.append(abbrev2(ante, ana)) + + features.append(string_kernel(ante, ana)) + features.append(head_string_kernel(ante, ana)) + + features.append(wordnet_synonyms(ante, ana, lemma2synonyms)) + features.append(wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms)) + features.append(wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms)) + + features.append(wikipedia_link(ante, ana, title2links)) + features.append(wikipedia_mutual_link(ante, ana, title2links)) + features.append(wikipedia_redirect(ante, ana, title2redirect)) + + # combined + features.append(samesent_anapron_antefirstinpar(ante, ana)) + features.append(samesent_antefirstinpar_personnumbermatch(ante, ana)) + features.append(adjsent_anapron_adjmen_personnumbermatch(ante, ana)) + features.append(adjsent_anapron_adjmen(ante, ana)) + return features @@ -392,7 +549,7 @@ def get_distance_bucket(distance): elif distance >= 64: return 9 else: - print u'Coś poszło nie tak przy kubełkowaniu!!' 
+ print (u'Coś poszło nie tak przy kubełkowaniu!!') return 10 @@ -445,8 +602,8 @@ def is_acronym(ante, ana): if ana['text'].upper() == ana['text']: return check_one_way_acronym(ana['text'], ante['text']) if ante['text'].upper() == ante['text']: - return check_one_way_acronym(ante['text'], ana['text']); - return 0; + return check_one_way_acronym(ante['text'], ana['text']) + return 0 def check_one_way_acronym(acronym, expression): @@ -455,10 +612,10 @@ def check_one_way_acronym(acronym, expression): for expr2 in expr1.split(): expr2 = expr2.strip() if expr2: - initials += unicode(expr2[0]).upper() + initials += str(expr2[0]).upper() if acronym == initials: - return 1; - return 0; + return 1 + return 0 def same_sentence(ante, ana): @@ -467,12 +624,290 @@ def same_sentence(ante, ana): return 0 +def neighbouring_sentence(ante, ana): + if ana['sentence_id'] - ante['sentence_id'] == 1: + return 1 + return 0 + + +def cousin_sentence(ante, ana): + if ana['sentence_id'] - ante['sentence_id'] == 2: + return 1 + return 0 + + +def distant_sentence(ante, ana): + if ana['sentence_id'] - ante['sentence_id'] > 2: + return 1 + return 0 + + def same_paragraph(ante, ana): if ante['paragraph_id'] == ana['paragraph_id']: return 1 return 0 +def flat_gender_agreement(ante, ana): + agr_vec = [0] * 3 + if ante['head']['gender'] == 'unk' or ana['head']['gender'] == 'unk': + agr_vec[2] = 1 + elif (ante['head']['gender'] == ana['head']['gender'] or + (ante['head']['gender'] in MASCULINE_TAGS and ana['head']['gender'] in MASCULINE_TAGS)): + agr_vec[0] = 1 + else: + agr_vec[1] = 1 + return agr_vec + + +def string_kernel(ante, ana): + s1 = ante['text'] + s2 = ana['text'] + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) + + +def head_string_kernel(ante, ana): + s1 = ante['head_orth'] + s2 = ana['head_orth'] + return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) + + +def SK(s1, s2): + LAMBDA = 0.4 + + p = len(s1) + if len(s2) < len(s1): + p = len(s2) + + h, w = len(s1)+1, len(s2)+1 + DPS = [[0.0] * w for i in range(h)] + DP = [[0.0] * w for i in range(h)] + + kernel_mat = [0.0] * (len(s1) + 1) + + for i in range(len(s1)+1): + if i == 0: + continue + for j in range(len(s2)+1): + if j == 0: + continue + if s1[i-1] == s2[j-1]: + DPS[i][j] = LAMBDA * LAMBDA + kernel_mat[0] += DPS[i][j] + else: + DPS[i][j] = 0.0 + + for l in range(p): + if l == 0: + continue + + kernel_mat[l] = 0.0 + for j in range(len(s2)+1): + DP[l-1][j] = 0.0 + + for i in range(len(s1)+1): + DP[i][l-1] = 0.0 + + for i in range(len(s1)+1): + if i < l: + continue + for j in range(len(s2)+1): + if j < l: + continue + DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1] + + if s1[i-1] == s2[j-1]: + DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1] + kernel_mat[l] += DPS[i][j] + + K = 0.0 + for l in range(p): + K += kernel_mat[l] + return K + + +def left_match(ante, ana): + if (ante['text'].lower().startswith(ana['text'].lower()) or + ana['text'].lower().startswith(ante['text'].lower())): + return 1 + return 0 + + +def right_match(ante, ana): + if (ante['text'].lower().endswith(ana['text'].lower()) or + ana['text'].lower().endswith(ante['text'].lower())): + return 1 + return 0 + +# def string_match_no_hyphenation(ante, ana): +# ante_no_hyphen = remove_hyphen_signs(ante['text']) +# ana_no_hyphen = remove_hyphen_signs(ana['text']) +# if ante_no_hyphen == ana_no_hyphen: +# return 1 +# return 0 +# +# +# def string_match_no_hyphenation_lowercase(ante, ana): +# ante_no_hyphen = 
remove_hyphen_signs(ante['text']).lower() +# ana_no_hyphen = remove_hyphen_signs(ana['text']).lower() +# if ante_no_hyphen == ana_no_hyphen: +# return 1 +# return 0 + + +def remove_hyphen_signs(text): + for sign in HYPHEN_SIGNS: + text = text.replace(sign, '') + return text + + +def samesent_anapron_antefirstinpar(ante, ana): + if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante['first_in_paragraph']: + return 1 + return 0 + + +def samesent_antefirstinpar_personnumbermatch(ante, ana): + if (same_sentence(ante, ana) and ante['first_in_paragraph'] + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): + return 1 + return 0 + + +def adjsent_anapron_adjmen_personnumbermatch(ante, ana): + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1 + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): + return 1 + return 0 + + +def adjsent_anapron_adjmen(ante, ana): + if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) + and ana['position_in_mentions'] - ante['position_in_mentions'] == 1): + return 1 + return 0 + + +def abbrev2(ante, ana): + ante_abbrev = get_abbrev(ante) + ana_abbrev = get_abbrev(ana) + if ante['head_orth'] == ana_abbrev or ana['head_orth'] == ante_abbrev: + return 1 + return 0 + + +def get_abbrev(mention): + abbrev = u'' + for word in mention['words']: + if word['orth'][0].isupper(): + abbrev += word['orth'][0] + return abbrev + + +def wordnet_synonyms(ante, ana, lemma2synonyms): + ante_synonyms = set() + if ante['head']['base'] in lemma2synonyms: + ante_synonyms = lemma2synonyms[ante['head']['base']] + + ana_synonyms = set() + if ana['head']['base'] in lemma2synonyms: + ana_synonyms = lemma2synonyms[ana['head']['base']] + + if ana['head']['base'] in ante_synonyms or ante['head']['base'] in ana_synonyms: + return 1 + return 0 + + +def wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms): + ante_hypernyms = set() + if ante['head']['base'] in lemma2hypernyms: + ante_hypernyms = lemma2hypernyms[ante['head']['base']] + + ana_hypernyms = set() + if ana['head']['base'] in lemma2hypernyms: + ana_hypernyms = lemma2hypernyms[ana['head']['base']] + + if not ante_hypernyms or not ana_hypernyms: + return 0 + + if ana['head']['base'] in ante_hypernyms: + return 1 + return 0 + + +def wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms): + ana_hypernyms = set() + if ana['head']['base'] in lemma2hypernyms: + ana_hypernyms = lemma2hypernyms[ana['head']['base']] + + ante_hypernyms = set() + if ante['head']['base'] in lemma2hypernyms: + ante_hypernyms = lemma2hypernyms[ante['head']['base']] + + if not ante_hypernyms or not ana_hypernyms: + return 0 + + if ante['head']['base'] in ana_hypernyms: + return 1 + return 0 + + +def wikipedia_link(ante, ana, title2links): + ante_base = ante['lemmatized_text'].lower() + ana_base = ana['lemmatized_text'].lower() + if ante_base == ana_base: + return 1 + + ante_links = set() + if ante_base in title2links: + ante_links = title2links[ante_base] + + ana_links = set() + if ana_base in title2links: + ana_links = title2links[ana_base] + + if ana_base in ante_links or ante_base in ana_links: + return 1 + + return 0 + + +def wikipedia_mutual_link(ante, ana, title2links): + ante_base = ante['lemmatized_text'].lower() + ana_base = ana['lemmatized_text'].lower() + if ante_base == ana_base: + return 1 + + ante_links = set() + if ante_base in title2links: + ante_links = title2links[ante_base] + + ana_links = set() + if 
ana_base in title2links: + ana_links = title2links[ana_base] + + if ana_base in ante_links and ante_base in ana_links: + return 1 + + return 0 + + +def wikipedia_redirect(ante, ana, title2redirect): + ante_base = ante['lemmatized_text'].lower() + ana_base = ana['lemmatized_text'].lower() + if ante_base == ana_base: + return 1 + + if ante_base in title2redirect and title2redirect[ante_base] == ana_base: + return 1 + + if ana_base in title2redirect and title2redirect[ana_base] == ante_base: + return 1 + + return 0 + + def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'): markables_dicts = [] markables_tree = etree.parse(markables_path) @@ -492,7 +927,8 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www if head_orth not in POSSIBLE_HEADS: mention_words = span_to_words(span, words) - prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words) + (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) = get_context(mention_words, words) head = get_head(head_orth, mention_words) markables_dicts.append({'id': markable.attrib['id'], @@ -513,9 +949,11 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www 'end_in_words': mnt_end_position, 'rarest': get_rarest_word(mention_words, freq_list), 'paragraph_id': paragraph_id, - 'sentence_id': sentence_id}) + 'sentence_id': sentence_id, + 'first_in_sentence': first_in_sentence, + 'first_in_paragraph': first_in_paragraph}) else: - print 'Zduplikowana wzmianka: %s' % span + print ('Zduplikowana wzmianka: %s' % span) return markables_dicts @@ -529,10 +967,16 @@ def get_context(mention_words, words): mnt_start_position = -1 first_word = mention_words[0] last_word = mention_words[-1] + first_in_sentence = False + first_in_paragraph = False for idx, word in enumerate(words): if word['id'] == first_word['id']: prec_context = get_prec_context(idx, words) mnt_start_position = get_mention_start(first_word, words) + if idx == 0 or words[idx-1]['lastinsent']: + first_in_sentence = True + if idx == 0 or words[idx-1]['lastinpar']: + first_in_paragraph = True if word['id'] == last_word['id']: follow_context = get_follow_context(idx, words) sentence = get_sentence(idx, words) @@ -542,7 +986,8 @@ def get_context(mention_words, words): sentence_id += 1 if word['lastinpar']: paragraph_id += 1 - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id + return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, + paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) def get_prec_context(mention_start, words): @@ -743,9 +1188,9 @@ def to_text(words, form): def get_one_word_text(word_id, words, form): - this_word = (word for word in words if word['id'] == word_id).next() + this_word = next(word for word in words if word['id'] == word_id) if word_to_ignore(this_word): - print this_word + print (this_word) return this_word[form]
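
Note on the new `.map` resources: `load_one2many_map` and `load_one2one_map` read the WordNet and Wikipedia files added under `data/` as Java-serialized `HashMap`s via `javaobj`; in the unmarshalled `annotations` list, keys sit at the odd indices and each key is followed by its value (a collection object in the one-to-many case). A minimal, hedged sketch of loading and querying one of these maps outside `preparator.py` (the example lemmas are invented and only illustrate the lookup that `wordnet_synonyms` performs on the mention heads' base forms):

    import javaobj

    def load_one2many_map(map_path):
        # Mirrors preparator.load_one2many_map: walk the unmarshalled
        # 'annotations' list, pairing each key (odd index) with the set of
        # elements stored in the collection object that follows it.
        with open(map_path, 'rb') as map_file:
            pobj = javaobj.JavaObjectUnmarshaller(map_file).readObject()
        annotations = pobj.__dict__['annotations']
        result = {}
        for i in range(len(annotations)):
            if i % 2 == 1:
                result[annotations[i]] = set(annotations[i + 1].__dict__['annotations'])
        return result

    lemma2synonyms = load_one2many_map('data/wordnet/lemma2synonyms.map')
    # Hypothetical lemmas -- wordnet_synonyms() does the same membership test.
    print('auto' in lemma2synonyms.get('samochód', set()))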
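
Note on the string-kernel features: `SK` is a dynamic-programming subsequence kernel with decay `LAMBDA = 0.4`, and `string_kernel` / `head_string_kernel` report it normalised by the self-kernels, `SK(s1, s2) / sqrt(SK(s1, s1) * SK(s2, s2))`, so the feature stays in [0, 1] and does not grow with mention length. A hedged usage sketch (it assumes `SK` can be imported from `preparator` without side effects; the mention strings are invented):

    import math

    from preparator import SK  # subsequence kernel introduced in this change

    def normalized_sk(s1, s2):
        # Same normalisation as string_kernel()/head_string_kernel():
        # dividing by the geometric mean of the self-kernels bounds the
        # score to [0, 1] for non-empty strings.
        return SK(s1, s2) / math.sqrt(SK(s1, s1) * SK(s2, s2))

    print(normalized_sk(u'rada miasta', u'rada miasta'))   # ~1.0 for identical spans
    print(normalized_sk(u'rada miasta', u'rada gminy'))    # shared subsequences, below 1.0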