Commit 7d27f9e9ae10935e1a182e7860943dcbb416e473 (1 parent: 04c45e2d)

Added Bartek-3 features to preparator script.

Showing 7 changed files with 500 additions and 55 deletions.
data/wikipedia/link.map (new file, mode 0 → 100755, binary; no preview)
data/wikipedia/redirect.map (new file, mode 0 → 100755, binary; no preview)
data/wordnet/lemma2hypernyms.map (new file, mode 0 → 100755, binary; no preview)
data/wordnet/lemma2synonyms.map (new file, mode 0 → 100755, binary; no preview)
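Note: the four .map files above are java.util.HashMap objects serialized from Java; preparator.py (below) loads them with the javaobj package via load_one2many_map and load_one2one_map. A minimal standalone sketch of that loading step, assuming the same key/value layout in the deserialized object's 'annotations' list (keys at odd indices, the mapped value object at the following index):

    import javaobj

    # Deserialize a Java HashMap dump, mirroring load_one2many_map below:
    # keys sit at odd indices of 'annotations'; the value object follows at i+1.
    with open('data/wordnet/lemma2synonyms.map', 'rb') as map_file:
        pobj = javaobj.JavaObjectUnmarshaller(map_file).readObject()
    entries = pobj.__dict__['annotations']
    lemma2synonyms = {entries[i]: set(entries[i + 1].__dict__['annotations'])
                      for i in range(len(entries)) if i % 2 == 1}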
for_investigation.ipynb
@@ -114,7 +114,7 @@
 },
 "outputs": [],
 "source": [
-" predictions = model.predict(test_set)"
+"predictions = model.predict(test_set)"
 ]
 },
 {
@@ -141,7 +141,7 @@
 }
 ],
 "source": [
-" true_positives = 0.0\n",
+"true_positives = 0.0\n",
 " false_positives = 0.0\n",
 " true_negatives = 0.0\n",
 " false_negatives = 0.0\n",
@@ -173,7 +173,7 @@
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 2.0
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
@@ -184,5 +184,5 @@
 }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 0
+}
\ No newline at end of file
mention-pair-classifier.ipynb
@@ -78,7 +78,7 @@
 "number_of_features = 1126\n",
 "\n",
 "X = data[:,0:1126]\n",
-"Y = data[:,1126] #last column consists of labels\n"
+"Y = data[:,1126] #last column consists of labels"
 ]
 },
 {
@@ -270,7 +270,7 @@
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 2.0
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
@@ -281,5 +281,5 @@
 }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 0
+}
\ No newline at end of file
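Note: the first change above only trims a trailing newline in the cell that splits the prepared matrix into 1126 feature columns plus one label column. A minimal sketch of that split, assuming the tab-separated file written by preparator.py with DEBUG disabled (the loading line is illustrative; the notebook defines data elsewhere):

    import numpy

    # 1126 feature columns followed by a single 0/1 label column.
    data = numpy.loadtxt('data/test-20170720.csv', delimiter='\t')
    X = data[:, 0:1126]
    Y = data[:, 1126]  # last column consists of labels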
preparator.py
@@ -1,9 +1,13 @@
 # -*- coding: utf-8 -*-
 
 import codecs
+import math
 import numpy
 import os
 import random
+import re
+
+import javaobj
 
 from lxml import etree
 from itertools import combinations
@@ -12,25 +16,39 @@ from natsort import natsorted
 from gensim.models.word2vec import Word2Vec
 
 
-TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared'))
-TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared'))
-FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst'))
+MAIN_PATH = os.path.dirname(__file__)
+TEST_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'test-prepared'))
+TRAIN_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'train-prepared'))
+FREQ_300M_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'freq', 'base.lst'))
+
+LEMMA2SYNONYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2synonyms.map'))
+LEMMA2HYPERNYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2hypernyms.map'))
+
+TITLE2LINKS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'link.map'))
+TITLE2REDIRECT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'redirect.map'))
 
 ANNO_PATH = TEST_PATH
-OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
-                                        'test-20170627.csv'))
+OUT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data',
+                                        'test-20170720.csv'))
 EACH_TEXT_SEPARATELLY = False
 
 CONTEXT = 5
 W2V_SIZE = 50
-MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models',
+MODEL = os.path.abspath(os.path.join(MAIN_PATH, 'models',
                                      '%d' % W2V_SIZE,
                                      'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE))
 
+FIRST_SECOND_PERSON = ['pri', 'sec']
+INDICATIVE_PRONS_BASES = ["ten", "ta", "to", "ci", "te", "tamten", "tamta",
+                          "tamto", "tamci", "tamte", "ów", "owa", "owo", "owi", "owe"]
+SIEBIE_TAGS = ['siebie']
+MASCULINE_TAGS = ['m1', 'm2', 'm3']
+
 NOUN_TAGS = ['subst', 'ger', 'depr']
 PPRON_TAGS = ['ppron12', 'ppron3']
 ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
 POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-']
+HYPHEN_SIGNS = ['-', '#']
 
 NEG_PROPORTION = 1
 RANDOM_VECTORS = True
@@ -45,13 +63,18 @@ UNKNONW_WORDS = 0
 def main():
     model = Word2Vec.load(MODEL)
     freq_list = load_freq_list(FREQ_300M_PATH)
+    lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH)
+    lemma2hypernyms = load_one2many_map(LEMMA2HYPERNYMS_PATH)
+    title2links = load_one2many_map(TITLE2LINKS_PATH)
+    title2redirect = load_one2one_map(TITLE2REDIRECT_PATH)
     try:
-        create_data_vectors(model, freq_list)
+        create_data_vectors(model, freq_list, lemma2synonyms,
+                            lemma2hypernyms, title2links, title2redirect)
     finally:
-        print 'Unknown words: ', UNKNONW_WORDS
-        print 'All words: ', ALL_WORDS
-        print 'Positives: ', POS_COUNT
-        print 'Negatives: ', NEG_COUNT
+        print ('Unknown words: ', UNKNONW_WORDS)
+        print ('All words: ', ALL_WORDS)
+        print ('Positives: ', POS_COUNT)
+        print ('Negatives: ', NEG_COUNT)
 
 
 def load_freq_list(freq_path):
@@ -67,16 +90,43 @@ def load_freq_list(freq_path):
     return freq_list
 
 
-def create_data_vectors(model, freq_list):
+def load_one2many_map(map_path):
+    this_map = {}
+    marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
+    pobj = marshaller.readObject()
+    jmap_annotations = pobj.__dict__['annotations']
+    jmap_annotations_count = len(jmap_annotations)
+    for i in range(jmap_annotations_count):
+        if i%2 == 1:
+            mapped_elements = set(jmap_annotations[i+1].__dict__['annotations'])
+            this_map[jmap_annotations[i]] = mapped_elements
+    return this_map
+
+
+def load_one2one_map(map_path):
+    this_map = {}
+    marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
+    pobj = marshaller.readObject()
+    jmap_annotations = pobj.__dict__['annotations']
+    jmap_annotations_count = len(jmap_annotations)
+    for i in range(jmap_annotations_count):
+        if i%2 == 1:
+            element = jmap_annotations[i+1]
+            this_map[jmap_annotations[i]] = element
+    return this_map
+
+
+def create_data_vectors(model, freq_list, lemma2synonyms,
+                        lemma2hypernyms, title2links, title2redirect):
     features_file = None
     if not EACH_TEXT_SEPARATELLY:
-        features_file = codecs.open(OUT_PATH, 'wt', 'utf-8')
+        features_file = codecs.open(OUT_PATH, 'w', 'utf-8')
 
     anno_files = os.listdir(ANNO_PATH)
     anno_files = natsorted(anno_files)
     for filename in anno_files:
         if filename.endswith('.mmax'):
-            print '=======> ', filename
+            print ('=======> ', filename)
             textname = filename.replace('.mmax', '')
 
             mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
@@ -85,19 +135,18 @@ def create_data_vectors(model, freq_list):
             positives, negatives = diff_mentions(mentions)
 
             if DEBUG:
-                print 'Positives:'
-                print len(positives)
-
-                print 'Negatives:'
-                print len(negatives)
+                print ('Positives:', len(positives))
+                print ('Negatives:', len(negatives))
 
             words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
             mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list)
 
             if EACH_TEXT_SEPARATELLY:
                 text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname)
-                features_file = codecs.open(text_features_path, 'wt', 'utf-8')
-            write_features(features_file, positives, negatives, mentions_dict, model, textname)
+                features_file = codecs.open(text_features_path, 'w', 'utf-8')
+            write_features(features_file, positives, negatives, mentions_dict,
+                           model, textname, lemma2synonyms,
+                           lemma2hypernyms, title2links, title2redirect)
 
     if not EACH_TEXT_SEPARATELLY:
         features_file.close()
@@ -108,7 +157,7 @@ def diff_mentions(mentions):
     positives = get_positives(sets)
     positives, negatives = get_negatives_and_update_positives(clustered_mensions, positives)
     if len(negatives) != len(positives) and NEG_PROPORTION == 1:
-        print u'Niezgodna liczba przypadków pozytywnych i negatywnych!'
+        print (u'Niezgodna liczba przypadków pozytywnych i negatywnych!')
     return positives, negatives
 
 
@@ -126,18 +175,18 @@ def get_sets(mentions):
             sets[set_id].append(mention.attrib['span'])
             clustered_mensions.append(mention.attrib['span'])
         else:
-            print u'Coś poszło nie tak przy wyszukiwaniu klastrów!'
+            print (u'Coś poszło nie tak przy wyszukiwaniu klastrów!')
 
     sets_to_remove = []
     for set_id in sets:
         if len(sets[set_id]) < 2:
             sets_to_remove.append(set_id)
             if len(sets[set_id]) == 1:
-                print u'Removing clustered mention: ', sets[set_id][0]
+                print (u'Removing clustered mention: ', sets[set_id][0])
                 clustered_mensions.remove(sets[set_id][0])
 
     for set_id in sets_to_remove:
-        print u'Removing set: ', set_id
+        print (u'Removing set: ', set_id)
         sets.pop(set_id)
 
     return sets, clustered_mensions
@@ -160,21 +209,24 @@ def get_negatives_and_update_positives(clustered_mensions, positives):
         samples_count = len(negatives)
         if NEG_PROPORTION == 1:
             positives = random.sample(set(positives), samples_count)
-            print u'Więcej przypadków pozytywnych niż negatywnych!'
+            print (u'Więcej przypadków pozytywnych niż negatywnych!')
     negatives = random.sample(set(negatives), samples_count)
     return positives, negatives
 
 
-def write_features(features_file, positives, negatives, mentions_dict, model, textname):
+def write_features(features_file, positives, negatives, mentions_dict,
+                   model, textname, lemma2synonyms,
+                   lemma2hypernyms, title2links, title2redirect):
     global POS_COUNT
     POS_COUNT += len(positives)
     for pair in positives:
         pair_features = []
        if DEBUG:
             pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
-        pair_features.extend(get_features(pair, mentions_dict, model))
+        pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
+                                          lemma2hypernyms, title2links, title2redirect))
         pair_features.append(1)
-        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
+        features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
 
     global NEG_COUNT
     NEG_COUNT += len(negatives)
@@ -182,12 +234,14 @@ def write_features(features_file, positives, negatives, mentions_dict, model, te
         pair_features = []
         if DEBUG:
             pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
-        pair_features.extend(get_features(pair, mentions_dict, model))
+        pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
+                                          lemma2hypernyms, title2links, title2redirect))
         pair_features.append(0)
-        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
+        features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
 
 
-def get_features(pair, mentions_dict, model):
+def get_features(pair, mentions_dict, model, lemma2synonyms,
+                 lemma2hypernyms, title2links, title2redirect):
     features = []
     ante = pair[0]
     ana = pair[1]
@@ -195,7 +249,8 @@
     features.extend(ante_features)
     ana_features = get_mention_features(ana, mentions_dict, model)
     features.extend(ana_features)
-    pair_features = get_pair_features(pair, mentions_dict)
+    pair_features = get_pair_features(pair, mentions_dict, lemma2synonyms,
+                                      lemma2hypernyms, title2links, title2redirect)
     features.extend(pair_features)
     return features
 
@@ -280,6 +335,19 @@ def get_mention_features(mention_span, mentions_dict, model):
     # cechy uzupelniajace
     features.extend(mention_type(mention))
 
+    # cechy uzupelniajace 2
+    features.append(is_first_second_person(mention))
+    features.append(is_demonstrative(mention))
+    features.append(is_demonstrative_nominal(mention))
+    features.append(is_demonstrative_pronoun(mention))
+    features.append(is_refl_pronoun(mention))
+    features.append(is_first_in_sentence(mention))
+    features.append(is_zero_or_pronoun(mention))
+    features.append(contains_digit(mention, 'head_orth'))
+    features.append(contains_digit(mention, 'text'))
+    features.append(contains_letter(mention))
+    features.append(post_modified(mention))
+
     return features
 
 
@@ -296,6 +364,68 @@ def mention_type(mention):
     return type_vec
 
 
+def is_first_second_person(mention):
+    if mention['head']['person'] in FIRST_SECOND_PERSON:
+        return 1
+    return 0
+
+
+def is_demonstrative(mention):
+    if mention['words'][0]['base'].lower() in INDICATIVE_PRONS_BASES:
+        return 1
+    return 0
+
+
+def is_demonstrative_nominal(mention):
+    if is_demonstrative(mention) and mention['head']['ctag'] in NOUN_TAGS:
+        return 1
+    return 0
+
+
+def is_demonstrative_pronoun(mention):
+    if (is_demonstrative(mention) and
+            (mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS)):
+        return 1
+    return 0
+
+
+def is_refl_pronoun(mention):
+    if mention['head']['ctag'] in SIEBIE_TAGS:
+        return 1
+    return 0
+
+
+def is_first_in_sentence(mention):
+    if mention['first_in_sentence']:
+        return 1
+    return 0
+
+
+def is_zero_or_pronoun(mention):
+    if mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS:
+        return 1
+    return 0
+
+
+def contains_digit(mention, attr_name):
+    _digits = re.compile('\d')
+    if _digits.search(mention[attr_name]):
+        return 1
+    return 0
+
+
+def contains_letter(mention):
+    if any(c.isalpha() for c in mention['text']):
+        return 1
+    return 0
+
+
+def post_modified(mention):
+    if mention['head']['orth'] != mention['words'][-1]['orth']:
+        return 1
+    return 0
+
+
 def get_wv(model, lemma, random=True):
     global ALL_WORDS
     global UNKNONW_WORDS
@@ -332,7 +462,8 @@ def get_context_vec(words, model):
     return vec
 
 
-def get_pair_features(pair, mentions_dict):
+def get_pair_features(pair, mentions_dict, lemma2synonyms,
+                      lemma2hypernyms, title2links, title2redirect):
     ante = get_mention_by_attr(mentions_dict, 'span', pair[0])
     ana = get_mention_by_attr(mentions_dict, 'span', pair[1])
 
@@ -375,6 +506,32 @@ def get_pair_features(pair, mentions_dict):
     features.append(same_sentence(ante, ana))
     features.append(same_paragraph(ante, ana))
 
+    # cechy uzupelniajace 2
+    features.append(neighbouring_sentence(ante, ana))
+    features.append(cousin_sentence(ante, ana))
+    features.append(distant_sentence(ante, ana))
+    features.append(flat_gender_agreement(ante, ana))
+    features.append(left_match(ante, ana))
+    features.append(right_match(ante, ana))
+    features.append(abbrev2(ante, ana))
+
+    features.append(string_kernel(ante, ana))
+    features.append(head_string_kernel(ante, ana))
+
+    features.append(wordnet_synonyms(ante, ana, lemma2synonyms))
+    features.append(wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms))
+    features.append(wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms))
+
+    features.append(wikipedia_link(ante, ana, title2links))
+    features.append(wikipedia_mutual_link(ante, ana, title2links))
+    features.append(wikipedia_redirect(ante, ana, title2redirect))
+
+    # combined
+    features.append(samesent_anapron_antefirstinpar(ante, ana))
+    features.append(samesent_antefirstinpar_personnumbermatch(ante, ana))
+    features.append(adjsent_anapron_adjmen_personnumbermatch(ante, ana))
+    features.append(adjsent_anapron_adjmen(ante, ana))
+
     return features
 
 
@@ -392,7 +549,7 @@ def get_distance_bucket(distance):
     elif distance >= 64:
         return 9
     else:
-        print u'Coś poszło nie tak przy kubełkowaniu!!'
+        print (u'Coś poszło nie tak przy kubełkowaniu!!')
     return 10
 
 
@@ -445,8 +602,8 @@ def is_acronym(ante, ana):
     if ana['text'].upper() == ana['text']:
         return check_one_way_acronym(ana['text'], ante['text'])
     if ante['text'].upper() == ante['text']:
-        return check_one_way_acronym(ante['text'], ana['text']);
-    return 0;
+        return check_one_way_acronym(ante['text'], ana['text'])
+    return 0
 
 
 def check_one_way_acronym(acronym, expression):
@@ -455,10 +612,10 @@ def check_one_way_acronym(acronym, expression):
         for expr2 in expr1.split():
             expr2 = expr2.strip()
             if expr2:
-                initials += unicode(expr2[0]).upper()
+                initials += str(expr2[0]).upper()
     if acronym == initials:
-        return 1;
-    return 0;
+        return 1
+    return 0
 
 
 def same_sentence(ante, ana):
@@ -467,12 +624,290 @@ def same_sentence(ante, ana):
     return 0
 
 
+def neighbouring_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] == 1:
+        return 1
+    return 0
+
+
+def cousin_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] == 2:
+        return 1
+    return 0
+
+
+def distant_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] > 2:
+        return 1
+    return 0
+
+
 def same_paragraph(ante, ana):
     if ante['paragraph_id'] == ana['paragraph_id']:
         return 1
     return 0
 
 
+def flat_gender_agreement(ante, ana):
+    agr_vec = [0] * 3
+    if ante['head']['gender'] == 'unk' or ana['head']['gender'] == 'unk':
+        agr_vec[2] = 1
+    elif (ante['head']['gender'] == ana['head']['gender'] or
+            (ante['head']['gender'] in MASCULINE_TAGS and ana['head']['gender'] in MASCULINE_TAGS)):
+        agr_vec[0] = 1
+    else:
+        agr_vec[1] = 1
+    return agr_vec
+
+
+def string_kernel(ante, ana):
+    s1 = ante['text']
+    s2 = ana['text']
+    return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
+
+
+def head_string_kernel(ante, ana):
+    s1 = ante['head_orth']
+    s2 = ana['head_orth']
+    return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
+
+
+def SK(s1, s2):
+    LAMBDA = 0.4
+
+    p = len(s1)
+    if len(s2) < len(s1):
+        p = len(s2)
+
+    h, w = len(s1)+1, len(s2)+1
+    DPS = [[0.0] * w for i in range(h)]
+    DP = [[0.0] * w for i in range(h)]
+
+    kernel_mat = [0.0] * (len(s1) + 1)
+
+    for i in range(len(s1)+1):
+        if i == 0:
+            continue
+        for j in range(len(s2)+1):
+            if j == 0:
+                continue
+            if s1[i-1] == s2[j-1]:
+                DPS[i][j] = LAMBDA * LAMBDA
+                kernel_mat[0] += DPS[i][j]
+            else:
+                DPS[i][j] = 0.0
+
+    for l in range(p):
+        if l == 0:
+            continue
+
+        kernel_mat[l] = 0.0
+        for j in range(len(s2)+1):
+            DP[l-1][j] = 0.0
+
+        for i in range(len(s1)+1):
+            DP[i][l-1] = 0.0
+
+        for i in range(len(s1)+1):
+            if i < l:
+                continue
+            for j in range(len(s2)+1):
+                if j < l:
+                    continue
+                DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1]
+
+                if s1[i-1] == s2[j-1]:
+                    DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1]
+                    kernel_mat[l] += DPS[i][j]
+
+    K = 0.0
+    for l in range(p):
+        K += kernel_mat[l]
+    return K
+
+
+def left_match(ante, ana):
+    if (ante['text'].lower().startswith(ana['text'].lower()) or
+            ana['text'].lower().startswith(ante['text'].lower())):
+        return 1
+    return 0
+
+
+def right_match(ante, ana):
+    if (ante['text'].lower().endswith(ana['text'].lower()) or
+            ana['text'].lower().endswith(ante['text'].lower())):
+        return 1
+    return 0
+
+
+# def string_match_no_hyphenation(ante, ana):
+#     ante_no_hyphen = remove_hyphen_signs(ante['text'])
+#     ana_no_hyphen = remove_hyphen_signs(ana['text'])
+#     if ante_no_hyphen == ana_no_hyphen:
+#         return 1
+#     return 0
+#
+#
+# def string_match_no_hyphenation_lowercase(ante, ana):
+#     ante_no_hyphen = remove_hyphen_signs(ante['text']).lower()
+#     ana_no_hyphen = remove_hyphen_signs(ana['text']).lower()
+#     if ante_no_hyphen == ana_no_hyphen:
+#         return 1
+#     return 0
+
+
+def remove_hyphen_signs(text):
+    for sign in HYPHEN_SIGNS:
+        text = text.replace(sign, '')
+    return text
+
+
+def samesent_anapron_antefirstinpar(ante, ana):
+    if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante['first_in_paragraph']:
+        return 1
+    return 0
+
+
+def samesent_antefirstinpar_personnumbermatch(ante, ana):
+    if (same_sentence(ante, ana) and ante['first_in_paragraph']
+            and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
+        return 1
+    return 0
+
+
+def adjsent_anapron_adjmen_personnumbermatch(ante, ana):
+    if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
+            and ana['position_in_mentions'] - ante['position_in_mentions'] == 1
+            and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
+        return 1
+    return 0
+
+
+def adjsent_anapron_adjmen(ante, ana):
+    if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
+            and ana['position_in_mentions'] - ante['position_in_mentions'] == 1):
+        return 1
+    return 0
+
+
+def abbrev2(ante, ana):
+    ante_abbrev = get_abbrev(ante)
+    ana_abbrev = get_abbrev(ana)
+    if ante['head_orth'] == ana_abbrev or ana['head_orth'] == ante_abbrev:
+        return 1
+    return 0
+
+
+def get_abbrev(mention):
+    abbrev = u''
+    for word in mention['words']:
+        if word['orth'][0].isupper():
+            abbrev += word['orth'][0]
+    return abbrev
+
+
+def wordnet_synonyms(ante, ana, lemma2synonyms):
+    ante_synonyms = set()
+    if ante['head']['base'] in lemma2synonyms:
+        ante_synonyms = lemma2synonyms[ante['head']['base']]
+
+    ana_synonyms = set()
+    if ana['head']['base'] in lemma2synonyms:
+        ana_synonyms = lemma2synonyms[ana['head']['base']]
+
+    if ana['head']['base'] in ante_synonyms or ante['head']['base'] in ana_synonyms:
+        return 1
+    return 0
+
+
+def wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms):
+    ante_hypernyms = set()
+    if ante['head']['base'] in lemma2hypernyms:
+        ante_hypernyms = lemma2hypernyms[ante['head']['base']]
+
+    ana_hypernyms = set()
+    if ana['head']['base'] in lemma2hypernyms:
+        ana_hypernyms = lemma2hypernyms[ana['head']['base']]
+
+    if not ante_hypernyms or not ana_hypernyms:
+        return 0
+
+    if ana['head']['base'] in ante_hypernyms:
+        return 1
+    return 0
+
+
+def wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms):
+    ana_hypernyms = set()
+    if ana['head']['base'] in lemma2hypernyms:
+        ana_hypernyms = lemma2hypernyms[ana['head']['base']]
+
+    ante_hypernyms = set()
+    if ante['head']['base'] in lemma2hypernyms:
+        ante_hypernyms = lemma2hypernyms[ante['head']['base']]
+
+    if not ante_hypernyms or not ana_hypernyms:
+        return 0
+
+    if ante['head']['base'] in ana_hypernyms:
+        return 1
+    return 0
+
+
+def wikipedia_link(ante, ana, title2links):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    ante_links = set()
+    if ante_base in title2links:
+        ante_links = title2links[ante_base]
+
+    ana_links = set()
+    if ana_base in title2links:
+        ana_links = title2links[ana_base]
+
+    if ana_base in ante_links or ante_base in ana_links:
+        return 1
+
+    return 0
+
+
+def wikipedia_mutual_link(ante, ana, title2links):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    ante_links = set()
+    if ante_base in title2links:
+        ante_links = title2links[ante_base]
+
+    ana_links = set()
+    if ana_base in title2links:
+        ana_links = title2links[ana_base]
+
+    if ana_base in ante_links and ante_base in ana_links:
+        return 1
+
+    return 0
+
+
+def wikipedia_redirect(ante, ana, title2redirect):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    if ante_base in title2redirect and title2redirect[ante_base] == ana_base:
+        return 1
+
+    if ana_base in title2redirect and title2redirect[ana_base] == ante_base:
+        return 1
+
+    return 0
+
+
 def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'):
     markables_dicts = []
     markables_tree = etree.parse(markables_path)
@@ -492,7 +927,8 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www
         if head_orth not in POSSIBLE_HEADS:
             mention_words = span_to_words(span, words)
 
-            prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words)
+            (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
+             paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) = get_context(mention_words, words)
 
             head = get_head(head_orth, mention_words)
             markables_dicts.append({'id': markable.attrib['id'],
@@ -513,9 +949,11 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www
                                     'end_in_words': mnt_end_position,
                                     'rarest': get_rarest_word(mention_words, freq_list),
                                     'paragraph_id': paragraph_id,
-                                    'sentence_id': sentence_id})
+                                    'sentence_id': sentence_id,
+                                    'first_in_sentence': first_in_sentence,
+                                    'first_in_paragraph': first_in_paragraph})
         else:
-            print 'Zduplikowana wzmianka: %s' % span
+            print ('Zduplikowana wzmianka: %s' % span)
 
     return markables_dicts
 
@@ -529,10 +967,16 @@ def get_context(mention_words, words):
     mnt_start_position = -1
     first_word = mention_words[0]
     last_word = mention_words[-1]
+    first_in_sentence = False
+    first_in_paragraph = False
     for idx, word in enumerate(words):
         if word['id'] == first_word['id']:
             prec_context = get_prec_context(idx, words)
             mnt_start_position = get_mention_start(first_word, words)
+            if idx == 0 or words[idx-1]['lastinsent']:
+                first_in_sentence = True
+            if idx == 0 or words[idx-1]['lastinpar']:
+                first_in_paragraph = True
         if word['id'] == last_word['id']:
             follow_context = get_follow_context(idx, words)
             sentence = get_sentence(idx, words)
@@ -542,7 +986,8 @@ def get_context(mention_words, words):
             sentence_id += 1
         if word['lastinpar']:
             paragraph_id += 1
-    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
+    return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
+            paragraph_id, sentence_id, first_in_sentence, first_in_paragraph)
 
 
 def get_prec_context(mention_start, words):
@@ -743,9 +1188,9 @@ def to_text(words, form):
 
 
 def get_one_word_text(word_id, words, form):
-    this_word = (word for word in words if word['id'] == word_id).next()
+    this_word = next(word for word in words if word['id'] == word_id)
     if word_to_ignore(this_word):
-        print this_word
+        print (this_word)
     return this_word[form]
 
 
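Note on the new string-kernel features: SK computes a subsequence string kernel with decay LAMBDA = 0.4, and string_kernel / head_string_kernel normalize it by the geometric mean of the two self-kernels, so the feature falls in [0, 1] and is exactly 1.0 for identical strings (an empty string would divide by zero). An illustrative check, assuming SK and string_kernel can be imported from preparator:

    from preparator import SK, string_kernel

    ante = {'text': u'Unia Europejska'}
    ana = {'text': u'Unii Europejskiej'}
    print(string_kernel(ante, ante))  # 1.0: identical strings
    print(string_kernel(ante, ana))   # normalized similarity in (0, 1]
    print(SK(u'abc', u'abd'))         # raw, unnormalized kernel value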