Commit db88d6e4b4617b7bf5e3ef3596b91c45b0d8521a
1 parent 5be49adf
Add TEI format support.
Showing 12 changed files with 569 additions and 75 deletions
conf.py
corneferencer/entities.py
... | ... | @@ -13,6 +13,12 @@ class Text: |
13 | 13 | return mnt.set |
14 | 14 | return None |
15 | 15 | |
16 | + def get_mention(self, mnt_id): | |
17 | + for mnt in self.mentions: | |
18 | + if mnt.id == mnt_id: | |
19 | + return mnt | |
20 | + return None | |
21 | + | |
16 | 22 | def get_sets(self): |
17 | 23 | sets = {} |
18 | 24 | for mnt in self.mentions: |
... | ... | @@ -22,7 +28,6 @@ class Text: |
22 | 28 | sets[mnt.set] = [mnt] |
23 | 29 | return sets |
24 | 30 | |
25 | - | |
26 | 31 | def merge_sets(self, set1, set2): |
27 | 32 | for mnt in self.mentions: |
28 | 33 | if mnt.set == set1: |
... | ... | @@ -38,7 +43,6 @@ class Mention: |
38 | 43 | first_in_sentence, first_in_paragraph, set_id=''): |
39 | 44 | self.id = mnt_id |
40 | 45 | self.set = set_id |
41 | - self.old_set = '' | |
42 | 46 | self.text = text |
43 | 47 | self.lemmatized_text = lemmatized_text |
44 | 48 | self.words = words |
... | ... |
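Note: the new Text.get_mention gives an id-based lookup alongside the set-based get_mention_set above. A minimal usage sketch (the text object and mention id are illustrative, assuming a Text populated by one of the readers below):

    # Hypothetical id; get_mention returns None when the id is unknown.
    mnt = text.get_mention('mention_5')
    if mnt is not None:
        print(mnt.text, '->', text.get_mention_set(mnt.id))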
corneferencer/inout/constants.py
corneferencer/inout/mmax.py
... | ... | @@ -3,7 +3,7 @@ import shutil |
3 | 3 | |
4 | 4 | from lxml import etree |
5 | 5 | |
6 | -from conf import CONTEXT, FREQ_LIST | |
6 | +from conf import CLEAR_INPUT, CONTEXT, FREQ_LIST | |
7 | 7 | from corneferencer.entities import Mention, Text |
8 | 8 | |
9 | 9 | |
... | ... | @@ -43,7 +43,7 @@ def read_mentions(mentions_path, words_path): |
43 | 43 | |
44 | 44 | head = get_head(head_orth, mention_words) |
45 | 45 | mention_group = '' |
46 | - if markable.attrib['mention_group'] != 'empty': | |
46 | + if markable.attrib['mention_group'] != 'empty' and not CLEAR_INPUT: | |
47 | 47 | mention_group = markable.attrib['mention_group'] |
48 | 48 | mention = Mention(mnt_id=markable.attrib['id'], |
49 | 49 | text=span_to_text(span, words, 'orth'), |
... | ... | @@ -77,15 +77,15 @@ def get_words(filepath): |
77 | 77 | for word in tree.xpath("//word"): |
78 | 78 | hasnps = False |
79 | 79 | if (('hasnps' in word.attrib and word.attrib['hasnps'] == 'true') or |
80 | - ('hasNps' in word.attrib and word.attrib['hasNps'] == 'true')): | |
80 | + ('hasNps' in word.attrib and word.attrib['hasNps'] == 'true')): | |
81 | 81 | hasnps = True |
82 | 82 | lastinsent = False |
83 | 83 | if (('lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true') or |
84 | - ('lastInSent' in word.attrib and word.attrib['lastInSent'] == 'true')): | |
84 | + ('lastInSent' in word.attrib and word.attrib['lastInSent'] == 'true')): | |
85 | 85 | lastinsent = True |
86 | 86 | lastinpar = False |
87 | 87 | if (('lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true') or |
88 | - ('lastInPar' in word.attrib and word.attrib['lastInPar'] == 'true')): | |
88 | + ('lastInPar' in word.attrib and word.attrib['lastInPar'] == 'true')): | |
89 | 89 | lastinpar = True |
90 | 90 | words.append({'id': word.attrib['id'], |
91 | 91 | 'orth': word.text, |
... | ... | @@ -388,10 +388,13 @@ def write_mentions(inpath, outpath, text): |
388 | 388 | tree = etree.parse(inpath) |
389 | 389 | mentions = tree.xpath("//ns:markable", namespaces={'ns': 'www.eml.org/NameSpaces/mention'}) |
390 | 390 | |
391 | + sets = text.get_sets() | |
392 | + | |
391 | 393 | for mnt in mentions: |
392 | 394 | mnt_set = text.get_mention_set(mnt.attrib['id']) |
393 | 395 | if mnt_set: |
394 | 396 | mnt.attrib['mention_group'] = mnt_set |
397 | + mnt.attrib['dominant'] = get_dominant(sets[mnt_set]) | |
395 | 398 | else: |
396 | 399 | mnt.attrib['mention_group'] = 'empty' |
397 | 400 | |
... | ... | @@ -399,3 +402,11 @@ def write_mentions(inpath, outpath, text): |
399 | 402 | output_file.write(etree.tostring(tree, pretty_print=True, |
400 | 403 | xml_declaration=True, encoding='UTF-8', |
401 | 404 | doctype=u'<!DOCTYPE markables SYSTEM "markables.dtd">')) |
405 | + | |
406 | + | |
407 | +def get_dominant(mentions): | |
408 | + longest_mention = mentions[0] | |
409 | + for mnt in mentions: | |
410 | + if len(mnt.words) > len(longest_mention.words): | |
411 | + longest_mention = mnt | |
412 | + return longest_mention.text | |
... | ... |
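Note: the new get_dominant helper picks the longest mention in a cluster (by word count) as its representative phrase, which write_mentions now stores in the markable's dominant attribute. A toy check of the selection rule, with SimpleNamespace objects standing in for Mention:

    from types import SimpleNamespace

    m1 = SimpleNamespace(words=['Jan'], text='Jan')
    m2 = SimpleNamespace(words=['Jan', 'Kowalski'], text='Jan Kowalski')
    assert get_dominant([m1, m2]) == 'Jan Kowalski'
    # On ties the earliest mention wins, since only a strictly longer one replaces it.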
corneferencer/inout/tei.py
0 → 100644
1 | +import gzip | |
2 | +import os | |
3 | +import shutil | |
4 | + | |
5 | +from lxml import etree | |
6 | + | |
7 | +from conf import CLEAR_INPUT, CONTEXT, FREQ_LIST | |
8 | +from corneferencer.entities import Mention, Text | |
9 | +from corneferencer.utils import eprint | |
10 | + | |
11 | + | |
12 | +NKJP_NS = 'http://www.nkjp.pl/ns/1.0' | |
13 | +TEI_NS = 'http://www.tei-c.org/ns/1.0' | |
14 | +XI_NS = 'http://www.w3.org/2001/XInclude' | |
15 | +XML_NS = 'http://www.w3.org/XML/1998/namespace' | |
16 | +NSMAP = {None: TEI_NS, | |
17 | + 'nkjp': NKJP_NS, | |
18 | + 'xi': XI_NS} | |
19 | + | |
20 | + | |
21 | +def read(inpath): | |
22 | + textname = os.path.basename(inpath) | |
23 | + | |
24 | + text = Text(textname) | |
25 | + | |
26 | + # essential layers | |
27 | + ann_segmentation = os.path.join(inpath, 'ann_segmentation.xml.gz') | |
28 | + ann_morphosyntax = os.path.join(inpath, 'ann_morphosyntax.xml.gz') | |
29 | + ann_mentions = os.path.join(inpath, 'ann_mentions.xml.gz') | |
30 | + | |
31 | + # additional layers | |
32 | + ann_coreference = os.path.join(inpath, 'ann_coreference.xml.gz') | |
33 | + | |
34 | +    if not os.path.exists(ann_segmentation): | |
35 | +        eprint("Error: missing segmentation layer for text %s!" % textname) | |
36 | +        return None | |
39 | + | |
40 | + if os.path.exists(ann_morphosyntax): | |
41 | + (segments, segments_ids) = read_morphosyntax(ann_morphosyntax) | |
42 | + else: | |
43 | + eprint("Error: missing morphosyntax layer for text %s!" % textname) | |
44 | + return None | |
45 | + | |
46 | + if os.path.exists(ann_mentions): | |
47 | + text.mentions = read_mentions(ann_mentions, segments, segments_ids) | |
48 | + else: | |
49 | + eprint("Error: missing mentions layer for text %s!" % textname) | |
50 | + return None | |
51 | + | |
52 | + if os.path.exists(ann_coreference) and not CLEAR_INPUT: | |
53 | + add_coreference_layer(ann_coreference, text) | |
54 | + | |
55 | + return text | |
56 | + | |
57 | + | |
58 | +# morphosyntax | |
59 | +def read_morphosyntax(ann_archive): | |
60 | + segments_dict = {} | |
61 | + segments_ids = [] | |
62 | + ann_file = gzip.open(ann_archive, 'rb') | |
63 | + parser = etree.XMLParser(encoding="utf-8") | |
64 | + tree = etree.parse(ann_file, parser) | |
65 | + body = tree.xpath('//xmlns:body', namespaces={'xmlns': TEI_NS})[0] | |
66 | + | |
67 | + paragraphs = body.xpath(".//xmlns:p", namespaces={'xmlns': TEI_NS}) | |
68 | + for par in paragraphs: | |
69 | + sentences = par.xpath(".//xmlns:s", namespaces={'xmlns': TEI_NS}) | |
70 | + for sent_id, sent in enumerate(sentences): | |
71 | + segments = sent.xpath(".//xmlns:seg", namespaces={'xmlns': TEI_NS}) | |
72 | + for seg_id, seg in enumerate(segments): | |
73 | + lastinsent = False | |
74 | + lastinpar = False | |
75 | + if seg_id == len(segments) - 1: | |
76 | + lastinsent = True | |
77 | + if sent_id == len(sentences) - 1: | |
78 | + lastinpar = True | |
79 | + segment = read_segment(seg, lastinsent, lastinpar) | |
80 | + segments_dict[segment['id']] = segment | |
81 | + segments_ids.append(segment['id']) | |
82 | + | |
83 | + return segments_dict, segments_ids | |
84 | + | |
85 | + | |
86 | +def read_segment(seg, lastinsent, lastinpar): | |
87 | + hasnps = False | |
88 | + base = '' | |
89 | + ctag = '' | |
90 | + msd = '' | |
91 | + orth = '' | |
92 | + idx = seg.attrib['{%s}id' % XML_NS] | |
93 | + for f in seg.xpath(".//xmlns:f", namespaces={'xmlns': TEI_NS}): | |
94 | + if f.attrib['name'] == 'orth': | |
95 | + orth = get_f_string(f) | |
96 | + elif f.attrib['name'] == 'nps': | |
97 | + hasnps = get_f_bin_value(f) | |
98 | + elif f.attrib['name'] == 'interpretation': | |
99 | + interpretation = get_f_string(f) | |
100 | + (base, ctag, msd) = parse_interpretation(interpretation) | |
101 | + return {'id': idx, | |
102 | + 'orth': orth, | |
103 | + 'base': base, | |
104 | + 'hasnps': hasnps, | |
105 | + 'lastinsent': lastinsent, | |
106 | + 'lastinpar': lastinpar, | |
107 | + 'ctag': ctag, | |
108 | + 'msd': msd, | |
109 | + 'number': get_number(msd), | |
110 | + 'person': get_person(msd), | |
111 | + 'gender': get_gender(msd)} | |
112 | + | |
113 | + | |
114 | +def get_f_string(f): | |
115 | + return f.getchildren()[0].text | |
116 | + | |
117 | + | |
118 | +def get_f_bin_value(f): | |
119 | + value = False | |
120 | + if f.getchildren()[0].attrib['value'] == 'true': | |
121 | + value = True | |
122 | + return value | |
123 | + | |
124 | + | |
125 | +def parse_interpretation(interpretation): | |
126 | + split = interpretation.split(':') | |
127 | + if interpretation.startswith(':'): | |
128 | + base = ':' | |
129 | + ctag = 'interp' | |
130 | + msd = '' | |
131 | + elif len(split) > 2: | |
132 | + base = split[0] | |
133 | + ctag = split[1] | |
134 | + msd = ':'.join(split[2:]) | |
135 | + else: | |
136 | + base = split[0] | |
137 | +        ctag = split[1] if len(split) > 1 else 'unk' | |
138 | + msd = '' | |
139 | + return base, ctag, msd | |
140 | + | |
141 | + | |
142 | +def get_gender(msd): | |
143 | + tags = msd.split(':') | |
144 | + if 'm1' in tags: | |
145 | + return 'm1' | |
146 | + elif 'm2' in tags: | |
147 | + return 'm2' | |
148 | + elif 'm3' in tags: | |
149 | + return 'm3' | |
150 | + elif 'f' in tags: | |
151 | + return 'f' | |
152 | + elif 'n' in tags: | |
153 | + return 'n' | |
154 | + else: | |
155 | + return 'unk' | |
156 | + | |
157 | + | |
158 | +def get_person(msd): | |
159 | + tags = msd.split(':') | |
160 | + if 'pri' in tags: | |
161 | + return 'pri' | |
162 | + elif 'sec' in tags: | |
163 | + return 'sec' | |
164 | + elif 'ter' in tags: | |
165 | + return 'ter' | |
166 | + else: | |
167 | + return 'unk' | |
168 | + | |
169 | + | |
170 | +def get_number(msd): | |
171 | + tags = msd.split(':') | |
172 | + if 'sg' in tags: | |
173 | + return 'sg' | |
174 | + elif 'pl' in tags: | |
175 | + return 'pl' | |
176 | + else: | |
177 | + return 'unk' | |
178 | + | |
179 | + | |
180 | +# mentions | |
181 | +def read_mentions(ann_archive, segments, segments_ids): | |
182 | + mentions = [] | |
183 | + | |
184 | + ann_file = gzip.open(ann_archive, 'rb') | |
185 | + parser = etree.XMLParser(encoding="utf-8") | |
186 | + tree = etree.parse(ann_file, parser) | |
187 | + body = tree.xpath('//xmlns:body', namespaces={'xmlns': TEI_NS})[0] | |
188 | + | |
189 | + paragraphs = body.xpath(".//xmlns:p", namespaces={'xmlns': TEI_NS}) | |
190 | + mnt_id = 0 | |
191 | + for par_id, par in enumerate(paragraphs): | |
192 | + sentences = par.xpath(".//xmlns:s", namespaces={'xmlns': TEI_NS}) | |
193 | + for sent_id, sent in enumerate(sentences): | |
194 | + mention_nodes = sent.xpath(".//xmlns:seg", namespaces={'xmlns': TEI_NS}) | |
195 | + for mnt in mention_nodes: | |
196 | + mnt_id += 1 | |
197 | + mention = get_mention(mnt, mnt_id, segments, segments_ids, par_id, sent_id) | |
198 | + mentions.append(mention) | |
199 | + | |
200 | + return mentions | |
201 | + | |
202 | + | |
203 | +def get_mention(mention, mnt_id, segments, segments_ids, paragraph_id, sentence_id): | |
204 | + idx = mention.attrib['{%s}id' % XML_NS] | |
205 | + | |
206 | + mnt_segments = [] | |
207 | + for ptr in mention.xpath(".//xmlns:ptr", namespaces={'xmlns': TEI_NS}): | |
208 | + seg_id = ptr.attrib['target'].split('#')[-1] | |
209 | + if not word_to_ignore(segments[seg_id]): | |
210 | + mnt_segments.append(segments[seg_id]) | |
211 | + | |
212 | + semh = None | |
213 | + for f in mention.xpath(".//xmlns:f", namespaces={'xmlns': TEI_NS}): | |
214 | + if f.attrib['name'] == 'semh': | |
215 | + semh_id = get_fval(f).split('#')[-1] | |
216 | + semh = segments[semh_id] | |
217 | + | |
218 | + (sent_segments, prec_context, follow_context, | |
219 | + first_in_sentence, first_in_paragraph) = get_context(mnt_segments, segments, segments_ids) | |
220 | + | |
221 | + mention = Mention(mnt_id=idx, | |
222 | + text=to_text(mnt_segments, 'orth'), | |
223 | + lemmatized_text=to_text(mnt_segments, 'base'), | |
224 | + words=mnt_segments, | |
225 | + span=None, | |
226 | + head_orth=semh['orth'], | |
227 | + head=semh, | |
228 | + node=mention, | |
229 | + prec_context=prec_context, | |
230 | + follow_context=follow_context, | |
231 | + sentence=sent_segments, | |
232 | + sentence_id=sentence_id, | |
233 | + paragraph_id=paragraph_id, | |
234 | + position_in_mentions=mnt_id, | |
235 | + start_in_words=segments_ids.index(mnt_segments[0]['id']), | |
236 | + end_in_words=segments_ids.index(mnt_segments[-1]['id']), | |
237 | + rarest=get_rarest_word(mnt_segments), | |
238 | + first_in_sentence=first_in_sentence, | |
239 | + first_in_paragraph=first_in_paragraph, | |
240 | + set_id=None, | |
241 | + dominant=None) | |
242 | + | |
243 | + return mention | |
244 | + | |
245 | + | |
246 | +def get_context(mention_words, segments, segments_ids): | |
247 | + prec_context = [] | |
248 | + follow_context = [] | |
249 | + sentence = [] | |
250 | + first_word = mention_words[0] | |
251 | + last_word = mention_words[-1] | |
252 | + first_in_sentence = False | |
253 | + first_in_paragraph = False | |
254 | + for idx, morph_id in enumerate(segments_ids): | |
255 | + word = segments[morph_id] | |
256 | + if word['id'] == first_word['id']: | |
257 | + prec_context = get_prec_context(idx, segments, segments_ids) | |
258 | + if idx == 0 or segments[segments_ids[idx-1]]['lastinsent']: | |
259 | + first_in_sentence = True | |
260 | + if idx == 0 or segments[segments_ids[idx-1]]['lastinpar']: | |
261 | + first_in_paragraph = True | |
262 | + if word['id'] == last_word['id']: | |
263 | + follow_context = get_follow_context(idx, segments, segments_ids) | |
264 | + sentence = get_sentence(idx, segments, segments_ids) | |
265 | + break | |
266 | + return (sentence, prec_context, follow_context, first_in_sentence, first_in_paragraph) | |
267 | + | |
268 | + | |
269 | +def get_prec_context(mention_start, segments, segments_ids): | |
270 | + context = [] | |
271 | + context_start = mention_start - 1 | |
272 | + while context_start >= 0: | |
273 | + if not word_to_ignore(segments[segments_ids[context_start]]): | |
274 | + context.append(segments[segments_ids[context_start]]) | |
275 | + if len(context) == CONTEXT: | |
276 | + break | |
277 | + context_start -= 1 | |
278 | + context.reverse() | |
279 | + return context | |
280 | + | |
281 | + | |
282 | +def get_follow_context(mention_end, segments, segments_ids): | |
283 | + context = [] | |
284 | + context_end = mention_end + 1 | |
285 | + while context_end < len(segments): | |
286 | + if not word_to_ignore(segments[segments_ids[context_end]]): | |
287 | + context.append(segments[segments_ids[context_end]]) | |
288 | + if len(context) == CONTEXT: | |
289 | + break | |
290 | + context_end += 1 | |
291 | + return context | |
292 | + | |
293 | + | |
294 | +def get_sentence(word_idx, segments, segments_ids): | |
295 | + sentence_start = get_sentence_start(segments, segments_ids, word_idx) | |
296 | + sentence_end = get_sentence_end(segments, segments_ids, word_idx) | |
297 | + sentence = [segments[morph_id] for morph_id in segments_ids[sentence_start:sentence_end + 1] | |
298 | + if not word_to_ignore(segments[morph_id])] | |
299 | + return sentence | |
300 | + | |
301 | + | |
302 | +def get_sentence_start(segments, segments_ids, word_idx): | |
303 | + search_start = word_idx | |
304 | + while word_idx >= 0: | |
305 | + if segments[segments_ids[word_idx]]['lastinsent'] and search_start != word_idx: | |
306 | + return word_idx + 1 | |
307 | + word_idx -= 1 | |
308 | + return 0 | |
309 | + | |
310 | + | |
311 | +def get_sentence_end(segments, segments_ids, word_idx): | |
312 | + while word_idx < len(segments): | |
313 | + if segments[segments_ids[word_idx]]['lastinsent']: | |
314 | + return word_idx | |
315 | + word_idx += 1 | |
316 | + return len(segments) - 1 | |
317 | + | |
318 | + | |
319 | +def word_to_ignore(word): | |
320 | + if word['ctag'] == 'interp': | |
321 | + return True | |
322 | + return False | |
323 | + | |
324 | + | |
325 | +def to_text(words, form): | |
326 | + text = '' | |
327 | + for idx, word in enumerate(words): | |
328 | + if word['hasnps'] or idx == 0: | |
329 | + text += word[form] | |
330 | + else: | |
331 | + text += u' %s' % word[form] | |
332 | + return text | |
333 | + | |
334 | + | |
335 | +def get_fval(f): | |
336 | + return f.attrib['fVal'] | |
337 | + | |
338 | + | |
339 | +def get_rarest_word(words): | |
340 | + min_freq = 0 | |
341 | + rarest_word = words[0] | |
342 | + for i, word in enumerate(words): | |
343 | + word_freq = 0 | |
344 | + if word['base'] in FREQ_LIST: | |
345 | + word_freq = FREQ_LIST[word['base']] | |
346 | + | |
347 | + if i == 0 or word_freq < min_freq: | |
348 | + min_freq = word_freq | |
349 | + rarest_word = word | |
350 | + return rarest_word | |
351 | + | |
352 | + | |
353 | +# coreference | |
354 | +def add_coreference_layer(ann_archive, text): | |
355 | + ann_file = gzip.open(ann_archive, 'rb') | |
356 | + parser = etree.XMLParser(encoding="utf-8") | |
357 | + tree = etree.parse(ann_file, parser) | |
358 | + body = tree.xpath('//xmlns:body', namespaces={'xmlns': TEI_NS})[0] | |
359 | + | |
360 | + parts = body.xpath(".//xmlns:p", namespaces={'xmlns': TEI_NS}) | |
361 | + for par in parts: | |
362 | + coreferences = par.xpath(".//xmlns:seg", namespaces={'xmlns': TEI_NS}) | |
363 | + for cor in coreferences: | |
364 | + add_coreference(cor, text) | |
365 | + | |
366 | + | |
367 | +def add_coreference(coref, text): | |
368 | + idx = coref.attrib['{%s}id' % XML_NS] | |
369 | + | |
370 | + coref_type = None | |
371 | + dominant = None | |
372 | + for f in coref.xpath(".//xmlns:f", namespaces={'xmlns': TEI_NS}): | |
373 | + if f.attrib['name'] == 'type': | |
374 | + coref_type = get_fval(f) | |
375 | + elif f.attrib['name'] == 'dominant': | |
376 | + dominant = get_fval(f) | |
377 | + | |
378 | + if coref_type == 'ident': | |
379 | + for ptr in coref.xpath(".//xmlns:ptr", namespaces={'xmlns': TEI_NS}): | |
380 | + mnt_id = ptr.attrib['target'].split('#')[-1] | |
381 | + mention = text.get_mention(mnt_id) | |
382 | + mention.set = idx | |
383 | + mention.dominant = dominant | |
384 | + | |
385 | + | |
386 | +# write | |
387 | +def write(inpath, outpath, text): | |
388 | + | |
389 | + if not os.path.exists(outpath): | |
390 | + os.mkdir(outpath) | |
391 | + | |
392 | + for filename in os.listdir(inpath): | |
393 | + if not filename.startswith('ann_coreference'): | |
394 | + layer_inpath = os.path.join(inpath, filename) | |
395 | + layer_outpath = os.path.join(outpath, filename) | |
396 | + copy_layer(layer_inpath, layer_outpath) | |
397 | + | |
398 | + coref_outpath = os.path.join(outpath, 'ann_coreference.xml.gz') | |
399 | + write_coreference(coref_outpath, text) | |
400 | + | |
401 | + | |
402 | +def copy_layer(src, dest): | |
403 | + shutil.copyfile(src, dest) | |
404 | + | |
405 | + | |
406 | +def write_coreference(outpath, text): | |
407 | + root, tei = write_header() | |
408 | + write_body(tei, text) | |
409 | + | |
410 | + with gzip.open(outpath, 'wb') as output_file: | |
411 | + output_file.write(etree.tostring(root, pretty_print=True, | |
412 | + xml_declaration=True, encoding='UTF-8')) | |
413 | + | |
414 | + | |
415 | +def write_header(): | |
416 | + root = etree.Element('teiCorpus', nsmap=NSMAP) | |
417 | + | |
418 | + corpus_xinclude = etree.SubElement(root, etree.QName(XI_NS, 'include')) | |
419 | + corpus_xinclude.attrib['href'] = 'PCC_header.xml' | |
420 | + | |
421 | + tei = etree.SubElement(root, 'TEI') | |
422 | + tei_xinclude = etree.SubElement(tei, etree.QName(XI_NS, 'include')) | |
423 | + tei_xinclude.attrib['href'] = 'header.xml' | |
424 | + | |
425 | + return root, tei | |
426 | + | |
427 | + | |
428 | +def write_body(tei, text): | |
429 | + text_node = etree.SubElement(tei, 'text') | |
430 | + body = etree.SubElement(text_node, 'body') | |
431 | + p = etree.SubElement(body, 'p') | |
432 | + | |
433 | + sets = text.get_sets() | |
434 | + for set_id in sets: | |
435 | + comment_text = create_set_comment(sets[set_id]) | |
436 | + p.append(etree.Comment(comment_text)) | |
437 | + | |
438 | + seg = etree.SubElement(p, 'seg') | |
439 | + seg.attrib[etree.QName(XML_NS, 'id')] = set_id.replace('set', 'coreference') | |
440 | + | |
441 | + fs = etree.SubElement(seg, 'fs') | |
442 | + fs.attrib['type'] = 'coreference' | |
443 | + | |
444 | + f_type = etree.SubElement(fs, 'f') | |
445 | + f_type.attrib['name'] = 'type' | |
446 | + f_type.attrib['fVal'] = 'ident' | |
447 | + | |
448 | + dominant = get_dominant(sets[set_id]) | |
449 | + f_dominant = etree.SubElement(fs, 'f') | |
450 | + f_dominant.attrib['name'] = 'dominant' | |
451 | + f_dominant.attrib['fVal'] = dominant | |
452 | + | |
453 | + for mnt in sets[set_id]: | |
454 | + ptr = etree.SubElement(seg, 'ptr') | |
455 | + ptr.attrib['target'] = 'ann_mentions.xml#%s' % mnt.id | |
456 | + | |
457 | + | |
458 | +def create_set_comment(mentions): | |
459 | + mentions_orths = [mnt.text for mnt in mentions] | |
460 | + return ' %s ' % '; '.join(mentions_orths) | |
461 | + | |
462 | + | |
463 | +def get_dominant(mentions): | |
464 | + longest_mention = mentions[0] | |
465 | + for mnt in mentions: | |
466 | + if len(mnt.words) > len(longest_mention.words): | |
467 | + longest_mention = mnt | |
468 | + return longest_mention.text | |
... | ... |
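Note: the interpretation strings parsed above follow the NKJP convention base:ctag[:msd]. A condensed, stand-alone restatement of the parsing step with illustrative inputs:

    def parse_interpretation(interpretation):
        # base:ctag[:msd]; punctuation entries start with ':' and get ctag 'interp'
        split = interpretation.split(':')
        if interpretation.startswith(':'):
            return ':', 'interp', ''
        if len(split) > 2:
            return split[0], split[1], ':'.join(split[2:])
        return split[0], split[1], ''

    assert parse_interpretation('kot:subst:sg:nom:m2') == ('kot', 'subst', 'sg:nom:m2')
    assert parse_interpretation(':interp') == (':', 'interp', '')

The msd part then feeds get_number, get_person and get_gender, so 'sg:nom:m2' yields number 'sg', person 'unk' and gender 'm2'.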
corneferencer/main.py
... | ... | @@ -7,7 +7,7 @@ from natsort import natsorted |
7 | 7 | sys.path.append(os.path.abspath(os.path.join('..'))) |
8 | 8 | |
9 | 9 | import conf |
10 | -from inout import mmax | |
10 | +from inout import mmax, tei | |
11 | 11 | from inout.constants import INPUT_FORMATS |
12 | 12 | from resolvers import resolve |
13 | 13 | from resolvers.constants import RESOLVERS |
... | ... | @@ -26,7 +26,8 @@ def main(): |
26 | 26 | resolver = args.resolver |
27 | 27 | if conf.NEURAL_MODEL_ARCHITECTURE == 'siamese': |
28 | 28 | resolver = conf.NEURAL_MODEL_ARCHITECTURE |
29 | - eprint ("Warning: Using %s resolver because of selected neural model architecture!" % conf.NEURAL_MODEL_ARCHITECTURE) | |
29 | + eprint("Warning: Using %s resolver because of selected neural model architecture!" % | |
30 | + conf.NEURAL_MODEL_ARCHITECTURE) | |
30 | 31 | process_texts(args.input, args.output, args.format, resolver, args.threshold) |
31 | 32 | |
32 | 33 | |
... | ... | @@ -39,15 +40,16 @@ def parse_arguments(): |
39 | 40 | dest='output', default='', |
40 | 41 | help='output path; if not specified writes output to standard output') |
41 | 42 | parser.add_argument('-f', '--format', type=str, action='store', |
42 | - dest='format', default='mmax', | |
43 | - help='input format; default: mmax') | |
43 | + dest='format', default=INPUT_FORMATS[0], | |
44 | + help='input format; default: %s; possibilities: %s' | |
45 | + % (INPUT_FORMATS[0], ', '.join(INPUT_FORMATS))) |
44 | 46 | parser.add_argument('-r', '--resolver', type=str, action='store', |
45 | - dest='resolver', default='incremental', | |
46 | - help='resolve algorithm; default: incremental; possibilities: %s' | |
47 | - % ', '.join(RESOLVERS)) | |
47 | + dest='resolver', default=RESOLVERS[0], | |
48 | + help='resolve algorithm; default: %s; possibilities: %s' | |
49 | + % (RESOLVERS[0], ', '.join(RESOLVERS))) |
48 | 50 | parser.add_argument('-t', '--threshold', type=float, action='store', |
49 | - dest='threshold', default=0.001, | |
50 | - help='threshold; default: 0.001') | |
51 | + dest='threshold', default=0.85, | |
52 | + help='threshold; default: 0.85') | |
51 | 53 | |
52 | 54 | args = parser.parse_args() |
53 | 55 | return args |
... | ... | @@ -57,7 +59,7 @@ def process_texts(inpath, outpath, informat, resolver, threshold): |
57 | 59 | if os.path.isdir(inpath): |
58 | 60 | process_directory(inpath, outpath, informat, resolver, threshold) |
59 | 61 | elif os.path.isfile(inpath): |
60 | - process_file(inpath, outpath, informat, resolver, threshold) | |
62 | + process_text(inpath, outpath, informat, resolver, threshold) | |
61 | 63 | else: |
62 | 64 | eprint("Error: Specified input does not exist!") |
63 | 65 | |
... | ... | @@ -73,10 +75,10 @@ def process_directory(inpath, outpath, informat, resolver, threshold): |
73 | 75 | textname = os.path.splitext(os.path.basename(filename))[0] |
74 | 76 | textoutput = os.path.join(outpath, textname) |
75 | 77 | textinput = os.path.join(inpath, filename) |
76 | - process_file(textinput, textoutput, informat, resolver, threshold) | |
78 | + process_text(textinput, textoutput, informat, resolver, threshold) | |
77 | 79 | |
78 | 80 | |
79 | -def process_file(inpath, outpath, informat, resolver, threshold): | |
81 | +def process_text(inpath, outpath, informat, resolver, threshold): | |
80 | 82 | basename = os.path.basename(inpath) |
81 | 83 | if informat == 'mmax' and basename.endswith('.mmax'): |
82 | 84 | print (basename) |
... | ... | @@ -92,6 +94,20 @@ def process_file(inpath, outpath, informat, resolver, threshold): |
92 | 94 | elif resolver == 'all2all': |
93 | 95 | resolve.all2all(text, threshold) |
94 | 96 | mmax.write(inpath, outpath, text) |
97 | + elif informat == 'tei': | |
98 | + print(basename) |
99 | + text = tei.read(inpath) | |
100 | + if resolver == 'incremental': | |
101 | + resolve.incremental(text, threshold) | |
102 | + elif resolver == 'entity_based': | |
103 | + resolve.entity_based(text, threshold) | |
104 | + elif resolver == 'closest': | |
105 | + resolve.closest(text, threshold) | |
106 | + elif resolver == 'siamese': | |
107 | + resolve.siamese(text, threshold) | |
108 | + elif resolver == 'all2all': | |
109 | + resolve.all2all(text, threshold) | |
110 | + tei.write(inpath, outpath, text) | |
95 | 111 | |
96 | 112 | |
97 | 113 | if __name__ == '__main__': |
... | ... |
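Note: the new 'tei' branch follows the same read-resolve-write pipeline as the MMAX one. A programmatic sketch of what it does for a single text (paths are illustrative; the import spelling assumes the sys.path setup at the top of main.py):

    from inout import tei
    from resolvers import resolve

    text = tei.read('/data/pcc/text001')  # directory holding the ann_*.xml.gz layers
    if text is not None:                  # tei.read returns None on missing layers
        resolve.incremental(text, 0.85)   # 0.85 is the new default threshold
        tei.write('/data/pcc/text001', '/data/out/text001', text)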
corneferencer/resolvers/constants.py
corneferencer/resolvers/features.py
... | ... | @@ -229,7 +229,7 @@ def ante_contains_rarest_from_ana(ante, ana): |
229 | 229 | def agreement(ante, ana, tag_name): |
230 | 230 | agr_vec = [0] * 3 |
231 | 231 | if (ante.head is None or ana.head is None or |
232 | - ante.head[tag_name] == 'unk' or ana.head[tag_name] == 'unk'): | |
232 | + ante.head[tag_name] == 'unk' or ana.head[tag_name] == 'unk'): | |
233 | 233 | agr_vec[2] = 1 |
234 | 234 | elif ante.head[tag_name] == ana.head[tag_name]: |
235 | 235 | agr_vec[0] = 1 |
... | ... | @@ -279,10 +279,10 @@ def same_paragraph(ante, ana): |
279 | 279 | def flat_gender_agreement(ante, ana): |
280 | 280 | agr_vec = [0] * 3 |
281 | 281 | if (ante.head is None or ana.head is None or |
282 | - ante.head['gender'] == 'unk' or ana.head['gender'] == 'unk'): | |
282 | + ante.head['gender'] == 'unk' or ana.head['gender'] == 'unk'): | |
283 | 283 | agr_vec[2] = 1 |
284 | 284 | elif (ante.head['gender'] == ana.head['gender'] or |
285 | - (ante.head['gender'] in constants.MASCULINE_TAGS and ana.head['gender'] in constants.MASCULINE_TAGS)): | |
285 | + (ante.head['gender'] in constants.MASCULINE_TAGS and ana.head['gender'] in constants.MASCULINE_TAGS)): | |
286 | 286 | agr_vec[0] = 1 |
287 | 287 | else: |
288 | 288 | agr_vec[1] = 1 |
... | ... | @@ -314,13 +314,13 @@ def abbrev2(ante, ana): |
314 | 314 | def string_kernel(ante, ana): |
315 | 315 | s1 = ante.text |
316 | 316 | s2 = ana.text |
317 | - return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
317 | + return sk(s1, s2) / (math.sqrt(sk(s1, s1) * sk(s2, s2))) | |
318 | 318 | |
319 | 319 | |
320 | 320 | def head_string_kernel(ante, ana): |
321 | 321 | s1 = ante.head_orth |
322 | 322 | s2 = ana.head_orth |
323 | - return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2))) | |
323 | + return sk(s1, s2) / (math.sqrt(sk(s1, s1) * sk(s2, s2))) | |
324 | 324 | |
325 | 325 | |
326 | 326 | def wordnet_synonyms(ante, ana): |
... | ... | @@ -443,22 +443,22 @@ def samesent_anapron_antefirstinpar(ante, ana): |
443 | 443 | |
444 | 444 | def samesent_antefirstinpar_personnumbermatch(ante, ana): |
445 | 445 | if (same_sentence(ante, ana) and ante.first_in_paragraph |
446 | - and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
446 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
447 | 447 | return 1 |
448 | 448 | return 0 |
449 | 449 | |
450 | 450 | |
451 | 451 | def adjsent_anapron_adjmen_personnumbermatch(ante, ana): |
452 | 452 | if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) |
453 | - and ana.position_in_mentions - ante.position_in_mentions == 1 | |
454 | - and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
453 | + and ana.position_in_mentions - ante.position_in_mentions == 1 | |
454 | + and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]): | |
455 | 455 | return 1 |
456 | 456 | return 0 |
457 | 457 | |
458 | 458 | |
459 | 459 | def adjsent_anapron_adjmen(ante, ana): |
460 | 460 | if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana) |
461 | - and ana.position_in_mentions - ante.position_in_mentions == 1): | |
461 | + and ana.position_in_mentions - ante.position_in_mentions == 1): | |
462 | 462 | return 1 |
463 | 463 | return 0 |
464 | 464 | |
... | ... | @@ -535,16 +535,16 @@ def get_abbrev(mention): |
535 | 535 | return abbrev |
536 | 536 | |
537 | 537 | |
538 | -def SK(s1, s2): | |
539 | - LAMBDA = 0.4 | |
538 | +def sk(s1, s2): | |
539 | + lam = 0.4 | |
540 | 540 | |
541 | 541 | p = len(s1) |
542 | 542 | if len(s2) < len(s1): |
543 | 543 | p = len(s2) |
544 | 544 | |
545 | 545 | h, w = len(s1)+1, len(s2)+1 |
546 | - DPS = [[0.0] * w for i in range(h)] | |
547 | - DP = [[0.0] * w for i in range(h)] | |
546 | + dps = [[0.0] * w for i in range(h)] | |
547 | + dp = [[0.0] * w for i in range(h)] | |
548 | 548 | |
549 | 549 | kernel_mat = [0.0] * (len(s1) + 1) |
550 | 550 | |
... | ... | @@ -555,35 +555,35 @@ def SK(s1, s2): |
555 | 555 | if j == 0: |
556 | 556 | continue |
557 | 557 | if s1[i-1] == s2[j-1]: |
558 | - DPS[i][j] = LAMBDA * LAMBDA | |
559 | - kernel_mat[0] += DPS[i][j] | |
558 | + dps[i][j] = lam * lam | |
559 | + kernel_mat[0] += dps[i][j] | |
560 | 560 | else: |
561 | - DPS[i][j] = 0.0 | |
561 | + dps[i][j] = 0.0 | |
562 | 562 | |
563 | - for l in range(p): | |
564 | - if l == 0: | |
563 | + for m in range(p): | |
564 | + if m == 0: | |
565 | 565 | continue |
566 | 566 | |
567 | - kernel_mat[l] = 0.0 | |
567 | + kernel_mat[m] = 0.0 | |
568 | 568 | for j in range(len(s2)+1): |
569 | - DP[l-1][j] = 0.0 | |
569 | + dp[m-1][j] = 0.0 | |
570 | 570 | |
571 | 571 | for i in range(len(s1)+1): |
572 | - DP[i][l-1] = 0.0 | |
572 | + dp[i][m-1] = 0.0 | |
573 | 573 | |
574 | 574 | for i in range(len(s1)+1): |
575 | - if i < l: | |
575 | + if i < m: | |
576 | 576 | continue |
577 | 577 | for j in range(len(s2)+1): |
578 | - if j < l: | |
578 | + if j < m: | |
579 | 579 | continue |
580 | - DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
580 | + dp[i][j] = dps[i][j] + lam * dp[i - 1][j] + lam * dp[i][j - 1] - lam * lam * dp[i - 1][j - 1] | |
581 | 581 | |
582 | 582 | if s1[i-1] == s2[j-1]: |
583 | - DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1] | |
584 | - kernel_mat[l] += DPS[i][j] | |
583 | + dps[i][j] = lam * lam * dp[i - 1][j - 1] | |
584 | + kernel_mat[m] += dps[i][j] | |
585 | 585 | |
586 | - K = 0.0 | |
587 | - for l in range(p): | |
588 | - K += kernel_mat[l] | |
589 | - return K | |
586 | + k = 0.0 | |
587 | + for i in range(p): | |
588 | + k += kernel_mat[i] | |
589 | + return k | |
... | ... |
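Note: sk computes a subsequence string kernel; string_kernel and head_string_kernel divide it by the geometric mean of the self-kernels, so identical strings score 1.0 and strings with no shared characters score 0.0. A quick check (the import path is an assumption based on the repository layout and main.py's sys.path setup):

    import math

    from resolvers.features import sk

    def normalized_sk(s1, s2):
        # the same normalization used by string_kernel above
        return sk(s1, s2) / math.sqrt(sk(s1, s1) * sk(s2, s2))

    print(normalized_sk('prezydent', 'prezydent'))  # 1.0 for identical strings
    print(normalized_sk('kot', 'pies'))             # 0.0, no shared characters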
corneferencer/resolvers/resolve.py
... | ... | @@ -65,11 +65,10 @@ def incremental(text, threshold): |
65 | 65 | def all2all_debug(text, threshold): |
66 | 66 | last_set_id = 0 |
67 | 67 | for pos1, mnt1 in enumerate(text.mentions): |
68 | - print ('!!!!!!!!!!%s!!!!!!!!!!!' % mnt1.text) | |
69 | 68 | best_prediction = 0.0 |
70 | 69 | best_link = None |
71 | 70 | for pos2, mnt2 in enumerate(text.mentions): |
72 | - if ((mnt1.set != mnt2.set or not mnt1.set) and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)): | |
71 | + if (mnt1.set != mnt2.set or not mnt1.set) and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2): | |
73 | 72 | ante = mnt1 |
74 | 73 | ana = mnt2 |
75 | 74 | if pos2 < pos1: |
... | ... | @@ -78,12 +77,10 @@ def all2all_debug(text, threshold): |
78 | 77 | pair_vec = get_pair_vector(ante, ana) |
79 | 78 | sample = numpy.asarray([pair_vec], dtype=numpy.float32) |
80 | 79 | prediction = NEURAL_MODEL.predict(sample)[0] |
81 | - print (u'mnt2: %s | %s == %s >> %f' % (mnt2.text, ante.text, ana.text, prediction)) | |
82 | 80 | if prediction > threshold and prediction > best_prediction: |
83 | 81 | best_prediction = prediction |
84 | 82 | best_link = mnt2 |
85 | 83 | if best_link is not None: |
86 | - print (u'best: %s >> %f, best set: %s, mnt1_set: %s' % (best_link.text, best_prediction, best_link.set, mnt1.set)) | |
87 | 84 | if best_link.set and not mnt1.set: |
88 | 85 | mnt1.set = best_link.set |
89 | 86 | elif best_link.set and mnt1.set: |
... | ... | @@ -93,7 +90,6 @@ def all2all_debug(text, threshold): |
93 | 90 | best_link.set = str_set_id |
94 | 91 | mnt1.set = str_set_id |
95 | 92 | last_set_id += 1 |
96 | - print (u'best set: %s, mnt1_set: %s' % (best_link.set, mnt1.set)) | |
97 | 93 | |
98 | 94 | |
99 | 95 | def all2all_v1(text, threshold): |
... | ... | @@ -103,7 +99,7 @@ def all2all_v1(text, threshold): |
103 | 99 | best_link = None |
104 | 100 | for pos2, mnt2 in enumerate(text.mentions): |
105 | 101 | if ((mnt1.set != mnt2.set or not mnt1.set or not mnt2.set) |
106 | - and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)): | |
102 | + and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)): | |
107 | 103 | ante = mnt1 |
108 | 104 | ana = mnt2 |
109 | 105 | if pos2 < pos1: |
... | ... | @@ -137,7 +133,7 @@ def all2all(text, threshold): |
137 | 133 | best_link = None |
138 | 134 | for pos2, mnt2 in enumerate(text.mentions): |
139 | 135 | if ((mnt1.set != mnt2.set or not mnt1.set or not mnt2.set) |
140 | - and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)): | |
136 | + and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)): | |
141 | 137 | ante = mnt1 |
142 | 138 | ana = mnt2 |
143 | 139 | if pos2 < pos1: |
... | ... | @@ -166,7 +162,6 @@ def all2all(text, threshold): |
166 | 162 | sets[str_set_id] = [best_link, mnt1] |
167 | 163 | |
168 | 164 | |
169 | - | |
170 | 165 | # entity based resolve algorithm |
171 | 166 | def entity_based(text, threshold): |
172 | 167 | sets = [] |
... | ... |
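Note: the all2all variants share one best-link rule: for each mention, score every admissible partner, keep the single highest prediction above the threshold, then join or merge coreference sets. A stand-alone distillation of the inner loop, with the neural scorer stubbed out (score stands in for building a pair vector and calling NEURAL_MODEL.predict; the pair_intersect filter is omitted for brevity):

    def best_partner(mentions, pos1, score, threshold):
        mnt1 = mentions[pos1]
        best_link, best_prediction = None, 0.0
        for pos2, mnt2 in enumerate(mentions):
            # skip the mention itself and pairs already in the same non-empty set
            if pos1 == pos2 or (mnt1.set and mnt1.set == mnt2.set):
                continue
            # the earlier mention is always treated as the antecedent
            ante, ana = (mnt2, mnt1) if pos2 < pos1 else (mnt1, mnt2)
            prediction = score(ante, ana)
            if prediction > threshold and prediction > best_prediction:
                best_link, best_prediction = mnt2, prediction
        return best_link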
corneferencer/resolvers/vectors.py
... | ... | @@ -24,10 +24,10 @@ def get_mention_features(mention): |
24 | 24 | vec.extend(features.mention_vec(mention)) |
25 | 25 | vec.extend(features.sentence_vec(mention)) |
26 | 26 | |
27 | - # cechy uzupelniajace | |
27 | + # complementary features | |
28 | 28 | vec.extend(features.mention_type(mention)) |
29 | 29 | |
30 | - # cechy uzupelniajace 2 | |
30 | + # complementary features 2 | |
31 | 31 | vec.append(features.is_first_second_person(mention)) |
32 | 32 | vec.append(features.is_demonstrative(mention)) |
33 | 33 | vec.append(features.is_demonstrative_nominal(mention)) |
... | ... | @@ -50,7 +50,7 @@ def get_pair_features(ante, ana): |
50 | 50 | vec.append(features.exact_match(ante, ana)) |
51 | 51 | vec.append(features.base_match(ante, ana)) |
52 | 52 | |
53 | - # cechy uzupelniajace | |
53 | + # complementary features | |
54 | 54 | vec.append(features.ante_contains_rarest_from_ana(ante, ana)) |
55 | 55 | vec.extend(features.agreement(ante, ana, 'gender')) |
56 | 56 | vec.extend(features.agreement(ante, ana, 'number')) |
... | ... | @@ -59,7 +59,7 @@ def get_pair_features(ante, ana): |
59 | 59 | vec.append(features.same_sentence(ante, ana)) |
60 | 60 | vec.append(features.same_paragraph(ante, ana)) |
61 | 61 | |
62 | - # cechy uzupelniajace 2 | |
62 | + # complementary features 2 | |
63 | 63 | vec.append(features.neighbouring_sentence(ante, ana)) |
64 | 64 | vec.append(features.cousin_sentence(ante, ana)) |
65 | 65 | vec.append(features.distant_sentence(ante, ana)) |
... | ... | @@ -79,7 +79,7 @@ def get_pair_features(ante, ana): |
79 | 79 | vec.append(features.wikipedia_mutual_link(ante, ana)) |
80 | 80 | vec.append(features.wikipedia_redirect(ante, ana)) |
81 | 81 | |
82 | - # combined | |
82 | + # combined features | |
83 | 83 | vec.append(features.samesent_anapron_antefirstinpar(ante, ana)) |
84 | 84 | vec.append(features.samesent_antefirstinpar_personnumbermatch(ante, ana)) |
85 | 85 | vec.append(features.adjsent_anapron_adjmen_personnumbermatch(ante, ana)) |
... | ... |
corneferencer/utils.py
... | ... | @@ -72,7 +72,6 @@ def initialize_siamese_model(number_of_features, path_to_model): |
72 | 72 | |
73 | 73 | |
74 | 74 | def create_base_network(input_dim): |
75 | - '''Base network to be shared''' | |
76 | 75 | seq = Sequential() |
77 | 76 | |
78 | 77 | seq.add(Dense(1000, input_shape=(input_dim,), activation='relu')) |
... | ... | @@ -94,13 +93,10 @@ def euclidean_distance(vects): |
94 | 93 | |
95 | 94 | def eucl_dist_output_shape(shapes): |
96 | 95 | shape1, shape2 = shapes |
97 | - return (shape1[0], 1) | |
96 | + return shape1[0], 1 | |
98 | 97 | |
99 | 98 | |
100 | 99 | def contrastive_loss(y_true, y_pred): |
101 | - '''Contrastive loss from Hadsell-et-al.'06 | |
102 | - http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf | |
103 | - ''' | |
104 | 100 | margin = 1 |
105 | 101 | return K.mean(y_true * K.square(y_pred) + (1 - y_true) * K.square(K.maximum(margin - y_pred, 0))) |
106 | 102 | |
... | ... | @@ -125,9 +121,9 @@ def load_one2many_map(map_path): |
125 | 121 | jmap_annotations = pobj.__dict__['annotations'] |
126 | 122 | jmap_annotations_count = len(jmap_annotations) |
127 | 123 | for i in range(jmap_annotations_count): |
128 | - if i%2 == 1: | |
129 | - mapped_elements = set(jmap_annotations[i+1].__dict__['annotations']) | |
130 | - this_map[jmap_annotations[i]] = mapped_elements | |
124 | + if i % 2 == 1: | |
125 | + mapped_elements = set(jmap_annotations[i+1].__dict__['annotations']) | |
126 | + this_map[jmap_annotations[i]] = mapped_elements | |
131 | 127 | return this_map |
132 | 128 | |
133 | 129 | |
... | ... | @@ -138,7 +134,7 @@ def load_one2one_map(map_path): |
138 | 134 | jmap_annotations = pobj.__dict__['annotations'] |
139 | 135 | jmap_annotations_count = len(jmap_annotations) |
140 | 136 | for i in range(jmap_annotations_count): |
141 | - if i%2 == 1: | |
142 | - element = jmap_annotations[i+1] | |
143 | - this_map[jmap_annotations[i]] = element | |
137 | + if i % 2 == 1: | |
138 | + element = jmap_annotations[i+1] | |
139 | + this_map[jmap_annotations[i]] = element | |
144 | 140 | return this_map |
... | ... |
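Note: contrastive_loss implements the Hadsell et al. (2006) formulation that the removed docstring cited: L = mean(y * d^2 + (1 - y) * max(margin - d, 0)^2), where d is the predicted distance and y is 1 for coreferent pairs. A numpy mirror for a quick sanity check:

    import numpy as np

    def contrastive_loss_np(y_true, y_pred, margin=1.0):
        # element-wise mirror of the Keras contrastive_loss above
        return np.mean(y_true * y_pred ** 2 +
                       (1 - y_true) * np.maximum(margin - y_pred, 0) ** 2)

    y_true = np.array([1.0, 0.0])  # 1 = coreferent pair, 0 = non-coreferent
    y_pred = np.array([0.2, 0.3])  # predicted distances
    print(contrastive_loss_np(y_true, y_pred))  # (0.04 + 0.49) / 2 = 0.265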