Commit 4a097c4f123cc34ee07d870361bad7a817a3faeb (1 parent: a5beabe2)

Corneferencer alpha version

Showing 15 changed files with 782 additions and 0 deletions
conf.py
0 → 100644
+import os
+
+from gensim.models.word2vec import Word2Vec
+
+from corneferencer.utils import initialize_neural_model
+
+
+CONTEXT = 5
+THRESHOLD = 0.5
+RANDOM_WORD_VECTORS = True
+W2V_SIZE = 50
+W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model'
+
+NUMBER_OF_FEATURES = 1126
+NEURAL_MODEL_NAME = 'weights_2017_05_10.h5'
+
+
+# do not change that
+W2V_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', W2V_MODEL_NAME)
+W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH)
+
+NEURAL_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', NEURAL_MODEL_NAME)
+NEURAL_MODEL = initialize_neural_model(NUMBER_OF_FEATURES)
+# assumed fix: load the trained weights from the path defined above, which is otherwise unused
+NEURAL_MODEL.load_weights(NEURAL_MODEL_PATH)
corneferencer/entities.py
0 → 100644
+from corneferencer.resolvers.vectors import get_mention_features
+
+
+class Text:
+
+    def __init__(self, text_id):
+        self.__id = text_id
+        self.mentions = []
+
+    def get_mention_set(self, mnt_id):
+        for mnt in self.mentions:
+            if mnt.id == mnt_id:
+                return mnt.set
+        return None
+
+
+class Mention:
+
+    def __init__(self, mnt_id, text, lemmatized_text, words, span,
+                 head_orth, head_base, dominant, node, prec_context,
+                 follow_context, sentence, position_in_mentions,
+                 start_in_words, end_in_words):
+        self.id = mnt_id
+        self.set = ''
+        self.text = text
+        self.lemmatized_text = lemmatized_text
+        self.words = words
+        self.span = span
+        self.head_orth = head_orth
+        self.head_base = head_base
+        self.dominant = dominant
+        self.node = node
+        self.prec_context = prec_context
+        self.follow_context = follow_context
+        self.sentence = sentence
+        self.position_in_mentions = position_in_mentions
+        self.start_in_words = start_in_words
+        self.end_in_words = end_in_words
+        self.features = get_mention_features(self)
corneferencer/core.py renamed to corneferencer/inout/__init__.py
corneferencer/inout/constants.py
0 → 100644
+INPUT_FORMATS = ['mmax']
corneferencer/inout/mmax.py
0 → 100644
+import os
+import shutil
+
+from lxml import etree
+
+from conf import CONTEXT
+from corneferencer.entities import Mention, Text
+
+
+def read(inpath):
+    textname = os.path.splitext(os.path.basename(inpath))[0]
+    textdir = os.path.dirname(inpath)
+
+    mentions_path = os.path.join(textdir, '%s_mentions.xml' % textname)
+    words_path = os.path.join(textdir, '%s_words.xml' % textname)
+
+    text = Text(textname)
+    mentions = read_mentions(mentions_path, words_path)
+    text.mentions = mentions
+    return text
+
+
+def read_mentions(mentions_path, words_path):
+    mentions = []
+    mentions_tree = etree.parse(mentions_path)
+    markables = mentions_tree.xpath("//ns:markable",
+                                    namespaces={'ns': 'www.eml.org/NameSpaces/mention'})
+    words = get_words(words_path)
+
+    for idx, markable in enumerate(markables):
+        span = markable.attrib['span']
+
+        dominant = ''
+        if 'dominant' in markable.attrib:
+            dominant = markable.attrib['dominant']
+
+        head_orth = markable.attrib['mention_head']
+        mention_words = span_to_words(span, words)
+
+        (prec_context, follow_context, sentence,
+         mnt_start_position, mnt_end_position) = get_context(mention_words, words)
+
+        head_base = get_head_base(head_orth, mention_words)
+        mention = Mention(mnt_id=markable.attrib['id'],
+                          text=span_to_text(span, words, 'orth'),
+                          lemmatized_text=span_to_text(span, words, 'base'),
+                          words=mention_words,
+                          span=span,
+                          head_orth=head_orth,
+                          head_base=head_base,
+                          dominant=dominant,
+                          node=markable,
+                          prec_context=prec_context,
+                          follow_context=follow_context,
+                          sentence=sentence,
+                          position_in_mentions=idx,
+                          start_in_words=mnt_start_position,
+                          end_in_words=mnt_end_position)
+        mentions.append(mention)
+
+    return mentions
+
+
+def get_words(filepath):
+    tree = etree.parse(filepath)
+    words = []
+    for word in tree.xpath("//word"):
+        hasnps = False
+        if 'hasnps' in word.attrib and word.attrib['hasnps'] == 'true':
+            hasnps = True
+        lastinsent = False
+        if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true':
+            lastinsent = True
+        words.append({'id': word.attrib['id'],
+                      'orth': word.text,
+                      'base': word.attrib['base'],
+                      'hasnps': hasnps,
+                      'lastinsent': lastinsent,
+                      'ctag': word.attrib['ctag']})
+    return words
+
+
+def span_to_words(span, words):
+    fragments = span.split(',')
+    mention_parts = []
+    for fragment in fragments:
+        mention_parts.extend(fragment_to_words(fragment, words))
+    return mention_parts
+
+
+def fragment_to_words(fragment, words):
+    mention_parts = []
+    if '..' in fragment:
+        mention_parts.extend(get_multiword(fragment, words))
+    else:
+        mention_parts.extend(get_word(fragment, words))
+    return mention_parts
+
+
+def get_multiword(fragment, words):
+    mention_parts = []
+    boundaries = fragment.split('..')
+    start_id = boundaries[0]
+    end_id = boundaries[1]
+    in_string = False
+    for word in words:
+        if word['id'] == start_id:
+            in_string = True
+        if in_string and not word_to_ignore(word):
+            mention_parts.append(word)
+        if word['id'] == end_id:
+            break
+    return mention_parts
+
+
+def get_word(word_id, words):
+    for word in words:
+        if word['id'] == word_id:
+            if not word_to_ignore(word):
+                return [word]
+            else:
+                return []
+    return []
+
+
+def word_to_ignore(word):
+    if word['ctag'] == 'interp':
+        return True
+    return False
+
+
+def get_context(mention_words, words):
+    prec_context = []
+    follow_context = []
+    sentence = []
+    mnt_start_position = -1
+    mnt_end_position = -1
+    first_word = mention_words[0]
+    last_word = mention_words[-1]
+    for idx, word in enumerate(words):
+        if word['id'] == first_word['id']:
+            prec_context = get_prec_context(idx, words)
+            mnt_start_position = get_mention_start(first_word, words)
+        if word['id'] == last_word['id']:
+            follow_context = get_follow_context(idx, words)
+            sentence = get_sentence(idx, words)
+            mnt_end_position = get_mention_end(last_word, words)
+            break
+    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position
+
+
+def get_prec_context(mention_start, words):
+    context = []
+    context_start = mention_start - 1
+    while context_start >= 0:
+        if not word_to_ignore(words[context_start]):
+            context.append(words[context_start])
+        if len(context) == CONTEXT:
+            break
+        context_start -= 1
+    context.reverse()
+    return context
+
+
+def get_mention_start(first_word, words):
+    start = 0
+    for word in words:
+        if not word_to_ignore(word):
+            start += 1
+        if word['id'] == first_word['id']:
+            break
+    return start
+
+
+def get_mention_end(last_word, words):
+    end = 0
+    for word in words:
+        if not word_to_ignore(word):
+            end += 1
+        if word['id'] == last_word['id']:
+            break
+    return end
+
+
+def get_follow_context(mention_end, words):
+    context = []
+    context_end = mention_end + 1
+    while context_end < len(words):
+        if not word_to_ignore(words[context_end]):
+            context.append(words[context_end])
+        if len(context) == CONTEXT:
+            break
+        context_end += 1
+    return context
+
+
+def get_sentence(word_idx, words):
+    sentence_start = get_sentence_start(words, word_idx)
+    sentence_end = get_sentence_end(words, word_idx)
+    sentence = [word for word in words[sentence_start:sentence_end + 1] if not word_to_ignore(word)]
+    return sentence
+
+
+def get_sentence_start(words, word_idx):
+    search_start = word_idx
+    while word_idx >= 0:
+        if words[word_idx]['lastinsent'] and search_start != word_idx:
+            return word_idx + 1
+        word_idx -= 1
+    return 0
+
+
+def get_sentence_end(words, word_idx):
+    while word_idx < len(words):
+        if words[word_idx]['lastinsent']:
+            return word_idx
+        word_idx += 1
+    return len(words) - 1
+
+
+def get_head_base(head_orth, words):
+    for word in words:
+        if word['orth'].lower() == head_orth.lower() or word['orth'] == head_orth:
+            return word['base']
+    return None
+
+
+def span_to_text(span, words, form):
+    fragments = span.split(',')
+    mention_parts = []
+    for fragment in fragments:
+        mention_parts.append(fragment_to_text(fragment, words, form))
+    return u' [...] '.join(mention_parts)
+
+
+def fragment_to_text(fragment, words, form):
+    if '..' in fragment:
+        text = get_multiword_text(fragment, words, form)
+    else:
+        text = get_one_word_text(fragment, words, form)
+    return text
+
+
+def get_multiword_text(fragment, words, form):
+    mention_parts = []
+    boundaries = fragment.split('..')
+    start_id = boundaries[0]
+    end_id = boundaries[1]
+    in_string = False
+    for word in words:
+        if word['id'] == start_id:
+            in_string = True
+        if in_string and not word_to_ignore(word):
+            mention_parts.append(word)
+        if word['id'] == end_id:
+            break
+    return to_text(mention_parts, form)
+
+
+def to_text(words, form):
+    text = ''
+    for idx, word in enumerate(words):
+        if word['hasnps'] or idx == 0:
+            text += word[form]
+        else:
+            text += u' %s' % word[form]
+    return text
+
+
+def get_one_word_text(word_id, words, form):
+    this_word = next(word for word in words if word['id'] == word_id)
+    return this_word[form]
+
+
+def write(inpath, outpath, text):
+    textname = os.path.splitext(os.path.basename(inpath))[0]
+    intextdir = os.path.dirname(inpath)
+    outtextdir = os.path.dirname(outpath)
+
+    in_mmax_path = os.path.join(intextdir, '%s.mmax' % textname)
+    out_mmax_path = os.path.join(outtextdir, '%s.mmax' % textname)
+    copy_mmax(in_mmax_path, out_mmax_path)
+
+    in_words_path = os.path.join(intextdir, '%s_words.xml' % textname)
+    out_words_path = os.path.join(outtextdir, '%s_words.xml' % textname)
+    copy_words(in_words_path, out_words_path)
+
+    in_mentions_path = os.path.join(intextdir, '%s_mentions.xml' % textname)
+    out_mentions_path = os.path.join(outtextdir, '%s_mentions.xml' % textname)
+    write_mentions(in_mentions_path, out_mentions_path, text)
+
+
+def copy_mmax(src, dest):
+    shutil.copyfile(src, dest)
+
+
+def copy_words(src, dest):
+    shutil.copyfile(src, dest)
+
+
+def write_mentions(inpath, outpath, text):
+    tree = etree.parse(inpath)
+    mentions = tree.xpath("//ns:markable", namespaces={'ns': 'www.eml.org/NameSpaces/mention'})
+
+    for mnt in mentions:
+        mnt_set = text.get_mention_set(mnt.attrib['id'])
+        if mnt_set:
+            mnt.attrib['mention_group'] = mnt_set
+        else:
+            mnt.attrib['mention_group'] = 'empty'
+
+    with open(outpath, 'wb') as output_file:
+        output_file.write(etree.tostring(tree, pretty_print=True,
+                                         xml_declaration=True, encoding='UTF-8',
+                                         doctype=u'<!DOCTYPE markables SYSTEM "markables.dtd">'))
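A note on the span attribute read above: fragments are comma-separated, a contiguous range uses '..' between word ids, and punctuation ('interp' tokens) is dropped. Below is a minimal sketch of span_to_words on hand-made data, with hypothetical word ids and values, assuming the package (and the model files referenced by conf.py) is importable:

from corneferencer.inout.mmax import span_to_words

# hypothetical word dicts in the shape produced by get_words()
words = [
    {'id': 'word_1', 'orth': 'Jan', 'base': 'Jan', 'hasnps': False, 'lastinsent': False, 'ctag': 'subst'},
    {'id': 'word_2', 'orth': ',', 'base': ',', 'hasnps': True, 'lastinsent': False, 'ctag': 'interp'},
    {'id': 'word_3', 'orth': 'nasz', 'base': 'nasz', 'hasnps': False, 'lastinsent': False, 'ctag': 'adj'},
    {'id': 'word_4', 'orth': 'kolega', 'base': 'kolega', 'hasnps': False, 'lastinsent': False, 'ctag': 'subst'},
    {'id': 'word_6', 'orth': 'on', 'base': 'on', 'hasnps': False, 'lastinsent': True, 'ctag': 'ppron3'},
]

# one '..' range plus one single-word fragment; the 'interp' comma is skipped
mention_words = span_to_words('word_1..word_4,word_6', words)
print([w['id'] for w in mention_words])  # ['word_1', 'word_3', 'word_4', 'word_6']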
corneferencer/main.py
0 → 100644
+import os
+import sys
+
+from argparse import ArgumentParser
+from natsort import natsorted
+
+sys.path.append(os.path.abspath(os.path.join('..')))
+
+from inout import mmax
+from inout.constants import INPUT_FORMATS
+from resolvers import resolve
+from resolvers.constants import RESOLVERS
+from utils import eprint
+
+
+def main():
+    args = parse_arguments()
+    if not args.input:
+        eprint("Error: Input file(s) not specified!")
+    elif args.resolver not in RESOLVERS:
+        eprint("Error: Unknown resolve algorithm!")
+    elif args.format not in INPUT_FORMATS:
+        eprint("Error: Unknown input file format!")
+    else:
+        process_texts(args.input, args.output, args.format, args.resolver)
+
+
+def parse_arguments():
+    parser = ArgumentParser(description='Corneferencer: coreference resolver using neural nets.')
+    parser.add_argument('-i', '--input', type=str, action='store',
+                        dest='input', default='',
+                        help='input file or dir path')
+    parser.add_argument('-o', '--output', type=str, action='store',
+                        dest='output', default='',
+                        help='output path; if not specified writes output to standard output')
+    parser.add_argument('-f', '--format', type=str, action='store',
+                        dest='format', default='mmax',
+                        help='input format; default: mmax')
+    parser.add_argument('-r', '--resolver', type=str, action='store',
+                        dest='resolver', default='incremental',
+                        help='resolve algorithm; default: incremental; possibilities: %s'
+                             % ', '.join(RESOLVERS))
+
+    args = parser.parse_args()
+    return args
+
+
+def process_texts(inpath, outpath, informat, resolver):
+    if os.path.isdir(inpath):
+        process_directory(inpath, outpath, informat, resolver)
+    elif os.path.isfile(inpath):
+        process_file(inpath, outpath, informat, resolver)
+    else:
+        eprint("Error: Specified input does not exist!")
+
+
+def process_directory(inpath, outpath, informat, resolver):
+    inpath = os.path.abspath(inpath)
+    outpath = os.path.abspath(outpath)
+
+    files = os.listdir(inpath)
+    files = natsorted(files)
+
+    for filename in files:
+        textname = os.path.splitext(os.path.basename(filename))[0]
+        textoutput = os.path.join(outpath, textname)
+        textinput = os.path.join(inpath, filename)
+        process_file(textinput, textoutput, informat, resolver)
+
+
+def process_file(inpath, outpath, informat, resolver):
+    basename = os.path.basename(inpath)
+    if informat == 'mmax' and basename.endswith('.mmax'):
+        print(basename)
+        text = mmax.read(inpath)
+        if resolver == 'incremental':
+            resolve.incremental(text)
+        elif resolver == 'entity_based':
+            resolve.entity_based(text)
+        mmax.write(inpath, outpath, text)
+
+
+if __name__ == '__main__':
+    main()
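For a single document, the command-line flow above reduces to read → resolve → write. A hedged sketch of the same pipeline driven from Python (hypothetical paths; assumes conf.py is importable and its model files are in place):

from corneferencer.inout import mmax
from corneferencer.resolvers import resolve

in_mmax = 'data/in/doc01.mmax'    # hypothetical input document
out_mmax = 'data/out/doc01.mmax'  # hypothetical output location

text = mmax.read(in_mmax)            # Text with Mention objects and their feature vectors
resolve.incremental(text)            # assign coreference sets with the neural pair model
mmax.write(in_mmax, out_mmax, text)  # copy .mmax/_words.xml, rewrite _mentions.xml groups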
corneferencer/readers/__init__.py deleted
corneferencer/entities/__init__.py renamed to corneferencer/resolvers/__init__.py
corneferencer/resolvers/constants.py
0 → 100644
+RESOLVERS = ['entity_based', 'incremental']
corneferencer/resolvers/features.py
0 → 100644
+import numpy
+import random
+
+from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE
+
+
+# mention features
+def head_vec(mention):
+    return list(get_wv(W2V_MODEL, mention.head_base))
+
+
+def first_word_vec(mention):
+    return list(get_wv(W2V_MODEL, mention.words[0]['base']))
+
+
+def last_word_vec(mention):
+    return list(get_wv(W2V_MODEL, mention.words[-1]['base']))
+
+
+def first_after_vec(mention):
+    if len(mention.follow_context) > 0:
+        vec = list(get_wv(W2V_MODEL, mention.follow_context[0]['base']))
+    else:
+        vec = [0.0] * W2V_SIZE
+    return vec
+
+
+def second_after_vec(mention):
+    if len(mention.follow_context) > 1:
+        vec = list(get_wv(W2V_MODEL, mention.follow_context[1]['base']))
+    else:
+        vec = [0.0] * W2V_SIZE
+    return vec
+
+
+def first_before_vec(mention):
+    if len(mention.prec_context) > 0:
+        vec = list(get_wv(W2V_MODEL, mention.prec_context[-1]['base']))
+    else:
+        vec = [0.0] * W2V_SIZE
+    return vec
+
+
+def second_before_vec(mention):
+    if len(mention.prec_context) > 1:
+        vec = list(get_wv(W2V_MODEL, mention.prec_context[-2]['base']))
+    else:
+        vec = [0.0] * W2V_SIZE
+    return vec
+
+
+def preceding_context_vec(mention):
+    return list(get_context_vec(mention.prec_context, W2V_MODEL))
+
+
+def following_context_vec(mention):
+    return list(get_context_vec(mention.follow_context, W2V_MODEL))
+
+
+def mention_vec(mention):
+    return list(get_context_vec(mention.words, W2V_MODEL))
+
+
+def sentence_vec(mention):
+    return list(get_context_vec(mention.sentence, W2V_MODEL))
+
+
+# pair features
+def distances_vec(ante, ana):
+    vec = []
+
+    mnts_intersect = pair_intersect(ante, ana)
+
+    words_dist = [0] * 11
+    words_bucket = 0
+    if mnts_intersect != 1:
+        words_bucket = get_distance_bucket(ana.start_in_words - ante.end_in_words - 1)
+    words_dist[words_bucket] = 1
+    vec.extend(words_dist)
+
+    mentions_dist = [0] * 11
+    mentions_bucket = 0
+    if mnts_intersect != 1:
+        mentions_bucket = get_distance_bucket(ana.position_in_mentions - ante.position_in_mentions - 1)
+    if words_bucket == 10:
+        mentions_bucket = 10
+    mentions_dist[mentions_bucket] = 1
+    vec.extend(mentions_dist)
+
+    vec.append(mnts_intersect)
+
+    return vec
+
+
+def pair_intersect(ante, ana):
+    for ante_word in ante.words:
+        for ana_word in ana.words:
+            if ana_word['id'] == ante_word['id']:
+                return 1
+    return 0
+
+
+def head_match(ante, ana):
+    if ante.head_orth.lower() == ana.head_orth.lower():
+        return 1
+    return 0
+
+
+def exact_match(ante, ana):
+    if ante.text.lower() == ana.text.lower():
+        return 1
+    return 0
+
+
+def base_match(ante, ana):
+    if ante.lemmatized_text.lower() == ana.lemmatized_text.lower():
+        return 1
+    return 0
+
+
+# supporting functions
+def get_wv(model, lemma, use_random_vec=True):
+    vec = None
+    if use_random_vec:
+        vec = random_vec()
+    try:
+        vec = model.wv[lemma]
+    except KeyError:
+        pass
+    except TypeError:
+        pass
+    return vec
+
+
+def random_vec():
+    return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, W2V_SIZE)], dtype=numpy.float32)
+
+
+def get_context_vec(words, model):
+    vec = numpy.zeros(W2V_SIZE, dtype=numpy.float32)
+    unknown_count = 0
+    if len(words) != 0:
+        for word in words:
+            word_vec = get_wv(model, word['base'], RANDOM_WORD_VECTORS)
+            if word_vec is None:
+                unknown_count += 1
+            else:
+                vec += word_vec
+        significant_words = len(words) - unknown_count
+        if significant_words != 0:
+            vec = vec / float(significant_words)
+        else:
+            vec = random_vec()
+    return vec
+
+
+def get_distance_bucket(distance):
+    if 0 <= distance <= 4:
+        return distance
+    elif 5 <= distance <= 7:
+        return 5
+    elif 8 <= distance <= 15:
+        return 6
+    elif 16 <= distance <= 31:
+        return 7
+    elif 32 <= distance <= 63:
+        return 8
+    elif distance >= 64:
+        return 9
+    return 10
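The two 11-slot one-hot distance features above correspond to the buckets returned by get_distance_bucket: distances 0-4 map to themselves, the ranges 5-7, 8-15, 16-31, 32-63, and 64+ each get one bucket, and bucket 10 is the fall-through for negative distances (overlapping or out-of-order mentions). A few spot checks, assuming the models referenced by conf.py are available so the module imports:

from corneferencer.resolvers.features import get_distance_bucket

assert get_distance_bucket(3) == 3      # 0-4: identity
assert get_distance_bucket(6) == 5      # 5-7
assert get_distance_bucket(12) == 6     # 8-15
assert get_distance_bucket(40) == 8     # 32-63
assert get_distance_bucket(200) == 9    # 64 and above
assert get_distance_bucket(-1) == 10    # fall-through bucket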
corneferencer/resolvers/resolve.py
0 → 100644
+from conf import NEURAL_MODEL, THRESHOLD
+from corneferencer.resolvers.vectors import create_pair_vector
+
+
+# incremental resolve algorithm
+def incremental(text):
+    last_set_id = 1
+    for i, ana in enumerate(text.mentions):
+        if i > 0:
+            best_prediction = 0.0
+            best_ante = None
+            # assumed fix: scan the mentions preceding the anaphor (nearest first);
+            # the original slice [:i:-1] walked the following mentions instead
+            for ante in text.mentions[i-1::-1]:
+                pair_vec = create_pair_vector(ante, ana)
+                prediction = NEURAL_MODEL.predict(pair_vec)
+                accuracy = prediction[0]
+                if accuracy > THRESHOLD and accuracy > best_prediction:
+                    best_prediction = accuracy
+                    best_ante = ante
+            if best_ante is not None:
+                if best_ante.set:
+                    ana.set = best_ante.set
+                else:
+                    str_set_id = 'set_%d' % last_set_id
+                    best_ante.set = str_set_id
+                    ana.set = str_set_id
+                    last_set_id += 1
+
+
+# entity based resolve algorithm
+def entity_based(text):
+    sets = []
+    last_set_id = 1
+    for i, ana in enumerate(text.mentions):
+        if i > 0:
+            best_fit = get_best_set(sets, ana)
+            if best_fit is not None:
+                ana.set = best_fit['set_id']
+                best_fit['mentions'].append(ana)
+            else:
+                str_set_id = 'set_%d' % last_set_id
+                sets.append({'set_id': str_set_id,
+                             'mentions': [ana]})
+                ana.set = str_set_id
+                last_set_id += 1
+        else:
+            str_set_id = 'set_%d' % last_set_id
+            sets.append({'set_id': str_set_id,
+                         'mentions': [ana]})
+            ana.set = str_set_id
+            last_set_id += 1
+
+    remove_singletons(sets)
+
+
+def get_best_set(sets, ana):
+    best_prediction = 0.0
+    best_set = None
+    for s in sets:
+        accuracy = predict_set(s['mentions'], ana)
+        if accuracy > THRESHOLD and accuracy >= best_prediction:
+            best_prediction = accuracy
+            best_set = s
+    return best_set
+
+
+def predict_set(mentions, ana):
+    accuracy_sum = 0.0
+    for mnt in mentions:
+        pair_vec = create_pair_vector(mnt, ana)
+        prediction = NEURAL_MODEL.predict(pair_vec)
+        accuracy = prediction[0]
+        accuracy_sum += accuracy
+    return accuracy_sum / float(len(mentions))
+
+
+def remove_singletons(sets):
+    for s in sets:
+        if len(s['mentions']) == 1:
+            s['mentions'][0].set = ''
corneferencer/resolvers/vectors.py
0 → 100644
+import numpy
+
+from corneferencer.resolvers import features
+
+
+# the Keras model expects input of shape (None, NUMBER_OF_FEATURES),
+# so the pair vector is returned as a 2D array of shape (1, n_features)
+def create_pair_vector(ante, ana):
+    vec = []
+    # ante_features = get_mention_features(ante)
+    # vec.extend(ante_features)
+    # ana_features = get_mention_features(ana)
+    # vec.extend(ana_features)
+    vec.extend(ante.features)
+    vec.extend(ana.features)
+    pair_features = get_pair_features(ante, ana)
+    vec.extend(pair_features)
+    return numpy.asarray([vec], dtype=numpy.float32)
+
+
+def get_mention_features(mention):
+    vec = []
+    vec.extend(features.head_vec(mention))
+    vec.extend(features.first_word_vec(mention))
+    vec.extend(features.last_word_vec(mention))
+    vec.extend(features.first_after_vec(mention))
+    vec.extend(features.second_after_vec(mention))
+    vec.extend(features.first_before_vec(mention))
+    vec.extend(features.second_before_vec(mention))
+    vec.extend(features.preceding_context_vec(mention))
+    vec.extend(features.following_context_vec(mention))
+    vec.extend(features.mention_vec(mention))
+    vec.extend(features.sentence_vec(mention))
+    return vec
+
+
+def get_pair_features(ante, ana):
+    vec = []
+    vec.extend(features.distances_vec(ante, ana))
+    vec.append(features.head_match(ante, ana))
+    vec.append(features.exact_match(ante, ana))
+    vec.append(features.base_match(ante, ana))
+    return vec
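The NUMBER_OF_FEATURES = 1126 in conf.py follows directly from the vectors assembled here: each of the two mentions contributes eleven W2V_SIZE-dimensional blocks, and the pair features add two 11-slot distance vectors plus four binary flags. A quick check of that arithmetic:

W2V_SIZE = 50                          # from conf.py
mention_features = 11 * W2V_SIZE       # head, first, last, 2 before, 2 after,
                                       # preceding/following context, mention, sentence vectors
pair_features = 11 + 11 + 1 + 3        # word/mention distance buckets, intersection flag,
                                       # head/exact/base match
assert 2 * mention_features + pair_features == 1126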
corneferencer/utils.py
0 → 100644
+from __future__ import print_function
+
+import sys
+
+from keras.models import Model
+from keras.layers import Input, Dense, Dropout, Activation, BatchNormalization
+
+
+def eprint(*args, **kwargs):
+    print(*args, file=sys.stderr, **kwargs)
+
+
+def initialize_neural_model(number_of_features):
+    inputs = Input(shape=(number_of_features,))
+    output_from_1st_layer = Dense(1000, activation='relu')(inputs)
+    output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)
+    output_from_1st_layer = BatchNormalization()(output_from_1st_layer)
+    output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)
+    output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)
+    output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)
+    output = Dense(1, activation='sigmoid')(output_from_2nd_layer)
+
+    model = Model(inputs, output)
+    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
+    return model
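The network takes one flat pair vector and emits a single sigmoid score, which resolve.py compares against THRESHOLD. A hedged smoke test with random dummy input (assumes keras and numpy are installed; with untrained weights the score itself is meaningless):

import numpy
from corneferencer.utils import initialize_neural_model

model = initialize_neural_model(1126)          # NUMBER_OF_FEATURES from conf.py
dummy_pair = numpy.random.rand(1, 1126).astype(numpy.float32)
score = model.predict(dummy_pair)              # shape (1, 1), value in [0, 1]
print(score[0][0])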
requirements.txt
setup.py deleted