Commit c2871e0ded5bfb23380ab7c041d4237e1a0c8481
1 parent: 01a04337
Basic evaluation and data preparation scripts.
Showing 3 changed files with 790 additions and 0 deletions
counter.py 0 → 100644
# -*- coding: utf-8 -*-

import os

from lxml import etree
from natsort import natsorted

from preparator import ANNO_PATH


def count_words():
    # Count tokens per text, skipping punctuation (ctag == 'interp').
    anno_files = os.listdir(ANNO_PATH)
    anno_files = natsorted(anno_files)
    for filename in anno_files:
        if filename.endswith('.mmax'):
            words_count = 0
            textname = filename.replace('.mmax', '')
            words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
            tree = etree.parse(words_path)
            for word in tree.xpath("//word"):
                if word.attrib['ctag'] != 'interp':
                    words_count += 1
            print textname, words_count


def count_mentions():
    # Count markables on the mention level of each MMAX annotation.
    anno_files = os.listdir(ANNO_PATH)
    anno_files = natsorted(anno_files)
    for filename in anno_files:
        if filename.endswith('.mmax'):
            textname = filename.replace('.mmax', '')
            mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
            tree = etree.parse(mentions_path)
            mentions = tree.xpath("//ns:markable", namespaces={'ns': 'www.eml.org/NameSpaces/mention'})
            print textname, len(mentions)
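Note: counter.py defines the two counters but has no entry point of its own, so running the file does nothing. A minimal driver (an illustration, not part of the commit) would be:

if __name__ == '__main__':
    count_words()
    count_mentions()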
preparator.py 0 → 100644
# -*- coding: utf-8 -*-

import codecs
import numpy
import os
import random

from lxml import etree
from itertools import combinations
from natsort import natsorted

from gensim.models.word2vec import Word2Vec


TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared'))
TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared'))

ANNO_PATH = TEST_PATH
# A single CSV when EACH_TEXT_SEPARATELY is False; otherwise treated as a
# directory that receives one CSV per text.
OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
                                        'test.csv'))
EACH_TEXT_SEPARATELY = False

CONTEXT = 5  # number of preceding/following context words used as features
W2V_SIZE = 50
MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models',
                                     '%d' % W2V_SIZE,
                                     'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE))
# Mention heads that mark markables to be skipped.
POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-']
NEG_PROPORTION = 1  # negative pairs sampled per positive pair
RANDOM_VECTORS = True

DEBUG = False
POS_COUNT = 0
NEG_COUNT = 0
ALL_WORDS = 0
UNKNOWN_WORDS = 0


def main():
    model = Word2Vec.load(MODEL)
    try:
        create_data_vectors(model)
    finally:
        print 'Unknown words: ', UNKNOWN_WORDS
        print 'All words: ', ALL_WORDS
        print 'Positives: ', POS_COUNT
        print 'Negatives: ', NEG_COUNT


def create_data_vectors(model):
    features_file = None
    if not EACH_TEXT_SEPARATELY:
        features_file = codecs.open(OUT_PATH, 'wt', 'utf-8')

    anno_files = os.listdir(ANNO_PATH)
    anno_files = natsorted(anno_files)
    for filename in anno_files:
        if filename.endswith('.mmax'):
            print '=======> ', filename
            textname = filename.replace('.mmax', '')

            mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
            tree = etree.parse(mentions_path)
            mentions = tree.xpath("//ns:markable", namespaces={'ns': 'www.eml.org/NameSpaces/mention'})
            positives, negatives = diff_mentions(mentions)

            if DEBUG:
                print 'Positives:'
                print len(positives)

                print 'Negatives:'
                print len(negatives)

            words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
            mentions_dict = markables_level_2_dict(mentions_path, words_path)

            if EACH_TEXT_SEPARATELY:
                text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname)
                features_file = codecs.open(text_features_path, 'wt', 'utf-8')
            write_features(features_file, positives, negatives, mentions_dict, model, textname)
            if EACH_TEXT_SEPARATELY:
                # Close each per-text file as soon as it is written.
                features_file.close()

    if not EACH_TEXT_SEPARATELY:
        features_file.close()


def diff_mentions(mentions):
    sets, clustered_mentions = get_sets(mentions)
    positives = get_positives(sets)
    positives, negatives = get_negatives_and_update_positives(clustered_mentions, positives)
    if len(negatives) != len(positives) and NEG_PROPORTION == 1:
        print u'Mismatched numbers of positive and negative examples!'
    return positives, negatives


def get_sets(mentions):
    sets = {}
    clustered_mentions = []
    for mention in mentions:
        set_id = mention.attrib['mention_group']
        if set_id == 'empty' or set_id == '' or mention.attrib['mention_head'] in POSSIBLE_HEADS:
            pass
        elif set_id not in sets:
            sets[set_id] = [mention.attrib['span']]
            clustered_mentions.append(mention.attrib['span'])
        elif set_id in sets:
            sets[set_id].append(mention.attrib['span'])
            clustered_mentions.append(mention.attrib['span'])
        else:
            print u'Something went wrong while collecting clusters!'

    # Singleton clusters yield no coreferent pairs, so drop them.
    sets_to_remove = []
    for set_id in sets:
        if len(sets[set_id]) < 2:
            sets_to_remove.append(set_id)
        if len(sets[set_id]) == 1:
            print u'Removing clustered mention: ', sets[set_id][0]
            clustered_mentions.remove(sets[set_id][0])

    for set_id in sets_to_remove:
        print u'Removing set: ', set_id
        sets.pop(set_id)

    return sets, clustered_mentions


def get_positives(sets):
    # Every unordered pair within a coreference cluster is a positive example.
    positives = []
    for set_id in sets:
        coref_set = sets[set_id]
        positives.extend(list(combinations(coref_set, 2)))
    return positives


def get_negatives_and_update_positives(clustered_mentions, positives):
    all_pairs = list(combinations(clustered_mentions, 2))
    all_pairs = set(all_pairs)
    negatives = [pair for pair in all_pairs if pair not in positives]
    samples_count = NEG_PROPORTION * len(positives)
    if samples_count > len(negatives):
        samples_count = len(negatives)
        if NEG_PROPORTION == 1:
            # Fewer negatives than positives: downsample the positives to match.
            positives = random.sample(set(positives), samples_count)
            print u'More positive cases than negative ones!'
    negatives = random.sample(set(negatives), samples_count)
    return positives, negatives
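For intuition about the pair construction above: a cluster of three mentions yields three positive pairs, and negatives are drawn from the remaining pairs over all clustered mentions, then sampled down. A tiny illustration with hypothetical spans:

from itertools import combinations

cluster = ['word_1', 'word_3..word_4', 'word_9']  # one coreference cluster
positives = list(combinations(cluster, 2))
# -> [('word_1', 'word_3..word_4'), ('word_1', 'word_9'),
#     ('word_3..word_4', 'word_9')]
# Negatives pair these spans with clustered mentions from other clusters,
# sampled down to NEG_PROPORTION * len(positives).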
def write_features(features_file, positives, negatives, mentions_dict, model, textname):
    # Each row is tab-separated: mention and pair features, then the label
    # (1 = coreferent, 0 = not) as the last column.
    global POS_COUNT
    POS_COUNT += len(positives)
    for pair in positives:
        pair_features = []
        if DEBUG:
            pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
        pair_features.extend(get_features(pair, mentions_dict, model))
        pair_features.append(1)
        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))

    global NEG_COUNT
    NEG_COUNT += len(negatives)
    for pair in negatives:
        pair_features = []
        if DEBUG:
            pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
        pair_features.extend(get_features(pair, mentions_dict, model))
        pair_features.append(0)
        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))


def get_features(pair, mentions_dict, model):
    features = []
    ante = pair[0]
    ana = pair[1]
    ante_features = get_mention_features(ante, mentions_dict, model)
    features.extend(ante_features)
    ana_features = get_mention_features(ana, mentions_dict, model)
    features.extend(ana_features)
    pair_features = get_pair_features(pair, mentions_dict)
    features.extend(pair_features)
    return features


def get_mention_features(mention_span, mentions_dict, model):
    # Eleven W2V_SIZE-dimensional blocks per mention: head, first word,
    # last word, two following words, two preceding words, and averaged
    # vectors for the preceding context, following context, the mention
    # itself and its sentence.
    features = []
    mention = get_mention_by_attr(mentions_dict, 'span', mention_span)

    if DEBUG:
        features.append(mention['head_base'])
    head_vec = get_wv(model, mention['head_base'])
    features.extend(list(head_vec))

    if DEBUG:
        features.append(mention['words'][0]['base'])
    first_vec = get_wv(model, mention['words'][0]['base'])
    features.extend(list(first_vec))

    if DEBUG:
        features.append(mention['words'][-1]['base'])
    last_vec = get_wv(model, mention['words'][-1]['base'])
    features.extend(list(last_vec))

    if len(mention['follow_context']) > 0:
        if DEBUG:
            features.append(mention['follow_context'][0]['base'])
        after_1_vec = get_wv(model, mention['follow_context'][0]['base'])
        features.extend(list(after_1_vec))
    else:
        if DEBUG:
            features.append('None')
        features.extend([0.0] * W2V_SIZE)
    if len(mention['follow_context']) > 1:
        if DEBUG:
            features.append(mention['follow_context'][1]['base'])
        after_2_vec = get_wv(model, mention['follow_context'][1]['base'])
        features.extend(list(after_2_vec))
    else:
        if DEBUG:
            features.append('None')
        features.extend([0.0] * W2V_SIZE)

    if len(mention['prec_context']) > 0:
        if DEBUG:
            features.append(mention['prec_context'][-1]['base'])
        prec_1_vec = get_wv(model, mention['prec_context'][-1]['base'])
        features.extend(list(prec_1_vec))
    else:
        if DEBUG:
            features.append('None')
        features.extend([0.0] * W2V_SIZE)
    if len(mention['prec_context']) > 1:
        if DEBUG:
            features.append(mention['prec_context'][-2]['base'])
        prec_2_vec = get_wv(model, mention['prec_context'][-2]['base'])
        features.extend(list(prec_2_vec))
    else:
        if DEBUG:
            features.append('None')
        features.extend([0.0] * W2V_SIZE)

    if DEBUG:
        features.append(u' '.join([word['orth'] for word in mention['prec_context']]))
    prec_vec = get_context_vec(mention['prec_context'], model)
    features.extend(list(prec_vec))

    if DEBUG:
        features.append(u' '.join([word['orth'] for word in mention['follow_context']]))
    follow_vec = get_context_vec(mention['follow_context'], model)
    features.extend(list(follow_vec))

    if DEBUG:
        features.append(u' '.join([word['orth'] for word in mention['words']]))
    mention_vec = get_context_vec(mention['words'], model)
    features.extend(list(mention_vec))

    if DEBUG:
        features.append(u' '.join([word['orth'] for word in mention['sentence']]))
    sentence_vec = get_context_vec(mention['sentence'], model)
    features.extend(list(sentence_vec))

    return features


def get_wv(model, lemma, use_random=True):
    # Look the lemma up in the word2vec model; for unknown lemmas fall back
    # to a random vector when use_random is set, otherwise return None.
    # (The parameter name use_random avoids shadowing the random module.)
    global ALL_WORDS
    global UNKNOWN_WORDS
    vec = None
    if use_random:
        vec = random_vec()
    ALL_WORDS += 1
    try:
        vec = model.wv[lemma]
    except KeyError:
        UNKNOWN_WORDS += 1
    return vec


def random_vec():
    return numpy.asarray([random.uniform(-0.25, 0.25) for i in range(0, W2V_SIZE)], dtype=numpy.float32)


def get_context_vec(words, model):
    # Average vector of the known words; an all-unknown context gets a
    # random vector, an empty context stays a zero vector.
    vec = numpy.zeros(W2V_SIZE, dtype=numpy.float32)
    unknown_count = 0
    if len(words) != 0:
        for word in words:
            word_vec = get_wv(model, word['base'], RANDOM_VECTORS)
            if word_vec is None:
                unknown_count += 1
            else:
                vec += word_vec
        significant_words = len(words) - unknown_count
        if significant_words != 0:
            vec = vec/float(significant_words)
        else:
            vec = random_vec()
    return vec


def get_pair_features(pair, mentions_dict):
    ante = get_mention_by_attr(mentions_dict, 'span', pair[0])
    ana = get_mention_by_attr(mentions_dict, 'span', pair[1])

    features = []
    mnts_intersect = pair_intersect(ante, ana)

    # One-hot bucketed distance between the mentions, counted in words.
    words_dist = [0] * 11
    words_bucket = 0
    if mnts_intersect != 1:
        words_bucket = get_distance_bucket(ana['start_in_words'] - ante['end_in_words'] - 1)
    if DEBUG:
        features.append('Bucket %d' % words_bucket)
    words_dist[words_bucket] = 1
    features.extend(words_dist)

    # One-hot bucketed distance counted in mentions.
    mentions_dist = [0] * 11
    mentions_bucket = 0
    if mnts_intersect != 1:
        mentions_bucket = get_distance_bucket(ana['position_in_mentions'] - ante['position_in_mentions'] - 1)
    if words_bucket == 10:
        mentions_bucket = 10
    if DEBUG:
        features.append('Bucket %d' % mentions_bucket)
    mentions_dist[mentions_bucket] = 1
    features.extend(mentions_dist)

    if DEBUG:
        features.append('Other features')
    features.append(mnts_intersect)
    features.append(head_match(ante, ana))
    features.append(exact_match(ante, ana))
    features.append(base_match(ante, ana))

    # Flag for long texts (more than 100 mentions).
    if len(mentions_dict) > 100:
        features.append(1)
    else:
        features.append(0)

    return features


def get_distance_bucket(distance):
    # Distances 0-4 map to buckets 0-4, then widening ranges map to 5-9;
    # bucket 10 flags an invalid (negative) distance.
    if distance >= 0 and distance <= 4:
        return distance
    elif distance >= 5 and distance <= 7:
        return 5
    elif distance >= 8 and distance <= 15:
        return 6
    elif distance >= 16 and distance <= 31:
        return 7
    elif distance >= 32 and distance <= 63:
        return 8
    elif distance >= 64:
        return 9
    else:
        print u'Something went wrong during bucketing!!'
        return 10
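A few sanity checks of the bucketing scheme (illustrative only, not part of the commit):

assert get_distance_bucket(0) == 0    # 0-4 map to themselves
assert get_distance_bucket(6) == 5    # 5-7
assert get_distance_bucket(12) == 6   # 8-15
assert get_distance_bucket(40) == 8   # 32-63
assert get_distance_bucket(500) == 9  # 64 and above
assert get_distance_bucket(-1) == 10  # invalid distance, flagged separately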
def pair_intersect(ante, ana):
    # 1 if the two mentions share at least one word, 0 otherwise.
    for ante_word in ante['words']:
        for ana_word in ana['words']:
            if ana_word['id'] == ante_word['id']:
                return 1
    return 0


def head_match(ante, ana):
    if ante['head_orth'].lower() == ana['head_orth'].lower():
        return 1
    return 0


def exact_match(ante, ana):
    if ante['text'].lower() == ana['text'].lower():
        return 1
    return 0


def base_match(ante, ana):
    if ante['lemmatized_text'].lower() == ana['lemmatized_text'].lower():
        return 1
    return 0


def markables_level_2_dict(markables_path, words_path, namespace='www.eml.org/NameSpaces/mention'):
    # Join the markable annotations with the words layer, producing one dict
    # per mention with its text, context, sentence and position information.
    markables_dicts = []
    markables_tree = etree.parse(markables_path)
    markables = markables_tree.xpath("//ns:markable", namespaces={'ns': namespace})

    words = get_words(words_path)

    for idx, markable in enumerate(markables):
        span = markable.attrib['span']
        if not get_mention_by_attr(markables_dicts, 'span', span):

            dominant = ''
            if 'dominant' in markable.attrib:
                dominant = markable.attrib['dominant']

            head_orth = markable.attrib['mention_head']
            if head_orth not in POSSIBLE_HEADS:
                mention_words = span_to_words(span, words)

                prec_context, follow_context, sentence, mnt_start_position, mnt_end_position = get_context(mention_words, words)

                head_base = get_head_base(head_orth, mention_words)
                markables_dicts.append({'id': markable.attrib['id'],
                                        'set': markable.attrib['mention_group'],
                                        'text': span_to_text(span, words, 'orth'),
                                        'lemmatized_text': span_to_text(span, words, 'base'),
                                        'words': mention_words,
                                        'span': span,
                                        'head_orth': head_orth,
                                        'head_base': head_base,
                                        'dominant': dominant,
                                        'node': markable,
                                        'prec_context': prec_context,
                                        'follow_context': follow_context,
                                        'sentence': sentence,
                                        'position_in_mentions': idx,
                                        'start_in_words': mnt_start_position,
                                        'end_in_words': mnt_end_position})
        else:
            print 'Duplicate mention: %s' % span

    return markables_dicts


def get_context(mention_words, words):
    prec_context = []
    follow_context = []
    sentence = []
    mnt_start_position = -1
    mnt_end_position = -1
    first_word = mention_words[0]
    last_word = mention_words[-1]
    for idx, word in enumerate(words):
        if word['id'] == first_word['id']:
            prec_context = get_prec_context(idx, words)
            mnt_start_position = get_mention_start(first_word, words)
        if word['id'] == last_word['id']:
            follow_context = get_follow_context(idx, words)
            sentence = get_sentence(idx, words)
            mnt_end_position = get_mention_end(last_word, words)
            break
    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position


def get_prec_context(mention_start, words):
    # Up to CONTEXT non-punctuation words preceding the mention.
    context = []
    context_start = mention_start - 1
    while context_start >= 0:
        if not word_to_ignore(words[context_start]):
            context.append(words[context_start])
        if len(context) == CONTEXT:
            break
        context_start -= 1
    context.reverse()
    return context


def get_mention_start(first_word, words):
    start = 0
    for word in words:
        if not word_to_ignore(word):
            start += 1
        if word['id'] == first_word['id']:
            break
    return start


def get_mention_end(last_word, words):
    end = 0
    for word in words:
        if not word_to_ignore(word):
            end += 1
        if word['id'] == last_word['id']:
            break
    return end


def get_follow_context(mention_end, words):
    # Up to CONTEXT non-punctuation words following the mention.
    context = []
    context_end = mention_end + 1
    while context_end < len(words):
        if not word_to_ignore(words[context_end]):
            context.append(words[context_end])
        if len(context) == CONTEXT:
            break
        context_end += 1
    return context


def get_sentence(word_idx, words):
    sentence_start = get_sentence_start(words, word_idx)
    sentence_end = get_sentence_end(words, word_idx)
    sentence = [word for word in words[sentence_start:sentence_end+1] if not word_to_ignore(word)]
    return sentence


def get_sentence_start(words, word_idx):
    search_start = word_idx
    while word_idx >= 0:
        if words[word_idx]['lastinsent'] and search_start != word_idx:
            return word_idx+1
        word_idx -= 1
    return 0


def get_sentence_end(words, word_idx):
    while word_idx < len(words):
        if words[word_idx]['lastinsent']:
            return word_idx
        word_idx += 1
    return len(words) - 1


def get_head_base(head_orth, words):
    for word in words:
        if word['orth'].lower() == head_orth.lower() or word['orth'] == head_orth:
            return word['base']
    return None


def get_words(filepath):
    # Read the words layer; 'hasnps' marks words not preceded by a space
    # (see to_text) and 'lastinsent' marks sentence-final words.
    tree = etree.parse(filepath)
    words = []
    for word in tree.xpath("//word"):
        hasnps = False
        if 'hasnps' in word.attrib and word.attrib['hasnps'] == 'true':
            hasnps = True
        lastinsent = False
        if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true':
            lastinsent = True
        words.append({'id': word.attrib['id'],
                      'orth': word.text,
                      'base': word.attrib['base'],
                      'hasnps': hasnps,
                      'lastinsent': lastinsent,
                      'ctag': word.attrib['ctag']})
    return words
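A quick way to sanity-check get_words is a throwaway XML file containing only the attributes the parser reads (a hypothetical two-word document, not corpus data):

import tempfile

xml = (u'<words>\n'
       u'<word id="word_1" base="kot" ctag="subst">Kot</word>\n'
       u'<word id="word_2" base="." ctag="interp" lastinsent="true">.</word>\n'
       u'</words>\n')
tmp = tempfile.NamedTemporaryFile(suffix='.xml', delete=False)
tmp.write(xml.encode('utf-8'))
tmp.close()
print get_words(tmp.name)
# -> two dicts; the second has ctag 'interp', so word_to_ignore will skip it
#    in all downstream processing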
def get_mention_by_attr(mentions, attr_name, value):
    for mention in mentions:
        if mention[attr_name] == value:
            return mention
    return None


def get_mention_index_by_attr(mentions, attr_name, value):
    for idx, mention in enumerate(mentions):
        if mention[attr_name] == value:
            return idx
    return None


def span_to_text(span, words, form):
    fragments = span.split(',')
    mention_parts = []
    for fragment in fragments:
        mention_parts.append(fragment_to_text(fragment, words, form))
    return u' [...] '.join(mention_parts)


def fragment_to_text(fragment, words, form):
    if '..' in fragment:
        text = get_multiword_text(fragment, words, form)
    else:
        text = get_one_word_text(fragment, words, form)
    return text


def get_multiword_text(fragment, words, form):
    mention_parts = []
    boundaries = fragment.split('..')
    start_id = boundaries[0]
    end_id = boundaries[1]
    in_string = False
    for word in words:
        if word['id'] == start_id:
            in_string = True
        if in_string and not word_to_ignore(word):
            mention_parts.append(word)
        if word['id'] == end_id:
            break
    return to_text(mention_parts, form)


def to_text(words, form):
    text = ''
    for idx, word in enumerate(words):
        if word['hasnps'] or idx == 0:
            text += word[form]
        else:
            text += u' %s' % word[form]
    return text


def get_one_word_text(word_id, words, form):
    this_word = (word for word in words if word['id'] == word_id).next()
    if word_to_ignore(this_word):
        print this_word
    return this_word[form]
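The MMAX span syntax handled above and below is a comma-separated list of fragments, each either a single word id or a range like 'word_3..word_5'. An illustrative call on hypothetical data:

words = [{'id': 'word_%d' % i, 'orth': 'w%d' % i, 'base': 'w%d' % i,
          'hasnps': False, 'lastinsent': False, 'ctag': 'subst'}
         for i in range(1, 9)]
print span_to_text('word_3..word_5,word_7', words, 'orth')
# -> u'w3 w4 w5 [...] w7' (discontinuous fragments joined with ' [...] ')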
def span_to_words(span, words):
    fragments = span.split(',')
    mention_parts = []
    for fragment in fragments:
        mention_parts.extend(fragment_to_words(fragment, words))
    return mention_parts


def fragment_to_words(fragment, words):
    mention_parts = []
    if '..' in fragment:
        mention_parts.extend(get_multiword(fragment, words))
    else:
        mention_parts.extend(get_word(fragment, words))
    return mention_parts


def get_multiword(fragment, words):
    mention_parts = []
    boundaries = fragment.split('..')
    start_id = boundaries[0]
    end_id = boundaries[1]
    in_string = False
    for word in words:
        if word['id'] == start_id:
            in_string = True
        if in_string and not word_to_ignore(word):
            mention_parts.append(word)
        if word['id'] == end_id:
            break
    return mention_parts


def get_word(word_id, words):
    for word in words:
        if word['id'] == word_id:
            if not word_to_ignore(word):
                return [word]
            else:
                return []
    return []


def word_to_ignore(word):
    if word['ctag'] == 'interp':
        return True
    return False


if __name__ == '__main__':
    main()
resolver.py 0 → 100644
# -*- coding: utf-8 -*-

import codecs
import os

import numpy as np

from natsort import natsorted

from keras.models import Model
from keras.layers import Input, Dense, Dropout, BatchNormalization

IN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
                                       'prepared_text_files'))
OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
                                        'metrics.csv'))

MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'weights_2017_05_10.h5'))


# Width of a feature row written by preparator.py; the label is the one
# extra column after these.
NUMBER_OF_FEATURES = 1126
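Nothing ties NUMBER_OF_FEATURES to what preparator.py actually wrote, and a mismatch silently shifts the label column. A cheap guard (an illustration, not part of the commit; ndmin=2 keeps single-row files two-dimensional):

data = np.loadtxt(open(text_data_path, 'rt'), delimiter='\t', ndmin=2)
assert data.shape[1] == NUMBER_OF_FEATURES + 1, 'unexpected column count: %d' % data.shape[1]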
def main():
    resolve_files()


def resolve_files():
    metrics_file = codecs.open(OUT_PATH, 'w', 'utf-8')
    write_labels(metrics_file)

    anno_files = os.listdir(IN_PATH)
    anno_files = natsorted(anno_files)
    for filename in anno_files:
        print filename
        textname = filename.replace('.csv', '')
        text_data_path = os.path.join(IN_PATH, filename)
        resolve(textname, text_data_path, metrics_file)

    metrics_file.close()


def write_labels(metrics_file):
    metrics_file.write('Text\tAccuracy\tPrecision\tRecall\tF1\tPairs\n')
def resolve(textname, text_data_path, metrics_file):
    raw_data = open(text_data_path, 'rt')
    test_data = np.loadtxt(raw_data, delimiter='\t')
    test_set = test_data[:, 0:NUMBER_OF_FEATURES]
    test_labels = test_data[:, NUMBER_OF_FEATURES]  # the last column holds the labels

    # The architecture must match the one the saved weights were trained
    # with: NUMBER_OF_FEATURES -> 1000 -> 500 -> 1, with dropout and batch
    # normalization after each hidden layer.
    inputs = Input(shape=(NUMBER_OF_FEATURES,))
    output_from_1st_layer = Dense(1000, activation='relu')(inputs)
    output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)
    output_from_1st_layer = BatchNormalization()(output_from_1st_layer)
    output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)
    output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)
    output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)
    output = Dense(1, activation='sigmoid')(output_from_2nd_layer)

    model = Model(inputs, output)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.load_weights(MODEL)

    predictions = model.predict(test_set)

    calc_metrics(textname, test_set, test_labels, predictions, metrics_file)
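resolve rebuilds the network and reloads the weights for every input file, which is wasteful. A possible refactor (a sketch, not part of the commit) builds it once in main() and passes it down:

def build_network():
    # Hypothetical helper: the same architecture as in resolve(), constructed
    # a single time and reused for all files.
    inputs = Input(shape=(NUMBER_OF_FEATURES,))
    x = Dense(1000, activation='relu')(inputs)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    x = Dense(500, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)
    output = Dense(1, activation='sigmoid')(x)
    model = Model(inputs, output)
    model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
    model.load_weights(MODEL)
    return model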
def calc_metrics(textname, test_set, test_labels, predictions, metrics_file):
    # Threshold predictions at 0.5 and count the confusion-matrix cells.
    # Note: this assumes each file contains both predicted and actual
    # positives; otherwise precision or recall divides by zero.
    true_positives = 0.0
    false_positives = 0.0
    true_negatives = 0.0
    false_negatives = 0.0

    for i in range(len(test_set)):
        if predictions[i] < 0.5 and test_labels[i] == 0: true_negatives += 1
        if predictions[i] < 0.5 and test_labels[i] == 1: false_negatives += 1
        if predictions[i] >= 0.5 and test_labels[i] == 1: true_positives += 1
        if predictions[i] >= 0.5 and test_labels[i] == 0: false_positives += 1

    accuracy = (true_positives + true_negatives) / len(test_set)
    precision = true_positives / (true_positives + false_positives)
    recall = true_positives / (true_positives + false_negatives)
    f1 = 2 * (precision * recall) / (precision + recall)

    metrics_file.write('%s\t%s\t%s\t%s\t%s\t%s\n' % (textname,
                                                     repr(accuracy),
                                                     repr(precision),
                                                     repr(recall),
                                                     repr(f1),
                                                     repr(len(test_set))))
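A quick numeric check of the metric formulas (hypothetical counts, not real results):

tp, fp, fn, tn = 6.0, 2.0, 3.0, 9.0
precision = tp / (tp + fp)                            # 0.75
recall = tp / (tp + fn)                               # 0.666...
f1 = 2 * (precision * recall) / (precision + recall)  # ~0.706
accuracy = (tp + tn) / (tp + fp + fn + tn)            # 0.75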
if __name__ == '__main__':
    main()