From 04c45e2d8290995034f03db5126026ea08041da0 Mon Sep 17 00:00:00 2001
From: Bartlomiej Niton <bartek.niton@gmail.com>
Date: Tue, 27 Jun 2017 15:54:42 +0200
Subject: [PATCH] Added new features to feature vector.

---
 preparator.py | 198 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
 1 file changed, 179 insertions(+), 19 deletions(-)

diff --git a/preparator.py b/preparator.py
index 1c170b4..2cfdcbb 100644
--- a/preparator.py
+++ b/preparator.py
@@ -14,10 +14,11 @@ from gensim.models.word2vec import Word2Vec
 TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
                                          'test-prepared'))
 TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
                                           'train-prepared'))
+FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst'))
 ANNO_PATH = TEST_PATH
 OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
-                                        'test.csv'))
+                                        'test-20170627.csv'))
 EACH_TEXT_SEPARATELLY = False
 
 CONTEXT = 5
@@ -25,7 +26,12 @@ W2V_SIZE = 50
 MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models',
                                      '%d' % W2V_SIZE,
                                      'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE))
+
+NOUN_TAGS = ['subst', 'ger', 'depr']
+PPRON_TAGS = ['ppron12', 'ppron3']
+ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
 POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-']
+
 NEG_PROPORTION = 1
 
 RANDOM_VECTORS = True
@@ -38,8 +44,9 @@ UNKNONW_WORDS = 0
 
 def main():
     model = Word2Vec.load(MODEL)
+    freq_list = load_freq_list(FREQ_300M_PATH)
     try:
-        create_data_vectors(model)
+        create_data_vectors(model, freq_list)
     finally:
         print 'Unknown words: ', UNKNONW_WORDS
         print 'All words: ', ALL_WORDS
@@ -47,7 +54,20 @@ def main():
         print 'Negatives: ', NEG_COUNT
 
 
-def create_data_vectors(model):
+def load_freq_list(freq_path):
+    freq_list = {}
+    with codecs.open(freq_path, 'r', 'utf-8') as freq_file:
+        lines = freq_file.readlines()
+        for line in lines:
+            line_parts = line.split()
+            freq = int(line_parts[0])
+            base = line_parts[1]
+            if base not in freq_list:
+                freq_list[base] = freq
+    return freq_list
+
+
+def create_data_vectors(model, freq_list):
     features_file = None
     if not EACH_TEXT_SEPARATELLY:
         features_file = codecs.open(OUT_PATH, 'wt', 'utf-8')
@@ -72,7 +92,7 @@ def create_data_vectors(model):
             print len(negatives)
 
             words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
-            mentions_dict = markables_level_2_dict(mentions_path, words_path)
+            mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list)
 
             if EACH_TEXT_SEPARATELLY:
                 text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname)
@@ -185,8 +205,8 @@ def get_mention_features(mention_span, mentions_dict, model):
     mention = get_mention_by_attr(mentions_dict, 'span', mention_span)
 
     if DEBUG:
-        features.append(mention['head_base'])
-    head_vec = get_wv(model, mention['head_base'])
+        features.append(mention['head']['base'])
+    head_vec = get_wv(model, mention['head']['base'])
     features.extend(list(head_vec))
 
     if DEBUG:
@@ -257,9 +277,25 @@ def get_mention_features(mention_span, mentions_dict, model):
     sentence_vec = get_context_vec(mention['sentence'], model)
     features.extend(list(sentence_vec))
 
+    # supplementary features
+    features.extend(mention_type(mention))
+
     return features
 
 
+def mention_type(mention):
+    type_vec = [0] * 4
+    if mention['head']['ctag'] in NOUN_TAGS:
+        type_vec[0] = 1
+    elif mention['head']['ctag'] in PPRON_TAGS:
+        type_vec[1] = 1
+    elif mention['head']['ctag'] in ZERO_TAGS:
+        type_vec[2] = 1
+    else:
+        type_vec[3] = 1
+    return type_vec
+
+
 def get_wv(model, lemma, random=True):
     global ALL_WORDS
     global UNKNONW_WORDS
@@ -330,10 +366,14 @@ def get_pair_features(pair, mentions_dict):
     features.append(exact_match(ante, ana))
     features.append(base_match(ante, ana))
 
-    if len(mentions_dict) > 100:
-        features.append(1)
-    else:
-        features.append(0)
+    # supplementary features
+    features.append(ante_contains_rarest_from_ana(ante, ana))
+    features.extend(agreement(ante, ana, 'gender'))
+    features.extend(agreement(ante, ana, 'number'))
+    features.extend(agreement(ante, ana, 'person'))
+    features.append(is_acronym(ante, ana))
+    features.append(same_sentence(ante, ana))
+    features.append(same_paragraph(ante, ana))
 
     return features
 
@@ -382,7 +422,58 @@ def base_match(ante, ana):
     return 0
 
 
-def markables_level_2_dict(markables_path, words_path, namespace='www.eml.org/NameSpaces/mention'):
+def ante_contains_rarest_from_ana(ante, ana):
+    ana_rarest = ana['rarest']
+    for word in ante['words']:
+        if word['base'] == ana_rarest['base']:
+            return 1
+    return 0
+
+
+def agreement(ante, ana, tag_name):
+    agr_vec = [0] * 3
+    if ante['head'][tag_name] == 'unk' or ana['head'][tag_name] == 'unk':
+        agr_vec[2] = 1
+    elif ante['head'][tag_name] == ana['head'][tag_name]:
+        agr_vec[0] = 1
+    else:
+        agr_vec[1] = 1
+    return agr_vec
+
+
+def is_acronym(ante, ana):
+    if ana['text'].upper() == ana['text']:
+        return check_one_way_acronym(ana['text'], ante['text'])
+    if ante['text'].upper() == ante['text']:
+        return check_one_way_acronym(ante['text'], ana['text'])
+    return 0
+
+
+def check_one_way_acronym(acronym, expression):
+    initials = u''
+    for expr1 in expression.split('-'):
+        for expr2 in expr1.split():
+            expr2 = expr2.strip()
+            if expr2:
+                initials += unicode(expr2[0]).upper()
+    if acronym == initials:
+        return 1
+    return 0
+
+
+def same_sentence(ante, ana):
+    if ante['sentence_id'] == ana['sentence_id']:
+        return 1
+    return 0
+
+
+def same_paragraph(ante, ana):
+    if ante['paragraph_id'] == ana['paragraph_id']:
+        return 1
+    return 0
+
+
+def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'):
     markables_dicts = []
     markables_tree = etree.parse(markables_path)
     markables = markables_tree.xpath("//ns:markable", namespaces={'ns': namespace})
@@ -401,9 +492,9 @@ def markables_level_2_dict(markables_path, words_path, namespace='www.eml.org/Na
             if head_orth not in POSSIBLE_HEADS:
                 mention_words = span_to_words(span, words)
 
-                prec_context, follow_context, sentence, mnt_start_position, mnt_end_position = get_context(mention_words, words)
+                prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words)
 
-                head_base = get_head_base(head_orth, mention_words)
+                head = get_head(head_orth, mention_words)
                 markables_dicts.append({'id': markable.attrib['id'],
                                         'set': markable.attrib['mention_group'],
                                         'text': span_to_text(span, words, 'orth'),
@@ -411,7 +502,7 @@ def markables_level_2_dict(markables_path, words_path, namespace='www.eml.org/Na
                                         'words': mention_words,
                                         'span': span,
                                         'head_orth': head_orth,
-                                        'head_base': head_base,
+                                        'head': head,
                                         'dominant': dominant,
                                         'node': markable,
                                         'prec_context': prec_context,
@@ -419,7 +510,10 @@ def markables_level_2_dict(markables_path, words_path, namespace='www.eml.org/Na
                                         'sentence': sentence,
                                         'position_in_mentions': idx,
                                         'start_in_words': mnt_start_position,
-                                        'end_in_words': mnt_end_position})
+                                        'end_in_words': mnt_end_position,
+                                        'rarest': get_rarest_word(mention_words, freq_list),
+                                        'paragraph_id': paragraph_id,
+                                        'sentence_id': sentence_id})
             else:
                 print 'Zduplikowana wzmianka: %s' % span
 
@@ -427,6 +521,8 @@
 
 
 def get_context(mention_words, words):
+    paragraph_id = 0
+    sentence_id = 0
     prec_context = []
     follow_context = []
     sentence = []
@@ -442,7 +538,11 @@ def get_context(mention_words, words):
             sentence = get_sentence(idx, words)
             mnt_end_position = get_mention_end(last_word, words)
             break
-    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position
+        if word['lastinsent']:
+            sentence_id += 1
+        if word['lastinpar']:
+            paragraph_id += 1
+    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
 
 
 def get_prec_context(mention_start, words):
@@ -514,10 +614,10 @@ def get_sentence_end(words, word_idx):
     return len(words) - 1
 
 
-def get_head_base(head_orth, words):
+def get_head(head_orth, words):
     for word in words:
         if word['orth'].lower() == head_orth.lower() or word['orth'] == head_orth:
-            return word['base']
+            return word
     return None
 
 
@@ -531,15 +631,61 @@ def get_words(filepath):
             lastinsent = False
             if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true':
                 lastinsent = True
+            lastinpar = False
+            if 'lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true':
+                lastinpar = True
             words.append({'id': word.attrib['id'],
                           'orth': word.text,
                           'base': word.attrib['base'],
                           'hasnps': hasnps,
                           'lastinsent': lastinsent,
-                          'ctag': word.attrib['ctag']})
+                          'lastinpar': lastinpar,
+                          'ctag': word.attrib['ctag'],
+                          'msd': word.attrib['msd'],
+                          'gender': get_gender(word.attrib['msd']),
+                          'person': get_person(word.attrib['msd']),
+                          'number': get_number(word.attrib['msd'])})
     return words
 
 
+def get_gender(msd):
+    tags = msd.split(':')
+    if 'm1' in tags:
+        return 'm1'
+    elif 'm2' in tags:
+        return 'm2'
+    elif 'm3' in tags:
+        return 'm3'
+    elif 'f' in tags:
+        return 'f'
+    elif 'n' in tags:
+        return 'n'
+    else:
+        return 'unk'
+
+
+def get_person(msd):
+    tags = msd.split(':')
+    if 'pri' in tags:
+        return 'pri'
+    elif 'sec' in tags:
+        return 'sec'
+    elif 'ter' in tags:
+        return 'ter'
+    else:
+        return 'unk'
+
+
+def get_number(msd):
+    tags = msd.split(':')
+    if 'sg' in tags:
+        return 'sg'
+    elif 'pl' in tags:
+        return 'pl'
+    else:
+        return 'unk'
+
+
 def get_mention_by_attr(mentions, attr_name, value):
     for mention in mentions:
        if mention[attr_name] == value:
@@ -652,5 +798,19 @@ def word_to_ignore(word):
     return False
 
 
+def get_rarest_word(words, freq_list):
+    min_freq = 0
+    rarest_word = words[0]
+    for i, word in enumerate(words):
+        word_freq = 0
+        if word['base'] in freq_list:
+            word_freq = freq_list[word['base']]
+
+        if i == 0 or word_freq < min_freq:
+            min_freq = word_freq
+            rarest_word = word
+    return rarest_word
+
+
 if __name__ == '__main__':
     main()
-- 
libgit2 0.22.2
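
A minimal usage sketch of the new frequency-list helpers (not part of the patch; the base forms and counts below are hypothetical). load_freq_list reads one "<frequency> <base>" pair per line from base.lst, and get_rarest_word returns the mention word whose base form has the lowest corpus frequency, with bases missing from the list counted as frequency 0:

    # freq_list as load_freq_list would build it from a tiny, made-up base.lst
    freq_list = {u'rok': 976524, u'akronim': 341}
    words = [{'base': u'rok'}, {'base': u'akronim'}, {'base': u'neologizm'}]
    # u'neologizm' is absent from the list, so its frequency is 0 and it wins
    print get_rarest_word(words, freq_list)['base']  # -> neologizm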