Commit 44d0a5f290b4b8ecc9f182d3ebdd389905ca8a8c

Authored by Bartłomiej Nitoń
1 parent ba27568f

Added 21 new features.

conf.py
... ... @@ -2,7 +2,7 @@ import os
2 2  
3 3 from gensim.models.word2vec import Word2Vec
4 4  
5   -from corneferencer.utils import initialize_neural_model
  5 +from corneferencer.utils import initialize_neural_model, load_freq_list
6 6  
7 7  
8 8 CONTEXT = 5
... ... @@ -11,9 +11,10 @@ RANDOM_WORD_VECTORS = True
11 11 W2V_SIZE = 50
12 12 W2V_MODEL_NAME = 'w2v_allwiki_nkjpfull_50.model'
13 13  
14   -NUMBER_OF_FEATURES = 1126
15   -NEURAL_MODEL_NAME = 'weights_2017_05_10.h5'
  14 +NUMBER_OF_FEATURES = 1147
  15 +NEURAL_MODEL_NAME = 'model_1147_features.h5'
16 16  
  17 +FREQ_LIST_NAME = 'base.lst'
17 18  
18 19 # do not change that
19 20 W2V_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', W2V_MODEL_NAME)
... ... @@ -21,3 +22,6 @@ W2V_MODEL = Word2Vec.load(W2V_MODEL_PATH)
21 22  
22 23 NEURAL_MODEL_PATH = os.path.join(os.path.dirname(__file__), 'models', NEURAL_MODEL_NAME)
23 24 NEURAL_MODEL = initialize_neural_model(NUMBER_OF_FEATURES, NEURAL_MODEL_PATH)
  25 +
  26 +FREQ_LIST_PATH = os.path.join(os.path.dirname(__file__), 'freq', FREQ_LIST_NAME)
  27 +FREQ_LIST = load_freq_list(FREQ_LIST_PATH)
... ...
corneferencer/entities.py
... ... @@ -17,9 +17,9 @@ class Text:
17 17 class Mention:
18 18  
19 19 def __init__(self, mnt_id, text, lemmatized_text, words, span,
20   - head_orth, head_base, dominant, node, prec_context,
  20 + head_orth, head, dominant, node, prec_context,
21 21 follow_context, sentence, position_in_mentions,
22   - start_in_words, end_in_words):
  22 + start_in_words, end_in_words, rarest, paragraph_id, sentence_id):
23 23 self.id = mnt_id
24 24 self.set = ''
25 25 self.old_set = ''
... ... @@ -28,7 +28,7 @@ class Mention:
28 28 self.words = words
29 29 self.span = span
30 30 self.head_orth = head_orth
31   - self.head_base = head_base
  31 + self.head = head
32 32 self.dominant = dominant
33 33 self.node = node
34 34 self.prec_context = prec_context
... ... @@ -38,3 +38,6 @@ class Mention:
38 38 self.start_in_words = start_in_words
39 39 self.end_in_words = end_in_words
40 40 self.features = get_mention_features(self)
  41 + self.rarest = rarest
  42 + self.paragraph_id = paragraph_id
  43 + self.sentence_id = sentence_id
... ...
corneferencer/inout/mmax.py
... ... @@ -3,7 +3,7 @@ import shutil
3 3  
4 4 from lxml import etree
5 5  
6   -from conf import CONTEXT
  6 +from conf import CONTEXT, FREQ_LIST
7 7 from corneferencer.entities import Mention, Text
8 8  
9 9  
... ... @@ -38,16 +38,17 @@ def read_mentions(mentions_path, words_path):
38 38 mention_words = span_to_words(span, words)
39 39  
40 40 (prec_context, follow_context, sentence,
41   - mnt_start_position, mnt_end_position) = get_context(mention_words, words)
  41 + mnt_start_position, mnt_end_position,
  42 + paragraph_id, sentence_id) = get_context(mention_words, words)
42 43  
43   - head_base = get_head_base(head_orth, mention_words)
  44 + head = get_head(head_orth, mention_words)
44 45 mention = Mention(mnt_id=markable.attrib['id'],
45 46 text=span_to_text(span, words, 'orth'),
46 47 lemmatized_text=span_to_text(span, words, 'base'),
47 48 words=mention_words,
48 49 span=span,
49 50 head_orth=head_orth,
50   - head_base=head_base,
  51 + head=head,
51 52 dominant=dominant,
52 53 node=markable,
53 54 prec_context=prec_context,
... ... @@ -55,7 +56,10 @@ def read_mentions(mentions_path, words_path):
55 56 sentence=sentence,
56 57 position_in_mentions=idx,
57 58 start_in_words=mnt_start_position,
58   - end_in_words=mnt_end_position)
  59 + end_in_words=mnt_end_position,
  60 + rarest=get_rarest_word(mention_words),
  61 + paragraph_id=paragraph_id,
  62 + sentence_id=sentence_id)
59 63 mentions.append(mention)
60 64  
61 65 return mentions
... ... @@ -71,12 +75,20 @@ def get_words(filepath):
71 75 lastinsent = False
72 76 if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true':
73 77 lastinsent = True
  78 + lastinpar = False
  79 + if 'lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true':
  80 + lastinpar = True
74 81 words.append({'id': word.attrib['id'],
75 82 'orth': word.text,
76 83 'base': word.attrib['base'],
77 84 'hasnps': hasnps,
78 85 'lastinsent': lastinsent,
79   - 'ctag': word.attrib['ctag']})
  86 + 'lastinpar': lastinpar,
  87 + 'ctag': word.attrib['ctag'],
  88 + 'msd': word.attrib['msd'],
  89 + 'gender': get_gender(word.attrib['msd']),
  90 + 'person': get_person(word.attrib['msd']),
  91 + 'number': get_number(word.attrib['msd'])})
80 92 return words
81 93  
82 94  
... ... @@ -130,6 +142,8 @@ def word_to_ignore(word):
130 142  
131 143  
132 144 def get_context(mention_words, words):
  145 + paragraph_id = 0
  146 + sentence_id = 0
133 147 prec_context = []
134 148 follow_context = []
135 149 sentence = []
... ... @@ -146,7 +160,11 @@ def get_context(mention_words, words):
146 160 sentence = get_sentence(idx, words)
147 161 mnt_end_position = get_mention_end(last_word, words)
148 162 break
149   - return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position
  163 + if word['lastinsent']:
  164 + sentence_id += 1
  165 + if word['lastinpar']:
  166 + paragraph_id += 1
  167 + return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
150 168  
151 169  
152 170 def get_prec_context(mention_start, words):
... ... @@ -218,10 +236,10 @@ def get_sentence_end(words, word_idx):
218 236 return len(words) - 1
219 237  
220 238  
221   -def get_head_base(head_orth, words):
  239 +def get_head(head_orth, words):
222 240 for word in words:
223 241 if word['orth'].lower() == head_orth.lower() or word['orth'] == head_orth:
224   - return word['base']
  242 + return word
225 243 return None
226 244  
227 245  
... ... @@ -272,6 +290,58 @@ def get_one_word_text(word_id, words, form):
272 290 return this_word[form]
273 291  
274 292  
  293 +def get_gender(msd):
  294 + tags = msd.split(':')
  295 + if 'm1' in tags:
  296 + return 'm1'
  297 + elif 'm2' in tags:
  298 + return 'm2'
  299 + elif 'm3' in tags:
  300 + return 'm3'
  301 + elif 'f' in tags:
  302 + return 'f'
  303 + elif 'n' in tags:
  304 + return 'n'
  305 + else:
  306 + return 'unk'
  307 +
  308 +
  309 +def get_person(msd):
  310 + tags = msd.split(':')
  311 + if 'pri' in tags:
  312 + return 'pri'
  313 + elif 'sec' in tags:
  314 + return 'sec'
  315 + elif 'ter' in tags:
  316 + return 'ter'
  317 + else:
  318 + return 'unk'
  319 +
  320 +
  321 +def get_number(msd):
  322 + tags = msd.split(':')
  323 + if 'sg' in tags:
  324 + return 'sg'
  325 + elif 'pl' in tags:
  326 + return 'pl'
  327 + else:
  328 + return 'unk'
  329 +
  330 +
  331 +def get_rarest_word(words):
  332 + min_freq = 0
  333 + rarest_word = words[0]
  334 + for i, word in enumerate(words):
  335 + word_freq = 0
  336 + if word['base'] in FREQ_LIST:
  337 + word_freq = FREQ_LIST[word['base']]
  338 +
  339 + if i == 0 or word_freq < min_freq:
  340 + min_freq = word_freq
  341 + rarest_word = word
  342 + return rarest_word
  343 +
  344 +
275 345 def write(inpath, outpath, text):
276 346 textname = os.path.splitext(os.path.basename(inpath))[0]
277 347 intextdir = os.path.dirname(inpath)
... ...
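A note on the new mmax.py helpers, since the diff alone doesn't show them in use: get_gender, get_person and get_number just scan the colon-separated msd attribute, and get_rarest_word picks the mention word whose base form has the lowest count in FREQ_LIST (words missing from the list count as 0, so they win). A rough sketch under assumed inputs; the msd string and the frequency numbers below are invented for illustration:

    # assumed NKJP-style msd value, purely illustrative
    get_gender('sg:nom:m1')  # -> 'm1'
    get_number('sg:nom:m1')  # -> 'sg'
    get_person('sg:nom:m1')  # -> 'unk' (no pri/sec/ter segment)

    # with FREQ_LIST = {'kot': 900, 'dachowiec': 3} (made-up counts),
    # a mention whose words have bases 'kot' and 'dachowiec' yields:
    get_rarest_word(mention_words)['base']  # -> 'dachowiec'
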
corneferencer/resolvers/constants.py
1 1 RESOLVERS = ['entity_based', 'incremental']
  2 +
  3 +NOUN_TAGS = ['subst', 'ger', 'depr']
  4 +PPRON_TAGS = ['ppron12', 'ppron3']
  5 +ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
... ...
corneferencer/resolvers/features.py
... ... @@ -2,11 +2,12 @@ import numpy
2 2 import random
3 3  
4 4 from conf import RANDOM_WORD_VECTORS, W2V_MODEL, W2V_SIZE
  5 +from corneferencer.resolvers import constants
5 6  
6 7  
7 8 # mention features
8 9 def head_vec(mention):
9   - return list(get_wv(W2V_MODEL, mention.head_base))
  10 + return list(get_wv(W2V_MODEL, mention.head['base']))
10 11  
11 12  
12 13 def first_word_vec(mention):
... ... @@ -65,6 +66,19 @@ def sentence_vec(mention):
65 66 return list(get_context_vec(mention.sentence, W2V_MODEL))
66 67  
67 68  
  69 +def mention_type(mention):
  70 + type_vec = [0] * 4
  71 + if mention.head['ctag'] in constants.NOUN_TAGS:
  72 + type_vec[0] = 1
  73 + elif mention.head['ctag'] in constants.PPRON_TAGS:
  74 + type_vec[1] = 1
  75 + elif mention.head['ctag'] in constants.ZERO_TAGS:
  76 + type_vec[2] = 1
  77 + else:
  78 + type_vec[3] = 1
  79 + return type_vec
  80 +
  81 +
68 82 # pair features
69 83 def distances_vec(ante, ana):
70 84 vec = []
... ... @@ -118,6 +132,45 @@ def base_match(ante, ana):
118 132 return 0
119 133  
120 134  
  135 +def ante_contains_rarest_from_ana(ante, ana):
  136 + ana_rarest = ana.rarest
  137 + for word in ante.words:
  138 + if word['base'] == ana_rarest['base']:
  139 + return 1
  140 + return 0
  141 +
  142 +
  143 +def agreement(ante, ana, tag_name):
  144 + agr_vec = [0] * 3
  145 + if ante.head[tag_name] == 'unk' or ana.head[tag_name] == 'unk':
  146 + agr_vec[2] = 1
  147 + elif ante.head[tag_name] == ana.head[tag_name]:
  148 + agr_vec[0] = 1
  149 + else:
  150 + agr_vec[1] = 1
  151 + return agr_vec
  152 +
  153 +
  154 +def is_acronym(ante, ana):
  155 + if ana.text.upper() == ana.text:
  156 + return check_one_way_acronym(ana.text, ante.text)
  157 + if ante.text.upper() == ante.text:
  158 + return check_one_way_acronym(ante.text, ana.text)
  159 + return 0
  160 +
  161 +
  162 +def same_sentence(ante, ana):
  163 + if ante.sentence_id == ana.sentence_id:
  164 + return 1
  165 + return 0
  166 +
  167 +
  168 +def same_paragraph(ante, ana):
  169 + if ante.paragraph_id == ana.paragraph_id:
  170 + return 1
  171 + return 0
  172 +
  173 +
121 174 # supporting functions
122 175 def get_wv(model, lemma, use_random_vec=True):
123 176 vec = None
... ... @@ -168,3 +221,15 @@ def get_distance_bucket(distance):
168 221 elif distance >= 64:
169 222 return 9
170 223 return 10
  224 +
  225 +
  226 +def check_one_way_acronym(acronym, expression):
  227 + initials = u''
  228 + for expr1 in expression.split('-'):
  229 + for expr2 in expr1.split():
  230 + expr2 = expr2.strip()
  231 + if expr2:
  232 + initials += unicode(expr2[0]).upper()
  233 + if acronym == initials:
  234 + return 1
  235 + return 0
... ...
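Two of the new pair features are easier to read with concrete values (the strings below are illustrative only): agreement(ante, ana, tag_name) returns a one-hot triple [same, different, unknown] over the heads' gender/number/person values, so two 'm1' heads give [1, 0, 0], 'm1' vs 'f' gives [0, 1, 0], and 'unk' on either side gives [0, 0, 1]. The acronym check builds initials from the expanded expression, for example:

    # hypothetical pair of mention texts
    check_one_way_acronym('PKP', 'Polskie Koleje Państwowe')
    # initials = 'P' + 'K' + 'P' -> matches the acronym, returns 1
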
corneferencer/resolvers/resolve.py
... ... @@ -19,6 +19,9 @@ def incremental(text):
19 19 best_prediction = prediction
20 20 best_ante = ante
21 21 if best_ante is not None:
  22 + # print ('wynik')
  23 + # print(best_ante.text, best_prediction, ana.text)
  24 + # print (best_ante.set, ana.set)
22 25 if best_ante.set:
23 26 ana.set = best_ante.set
24 27 else:
... ... @@ -34,6 +37,7 @@ def entity_based(text):
34 37 last_set_id = 0
35 38 for i, ana in enumerate(text.mentions):
36 39 if i > 0:
  40 + print ('!!!!!!!!!!%s!!!!!!!!!!!!' % ana.text)
37 41 best_fit = get_best_set(sets, ana)
38 42 if best_fit is not None:
39 43 ana.set = best_fit['set_id']
... ... @@ -50,8 +54,14 @@ def entity_based(text):
50 54 'mentions': [ana]})
51 55 ana.set = str_set_id
52 56 last_set_id += 1
  57 + print (ana.set)
  58 + for ss in sets:
  59 + print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']]))
53 60  
54 61 remove_singletons(sets)
  62 + print (';'.join([ss['set_id'] for ss in sets]))
  63 + for ss in sets:
  64 + print (';;;'.join(['%s:%s' % (ss['set_id'], mnt.text) for mnt in ss['mentions']]))
55 65  
56 66  
57 67 def get_best_set(sets, ana):
... ... @@ -72,6 +82,7 @@ def predict_set(mentions, ana):
72 82 sample = numpy.asarray([pair_vec], dtype=numpy.float32)
73 83 prediction = NEURAL_MODEL.predict(sample)[0]
74 84 prediction_sum += prediction
  85 + print(mnt.text, prediction, ana.text)
75 86 return prediction_sum / float(len(mentions))
76 87  
77 88  
... ...
corneferencer/resolvers/vectors.py
... ... @@ -23,6 +23,10 @@ def get_mention_features(mention):
23 23 vec.extend(features.following_context_vec(mention))
24 24 vec.extend(features.mention_vec(mention))
25 25 vec.extend(features.sentence_vec(mention))
  26 +
  27 + # supplementary features
  28 + vec.extend(features.mention_type(mention))
  29 +
26 30 return vec
27 31  
28 32  
... ... @@ -32,4 +36,14 @@ def get_pair_features(ante, ana):
32 36 vec.append(features.head_match(ante, ana))
33 37 vec.append(features.exact_match(ante, ana))
34 38 vec.append(features.base_match(ante, ana))
  39 +
  40 + # supplementary features
  41 + vec.append(features.ante_contains_rarest_from_ana(ante, ana))
  42 + vec.extend(features.agreement(ante, ana, 'gender'))
  43 + vec.extend(features.agreement(ante, ana, 'number'))
  44 + vec.extend(features.agreement(ante, ana, 'person'))
  45 + vec.append(features.is_acronym(ante, ana))
  46 + vec.append(features.same_sentence(ante, ana))
  47 + vec.append(features.same_paragraph(ante, ana))
  48 +
35 49 return vec
... ...
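A quick sanity check on the commit message, assuming get_pair_features concatenates the antecedent's and the anaphor's mention vectors with the pair features (which the jump in conf from 1126 to 1147 suggests): mention_type adds 4 positions per mention, i.e. 8 per pair, and the new pair features add 1 (rarest word) + 3×3 (gender/number/person agreement) + 1 (acronym) + 1 (same sentence) + 1 (same paragraph) = 13, for 8 + 13 = 21 extra inputs, matching NUMBER_OF_FEATURES = 1147.
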
corneferencer/utils.py
1 1 from __future__ import print_function
2 2  
  3 +import codecs
3 4 import sys
4 5  
5 6 from keras.models import Model
... ... @@ -12,15 +13,36 @@ def eprint(*args, **kwargs):
12 13  
13 14 def initialize_neural_model(number_of_features, path_to_model):
14 15 inputs = Input(shape=(number_of_features,))
  16 +
15 17 output_from_1st_layer = Dense(1000, activation='relu')(inputs)
16   - output_from_1st_layer = Dropout(0.5)(output_from_1st_layer)
  18 + output_from_1st_layer = Dropout(0.2)(output_from_1st_layer)
17 19 output_from_1st_layer = BatchNormalization()(output_from_1st_layer)
  20 +
18 21 output_from_2nd_layer = Dense(500, activation='relu')(output_from_1st_layer)
19   - output_from_2nd_layer = Dropout(0.5)(output_from_2nd_layer)
  22 + output_from_2nd_layer = Dropout(0.2)(output_from_2nd_layer)
20 23 output_from_2nd_layer = BatchNormalization()(output_from_2nd_layer)
21   - output = Dense(1, activation='sigmoid')(output_from_2nd_layer)
  24 +
  25 + output_from_3rd_layer = Dense(300, activation='relu')(output_from_2nd_layer)
  26 + output_from_3rd_layer = Dropout(0.2)(output_from_3rd_layer)
  27 + output_from_3rd_layer = BatchNormalization()(output_from_3rd_layer)
  28 +
  29 + output = Dense(1, activation='sigmoid')(output_from_3rd_layer)
22 30  
23 31 model = Model(inputs, output)
24 32 model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])
25 33 model.load_weights(path_to_model)
  34 +
26 35 return model
  36 +
  37 +
  38 +def load_freq_list(freq_path):
  39 + freq_list = {}
  40 + with codecs.open(freq_path, 'r', 'utf-8') as freq_file:
  41 + lines = freq_file.readlines()
  42 + for line in lines:
  43 + line_parts = line.split()
  44 + freq = int(line_parts[0])
  45 + base = line_parts[1]
  46 + if base not in freq_list:
  47 + freq_list[base] = freq
  48 + return freq_list
... ...
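load_freq_list implies a simple format for the frequency list: one entry per line, a count followed by a base form, whitespace-separated; repeated bases keep the first count seen. A hypothetical fragment and call (the counts and the relative path are invented for illustration):

    # freq/base.lst
    # 2415387 być
    # 1189790 rok
    freq_list = load_freq_list('freq/base.lst')
    freq_list['rok']  # -> 1189790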