Commit 7d27f9e9ae10935e1a182e7860943dcbb416e473 (1 parent: 04c45e2d)

Added Bartek-3 features to preparator script.

Showing 7 changed files with 500 additions and 55 deletions.
data/wikipedia/link.map (new file, mode 0 → 100755, binary; no preview)
data/wikipedia/redirect.map (new file, mode 0 → 100755, binary; no preview)
data/wordnet/lemma2hypernyms.map (new file, mode 0 → 100755, binary; no preview)
data/wordnet/lemma2synonyms.map (new file, mode 0 → 100755, binary; no preview)
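Note: the four .map files above are java.util.HashMap objects serialized from Java; preparator.py (below) loads them with the javaobj package via load_one2many_map and load_one2one_map. A minimal standalone sketch of that loading step, assuming the same key/value layout in the deserialized object's 'annotations' list (keys at odd indices, the mapped value object at the following index):

    import javaobj

    # Deserialize a Java HashMap dump, mirroring load_one2many_map below:
    # keys sit at odd indices of 'annotations'; the value object follows at i+1.
    with open('data/wordnet/lemma2synonyms.map', 'rb') as map_file:
        pobj = javaobj.JavaObjectUnmarshaller(map_file).readObject()
    entries = pobj.__dict__['annotations']
    lemma2synonyms = {entries[i]: set(entries[i + 1].__dict__['annotations'])
                      for i in range(len(entries)) if i % 2 == 1}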
for_investigation.ipynb
@@ -114,7 +114,7 @@
 },
 "outputs": [],
 "source": [
-" predictions = model.predict(test_set)"
+"predictions = model.predict(test_set)"
 ]
 },
 {
@@ -141,7 +141,7 @@
 }
 ],
 "source": [
-" true_positives = 0.0\n",
+"true_positives = 0.0\n",
 " false_positives = 0.0\n",
 " true_negatives = 0.0\n",
 " false_negatives = 0.0\n",
@@ -173,7 +173,7 @@
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 2.0
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
@@ -184,5 +184,5 @@
 }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 0
+}
\ No newline at end of file
mention-pair-classifier.ipynb
@@ -78,7 +78,7 @@
 "number_of_features = 1126\n",
 "\n",
 "X = data[:,0:1126]\n",
-"Y = data[:,1126] #last column consists of labels\n"
+"Y = data[:,1126] #last column consists of labels"
 ]
 },
 {
@@ -270,7 +270,7 @@
 "language_info": {
 "codemirror_mode": {
 "name": "ipython",
-"version": 2
+"version": 2.0
 },
 "file_extension": ".py",
 "mimetype": "text/x-python",
@@ -281,5 +281,5 @@
 }
 },
 "nbformat": 4,
- "nbformat_minor": 2
-}
\ No newline at end of file
+ "nbformat_minor": 0
+}
\ No newline at end of file
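Note: the first change above only trims a trailing newline in the cell that splits the prepared matrix into 1126 feature columns plus one label column. A minimal sketch of that split, assuming the tab-separated file written by preparator.py with DEBUG disabled (the loading line is illustrative; the notebook defines data elsewhere):

    import numpy

    # 1126 feature columns followed by a single 0/1 label column.
    data = numpy.loadtxt('data/test-20170720.csv', delimiter='\t')
    X = data[:, 0:1126]
    Y = data[:, 1126]  # last column consists of labels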
preparator.py
@@ -1,9 +1,13 @@
 # -*- coding: utf-8 -*-
 
 import codecs
+import math
 import numpy
 import os
 import random
+import re
+
+import javaobj
 
 from lxml import etree
 from itertools import combinations
@@ -12,25 +16,39 @@ from natsort import natsorted
 from gensim.models.word2vec import Word2Vec
 
 
-TEST_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'test-prepared'))
-TRAIN_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'train-prepared'))
-FREQ_300M_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data', 'freq', 'base.lst'))
+MAIN_PATH = os.path.dirname(__file__)
+TEST_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'test-prepared'))
+TRAIN_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'train-prepared'))
+FREQ_300M_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'freq', 'base.lst'))
+
+LEMMA2SYNONYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2synonyms.map'))
+LEMMA2HYPERNYMS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wordnet', 'lemma2hypernyms.map'))
+
+TITLE2LINKS_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'link.map'))
+TITLE2REDIRECT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data', 'wikipedia', 'redirect.map'))
 
 ANNO_PATH = TEST_PATH
-OUT_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), 'data',
-                                        'test-20170627.csv'))
+OUT_PATH = os.path.abspath(os.path.join(MAIN_PATH, 'data',
+                                        'test-20170720.csv'))
 EACH_TEXT_SEPARATELLY = False
 
 CONTEXT = 5
 W2V_SIZE = 50
-MODEL = os.path.abspath(os.path.join(os.path.dirname(__file__), 'models',
+MODEL = os.path.abspath(os.path.join(MAIN_PATH, 'models',
                                      '%d' % W2V_SIZE,
                                      'w2v_allwiki_nkjpfull_%d.model' % W2V_SIZE))
 
+FIRST_SECOND_PERSON = ['pri', 'sec']
+INDICATIVE_PRONS_BASES = ["ten", "ta", "to", "ci", "te", "tamten", "tamta",
+                          "tamto", "tamci", "tamte", "ów", "owa", "owo", "owi", "owe"]
+SIEBIE_TAGS = ['siebie']
+MASCULINE_TAGS = ['m1', 'm2', 'm3']
+
 NOUN_TAGS = ['subst', 'ger', 'depr']
 PPRON_TAGS = ['ppron12', 'ppron3']
 ZERO_TAGS = ['fin', 'praet', 'bedzie', 'impt', 'winien', 'aglt']
 POSSIBLE_HEADS = [u'§', u'%', u'*', u'"', u'„', u'&', u'-']
+HYPHEN_SIGNS = ['-', '#']
 
 NEG_PROPORTION = 1
 RANDOM_VECTORS = True
@@ -45,13 +63,18 @@ UNKNONW_WORDS = 0
 def main():
     model = Word2Vec.load(MODEL)
     freq_list = load_freq_list(FREQ_300M_PATH)
+    lemma2synonyms = load_one2many_map(LEMMA2SYNONYMS_PATH)
+    lemma2hypernyms = load_one2many_map(LEMMA2HYPERNYMS_PATH)
+    title2links = load_one2many_map(TITLE2LINKS_PATH)
+    title2redirect = load_one2one_map(TITLE2REDIRECT_PATH)
     try:
-        create_data_vectors(model, freq_list)
+        create_data_vectors(model, freq_list, lemma2synonyms,
+                            lemma2hypernyms, title2links, title2redirect)
     finally:
-        print 'Unknown words: ', UNKNONW_WORDS
-        print 'All words: ', ALL_WORDS
-        print 'Positives: ', POS_COUNT
-        print 'Negatives: ', NEG_COUNT
+        print ('Unknown words: ', UNKNONW_WORDS)
+        print ('All words: ', ALL_WORDS)
+        print ('Positives: ', POS_COUNT)
+        print ('Negatives: ', NEG_COUNT)
 
 
 def load_freq_list(freq_path):
@@ -67,16 +90,43 @@ def load_freq_list(freq_path):
     return freq_list
 
 
-def create_data_vectors(model, freq_list):
+def load_one2many_map(map_path):
+    this_map = {}
+    marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
+    pobj = marshaller.readObject()
+    jmap_annotations = pobj.__dict__['annotations']
+    jmap_annotations_count = len(jmap_annotations)
+    for i in range(jmap_annotations_count):
+        if i%2 == 1:
+            mapped_elements = set(jmap_annotations[i+1].__dict__['annotations'])
+            this_map[jmap_annotations[i]] = mapped_elements
+    return this_map
+
+
+def load_one2one_map(map_path):
+    this_map = {}
+    marshaller = javaobj.JavaObjectUnmarshaller(open(map_path, 'rb'))
+    pobj = marshaller.readObject()
+    jmap_annotations = pobj.__dict__['annotations']
+    jmap_annotations_count = len(jmap_annotations)
+    for i in range(jmap_annotations_count):
+        if i%2 == 1:
+            element = jmap_annotations[i+1]
+            this_map[jmap_annotations[i]] = element
+    return this_map
+
+
+def create_data_vectors(model, freq_list, lemma2synonyms,
+                        lemma2hypernyms, title2links, title2redirect):
     features_file = None
     if not EACH_TEXT_SEPARATELLY:
-        features_file = codecs.open(OUT_PATH, 'wt', 'utf-8')
+        features_file = codecs.open(OUT_PATH, 'w', 'utf-8')
 
     anno_files = os.listdir(ANNO_PATH)
     anno_files = natsorted(anno_files)
     for filename in anno_files:
         if filename.endswith('.mmax'):
-            print '=======> ', filename
+            print ('=======> ', filename)
             textname = filename.replace('.mmax', '')
 
             mentions_path = os.path.join(ANNO_PATH, '%s_mentions.xml' % textname)
@@ -85,19 +135,18 @@ def create_data_vectors(model, freq_list):
             positives, negatives = diff_mentions(mentions)
 
             if DEBUG:
-                print 'Positives:'
-                print len(positives)
-
-                print 'Negatives:'
-                print len(negatives)
+                print ('Positives:', len(positives))
+                print ('Negatives:', len(negatives))
 
             words_path = os.path.join(ANNO_PATH, '%s_words.xml' % textname)
             mentions_dict = markables_level_2_dict(mentions_path, words_path, freq_list)
 
             if EACH_TEXT_SEPARATELLY:
                 text_features_path = os.path.join(OUT_PATH, '%s.csv' % textname)
-                features_file = codecs.open(text_features_path, 'wt', 'utf-8')
-            write_features(features_file, positives, negatives, mentions_dict, model, textname)
+                features_file = codecs.open(text_features_path, 'w', 'utf-8')
+            write_features(features_file, positives, negatives, mentions_dict,
+                           model, textname, lemma2synonyms,
+                           lemma2hypernyms, title2links, title2redirect)
 
     if not EACH_TEXT_SEPARATELLY:
         features_file.close()
@@ -108,7 +157,7 @@ def diff_mentions(mentions):
     positives = get_positives(sets)
     positives, negatives = get_negatives_and_update_positives(clustered_mensions, positives)
     if len(negatives) != len(positives) and NEG_PROPORTION == 1:
-        print u'Niezgodna liczba przypadków pozytywnych i negatywnych!'
+        print (u'Niezgodna liczba przypadków pozytywnych i negatywnych!')
     return positives, negatives
 
 
@@ -126,18 +175,18 @@ def get_sets(mentions):
             sets[set_id].append(mention.attrib['span'])
             clustered_mensions.append(mention.attrib['span'])
         else:
-            print u'Coś poszło nie tak przy wyszukiwaniu klastrów!'
+            print (u'Coś poszło nie tak przy wyszukiwaniu klastrów!')
 
     sets_to_remove = []
     for set_id in sets:
         if len(sets[set_id]) < 2:
             sets_to_remove.append(set_id)
             if len(sets[set_id]) == 1:
-                print u'Removing clustered mention: ', sets[set_id][0]
+                print (u'Removing clustered mention: ', sets[set_id][0])
                 clustered_mensions.remove(sets[set_id][0])
 
     for set_id in sets_to_remove:
-        print u'Removing set: ', set_id
+        print (u'Removing set: ', set_id)
         sets.pop(set_id)
 
     return sets, clustered_mensions
@@ -160,21 +209,24 @@ def get_negatives_and_update_positives(clustered_mensions, positives):
         samples_count = len(negatives)
         if NEG_PROPORTION == 1:
             positives = random.sample(set(positives), samples_count)
-            print u'Więcej przypadków pozytywnych niż negatywnych!'
+            print (u'Więcej przypadków pozytywnych niż negatywnych!')
     negatives = random.sample(set(negatives), samples_count)
     return positives, negatives
 
 
-def write_features(features_file, positives, negatives, mentions_dict, model, textname):
+def write_features(features_file, positives, negatives, mentions_dict,
+                   model, textname, lemma2synonyms,
+                   lemma2hypernyms, title2links, title2redirect):
     global POS_COUNT
     POS_COUNT += len(positives)
     for pair in positives:
         pair_features = []
        if DEBUG:
             pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
-        pair_features.extend(get_features(pair, mentions_dict, model))
+        pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
+                                          lemma2hypernyms, title2links, title2redirect))
         pair_features.append(1)
-        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
+        features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
 
     global NEG_COUNT
     NEG_COUNT += len(negatives)
@@ -182,12 +234,14 @@ def write_features(features_file, positives, negatives, mentions_dict, model, te
         pair_features = []
         if DEBUG:
             pair_features = ['%s>%s:%s' % (textname, pair[0], pair[1])]
-        pair_features.extend(get_features(pair, mentions_dict, model))
+        pair_features.extend(get_features(pair, mentions_dict, model, lemma2synonyms,
+                                          lemma2hypernyms, title2links, title2redirect))
         pair_features.append(0)
-        features_file.write(u'%s\n' % u'\t'.join([unicode(feature) for feature in pair_features]))
+        features_file.write(u'%s\n' % u'\t'.join([str(feature) for feature in pair_features]))
 
 
-def get_features(pair, mentions_dict, model):
+def get_features(pair, mentions_dict, model, lemma2synonyms,
+                 lemma2hypernyms, title2links, title2redirect):
     features = []
     ante = pair[0]
     ana = pair[1]
@@ -195,7 +249,8 @@
     features.extend(ante_features)
     ana_features = get_mention_features(ana, mentions_dict, model)
     features.extend(ana_features)
-    pair_features = get_pair_features(pair, mentions_dict)
+    pair_features = get_pair_features(pair, mentions_dict, lemma2synonyms,
+                                      lemma2hypernyms, title2links, title2redirect)
     features.extend(pair_features)
     return features
 
@@ -280,6 +335,19 @@ def get_mention_features(mention_span, mentions_dict, model):
     # cechy uzupelniajace
     features.extend(mention_type(mention))
 
+    # cechy uzupelniajace 2
+    features.append(is_first_second_person(mention))
+    features.append(is_demonstrative(mention))
+    features.append(is_demonstrative_nominal(mention))
+    features.append(is_demonstrative_pronoun(mention))
+    features.append(is_refl_pronoun(mention))
+    features.append(is_first_in_sentence(mention))
+    features.append(is_zero_or_pronoun(mention))
+    features.append(contains_digit(mention, 'head_orth'))
+    features.append(contains_digit(mention, 'text'))
+    features.append(contains_letter(mention))
+    features.append(post_modified(mention))
+
     return features
 
 
@@ -296,6 +364,68 @@ def mention_type(mention):
     return type_vec
 
 
+def is_first_second_person(mention):
+    if mention['head']['person'] in FIRST_SECOND_PERSON:
+        return 1
+    return 0
+
+
+def is_demonstrative(mention):
+    if mention['words'][0]['base'].lower() in INDICATIVE_PRONS_BASES:
+        return 1
+    return 0
+
+
+def is_demonstrative_nominal(mention):
+    if is_demonstrative(mention) and mention['head']['ctag'] in NOUN_TAGS:
+        return 1
+    return 0
+
+
+def is_demonstrative_pronoun(mention):
+    if (is_demonstrative(mention) and
+            (mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS)):
+        return 1
+    return 0
+
+
+def is_refl_pronoun(mention):
+    if mention['head']['ctag'] in SIEBIE_TAGS:
+        return 1
+    return 0
+
+
+def is_first_in_sentence(mention):
+    if mention['first_in_sentence']:
+        return 1
+    return 0
+
+
+def is_zero_or_pronoun(mention):
+    if mention['head']['ctag'] in PPRON_TAGS or mention['head']['ctag'] in ZERO_TAGS:
+        return 1
+    return 0
+
+
+def contains_digit(mention, attr_name):
+    _digits = re.compile('\d')
+    if _digits.search(mention[attr_name]):
+        return 1
+    return 0
+
+
+def contains_letter(mention):
+    if any(c.isalpha() for c in mention['text']):
+        return 1
+    return 0
+
+
+def post_modified(mention):
+    if mention['head']['orth'] != mention['words'][-1]['orth']:
+        return 1
+    return 0
+
+
 def get_wv(model, lemma, random=True):
     global ALL_WORDS
     global UNKNONW_WORDS
@@ -332,7 +462,8 @@ def get_context_vec(words, model):
     return vec
 
 
-def get_pair_features(pair, mentions_dict):
+def get_pair_features(pair, mentions_dict, lemma2synonyms,
+                      lemma2hypernyms, title2links, title2redirect):
     ante = get_mention_by_attr(mentions_dict, 'span', pair[0])
     ana = get_mention_by_attr(mentions_dict, 'span', pair[1])
 
@@ -375,6 +506,32 @@ def get_pair_features(pair, mentions_dict):
     features.append(same_sentence(ante, ana))
     features.append(same_paragraph(ante, ana))
 
+    # cechy uzupelniajace 2
+    features.append(neighbouring_sentence(ante, ana))
+    features.append(cousin_sentence(ante, ana))
+    features.append(distant_sentence(ante, ana))
+    features.append(flat_gender_agreement(ante, ana))
+    features.append(left_match(ante, ana))
+    features.append(right_match(ante, ana))
+    features.append(abbrev2(ante, ana))
+
+    features.append(string_kernel(ante, ana))
+    features.append(head_string_kernel(ante, ana))
+
+    features.append(wordnet_synonyms(ante, ana, lemma2synonyms))
+    features.append(wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms))
+    features.append(wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms))
+
+    features.append(wikipedia_link(ante, ana, title2links))
+    features.append(wikipedia_mutual_link(ante, ana, title2links))
+    features.append(wikipedia_redirect(ante, ana, title2redirect))
+
+    # combined
+    features.append(samesent_anapron_antefirstinpar(ante, ana))
+    features.append(samesent_antefirstinpar_personnumbermatch(ante, ana))
+    features.append(adjsent_anapron_adjmen_personnumbermatch(ante, ana))
+    features.append(adjsent_anapron_adjmen(ante, ana))
+
     return features
 
 
@@ -392,7 +549,7 @@ def get_distance_bucket(distance):
     elif distance >= 64:
         return 9
     else:
-        print u'Coś poszło nie tak przy kubełkowaniu!!'
+        print (u'Coś poszło nie tak przy kubełkowaniu!!')
     return 10
 
 
@@ -445,8 +602,8 @@ def is_acronym(ante, ana):
     if ana['text'].upper() == ana['text']:
         return check_one_way_acronym(ana['text'], ante['text'])
     if ante['text'].upper() == ante['text']:
-        return check_one_way_acronym(ante['text'], ana['text']);
-    return 0;
+        return check_one_way_acronym(ante['text'], ana['text'])
+    return 0
 
 
 def check_one_way_acronym(acronym, expression):
@@ -455,10 +612,10 @@ def check_one_way_acronym(acronym, expression):
         for expr2 in expr1.split():
             expr2 = expr2.strip()
             if expr2:
-                initials += unicode(expr2[0]).upper()
+                initials += str(expr2[0]).upper()
     if acronym == initials:
-        return 1;
-    return 0;
+        return 1
+    return 0
 
 
 def same_sentence(ante, ana):
@@ -467,12 +624,290 @@ def same_sentence(ante, ana):
     return 0
 
 
+def neighbouring_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] == 1:
+        return 1
+    return 0
+
+
+def cousin_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] == 2:
+        return 1
+    return 0
+
+
+def distant_sentence(ante, ana):
+    if ana['sentence_id'] - ante['sentence_id'] > 2:
+        return 1
+    return 0
+
+
 def same_paragraph(ante, ana):
     if ante['paragraph_id'] == ana['paragraph_id']:
         return 1
     return 0
 
 
+def flat_gender_agreement(ante, ana):
+    agr_vec = [0] * 3
+    if ante['head']['gender'] == 'unk' or ana['head']['gender'] == 'unk':
+        agr_vec[2] = 1
+    elif (ante['head']['gender'] == ana['head']['gender'] or
+            (ante['head']['gender'] in MASCULINE_TAGS and ana['head']['gender'] in MASCULINE_TAGS)):
+        agr_vec[0] = 1
+    else:
+        agr_vec[1] = 1
+    return agr_vec
+
+
+def string_kernel(ante, ana):
+    s1 = ante['text']
+    s2 = ana['text']
+    return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
+
+
+def head_string_kernel(ante, ana):
+    s1 = ante['head_orth']
+    s2 = ana['head_orth']
+    return SK(s1, s2) / (math.sqrt(SK(s1, s1) * SK(s2, s2)))
+
+
+def SK(s1, s2):
+    LAMBDA = 0.4
+
+    p = len(s1)
+    if len(s2) < len(s1):
+        p = len(s2)
+
+    h, w = len(s1)+1, len(s2)+1
+    DPS = [[0.0] * w for i in range(h)]
+    DP = [[0.0] * w for i in range(h)]
+
+    kernel_mat = [0.0] * (len(s1) + 1)
+
+    for i in range(len(s1)+1):
+        if i == 0:
+            continue
+        for j in range(len(s2)+1):
+            if j == 0:
+                continue
+            if s1[i-1] == s2[j-1]:
+                DPS[i][j] = LAMBDA * LAMBDA
+                kernel_mat[0] += DPS[i][j]
+            else:
+                DPS[i][j] = 0.0
+
+    for l in range(p):
+        if l == 0:
+            continue
+
+        kernel_mat[l] = 0.0
+        for j in range(len(s2)+1):
+            DP[l-1][j] = 0.0
+
+        for i in range(len(s1)+1):
+            DP[i][l-1] = 0.0
+
+        for i in range(len(s1)+1):
+            if i < l:
+                continue
+            for j in range(len(s2)+1):
+                if j < l:
+                    continue
+                DP[i][j] = DPS[i][j] + LAMBDA * DP[i - 1][j] + LAMBDA * DP[i][j - 1] - LAMBDA * LAMBDA * DP[i - 1][j - 1]
+
+                if s1[i-1] == s2[j-1]:
+                    DPS[i][j] = LAMBDA * LAMBDA * DP[i - 1][j - 1]
+                    kernel_mat[l] += DPS[i][j]
+
+    K = 0.0
+    for l in range(p):
+        K += kernel_mat[l]
+    return K
+
+
+def left_match(ante, ana):
+    if (ante['text'].lower().startswith(ana['text'].lower()) or
+            ana['text'].lower().startswith(ante['text'].lower())):
+        return 1
+    return 0
+
+
+def right_match(ante, ana):
+    if (ante['text'].lower().endswith(ana['text'].lower()) or
+            ana['text'].lower().endswith(ante['text'].lower())):
+        return 1
+    return 0
+
+
+# def string_match_no_hyphenation(ante, ana):
+#     ante_no_hyphen = remove_hyphen_signs(ante['text'])
+#     ana_no_hyphen = remove_hyphen_signs(ana['text'])
+#     if ante_no_hyphen == ana_no_hyphen:
+#         return 1
+#     return 0
+#
+#
+# def string_match_no_hyphenation_lowercase(ante, ana):
+#     ante_no_hyphen = remove_hyphen_signs(ante['text']).lower()
+#     ana_no_hyphen = remove_hyphen_signs(ana['text']).lower()
+#     if ante_no_hyphen == ana_no_hyphen:
+#         return 1
+#     return 0
+
+
+def remove_hyphen_signs(text):
+    for sign in HYPHEN_SIGNS:
+        text = text.replace(sign, '')
+    return text
+
+
+def samesent_anapron_antefirstinpar(ante, ana):
+    if same_sentence(ante, ana) and is_zero_or_pronoun(ana) and ante['first_in_paragraph']:
+        return 1
+    return 0
+
+
+def samesent_antefirstinpar_personnumbermatch(ante, ana):
+    if (same_sentence(ante, ana) and ante['first_in_paragraph']
+            and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
+        return 1
+    return 0
+
+
+def adjsent_anapron_adjmen_personnumbermatch(ante, ana):
+    if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
+            and ana['position_in_mentions'] - ante['position_in_mentions'] == 1
+            and agreement(ante, ana, 'number')[0] and agreement(ante, ana, 'person')[0]):
+        return 1
+    return 0
+
+
+def adjsent_anapron_adjmen(ante, ana):
+    if (neighbouring_sentence(ante, ana) and is_zero_or_pronoun(ana)
+            and ana['position_in_mentions'] - ante['position_in_mentions'] == 1):
+        return 1
+    return 0
+
+
+def abbrev2(ante, ana):
+    ante_abbrev = get_abbrev(ante)
+    ana_abbrev = get_abbrev(ana)
+    if ante['head_orth'] == ana_abbrev or ana['head_orth'] == ante_abbrev:
+        return 1
+    return 0
+
+
+def get_abbrev(mention):
+    abbrev = u''
+    for word in mention['words']:
+        if word['orth'][0].isupper():
+            abbrev += word['orth'][0]
+    return abbrev
+
+
+def wordnet_synonyms(ante, ana, lemma2synonyms):
+    ante_synonyms = set()
+    if ante['head']['base'] in lemma2synonyms:
+        ante_synonyms = lemma2synonyms[ante['head']['base']]
+
+    ana_synonyms = set()
+    if ana['head']['base'] in lemma2synonyms:
+        ana_synonyms = lemma2synonyms[ana['head']['base']]
+
+    if ana['head']['base'] in ante_synonyms or ante['head']['base'] in ana_synonyms:
+        return 1
+    return 0
+
+
+def wordnet_ana_is_hypernym(ante, ana, lemma2hypernyms):
+    ante_hypernyms = set()
+    if ante['head']['base'] in lemma2hypernyms:
+        ante_hypernyms = lemma2hypernyms[ante['head']['base']]
+
+    ana_hypernyms = set()
+    if ana['head']['base'] in lemma2hypernyms:
+        ana_hypernyms = lemma2hypernyms[ana['head']['base']]
+
+    if not ante_hypernyms or not ana_hypernyms:
+        return 0
+
+    if ana['head']['base'] in ante_hypernyms:
+        return 1
+    return 0
+
+
+def wordnet_ante_is_hypernym(ante, ana, lemma2hypernyms):
+    ana_hypernyms = set()
+    if ana['head']['base'] in lemma2hypernyms:
+        ana_hypernyms = lemma2hypernyms[ana['head']['base']]
+
+    ante_hypernyms = set()
+    if ante['head']['base'] in lemma2hypernyms:
+        ante_hypernyms = lemma2hypernyms[ante['head']['base']]
+
+    if not ante_hypernyms or not ana_hypernyms:
+        return 0
+
+    if ante['head']['base'] in ana_hypernyms:
+        return 1
+    return 0
+
+
+def wikipedia_link(ante, ana, title2links):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    ante_links = set()
+    if ante_base in title2links:
+        ante_links = title2links[ante_base]
+
+    ana_links = set()
+    if ana_base in title2links:
+        ana_links = title2links[ana_base]
+
+    if ana_base in ante_links or ante_base in ana_links:
+        return 1
+
+    return 0
+
+
+def wikipedia_mutual_link(ante, ana, title2links):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    ante_links = set()
+    if ante_base in title2links:
+        ante_links = title2links[ante_base]
+
+    ana_links = set()
+    if ana_base in title2links:
+        ana_links = title2links[ana_base]
+
+    if ana_base in ante_links and ante_base in ana_links:
+        return 1
+
+    return 0
+
+
+def wikipedia_redirect(ante, ana, title2redirect):
+    ante_base = ante['lemmatized_text'].lower()
+    ana_base = ana['lemmatized_text'].lower()
+    if ante_base == ana_base:
+        return 1
+
+    if ante_base in title2redirect and title2redirect[ante_base] == ana_base:
+        return 1
+
+    if ana_base in title2redirect and title2redirect[ana_base] == ante_base:
+        return 1
+
+    return 0
+
+
 def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www.eml.org/NameSpaces/mention'):
     markables_dicts = []
     markables_tree = etree.parse(markables_path)
@@ -492,7 +927,8 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www
         if head_orth not in POSSIBLE_HEADS:
             mention_words = span_to_words(span, words)
 
-            prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id = get_context(mention_words, words)
+            (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
+             paragraph_id, sentence_id, first_in_sentence, first_in_paragraph) = get_context(mention_words, words)
 
             head = get_head(head_orth, mention_words)
             markables_dicts.append({'id': markable.attrib['id'],
@@ -513,9 +949,11 @@ def markables_level_2_dict(markables_path, words_path, freq_list, namespace='www
                                     'end_in_words': mnt_end_position,
                                     'rarest': get_rarest_word(mention_words, freq_list),
                                     'paragraph_id': paragraph_id,
-                                    'sentence_id': sentence_id})
+                                    'sentence_id': sentence_id,
+                                    'first_in_sentence': first_in_sentence,
+                                    'first_in_paragraph': first_in_paragraph})
         else:
-            print 'Zduplikowana wzmianka: %s' % span
+            print ('Zduplikowana wzmianka: %s' % span)
 
     return markables_dicts
 
@@ -529,10 +967,16 @@ def get_context(mention_words, words):
     mnt_start_position = -1
     first_word = mention_words[0]
     last_word = mention_words[-1]
+    first_in_sentence = False
+    first_in_paragraph = False
     for idx, word in enumerate(words):
         if word['id'] == first_word['id']:
             prec_context = get_prec_context(idx, words)
             mnt_start_position = get_mention_start(first_word, words)
+            if idx == 0 or words[idx-1]['lastinsent']:
+                first_in_sentence = True
+            if idx == 0 or words[idx-1]['lastinpar']:
+                first_in_paragraph = True
         if word['id'] == last_word['id']:
             follow_context = get_follow_context(idx, words)
             sentence = get_sentence(idx, words)
@@ -542,7 +986,8 @@ def get_context(mention_words, words):
             sentence_id += 1
         if word['lastinpar']:
             paragraph_id += 1
-    return prec_context, follow_context, sentence, mnt_start_position, mnt_end_position, paragraph_id, sentence_id
+    return (prec_context, follow_context, sentence, mnt_start_position, mnt_end_position,
+            paragraph_id, sentence_id, first_in_sentence, first_in_paragraph)
 
 
 def get_prec_context(mention_start, words):
@@ -743,9 +1188,9 @@ def to_text(words, form):
 
 
 def get_one_word_text(word_id, words, form):
-    this_word = (word for word in words if word['id'] == word_id).next()
+    this_word = next(word for word in words if word['id'] == word_id)
     if word_to_ignore(this_word):
-        print this_word
+        print (this_word)
     return this_word[form]
 
 
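Note on the new string-kernel features: SK computes a subsequence string kernel with decay LAMBDA = 0.4, and string_kernel / head_string_kernel normalize it by the geometric mean of the two self-kernels, so the feature falls in [0, 1] and is exactly 1.0 for identical strings (an empty string would divide by zero). An illustrative check, assuming SK and string_kernel can be imported from preparator:

    from preparator import SK, string_kernel

    ante = {'text': u'Unia Europejska'}
    ana = {'text': u'Unii Europejskiej'}
    print(string_kernel(ante, ante))  # 1.0: identical strings
    print(string_kernel(ante, ana))   # normalized similarity in (0, 1]
    print(SK(u'abc', u'abd'))         # raw, unnormalized kernel value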