Commit 5be49adfc0657b73f843aca5e8bc769550baf8f5
1 parent: 3d6332cc

Preparations for experiments using sieve system output as Corneferencer input.

Showing 3 changed files with 44 additions and 23 deletions.
corneferencer/entities.py

@@ -13,6 +13,16 @@ class Text:
                 return mnt.set
         return None
 
+    def get_sets(self):
+        sets = {}
+        for mnt in self.mentions:
+            if mnt.set and mnt.set in sets:
+                sets[mnt.set].append(mnt)
+            elif mnt.set:
+                sets[mnt.set] = [mnt]
+        return sets
+
+
     def merge_sets(self, set1, set2):
         for mnt in self.mentions:
             if mnt.set == set1:
@@ -25,9 +35,9 @@ class Mention:
                  head_orth, head, dominant, node, prec_context,
                  follow_context, sentence, position_in_mentions,
                  start_in_words, end_in_words, rarest, paragraph_id, sentence_id,
-                 first_in_sentence, first_in_paragraph):
+                 first_in_sentence, first_in_paragraph, set_id=''):
         self.id = mnt_id
-        self.set = ''
+        self.set = set_id
         self.old_set = ''
         self.text = text
         self.lemmatized_text = lemmatized_text
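A note on the entities.py change: get_sets() builds a set-id-to-mentions index over mentions that already carry a cluster id (for instance one pre-assigned by the sieve system via the new set_id constructor argument); unclustered mentions, whose set is the empty string, are skipped. A minimal sketch of the intended behaviour, using hypothetical SimpleMention/SimpleText stubs in place of the full Mention and Text classes:

# Sketch only: SimpleMention and SimpleText stand in for the real classes,
# whose constructors take many more arguments (see the diff above).
class SimpleMention:
    def __init__(self, set_id=''):
        self.set = set_id  # cluster id; '' means unclustered

class SimpleText:
    def __init__(self, mentions):
        self.mentions = mentions

    def get_sets(self):  # same logic as the added Text.get_sets
        sets = {}
        for mnt in self.mentions:
            if mnt.set and mnt.set in sets:
                sets[mnt.set].append(mnt)
            elif mnt.set:
                sets[mnt.set] = [mnt]
        return sets

text = SimpleText([SimpleMention('set_0'), SimpleMention(), SimpleMention('set_0')])
print(sorted(text.get_sets()))  # ['set_0']; the unclustered mention is ignored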
corneferencer/inout/mmax.py

@@ -15,8 +15,7 @@ def read(inpath):
     words_path = os.path.join(textdir, '%s_words.xml' % textname)
 
     text = Text(textname)
-    mentions = read_mentions(mentions_path, words_path)
-    text.mentions = mentions
+    text.mentions = read_mentions(mentions_path, words_path)
     return text
 
 
@@ -43,6 +42,9 @@ def read_mentions(mentions_path, words_path):
          first_in_sentence, first_in_paragraph) = get_context(mention_words, words)
 
         head = get_head(head_orth, mention_words)
+        mention_group = ''
+        if markable.attrib['mention_group'] != 'empty':
+            mention_group = markable.attrib['mention_group']
         mention = Mention(mnt_id=markable.attrib['id'],
                           text=span_to_text(span, words, 'orth'),
                           lemmatized_text=span_to_text(span, words, 'base'),
@@ -62,7 +64,8 @@ def read_mentions(mentions_path, words_path):
                           paragraph_id=paragraph_id,
                           sentence_id=sentence_id,
                           first_in_sentence=first_in_sentence,
-                          first_in_paragraph=first_in_paragraph)
+                          first_in_paragraph=first_in_paragraph,
+                          set_id=mention_group)
         mentions.append(mention)
 
     return mentions
@@ -73,13 +76,16 @@ def get_words(filepath):
     words = []
     for word in tree.xpath("//word"):
         hasnps = False
-        if 'hasnps' in word.attrib and word.attrib['hasnps'] == 'true':
+        if (('hasnps' in word.attrib and word.attrib['hasnps'] == 'true') or
+                ('hasNps' in word.attrib and word.attrib['hasNps'] == 'true')):
            hasnps = True
        lastinsent = False
-        if 'lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true':
+        if (('lastinsent' in word.attrib and word.attrib['lastinsent'] == 'true') or
+                ('lastInSent' in word.attrib and word.attrib['lastInSent'] == 'true')):
            lastinsent = True
        lastinpar = False
-        if 'lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true':
+        if (('lastinpar' in word.attrib and word.attrib['lastinpar'] == 'true') or
+                ('lastInPar' in word.attrib and word.attrib['lastInPar'] == 'true')):
            lastinpar = True
        words.append({'id': word.attrib['id'],
                      'orth': word.text,
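A note on the mmax.py change: the camelCase fallbacks (hasNps, lastInSent, lastInPar) presumably match attribute spellings produced by the sieve system, while the lowercase variants remain for the original MMAX files. Since the same present-and-'true' test now appears three times, it could be folded into one small helper; a sketch (the helper name attr_flag is hypothetical, not part of the patch):

def attr_flag(attrib, *names):
    # True if any of the given attribute spellings is present with value 'true'.
    return any(attrib.get(name) == 'true' for name in names)

# Possible usage inside get_words(), with word.attrib as in the diff above:
# hasnps = attr_flag(word.attrib, 'hasnps', 'hasNps')
# lastinsent = attr_flag(word.attrib, 'lastinsent', 'lastInSent')
# lastinpar = attr_flag(word.attrib, 'lastinpar', 'lastInPar')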
corneferencer/resolvers/resolve.py

@@ -62,16 +62,14 @@ def incremental(text, threshold):
 
 
 # all2all resolve algorithm
-def all2all_v1(text, threshold):
+def all2all_debug(text, threshold):
     last_set_id = 0
     for pos1, mnt1 in enumerate(text.mentions):
         print ('!!!!!!!!!!%s!!!!!!!!!!!' % mnt1.text)
         best_prediction = 0.0
         best_link = None
-        if mnt1.set:
-            continue
         for pos2, mnt2 in enumerate(text.mentions):
-            if (pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)):
+            if ((mnt1.set != mnt2.set or not mnt1.set) and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)):
                 ante = mnt1
                 ana = mnt2
                 if pos2 < pos1:
@@ -80,29 +78,32 @@ def all2all_v1(text, threshold):
                 pair_vec = get_pair_vector(ante, ana)
                 sample = numpy.asarray([pair_vec], dtype=numpy.float32)
                 prediction = NEURAL_MODEL.predict(sample)[0]
-                print (u'%s >> %f' % (mnt2.text, prediction))
+                print (u'mnt2: %s | %s == %s >> %f' % (mnt2.text, ante.text, ana.text, prediction))
                 if prediction > threshold and prediction > best_prediction:
                     best_prediction = prediction
                     best_link = mnt2
         if best_link is not None:
-            print (u'best: %s' % best_link.text)
-            if best_link.set:
+            print (u'best: %s >> %f, best set: %s, mnt1_set: %s' % (best_link.text, best_prediction, best_link.set, mnt1.set))
+            if best_link.set and not mnt1.set:
                 mnt1.set = best_link.set
-            else:
+            elif best_link.set and mnt1.set:
+                text.merge_sets(best_link.set, mnt1.set)
+            elif not best_link.set and not mnt1.set:
                 str_set_id = 'set_%d' % last_set_id
                 best_link.set = str_set_id
                 mnt1.set = str_set_id
                 last_set_id += 1
+            print (u'best set: %s, mnt1_set: %s' % (best_link.set, mnt1.set))
 
 
-def all2all_debug(text, threshold):
+def all2all_v1(text, threshold):
     last_set_id = 0
     for pos1, mnt1 in enumerate(text.mentions):
-        print ('!!!!!!!!!!%s!!!!!!!!!!!' % mnt1.text)
         best_prediction = 0.0
         best_link = None
         for pos2, mnt2 in enumerate(text.mentions):
-            if ((mnt1.set != mnt2.set or not mnt1.set) and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)):
+            if ((mnt1.set != mnt2.set or not mnt1.set or not mnt2.set)
+                    and pos1 != pos2 and not features.pair_intersect(mnt1, mnt2)):
                 ante = mnt1
                 ana = mnt2
                 if pos2 < pos1:
@@ -111,14 +112,14 @@ def all2all_debug(text, threshold):
                 pair_vec = get_pair_vector(ante, ana)
                 sample = numpy.asarray([pair_vec], dtype=numpy.float32)
                 prediction = NEURAL_MODEL.predict(sample)[0]
-                print (u'mnt2: %s | %s == %s >> %f' % (mnt2.text, ante.text, ana.text, prediction))
                 if prediction > threshold and prediction > best_prediction:
                     best_prediction = prediction
                     best_link = mnt2
         if best_link is not None:
-            print (u'best: %s >> %f, best set: %s, mnt1_set: %s' % (best_link.text, best_prediction, best_link.set, mnt1.set))
             if best_link.set and not mnt1.set:
                 mnt1.set = best_link.set
+            elif not best_link.set and mnt1.set:
+                best_link.set = mnt1.set
             elif best_link.set and mnt1.set:
                 text.merge_sets(best_link.set, mnt1.set)
             elif not best_link.set and not mnt1.set:
@@ -126,11 +127,11 @@ def all2all_debug(text, threshold):
                 best_link.set = str_set_id
                 mnt1.set = str_set_id
                 last_set_id += 1
-            print (u'best set: %s, mnt1_set: %s' % (best_link.set, mnt1.set))
 
 
 def all2all(text, threshold):
     last_set_id = 0
+    sets = text.get_sets()
     for pos1, mnt1 in enumerate(text.mentions):
         best_prediction = 0.0
         best_link = None
@@ -157,9 +158,13 @@ def all2all(text, threshold):
                 text.merge_sets(best_link.set, mnt1.set)
             elif not best_link.set and not mnt1.set:
                 str_set_id = 'set_%d' % last_set_id
+                while str_set_id in sets:
+                    last_set_id += 1
+                    str_set_id = 'set_%d' % last_set_id
                 best_link.set = str_set_id
                 mnt1.set = str_set_id
-                last_set_id += 1
+                sets[str_set_id] = [best_link, mnt1]
+
 
 
 # entity based resolve algorithm
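A note on the resolve.py change: besides swapping the all2all_v1 and all2all_debug names, the patch extends the cluster-assignment step so that mentions may already belong to a set before resolution starts (sieve output), giving four cases instead of two, and all2all now seeds its set registry from text.get_sets() and skips over set ids already taken when minting new ones. A standalone sketch of that four-way branch (assign_to_set is a hypothetical helper extracted for illustration; text, sets, mnt1, best_link, and merge_sets behave as in the diffs above):

def assign_to_set(text, sets, mnt1, best_link, last_set_id):
    if best_link.set and not mnt1.set:
        mnt1.set = best_link.set                  # join best_link's cluster
    elif not best_link.set and mnt1.set:
        best_link.set = mnt1.set                  # pull best_link into mnt1's cluster
    elif best_link.set and mnt1.set:
        text.merge_sets(best_link.set, mnt1.set)  # both clustered already: merge
    else:
        # Neither mention is clustered yet: mint a fresh id, skipping ids the
        # sieve system already assigned so its clusters are not clobbered.
        str_set_id = 'set_%d' % last_set_id
        while str_set_id in sets:
            last_set_id += 1
            str_set_id = 'set_%d' % last_set_id
        best_link.set = mnt1.set = str_set_id
        sets[str_set_id] = [best_link, mnt1]
    return last_set_id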