Added all2all resolve algorithm.

Bartłomiej Nitoń
1 parent db32de75
Showing 4 changed files with 106 additions and 1 deletions
corneferencer/entities.py
corneferencer/main.py
corneferencer/resolvers/constants.py
corneferencer/resolvers/resolve.py
@@ -13,6 +13,11 @@ class Text:
                 return mnt.set
         return None
  
+    def merge_sets(self, set1, set2):
+        """Move every mention currently labelled ``set1`` into ``set2``.
+
+        Mentions whose ``set`` attribute equals ``set1`` get it overwritten
+        with ``set2``; all other mentions are left untouched.  Nothing is
+        returned - the mentions are updated in place.
+        """
+        members = [mention for mention in self.mentions if mention.set == set1]
+        for member in members:
+            member.set = set2
+
  
 class Mention:
  
@@ -89,6 +89,8 @@ def process_file(inpath, outpath, informat, resolver, threshold):
             resolve.closest(text, threshold)
         elif resolver == 'siamese':
             resolve.siamese(text, threshold)
+        elif resolver == 'all2all':
+            resolve.all2all(text, threshold)
         mmax.write(inpath, outpath, text)
  
  
 # -*- coding: utf-8 -*-
  
-RESOLVERS = ['entity_based', 'incremental', 'closest', 'siamese']
+RESOLVERS = ['entity_based', 'incremental', 'closest', 'siamese', 'all2all']
  
 NOUN_TAGS = ['subst', 'ger', 'depr']
 PPRON_TAGS = ['ppron12', 'ppron3']
@@ -61,6 +61,104 @@ def incremental(text, threshold):
                     last_set_id += 1
  
  
+# all2all resolve algorithm
+def all2all_v1(text, threshold):
+    """First version of the all2all resolver (prints its progress).
+
+    Each mention that has no set yet is scored against every other,
+    non-intersecting mention of ``text``.  The candidate with the highest
+    model prediction above ``threshold`` becomes its link: the mention
+    joins the candidate's set, or both get a fresh ``set_<n>`` id when the
+    candidate has no set either.  Mentions already in a set are skipped.
+    """
+    next_set_no = 0
+    for idx, mention in enumerate(text.mentions):
+        print ('!!!!!!!!!!%s!!!!!!!!!!!' % mention.text)
+        if mention.set:
+            continue
+        top_score = 0.0
+        top_candidate = None
+        for other_idx, candidate in enumerate(text.mentions):
+            if other_idx == idx or features.pair_intersect(mention, candidate):
+                continue
+            # The mention with the lower index in text.mentions is passed
+            # as the antecedent, the other one as the anaphor.
+            if other_idx < idx:
+                ante, ana = candidate, mention
+            else:
+                ante, ana = mention, candidate
+            sample = numpy.asarray([get_pair_vector(ante, ana)],
+                                   dtype=numpy.float32)
+            prediction = NEURAL_MODEL.predict(sample)[0]
+            print (u'%s >> %f' % (candidate.text, prediction))
+            if prediction > threshold and prediction > top_score:
+                top_score = prediction
+                top_candidate = candidate
+        if top_candidate is None:
+            continue
+        print (u'best: %s' % top_candidate.text)
+        if not top_candidate.set:
+            # Neither mention belongs to a set yet: open a new one.
+            top_candidate.set = 'set_%d' % next_set_no
+            next_set_no += 1
+        mention.set = top_candidate.set
+
+
+def all2all_debug(text, threshold):
+    """Verbose variant of the all2all resolver that prints every decision.
+
+    For each mention of ``text`` the model scores every other mention that
+    does not intersect with it and is not already in the same set.  The
+    candidate with the highest prediction above ``threshold`` is linked:
+
+    * both mentions in (different) sets - the candidate's set is merged
+      into the current mention's set,
+    * only the candidate in a set - the current mention joins it,
+    * only the current mention in a set - the candidate joins it,
+    * neither in a set - both get a fresh ``set_<n>`` id.
+
+    The mentions are modified in place; nothing is returned.
+    """
+    last_set_id = 0
+    for pos1, mnt1 in enumerate(text.mentions):
+        print ('!!!!!!!!!!%s!!!!!!!!!!!' % mnt1.text)
+        best_prediction = 0.0
+        best_link = None
+        for pos2, mnt2 in enumerate(text.mentions):
+            if pos1 == pos2:
+                continue
+            # Mentions that already share a set need no further link.
+            if mnt1.set and mnt1.set == mnt2.set:
+                continue
+            if features.pair_intersect(mnt1, mnt2):
+                continue
+            # The mention with the lower index in text.mentions is passed
+            # as the antecedent, the other one as the anaphor.
+            if pos2 < pos1:
+                ante, ana = mnt2, mnt1
+            else:
+                ante, ana = mnt1, mnt2
+            pair_vec = get_pair_vector(ante, ana)
+            sample = numpy.asarray([pair_vec], dtype=numpy.float32)
+            prediction = NEURAL_MODEL.predict(sample)[0]
+            print (u'mnt2: %s | %s == %s >> %f' % (mnt2.text, ante.text, ana.text, prediction))
+            if prediction > threshold and prediction > best_prediction:
+                best_prediction = prediction
+                best_link = mnt2
+        if best_link is None:
+            continue
+        print (u'best: %s >> %f, best set: %s, mnt1_set: %s' % (best_link.text, best_prediction, best_link.set, mnt1.set))
+        if best_link.set and mnt1.set:
+            text.merge_sets(best_link.set, mnt1.set)
+        elif best_link.set:
+            mnt1.set = best_link.set
+        elif mnt1.set:
+            # Previously unhandled: the current mention already has a set
+            # and its best link has none, so the link was silently dropped.
+            best_link.set = mnt1.set
+        else:
+            str_set_id = 'set_%d' % last_set_id
+            best_link.set = str_set_id
+            mnt1.set = str_set_id
+            last_set_id += 1
+        print (u'best set: %s, mnt1_set: %s' % (best_link.set, mnt1.set))
+
+
+def all2all(text, threshold):
+    """Resolve coreference by linking each mention to its best partner.
+
+    For each mention of ``text`` the model scores every other mention that
+    does not intersect with it and is not already in the same set.  The
+    candidate with the highest prediction above ``threshold`` is linked:
+
+    * both mentions in (different) sets - the candidate's set is merged
+      into the current mention's set,
+    * only the candidate in a set - the current mention joins it,
+    * only the current mention in a set - the candidate joins it,
+    * neither in a set - both get a fresh ``set_<n>`` id.
+
+    The mentions are modified in place; nothing is returned.
+    """
+    last_set_id = 0
+    for pos1, mnt1 in enumerate(text.mentions):
+        best_prediction = 0.0
+        best_link = None
+        for pos2, mnt2 in enumerate(text.mentions):
+            if pos1 == pos2:
+                continue
+            # Mentions that already share a set need no further link.
+            if mnt1.set and mnt1.set == mnt2.set:
+                continue
+            if features.pair_intersect(mnt1, mnt2):
+                continue
+            # The mention with the lower index in text.mentions is passed
+            # as the antecedent, the other one as the anaphor.
+            if pos2 < pos1:
+                ante, ana = mnt2, mnt1
+            else:
+                ante, ana = mnt1, mnt2
+            pair_vec = get_pair_vector(ante, ana)
+            sample = numpy.asarray([pair_vec], dtype=numpy.float32)
+            prediction = NEURAL_MODEL.predict(sample)[0]
+            if prediction > threshold and prediction > best_prediction:
+                best_prediction = prediction
+                best_link = mnt2
+        if best_link is None:
+            continue
+        if best_link.set and mnt1.set:
+            text.merge_sets(best_link.set, mnt1.set)
+        elif best_link.set:
+            mnt1.set = best_link.set
+        elif mnt1.set:
+            # Previously unhandled: the current mention already has a set
+            # and its best link has none, so the link was silently dropped.
+            best_link.set = mnt1.set
+        else:
+            str_set_id = 'set_%d' % last_set_id
+            best_link.set = str_set_id
+            mnt1.set = str_set_id
+            last_set_id += 1
+
+
 # entity based resolve algorithm
 def entity_based(text, threshold):
     sets = []