Commit 91b27b24a9fb0d6427debc133c923e3188f9a768

Authored by Mateusz Kopeć
1 parent e058b3c2

zeros corpus wip

Showing 32 changed files with 401 additions and 86 deletions
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
1 -package pl.waw.ipipan.zil.summ.nicolas; 1 +package pl.waw.ipipan.zil.summ.nicolas.common;
2 2
  3 +import com.google.common.base.Charsets;
3 import weka.classifiers.Classifier; 4 import weka.classifiers.Classifier;
  5 +import weka.classifiers.functions.Logistic;
4 import weka.classifiers.trees.RandomForest; 6 import weka.classifiers.trees.RandomForest;
5 7
  8 +import java.nio.charset.Charset;
  9 +
6 10
7 public class Constants { 11 public class Constants {
8 12
9 public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; 13 public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";
10 public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; 14 public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";
  15 + public static final String ZERO_MODEL_PATH = "zeros_model.bin";
  16 +
11 public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; 17 public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";
12 public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; 18 public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";
  19 + public static final String ZERO_DATASET_PATH = "zeros_train.arff";
  20 +
  21 + public static final Charset ENCODING = Charsets.UTF_8;
13 22
/** Not instantiable: this class only exposes static constants and classifier factories. */
private Constants() {
    // intentionally empty
}
16 25
17 - public static Classifier getClassifier() { 26 + public static Classifier getMentionClassifier() {
18 RandomForest classifier = new RandomForest(); 27 RandomForest classifier = new RandomForest();
19 classifier.setNumIterations(250); 28 classifier.setNumIterations(250);
20 classifier.setSeed(0); 29 classifier.setSeed(0);
@@ -22,7 +31,6 @@ public class Constants { @@ -22,7 +31,6 @@ public class Constants {
22 return classifier; 31 return classifier;
23 } 32 }
24 33
25 -  
26 public static Classifier getSentencesClassifier() { 34 public static Classifier getSentencesClassifier() {
27 RandomForest classifier = new RandomForest(); 35 RandomForest classifier = new RandomForest();
28 classifier.setNumIterations(250); 36 classifier.setNumIterations(250);
@@ -30,4 +38,9 @@ public class Constants { @@ -30,4 +38,9 @@ public class Constants {
30 classifier.setNumExecutionSlots(8); 38 classifier.setNumExecutionSlots(8);
31 return classifier; 39 return classifier;
32 } 40 }
  41 +
  42 + public static Classifier getZerosClassifier() {
  43 + Logistic classifier = new Logistic();
  44 + return classifier;
  45 + }
33 } 46 }
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -101,7 +101,7 @@ public class Utils { @@ -101,7 +101,7 @@ public class Utils {
101 STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); 101 STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co"));
102 } 102 }
103 103
104 - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { 104 + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) {
105 Map<TMention, String> mention2orth = Maps.newHashMap(); 105 Map<TMention, String> mention2orth = Maps.newHashMap();
106 for (TSentence s : sents) { 106 for (TSentence s : sents) {
107 Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); 107 Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
@@ -110,7 +110,7 @@ public class Utils { @@ -110,7 +110,7 @@ public class Utils {
110 StringBuffer mentionOrth = new StringBuffer(); 110 StringBuffer mentionOrth = new StringBuffer();
111 for (String tokId : m.getChildIds()) { 111 for (String tokId : m.getChildIds()) {
112 TToken token = tokId2tok.get(tokId); 112 TToken token = tokId2tok.get(tokId);
113 - if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { 113 + if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
114 continue; 114 continue;
115 } 115 }
116 116
@@ -142,8 +142,16 @@ public class Utils { @@ -142,8 +142,16 @@ public class Utils {
142 } 142 }
143 143
144 public static String loadSentence2Orth(TSentence sentence) { 144 public static String loadSentence2Orth(TSentence sentence) {
  145 + return loadSentence2Orth(sentence, Sets.newHashSet());
  146 + }
  147 +
  148 + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) {
145 StringBuilder sb = new StringBuilder(); 149 StringBuilder sb = new StringBuilder();
146 for (TToken token : sentence.getTokens()) { 150 for (TToken token : sentence.getTokens()) {
  151 + if (tokenIdsToSkip.contains(token.getId())) {
  152 + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence));
  153 + continue;
  154 + }
147 if (!token.isNoPrecedingSpace()) 155 if (!token.isNoPrecedingSpace())
148 sb.append(" "); 156 sb.append(" ");
149 sb.append(token.getOrth()); 157 sb.append(token.getOrth());
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.common;
  2 +
  3 +import java.io.IOException;
  4 +import java.io.InputStream;
  5 +import java.io.ObjectInputStream;
  6 +import java.io.ObjectStreamClass;
  7 +
  8 +
  9 +public class VersionIgnoringObjectInputStream extends ObjectInputStream {
  10 +
  11 + public VersionIgnoringObjectInputStream(InputStream in) throws IOException {
  12 + super(in);
  13 + }
  14 +
  15 + protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException {
  16 + ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor
  17 + Class localClass; // the class in the local JVM that this descriptor represents.
  18 + try {
  19 + localClass = Class.forName(resultClassDescriptor.getName());
  20 + } catch (ClassNotFoundException e) {
  21 + return resultClassDescriptor;
  22 + }
  23 + ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass);
  24 + if (localClassDescriptor != null) { // only if class implements serializable
  25 + final long localSUID = localClassDescriptor.getSerialVersionUID();
  26 + final long streamSUID = resultClassDescriptor.getSerialVersionUID();
  27 + if (streamSUID != localSUID) { // check for serialVersionUID mismatch.
  28 + resultClassDescriptor = localClassDescriptor; // Use local class descriptor for deserialization
  29 + }
  30 + }
  31 + return resultClassDescriptor;
  32 + }
  33 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
1 -package pl.waw.ipipan.zil.summ.nicolas.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.common.features;
2 2
3 import com.google.common.collect.*; 3 import com.google.common.collect.*;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
1 -package pl.waw.ipipan.zil.summ.nicolas.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.common.features;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
@@ -17,6 +17,8 @@ import static java.util.stream.Collectors.toMap; @@ -17,6 +17,8 @@ import static java.util.stream.Collectors.toMap;
17 17
18 public class FeatureHelper { 18 public class FeatureHelper {
19 19
  20 + private final TText text;
  21 +
20 private final List<TMention> mentions; 22 private final List<TMention> mentions;
21 private final Map<String, TMention> mentionId2mention; 23 private final Map<String, TMention> mentionId2mention;
22 private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); 24 private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap();
@@ -37,6 +39,8 @@ public class FeatureHelper { @@ -37,6 +39,8 @@ public class FeatureHelper {
37 39
38 40
39 public FeatureHelper(TText preprocessedText) { 41 public FeatureHelper(TText preprocessedText) {
  42 + text = preprocessedText;
  43 +
40 mentions = preprocessedText.getParagraphs().stream() 44 mentions = preprocessedText.getParagraphs().stream()
41 .flatMap(p -> p.getSentences().stream()) 45 .flatMap(p -> p.getSentences().stream())
42 .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); 46 .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList());
@@ -55,7 +59,7 @@ public class FeatureHelper { @@ -55,7 +59,7 @@ public class FeatureHelper {
55 int sentIdx = 0; 59 int sentIdx = 0;
56 int mentionIdx = 0; 60 int mentionIdx = 0;
57 for (TParagraph par : preprocessedText.getParagraphs()) { 61 for (TParagraph par : preprocessedText.getParagraphs()) {
58 - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences()); 62 + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false);
59 mention2Orth.putAll(m2o); 63 mention2Orth.putAll(m2o);
60 Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); 64 Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences());
61 mention2Base.putAll(m2b); 65 mention2Base.putAll(m2b);
@@ -182,4 +186,18 @@ public class FeatureHelper { @@ -182,4 +186,18 @@ public class FeatureHelper {
182 public TCoreference getMentionCluster(TMention tMention) { 186 public TCoreference getMentionCluster(TMention tMention) {
183 return this.mention2coref.get(tMention); 187 return this.mention2coref.get(tMention);
184 } 188 }
  189 +
  190 + public String getSentenceOrth(TSentence sentence) {
  191 + StringBuilder sb = new StringBuilder();
  192 + for (TToken token : sentence.getTokens()) {
  193 + if (!token.isNoPrecedingSpace())
  194 + sb.append(" ");
  195 + sb.append(token.getOrth());
  196 + }
  197 + return sb.toString().trim();
  198 + }
  199 +
  200 + public TText getText() {
  201 + return text;
  202 + }
185 } 203 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
1 -package pl.waw.ipipan.zil.summ.nicolas.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.common.features;
2 2
3 import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; 3 import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
4 4
nicolas-core/pom.xml
@@ -21,6 +21,10 @@ @@ -21,6 +21,10 @@
21 <groupId>pl.waw.ipipan.zil.summ</groupId> 21 <groupId>pl.waw.ipipan.zil.summ</groupId>
22 <artifactId>nicolas-model</artifactId> 22 <artifactId>nicolas-model</artifactId>
23 </dependency> 23 </dependency>
  24 + <dependency>
  25 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  26 + <artifactId>nicolas-zero</artifactId>
  27 + </dependency>
24 28
25 <dependency> 29 <dependency>
26 <groupId>pl.waw.ipipan.zil.summ</groupId> 30 <groupId>pl.waw.ipipan.zil.summ</groupId>
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -6,6 +6,7 @@ import com.google.common.collect.Sets; @@ -6,6 +6,7 @@ import com.google.common.collect.Sets;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 12 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
@@ -8,12 +8,13 @@ import org.slf4j.LoggerFactory; @@ -8,12 +8,13 @@ import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.Constants;  
12 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  17 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector;
17 import weka.classifiers.Classifier; 18 import weka.classifiers.Classifier;
18 import weka.core.Instance; 19 import weka.core.Instance;
19 import weka.core.Instances; 20 import weka.core.Instances;
@@ -29,8 +30,8 @@ public class ApplyModel2 { @@ -29,8 +30,8 @@ public class ApplyModel2 {
29 30
30 private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); 31 private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class);
31 32
32 - private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test";  
33 - private static final String TARGET_DIR = "summaries"; 33 + private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test";
  34 + private static final String TARGET_DIR = "corpora/summaries";
34 35
35 public static void main(String[] args) throws Exception { 36 public static void main(String[] args) throws Exception {
36 Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); 37 Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
@@ -39,6 +40,8 @@ public class ApplyModel2 { @@ -39,6 +40,8 @@ public class ApplyModel2 {
39 Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); 40 Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
40 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); 41 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
41 42
  43 + ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
  44 +
42 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); 45 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH);
43 int i = 1; 46 int i = 1;
44 double avgSize = 0; 47 double avgSize = 0;
@@ -49,10 +52,10 @@ public class ApplyModel2 { @@ -49,10 +52,10 @@ public class ApplyModel2 {
49 = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); 52 = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
50 53
51 int targetSize = calculateTargetSize(text); 54 int targetSize = calculateTargetSize(text);
52 - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); 55 + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector);
53 int size = Utils.tokenize(summary).size(); 56 int size = Utils.tokenize(summary).size();
54 avgSize += size; 57 avgSize += size;
55 - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) { 58 + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) {
56 bw.append(summary); 59 bw.append(summary);
57 } 60 }
58 61
@@ -71,12 +74,14 @@ public class ApplyModel2 { @@ -71,12 +74,14 @@ public class ApplyModel2 {
71 return (int) (0.2 * tokenCount); 74 return (int) (0.2 * tokenCount);
72 } 75 }
73 76
74 - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { 77 + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception {
75 List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); 78 List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
76 79
77 - StringBuffer sb = new StringBuffer(); 80 + Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences);
  81 +
  82 + StringBuilder sb = new StringBuilder();
78 for (TSentence sent : selectedSentences) { 83 for (TSentence sent : selectedSentences) {
79 - sb.append(" " + Utils.loadSentence2Orth(sent)); 84 + sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds));
80 } 85 }
81 return sb.toString().trim(); 86 return sb.toString().trim();
82 } 87 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -2,9 +2,9 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; @@ -2,9 +2,9 @@ package pl.waw.ipipan.zil.summ.nicolas.mention;
2 2
3 import com.google.common.collect.*; 3 import com.google.common.collect.*;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5 -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;  
6 -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;  
7 -import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; 5 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  7 +import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation;
8 import weka.core.Attribute; 8 import weka.core.Attribute;
9 9
10 import java.io.File; 10 import java.io.File;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
@@ -19,7 +19,7 @@ public class MentionScorer { @@ -19,7 +19,7 @@ public class MentionScorer {
19 Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); 19 Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
20 20
21 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); 21 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
22 - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences); 22 + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true);
23 23
24 return booleanTokenIntersection(mention2Orth, tokenCounts); 24 return booleanTokenIntersection(mention2Orth, tokenCounts);
25 } 25 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
@@ -7,7 +7,7 @@ import org.slf4j.Logger; @@ -7,7 +7,7 @@ import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 7 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 10 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13 import weka.core.Instance; 13 import weka.core.Instance;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.core.Instances; 8 import weka.core.Instances;
9 import weka.core.converters.ArffLoader; 9 import weka.core.converters.ArffLoader;
@@ -28,7 +28,7 @@ public class TrainModel { @@ -28,7 +28,7 @@ public class TrainModel {
28 StopWatch watch = new StopWatch(); 28 StopWatch watch = new StopWatch();
29 watch.start(); 29 watch.start();
30 30
31 - Classifier classifier = Constants.getClassifier(); 31 + Classifier classifier = Constants.getMentionClassifier();
32 32
33 LOG.info("Building classifier..."); 33 LOG.info("Building classifier...");
34 classifier.buildClassifier(instances); 34 classifier.buildClassifier(instances);
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.classifiers.evaluation.Evaluation; 8 import weka.classifiers.evaluation.Evaluation;
9 import weka.core.Instances; 9 import weka.core.Instances;
@@ -32,7 +32,7 @@ public class Crossvalidate { @@ -32,7 +32,7 @@ public class Crossvalidate {
32 StopWatch watch = new StopWatch(); 32 StopWatch watch = new StopWatch();
33 watch.start(); 33 watch.start();
34 34
35 - Classifier tree = Constants.getClassifier(); 35 + Classifier tree = Constants.getMentionClassifier();
36 36
37 Evaluation eval = new Evaluation(instances); 37 Evaluation eval = new Evaluation(instances);
38 eval.crossValidateModel(tree, instances, 10, new Random(1)); 38 eval.crossValidateModel(tree, instances, 10, new Random(1));
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.classifiers.evaluation.Evaluation; 8 import weka.classifiers.evaluation.Evaluation;
9 import weka.core.Instances; 9 import weka.core.Instances;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
@@ -8,7 +8,7 @@ import org.slf4j.LoggerFactory; @@ -8,7 +8,7 @@ import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 11 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 12 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5 -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;  
6 -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 5 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
7 import weka.core.Attribute; 7 import weka.core.Attribute;
8 8
9 import java.util.List; 9 import java.util.List;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
@@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence;
3 import com.google.common.collect.HashMultiset; 3 import com.google.common.collect.HashMultiset;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import com.google.common.collect.Multiset; 5 import com.google.common.collect.Multiset;
  6 +import com.google.common.collect.Sets;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.core.Instances; 8 import weka.core.Instances;
9 import weka.core.converters.ArffLoader; 9 import weka.core.converters.ArffLoader;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence.test; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence.test;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.classifiers.evaluation.Evaluation; 8 import weka.classifiers.evaluation.Evaluation;
9 import weka.core.Instances; 9 import weka.core.Instances;
nicolas-zero/pom.xml
@@ -27,6 +27,10 @@ @@ -27,6 +27,10 @@
27 <groupId>commons-io</groupId> 27 <groupId>commons-io</groupId>
28 <artifactId>commons-io</artifactId> 28 <artifactId>commons-io</artifactId>
29 </dependency> 29 </dependency>
  30 + <dependency>
  31 + <groupId>org.apache.commons</groupId>
  32 + <artifactId>commons-lang3</artifactId>
  33 + </dependency>
30 34
31 <!-- logging --> 35 <!-- logging -->
32 <dependency> 36 <dependency>
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
@@ -12,7 +12,10 @@ import java.util.Set; @@ -12,7 +12,10 @@ import java.util.Set;
12 12
13 public class CandidateFinder { 13 public class CandidateFinder {
14 14
15 - public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { 15 + private CandidateFinder() {
  16 + }
  17 +
  18 + public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) {
16 List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); 19 List<ZeroSubjectCandidate> candidates = Lists.newArrayList();
17 20
18 Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); 21 Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  7 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
  8 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  9 +import weka.core.Attribute;
  10 +
  11 +import java.util.List;
  12 +import java.util.Map;
  13 +
  14 +
  15 +public class ZeroFeatureExtractor extends FeatureExtractor {
  16 +
  17 + public ZeroFeatureExtractor() {
  18 +
  19 + for (String prefix : new String[]{"antecedent", "candidate"}) {
  20 + addNumericAttribute(prefix + "_index_in_sent");
  21 + addNumericAttribute(prefix + "_token_count");
  22 + addBinaryAttribute(prefix + "_is_zero");
  23 + addBinaryAttribute(prefix + "_is_pronoun");
  24 + addBinaryAttribute(prefix + "_is_named");
  25 + }
  26 +
  27 + addBinaryAttribute("pair_equal_orth");
  28 +
  29 + addNominalAttribute("score", Lists.newArrayList("bad", "good"));
  30 + fillSortedAttributes("score");
  31 + }
  32 +
  33 + public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) {
  34 + Map<ZeroSubjectCandidate, Map<Attribute, Double>> result = Maps.newHashMap();
  35 +
  36 + FeatureHelper helper = new FeatureHelper(text);
  37 + for (ZeroSubjectCandidate candidate : candidates) {
  38 + Map<Attribute, Double> candidateFeatures = calculateFeatures(candidate, helper);
  39 + result.put(candidate, candidateFeatures);
  40 + }
  41 +
  42 + return result;
  43 + }
  44 +
  45 + private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) {
  46 +
  47 + Map<Attribute, Double> candidateFeatures = Maps.newHashMap();
  48 + candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  49 +
  50 + TMention mention = candidate.getZeroCandidateMention();
  51 + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get();
  52 +
  53 + addMentionFeatures(helper, candidateFeatures, mention, "candidate");
  54 + addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent");
  55 +
  56 + candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equalsIgnoreCase(helper.getMentionOrth(antecedent))));
  57 +
  58 + return candidateFeatures;
  59 + }
  60 +
  61 + private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) {
  62 + candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));
  63 + candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());
  64 + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject()));
  65 + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*")));
  66 + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));
  67 + }
  68 +
  69 +}
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
1 package pl.waw.ipipan.zil.summ.nicolas.zero; 1 package pl.waw.ipipan.zil.summ.nicolas.zero;
2 2
  3 +import com.google.common.collect.Sets;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  8 +import pl.waw.ipipan.zil.summ.nicolas.zero.train.TrainingDataExtractor;
  9 +import weka.classifiers.Classifier;
  10 +import weka.core.Instance;
  11 +import weka.core.Instances;
  12 +
  13 +import java.io.IOException;
  14 +import java.util.List;
  15 +import java.util.Map;
  16 +import java.util.Set;
  17 +import java.util.stream.Collectors;
3 18
4 public class ZeroSubjectInjector { 19 public class ZeroSubjectInjector {
  20 +
  21 + private final ZeroFeatureExtractor featureExtractor;
  22 + private final Classifier classifier;
  23 + private final Instances instances;
  24 +
  25 + public ZeroSubjectInjector() throws IOException, ClassNotFoundException {
  26 + classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH);
  27 + featureExtractor = new ZeroFeatureExtractor();
  28 + instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  29 + }
  30 +
  31 + public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception {
  32 + Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet());
  33 + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
  34 + Map<ZeroSubjectCandidate, Instance> candidate2instance =
  35 + TrainingDataExtractor.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  36 +
  37 + Set<String> result = Sets.newHashSet();
  38 + for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) {
  39 + ZeroSubjectCandidate candidate = entry.getKey();
  40 + Instance instance = entry.getValue();
  41 + instance.setDataset(instances);
  42 + instance.setClassMissing();
  43 + boolean good = classifier.classifyInstance(instance) > 0.5;
  44 + if (good) {
  45 + result.addAll(candidate.getZeroCandidateMention().getChildIds());
  46 + }
  47 + }
  48 + return result;
  49 + }
  50 +
5 } 51 }
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero.train;
  2 +
  3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import weka.classifiers.Classifier;
  8 +import weka.core.Instances;
  9 +import weka.core.converters.ArffLoader;
  10 +
  11 +import java.io.File;
  12 +import java.io.FileOutputStream;
  13 +import java.io.ObjectOutputStream;
  14 +
  15 +
  16 +public class TrainModel {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
  19 +
  20 + private TrainModel() {
  21 + }
  22 +
  23 + public static void main(String[] args) throws Exception {
  24 +
  25 + ArffLoader loader = new ArffLoader();
  26 + loader.setFile(new File(Constants.ZERO_DATASET_PATH));
  27 + Instances instances = loader.getDataSet();
  28 + instances.setClassIndex(0);
  29 + LOG.info(instances.size() + " instances loaded.");
  30 + LOG.info(instances.numAttributes() + " attributes for each instance.");
  31 +
  32 + StopWatch watch = new StopWatch();
  33 + watch.start();
  34 +
  35 + Classifier classifier = Constants.getZerosClassifier();
  36 +
  37 + LOG.info("Building classifier...");
  38 + classifier.buildClassifier(instances);
  39 + LOG.info("...done.");
  40 +
  41 + try (ObjectOutputStream oos = new ObjectOutputStream(
  42 + new FileOutputStream(Constants.ZERO_MODEL_PATH))) {
  43 + oos.writeObject(classifier);
  44 + }
  45 +
  46 + watch.stop();
  47 + LOG.info("Elapsed time: " + watch);
  48 +
  49 + LOG.info(classifier.toString());
  50 + }
  51 +}
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainingDataExtractor.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.zero.train;
2 2
3 -import com.google.common.collect.Lists;  
4 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
5 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
6 -import org.apache.commons.csv.CSVFormat;  
7 -import org.apache.commons.csv.CSVPrinter;  
8 -import org.apache.commons.csv.QuoteMode;  
9 import org.apache.commons.io.IOUtils; 5 import org.apache.commons.io.IOUtils;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; 7 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 8 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
  11 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
  13 +import weka.core.Attribute;
  14 +import weka.core.DenseInstance;
  15 +import weka.core.Instance;
  16 +import weka.core.Instances;
  17 +import weka.core.converters.ArffSaver;
13 18
14 import java.io.File; 19 import java.io.File;
15 import java.io.FileReader; 20 import java.io.FileReader;
16 -import java.io.FileWriter;  
17 import java.io.IOException; 21 import java.io.IOException;
18 import java.util.List; 22 import java.util.List;
19 import java.util.Map; 23 import java.util.Map;
20 import java.util.Set; 24 import java.util.Set;
21 25
22 -public class Zero { 26 +public class TrainingDataExtractor {
23 27
24 private static final String IDS_PATH = "corpora/summaries_dev"; 28 private static final String IDS_PATH = "corpora/summaries_dev";
25 private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; 29 private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/";
  30 + private static final String GOLD_ZEROS_PATH = "/zeros.tsv";
26 31
27 - private Zero() { 32 + private TrainingDataExtractor() {
28 } 33 }
29 34
30 public static void main(String[] args) throws IOException { 35 public static void main(String[] args) throws IOException {
31 36
32 - CandidateFinder candidateFinder = new CandidateFinder();  
33 -  
34 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); 37 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
35 Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); 38 Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH);
36 39
37 - List<List<Object>> rows = Lists.newArrayList(); 40 + ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH);
  41 + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
  42 +
  43 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  44 +
38 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 45 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
39 String textId = entry.getKey(); 46 String textId = entry.getKey();
40 47
41 TText text = entry.getValue(); 48 TText text = entry.getValue();
42 - ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text);  
43 -  
44 Set<String> sentenceIds = id2sentIds.get(textId); 49 Set<String> sentenceIds = id2sentIds.get(textId);
  50 + FeatureHelper featureHelper = new FeatureHelper(text);
45 51
46 - List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds); 52 + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
  53 + Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
47 54
48 - for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) {  
49 - List<Object> row = Lists.newArrayList();  
50 - row.add("C");  
51 - row.add(textId);  
52 - row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention()));  
53 - row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence()));  
54 - row.add(thriftTextHelper.getSentenceText(candidate.getSentence()));  
55 - rows.add(row); 55 + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
  56 + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
  57 + Instance instance = entry2.getValue();
  58 + instance.setDataset(instances);
  59 + instance.setClassValue(good ? 1 : 0);
  60 + instances.add(instance);
56 } 61 }
57 } 62 }
58 63
59 - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {  
60 - for (List<Object> row : rows) {  
61 - csvPrinter.printRecord(row); 64 + saveInstancesToFile(instances);
  65 + }
  66 +
  67 + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
  68 + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
  69 + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
  70 + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
  71 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  72 + Map<Attribute, Double> sentenceFeatures = entry.getValue();
  73 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  74 + instance.setValue(attribute, sentenceFeatures.get(attribute));
62 } 75 }
  76 + candidate2instance.put(entry.getKey(), instance);
63 } 77 }
  78 + return candidate2instance;
  79 + }
64 80
  81 + private static void saveInstancesToFile(Instances instances) throws IOException {
  82 + ArffSaver saver = new ArffSaver();
  83 + saver.setInstances(instances);
  84 + saver.setFile(new File(Constants.ZERO_DATASET_PATH));
  85 + saver.writeBatch();
65 } 86 }
66 87
67 private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { 88 private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/ZeroScorer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero.train;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import org.apache.commons.csv.CSVFormat;
  5 +import org.apache.commons.csv.CSVParser;
  6 +import org.apache.commons.csv.CSVRecord;
  7 +import org.apache.commons.csv.QuoteMode;
  8 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
  11 +
  12 +import java.io.IOException;
  13 +import java.io.InputStream;
  14 +import java.io.InputStreamReader;
  15 +import java.util.List;
  16 +import java.util.Map;
  17 +
  18 +public class ZeroScorer {
  19 +
  20 + private static final char DELIMITER = '\t';
  21 +
  22 + private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap();
  23 +
  24 + public ZeroScorer(String goldZerosPath) throws IOException {
  25 + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath);
  26 + InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING);
  27 + CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) {
  28 + List<CSVRecord> records = parser.getRecords();
  29 + for (CSVRecord record : records) {
  30 + candidateEncoding2Decision.put(encode(record.get(2), record.get(3), record.get(4)), record.get(0).equalsIgnoreCase("C"));
  31 + }
  32 + }
  33 + }
  34 +
  35 + private String encode(String mentionOrth, String firstSentenceOrth, String secondSentenceOrth) {
  36 + return mentionOrth + DELIMITER + firstSentenceOrth + DELIMITER + secondSentenceOrth;
  37 + }
  38 +
  39 + private String encode(ZeroSubjectCandidate candidate, FeatureHelper helper) {
  40 + String mentionOrth = helper.getMentionOrth(candidate.getZeroCandidateMention());
  41 + String firstSentenceOrth = helper.getSentenceOrth(candidate.getPreviousSentence());
  42 + String secondSentenceOrth = helper.getSentenceOrth(candidate.getSentence());
  43 + return encode(mentionOrth, firstSentenceOrth, secondSentenceOrth);
  44 + }
  45 +
  46 + public boolean isValidCandidate(ZeroSubjectCandidate candidate, FeatureHelper helper) {
  47 + return candidateEncoding2Decision.get(encode(candidate, helper));
  48 + }
  49 +
  50 +}
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -2,12 +2,11 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; @@ -2,12 +2,11 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
2 2
3 import com.google.common.collect.Sets; 3 import com.google.common.collect.Sets;
4 import org.apache.commons.io.IOUtils; 4 import org.apache.commons.io.IOUtils;
5 -import org.junit.BeforeClass;  
6 import org.junit.Test; 5 import org.junit.Test;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper;  
10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 8 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
11 10
12 import java.io.IOException; 11 import java.io.IOException;
13 import java.io.InputStream; 12 import java.io.InputStream;
@@ -22,18 +21,11 @@ public class CandidateFinderTest { @@ -22,18 +21,11 @@ public class CandidateFinderTest {
22 private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; 21 private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin";
23 private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; 22 private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt";
24 23
25 - private static CandidateFinder candidateFinder;  
26 -  
27 - @BeforeClass  
28 - public static void init() {  
29 - candidateFinder = new CandidateFinder();  
30 - }  
31 -  
32 @Test 24 @Test
33 public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { 25 public void shouldFindZeroSubjectCandidateInSampleText() throws Exception {
34 - ThriftTextHelper sampleTextHelper = loadSampleTextHelper(); 26 + FeatureHelper sampleTextHelper = loadSampleTextHelper();
35 Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); 27 Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds();
36 - List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); 28 + List<ZeroSubjectCandidate> candidates = CandidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds);
37 assertEquals(1, candidates.size()); 29 assertEquals(1, candidates.size());
38 30
39 ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); 31 ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0);
@@ -41,9 +33,9 @@ public class CandidateFinderTest { @@ -41,9 +33,9 @@ public class CandidateFinderTest {
41 TSentence secondSentence = zeroSubjectCandidate.getSentence(); 33 TSentence secondSentence = zeroSubjectCandidate.getSentence();
42 TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); 34 TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention();
43 35
44 - assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence));  
45 - assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence));  
46 - assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate)); 36 + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceOrth(firstSentence));
  37 + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceOrth(secondSentence));
  38 + assertEquals("Ala", sampleTextHelper.getMentionOrth(zeroCandidate));
47 } 39 }
48 40
49 private Set<String> loadSampleTextSummarySentenceIds() throws IOException { 41 private Set<String> loadSampleTextSummarySentenceIds() throws IOException {
@@ -53,9 +45,9 @@ public class CandidateFinderTest { @@ -53,9 +45,9 @@ public class CandidateFinderTest {
53 } 45 }
54 } 46 }
55 47
56 - private ThriftTextHelper loadSampleTextHelper() throws IOException { 48 + private FeatureHelper loadSampleTextHelper() throws IOException {
57 try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { 49 try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
58 - return new ThriftTextHelper(Utils.loadThrifted(stream)); 50 + return new FeatureHelper(Utils.loadThrifted(stream));
59 } 51 }
60 } 52 }
61 } 53 }
62 \ No newline at end of file 54 \ No newline at end of file
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.zero;  
2 -  
3 -import org.junit.Test;  
4 -  
5 -public class ZeroSubjectInjectorTest {  
6 -  
7 - @Test  
8 - public void shouldInit() throws Exception {  
9 - ZeroSubjectInjector injector = new ZeroSubjectInjector();  
10 - }  
11 -}  
12 \ No newline at end of file 0 \ No newline at end of file
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin 0 → 100644
No preview for this file type
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt 0 → 100644
  1 +s-2.1
  2 +s-2.2
@@ -61,6 +61,11 @@ @@ -61,6 +61,11 @@
61 <artifactId>nicolas-common</artifactId> 61 <artifactId>nicolas-common</artifactId>
62 <version>${project.version}</version> 62 <version>${project.version}</version>
63 </dependency> 63 </dependency>
  64 + <dependency>
  65 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  66 + <artifactId>nicolas-zero</artifactId>
  67 + <version>${project.version}</version>
  68 + </dependency>
64 69
65 <!-- internal --> 70 <!-- internal -->
66 <dependency> 71 <dependency>