Commit 91b27b24a9fb0d6427debc133c923e3188f9a768
1 parent
e058b3c2
zeros corpus wip
Showing
32 changed files
with
401 additions
and
86 deletions
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | |
2 | 2 | |
3 | +import com.google.common.base.Charsets; | |
3 | 4 | import weka.classifiers.Classifier; |
5 | +import weka.classifiers.functions.Logistic; | |
4 | 6 | import weka.classifiers.trees.RandomForest; |
5 | 7 | |
8 | +import java.nio.charset.Charset; | |
9 | + | |
6 | 10 | |
7 | 11 | public class Constants { |
8 | 12 | |
9 | 13 | public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; |
10 | 14 | public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; |
15 | + public static final String ZERO_MODEL_PATH = "zeros_model.bin"; | |
16 | + | |
11 | 17 | public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; |
12 | 18 | public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; |
19 | + public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | |
20 | + | |
21 | + public static final Charset ENCODING = Charsets.UTF_8; | |
13 | 22 | |
14 | 23 | private Constants() { |
15 | 24 | } |
16 | 25 | |
17 | - public static Classifier getClassifier() { | |
26 | + public static Classifier getMentionClassifier() { | |
18 | 27 | RandomForest classifier = new RandomForest(); |
19 | 28 | classifier.setNumIterations(250); |
20 | 29 | classifier.setSeed(0); |
... | ... | @@ -22,7 +31,6 @@ public class Constants { |
22 | 31 | return classifier; |
23 | 32 | } |
24 | 33 | |
25 | - | |
26 | 34 | public static Classifier getSentencesClassifier() { |
27 | 35 | RandomForest classifier = new RandomForest(); |
28 | 36 | classifier.setNumIterations(250); |
... | ... | @@ -30,4 +38,9 @@ public class Constants { |
30 | 38 | classifier.setNumExecutionSlots(8); |
31 | 39 | return classifier; |
32 | 40 | } |
41 | + | |
42 | + public static Classifier getZerosClassifier() { | |
43 | + Logistic classifier = new Logistic(); | |
44 | + return classifier; | |
45 | + } | |
33 | 46 | } |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
... | ... | @@ -101,7 +101,7 @@ public class Utils { |
101 | 101 | STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); |
102 | 102 | } |
103 | 103 | |
104 | - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
104 | + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { | |
105 | 105 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
106 | 106 | for (TSentence s : sents) { |
107 | 107 | Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); |
... | ... | @@ -110,7 +110,7 @@ public class Utils { |
110 | 110 | StringBuffer mentionOrth = new StringBuffer(); |
111 | 111 | for (String tokId : m.getChildIds()) { |
112 | 112 | TToken token = tokId2tok.get(tokId); |
113 | - if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
113 | + if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
114 | 114 | continue; |
115 | 115 | } |
116 | 116 | |
... | ... | @@ -142,8 +142,16 @@ public class Utils { |
142 | 142 | } |
143 | 143 | |
144 | 144 | public static String loadSentence2Orth(TSentence sentence) { |
145 | + return loadSentence2Orth(sentence, Sets.newHashSet()); | |
146 | + } | |
147 | + | |
148 | + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | |
145 | 149 | StringBuilder sb = new StringBuilder(); |
146 | 150 | for (TToken token : sentence.getTokens()) { |
151 | + if (tokenIdsToSkip.contains(token.getId())) { | |
152 | + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | |
153 | + continue; | |
154 | + } | |
147 | 155 | if (!token.isNoPrecedingSpace()) |
148 | 156 | sb.append(" "); |
149 | 157 | sb.append(token.getOrth()); |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | |
2 | + | |
3 | +import java.io.IOException; | |
4 | +import java.io.InputStream; | |
5 | +import java.io.ObjectInputStream; | |
6 | +import java.io.ObjectStreamClass; | |
7 | + | |
8 | + | |
9 | +public class VersionIgnoringObjectInputStream extends ObjectInputStream { | |
10 | + | |
11 | + public VersionIgnoringObjectInputStream(InputStream in) throws IOException { | |
12 | + super(in); | |
13 | + } | |
14 | + | |
15 | + protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { | |
16 | + ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor | |
17 | + Class localClass; // the class in the local JVM that this descriptor represents. | |
18 | + try { | |
19 | + localClass = Class.forName(resultClassDescriptor.getName()); | |
20 | + } catch (ClassNotFoundException e) { | |
21 | + return resultClassDescriptor; | |
22 | + } | |
23 | + ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass); | |
24 | + if (localClassDescriptor != null) { // only if class implements serializable | |
25 | + final long localSUID = localClassDescriptor.getSerialVersionUID(); | |
26 | + final long streamSUID = resultClassDescriptor.getSerialVersionUID(); | |
27 | + if (streamSUID != localSUID) { // check for serialVersionUID mismatch. | |
28 | + resultClassDescriptor = localClassDescriptor; // Use local class descriptor for deserialization | |
29 | + } | |
30 | + } | |
31 | + return resultClassDescriptor; | |
32 | + } | |
33 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.features; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.common.features; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import com.google.common.collect.Sets; |
... | ... | @@ -17,6 +17,8 @@ import static java.util.stream.Collectors.toMap; |
17 | 17 | |
18 | 18 | public class FeatureHelper { |
19 | 19 | |
20 | + private final TText text; | |
21 | + | |
20 | 22 | private final List<TMention> mentions; |
21 | 23 | private final Map<String, TMention> mentionId2mention; |
22 | 24 | private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); |
... | ... | @@ -37,6 +39,8 @@ public class FeatureHelper { |
37 | 39 | |
38 | 40 | |
39 | 41 | public FeatureHelper(TText preprocessedText) { |
42 | + text = preprocessedText; | |
43 | + | |
40 | 44 | mentions = preprocessedText.getParagraphs().stream() |
41 | 45 | .flatMap(p -> p.getSentences().stream()) |
42 | 46 | .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); |
... | ... | @@ -55,7 +59,7 @@ public class FeatureHelper { |
55 | 59 | int sentIdx = 0; |
56 | 60 | int mentionIdx = 0; |
57 | 61 | for (TParagraph par : preprocessedText.getParagraphs()) { |
58 | - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences()); | |
62 | + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); | |
59 | 63 | mention2Orth.putAll(m2o); |
60 | 64 | Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); |
61 | 65 | mention2Base.putAll(m2b); |
... | ... | @@ -182,4 +186,18 @@ public class FeatureHelper { |
182 | 186 | public TCoreference getMentionCluster(TMention tMention) { |
183 | 187 | return this.mention2coref.get(tMention); |
184 | 188 | } |
189 | + | |
190 | + public String getSentenceOrth(TSentence sentence) { | |
191 | + StringBuilder sb = new StringBuilder(); | |
192 | + for (TToken token : sentence.getTokens()) { | |
193 | + if (!token.isNoPrecedingSpace()) | |
194 | + sb.append(" "); | |
195 | + sb.append(token.getOrth()); | |
196 | + } | |
197 | + return sb.toString().trim(); | |
198 | + } | |
199 | + | |
200 | + public TText getText() { | |
201 | + return text; | |
202 | + } | |
185 | 203 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
nicolas-core/pom.xml
... | ... | @@ -21,6 +21,10 @@ |
21 | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
22 | 22 | <artifactId>nicolas-model</artifactId> |
23 | 23 | </dependency> |
24 | + <dependency> | |
25 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
26 | + <artifactId>nicolas-zero</artifactId> | |
27 | + </dependency> | |
24 | 28 | |
25 | 29 | <dependency> |
26 | 30 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -6,6 +6,7 @@ import com.google.common.collect.Sets; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
11 | 12 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
... | ... | @@ -8,12 +8,13 @@ import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
13 | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
17 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; | |
17 | 18 | import weka.classifiers.Classifier; |
18 | 19 | import weka.core.Instance; |
19 | 20 | import weka.core.Instances; |
... | ... | @@ -29,8 +30,8 @@ public class ApplyModel2 { |
29 | 30 | |
30 | 31 | private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); |
31 | 32 | |
32 | - private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test"; | |
33 | - private static final String TARGET_DIR = "summaries"; | |
33 | + private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; | |
34 | + private static final String TARGET_DIR = "corpora/summaries"; | |
34 | 35 | |
35 | 36 | public static void main(String[] args) throws Exception { |
36 | 37 | Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); |
... | ... | @@ -39,6 +40,8 @@ public class ApplyModel2 { |
39 | 40 | Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); |
40 | 41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
41 | 42 | |
43 | + ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | |
44 | + | |
42 | 45 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); |
43 | 46 | int i = 1; |
44 | 47 | double avgSize = 0; |
... | ... | @@ -49,10 +52,10 @@ public class ApplyModel2 { |
49 | 52 | = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); |
50 | 53 | |
51 | 54 | int targetSize = calculateTargetSize(text); |
52 | - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
55 | + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); | |
53 | 56 | int size = Utils.tokenize(summary).size(); |
54 | 57 | avgSize += size; |
55 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) { | |
58 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { | |
56 | 59 | bw.append(summary); |
57 | 60 | } |
58 | 61 | |
... | ... | @@ -71,12 +74,14 @@ public class ApplyModel2 { |
71 | 74 | return (int) (0.2 * tokenCount); |
72 | 75 | } |
73 | 76 | |
74 | - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
77 | + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { | |
75 | 78 | List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); |
76 | 79 | |
77 | - StringBuffer sb = new StringBuffer(); | |
80 | + Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); | |
81 | + | |
82 | + StringBuilder sb = new StringBuilder(); | |
78 | 83 | for (TSentence sent : selectedSentences) { |
79 | - sb.append(" " + Utils.loadSentence2Orth(sent)); | |
84 | + sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); | |
80 | 85 | } |
81 | 86 | return sb.toString().trim(); |
82 | 87 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... | ... | @@ -2,9 +2,9 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; |
2 | 2 | |
3 | 3 | import com.google.common.collect.*; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | -import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | |
8 | 8 | import weka.core.Attribute; |
9 | 9 | |
10 | 10 | import java.io.File; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
... | ... | @@ -19,7 +19,7 @@ public class MentionScorer { |
19 | 19 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); |
20 | 20 | |
21 | 21 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
22 | - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences); | |
22 | + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); | |
23 | 23 | |
24 | 24 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
25 | 25 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
... | ... | @@ -7,7 +7,7 @@ import org.slf4j.Logger; |
7 | 7 | import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
13 | 13 | import weka.core.Instance; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.core.Instances; |
9 | 9 | import weka.core.converters.ArffLoader; |
... | ... | @@ -28,7 +28,7 @@ public class TrainModel { |
28 | 28 | StopWatch watch = new StopWatch(); |
29 | 29 | watch.start(); |
30 | 30 | |
31 | - Classifier classifier = Constants.getClassifier(); | |
31 | + Classifier classifier = Constants.getMentionClassifier(); | |
32 | 32 | |
33 | 33 | LOG.info("Building classifier..."); |
34 | 34 | classifier.buildClassifier(instances); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | 9 | import weka.core.Instances; |
... | ... | @@ -32,7 +32,7 @@ public class Crossvalidate { |
32 | 32 | StopWatch watch = new StopWatch(); |
33 | 33 | watch.start(); |
34 | 34 | |
35 | - Classifier tree = Constants.getClassifier(); | |
35 | + Classifier tree = Constants.getMentionClassifier(); | |
36 | 36 | |
37 | 37 | Evaluation eval = new Evaluation(instances); |
38 | 38 | eval.crossValidateModel(tree, instances, 10, new Random(1)); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | 9 | import weka.core.Instances; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
... | ... | @@ -8,7 +8,7 @@ import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
13 | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... | ... | @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
7 | 7 | import weka.core.Attribute; |
8 | 8 | |
9 | 9 | import java.util.List; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
... | ... | @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; |
3 | 3 | import com.google.common.collect.HashMultiset; |
4 | 4 | import com.google.common.collect.Maps; |
5 | 5 | import com.google.common.collect.Multiset; |
6 | +import com.google.common.collect.Sets; | |
6 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.core.Instances; |
9 | 9 | import weka.core.converters.ArffLoader; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence.test; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | 9 | import weka.core.Instances; |
... | ... |
nicolas-zero/pom.xml
... | ... | @@ -27,6 +27,10 @@ |
27 | 27 | <groupId>commons-io</groupId> |
28 | 28 | <artifactId>commons-io</artifactId> |
29 | 29 | </dependency> |
30 | + <dependency> | |
31 | + <groupId>org.apache.commons</groupId> | |
32 | + <artifactId>commons-lang3</artifactId> | |
33 | + </dependency> | |
30 | 34 | |
31 | 35 | <!-- logging --> |
32 | 36 | <dependency> |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
... | ... | @@ -12,7 +12,10 @@ import java.util.Set; |
12 | 12 | |
13 | 13 | public class CandidateFinder { |
14 | 14 | |
15 | - public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | |
15 | + private CandidateFinder() { | |
16 | + } | |
17 | + | |
18 | + public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | |
16 | 19 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); |
17 | 20 | |
18 | 21 | Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.collect.Lists; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
9 | +import weka.core.Attribute; | |
10 | + | |
11 | +import java.util.List; | |
12 | +import java.util.Map; | |
13 | + | |
14 | + | |
15 | +public class ZeroFeatureExtractor extends FeatureExtractor { | |
16 | + | |
17 | + public ZeroFeatureExtractor() { | |
18 | + | |
19 | + for (String prefix : new String[]{"antecedent", "candidate"}) { | |
20 | + addNumericAttribute(prefix + "_index_in_sent"); | |
21 | + addNumericAttribute(prefix + "_token_count"); | |
22 | + addBinaryAttribute(prefix + "_is_zero"); | |
23 | + addBinaryAttribute(prefix + "_is_pronoun"); | |
24 | + addBinaryAttribute(prefix + "_is_named"); | |
25 | + } | |
26 | + | |
27 | + addBinaryAttribute("pair_equal_orth"); | |
28 | + | |
29 | + addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
30 | + fillSortedAttributes("score"); | |
31 | + } | |
32 | + | |
33 | + public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { | |
34 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> result = Maps.newHashMap(); | |
35 | + | |
36 | + FeatureHelper helper = new FeatureHelper(text); | |
37 | + for (ZeroSubjectCandidate candidate : candidates) { | |
38 | + Map<Attribute, Double> candidateFeatures = calculateFeatures(candidate, helper); | |
39 | + result.put(candidate, candidateFeatures); | |
40 | + } | |
41 | + | |
42 | + return result; | |
43 | + } | |
44 | + | |
45 | + private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { | |
46 | + | |
47 | + Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); | |
48 | + candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
49 | + | |
50 | + TMention mention = candidate.getZeroCandidateMention(); | |
51 | + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); | |
52 | + | |
53 | + addMentionFeatures(helper, candidateFeatures, mention, "candidate"); | |
54 | + addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); | |
55 | + | |
56 | + candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equalsIgnoreCase(helper.getMentionOrth(antecedent)))); | |
57 | + | |
58 | + return candidateFeatures; | |
59 | + } | |
60 | + | |
61 | + private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { | |
62 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | |
63 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | |
64 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject())); | |
65 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*"))); | |
66 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | |
67 | + } | |
68 | + | |
69 | +} | |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.zero; |
2 | 2 | |
3 | +import com.google.common.collect.Sets; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.zero.train.TrainingDataExtractor; | |
9 | +import weka.classifiers.Classifier; | |
10 | +import weka.core.Instance; | |
11 | +import weka.core.Instances; | |
12 | + | |
13 | +import java.io.IOException; | |
14 | +import java.util.List; | |
15 | +import java.util.Map; | |
16 | +import java.util.Set; | |
17 | +import java.util.stream.Collectors; | |
3 | 18 | |
4 | 19 | public class ZeroSubjectInjector { |
20 | + | |
21 | + private final ZeroFeatureExtractor featureExtractor; | |
22 | + private final Classifier classifier; | |
23 | + private final Instances instances; | |
24 | + | |
25 | + public ZeroSubjectInjector() throws IOException, ClassNotFoundException { | |
26 | + classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH); | |
27 | + featureExtractor = new ZeroFeatureExtractor(); | |
28 | + instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
29 | + } | |
30 | + | |
31 | + public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { | |
32 | + Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); | |
33 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); | |
34 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = | |
35 | + TrainingDataExtractor.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
36 | + | |
37 | + Set<String> result = Sets.newHashSet(); | |
38 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { | |
39 | + ZeroSubjectCandidate candidate = entry.getKey(); | |
40 | + Instance instance = entry.getValue(); | |
41 | + instance.setDataset(instances); | |
42 | + instance.setClassMissing(); | |
43 | + boolean good = classifier.classifyInstance(instance) > 0.5; | |
44 | + if (good) { | |
45 | + result.addAll(candidate.getZeroCandidateMention().getChildIds()); | |
46 | + } | |
47 | + } | |
48 | + return result; | |
49 | + } | |
50 | + | |
5 | 51 | } |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; | |
2 | + | |
3 | +import org.apache.commons.lang3.time.StopWatch; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | +import weka.core.Instances; | |
9 | +import weka.core.converters.ArffLoader; | |
10 | + | |
11 | +import java.io.File; | |
12 | +import java.io.FileOutputStream; | |
13 | +import java.io.ObjectOutputStream; | |
14 | + | |
15 | + | |
16 | +public class TrainModel { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
19 | + | |
20 | + private TrainModel() { | |
21 | + } | |
22 | + | |
23 | + public static void main(String[] args) throws Exception { | |
24 | + | |
25 | + ArffLoader loader = new ArffLoader(); | |
26 | + loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | |
27 | + Instances instances = loader.getDataSet(); | |
28 | + instances.setClassIndex(0); | |
29 | + LOG.info(instances.size() + " instances loaded."); | |
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | |
31 | + | |
32 | + StopWatch watch = new StopWatch(); | |
33 | + watch.start(); | |
34 | + | |
35 | + Classifier classifier = Constants.getZerosClassifier(); | |
36 | + | |
37 | + LOG.info("Building classifier..."); | |
38 | + classifier.buildClassifier(instances); | |
39 | + LOG.info("...done."); | |
40 | + | |
41 | + try (ObjectOutputStream oos = new ObjectOutputStream( | |
42 | + new FileOutputStream(Constants.ZERO_MODEL_PATH))) { | |
43 | + oos.writeObject(classifier); | |
44 | + } | |
45 | + | |
46 | + watch.stop(); | |
47 | + LOG.info("Elapsed time: " + watch); | |
48 | + | |
49 | + LOG.info(classifier.toString()); | |
50 | + } | |
51 | +} | |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainingDataExtractor.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; | |
2 | 2 | |
3 | -import com.google.common.collect.Lists; | |
4 | 3 | import com.google.common.collect.Maps; |
5 | 4 | import com.google.common.collect.Sets; |
6 | -import org.apache.commons.csv.CSVFormat; | |
7 | -import org.apache.commons.csv.CSVPrinter; | |
8 | -import org.apache.commons.csv.QuoteMode; | |
9 | 5 | import org.apache.commons.io.IOUtils; |
10 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
12 | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
13 | +import weka.core.Attribute; | |
14 | +import weka.core.DenseInstance; | |
15 | +import weka.core.Instance; | |
16 | +import weka.core.Instances; | |
17 | +import weka.core.converters.ArffSaver; | |
13 | 18 | |
14 | 19 | import java.io.File; |
15 | 20 | import java.io.FileReader; |
16 | -import java.io.FileWriter; | |
17 | 21 | import java.io.IOException; |
18 | 22 | import java.util.List; |
19 | 23 | import java.util.Map; |
20 | 24 | import java.util.Set; |
21 | 25 | |
22 | -public class Zero { | |
26 | +public class TrainingDataExtractor { | |
23 | 27 | |
24 | 28 | private static final String IDS_PATH = "corpora/summaries_dev"; |
25 | 29 | private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; |
30 | + private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; | |
26 | 31 | |
27 | - private Zero() { | |
32 | + private TrainingDataExtractor() { | |
28 | 33 | } |
29 | 34 | |
30 | 35 | public static void main(String[] args) throws IOException { |
31 | 36 | |
32 | - CandidateFinder candidateFinder = new CandidateFinder(); | |
33 | - | |
34 | 37 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); |
35 | 38 | Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); |
36 | 39 | |
37 | - List<List<Object>> rows = Lists.newArrayList(); | |
40 | + ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH); | |
41 | + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | |
42 | + | |
43 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
44 | + | |
38 | 45 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
39 | 46 | String textId = entry.getKey(); |
40 | 47 | |
41 | 48 | TText text = entry.getValue(); |
42 | - ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text); | |
43 | - | |
44 | 49 | Set<String> sentenceIds = id2sentIds.get(textId); |
50 | + FeatureHelper featureHelper = new FeatureHelper(text); | |
45 | 51 | |
46 | - List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds); | |
52 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | |
53 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
47 | 54 | |
48 | - for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) { | |
49 | - List<Object> row = Lists.newArrayList(); | |
50 | - row.add("C"); | |
51 | - row.add(textId); | |
52 | - row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention())); | |
53 | - row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence())); | |
54 | - row.add(thriftTextHelper.getSentenceText(candidate.getSentence())); | |
55 | - rows.add(row); | |
55 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | |
56 | + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | |
57 | + Instance instance = entry2.getValue(); | |
58 | + instance.setDataset(instances); | |
59 | + instance.setClassValue(good ? 1 : 0); | |
60 | + instances.add(instance); | |
56 | 61 | } |
57 | 62 | } |
58 | 63 | |
59 | - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | |
60 | - for (List<Object> row : rows) { | |
61 | - csvPrinter.printRecord(row); | |
64 | + saveInstancesToFile(instances); | |
65 | + } | |
66 | + | |
67 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | |
68 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | |
69 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | |
70 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | |
71 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
72 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | |
73 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
74 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
62 | 75 | } |
76 | + candidate2instance.put(entry.getKey(), instance); | |
63 | 77 | } |
78 | + return candidate2instance; | |
79 | + } | |
64 | 80 | |
81 | + private static void saveInstancesToFile(Instances instances) throws IOException { | |
82 | + ArffSaver saver = new ArffSaver(); | |
83 | + saver.setInstances(instances); | |
84 | + saver.setFile(new File(Constants.ZERO_DATASET_PATH)); | |
85 | + saver.writeBatch(); | |
65 | 86 | } |
66 | 87 | |
67 | 88 | private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/ZeroScorer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import org.apache.commons.csv.CSVFormat; | |
5 | +import org.apache.commons.csv.CSVParser; | |
6 | +import org.apache.commons.csv.CSVRecord; | |
7 | +import org.apache.commons.csv.QuoteMode; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
11 | + | |
12 | +import java.io.IOException; | |
13 | +import java.io.InputStream; | |
14 | +import java.io.InputStreamReader; | |
15 | +import java.util.List; | |
16 | +import java.util.Map; | |
17 | + | |
18 | +public class ZeroScorer { | |
19 | + | |
20 | + private static final char DELIMITER = '\t'; | |
21 | + | |
22 | + private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); | |
23 | + | |
24 | + public ZeroScorer(String goldZerosPath) throws IOException { | |
25 | + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath); | |
26 | + InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); | |
27 | + CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { | |
28 | + List<CSVRecord> records = parser.getRecords(); | |
29 | + for (CSVRecord record : records) { | |
30 | + candidateEncoding2Decision.put(encode(record.get(2), record.get(3), record.get(4)), record.get(0).equalsIgnoreCase("C")); | |
31 | + } | |
32 | + } | |
33 | + } | |
34 | + | |
35 | + private String encode(String mentionOrth, String firstSentenceOrth, String secondSentenceOrth) { | |
36 | + return mentionOrth + DELIMITER + firstSentenceOrth + DELIMITER + secondSentenceOrth; | |
37 | + } | |
38 | + | |
39 | + private String encode(ZeroSubjectCandidate candidate, FeatureHelper helper) { | |
40 | + String mentionOrth = helper.getMentionOrth(candidate.getZeroCandidateMention()); | |
41 | + String firstSentenceOrth = helper.getSentenceOrth(candidate.getPreviousSentence()); | |
42 | + String secondSentenceOrth = helper.getSentenceOrth(candidate.getSentence()); | |
43 | + return encode(mentionOrth, firstSentenceOrth, secondSentenceOrth); | |
44 | + } | |
45 | + | |
46 | + public boolean isValidCandidate(ZeroSubjectCandidate candidate, FeatureHelper helper) { | |
47 | + return candidateEncoding2Decision.get(encode(candidate, helper)); | |
48 | + } | |
49 | + | |
50 | +} | |
... | ... |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... | ... | @@ -2,12 +2,11 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; |
2 | 2 | |
3 | 3 | import com.google.common.collect.Sets; |
4 | 4 | import org.apache.commons.io.IOUtils; |
5 | -import org.junit.BeforeClass; | |
6 | 5 | import org.junit.Test; |
7 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
8 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | |
10 | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
11 | 10 | |
12 | 11 | import java.io.IOException; |
13 | 12 | import java.io.InputStream; |
... | ... | @@ -22,18 +21,11 @@ public class CandidateFinderTest { |
22 | 21 | private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; |
23 | 22 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; |
24 | 23 | |
25 | - private static CandidateFinder candidateFinder; | |
26 | - | |
27 | - @BeforeClass | |
28 | - public static void init() { | |
29 | - candidateFinder = new CandidateFinder(); | |
30 | - } | |
31 | - | |
32 | 24 | @Test |
33 | 25 | public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { |
34 | - ThriftTextHelper sampleTextHelper = loadSampleTextHelper(); | |
26 | + FeatureHelper sampleTextHelper = loadSampleTextHelper(); | |
35 | 27 | Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); |
36 | - List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); | |
28 | + List<ZeroSubjectCandidate> candidates = CandidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); | |
37 | 29 | assertEquals(1, candidates.size()); |
38 | 30 | |
39 | 31 | ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); |
... | ... | @@ -41,9 +33,9 @@ public class CandidateFinderTest { |
41 | 33 | TSentence secondSentence = zeroSubjectCandidate.getSentence(); |
42 | 34 | TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); |
43 | 35 | |
44 | - assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence)); | |
45 | - assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence)); | |
46 | - assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate)); | |
36 | + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceOrth(firstSentence)); | |
37 | + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceOrth(secondSentence)); | |
38 | + assertEquals("Ala", sampleTextHelper.getMentionOrth(zeroCandidate)); | |
47 | 39 | } |
48 | 40 | |
49 | 41 | private Set<String> loadSampleTextSummarySentenceIds() throws IOException { |
... | ... | @@ -53,9 +45,9 @@ public class CandidateFinderTest { |
53 | 45 | } |
54 | 46 | } |
55 | 47 | |
56 | - private ThriftTextHelper loadSampleTextHelper() throws IOException { | |
48 | + private FeatureHelper loadSampleTextHelper() throws IOException { | |
57 | 49 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
58 | - return new ThriftTextHelper(Utils.loadThrifted(stream)); | |
50 | + return new FeatureHelper(Utils.loadThrifted(stream)); | |
59 | 51 | } |
60 | 52 | } |
61 | 53 | } |
62 | 54 | \ No newline at end of file |
... | ... |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java deleted
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin
0 → 100644
No preview for this file type
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
0 → 100644
pom.xml
... | ... | @@ -61,6 +61,11 @@ |
61 | 61 | <artifactId>nicolas-common</artifactId> |
62 | 62 | <version>${project.version}</version> |
63 | 63 | </dependency> |
64 | + <dependency> | |
65 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
66 | + <artifactId>nicolas-zero</artifactId> | |
67 | + <version>${project.version}</version> | |
68 | + </dependency> | |
64 | 69 | |
65 | 70 | <!-- internal --> |
66 | 71 | <dependency> |
... | ... |