Commit 91b27b24a9fb0d6427debc133c923e3188f9a768
1 parent
e058b3c2
zeros corpus wip
Showing
32 changed files
with
401 additions
and
86 deletions
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.common; |
2 | 2 | ||
3 | +import com.google.common.base.Charsets; | ||
3 | import weka.classifiers.Classifier; | 4 | import weka.classifiers.Classifier; |
5 | +import weka.classifiers.functions.Logistic; | ||
4 | import weka.classifiers.trees.RandomForest; | 6 | import weka.classifiers.trees.RandomForest; |
5 | 7 | ||
8 | +import java.nio.charset.Charset; | ||
9 | + | ||
6 | 10 | ||
7 | public class Constants { | 11 | public class Constants { |
8 | 12 | ||
9 | public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; | 13 | public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; |
10 | public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; | 14 | public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; |
15 | + public static final String ZERO_MODEL_PATH = "zeros_model.bin"; | ||
16 | + | ||
11 | public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; | 17 | public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; |
12 | public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; | 18 | public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; |
19 | + public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | ||
20 | + | ||
21 | + public static final Charset ENCODING = Charsets.UTF_8; | ||
13 | 22 | ||
14 | private Constants() { | 23 | private Constants() { |
15 | } | 24 | } |
16 | 25 | ||
17 | - public static Classifier getClassifier() { | 26 | + public static Classifier getMentionClassifier() { |
18 | RandomForest classifier = new RandomForest(); | 27 | RandomForest classifier = new RandomForest(); |
19 | classifier.setNumIterations(250); | 28 | classifier.setNumIterations(250); |
20 | classifier.setSeed(0); | 29 | classifier.setSeed(0); |
@@ -22,7 +31,6 @@ public class Constants { | @@ -22,7 +31,6 @@ public class Constants { | ||
22 | return classifier; | 31 | return classifier; |
23 | } | 32 | } |
24 | 33 | ||
25 | - | ||
26 | public static Classifier getSentencesClassifier() { | 34 | public static Classifier getSentencesClassifier() { |
27 | RandomForest classifier = new RandomForest(); | 35 | RandomForest classifier = new RandomForest(); |
28 | classifier.setNumIterations(250); | 36 | classifier.setNumIterations(250); |
@@ -30,4 +38,9 @@ public class Constants { | @@ -30,4 +38,9 @@ public class Constants { | ||
30 | classifier.setNumExecutionSlots(8); | 38 | classifier.setNumExecutionSlots(8); |
31 | return classifier; | 39 | return classifier; |
32 | } | 40 | } |
41 | + | ||
42 | + public static Classifier getZerosClassifier() { | ||
43 | + Logistic classifier = new Logistic(); | ||
44 | + return classifier; | ||
45 | + } | ||
33 | } | 46 | } |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -101,7 +101,7 @@ public class Utils { | @@ -101,7 +101,7 @@ public class Utils { | ||
101 | STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); | 101 | STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); |
102 | } | 102 | } |
103 | 103 | ||
104 | - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | 104 | + public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { |
105 | Map<TMention, String> mention2orth = Maps.newHashMap(); | 105 | Map<TMention, String> mention2orth = Maps.newHashMap(); |
106 | for (TSentence s : sents) { | 106 | for (TSentence s : sents) { |
107 | Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | 107 | Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); |
@@ -110,7 +110,7 @@ public class Utils { | @@ -110,7 +110,7 @@ public class Utils { | ||
110 | StringBuffer mentionOrth = new StringBuffer(); | 110 | StringBuffer mentionOrth = new StringBuffer(); |
111 | for (String tokId : m.getChildIds()) { | 111 | for (String tokId : m.getChildIds()) { |
112 | TToken token = tokId2tok.get(tokId); | 112 | TToken token = tokId2tok.get(tokId); |
113 | - if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | 113 | + if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { |
114 | continue; | 114 | continue; |
115 | } | 115 | } |
116 | 116 | ||
@@ -142,8 +142,16 @@ public class Utils { | @@ -142,8 +142,16 @@ public class Utils { | ||
142 | } | 142 | } |
143 | 143 | ||
144 | public static String loadSentence2Orth(TSentence sentence) { | 144 | public static String loadSentence2Orth(TSentence sentence) { |
145 | + return loadSentence2Orth(sentence, Sets.newHashSet()); | ||
146 | + } | ||
147 | + | ||
148 | + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | ||
145 | StringBuilder sb = new StringBuilder(); | 149 | StringBuilder sb = new StringBuilder(); |
146 | for (TToken token : sentence.getTokens()) { | 150 | for (TToken token : sentence.getTokens()) { |
151 | + if (tokenIdsToSkip.contains(token.getId())) { | ||
152 | + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | ||
153 | + continue; | ||
154 | + } | ||
147 | if (!token.isNoPrecedingSpace()) | 155 | if (!token.isNoPrecedingSpace()) |
148 | sb.append(" "); | 156 | sb.append(" "); |
149 | sb.append(token.getOrth()); | 157 | sb.append(token.getOrth()); |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | ||
2 | + | ||
3 | +import java.io.IOException; | ||
4 | +import java.io.InputStream; | ||
5 | +import java.io.ObjectInputStream; | ||
6 | +import java.io.ObjectStreamClass; | ||
7 | + | ||
8 | + | ||
9 | +public class VersionIgnoringObjectInputStream extends ObjectInputStream { | ||
10 | + | ||
11 | + public VersionIgnoringObjectInputStream(InputStream in) throws IOException { | ||
12 | + super(in); | ||
13 | + } | ||
14 | + | ||
15 | + protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { | ||
16 | + ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor | ||
17 | + Class localClass; // the class in the local JVM that this descriptor represents. | ||
18 | + try { | ||
19 | + localClass = Class.forName(resultClassDescriptor.getName()); | ||
20 | + } catch (ClassNotFoundException e) { | ||
21 | + return resultClassDescriptor; | ||
22 | + } | ||
23 | + ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass); | ||
24 | + if (localClassDescriptor != null) { // only if class implements serializable | ||
25 | + final long localSUID = localClassDescriptor.getSerialVersionUID(); | ||
26 | + final long streamSUID = resultClassDescriptor.getSerialVersionUID(); | ||
27 | + if (streamSUID != localSUID) { // check for serialVersionUID mismatch. | ||
28 | + resultClassDescriptor = localClassDescriptor; // Use local class descriptor for deserialization | ||
29 | + } | ||
30 | + } | ||
31 | + return resultClassDescriptor; | ||
32 | + } | ||
33 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.features; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.common.features; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
@@ -17,6 +17,8 @@ import static java.util.stream.Collectors.toMap; | @@ -17,6 +17,8 @@ import static java.util.stream.Collectors.toMap; | ||
17 | 17 | ||
18 | public class FeatureHelper { | 18 | public class FeatureHelper { |
19 | 19 | ||
20 | + private final TText text; | ||
21 | + | ||
20 | private final List<TMention> mentions; | 22 | private final List<TMention> mentions; |
21 | private final Map<String, TMention> mentionId2mention; | 23 | private final Map<String, TMention> mentionId2mention; |
22 | private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); | 24 | private final Map<TCoreference, List<TMention>> coref2mentions = Maps.newHashMap(); |
@@ -37,6 +39,8 @@ public class FeatureHelper { | @@ -37,6 +39,8 @@ public class FeatureHelper { | ||
37 | 39 | ||
38 | 40 | ||
39 | public FeatureHelper(TText preprocessedText) { | 41 | public FeatureHelper(TText preprocessedText) { |
42 | + text = preprocessedText; | ||
43 | + | ||
40 | mentions = preprocessedText.getParagraphs().stream() | 44 | mentions = preprocessedText.getParagraphs().stream() |
41 | .flatMap(p -> p.getSentences().stream()) | 45 | .flatMap(p -> p.getSentences().stream()) |
42 | .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); | 46 | .flatMap(s -> s.getMentions().stream()).collect(Collectors.toList()); |
@@ -55,7 +59,7 @@ public class FeatureHelper { | @@ -55,7 +59,7 @@ public class FeatureHelper { | ||
55 | int sentIdx = 0; | 59 | int sentIdx = 0; |
56 | int mentionIdx = 0; | 60 | int mentionIdx = 0; |
57 | for (TParagraph par : preprocessedText.getParagraphs()) { | 61 | for (TParagraph par : preprocessedText.getParagraphs()) { |
58 | - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences()); | 62 | + Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); |
59 | mention2Orth.putAll(m2o); | 63 | mention2Orth.putAll(m2o); |
60 | Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); | 64 | Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); |
61 | mention2Base.putAll(m2b); | 65 | mention2Base.putAll(m2b); |
@@ -182,4 +186,18 @@ public class FeatureHelper { | @@ -182,4 +186,18 @@ public class FeatureHelper { | ||
182 | public TCoreference getMentionCluster(TMention tMention) { | 186 | public TCoreference getMentionCluster(TMention tMention) { |
183 | return this.mention2coref.get(tMention); | 187 | return this.mention2coref.get(tMention); |
184 | } | 188 | } |
189 | + | ||
190 | + public String getSentenceOrth(TSentence sentence) { | ||
191 | + StringBuilder sb = new StringBuilder(); | ||
192 | + for (TToken token : sentence.getTokens()) { | ||
193 | + if (!token.isNoPrecedingSpace()) | ||
194 | + sb.append(" "); | ||
195 | + sb.append(token.getOrth()); | ||
196 | + } | ||
197 | + return sb.toString().trim(); | ||
198 | + } | ||
199 | + | ||
200 | + public TText getText() { | ||
201 | + return text; | ||
202 | + } | ||
185 | } | 203 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
nicolas-core/pom.xml
@@ -21,6 +21,10 @@ | @@ -21,6 +21,10 @@ | ||
21 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
22 | <artifactId>nicolas-model</artifactId> | 22 | <artifactId>nicolas-model</artifactId> |
23 | </dependency> | 23 | </dependency> |
24 | + <dependency> | ||
25 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
26 | + <artifactId>nicolas-zero</artifactId> | ||
27 | + </dependency> | ||
24 | 28 | ||
25 | <dependency> | 29 | <dependency> |
26 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 30 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -6,6 +6,7 @@ import com.google.common.collect.Sets; | @@ -6,6 +6,7 @@ import com.google.common.collect.Sets; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 12 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
@@ -8,12 +8,13 @@ import org.slf4j.LoggerFactory; | @@ -8,12 +8,13 @@ import org.slf4j.LoggerFactory; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
12 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
17 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; | ||
17 | import weka.classifiers.Classifier; | 18 | import weka.classifiers.Classifier; |
18 | import weka.core.Instance; | 19 | import weka.core.Instance; |
19 | import weka.core.Instances; | 20 | import weka.core.Instances; |
@@ -29,8 +30,8 @@ public class ApplyModel2 { | @@ -29,8 +30,8 @@ public class ApplyModel2 { | ||
29 | 30 | ||
30 | private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); | 31 | private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); |
31 | 32 | ||
32 | - private static final String TEST_PREPROCESSED_DATA_PATH = "src/main/resources/preprocessed_full_texts/test"; | ||
33 | - private static final String TARGET_DIR = "summaries"; | 33 | + private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; |
34 | + private static final String TARGET_DIR = "corpora/summaries"; | ||
34 | 35 | ||
35 | public static void main(String[] args) throws Exception { | 36 | public static void main(String[] args) throws Exception { |
36 | Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | 37 | Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); |
@@ -39,6 +40,8 @@ public class ApplyModel2 { | @@ -39,6 +40,8 @@ public class ApplyModel2 { | ||
39 | Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | 40 | Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); |
40 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
41 | 42 | ||
43 | + ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | ||
44 | + | ||
42 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); | 45 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); |
43 | int i = 1; | 46 | int i = 1; |
44 | double avgSize = 0; | 47 | double avgSize = 0; |
@@ -49,10 +52,10 @@ public class ApplyModel2 { | @@ -49,10 +52,10 @@ public class ApplyModel2 { | ||
49 | = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | 52 | = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); |
50 | 53 | ||
51 | int targetSize = calculateTargetSize(text); | 54 | int targetSize = calculateTargetSize(text); |
52 | - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | 55 | + String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); |
53 | int size = Utils.tokenize(summary).size(); | 56 | int size = Utils.tokenize(summary).size(); |
54 | avgSize += size; | 57 | avgSize += size; |
55 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily3.txt")))) { | 58 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { |
56 | bw.append(summary); | 59 | bw.append(summary); |
57 | } | 60 | } |
58 | 61 | ||
@@ -71,12 +74,14 @@ public class ApplyModel2 { | @@ -71,12 +74,14 @@ public class ApplyModel2 { | ||
71 | return (int) (0.2 * tokenCount); | 74 | return (int) (0.2 * tokenCount); |
72 | } | 75 | } |
73 | 76 | ||
74 | - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | 77 | + private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { |
75 | List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | 78 | List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); |
76 | 79 | ||
77 | - StringBuffer sb = new StringBuffer(); | 80 | + Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); |
81 | + | ||
82 | + StringBuilder sb = new StringBuilder(); | ||
78 | for (TSentence sent : selectedSentences) { | 83 | for (TSentence sent : selectedSentences) { |
79 | - sb.append(" " + Utils.loadSentence2Orth(sent)); | 84 | + sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); |
80 | } | 85 | } |
81 | return sb.toString().trim(); | 86 | return sb.toString().trim(); |
82 | } | 87 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -2,9 +2,9 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | @@ -2,9 +2,9 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | 2 | ||
3 | import com.google.common.collect.*; | 3 | import com.google.common.collect.*; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
7 | -import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | 5 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | ||
8 | import weka.core.Attribute; | 8 | import weka.core.Attribute; |
9 | 9 | ||
10 | import java.io.File; | 10 | import java.io.File; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
@@ -19,7 +19,7 @@ public class MentionScorer { | @@ -19,7 +19,7 @@ public class MentionScorer { | ||
19 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | 19 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); |
20 | 20 | ||
21 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | 21 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
22 | - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences); | 22 | + Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); |
23 | 23 | ||
24 | return booleanTokenIntersection(mention2Orth, tokenCounts); | 24 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
25 | } | 25 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
@@ -7,7 +7,7 @@ import org.slf4j.Logger; | @@ -7,7 +7,7 @@ import org.slf4j.Logger; | ||
7 | import org.slf4j.LoggerFactory; | 7 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
13 | import weka.core.Instance; | 13 | import weka.core.Instance; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.core.Instances; | 8 | import weka.core.Instances; |
9 | import weka.core.converters.ArffLoader; | 9 | import weka.core.converters.ArffLoader; |
@@ -28,7 +28,7 @@ public class TrainModel { | @@ -28,7 +28,7 @@ public class TrainModel { | ||
28 | StopWatch watch = new StopWatch(); | 28 | StopWatch watch = new StopWatch(); |
29 | watch.start(); | 29 | watch.start(); |
30 | 30 | ||
31 | - Classifier classifier = Constants.getClassifier(); | 31 | + Classifier classifier = Constants.getMentionClassifier(); |
32 | 32 | ||
33 | LOG.info("Building classifier..."); | 33 | LOG.info("Building classifier..."); |
34 | classifier.buildClassifier(instances); | 34 | classifier.buildClassifier(instances); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.classifiers.evaluation.Evaluation; | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | import weka.core.Instances; | 9 | import weka.core.Instances; |
@@ -32,7 +32,7 @@ public class Crossvalidate { | @@ -32,7 +32,7 @@ public class Crossvalidate { | ||
32 | StopWatch watch = new StopWatch(); | 32 | StopWatch watch = new StopWatch(); |
33 | watch.start(); | 33 | watch.start(); |
34 | 34 | ||
35 | - Classifier tree = Constants.getClassifier(); | 35 | + Classifier tree = Constants.getMentionClassifier(); |
36 | 36 | ||
37 | Evaluation eval = new Evaluation(instances); | 37 | Evaluation eval = new Evaluation(instances); |
38 | eval.crossValidateModel(tree, instances, 10, new Random(1)); | 38 | eval.crossValidateModel(tree, instances, 10, new Random(1)); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.classifiers.evaluation.Evaluation; | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | import weka.core.Instances; | 9 | import weka.core.Instances; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
@@ -8,7 +8,7 @@ import org.slf4j.LoggerFactory; | @@ -8,7 +8,7 @@ import org.slf4j.LoggerFactory; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
12 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 12 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 5 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
7 | import weka.core.Attribute; | 7 | import weka.core.Attribute; |
8 | 8 | ||
9 | import java.util.List; | 9 | import java.util.List; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
@@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
3 | import com.google.common.collect.HashMultiset; | 3 | import com.google.common.collect.HashMultiset; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | import com.google.common.collect.Multiset; | 5 | import com.google.common.collect.Multiset; |
6 | +import com.google.common.collect.Sets; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.core.Instances; | 8 | import weka.core.Instances; |
9 | import weka.core.converters.ArffLoader; | 9 | import weka.core.converters.ArffLoader; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.classifiers.evaluation.Evaluation; | 8 | import weka.classifiers.evaluation.Evaluation; |
9 | import weka.core.Instances; | 9 | import weka.core.Instances; |
nicolas-zero/pom.xml
@@ -27,6 +27,10 @@ | @@ -27,6 +27,10 @@ | ||
27 | <groupId>commons-io</groupId> | 27 | <groupId>commons-io</groupId> |
28 | <artifactId>commons-io</artifactId> | 28 | <artifactId>commons-io</artifactId> |
29 | </dependency> | 29 | </dependency> |
30 | + <dependency> | ||
31 | + <groupId>org.apache.commons</groupId> | ||
32 | + <artifactId>commons-lang3</artifactId> | ||
33 | + </dependency> | ||
30 | 34 | ||
31 | <!-- logging --> | 35 | <!-- logging --> |
32 | <dependency> | 36 | <dependency> |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
@@ -12,7 +12,10 @@ import java.util.Set; | @@ -12,7 +12,10 @@ import java.util.Set; | ||
12 | 12 | ||
13 | public class CandidateFinder { | 13 | public class CandidateFinder { |
14 | 14 | ||
15 | - public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | 15 | + private CandidateFinder() { |
16 | + } | ||
17 | + | ||
18 | + public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | ||
16 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); | 19 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); |
17 | 20 | ||
18 | Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | 21 | Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Lists; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
9 | +import weka.core.Attribute; | ||
10 | + | ||
11 | +import java.util.List; | ||
12 | +import java.util.Map; | ||
13 | + | ||
14 | + | ||
15 | +public class ZeroFeatureExtractor extends FeatureExtractor { | ||
16 | + | ||
17 | + public ZeroFeatureExtractor() { | ||
18 | + | ||
19 | + for (String prefix : new String[]{"antecedent", "candidate"}) { | ||
20 | + addNumericAttribute(prefix + "_index_in_sent"); | ||
21 | + addNumericAttribute(prefix + "_token_count"); | ||
22 | + addBinaryAttribute(prefix + "_is_zero"); | ||
23 | + addBinaryAttribute(prefix + "_is_pronoun"); | ||
24 | + addBinaryAttribute(prefix + "_is_named"); | ||
25 | + } | ||
26 | + | ||
27 | + addBinaryAttribute("pair_equal_orth"); | ||
28 | + | ||
29 | + addNominalAttribute("score", Lists.newArrayList("bad", "good")); | ||
30 | + fillSortedAttributes("score"); | ||
31 | + } | ||
32 | + | ||
33 | + public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { | ||
34 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> result = Maps.newHashMap(); | ||
35 | + | ||
36 | + FeatureHelper helper = new FeatureHelper(text); | ||
37 | + for (ZeroSubjectCandidate candidate : candidates) { | ||
38 | + Map<Attribute, Double> candidateFeatures = calculateFeatures(candidate, helper); | ||
39 | + result.put(candidate, candidateFeatures); | ||
40 | + } | ||
41 | + | ||
42 | + return result; | ||
43 | + } | ||
44 | + | ||
45 | + private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { | ||
46 | + | ||
47 | + Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); | ||
48 | + candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | ||
49 | + | ||
50 | + TMention mention = candidate.getZeroCandidateMention(); | ||
51 | + TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); | ||
52 | + | ||
53 | + addMentionFeatures(helper, candidateFeatures, mention, "candidate"); | ||
54 | + addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); | ||
55 | + | ||
56 | + candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equalsIgnoreCase(helper.getMentionOrth(antecedent)))); | ||
57 | + | ||
58 | + return candidateFeatures; | ||
59 | + } | ||
60 | + | ||
61 | + private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { | ||
62 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | ||
63 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | ||
64 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject())); | ||
65 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*"))); | ||
66 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | ||
67 | + } | ||
68 | + | ||
69 | +} |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
1 | package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | package pl.waw.ipipan.zil.summ.nicolas.zero; |
2 | 2 | ||
3 | +import com.google.common.collect.Sets; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.zero.train.TrainingDataExtractor; | ||
9 | +import weka.classifiers.Classifier; | ||
10 | +import weka.core.Instance; | ||
11 | +import weka.core.Instances; | ||
12 | + | ||
13 | +import java.io.IOException; | ||
14 | +import java.util.List; | ||
15 | +import java.util.Map; | ||
16 | +import java.util.Set; | ||
17 | +import java.util.stream.Collectors; | ||
3 | 18 | ||
4 | public class ZeroSubjectInjector { | 19 | public class ZeroSubjectInjector { |
20 | + | ||
21 | + private final ZeroFeatureExtractor featureExtractor; | ||
22 | + private final Classifier classifier; | ||
23 | + private final Instances instances; | ||
24 | + | ||
25 | + public ZeroSubjectInjector() throws IOException, ClassNotFoundException { | ||
26 | + classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH); | ||
27 | + featureExtractor = new ZeroFeatureExtractor(); | ||
28 | + instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
29 | + } | ||
30 | + | ||
31 | + public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { | ||
32 | + Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); | ||
33 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); | ||
34 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = | ||
35 | + TrainingDataExtractor.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | ||
36 | + | ||
37 | + Set<String> result = Sets.newHashSet(); | ||
38 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { | ||
39 | + ZeroSubjectCandidate candidate = entry.getKey(); | ||
40 | + Instance instance = entry.getValue(); | ||
41 | + instance.setDataset(instances); | ||
42 | + instance.setClassMissing(); | ||
43 | + boolean good = classifier.classifyInstance(instance) > 0.5; | ||
44 | + if (good) { | ||
45 | + result.addAll(candidate.getZeroCandidateMention().getChildIds()); | ||
46 | + } | ||
47 | + } | ||
48 | + return result; | ||
49 | + } | ||
50 | + | ||
5 | } | 51 | } |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.core.Instances; | ||
9 | +import weka.core.converters.ArffLoader; | ||
10 | + | ||
11 | +import java.io.File; | ||
12 | +import java.io.FileOutputStream; | ||
13 | +import java.io.ObjectOutputStream; | ||
14 | + | ||
15 | + | ||
16 | +public class TrainModel { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
19 | + | ||
20 | + private TrainModel() { | ||
21 | + } | ||
22 | + | ||
23 | + public static void main(String[] args) throws Exception { | ||
24 | + | ||
25 | + ArffLoader loader = new ArffLoader(); | ||
26 | + loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | ||
27 | + Instances instances = loader.getDataSet(); | ||
28 | + instances.setClassIndex(0); | ||
29 | + LOG.info(instances.size() + " instances loaded."); | ||
30 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
31 | + | ||
32 | + StopWatch watch = new StopWatch(); | ||
33 | + watch.start(); | ||
34 | + | ||
35 | + Classifier classifier = Constants.getZerosClassifier(); | ||
36 | + | ||
37 | + LOG.info("Building classifier..."); | ||
38 | + classifier.buildClassifier(instances); | ||
39 | + LOG.info("...done."); | ||
40 | + | ||
41 | + try (ObjectOutputStream oos = new ObjectOutputStream( | ||
42 | + new FileOutputStream(Constants.ZERO_MODEL_PATH))) { | ||
43 | + oos.writeObject(classifier); | ||
44 | + } | ||
45 | + | ||
46 | + watch.stop(); | ||
47 | + LOG.info("Elapsed time: " + watch); | ||
48 | + | ||
49 | + LOG.info(classifier.toString()); | ||
50 | + } | ||
51 | +} |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainingDataExtractor.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; |
2 | 2 | ||
3 | -import com.google.common.collect.Lists; | ||
4 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
5 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
6 | -import org.apache.commons.csv.CSVFormat; | ||
7 | -import org.apache.commons.csv.CSVPrinter; | ||
8 | -import org.apache.commons.csv.QuoteMode; | ||
9 | import org.apache.commons.io.IOUtils; | 5 | import org.apache.commons.io.IOUtils; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | 7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
13 | +import weka.core.Attribute; | ||
14 | +import weka.core.DenseInstance; | ||
15 | +import weka.core.Instance; | ||
16 | +import weka.core.Instances; | ||
17 | +import weka.core.converters.ArffSaver; | ||
13 | 18 | ||
14 | import java.io.File; | 19 | import java.io.File; |
15 | import java.io.FileReader; | 20 | import java.io.FileReader; |
16 | -import java.io.FileWriter; | ||
17 | import java.io.IOException; | 21 | import java.io.IOException; |
18 | import java.util.List; | 22 | import java.util.List; |
19 | import java.util.Map; | 23 | import java.util.Map; |
20 | import java.util.Set; | 24 | import java.util.Set; |
21 | 25 | ||
22 | -public class Zero { | 26 | +public class TrainingDataExtractor { |
23 | 27 | ||
24 | private static final String IDS_PATH = "corpora/summaries_dev"; | 28 | private static final String IDS_PATH = "corpora/summaries_dev"; |
25 | private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | 29 | private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; |
30 | + private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; | ||
26 | 31 | ||
27 | - private Zero() { | 32 | + private TrainingDataExtractor() { |
28 | } | 33 | } |
29 | 34 | ||
30 | public static void main(String[] args) throws IOException { | 35 | public static void main(String[] args) throws IOException { |
31 | 36 | ||
32 | - CandidateFinder candidateFinder = new CandidateFinder(); | ||
33 | - | ||
34 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | 37 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); |
35 | Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); | 38 | Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); |
36 | 39 | ||
37 | - List<List<Object>> rows = Lists.newArrayList(); | 40 | + ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH); |
41 | + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | ||
42 | + | ||
43 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
44 | + | ||
38 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 45 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
39 | String textId = entry.getKey(); | 46 | String textId = entry.getKey(); |
40 | 47 | ||
41 | TText text = entry.getValue(); | 48 | TText text = entry.getValue(); |
42 | - ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text); | ||
43 | - | ||
44 | Set<String> sentenceIds = id2sentIds.get(textId); | 49 | Set<String> sentenceIds = id2sentIds.get(textId); |
50 | + FeatureHelper featureHelper = new FeatureHelper(text); | ||
45 | 51 | ||
46 | - List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds); | 52 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); |
53 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | ||
47 | 54 | ||
48 | - for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) { | ||
49 | - List<Object> row = Lists.newArrayList(); | ||
50 | - row.add("C"); | ||
51 | - row.add(textId); | ||
52 | - row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention())); | ||
53 | - row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence())); | ||
54 | - row.add(thriftTextHelper.getSentenceText(candidate.getSentence())); | ||
55 | - rows.add(row); | 55 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { |
56 | + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | ||
57 | + Instance instance = entry2.getValue(); | ||
58 | + instance.setDataset(instances); | ||
59 | + instance.setClassValue(good ? 1 : 0); | ||
60 | + instances.add(instance); | ||
56 | } | 61 | } |
57 | } | 62 | } |
58 | 63 | ||
59 | - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | ||
60 | - for (List<Object> row : rows) { | ||
61 | - csvPrinter.printRecord(row); | 64 | + saveInstancesToFile(instances); |
65 | + } | ||
66 | + | ||
67 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | ||
68 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | ||
69 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | ||
70 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | ||
71 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
72 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | ||
73 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
74 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
62 | } | 75 | } |
76 | + candidate2instance.put(entry.getKey(), instance); | ||
63 | } | 77 | } |
78 | + return candidate2instance; | ||
79 | + } | ||
64 | 80 | ||
81 | + private static void saveInstancesToFile(Instances instances) throws IOException { | ||
82 | + ArffSaver saver = new ArffSaver(); | ||
83 | + saver.setInstances(instances); | ||
84 | + saver.setFile(new File(Constants.ZERO_DATASET_PATH)); | ||
85 | + saver.writeBatch(); | ||
65 | } | 86 | } |
66 | 87 | ||
67 | private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | 88 | private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/ZeroScorer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.train; | ||
2 | + | ||
3 | +import com.google.common.collect.Maps; | ||
4 | +import org.apache.commons.csv.CSVFormat; | ||
5 | +import org.apache.commons.csv.CSVParser; | ||
6 | +import org.apache.commons.csv.CSVRecord; | ||
7 | +import org.apache.commons.csv.QuoteMode; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
11 | + | ||
12 | +import java.io.IOException; | ||
13 | +import java.io.InputStream; | ||
14 | +import java.io.InputStreamReader; | ||
15 | +import java.util.List; | ||
16 | +import java.util.Map; | ||
17 | + | ||
18 | +public class ZeroScorer { | ||
19 | + | ||
20 | + private static final char DELIMITER = '\t'; | ||
21 | + | ||
22 | + private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); | ||
23 | + | ||
24 | + public ZeroScorer(String goldZerosPath) throws IOException { | ||
25 | + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath); | ||
26 | + InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); | ||
27 | + CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { | ||
28 | + List<CSVRecord> records = parser.getRecords(); | ||
29 | + for (CSVRecord record : records) { | ||
30 | + candidateEncoding2Decision.put(encode(record.get(2), record.get(3), record.get(4)), record.get(0).equalsIgnoreCase("C")); | ||
31 | + } | ||
32 | + } | ||
33 | + } | ||
34 | + | ||
35 | + private String encode(String mentionOrth, String firstSentenceOrth, String secondSentenceOrth) { | ||
36 | + return mentionOrth + DELIMITER + firstSentenceOrth + DELIMITER + secondSentenceOrth; | ||
37 | + } | ||
38 | + | ||
39 | + private String encode(ZeroSubjectCandidate candidate, FeatureHelper helper) { | ||
40 | + String mentionOrth = helper.getMentionOrth(candidate.getZeroCandidateMention()); | ||
41 | + String firstSentenceOrth = helper.getSentenceOrth(candidate.getPreviousSentence()); | ||
42 | + String secondSentenceOrth = helper.getSentenceOrth(candidate.getSentence()); | ||
43 | + return encode(mentionOrth, firstSentenceOrth, secondSentenceOrth); | ||
44 | + } | ||
45 | + | ||
46 | + public boolean isValidCandidate(ZeroSubjectCandidate candidate, FeatureHelper helper) { | ||
47 | + return candidateEncoding2Decision.get(encode(candidate, helper)); | ||
48 | + } | ||
49 | + | ||
50 | +} |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -2,12 +2,11 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | @@ -2,12 +2,11 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | 2 | ||
3 | import com.google.common.collect.Sets; | 3 | import com.google.common.collect.Sets; |
4 | import org.apache.commons.io.IOUtils; | 4 | import org.apache.commons.io.IOUtils; |
5 | -import org.junit.BeforeClass; | ||
6 | import org.junit.Test; | 5 | import org.junit.Test; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | ||
10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
11 | 10 | ||
12 | import java.io.IOException; | 11 | import java.io.IOException; |
13 | import java.io.InputStream; | 12 | import java.io.InputStream; |
@@ -22,18 +21,11 @@ public class CandidateFinderTest { | @@ -22,18 +21,11 @@ public class CandidateFinderTest { | ||
22 | private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; | 21 | private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; |
23 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; | 22 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; |
24 | 23 | ||
25 | - private static CandidateFinder candidateFinder; | ||
26 | - | ||
27 | - @BeforeClass | ||
28 | - public static void init() { | ||
29 | - candidateFinder = new CandidateFinder(); | ||
30 | - } | ||
31 | - | ||
32 | @Test | 24 | @Test |
33 | public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { | 25 | public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { |
34 | - ThriftTextHelper sampleTextHelper = loadSampleTextHelper(); | 26 | + FeatureHelper sampleTextHelper = loadSampleTextHelper(); |
35 | Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); | 27 | Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); |
36 | - List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); | 28 | + List<ZeroSubjectCandidate> candidates = CandidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); |
37 | assertEquals(1, candidates.size()); | 29 | assertEquals(1, candidates.size()); |
38 | 30 | ||
39 | ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); | 31 | ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); |
@@ -41,9 +33,9 @@ public class CandidateFinderTest { | @@ -41,9 +33,9 @@ public class CandidateFinderTest { | ||
41 | TSentence secondSentence = zeroSubjectCandidate.getSentence(); | 33 | TSentence secondSentence = zeroSubjectCandidate.getSentence(); |
42 | TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); | 34 | TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); |
43 | 35 | ||
44 | - assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence)); | ||
45 | - assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence)); | ||
46 | - assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate)); | 36 | + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceOrth(firstSentence)); |
37 | + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceOrth(secondSentence)); | ||
38 | + assertEquals("Ala", sampleTextHelper.getMentionOrth(zeroCandidate)); | ||
47 | } | 39 | } |
48 | 40 | ||
49 | private Set<String> loadSampleTextSummarySentenceIds() throws IOException { | 41 | private Set<String> loadSampleTextSummarySentenceIds() throws IOException { |
@@ -53,9 +45,9 @@ public class CandidateFinderTest { | @@ -53,9 +45,9 @@ public class CandidateFinderTest { | ||
53 | } | 45 | } |
54 | } | 46 | } |
55 | 47 | ||
56 | - private ThriftTextHelper loadSampleTextHelper() throws IOException { | 48 | + private FeatureHelper loadSampleTextHelper() throws IOException { |
57 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | 49 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
58 | - return new ThriftTextHelper(Utils.loadThrifted(stream)); | 50 | + return new FeatureHelper(Utils.loadThrifted(stream)); |
59 | } | 51 | } |
60 | } | 52 | } |
61 | } | 53 | } |
62 | \ No newline at end of file | 54 | \ No newline at end of file |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | - | ||
3 | -import org.junit.Test; | ||
4 | - | ||
5 | -public class ZeroSubjectInjectorTest { | ||
6 | - | ||
7 | - @Test | ||
8 | - public void shouldInit() throws Exception { | ||
9 | - ZeroSubjectInjector injector = new ZeroSubjectInjector(); | ||
10 | - } | ||
11 | -} | ||
12 | \ No newline at end of file | 0 | \ No newline at end of file |
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin
0 → 100644
No preview for this file type
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
0 → 100644
pom.xml
@@ -61,6 +61,11 @@ | @@ -61,6 +61,11 @@ | ||
61 | <artifactId>nicolas-common</artifactId> | 61 | <artifactId>nicolas-common</artifactId> |
62 | <version>${project.version}</version> | 62 | <version>${project.version}</version> |
63 | </dependency> | 63 | </dependency> |
64 | + <dependency> | ||
65 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
66 | + <artifactId>nicolas-zero</artifactId> | ||
67 | + <version>${project.version}</version> | ||
68 | + </dependency> | ||
64 | 69 | ||
65 | <!-- internal --> | 70 | <!-- internal --> |
66 | <dependency> | 71 | <dependency> |