Commit 156b37078e717ab9f3849c9d6a1ef1d2ddbc2a10

Authored by Mateusz Kopeć
1 parent 89870bd0

add zero subject removal

Showing 20 changed files with 324 additions and 117 deletions
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; @@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
12 import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; 12 import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
13 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; 13 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  15 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroModel;
15 import weka.classifiers.Classifier; 16 import weka.classifiers.Classifier;
16 17
17 import java.io.IOException; 18 import java.io.IOException;
@@ -29,35 +30,43 @@ public class Nicolas { @@ -29,35 +30,43 @@ public class Nicolas {
29 private final SentenceFeatureExtractor sentenceFeatureExtractor; 30 private final SentenceFeatureExtractor sentenceFeatureExtractor;
30 private final ZeroFeatureExtractor zeroFeatureExtractor; 31 private final ZeroFeatureExtractor zeroFeatureExtractor;
31 32
32 - public Nicolas() throws NicolasException { 33 + public Nicolas(boolean useZeroModel) throws NicolasException {
33 try { 34 try {
  35 + mentionFeatureExtractor = new MentionFeatureExtractor();
34 mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); 36 mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
35 - sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);  
36 - zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);  
37 37
38 - mentionFeatureExtractor = new MentionFeatureExtractor();  
39 sentenceFeatureExtractor = new SentenceFeatureExtractor(); 38 sentenceFeatureExtractor = new SentenceFeatureExtractor();
40 - zeroFeatureExtractor = new ZeroFeatureExtractor(); 39 + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  40 +
  41 + zeroFeatureExtractor = useZeroModel ? new ZeroFeatureExtractor() : null;
  42 + zeroModel = useZeroModel ? ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH) : null;
  43 +
41 } catch (IOException e) { 44 } catch (IOException e) {
42 throw new NicolasException(e); 45 throw new NicolasException(e);
43 } 46 }
44 } 47 }
45 48
  49 + public Nicolas() throws NicolasException {
  50 + this(true);
  51 + }
  52 +
46 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { 53 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException {
47 try { 54 try {
48 Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); 55 Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text);
49 - return calculateSummary(text, goodMentions, targetTokenCount); 56 + List<TSentence> selectedSentences = selectSummarySentences(text, goodMentions, targetTokenCount);
  57 + Set<String> zeroSubjectTokenIds = zeroModel == null ? Collections.emptySet() : ZeroModel.findZeroSubjectTokenIds(zeroModel, zeroFeatureExtractor, text, selectedSentences);
  58 +
  59 + return createSummaryFromSentences(selectedSentences, zeroSubjectTokenIds);
  60 +
50 } catch (Exception e) { 61 } catch (Exception e) {
51 throw new NicolasException(e); 62 throw new NicolasException(e);
52 } 63 }
53 } 64 }
54 65
55 - private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {  
56 - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize);  
57 - 66 + private String createSummaryFromSentences(List<TSentence> selectedSentences, Set<String> zeroSubjectTokenIds) {
58 StringBuilder sb = new StringBuilder(); 67 StringBuilder sb = new StringBuilder();
59 for (TSentence sent : selectedSentences) { 68 for (TSentence sent : selectedSentences) {
60 - sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); 69 + sb.append(" ").append(TextUtils.loadSentence2Orth(sent, zeroSubjectTokenIds));
61 } 70 }
62 return sb.toString().trim(); 71 return sb.toString().trim();
63 } 72 }
@@ -70,16 +79,16 @@ public class Nicolas { @@ -70,16 +79,16 @@ public class Nicolas {
70 List<TSentence> sortedSentences = Lists.newArrayList(sentences); 79 List<TSentence> sortedSentences = Lists.newArrayList(sentences);
71 sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); 80 sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed());
72 81
73 - int size = 0;  
74 - Random r = new Random(1); 82 + int currentSize = 0;
75 Set<TSentence> summary = Sets.newHashSet(); 83 Set<TSentence> summary = Sets.newHashSet();
76 for (TSentence sent : sortedSentences) { 84 for (TSentence sent : sortedSentences) {
77 - size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();  
78 - if (r.nextDouble() > 0.4 && size > targetSize)  
79 - break;  
80 - summary.add(sent);  
81 - if (size > targetSize)  
82 - break; 85 + int sentenceSize = TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();
  86 + int newSize = currentSize + sentenceSize;
  87 +
  88 + if (Math.abs(newSize - targetSize) < Math.abs(currentSize - targetSize)) {
  89 + currentSize = newSize;
  90 + summary.add(sent);
  91 + }
83 } 92 }
84 List<TSentence> selectedSentences = Lists.newArrayList(); 93 List<TSentence> selectedSentences = Lists.newArrayList();
85 for (TSentence sent : sentences) { 94 for (TSentence sent : sentences) {
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
1 package pl.waw.ipipan.zil.summ.nicolas; 1 package pl.waw.ipipan.zil.summ.nicolas;
2 2
3 public class NicolasException extends Exception { 3 public class NicolasException extends Exception {
4 - public NicolasException(Exception e) { 4 + NicolasException(Exception e) {
5 super(e); 5 super(e);
6 } 6 }
7 } 7 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -33,7 +33,7 @@ public class MentionModel { @@ -33,7 +33,7 @@ public class MentionModel {
33 if (good) 33 if (good)
34 goodMentions.add(entry.getKey()); 34 goodMentions.add(entry.getKey());
35 } 35 }
36 - LOG.info("Classified {} mentions as good.", goodMentions.size()); 36 + LOG.debug("Classified {} mentions as good.", goodMentions.size());
37 return goodMentions; 37 return goodMentions;
38 } 38 }
39 39
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -32,7 +32,7 @@ public class SentenceModel { @@ -32,7 +32,7 @@ public class SentenceModel {
32 double score = sentenceClassifier.classifyInstance(instance); 32 double score = sentenceClassifier.classifyInstance(instance);
33 sentence2score.put(entry.getKey(), score); 33 sentence2score.put(entry.getKey(), score);
34 } 34 }
35 - LOG.info("Scored " + sentence2score.size() + " sentences."); 35 + LOG.debug("Scored {} sentences.", sentence2score.size());
36 36
37 return sentence2score; 37 return sentence2score;
38 } 38 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
@@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; @@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  11 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
11 import weka.core.Attribute; 13 import weka.core.Attribute;
12 import weka.core.DenseInstance; 14 import weka.core.DenseInstance;
13 import weka.core.Instance; 15 import weka.core.Instance;
@@ -65,6 +67,20 @@ public class InstanceUtils { @@ -65,6 +67,20 @@ public class InstanceUtils {
65 return sentence2instance; 67 return sentence2instance;
66 } 68 }
67 69
  70 + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
  71 + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
  72 + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
  73 + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
  74 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  75 + Map<Attribute, Double> sentenceFeatures = entry.getValue();
  76 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  77 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  78 + }
  79 + candidate2instance.put(entry.getKey(), instance);
  80 + }
  81 + return candidate2instance;
  82 + }
  83 +
68 @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList 84 @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
69 public static Instances createNewInstances(ArrayList<Attribute> attributesList) { 85 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
70 Instances instances = new Instances(DATASET_NAME, attributesList, 0); 86 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.zero;  
2 -  
3 -import com.google.common.collect.Maps;  
4 -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;  
5 -import weka.core.Attribute;  
6 -import weka.core.DenseInstance;  
7 -import weka.core.Instance;  
8 -  
9 -import java.util.List;  
10 -import java.util.Map;  
11 -  
12 -public class InstanceCreator {  
13 -  
14 - private InstanceCreator() {  
15 - }  
16 -  
17 - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {  
18 - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);  
19 - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();  
20 - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {  
21 - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());  
22 - Map<Attribute, Double> sentenceFeatures = entry.getValue();  
23 - for (Attribute attribute : featureExtractor.getAttributesList()) {  
24 - instance.setValue(attribute, sentenceFeatures.get(attribute));  
25 - }  
26 - candidate2instance.put(entry.getKey(), instance);  
27 - }  
28 - return candidate2instance;  
29 - }  
30 -  
31 -}  
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; @@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10 import pl.waw.ipipan.zil.summ.nicolas.Constants; 10 import pl.waw.ipipan.zil.summ.nicolas.Constants;
11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; 11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  13 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
13 import weka.core.Attribute; 14 import weka.core.Attribute;
14 15
15 import java.util.List; 16 import java.util.List;
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java
@@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; @@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 import com.google.common.collect.Sets; 3 import com.google.common.collect.Sets;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6 -import pl.waw.ipipan.zil.summ.nicolas.Constants;  
7 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; 6 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
  7 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  8 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
8 import weka.classifiers.Classifier; 9 import weka.classifiers.Classifier;
9 import weka.core.Instance; 10 import weka.core.Instance;
10 import weka.core.Instances; 11 import weka.core.Instances;
11 -import weka.core.SerializationHelper;  
12 12
13 import java.util.List; 13 import java.util.List;
14 import java.util.Map; 14 import java.util.Map;
15 import java.util.Set; 15 import java.util.Set;
16 import java.util.stream.Collectors; 16 import java.util.stream.Collectors;
17 17
18 -public class ZeroSubjectInjector { 18 +public class ZeroModel {
19 19
20 - private final ZeroFeatureExtractor featureExtractor;  
21 - private final Classifier classifier;  
22 - private final Instances instances;  
23 -  
24 - public ZeroSubjectInjector() throws Exception {  
25 - classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH);  
26 - featureExtractor = new ZeroFeatureExtractor();  
27 - instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); 20 + private ZeroModel() {
28 } 21 }
29 22
30 - public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { 23 + public static Set<String> findZeroSubjectTokenIds(Classifier classifier, ZeroFeatureExtractor featureExtractor, TText text, List<TSentence> selectedSentences) throws Exception {
  24 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
  25 +
31 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); 26 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet());
32 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); 27 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
33 Map<ZeroSubjectCandidate, Instance> candidate2instance = 28 Map<ZeroSubjectCandidate, Instance> candidate2instance =
34 - InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); 29 + InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
35 30
36 Set<String> result = Sets.newHashSet(); 31 Set<String> result = Sets.newHashSet();
37 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { 32 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) {
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate;
2 2
3 import com.google.common.collect.Lists; 3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
@@ -12,57 +12,110 @@ import java.util.Set; @@ -12,57 +12,110 @@ import java.util.Set;
12 12
13 public class CandidateFinder { 13 public class CandidateFinder {
14 14
  15 + private static final String SUBST = "subst";
  16 + private static final String NOM = "nom";
  17 + private static final String MSD_SPLITTER = ":";
  18 +
15 private CandidateFinder() { 19 private CandidateFinder() {
16 } 20 }
17 21
18 public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { 22 public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) {
  23 + Map<String, Set<String>> mentionId2Cluster = getMentionId2Cluster(text);
  24 + return getZeroSubjectCandidates(text, summarySentenceIds, mentionId2Cluster);
  25 + }
  26 +
  27 + private static List<ZeroSubjectCandidate> getZeroSubjectCandidates(TText text, Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster) {
19 List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); 28 List<ZeroSubjectCandidate> candidates = Lists.newArrayList();
20 29
21 - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();  
22 - for (TCoreference coreference : text.getCoreferences()) {  
23 - for (String mentionId : coreference.getMentionIds()) {  
24 - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); 30 + PrevSentenceState prevSentenceState = new PrevSentenceState();
  31 + for (TParagraph p : text.getParagraphs()) {
  32 + for (TSentence sentence : p.getSentences()) {
  33 + processSentence(summarySentenceIds, mentionId2Cluster, candidates, prevSentenceState, sentence);
25 } 34 }
26 } 35 }
  36 + return candidates;
  37 + }
  38 +
  39 + private static void processSentence(Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence) {
  40 + if (!summarySentenceIds.contains(sentence.getId()))
  41 + return;
  42 + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
  43 +
  44 + Map<String, TToken> tokenId2Token = getTokenId2Token(sentence);
  45 +
  46 + for (TMention mention : sentence.getMentions()) {
  47 + processMention(mentionId2Cluster, candidates, prevSentenceState, sentence, currentSentenceNominativeMentionIds, tokenId2Token, mention);
  48 + }
27 49
28 - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();  
29 - TSentence prevSentence = null;  
30 - for (TParagraph p : text.getParagraphs()) {  
31 - for (TSentence sentence : p.getSentences()) {  
32 - if (!summarySentenceIds.contains(sentence.getId()))  
33 - continue;  
34 - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();  
35 -  
36 - Map<String, TToken> tokenId2Token = Maps.newHashMap();  
37 - for (TToken t : sentence.getTokens())  
38 - tokenId2Token.put(t.getId(), t);  
39 -  
40 - for (TMention mention : sentence.getMentions()) {  
41 -  
42 - for (String tokenId : mention.getHeadIds()) {  
43 - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();  
44 - if (isInNominative(interp)) {  
45 -  
46 - currentSentenceNominativeMentionIds.add(mention.getId());  
47 - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {  
48 - ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention);  
49 - candidates.add(candidate);  
50 - }  
51 - break;  
52 - }  
53 - } 50 + prevSentenceState.setPrevSentence(sentence);
  51 + prevSentenceState.setNominativeMentionIds(currentSentenceNominativeMentionIds);
  52 + }
  53 +
  54 + private static void processMention(Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence, Set<String> currentSentenceNominativeMentionIds, Map<String, TToken> tokenId2Token, TMention mention) {
  55 + for (String tokenId : mention.getHeadIds()) {
  56 + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
  57 + if (isInNominative(interp)) {
  58 + currentSentenceNominativeMentionIds.add(mention.getId());
  59 + if (isCoreferentWithPreviousSentence(mentionId2Cluster, prevSentenceState, mention)) {
  60 + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentenceState.getPrevSentence(), sentence, mention);
  61 + candidates.add(candidate);
54 } 62 }
  63 + break;
  64 + }
  65 + }
  66 + }
55 67
56 - prevSentence = sentence;  
57 - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; 68 + private static boolean isCoreferentWithPreviousSentence(Map<String, Set<String>> mentionId2Cluster, PrevSentenceState prevSentenceState, TMention mention) {
  69 + return mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceState.getNominativeMentionIds()::contains);
  70 + }
  71 +
  72 + private static Map<String, TToken> getTokenId2Token(TSentence sentence) {
  73 + Map<String, TToken> tokenId2Token = Maps.newHashMap();
  74 + for (TToken t : sentence.getTokens())
  75 + tokenId2Token.put(t.getId(), t);
  76 + return tokenId2Token;
  77 + }
  78 +
  79 + private static Map<String, Set<String>> getMentionId2Cluster(TText text) {
  80 + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
  81 + for (TCoreference coreference : text.getCoreferences()) {
  82 + for (String mentionId : coreference.getMentionIds()) {
  83 + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
58 } 84 }
59 } 85 }
60 - return candidates; 86 + return mentionId2Cluster;
61 } 87 }
62 88
63 private static boolean isInNominative(TInterpretation interp) { 89 private static boolean isInNominative(TInterpretation interp) {
64 - boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom"));  
65 - boolean isSubst = interp.getCtag().equals("subst"); 90 + boolean isNominative = Arrays.stream(interp.getMsd().split(MSD_SPLITTER)).anyMatch(t -> t.equals(NOM));
  91 + boolean isSubst = interp.getCtag().equals(SUBST);
66 return isSubst && isNominative; 92 return isSubst && isNominative;
67 } 93 }
  94 +
  95 + private static class PrevSentenceState {
  96 +
  97 + private Set<String> nominativeMentionIds;
  98 + private TSentence prevSentence;
  99 +
  100 + PrevSentenceState() {
  101 + nominativeMentionIds = Sets.newHashSet();
  102 + prevSentence = null;
  103 + }
  104 +
  105 + Set<String> getNominativeMentionIds() {
  106 + return nominativeMentionIds;
  107 + }
  108 +
  109 + TSentence getPrevSentence() {
  110 + return prevSentence;
  111 + }
  112 +
  113 + void setNominativeMentionIds(Set<String> nominativeMentionIds) {
  114 + this.nominativeMentionIds = nominativeMentionIds;
  115 + }
  116 +
  117 + void setPrevSentence(TSentence prevSentence) {
  118 + this.prevSentence = prevSentence;
  119 + }
  120 + }
68 } 121 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate;
2 2
3 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 3 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
@@ -9,7 +9,7 @@ public class ZeroSubjectCandidate { @@ -9,7 +9,7 @@ public class ZeroSubjectCandidate {
9 private final TSentence sentence; 9 private final TSentence sentence;
10 private final TMention zeroCandidateMention; 10 private final TMention zeroCandidateMention;
11 11
12 - public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { 12 + ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) {
13 this.previousSentence = previousSentence; 13 this.previousSentence = previousSentence;
14 this.sentence = sentence; 14 this.sentence = sentence;
15 this.zeroCandidateMention = zeroCandidateMention; 15 this.zeroCandidateMention = zeroCandidateMention;
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; @@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
9 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; 9 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  11 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
10 12
11 import java.io.IOException; 13 import java.io.IOException;
12 import java.io.InputStream; 14 import java.io.InputStream;
nicolas-train/src/main/R/plot_summary_lenghts.R 0 → 100644
  1 +require(ggplot2)
  2 +require(grid)
  3 +require(gridExtra)
  4 +require(lattice)
  5 +
  6 +DATA_DIR="../../../../data/"
  7 +
  8 +########################## functions
  9 +gpl = function(d) {
  10 + ggplot(d, aes(x=as.factor(d$SumRatio), y=SumRealRatio)) +
  11 + geom_boxplot(outlier.shape=4, outlier.colour = "blue") +
  12 + ylim(0, 40) +
  13 + ylab("Obtained summary ratio (word count)") +
  14 + xlab("Requested summary ratio (word count)") +
  15 + theme(text = element_text(size=15))
  16 +}
  17 +
  18 +ploto = function(d) {
  19 + p = gpl(d)
  20 +}
  21 +
  22 +histo = function(d) {
  23 + p = ggplot(d, aes(abs(d$SumRealRatio*100/d$SumRatio))) +
  24 + geom_histogram(binwidth = 1) +
  25 + xlim(80, 120) +
  26 + ylab("Number of summaries") +
  27 + xlab("Obtained summary ratio as percent of requested ratio (20%)") +
  28 + theme(text = element_text(size=15))
  29 +}
  30 +
  31 +######################### automatic summaries
  32 +data = read.csv(paste(DATA_DIR, "summary-lengths.tsv", sep=""), sep = "\t")
  33 +
  34 +names = list("Swietl", "nicolas", "nicolas-zero", "BASELINE")
  35 +titles = list("Świetlicka", "Nicolas", "Nicolas-zero", "Baseline")
  36 +plots = list()
  37 +hists = list()
  38 +i = 1
  39 +for (n in names) {
  40 + print(n)
  41 + title = titles[[i]]
  42 + i = i + 1
  43 +
  44 + d = data[data$SumAuthor==n,]
  45 + print(mean(d$SumRealRatio))
  46 +
  47 + p = ploto(d)
  48 + p = p + ggtitle(title)
  49 + plots = c(plots, list(p))
  50 +
  51 + hi = histo(d)
  52 + hi = hi + ggtitle(title)
  53 + hists = c(hists, list(hi))
  54 +}
  55 +
  56 +pdf(file=paste(DATA_DIR, "summary-length-plots.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27)
  57 +grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol=2, nrow=2)
  58 +dev.off()
  59 +
  60 +pdf(file=paste(DATA_DIR, "summary-length-hists.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27)
  61 +grid.arrange(hists[[1]], hists[[2]], hists[[3]], hists[[4]], ncol=2, nrow=2)
  62 +dev.off()
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
@@ -18,7 +18,7 @@ public class CorpusHelper { @@ -18,7 +18,7 @@ public class CorpusHelper {
18 private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; 18 private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
19 private static final String EXTRACT_SUMMARY_TYPE = "extract"; 19 private static final String EXTRACT_SUMMARY_TYPE = "extract";
20 20
21 - private static final int SUMMARY_RATIO = 20; 21 + public static final int SUMMARY_RATIO = 20;
22 22
23 private CorpusHelper() { 23 private CorpusHelper() {
24 } 24 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
@@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter; @@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter;
6 import org.slf4j.Logger; 6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 7 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.summ.nicolas.Constants; 8 import pl.waw.ipipan.zil.summ.nicolas.Constants;
  9 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
9 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; 10 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; 11 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
11 import pl.waw.ipipan.zil.summ.pscapi.xml.Text; 12 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
@@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths { @@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths {
26 private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); 27 private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class);
27 28
28 private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", 29 private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId",
29 - "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); 30 + "TextWC", "SumType", "SumAuthor", "SumRatio", "SumWC", "SumRealRatio").withDelimiter('\t');
30 31
31 private CalculateSystemSummaryLengths() { 32 private CalculateSystemSummaryLengths() {
32 } 33 }
@@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths { @@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths {
61 record.add(textWC); 62 record.add(textWC);
62 record.add("automatic"); 63 record.add("automatic");
63 record.add(systemName); 64 record.add(systemName);
  65 + record.add(CorpusHelper.SUMMARY_RATIO);
64 int sumWC = TextUtils.tokenize(body).size(); 66 int sumWC = TextUtils.tokenize(body).size();
65 record.add(sumWC); 67 record.add(sumWC);
66 - record.add(sumWC * 1.0 / textWC); 68 + record.add(sumWC * 100.0 / textWC);
67 printer.printRecord(record); 69 printer.printRecord(record);
68 } 70 }
69 71
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -29,6 +29,8 @@ public class SummarizeTestCorpus { @@ -29,6 +29,8 @@ public class SummarizeTestCorpus {
29 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); 29 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
30 30
31 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; 31 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
  32 + private static final String SUMMARY_WITH_ZERO_FILE_SUFFIX = "_nicolas-zero.txt";
  33 +
32 private static final double SUMMARY_RATIO = 0.2; 34 private static final double SUMMARY_RATIO = 0.2;
33 35
34 private SummarizeTestCorpus() { 36 private SummarizeTestCorpus() {
@@ -41,16 +43,20 @@ public class SummarizeTestCorpus { @@ -41,16 +43,20 @@ public class SummarizeTestCorpus {
41 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); 43 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains);
42 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); 44 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
43 45
44 - Map<String, String> id2summary = summarizeTexts(id2preprocessedText); 46 + summarize(new Nicolas(false), id2preprocessedText, SUMMARY_FILE_SUFFIX);
  47 + summarize(new Nicolas(), id2preprocessedText, SUMMARY_WITH_ZERO_FILE_SUFFIX);
  48 + }
  49 +
  50 + private static void summarize(Nicolas nicolas, Map<String, TText> id2preprocessedText, String fileSuffix) throws NicolasException, IOException {
  51 + Map<String, String> id2summary = summarizeTexts(id2preprocessedText, nicolas);
45 LOG.info("Texts summarized."); 52 LOG.info("Texts summarized.");
46 53
47 - saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); 54 + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR, fileSuffix);
48 LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); 55 LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR);
49 } 56 }
50 57
51 - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { 58 + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText, Nicolas nicolas) throws NicolasException {
52 Map<String, String> id2summary = Maps.newHashMap(); 59 Map<String, String> id2summary = Maps.newHashMap();
53 - Nicolas nicolas = new Nicolas();  
54 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 60 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
55 TText text = entry.getValue(); 61 TText text = entry.getValue();
56 int targetSize = calculateTargetSize(text); 62 int targetSize = calculateTargetSize(text);
@@ -70,11 +76,11 @@ public class SummarizeTestCorpus { @@ -70,11 +76,11 @@ public class SummarizeTestCorpus {
70 return (int) (SUMMARY_RATIO * tokenCount); 76 return (int) (SUMMARY_RATIO * tokenCount);
71 } 77 }
72 78
73 - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { 79 + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir, String fileSuffix) throws IOException {
74 for (Map.Entry<String, String> entry : id2summary.entrySet()) { 80 for (Map.Entry<String, String> entry : id2summary.entrySet()) {
75 String textId = entry.getKey(); 81 String textId = entry.getKey();
76 String summary = entry.getValue(); 82 String summary = entry.getValue();
77 - String targetFileName = textId + SUMMARY_FILE_SUFFIX; 83 + String targetFileName = textId + fileSuffix;
78 try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { 84 try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) {
79 writer.write(summary); 85 writer.write(summary);
80 } 86 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java
1 package pl.waw.ipipan.zil.summ.nicolas.train; 1 package pl.waw.ipipan.zil.summ.nicolas.train;
2 2
3 import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*; 3 import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractMostFrequentMentions;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractStopwords;
4 6
5 public class Main { 7 public class Main {
6 8
@@ -12,6 +14,8 @@ public class Main { @@ -12,6 +14,8 @@ public class Main {
12 DownloadTrainingResources.main(args); 14 DownloadTrainingResources.main(args);
13 ExtractGoldSummaries.main(args); 15 ExtractGoldSummaries.main(args);
14 CreateOptimalSummaries.main(args); 16 CreateOptimalSummaries.main(args);
  17 + ExtractStopwords.main(args);
  18 + ExtractMostFrequentMentions.main(args);
15 PrepareTrainingData.main(args); 19 PrepareTrainingData.main(args);
16 TrainAllModels.main(args); 20 TrainAllModels.main(args);
17 } 21 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
@@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord; @@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord;
7 import org.apache.commons.csv.QuoteMode; 7 import org.apache.commons.csv.QuoteMode;
8 import pl.waw.ipipan.zil.summ.nicolas.Constants; 8 import pl.waw.ipipan.zil.summ.nicolas.Constants;
9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 10 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
11 11
12 import java.io.*; 12 import java.io.*;
13 import java.util.List; 13 import java.util.List;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; @@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer;
19 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; 19 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer;
20 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; 20 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
21 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; 21 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
22 -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;  
23 -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;  
24 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; 22 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
25 -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 23 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  24 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
26 import weka.core.Instance; 25 import weka.core.Instance;
27 import weka.core.Instances; 26 import weka.core.Instances;
28 import weka.core.converters.ArffSaver; 27 import weka.core.converters.ArffSaver;
@@ -152,7 +151,7 @@ public class PrepareTrainingData { @@ -152,7 +151,7 @@ public class PrepareTrainingData {
152 FeatureHelper featureHelper = new FeatureHelper(text); 151 FeatureHelper featureHelper = new FeatureHelper(text);
153 152
154 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); 153 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
155 - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); 154 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
156 155
157 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { 156 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
158 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); 157 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.resources;
  2 +
  3 +import com.google.common.collect.*;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
  7 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  8 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  9 +
  10 +import javax.xml.bind.JAXBException;
  11 +import java.io.File;
  12 +import java.io.IOException;
  13 +import java.util.Comparator;
  14 +import java.util.List;
  15 +import java.util.Map;
  16 +import java.util.Set;
  17 +import java.util.stream.Collectors;
  18 +
  19 +public class ExtractMostFrequentMentions {
  20 +
  21 + public static final String GOLD_DATA_PATH = "/home/me2/Dropbox/3_nauka/3_doktorat/3_korpus_streszczen/dist/src/data/";
  22 +
  23 + public static final String THRIFTED_PREFIX = "/home/me2/Desktop/thrifted_texts/thrifted_all/";
  24 + public static final String THRIFTED_SUFFIX = "/original";
  25 +
  26 + public static void main(String[] args) throws IOException, JAXBException {
  27 +
  28 + Set<String> devIds = Sets.newHashSet();
  29 +
  30 + File goldDir = new File(GOLD_DATA_PATH);
  31 + for (File file : goldDir.listFiles()) {
  32 + Text goldText = PSC_IO.readText(file);
  33 + if (goldText.getSummaries().getSummary().stream().anyMatch(s -> s.getType().equals("abstract")))
  34 + continue;
  35 +
  36 + devIds.add(file.getName().replace(".xml", ""));
  37 + }
  38 +
  39 +
  40 + System.out.println(devIds.size());
  41 +
  42 + Multiset<String> mentionCounts = HashMultiset.create();
  43 + for (String id : devIds) {
  44 + Set<String> distinctTextMentions = Sets.newHashSet();
  45 + File input = new File(THRIFTED_PREFIX + id + THRIFTED_SUFFIX);
  46 + TText thrifted = ThriftUtils.loadThriftTextFromFile(input);
  47 + List<TSentence> sents = thrifted.getParagraphs().stream()
  48 + .flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
  49 +
  50 + Map<String, String> tokenId2base = Maps.newHashMap();
  51 + sents.stream()
  52 + .flatMap(s -> s.getTokens().stream())
  53 + .forEach(token -> tokenId2base.put(token.getId(), token.getChosenInterpretation().getBase()));
  54 +
  55 + sents.stream().flatMap(s -> s.getMentions().stream()).forEach(m -> {
  56 + StringBuffer sb = new StringBuffer();
  57 + for (String tokId : m.getChildIds()) {
  58 + sb.append(tokenId2base.get(tokId) + " ");
  59 + }
  60 + distinctTextMentions.add(sb.toString().trim().toLowerCase());
  61 + });
  62 +
  63 + mentionCounts.addAll(distinctTextMentions);
  64 + }
  65 +
  66 + System.out.println(mentionCounts.elementSet().size());
  67 + List<String> sorted = Lists.newArrayList();
  68 + sorted.addAll(mentionCounts.elementSet());
  69 + sorted.sort(Comparator.comparing(mentionCounts::count).reversed());
  70 + int i = 0;
  71 + for (String mention : sorted) {
  72 + if (mentionCounts.count(mention) < 50)
  73 + break;
  74 + System.out.println(mention);
  75 + }
  76 +
  77 + }
  78 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.resources;
  2 +
/**
 * Stub for a stopword extraction tool. The class exposes only a static entry point, and that
 * entry point currently contains no logic.
 */
public class ExtractStopwords {

    /** Private so that this class, which has only static members, cannot be instantiated. */
    private ExtractStopwords() {
    }

    /**
     * Entry point; currently does nothing.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // No extraction logic is implemented here.
    }
}