Commit 156b37078e717ab9f3849c9d6a1ef1d2ddbc2a10

Authored by Mateusz Kopeć
1 parent 89870bd0

add zero subject removal

Showing 20 changed files with 324 additions and 117 deletions
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... ... @@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
12 12 import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
13 13 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
14 14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  15 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroModel;
15 16 import weka.classifiers.Classifier;
16 17  
17 18 import java.io.IOException;
... ... @@ -29,35 +30,43 @@ public class Nicolas {
29 30 private final SentenceFeatureExtractor sentenceFeatureExtractor;
30 31 private final ZeroFeatureExtractor zeroFeatureExtractor;
31 32  
32   - public Nicolas() throws NicolasException {
  33 + public Nicolas(boolean useZeroModel) throws NicolasException {
33 34 try {
  35 + mentionFeatureExtractor = new MentionFeatureExtractor();
34 36 mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
35   - sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
36   - zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
37 37  
38   - mentionFeatureExtractor = new MentionFeatureExtractor();
39 38 sentenceFeatureExtractor = new SentenceFeatureExtractor();
40   - zeroFeatureExtractor = new ZeroFeatureExtractor();
  39 + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  40 +
  41 + zeroFeatureExtractor = useZeroModel ? new ZeroFeatureExtractor() : null;
  42 + zeroModel = useZeroModel ? ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH) : null;
  43 +
41 44 } catch (IOException e) {
42 45 throw new NicolasException(e);
43 46 }
44 47 }
45 48  
  49 + public Nicolas() throws NicolasException {
  50 + this(true);
  51 + }
  52 +
46 53 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException {
47 54 try {
48 55 Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text);
49   - return calculateSummary(text, goodMentions, targetTokenCount);
  56 + List<TSentence> selectedSentences = selectSummarySentences(text, goodMentions, targetTokenCount);
  57 + Set<String> zeroSubjectTokenIds = zeroModel == null ? Collections.emptySet() : ZeroModel.findZeroSubjectTokenIds(zeroModel, zeroFeatureExtractor, text, selectedSentences);
  58 +
  59 + return createSummaryFromSentences(selectedSentences, zeroSubjectTokenIds);
  60 +
50 61 } catch (Exception e) {
51 62 throw new NicolasException(e);
52 63 }
53 64 }
54 65  
55   - private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {
56   - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize);
57   -
  66 + private String createSummaryFromSentences(List<TSentence> selectedSentences, Set<String> zeroSubjectTokenIds) {
58 67 StringBuilder sb = new StringBuilder();
59 68 for (TSentence sent : selectedSentences) {
60   - sb.append(" ").append(TextUtils.loadSentence2Orth(sent));
  69 + sb.append(" ").append(TextUtils.loadSentence2Orth(sent, zeroSubjectTokenIds));
61 70 }
62 71 return sb.toString().trim();
63 72 }
... ... @@ -70,16 +79,16 @@ public class Nicolas {
70 79 List<TSentence> sortedSentences = Lists.newArrayList(sentences);
71 80 sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed());
72 81  
73   - int size = 0;
74   - Random r = new Random(1);
  82 + int currentSize = 0;
75 83 Set<TSentence> summary = Sets.newHashSet();
76 84 for (TSentence sent : sortedSentences) {
77   - size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();
78   - if (r.nextDouble() > 0.4 && size > targetSize)
79   - break;
80   - summary.add(sent);
81   - if (size > targetSize)
82   - break;
  85 + int sentenceSize = TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();
  86 + int newSize = currentSize + sentenceSize;
  87 +
  88 + if (Math.abs(newSize - targetSize) < Math.abs(currentSize - targetSize)) {
  89 + currentSize = newSize;
  90 + summary.add(sent);
  91 + }
83 92 }
84 93 List<TSentence> selectedSentences = Lists.newArrayList();
85 94 for (TSentence sent : sentences) {
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
1 1 package pl.waw.ipipan.zil.summ.nicolas;
2 2  
3 3 public class NicolasException extends Exception {
4   - public NicolasException(Exception e) {
  4 + NicolasException(Exception e) {
5 5 super(e);
6 6 }
7 7 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... ... @@ -33,7 +33,7 @@ public class MentionModel {
33 33 if (good)
34 34 goodMentions.add(entry.getKey());
35 35 }
36   - LOG.info("Classified {} mentions as good.", goodMentions.size());
  36 + LOG.debug("Classified {} mentions as good.", goodMentions.size());
37 37 return goodMentions;
38 38 }
39 39  
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... ... @@ -32,7 +32,7 @@ public class SentenceModel {
32 32 double score = sentenceClassifier.classifyInstance(instance);
33 33 sentence2score.put(entry.getKey(), score);
34 34 }
35   - LOG.info("Scored " + sentence2score.size() + " sentences.");
  35 + LOG.debug("Scored {} sentences.", sentence2score.size());
36 36  
37 37 return sentence2score;
38 38 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
... ... @@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
10 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  11 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
11 13 import weka.core.Attribute;
12 14 import weka.core.DenseInstance;
13 15 import weka.core.Instance;
... ... @@ -65,6 +67,20 @@ public class InstanceUtils {
65 67 return sentence2instance;
66 68 }
67 69  
  70 + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
  71 + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
  72 + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
  73 + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
  74 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  75 + Map<Attribute, Double> sentenceFeatures = entry.getValue();
  76 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  77 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  78 + }
  79 + candidate2instance.put(entry.getKey(), instance);
  80 + }
  81 + return candidate2instance;
  82 + }
  83 +
68 84 @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
69 85 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
70 86 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
2   -
3   -import com.google.common.collect.Maps;
4   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
5   -import weka.core.Attribute;
6   -import weka.core.DenseInstance;
7   -import weka.core.Instance;
8   -
9   -import java.util.List;
10   -import java.util.Map;
11   -
12   -public class InstanceCreator {
13   -
14   - private InstanceCreator() {
15   - }
16   -
17   - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
18   - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
19   - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
20   - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
21   - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
22   - Map<Attribute, Double> sentenceFeatures = entry.getValue();
23   - for (Attribute attribute : featureExtractor.getAttributesList()) {
24   - instance.setValue(attribute, sentenceFeatures.get(attribute));
25   - }
26   - candidate2instance.put(entry.getKey(), instance);
27   - }
28   - return candidate2instance;
29   - }
30   -
31   -}
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... ... @@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10 10 import pl.waw.ipipan.zil.summ.nicolas.Constants;
11 11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
12 12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  13 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
13 14 import weka.core.Attribute;
14 15  
15 16 import java.util.List;
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java
... ... @@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 3 import com.google.common.collect.Sets;
4 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6   -import pl.waw.ipipan.zil.summ.nicolas.Constants;
7 6 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
  7 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  8 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
8 9 import weka.classifiers.Classifier;
9 10 import weka.core.Instance;
10 11 import weka.core.Instances;
11   -import weka.core.SerializationHelper;
12 12  
13 13 import java.util.List;
14 14 import java.util.Map;
15 15 import java.util.Set;
16 16 import java.util.stream.Collectors;
17 17  
18   -public class ZeroSubjectInjector {
  18 +public class ZeroModel {
19 19  
20   - private final ZeroFeatureExtractor featureExtractor;
21   - private final Classifier classifier;
22   - private final Instances instances;
23   -
24   - public ZeroSubjectInjector() throws Exception {
25   - classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH);
26   - featureExtractor = new ZeroFeatureExtractor();
27   - instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
  20 + private ZeroModel() {
28 21 }
29 22  
30   - public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception {
  23 + public static Set<String> findZeroSubjectTokenIds(Classifier classifier, ZeroFeatureExtractor featureExtractor, TText text, List<TSentence> selectedSentences) throws Exception {
  24 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
  25 +
31 26 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet());
32 27 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
33 28 Map<ZeroSubjectCandidate, Instance> candidate2instance =
34   - InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  29 + InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
35 30  
36 31 Set<String> result = Sets.newHashSet();
37 32 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) {
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate;
2 2  
3 3 import com.google.common.collect.Lists;
4 4 import com.google.common.collect.Maps;
... ... @@ -12,57 +12,110 @@ import java.util.Set;
12 12  
13 13 public class CandidateFinder {
14 14  
  15 + private static final String SUBST = "subst";
  16 + private static final String NOM = "nom";
  17 + private static final String MSD_SPLITTER = ":";
  18 +
15 19 private CandidateFinder() {
16 20 }
17 21  
18 22 public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) {
  23 + Map<String, Set<String>> mentionId2Cluster = getMentionId2Cluster(text);
  24 + return getZeroSubjectCandidates(text, summarySentenceIds, mentionId2Cluster);
  25 + }
  26 +
  27 + private static List<ZeroSubjectCandidate> getZeroSubjectCandidates(TText text, Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster) {
19 28 List<ZeroSubjectCandidate> candidates = Lists.newArrayList();
20 29  
21   - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
22   - for (TCoreference coreference : text.getCoreferences()) {
23   - for (String mentionId : coreference.getMentionIds()) {
24   - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
  30 + PrevSentenceState prevSentenceState = new PrevSentenceState();
  31 + for (TParagraph p : text.getParagraphs()) {
  32 + for (TSentence sentence : p.getSentences()) {
  33 + processSentence(summarySentenceIds, mentionId2Cluster, candidates, prevSentenceState, sentence);
25 34 }
26 35 }
  36 + return candidates;
  37 + }
  38 +
  39 + private static void processSentence(Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence) {
  40 + if (!summarySentenceIds.contains(sentence.getId()))
  41 + return;
  42 + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
  43 +
  44 + Map<String, TToken> tokenId2Token = getTokenId2Token(sentence);
  45 +
  46 + for (TMention mention : sentence.getMentions()) {
  47 + processMention(mentionId2Cluster, candidates, prevSentenceState, sentence, currentSentenceNominativeMentionIds, tokenId2Token, mention);
  48 + }
27 49  
28   - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();
29   - TSentence prevSentence = null;
30   - for (TParagraph p : text.getParagraphs()) {
31   - for (TSentence sentence : p.getSentences()) {
32   - if (!summarySentenceIds.contains(sentence.getId()))
33   - continue;
34   - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
35   -
36   - Map<String, TToken> tokenId2Token = Maps.newHashMap();
37   - for (TToken t : sentence.getTokens())
38   - tokenId2Token.put(t.getId(), t);
39   -
40   - for (TMention mention : sentence.getMentions()) {
41   -
42   - for (String tokenId : mention.getHeadIds()) {
43   - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
44   - if (isInNominative(interp)) {
45   -
46   - currentSentenceNominativeMentionIds.add(mention.getId());
47   - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {
48   - ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention);
49   - candidates.add(candidate);
50   - }
51   - break;
52   - }
53   - }
  50 + prevSentenceState.setPrevSentence(sentence);
  51 + prevSentenceState.setNominativeMentionIds(currentSentenceNominativeMentionIds);
  52 + }
  53 +
  54 + private static void processMention(Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence, Set<String> currentSentenceNominativeMentionIds, Map<String, TToken> tokenId2Token, TMention mention) {
  55 + for (String tokenId : mention.getHeadIds()) {
  56 + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
  57 + if (isInNominative(interp)) {
  58 + currentSentenceNominativeMentionIds.add(mention.getId());
  59 + if (isCoreferentWithPreviousSentence(mentionId2Cluster, prevSentenceState, mention)) {
  60 + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentenceState.getPrevSentence(), sentence, mention);
  61 + candidates.add(candidate);
54 62 }
  63 + break;
  64 + }
  65 + }
  66 + }
55 67  
56   - prevSentence = sentence;
57   - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;
  68 + private static boolean isCoreferentWithPreviousSentence(Map<String, Set<String>> mentionId2Cluster, PrevSentenceState prevSentenceState, TMention mention) {
  69 + return mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceState.getNominativeMentionIds()::contains);
  70 + }
  71 +
  72 + private static Map<String, TToken> getTokenId2Token(TSentence sentence) {
  73 + Map<String, TToken> tokenId2Token = Maps.newHashMap();
  74 + for (TToken t : sentence.getTokens())
  75 + tokenId2Token.put(t.getId(), t);
  76 + return tokenId2Token;
  77 + }
  78 +
  79 + private static Map<String, Set<String>> getMentionId2Cluster(TText text) {
  80 + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
  81 + for (TCoreference coreference : text.getCoreferences()) {
  82 + for (String mentionId : coreference.getMentionIds()) {
  83 + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
58 84 }
59 85 }
60   - return candidates;
  86 + return mentionId2Cluster;
61 87 }
62 88  
63 89 private static boolean isInNominative(TInterpretation interp) {
64   - boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom"));
65   - boolean isSubst = interp.getCtag().equals("subst");
  90 + boolean isNominative = Arrays.stream(interp.getMsd().split(MSD_SPLITTER)).anyMatch(t -> t.equals(NOM));
  91 + boolean isSubst = interp.getCtag().equals(SUBST);
66 92 return isSubst && isNominative;
67 93 }
  94 +
  95 + private static class PrevSentenceState {
  96 +
  97 + private Set<String> nominativeMentionIds;
  98 + private TSentence prevSentence;
  99 +
  100 + PrevSentenceState() {
  101 + nominativeMentionIds = Sets.newHashSet();
  102 + prevSentence = null;
  103 + }
  104 +
  105 + Set<String> getNominativeMentionIds() {
  106 + return nominativeMentionIds;
  107 + }
  108 +
  109 + TSentence getPrevSentence() {
  110 + return prevSentence;
  111 + }
  112 +
  113 + void setNominativeMentionIds(Set<String> nominativeMentionIds) {
  114 + this.nominativeMentionIds = nominativeMentionIds;
  115 + }
  116 +
  117 + void setPrevSentence(TSentence prevSentence) {
  118 + this.prevSentence = prevSentence;
  119 + }
  120 + }
68 121 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate;
2 2  
3 3 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
4 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
... ... @@ -9,7 +9,7 @@ public class ZeroSubjectCandidate {
9 9 private final TSentence sentence;
10 10 private final TMention zeroCandidateMention;
11 11  
12   - public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) {
  12 + ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) {
13 13 this.previousSentence = previousSentence;
14 14 this.sentence = sentence;
15 15 this.zeroCandidateMention = zeroCandidateMention;
... ...
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... ... @@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
9 9 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  11 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
10 12  
11 13 import java.io.IOException;
12 14 import java.io.InputStream;
... ...
nicolas-train/src/main/R/plot_summary_lenghts.R 0 → 100644
  1 +require(ggplot2)
  2 +require(grid)
  3 +require(gridExtra)
  4 +require(lattice)
  5 +
  6 +DATA_DIR="../../../../data/"
  7 +
  8 +########################## functions
# Box plot of the obtained summary ratio (SumRealRatio), one box per requested
# ratio (SumRatio). Columns are referenced by bare name inside aes() so that
# ggplot2 resolves them against the data frame given to ggplot().
gpl = function(d) {
  ggplot(d, aes(x = as.factor(SumRatio), y = SumRealRatio)) +
    geom_boxplot(outlier.shape = 4, outlier.colour = "blue") +
    ylim(0, 40) +
    ylab("Obtained summary ratio (word count)") +
    xlab("Requested summary ratio (word count)") +
    theme(text = element_text(size = 15))
}
  17 +
# Thin wrapper around gpl(); hands the plot back invisibly.
ploto = function(d) {
  invisible(gpl(d))
}
  21 +
# Histogram of the obtained summary ratio expressed as a percentage of the
# requested ratio, restricted to the 80-120 range. Columns are referenced by
# bare name inside aes() so that ggplot2 resolves them against the data frame
# given to ggplot(). The plot is returned invisibly.
histo = function(d) {
  p = ggplot(d, aes(x = abs(SumRealRatio * 100 / SumRatio))) +
    geom_histogram(binwidth = 1) +
    xlim(80, 120) +
    ylab("Number of summaries") +
    xlab("Obtained summary ratio as percent of requested ratio (20%)") +
    theme(text = element_text(size = 15))
  invisible(p)
}
  30 +
  31 +######################### automatic summaries
# Read the tab-separated summary length table.
data = read.csv(paste0(DATA_DIR, "summary-lengths.tsv"), sep = "\t")

# SumAuthor values to plot and the matching plot titles (same order).
names = list("Swietl", "nicolas", "nicolas-zero", "BASELINE")
titles = list("Świetlicka", "Nicolas", "Nicolas-zero", "Baseline")
plots = list()
hists = list()
for (i in seq_along(names)) {
  n = names[[i]]
  title = titles[[i]]
  print(n)

  # Rows for this author only, plus their mean obtained ratio.
  d = data[data$SumAuthor == n, ]
  print(mean(d$SumRealRatio))

  plots[[i]] = ploto(d) + ggtitle(title)
  hists[[i]] = histo(d) + ggtitle(title)
}

# One A4-landscape page with a 2x2 grid of box plots.
pdf(file = paste0(DATA_DIR, "summary-length-plots.pdf"), encoding = "ISOLatin2", width = 11.69, height = 8.27)
grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol = 2, nrow = 2)
dev.off()

# Same layout for the histograms.
pdf(file = paste0(DATA_DIR, "summary-length-hists.pdf"), encoding = "ISOLatin2", width = 11.69, height = 8.27)
grid.arrange(hists[[1]], hists[[2]], hists[[3]], hists[[4]], ncol = 2, nrow = 2)
dev.off()
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
... ... @@ -18,7 +18,7 @@ public class CorpusHelper {
18 18 private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
19 19 private static final String EXTRACT_SUMMARY_TYPE = "extract";
20 20  
21   - private static final int SUMMARY_RATIO = 20;
  21 + public static final int SUMMARY_RATIO = 20;
22 22  
23 23 private CorpusHelper() {
24 24 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
... ... @@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter;
6 6 import org.slf4j.Logger;
7 7 import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.summ.nicolas.Constants;
  9 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
9 10 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 11 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
11 12 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
... ... @@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths {
26 27 private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class);
27 28  
28 29 private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId",
29   - "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t');
  30 + "TextWC", "SumType", "SumAuthor", "SumRatio", "SumWC", "SumRealRatio").withDelimiter('\t');
30 31  
31 32 private CalculateSystemSummaryLengths() {
32 33 }
... ... @@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths {
61 62 record.add(textWC);
62 63 record.add("automatic");
63 64 record.add(systemName);
  65 + record.add(CorpusHelper.SUMMARY_RATIO);
64 66 int sumWC = TextUtils.tokenize(body).size();
65 67 record.add(sumWC);
66   - record.add(sumWC * 1.0 / textWC);
  68 + record.add(sumWC * 100.0 / textWC);
67 69 printer.printRecord(record);
68 70 }
69 71  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... ... @@ -29,6 +29,8 @@ public class SummarizeTestCorpus {
29 29 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
30 30  
31 31 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
  32 + private static final String SUMMARY_WITH_ZERO_FILE_SUFFIX = "_nicolas-zero.txt";
  33 +
32 34 private static final double SUMMARY_RATIO = 0.2;
33 35  
34 36 private SummarizeTestCorpus() {
... ... @@ -41,16 +43,20 @@ public class SummarizeTestCorpus {
41 43 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains);
42 44 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
43 45  
44   - Map<String, String> id2summary = summarizeTexts(id2preprocessedText);
  46 + summarize(new Nicolas(false), id2preprocessedText, SUMMARY_FILE_SUFFIX);
  47 + summarize(new Nicolas(), id2preprocessedText, SUMMARY_WITH_ZERO_FILE_SUFFIX);
  48 + }
  49 +
  50 + private static void summarize(Nicolas nicolas, Map<String, TText> id2preprocessedText, String fileSuffix) throws NicolasException, IOException {
  51 + Map<String, String> id2summary = summarizeTexts(id2preprocessedText, nicolas);
45 52 LOG.info("Texts summarized.");
46 53  
47   - saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR);
  54 + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR, fileSuffix);
48 55 LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR);
49 56 }
50 57  
51   - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException {
  58 + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText, Nicolas nicolas) throws NicolasException {
52 59 Map<String, String> id2summary = Maps.newHashMap();
53   - Nicolas nicolas = new Nicolas();
54 60 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
55 61 TText text = entry.getValue();
56 62 int targetSize = calculateTargetSize(text);
... ... @@ -70,11 +76,11 @@ public class SummarizeTestCorpus {
70 76 return (int) (SUMMARY_RATIO * tokenCount);
71 77 }
72 78  
73   - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException {
  79 + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir, String fileSuffix) throws IOException {
74 80 for (Map.Entry<String, String> entry : id2summary.entrySet()) {
75 81 String textId = entry.getKey();
76 82 String summary = entry.getValue();
77   - String targetFileName = textId + SUMMARY_FILE_SUFFIX;
  83 + String targetFileName = textId + fileSuffix;
78 84 try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) {
79 85 writer.write(summary);
80 86 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train;
2 2  
3 3 import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractMostFrequentMentions;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractStopwords;
4 6  
5 7 public class Main {
6 8  
... ... @@ -12,6 +14,8 @@ public class Main {
12 14 DownloadTrainingResources.main(args);
13 15 ExtractGoldSummaries.main(args);
14 16 CreateOptimalSummaries.main(args);
  17 + ExtractStopwords.main(args);
  18 + ExtractMostFrequentMentions.main(args);
15 19 PrepareTrainingData.main(args);
16 20 TrainAllModels.main(args);
17 21 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
... ... @@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord;
7 7 import org.apache.commons.csv.QuoteMode;
8 8 import pl.waw.ipipan.zil.summ.nicolas.Constants;
9 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10   -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
11 11  
12 12 import java.io.*;
13 13 import java.util.List;
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... ... @@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer;
19 19 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer;
20 20 import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
21 21 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
22   -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
23   -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
24 22 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
25   -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
  23 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder;
  24 +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate;
26 25 import weka.core.Instance;
27 26 import weka.core.Instances;
28 27 import weka.core.converters.ArffSaver;
... ... @@ -152,7 +151,7 @@ public class PrepareTrainingData {
152 151 FeatureHelper featureHelper = new FeatureHelper(text);
153 152  
154 153 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
155   - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  154 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
156 155  
157 156 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
158 157 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.resources;
  2 +
  3 +import com.google.common.collect.*;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
  7 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  8 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  9 +
  10 +import javax.xml.bind.JAXBException;
  11 +import java.io.File;
  12 +import java.io.IOException;
  13 +import java.util.Comparator;
  14 +import java.util.List;
  15 +import java.util.Map;
  16 +import java.util.Set;
  17 +import java.util.stream.Collectors;
  18 +
  19 +public class ExtractMostFrequentMentions {
  20 +
  21 + public static final String GOLD_DATA_PATH = "/home/me2/Dropbox/3_nauka/3_doktorat/3_korpus_streszczen/dist/src/data/";
  22 +
  23 + public static final String THRIFTED_PREFIX = "/home/me2/Desktop/thrifted_texts/thrifted_all/";
  24 + public static final String THRIFTED_SUFFIX = "/original";
  25 +
  26 + public static void main(String[] args) throws IOException, JAXBException {
  27 +
  28 + Set<String> devIds = Sets.newHashSet();
  29 +
  30 + File goldDir = new File(GOLD_DATA_PATH);
  31 + for (File file : goldDir.listFiles()) {
  32 + Text goldText = PSC_IO.readText(file);
  33 + if (goldText.getSummaries().getSummary().stream().anyMatch(s -> s.getType().equals("abstract")))
  34 + continue;
  35 +
  36 + devIds.add(file.getName().replace(".xml", ""));
  37 + }
  38 +
  39 +
  40 + System.out.println(devIds.size());
  41 +
  42 + Multiset<String> mentionCounts = HashMultiset.create();
  43 + for (String id : devIds) {
  44 + Set<String> distinctTextMentions = Sets.newHashSet();
  45 + File input = new File(THRIFTED_PREFIX + id + THRIFTED_SUFFIX);
  46 + TText thrifted = ThriftUtils.loadThriftTextFromFile(input);
  47 + List<TSentence> sents = thrifted.getParagraphs().stream()
  48 + .flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
  49 +
  50 + Map<String, String> tokenId2base = Maps.newHashMap();
  51 + sents.stream()
  52 + .flatMap(s -> s.getTokens().stream())
  53 + .forEach(token -> tokenId2base.put(token.getId(), token.getChosenInterpretation().getBase()));
  54 +
  55 + sents.stream().flatMap(s -> s.getMentions().stream()).forEach(m -> {
  56 + StringBuffer sb = new StringBuffer();
  57 + for (String tokId : m.getChildIds()) {
  58 + sb.append(tokenId2base.get(tokId) + " ");
  59 + }
  60 + distinctTextMentions.add(sb.toString().trim().toLowerCase());
  61 + });
  62 +
  63 + mentionCounts.addAll(distinctTextMentions);
  64 + }
  65 +
  66 + System.out.println(mentionCounts.elementSet().size());
  67 + List<String> sorted = Lists.newArrayList();
  68 + sorted.addAll(mentionCounts.elementSet());
  69 + sorted.sort(Comparator.comparing(mentionCounts::count).reversed());
  70 + int i = 0;
  71 + for (String mention : sorted) {
  72 + if (mentionCounts.count(mention) < 50)
  73 + break;
  74 + System.out.println(mention);
  75 + }
  76 +
  77 + }
  78 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.resources;
  2 +
  3 +public class ExtractStopwords {
  4 +
  5 + private ExtractStopwords() {
  6 + }
  7 +
  8 + public static void main(String[] args) {
  9 +
  10 + }
  11 +}
... ...