diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java index 79d3e34..0281fd6 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java @@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroModel; import weka.classifiers.Classifier; import java.io.IOException; @@ -29,35 +30,43 @@ public class Nicolas { private final SentenceFeatureExtractor sentenceFeatureExtractor; private final ZeroFeatureExtractor zeroFeatureExtractor; - public Nicolas() throws NicolasException { + public Nicolas(boolean useZeroModel) throws NicolasException { try { + mentionFeatureExtractor = new MentionFeatureExtractor(); mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); - sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); - zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); - mentionFeatureExtractor = new MentionFeatureExtractor(); sentenceFeatureExtractor = new SentenceFeatureExtractor(); - zeroFeatureExtractor = new ZeroFeatureExtractor(); + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); + + zeroFeatureExtractor = useZeroModel ? new ZeroFeatureExtractor() : null; + zeroModel = useZeroModel ? 
ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH) : null; + } catch (IOException e) { throw new NicolasException(e); } } + public Nicolas() throws NicolasException { + this(true); + } + public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { try { Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); - return calculateSummary(text, goodMentions, targetTokenCount); + List<TSentence> selectedSentences = selectSummarySentences(text, goodMentions, targetTokenCount); + Set<String> zeroSubjectTokenIds = zeroModel == null ? Collections.emptySet() : ZeroModel.findZeroSubjectTokenIds(zeroModel, zeroFeatureExtractor, text, selectedSentences); + + return createSummaryFromSentences(selectedSentences, zeroSubjectTokenIds); + } catch (Exception e) { throw new NicolasException(e); } } - private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize); - + private String createSummaryFromSentences(List<TSentence> selectedSentences, Set<String> zeroSubjectTokenIds) { StringBuilder sb = new StringBuilder(); for (TSentence sent : selectedSentences) { - sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); + sb.append(" ").append(TextUtils.loadSentence2Orth(sent, zeroSubjectTokenIds)); } return sb.toString().trim(); } @@ -70,16 +79,16 @@ public class Nicolas { List<TSentence> sortedSentences = Lists.newArrayList(sentences); sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); - int size = 0; - Random r = new Random(1); + int currentSize = 0; Set<TSentence> summary = Sets.newHashSet(); for (TSentence sent : sortedSentences) { - size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); - if (r.nextDouble() > 0.4 && size > targetSize) - break; - summary.add(sent); - if (size > targetSize) - 
break; + int sentenceSize = TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); + int newSize = currentSize + sentenceSize; + + if (Math.abs(newSize - targetSize) < Math.abs(currentSize - targetSize)) { + currentSize = newSize; + summary.add(sent); + } } List<TSentence> selectedSentences = Lists.newArrayList(); for (TSentence sent : sentences) { diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java index a8ae412..6ff3cfb 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java @@ -1,7 +1,7 @@ package pl.waw.ipipan.zil.summ.nicolas; public class NicolasException extends Exception { - public NicolasException(Exception e) { + NicolasException(Exception e) { super(e); } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java index 83468e3..449a0b5 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java @@ -33,7 +33,7 @@ public class MentionModel { if (good) goodMentions.add(entry.getKey()); } - LOG.info("Classified {} mentions as good.", goodMentions.size()); + LOG.debug("Classified {} mentions as good.", goodMentions.size()); return goodMentions; } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java index dc9cc6f..bbb088d 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java @@ -32,7 +32,7 @@ 
public class SentenceModel { double score = sentenceClassifier.classifyInstance(instance); sentence2score.put(entry.getKey(), score); } - LOG.info("Scored " + sentence2score.size() + " sentences."); + LOG.debug("Scored {} sentences.", sentence2score.size()); return sentence2score; } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java index 7fdf82b..a7780f1 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java @@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Instance; @@ -65,6 +67,20 @@ public class InstanceUtils { return sentence2instance; } + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); + Map<Attribute, Double> sentenceFeatures = entry.getValue(); + for (Attribute attribute : featureExtractor.getAttributesList()) { + instance.setValue(attribute, 
sentenceFeatures.get(attribute)); + } + candidate2instance.put(entry.getKey(), instance); + } + return candidate2instance; + } + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList public static Instances createNewInstances(ArrayList<Attribute> attributesList) { Instances instances = new Instances(DATASET_NAME, attributesList, 0); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java deleted file mode 100644 index 8873735..0000000 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java +++ /dev/null @@ -1,31 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.zero; - -import com.google.common.collect.Maps; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import weka.core.Attribute; -import weka.core.DenseInstance; -import weka.core.Instance; - -import java.util.List; -import java.util.Map; - -public class InstanceCreator { - - private InstanceCreator() { - } - - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); - Map<Attribute, Double> sentenceFeatures = entry.getValue(); - for (Attribute attribute : featureExtractor.getAttributesList()) { - instance.setValue(attribute, sentenceFeatures.get(attribute)); - } - candidate2instance.put(entry.getKey(), instance); - } - return candidate2instance; - } - -} diff --git 
a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java index dfa853b..d63fe0b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java @@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import weka.core.Attribute; import java.util.List; diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java index 11280f6..a49460d 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java @@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; -import weka.core.SerializationHelper; import java.util.List; import java.util.Map; import java.util.Set; import java.util.stream.Collectors; -public class ZeroSubjectInjector { +public class ZeroModel { - private final ZeroFeatureExtractor featureExtractor; - 
private final Classifier classifier; - private final Instances instances; - - public ZeroSubjectInjector() throws Exception { - classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); - featureExtractor = new ZeroFeatureExtractor(); - instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); + private ZeroModel() { } - public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { + public static Set<String> findZeroSubjectTokenIds(Classifier classifier, ZeroFeatureExtractor featureExtractor, TText text, List<TSentence> selectedSentences) throws Exception { + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); + Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); Map<ZeroSubjectCandidate, Instance> candidate2instance = - InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); + InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); Set<String> result = Sets.newHashSet(); for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java index f862b31..0daf066 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.zero; +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; import com.google.common.collect.Lists; import com.google.common.collect.Maps; @@ -12,57 
+12,110 @@ import java.util.Set; public class CandidateFinder { + private static final String SUBST = "subst"; + private static final String NOM = "nom"; + private static final String MSD_SPLITTER = ":"; + private CandidateFinder() { } public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { + Map<String, Set<String>> mentionId2Cluster = getMentionId2Cluster(text); + return getZeroSubjectCandidates(text, summarySentenceIds, mentionId2Cluster); + } + + private static List<ZeroSubjectCandidate> getZeroSubjectCandidates(TText text, Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster) { List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); - for (TCoreference coreference : text.getCoreferences()) { - for (String mentionId : coreference.getMentionIds()) { - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); + PrevSentenceState prevSentenceState = new PrevSentenceState(); + for (TParagraph p : text.getParagraphs()) { + for (TSentence sentence : p.getSentences()) { + processSentence(summarySentenceIds, mentionId2Cluster, candidates, prevSentenceState, sentence); } } + return candidates; + } + + private static void processSentence(Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence) { + if (!summarySentenceIds.contains(sentence.getId())) + return; + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); + + Map<String, TToken> tokenId2Token = getTokenId2Token(sentence); + + for (TMention mention : sentence.getMentions()) { + processMention(mentionId2Cluster, candidates, prevSentenceState, sentence, currentSentenceNominativeMentionIds, tokenId2Token, mention); + } - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); - TSentence prevSentence = null; - 
for (TParagraph p : text.getParagraphs()) { - for (TSentence sentence : p.getSentences()) { - if (!summarySentenceIds.contains(sentence.getId())) - continue; - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); - - Map<String, TToken> tokenId2Token = Maps.newHashMap(); - for (TToken t : sentence.getTokens()) - tokenId2Token.put(t.getId(), t); - - for (TMention mention : sentence.getMentions()) { - - for (String tokenId : mention.getHeadIds()) { - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); - if (isInNominative(interp)) { - - currentSentenceNominativeMentionIds.add(mention.getId()); - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { - ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention); - candidates.add(candidate); - } - break; - } - } + prevSentenceState.setPrevSentence(sentence); + prevSentenceState.setNominativeMentionIds(currentSentenceNominativeMentionIds); + } + + private static void processMention(Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence, Set<String> currentSentenceNominativeMentionIds, Map<String, TToken> tokenId2Token, TMention mention) { + for (String tokenId : mention.getHeadIds()) { + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); + if (isInNominative(interp)) { + currentSentenceNominativeMentionIds.add(mention.getId()); + if (isCoreferentWithPreviousSentence(mentionId2Cluster, prevSentenceState, mention)) { + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentenceState.getPrevSentence(), sentence, mention); + candidates.add(candidate); } + break; + } + } + } - prevSentence = sentence; - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; + private static boolean isCoreferentWithPreviousSentence(Map<String, Set<String>> mentionId2Cluster, 
PrevSentenceState prevSentenceState, TMention mention) { + return mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceState.getNominativeMentionIds()::contains); + } + + private static Map<String, TToken> getTokenId2Token(TSentence sentence) { + Map<String, TToken> tokenId2Token = Maps.newHashMap(); + for (TToken t : sentence.getTokens()) + tokenId2Token.put(t.getId(), t); + return tokenId2Token; + } + + private static Map<String, Set<String>> getMentionId2Cluster(TText text) { + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); + for (TCoreference coreference : text.getCoreferences()) { + for (String mentionId : coreference.getMentionIds()) { + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); } } - return candidates; + return mentionId2Cluster; } private static boolean isInNominative(TInterpretation interp) { - boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); - boolean isSubst = interp.getCtag().equals("subst"); + boolean isNominative = Arrays.stream(interp.getMsd().split(MSD_SPLITTER)).anyMatch(t -> t.equals(NOM)); + boolean isSubst = interp.getCtag().equals(SUBST); return isSubst && isNominative; } + + private static class PrevSentenceState { + + private Set<String> nominativeMentionIds; + private TSentence prevSentence; + + PrevSentenceState() { + nominativeMentionIds = Sets.newHashSet(); + prevSentence = null; + } + + Set<String> getNominativeMentionIds() { + return nominativeMentionIds; + } + + TSentence getPrevSentence() { + return prevSentence; + } + + void setNominativeMentionIds(Set<String> nominativeMentionIds) { + this.nominativeMentionIds = nominativeMentionIds; + } + + void setPrevSentence(TSentence prevSentence) { + this.prevSentence = prevSentence; + } + } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java 
b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java index 6d0a76f..c14a55f 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.zero; +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; @@ -9,7 +9,7 @@ public class ZeroSubjectCandidate { private final TSentence sentence; private final TMention zeroCandidateMention; - public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { + ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { this.previousSentence = previousSentence; this.sentence = sentence; this.zeroCandidateMention = zeroCandidateMention; diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java index 274356b..a2adc96 100644 --- a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java @@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import java.io.IOException; import java.io.InputStream; diff --git a/nicolas-train/src/main/R/plot_summary_lenghts.R 
b/nicolas-train/src/main/R/plot_summary_lenghts.R new file mode 100644 index 0000000..bfbc326 --- /dev/null +++ b/nicolas-train/src/main/R/plot_summary_lenghts.R @@ -0,0 +1,62 @@ +require(ggplot2) +require(grid) +require(gridExtra) +require(lattice) + +DATA_DIR="../../../../data/" + +########################## functions +gpl = function(d) { + ggplot(d, aes(x=as.factor(d$SumRatio), y=SumRealRatio)) + + geom_boxplot(outlier.shape=4, outlier.colour = "blue") + + ylim(0, 40) + + ylab("Obtained summary ratio (word count)") + + xlab("Requested summary ratio (word count)") + + theme(text = element_text(size=15)) +} + +ploto = function(d) { + p = gpl(d) +} + +histo = function(d) { + p = ggplot(d, aes(abs(d$SumRealRatio*100/d$SumRatio))) + + geom_histogram(binwidth = 1) + + xlim(80, 120) + + ylab("Number of summaries") + + xlab("Obtained summary ratio as percent of requested ratio (20%)") + + theme(text = element_text(size=15)) +} + +######################### automatic summaries +data = read.csv(paste(DATA_DIR, "summary-lengths.tsv", sep=""), sep = "\t") + +names = list("Swietl", "nicolas", "nicolas-zero", "BASELINE") +titles = list("Ĺwietlicka", "Nicolas", "Nicolas-zero", "Baseline") +plots = list() +hists = list() +i = 1 +for (n in names) { + print(n) + title = titles[[i]] + i = i + 1 + + d = data[data$SumAuthor==n,] + print(mean(d$SumRealRatio)) + + p = ploto(d) + p = p + ggtitle(title) + plots = c(plots, list(p)) + + hi = histo(d) + hi = hi + ggtitle(title) + hists = c(hists, list(hi)) +} + +pdf(file=paste(DATA_DIR, "summary-length-plots.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) +grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol=2, nrow=2) +dev.off() + +pdf(file=paste(DATA_DIR, "summary-length-hists.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) +grid.arrange(hists[[1]], hists[[2]], hists[[3]], hists[[4]], ncol=2, nrow=2) +dev.off() diff --git 
a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java index d71b4fa..e938576 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java @@ -18,7 +18,7 @@ public class CorpusHelper { private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; private static final String EXTRACT_SUMMARY_TYPE = "extract"; - private static final int SUMMARY_RATIO = 20; + public static final int SUMMARY_RATIO = 20; private CorpusHelper() { } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java index c09c928..2cbad62 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java @@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.summ.nicolas.Constants; +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; import pl.waw.ipipan.zil.summ.pscapi.xml.Text; @@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths { private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", - "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); + "TextWC", "SumType", "SumAuthor", "SumRatio", "SumWC", "SumRealRatio").withDelimiter('\t'); private CalculateSystemSummaryLengths() { } @@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths { 
record.add(textWC); record.add("automatic"); record.add(systemName); + record.add(CorpusHelper.SUMMARY_RATIO); int sumWC = TextUtils.tokenize(body).size(); record.add(sumWC); - record.add(sumWC * 1.0 / textWC); + record.add(sumWC * 100.0 / textWC); printer.printRecord(record); } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java index 852ce47..557ab02 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java @@ -29,6 +29,8 @@ public class SummarizeTestCorpus { private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; + private static final String SUMMARY_WITH_ZERO_FILE_SUFFIX = "_nicolas-zero.txt"; + private static final double SUMMARY_RATIO = 0.2; private SummarizeTestCorpus() { @@ -41,16 +43,20 @@ public class SummarizeTestCorpus { Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); - Map<String, String> id2summary = summarizeTexts(id2preprocessedText); + summarize(new Nicolas(false), id2preprocessedText, SUMMARY_FILE_SUFFIX); + summarize(new Nicolas(), id2preprocessedText, SUMMARY_WITH_ZERO_FILE_SUFFIX); + } + + private static void summarize(Nicolas nicolas, Map<String, TText> id2preprocessedText, String fileSuffix) throws NicolasException, IOException { + Map<String, String> id2summary = summarizeTexts(id2preprocessedText, nicolas); LOG.info("Texts summarized."); - saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR, fileSuffix); 
LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); } - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText, Nicolas nicolas) throws NicolasException { Map<String, String> id2summary = Maps.newHashMap(); - Nicolas nicolas = new Nicolas(); for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { TText text = entry.getValue(); int targetSize = calculateTargetSize(text); @@ -70,11 +76,11 @@ public class SummarizeTestCorpus { return (int) (SUMMARY_RATIO * tokenCount); } - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir, String fileSuffix) throws IOException { for (Map.Entry<String, String> entry : id2summary.entrySet()) { String textId = entry.getKey(); String summary = entry.getValue(); - String targetFileName = textId + SUMMARY_FILE_SUFFIX; + String targetFileName = textId + fileSuffix; try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { writer.write(summary); } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java index 07e556a..f418514 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java @@ -1,6 +1,8 @@ package pl.waw.ipipan.zil.summ.nicolas.train; import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*; +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractMostFrequentMentions; +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractStopwords; public class Main { @@ -12,6 +14,8 @@ public class Main { 
DownloadTrainingResources.main(args); ExtractGoldSummaries.main(args); CreateOptimalSummaries.main(args); + ExtractStopwords.main(args); + ExtractMostFrequentMentions.main(args); PrepareTrainingData.main(args); TrainAllModels.main(args); } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java index 98d0d67..f412ba5 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java @@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord; import org.apache.commons.csv.QuoteMode; import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import java.io.*; import java.util.List; diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java index 808e45d..9c4f012 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java @@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; -import 
pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; import weka.core.Instance; import weka.core.Instances; import weka.core.converters.ArffSaver; @@ -152,7 +151,7 @@ public class PrepareTrainingData { FeatureHelper featureHelper = new FeatureHelper(text); List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java new file mode 100644 index 0000000..349888c --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java @@ -0,0 +1,78 @@ +package pl.waw.ipipan.zil.summ.nicolas.train.resources; + +import com.google.common.collect.*; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; + +import javax.xml.bind.JAXBException; +import java.io.File; +import java.io.IOException; +import java.util.Comparator; +import java.util.List; +import java.util.Map; +import java.util.Set; +import 
java.util.stream.Collectors;
+
+/**
+ * Command-line tool that prints the mentions occurring in the largest number of texts.
+ *
+ * <p>It selects the gold texts that have no summary of type {@code "abstract"}, loads the
+ * thrifted version of each selected text, builds every mention's text from the base forms of
+ * its tokens and counts in how many texts each distinct (lower-cased) mention text occurs.
+ * It prints, in this order: the number of selected texts, the number of distinct mention
+ * texts, and every mention text found in at least {@link #MIN_TEXT_COUNT} texts, most
+ * frequent first.
+ */
+public class ExtractMostFrequentMentions {
+
+    public static final String GOLD_DATA_PATH = "/home/me2/Dropbox/3_nauka/3_doktorat/3_korpus_streszczen/dist/src/data/";
+
+    public static final String THRIFTED_PREFIX = "/home/me2/Desktop/thrifted_texts/thrifted_all/";
+    public static final String THRIFTED_SUFFIX = "/original";
+
+    /** Minimal number of texts a mention text has to occur in to be printed. */
+    private static final int MIN_TEXT_COUNT = 50;
+
+    /**
+     * Runs the extraction and prints the results to standard output.
+     *
+     * @param args optional overrides: {@code args[0]} replaces {@link #GOLD_DATA_PATH} and
+     *             {@code args[1]} replaces {@link #THRIFTED_PREFIX}; when absent, the constants are used
+     * @throws IOException   if the gold data directory cannot be listed or a file cannot be read
+     * @throws JAXBException if a gold text cannot be parsed
+     */
+    public static void main(String[] args) throws IOException, JAXBException {
+        String goldDataPath = args.length > 0 ? args[0] : GOLD_DATA_PATH;
+        String thriftedPrefix = args.length > 1 ? args[1] : THRIFTED_PREFIX;
+
+        Set<String> devIds = findTextIdsWithoutAbstract(new File(goldDataPath));
+        System.out.println(devIds.size());
+
+        // Each text contributes a given mention text at most once, so a count is a number of texts.
+        Multiset<String> mentionCounts = HashMultiset.create();
+        for (String id : devIds) {
+            File input = new File(thriftedPrefix + id + THRIFTED_SUFFIX);
+            TText thrifted = ThriftUtils.loadThriftTextFromFile(input);
+            mentionCounts.addAll(extractDistinctMentionTexts(thrifted));
+        }
+        System.out.println(mentionCounts.elementSet().size());
+
+        List<String> sorted = Lists.newArrayList(mentionCounts.elementSet());
+        sorted.sort(Comparator.comparing(mentionCounts::count).reversed());
+        for (String mention : sorted) {
+            // The list is sorted by descending count, so nothing after this point can qualify.
+            if (mentionCounts.count(mention) < MIN_TEXT_COUNT) {
+                break;
+            }
+            System.out.println(mention);
+        }
+    }
+
+    /** Returns the ids (file names without ".xml") of gold texts that have no summary of type "abstract". */
+    private static Set<String> findTextIdsWithoutAbstract(File goldDir) throws IOException, JAXBException {
+        // listFiles() returns null (instead of throwing) when the path is not a readable directory.
+        File[] goldFiles = goldDir.listFiles();
+        if (goldFiles == null) {
+            throw new IOException("Cannot list gold data directory: " + goldDir);
+        }
+
+        Set<String> ids = Sets.newHashSet();
+        for (File file : goldFiles) {
+            Text goldText = PSC_IO.readText(file);
+            boolean hasAbstract = goldText.getSummaries().getSummary().stream()
+                    .anyMatch(s -> "abstract".equals(s.getType()));
+            if (!hasAbstract) {
+                ids.add(file.getName().replace(".xml", ""));
+            }
+        }
+        return ids;
+    }
+
+    /** Returns the distinct lower-cased mention texts of one text, built from the base forms of the mention tokens. */
+    private static Set<String> extractDistinctMentionTexts(TText thrifted) {
+        List<TSentence> sents = thrifted.getParagraphs().stream()
+                .flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
+
+        Map<String, String> tokenId2base = Maps.newHashMap();
+        sents.stream()
+                .flatMap(s -> s.getTokens().stream())
+                .forEach(token -> tokenId2base.put(token.getId(), token.getChosenInterpretation().getBase()));
+
+        Set<String> distinctTextMentions = Sets.newHashSet();
+        sents.stream().flatMap(s -> s.getMentions().stream()).forEach(m -> {
+            StringBuilder sb = new StringBuilder();
+            for (String tokId : m.getChildIds()) {
+                sb.append(tokenId2base.get(tokId)).append(' ');
+            }
+            // Locale.ROOT keeps the lower-casing independent of the machine's default locale.
+            distinctTextMentions.add(sb.toString().trim().toLowerCase(java.util.Locale.ROOT));
+        });
+        return distinctTextMentions;
+    }
+}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java
new file mode 100644
index 0000000..31db95c
--- /dev/null
+++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java
@@ -0,0 +1,14 @@
+package pl.waw.ipipan.zil.summ.nicolas.train.resources;
+
+/**
+ * Placeholder command-line tool; {@link #main(String[])} currently has an empty body.
+ */
+public class ExtractStopwords {
+
+    private ExtractStopwords() {
+    }
+
+    public static void main(String[] args) {
+
+    }
+}