Commit 156b37078e717ab9f3849c9d6a1ef1d2ddbc2a10
1 parent
89870bd0
add zero subject removal
Showing
20 changed files
with
324 additions
and
117 deletions
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; |
13 | 13 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
14 | 14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
15 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroModel; | |
15 | 16 | import weka.classifiers.Classifier; |
16 | 17 | |
17 | 18 | import java.io.IOException; |
... | ... | @@ -29,35 +30,43 @@ public class Nicolas { |
29 | 30 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | 31 | private final ZeroFeatureExtractor zeroFeatureExtractor; |
31 | 32 | |
32 | - public Nicolas() throws NicolasException { | |
33 | + public Nicolas(boolean useZeroModel) throws NicolasException { | |
33 | 34 | try { |
35 | + mentionFeatureExtractor = new MentionFeatureExtractor(); | |
34 | 36 | mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
35 | - sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
36 | - zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
37 | 37 | |
38 | - mentionFeatureExtractor = new MentionFeatureExtractor(); | |
39 | 38 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
40 | - zeroFeatureExtractor = new ZeroFeatureExtractor(); | |
39 | + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
40 | + | |
41 | + zeroFeatureExtractor = useZeroModel ? new ZeroFeatureExtractor() : null; | |
42 | + zeroModel = useZeroModel ? ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH) : null; | |
43 | + | |
41 | 44 | } catch (IOException e) { |
42 | 45 | throw new NicolasException(e); |
43 | 46 | } |
44 | 47 | } |
45 | 48 | |
49 | + public Nicolas() throws NicolasException { | |
50 | + this(true); | |
51 | + } | |
52 | + | |
46 | 53 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { |
47 | 54 | try { |
48 | 55 | Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); |
49 | - return calculateSummary(text, goodMentions, targetTokenCount); | |
56 | + List<TSentence> selectedSentences = selectSummarySentences(text, goodMentions, targetTokenCount); | |
57 | + Set<String> zeroSubjectTokenIds = zeroModel == null ? Collections.emptySet() : ZeroModel.findZeroSubjectTokenIds(zeroModel, zeroFeatureExtractor, text, selectedSentences); | |
58 | + | |
59 | + return createSummaryFromSentences(selectedSentences, zeroSubjectTokenIds); | |
60 | + | |
50 | 61 | } catch (Exception e) { |
51 | 62 | throw new NicolasException(e); |
52 | 63 | } |
53 | 64 | } |
54 | 65 | |
55 | - private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { | |
56 | - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize); | |
57 | - | |
66 | + private String createSummaryFromSentences(List<TSentence> selectedSentences, Set<String> zeroSubjectTokenIds) { | |
58 | 67 | StringBuilder sb = new StringBuilder(); |
59 | 68 | for (TSentence sent : selectedSentences) { |
60 | - sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); | |
69 | + sb.append(" ").append(TextUtils.loadSentence2Orth(sent, zeroSubjectTokenIds)); | |
61 | 70 | } |
62 | 71 | return sb.toString().trim(); |
63 | 72 | } |
... | ... | @@ -70,16 +79,16 @@ public class Nicolas { |
70 | 79 | List<TSentence> sortedSentences = Lists.newArrayList(sentences); |
71 | 80 | sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); |
72 | 81 | |
73 | - int size = 0; | |
74 | - Random r = new Random(1); | |
82 | + int currentSize = 0; | |
75 | 83 | Set<TSentence> summary = Sets.newHashSet(); |
76 | 84 | for (TSentence sent : sortedSentences) { |
77 | - size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); | |
78 | - if (r.nextDouble() > 0.4 && size > targetSize) | |
79 | - break; | |
80 | - summary.add(sent); | |
81 | - if (size > targetSize) | |
82 | - break; | |
85 | + int sentenceSize = TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); | |
86 | + int newSize = currentSize + sentenceSize; | |
87 | + | |
88 | + if (Math.abs(newSize - targetSize) < Math.abs(currentSize - targetSize)) { | |
89 | + currentSize = newSize; | |
90 | + summary.add(sent); | |
91 | + } | |
83 | 92 | } |
84 | 93 | List<TSentence> selectedSentences = Lists.newArrayList(); |
85 | 94 | for (TSentence sent : sentences) { |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... | ... | @@ -33,7 +33,7 @@ public class MentionModel { |
33 | 33 | if (good) |
34 | 34 | goodMentions.add(entry.getKey()); |
35 | 35 | } |
36 | - LOG.info("Classified {} mentions as good.", goodMentions.size()); | |
36 | + LOG.debug("Classified {} mentions as good.", goodMentions.size()); | |
37 | 37 | return goodMentions; |
38 | 38 | } |
39 | 39 | |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... | ... | @@ -32,7 +32,7 @@ public class SentenceModel { |
32 | 32 | double score = sentenceClassifier.classifyInstance(instance); |
33 | 33 | sentence2score.put(entry.getKey(), score); |
34 | 34 | } |
35 | - LOG.info("Scored " + sentence2score.size() + " sentences."); | |
35 | + LOG.debug("Scored {} sentences.", sentence2score.size()); | |
36 | 36 | |
37 | 37 | return sentence2score; |
38 | 38 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
... | ... | @@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
11 | 13 | import weka.core.Attribute; |
12 | 14 | import weka.core.DenseInstance; |
13 | 15 | import weka.core.Instance; |
... | ... | @@ -65,6 +67,20 @@ public class InstanceUtils { |
65 | 67 | return sentence2instance; |
66 | 68 | } |
67 | 69 | |
70 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | |
71 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | |
72 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | |
73 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | |
74 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
75 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | |
76 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
77 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
78 | + } | |
79 | + candidate2instance.put(entry.getKey(), instance); | |
80 | + } | |
81 | + return candidate2instance; | |
82 | + } | |
83 | + | |
68 | 84 | @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList |
69 | 85 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
70 | 86 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | - | |
3 | -import com.google.common.collect.Maps; | |
4 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
5 | -import weka.core.Attribute; | |
6 | -import weka.core.DenseInstance; | |
7 | -import weka.core.Instance; | |
8 | - | |
9 | -import java.util.List; | |
10 | -import java.util.Map; | |
11 | - | |
12 | -public class InstanceCreator { | |
13 | - | |
14 | - private InstanceCreator() { | |
15 | - } | |
16 | - | |
17 | - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | |
18 | - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | |
19 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | |
20 | - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | |
21 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
22 | - Map<Attribute, Double> sentenceFeatures = entry.getValue(); | |
23 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | |
24 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
25 | - } | |
26 | - candidate2instance.put(entry.getKey(), instance); | |
27 | - } | |
28 | - return candidate2instance; | |
29 | - } | |
30 | - | |
31 | -} |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... | ... | @@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
13 | 14 | import weka.core.Attribute; |
14 | 15 | |
15 | 16 | import java.util.List; |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java
... | ... | @@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; |
3 | 3 | import com.google.common.collect.Sets; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | 6 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
8 | 9 | import weka.classifiers.Classifier; |
9 | 10 | import weka.core.Instance; |
10 | 11 | import weka.core.Instances; |
11 | -import weka.core.SerializationHelper; | |
12 | 12 | |
13 | 13 | import java.util.List; |
14 | 14 | import java.util.Map; |
15 | 15 | import java.util.Set; |
16 | 16 | import java.util.stream.Collectors; |
17 | 17 | |
18 | -public class ZeroSubjectInjector { | |
18 | +public class ZeroModel { | |
19 | 19 | |
20 | - private final ZeroFeatureExtractor featureExtractor; | |
21 | - private final Classifier classifier; | |
22 | - private final Instances instances; | |
23 | - | |
24 | - public ZeroSubjectInjector() throws Exception { | |
25 | - classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); | |
26 | - featureExtractor = new ZeroFeatureExtractor(); | |
27 | - instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
20 | + private ZeroModel() { | |
28 | 21 | } |
29 | 22 | |
30 | - public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { | |
23 | + public static Set<String> findZeroSubjectTokenIds(Classifier classifier, ZeroFeatureExtractor featureExtractor, TText text, List<TSentence> selectedSentences) throws Exception { | |
24 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
25 | + | |
31 | 26 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); |
32 | 27 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); |
33 | 28 | Map<ZeroSubjectCandidate, Instance> candidate2instance = |
34 | - InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
29 | + InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
35 | 30 | |
36 | 31 | Set<String> result = Sets.newHashSet(); |
37 | 32 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.Lists; |
4 | 4 | import com.google.common.collect.Maps; |
... | ... | @@ -12,57 +12,110 @@ import java.util.Set; |
12 | 12 | |
13 | 13 | public class CandidateFinder { |
14 | 14 | |
15 | + private static final String SUBST = "subst"; | |
16 | + private static final String NOM = "nom"; | |
17 | + private static final String MSD_SPLITTER = ":"; | |
18 | + | |
15 | 19 | private CandidateFinder() { |
16 | 20 | } |
17 | 21 | |
18 | 22 | public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { |
23 | + Map<String, Set<String>> mentionId2Cluster = getMentionId2Cluster(text); | |
24 | + return getZeroSubjectCandidates(text, summarySentenceIds, mentionId2Cluster); | |
25 | + } | |
26 | + | |
27 | + private static List<ZeroSubjectCandidate> getZeroSubjectCandidates(TText text, Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster) { | |
19 | 28 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); |
20 | 29 | |
21 | - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
22 | - for (TCoreference coreference : text.getCoreferences()) { | |
23 | - for (String mentionId : coreference.getMentionIds()) { | |
24 | - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
30 | + PrevSentenceState prevSentenceState = new PrevSentenceState(); | |
31 | + for (TParagraph p : text.getParagraphs()) { | |
32 | + for (TSentence sentence : p.getSentences()) { | |
33 | + processSentence(summarySentenceIds, mentionId2Cluster, candidates, prevSentenceState, sentence); | |
25 | 34 | } |
26 | 35 | } |
36 | + return candidates; | |
37 | + } | |
38 | + | |
39 | + private static void processSentence(Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence) { | |
40 | + if (!summarySentenceIds.contains(sentence.getId())) | |
41 | + return; | |
42 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
43 | + | |
44 | + Map<String, TToken> tokenId2Token = getTokenId2Token(sentence); | |
45 | + | |
46 | + for (TMention mention : sentence.getMentions()) { | |
47 | + processMention(mentionId2Cluster, candidates, prevSentenceState, sentence, currentSentenceNominativeMentionIds, tokenId2Token, mention); | |
48 | + } | |
27 | 49 | |
28 | - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | |
29 | - TSentence prevSentence = null; | |
30 | - for (TParagraph p : text.getParagraphs()) { | |
31 | - for (TSentence sentence : p.getSentences()) { | |
32 | - if (!summarySentenceIds.contains(sentence.getId())) | |
33 | - continue; | |
34 | - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
35 | - | |
36 | - Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
37 | - for (TToken t : sentence.getTokens()) | |
38 | - tokenId2Token.put(t.getId(), t); | |
39 | - | |
40 | - for (TMention mention : sentence.getMentions()) { | |
41 | - | |
42 | - for (String tokenId : mention.getHeadIds()) { | |
43 | - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
44 | - if (isInNominative(interp)) { | |
45 | - | |
46 | - currentSentenceNominativeMentionIds.add(mention.getId()); | |
47 | - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | |
48 | - ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention); | |
49 | - candidates.add(candidate); | |
50 | - } | |
51 | - break; | |
52 | - } | |
53 | - } | |
50 | + prevSentenceState.setPrevSentence(sentence); | |
51 | + prevSentenceState.setNominativeMentionIds(currentSentenceNominativeMentionIds); | |
52 | + } | |
53 | + | |
54 | + private static void processMention(Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence, Set<String> currentSentenceNominativeMentionIds, Map<String, TToken> tokenId2Token, TMention mention) { | |
55 | + for (String tokenId : mention.getHeadIds()) { | |
56 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
57 | + if (isInNominative(interp)) { | |
58 | + currentSentenceNominativeMentionIds.add(mention.getId()); | |
59 | + if (isCoreferentWithPreviousSentence(mentionId2Cluster, prevSentenceState, mention)) { | |
60 | + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentenceState.getPrevSentence(), sentence, mention); | |
61 | + candidates.add(candidate); | |
54 | 62 | } |
63 | + break; | |
64 | + } | |
65 | + } | |
66 | + } | |
55 | 67 | |
56 | - prevSentence = sentence; | |
57 | - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | |
68 | + private static boolean isCoreferentWithPreviousSentence(Map<String, Set<String>> mentionId2Cluster, PrevSentenceState prevSentenceState, TMention mention) { | |
69 | + return mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceState.getNominativeMentionIds()::contains); | |
70 | + } | |
71 | + | |
72 | + private static Map<String, TToken> getTokenId2Token(TSentence sentence) { | |
73 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
74 | + for (TToken t : sentence.getTokens()) | |
75 | + tokenId2Token.put(t.getId(), t); | |
76 | + return tokenId2Token; | |
77 | + } | |
78 | + | |
79 | + private static Map<String, Set<String>> getMentionId2Cluster(TText text) { | |
80 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
81 | + for (TCoreference coreference : text.getCoreferences()) { | |
82 | + for (String mentionId : coreference.getMentionIds()) { | |
83 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
58 | 84 | } |
59 | 85 | } |
60 | - return candidates; | |
86 | + return mentionId2Cluster; | |
61 | 87 | } |
62 | 88 | |
63 | 89 | private static boolean isInNominative(TInterpretation interp) { |
64 | - boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | |
65 | - boolean isSubst = interp.getCtag().equals("subst"); | |
90 | + boolean isNominative = Arrays.stream(interp.getMsd().split(MSD_SPLITTER)).anyMatch(t -> t.equals(NOM)); | |
91 | + boolean isSubst = interp.getCtag().equals(SUBST); | |
66 | 92 | return isSubst && isNominative; |
67 | 93 | } |
94 | + | |
95 | + private static class PrevSentenceState { | |
96 | + | |
97 | + private Set<String> nominativeMentionIds; | |
98 | + private TSentence prevSentence; | |
99 | + | |
100 | + PrevSentenceState() { | |
101 | + nominativeMentionIds = Sets.newHashSet(); | |
102 | + prevSentence = null; | |
103 | + } | |
104 | + | |
105 | + Set<String> getNominativeMentionIds() { | |
106 | + return nominativeMentionIds; | |
107 | + } | |
108 | + | |
109 | + TSentence getPrevSentence() { | |
110 | + return prevSentence; | |
111 | + } | |
112 | + | |
113 | + void setNominativeMentionIds(Set<String> nominativeMentionIds) { | |
114 | + this.nominativeMentionIds = nominativeMentionIds; | |
115 | + } | |
116 | + | |
117 | + void setPrevSentence(TSentence prevSentence) { | |
118 | + this.prevSentence = prevSentence; | |
119 | + } | |
120 | + } | |
68 | 121 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; | |
2 | 2 | |
3 | 3 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
... | ... | @@ -9,7 +9,7 @@ public class ZeroSubjectCandidate { |
9 | 9 | private final TSentence sentence; |
10 | 10 | private final TMention zeroCandidateMention; |
11 | 11 | |
12 | - public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { | |
12 | + ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { | |
13 | 13 | this.previousSentence = previousSentence; |
14 | 14 | this.sentence = sentence; |
15 | 15 | this.zeroCandidateMention = zeroCandidateMention; |
... | ... |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... | ... | @@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
10 | 12 | |
11 | 13 | import java.io.IOException; |
12 | 14 | import java.io.InputStream; |
... | ... |
nicolas-train/src/main/R/plot_summary_lenghts.R
0 → 100644
1 | +require(ggplot2) | |
2 | +require(grid) | |
3 | +require(gridExtra) | |
4 | +require(lattice) | |
5 | + | |
6 | +DATA_DIR="../../../../data/" | |
7 | + | |
8 | +########################## functions | |
9 | +gpl = function(d) { | |
10 | + ggplot(d, aes(x=as.factor(d$SumRatio), y=SumRealRatio)) + | |
11 | + geom_boxplot(outlier.shape=4, outlier.colour = "blue") + | |
12 | + ylim(0, 40) + | |
13 | + ylab("Obtained summary ratio (word count)") + | |
14 | + xlab("Requested summary ratio (word count)") + | |
15 | + theme(text = element_text(size=15)) | |
16 | +} | |
17 | + | |
18 | +ploto = function(d) { | |
19 | + p = gpl(d) | |
20 | +} | |
21 | + | |
22 | +histo = function(d) { | |
23 | + p = ggplot(d, aes(abs(d$SumRealRatio*100/d$SumRatio))) + | |
24 | + geom_histogram(binwidth = 1) + | |
25 | + xlim(80, 120) + | |
26 | + ylab("Number of summaries") + | |
27 | + xlab("Obtained summary ratio as percent of requested ratio (20%)") + | |
28 | + theme(text = element_text(size=15)) | |
29 | +} | |
30 | + | |
31 | +######################### automatic summaries | |
32 | +data = read.csv(paste(DATA_DIR, "summary-lengths.tsv", sep=""), sep = "\t") | |
33 | + | |
34 | +names = list("Swietl", "nicolas", "nicolas-zero", "BASELINE") | |
35 | +titles = list("Świetlicka", "Nicolas", "Nicolas-zero", "Baseline") | |
36 | +plots = list() | |
37 | +hists = list() | |
38 | +i = 1 | |
39 | +for (n in names) { | |
40 | + print(n) | |
41 | + title = titles[[i]] | |
42 | + i = i + 1 | |
43 | + | |
44 | + d = data[data$SumAuthor==n,] | |
45 | + print(mean(d$SumRealRatio)) | |
46 | + | |
47 | + p = ploto(d) | |
48 | + p = p + ggtitle(title) | |
49 | + plots = c(plots, list(p)) | |
50 | + | |
51 | + hi = histo(d) | |
52 | + hi = hi + ggtitle(title) | |
53 | + hists = c(hists, list(hi)) | |
54 | +} | |
55 | + | |
56 | +pdf(file=paste(DATA_DIR, "summary-length-plots.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) | |
57 | +grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol=2, nrow=2) | |
58 | +dev.off() | |
59 | + | |
60 | +pdf(file=paste(DATA_DIR, "summary-length-hists.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) | |
61 | +grid.arrange(hists[[1]], hists[[2]], hists[[3]], hists[[4]], ncol=2, nrow=2) | |
62 | +dev.off() | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
... | ... | @@ -18,7 +18,7 @@ public class CorpusHelper { |
18 | 18 | private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; |
19 | 19 | private static final String EXTRACT_SUMMARY_TYPE = "extract"; |
20 | 20 | |
21 | - private static final int SUMMARY_RATIO = 20; | |
21 | + public static final int SUMMARY_RATIO = 20; | |
22 | 22 | |
23 | 23 | private CorpusHelper() { |
24 | 24 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
... | ... | @@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter; |
6 | 6 | import org.slf4j.Logger; |
7 | 7 | import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
10 | 11 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
11 | 12 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
... | ... | @@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths { |
26 | 27 | private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); |
27 | 28 | |
28 | 29 | private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", |
29 | - "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); | |
30 | + "TextWC", "SumType", "SumAuthor", "SumRatio", "SumWC", "SumRealRatio").withDelimiter('\t'); | |
30 | 31 | |
31 | 32 | private CalculateSystemSummaryLengths() { |
32 | 33 | } |
... | ... | @@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths { |
61 | 62 | record.add(textWC); |
62 | 63 | record.add("automatic"); |
63 | 64 | record.add(systemName); |
65 | + record.add(CorpusHelper.SUMMARY_RATIO); | |
64 | 66 | int sumWC = TextUtils.tokenize(body).size(); |
65 | 67 | record.add(sumWC); |
66 | - record.add(sumWC * 1.0 / textWC); | |
68 | + record.add(sumWC * 100.0 / textWC); | |
67 | 69 | printer.printRecord(record); |
68 | 70 | } |
69 | 71 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... | ... | @@ -29,6 +29,8 @@ public class SummarizeTestCorpus { |
29 | 29 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); |
30 | 30 | |
31 | 31 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; |
32 | + private static final String SUMMARY_WITH_ZERO_FILE_SUFFIX = "_nicolas-zero.txt"; | |
33 | + | |
32 | 34 | private static final double SUMMARY_RATIO = 0.2; |
33 | 35 | |
34 | 36 | private SummarizeTestCorpus() { |
... | ... | @@ -41,16 +43,20 @@ public class SummarizeTestCorpus { |
41 | 43 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); |
42 | 44 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); |
43 | 45 | |
44 | - Map<String, String> id2summary = summarizeTexts(id2preprocessedText); | |
46 | + summarize(new Nicolas(false), id2preprocessedText, SUMMARY_FILE_SUFFIX); | |
47 | + summarize(new Nicolas(), id2preprocessedText, SUMMARY_WITH_ZERO_FILE_SUFFIX); | |
48 | + } | |
49 | + | |
50 | + private static void summarize(Nicolas nicolas, Map<String, TText> id2preprocessedText, String fileSuffix) throws NicolasException, IOException { | |
51 | + Map<String, String> id2summary = summarizeTexts(id2preprocessedText, nicolas); | |
45 | 52 | LOG.info("Texts summarized."); |
46 | 53 | |
47 | - saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); | |
54 | + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR, fileSuffix); | |
48 | 55 | LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); |
49 | 56 | } |
50 | 57 | |
51 | - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { | |
58 | + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText, Nicolas nicolas) throws NicolasException { | |
52 | 59 | Map<String, String> id2summary = Maps.newHashMap(); |
53 | - Nicolas nicolas = new Nicolas(); | |
54 | 60 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
55 | 61 | TText text = entry.getValue(); |
56 | 62 | int targetSize = calculateTargetSize(text); |
... | ... | @@ -70,11 +76,11 @@ public class SummarizeTestCorpus { |
70 | 76 | return (int) (SUMMARY_RATIO * tokenCount); |
71 | 77 | } |
72 | 78 | |
73 | - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { | |
79 | + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir, String fileSuffix) throws IOException { | |
74 | 80 | for (Map.Entry<String, String> entry : id2summary.entrySet()) { |
75 | 81 | String textId = entry.getKey(); |
76 | 82 | String summary = entry.getValue(); |
77 | - String targetFileName = textId + SUMMARY_FILE_SUFFIX; | |
83 | + String targetFileName = textId + fileSuffix; | |
78 | 84 | try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { |
79 | 85 | writer.write(summary); |
80 | 86 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.train; |
2 | 2 | |
3 | 3 | import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*; |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractMostFrequentMentions; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractStopwords; | |
4 | 6 | |
5 | 7 | public class Main { |
6 | 8 | |
... | ... | @@ -12,6 +14,8 @@ public class Main { |
12 | 14 | DownloadTrainingResources.main(args); |
13 | 15 | ExtractGoldSummaries.main(args); |
14 | 16 | CreateOptimalSummaries.main(args); |
17 | + ExtractStopwords.main(args); | |
18 | + ExtractMostFrequentMentions.main(args); | |
15 | 19 | PrepareTrainingData.main(args); |
16 | 20 | TrainAllModels.main(args); |
17 | 21 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
... | ... | @@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord; |
7 | 7 | import org.apache.commons.csv.QuoteMode; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
11 | 11 | |
12 | 12 | import java.io.*; |
13 | 13 | import java.util.List; |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... | ... | @@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; |
19 | 19 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; |
20 | 20 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
21 | 21 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
22 | -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | |
23 | -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | |
24 | 22 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
25 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
23 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; | |
24 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | |
26 | 25 | import weka.core.Instance; |
27 | 26 | import weka.core.Instances; |
28 | 27 | import weka.core.converters.ArffSaver; |
... | ... | @@ -152,7 +151,7 @@ public class PrepareTrainingData { |
152 | 151 | FeatureHelper featureHelper = new FeatureHelper(text); |
153 | 152 | |
154 | 153 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); |
155 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
154 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
156 | 155 | |
157 | 156 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { |
158 | 157 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.resources; | |
2 | + | |
3 | +import com.google.common.collect.*; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
7 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | |
8 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | |
9 | + | |
10 | +import javax.xml.bind.JAXBException; | |
11 | +import java.io.File; | |
12 | +import java.io.IOException; | |
13 | +import java.util.Comparator; | |
14 | +import java.util.List; | |
15 | +import java.util.Map; | |
16 | +import java.util.Set; | |
17 | +import java.util.stream.Collectors; | |
18 | + | |
19 | +public class ExtractMostFrequentMentions { | |
20 | + | |
21 | + public static final String GOLD_DATA_PATH = "/home/me2/Dropbox/3_nauka/3_doktorat/3_korpus_streszczen/dist/src/data/"; | |
22 | + | |
23 | + public static final String THRIFTED_PREFIX = "/home/me2/Desktop/thrifted_texts/thrifted_all/"; | |
24 | + public static final String THRIFTED_SUFFIX = "/original"; | |
25 | + | |
26 | + public static void main(String[] args) throws IOException, JAXBException { | |
27 | + | |
28 | + Set<String> devIds = Sets.newHashSet(); | |
29 | + | |
30 | + File goldDir = new File(GOLD_DATA_PATH); | |
31 | + for (File file : goldDir.listFiles()) { | |
32 | + Text goldText = PSC_IO.readText(file); | |
33 | + if (goldText.getSummaries().getSummary().stream().anyMatch(s -> s.getType().equals("abstract"))) | |
34 | + continue; | |
35 | + | |
36 | + devIds.add(file.getName().replace(".xml", "")); | |
37 | + } | |
38 | + | |
39 | + | |
40 | + System.out.println(devIds.size()); | |
41 | + | |
42 | + Multiset<String> mentionCounts = HashMultiset.create(); | |
43 | + for (String id : devIds) { | |
44 | + Set<String> distinctTextMentions = Sets.newHashSet(); | |
45 | + File input = new File(THRIFTED_PREFIX + id + THRIFTED_SUFFIX); | |
46 | + TText thrifted = ThriftUtils.loadThriftTextFromFile(input); | |
47 | + List<TSentence> sents = thrifted.getParagraphs().stream() | |
48 | + .flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | |
49 | + | |
50 | + Map<String, String> tokenId2base = Maps.newHashMap(); | |
51 | + sents.stream() | |
52 | + .flatMap(s -> s.getTokens().stream()) | |
53 | + .forEach(token -> tokenId2base.put(token.getId(), token.getChosenInterpretation().getBase())); | |
54 | + | |
55 | + sents.stream().flatMap(s -> s.getMentions().stream()).forEach(m -> { | |
56 | + StringBuffer sb = new StringBuffer(); | |
57 | + for (String tokId : m.getChildIds()) { | |
58 | + sb.append(tokenId2base.get(tokId) + " "); | |
59 | + } | |
60 | + distinctTextMentions.add(sb.toString().trim().toLowerCase()); | |
61 | + }); | |
62 | + | |
63 | + mentionCounts.addAll(distinctTextMentions); | |
64 | + } | |
65 | + | |
66 | + System.out.println(mentionCounts.elementSet().size()); | |
67 | + List<String> sorted = Lists.newArrayList(); | |
68 | + sorted.addAll(mentionCounts.elementSet()); | |
69 | + sorted.sort(Comparator.comparing(mentionCounts::count).reversed()); | |
70 | + int i = 0; | |
71 | + for (String mention : sorted) { | |
72 | + if (mentionCounts.count(mention) < 50) | |
73 | + break; | |
74 | + System.out.println(mention); | |
75 | + } | |
76 | + | |
77 | + } | |
78 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java
0 → 100644