Commit 156b37078e717ab9f3849c9d6a1ef1d2ddbc2a10
1 parent
89870bd0
add zero subject removal
Showing
20 changed files
with
324 additions
and
117 deletions
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; | @@ -12,6 +12,7 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; | ||
12 | import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | 12 | import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; |
13 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | 13 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
15 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroModel; | ||
15 | import weka.classifiers.Classifier; | 16 | import weka.classifiers.Classifier; |
16 | 17 | ||
17 | import java.io.IOException; | 18 | import java.io.IOException; |
@@ -29,35 +30,43 @@ public class Nicolas { | @@ -29,35 +30,43 @@ public class Nicolas { | ||
29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; | 30 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | private final ZeroFeatureExtractor zeroFeatureExtractor; | 31 | private final ZeroFeatureExtractor zeroFeatureExtractor; |
31 | 32 | ||
32 | - public Nicolas() throws NicolasException { | 33 | + public Nicolas(boolean useZeroModel) throws NicolasException { |
33 | try { | 34 | try { |
35 | + mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
34 | mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | 36 | mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
35 | - sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
36 | - zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
37 | 37 | ||
38 | - mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
39 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 38 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
40 | - zeroFeatureExtractor = new ZeroFeatureExtractor(); | 39 | + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); |
40 | + | ||
41 | + zeroFeatureExtractor = useZeroModel ? new ZeroFeatureExtractor() : null; | ||
42 | + zeroModel = useZeroModel ? ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH) : null; | ||
43 | + | ||
41 | } catch (IOException e) { | 44 | } catch (IOException e) { |
42 | throw new NicolasException(e); | 45 | throw new NicolasException(e); |
43 | } | 46 | } |
44 | } | 47 | } |
45 | 48 | ||
49 | + public Nicolas() throws NicolasException { | ||
50 | + this(true); | ||
51 | + } | ||
52 | + | ||
46 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { | 53 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { |
47 | try { | 54 | try { |
48 | Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); | 55 | Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); |
49 | - return calculateSummary(text, goodMentions, targetTokenCount); | 56 | + List<TSentence> selectedSentences = selectSummarySentences(text, goodMentions, targetTokenCount); |
57 | + Set<String> zeroSubjectTokenIds = zeroModel == null ? Collections.emptySet() : ZeroModel.findZeroSubjectTokenIds(zeroModel, zeroFeatureExtractor, text, selectedSentences); | ||
58 | + | ||
59 | + return createSummaryFromSentences(selectedSentences, zeroSubjectTokenIds); | ||
60 | + | ||
50 | } catch (Exception e) { | 61 | } catch (Exception e) { |
51 | throw new NicolasException(e); | 62 | throw new NicolasException(e); |
52 | } | 63 | } |
53 | } | 64 | } |
54 | 65 | ||
55 | - private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { | ||
56 | - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize); | ||
57 | - | 66 | + private String createSummaryFromSentences(List<TSentence> selectedSentences, Set<String> zeroSubjectTokenIds) { |
58 | StringBuilder sb = new StringBuilder(); | 67 | StringBuilder sb = new StringBuilder(); |
59 | for (TSentence sent : selectedSentences) { | 68 | for (TSentence sent : selectedSentences) { |
60 | - sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); | 69 | + sb.append(" ").append(TextUtils.loadSentence2Orth(sent, zeroSubjectTokenIds)); |
61 | } | 70 | } |
62 | return sb.toString().trim(); | 71 | return sb.toString().trim(); |
63 | } | 72 | } |
@@ -70,16 +79,16 @@ public class Nicolas { | @@ -70,16 +79,16 @@ public class Nicolas { | ||
70 | List<TSentence> sortedSentences = Lists.newArrayList(sentences); | 79 | List<TSentence> sortedSentences = Lists.newArrayList(sentences); |
71 | sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); | 80 | sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); |
72 | 81 | ||
73 | - int size = 0; | ||
74 | - Random r = new Random(1); | 82 | + int currentSize = 0; |
75 | Set<TSentence> summary = Sets.newHashSet(); | 83 | Set<TSentence> summary = Sets.newHashSet(); |
76 | for (TSentence sent : sortedSentences) { | 84 | for (TSentence sent : sortedSentences) { |
77 | - size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); | ||
78 | - if (r.nextDouble() > 0.4 && size > targetSize) | ||
79 | - break; | ||
80 | - summary.add(sent); | ||
81 | - if (size > targetSize) | ||
82 | - break; | 85 | + int sentenceSize = TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); |
86 | + int newSize = currentSize + sentenceSize; | ||
87 | + | ||
88 | + if (Math.abs(newSize - targetSize) < Math.abs(currentSize - targetSize)) { | ||
89 | + currentSize = newSize; | ||
90 | + summary.add(sent); | ||
91 | + } | ||
83 | } | 92 | } |
84 | List<TSentence> selectedSentences = Lists.newArrayList(); | 93 | List<TSentence> selectedSentences = Lists.newArrayList(); |
85 | for (TSentence sent : sentences) { | 94 | for (TSentence sent : sentences) { |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
1 | package pl.waw.ipipan.zil.summ.nicolas; | 1 | package pl.waw.ipipan.zil.summ.nicolas; |
2 | 2 | ||
3 | public class NicolasException extends Exception { | 3 | public class NicolasException extends Exception { |
4 | - public NicolasException(Exception e) { | 4 | + NicolasException(Exception e) { |
5 | super(e); | 5 | super(e); |
6 | } | 6 | } |
7 | } | 7 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -33,7 +33,7 @@ public class MentionModel { | @@ -33,7 +33,7 @@ public class MentionModel { | ||
33 | if (good) | 33 | if (good) |
34 | goodMentions.add(entry.getKey()); | 34 | goodMentions.add(entry.getKey()); |
35 | } | 35 | } |
36 | - LOG.info("Classified {} mentions as good.", goodMentions.size()); | 36 | + LOG.debug("Classified {} mentions as good.", goodMentions.size()); |
37 | return goodMentions; | 37 | return goodMentions; |
38 | } | 38 | } |
39 | 39 |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -32,7 +32,7 @@ public class SentenceModel { | @@ -32,7 +32,7 @@ public class SentenceModel { | ||
32 | double score = sentenceClassifier.classifyInstance(instance); | 32 | double score = sentenceClassifier.classifyInstance(instance); |
33 | sentence2score.put(entry.getKey(), score); | 33 | sentence2score.put(entry.getKey(), score); |
34 | } | 34 | } |
35 | - LOG.info("Scored " + sentence2score.size() + " sentences."); | 35 | + LOG.debug("Scored {} sentences.", sentence2score.size()); |
36 | 36 | ||
37 | return sentence2score; | 37 | return sentence2score; |
38 | } | 38 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
@@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | @@ -8,6 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | ||
11 | import weka.core.Attribute; | 13 | import weka.core.Attribute; |
12 | import weka.core.DenseInstance; | 14 | import weka.core.DenseInstance; |
13 | import weka.core.Instance; | 15 | import weka.core.Instance; |
@@ -65,6 +67,20 @@ public class InstanceUtils { | @@ -65,6 +67,20 @@ public class InstanceUtils { | ||
65 | return sentence2instance; | 67 | return sentence2instance; |
66 | } | 68 | } |
67 | 69 | ||
70 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | ||
71 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | ||
72 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | ||
73 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | ||
74 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
75 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | ||
76 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
77 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
78 | + } | ||
79 | + candidate2instance.put(entry.getKey(), instance); | ||
80 | + } | ||
81 | + return candidate2instance; | ||
82 | + } | ||
83 | + | ||
68 | @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | 84 | @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList |
69 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | 85 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
70 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); | 86 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | - | ||
3 | -import com.google.common.collect.Maps; | ||
4 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
5 | -import weka.core.Attribute; | ||
6 | -import weka.core.DenseInstance; | ||
7 | -import weka.core.Instance; | ||
8 | - | ||
9 | -import java.util.List; | ||
10 | -import java.util.Map; | ||
11 | - | ||
12 | -public class InstanceCreator { | ||
13 | - | ||
14 | - private InstanceCreator() { | ||
15 | - } | ||
16 | - | ||
17 | - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | ||
18 | - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | ||
19 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | ||
20 | - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | ||
21 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
22 | - Map<Attribute, Double> sentenceFeatures = entry.getValue(); | ||
23 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
24 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
25 | - } | ||
26 | - candidate2instance.put(entry.getKey(), instance); | ||
27 | - } | ||
28 | - return candidate2instance; | ||
29 | - } | ||
30 | - | ||
31 | -} |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | @@ -10,6 +10,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | ||
10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | 11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | ||
13 | import weka.core.Attribute; | 14 | import weka.core.Attribute; |
14 | 15 | ||
15 | import java.util.List; | 16 | import java.util.List; |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroModel.java
@@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | @@ -3,35 +3,30 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
3 | import com.google.common.collect.Sets; | 3 | import com.google.common.collect.Sets; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | 6 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | ||
8 | import weka.classifiers.Classifier; | 9 | import weka.classifiers.Classifier; |
9 | import weka.core.Instance; | 10 | import weka.core.Instance; |
10 | import weka.core.Instances; | 11 | import weka.core.Instances; |
11 | -import weka.core.SerializationHelper; | ||
12 | 12 | ||
13 | import java.util.List; | 13 | import java.util.List; |
14 | import java.util.Map; | 14 | import java.util.Map; |
15 | import java.util.Set; | 15 | import java.util.Set; |
16 | import java.util.stream.Collectors; | 16 | import java.util.stream.Collectors; |
17 | 17 | ||
18 | -public class ZeroSubjectInjector { | 18 | +public class ZeroModel { |
19 | 19 | ||
20 | - private final ZeroFeatureExtractor featureExtractor; | ||
21 | - private final Classifier classifier; | ||
22 | - private final Instances instances; | ||
23 | - | ||
24 | - public ZeroSubjectInjector() throws Exception { | ||
25 | - classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
26 | - featureExtractor = new ZeroFeatureExtractor(); | ||
27 | - instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | 20 | + private ZeroModel() { |
28 | } | 21 | } |
29 | 22 | ||
30 | - public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { | 23 | + public static Set<String> findZeroSubjectTokenIds(Classifier classifier, ZeroFeatureExtractor featureExtractor, TText text, List<TSentence> selectedSentences) throws Exception { |
24 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | ||
25 | + | ||
31 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); | 26 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); |
32 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); | 27 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); |
33 | Map<ZeroSubjectCandidate, Instance> candidate2instance = | 28 | Map<ZeroSubjectCandidate, Instance> candidate2instance = |
34 | - InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | 29 | + InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); |
35 | 30 | ||
36 | Set<String> result = Sets.newHashSet(); | 31 | Set<String> result = Sets.newHashSet(); |
37 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { | 32 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/CandidateFinder.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; |
2 | 2 | ||
3 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
@@ -12,57 +12,110 @@ import java.util.Set; | @@ -12,57 +12,110 @@ import java.util.Set; | ||
12 | 12 | ||
13 | public class CandidateFinder { | 13 | public class CandidateFinder { |
14 | 14 | ||
15 | + private static final String SUBST = "subst"; | ||
16 | + private static final String NOM = "nom"; | ||
17 | + private static final String MSD_SPLITTER = ":"; | ||
18 | + | ||
15 | private CandidateFinder() { | 19 | private CandidateFinder() { |
16 | } | 20 | } |
17 | 21 | ||
18 | public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | 22 | public static List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { |
23 | + Map<String, Set<String>> mentionId2Cluster = getMentionId2Cluster(text); | ||
24 | + return getZeroSubjectCandidates(text, summarySentenceIds, mentionId2Cluster); | ||
25 | + } | ||
26 | + | ||
27 | + private static List<ZeroSubjectCandidate> getZeroSubjectCandidates(TText text, Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster) { | ||
19 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); | 28 | List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); |
20 | 29 | ||
21 | - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | ||
22 | - for (TCoreference coreference : text.getCoreferences()) { | ||
23 | - for (String mentionId : coreference.getMentionIds()) { | ||
24 | - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | 30 | + PrevSentenceState prevSentenceState = new PrevSentenceState(); |
31 | + for (TParagraph p : text.getParagraphs()) { | ||
32 | + for (TSentence sentence : p.getSentences()) { | ||
33 | + processSentence(summarySentenceIds, mentionId2Cluster, candidates, prevSentenceState, sentence); | ||
25 | } | 34 | } |
26 | } | 35 | } |
36 | + return candidates; | ||
37 | + } | ||
38 | + | ||
39 | + private static void processSentence(Set<String> summarySentenceIds, Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence) { | ||
40 | + if (!summarySentenceIds.contains(sentence.getId())) | ||
41 | + return; | ||
42 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
43 | + | ||
44 | + Map<String, TToken> tokenId2Token = getTokenId2Token(sentence); | ||
45 | + | ||
46 | + for (TMention mention : sentence.getMentions()) { | ||
47 | + processMention(mentionId2Cluster, candidates, prevSentenceState, sentence, currentSentenceNominativeMentionIds, tokenId2Token, mention); | ||
48 | + } | ||
27 | 49 | ||
28 | - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | ||
29 | - TSentence prevSentence = null; | ||
30 | - for (TParagraph p : text.getParagraphs()) { | ||
31 | - for (TSentence sentence : p.getSentences()) { | ||
32 | - if (!summarySentenceIds.contains(sentence.getId())) | ||
33 | - continue; | ||
34 | - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
35 | - | ||
36 | - Map<String, TToken> tokenId2Token = Maps.newHashMap(); | ||
37 | - for (TToken t : sentence.getTokens()) | ||
38 | - tokenId2Token.put(t.getId(), t); | ||
39 | - | ||
40 | - for (TMention mention : sentence.getMentions()) { | ||
41 | - | ||
42 | - for (String tokenId : mention.getHeadIds()) { | ||
43 | - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
44 | - if (isInNominative(interp)) { | ||
45 | - | ||
46 | - currentSentenceNominativeMentionIds.add(mention.getId()); | ||
47 | - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | ||
48 | - ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention); | ||
49 | - candidates.add(candidate); | ||
50 | - } | ||
51 | - break; | ||
52 | - } | ||
53 | - } | 50 | + prevSentenceState.setPrevSentence(sentence); |
51 | + prevSentenceState.setNominativeMentionIds(currentSentenceNominativeMentionIds); | ||
52 | + } | ||
53 | + | ||
54 | + private static void processMention(Map<String, Set<String>> mentionId2Cluster, List<ZeroSubjectCandidate> candidates, PrevSentenceState prevSentenceState, TSentence sentence, Set<String> currentSentenceNominativeMentionIds, Map<String, TToken> tokenId2Token, TMention mention) { | ||
55 | + for (String tokenId : mention.getHeadIds()) { | ||
56 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
57 | + if (isInNominative(interp)) { | ||
58 | + currentSentenceNominativeMentionIds.add(mention.getId()); | ||
59 | + if (isCoreferentWithPreviousSentence(mentionId2Cluster, prevSentenceState, mention)) { | ||
60 | + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentenceState.getPrevSentence(), sentence, mention); | ||
61 | + candidates.add(candidate); | ||
54 | } | 62 | } |
63 | + break; | ||
64 | + } | ||
65 | + } | ||
66 | + } | ||
55 | 67 | ||
56 | - prevSentence = sentence; | ||
57 | - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | 68 | + private static boolean isCoreferentWithPreviousSentence(Map<String, Set<String>> mentionId2Cluster, PrevSentenceState prevSentenceState, TMention mention) { |
69 | + return mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceState.getNominativeMentionIds()::contains); | ||
70 | + } | ||
71 | + | ||
72 | + private static Map<String, TToken> getTokenId2Token(TSentence sentence) { | ||
73 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | ||
74 | + for (TToken t : sentence.getTokens()) | ||
75 | + tokenId2Token.put(t.getId(), t); | ||
76 | + return tokenId2Token; | ||
77 | + } | ||
78 | + | ||
79 | + private static Map<String, Set<String>> getMentionId2Cluster(TText text) { | ||
80 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | ||
81 | + for (TCoreference coreference : text.getCoreferences()) { | ||
82 | + for (String mentionId : coreference.getMentionIds()) { | ||
83 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | ||
58 | } | 84 | } |
59 | } | 85 | } |
60 | - return candidates; | 86 | + return mentionId2Cluster; |
61 | } | 87 | } |
62 | 88 | ||
63 | private static boolean isInNominative(TInterpretation interp) { | 89 | private static boolean isInNominative(TInterpretation interp) { |
64 | - boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | ||
65 | - boolean isSubst = interp.getCtag().equals("subst"); | 90 | + boolean isNominative = Arrays.stream(interp.getMsd().split(MSD_SPLITTER)).anyMatch(t -> t.equals(NOM)); |
91 | + boolean isSubst = interp.getCtag().equals(SUBST); | ||
66 | return isSubst && isNominative; | 92 | return isSubst && isNominative; |
67 | } | 93 | } |
94 | + | ||
95 | + private static class PrevSentenceState { | ||
96 | + | ||
97 | + private Set<String> nominativeMentionIds; | ||
98 | + private TSentence prevSentence; | ||
99 | + | ||
100 | + PrevSentenceState() { | ||
101 | + nominativeMentionIds = Sets.newHashSet(); | ||
102 | + prevSentence = null; | ||
103 | + } | ||
104 | + | ||
105 | + Set<String> getNominativeMentionIds() { | ||
106 | + return nominativeMentionIds; | ||
107 | + } | ||
108 | + | ||
109 | + TSentence getPrevSentence() { | ||
110 | + return prevSentence; | ||
111 | + } | ||
112 | + | ||
113 | + void setNominativeMentionIds(Set<String> nominativeMentionIds) { | ||
114 | + this.nominativeMentionIds = nominativeMentionIds; | ||
115 | + } | ||
116 | + | ||
117 | + void setPrevSentence(TSentence prevSentence) { | ||
118 | + this.prevSentence = prevSentence; | ||
119 | + } | ||
120 | + } | ||
68 | } | 121 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/candidate/ZeroSubjectCandidate.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.candidate; |
2 | 2 | ||
3 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 3 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
@@ -9,7 +9,7 @@ public class ZeroSubjectCandidate { | @@ -9,7 +9,7 @@ public class ZeroSubjectCandidate { | ||
9 | private final TSentence sentence; | 9 | private final TSentence sentence; |
10 | private final TMention zeroCandidateMention; | 10 | private final TMention zeroCandidateMention; |
11 | 11 | ||
12 | - public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { | 12 | + ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { |
13 | this.previousSentence = previousSentence; | 13 | this.previousSentence = previousSentence; |
14 | this.sentence = sentence; | 14 | this.sentence = sentence; |
15 | this.zeroCandidateMention = zeroCandidateMention; | 15 | this.zeroCandidateMention = zeroCandidateMention; |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | @@ -7,6 +7,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
9 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | 9 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | ||
10 | 12 | ||
11 | import java.io.IOException; | 13 | import java.io.IOException; |
12 | import java.io.InputStream; | 14 | import java.io.InputStream; |
nicolas-train/src/main/R/plot_summary_lenghts.R
0 → 100644
1 | +require(ggplot2) | ||
2 | +require(grid) | ||
3 | +require(gridExtra) | ||
4 | +require(lattice) | ||
5 | + | ||
6 | +DATA_DIR="../../../../data/" | ||
7 | + | ||
8 | +########################## functions | ||
9 | +gpl = function(d) { | ||
10 | + ggplot(d, aes(x=as.factor(d$SumRatio), y=SumRealRatio)) + | ||
11 | + geom_boxplot(outlier.shape=4, outlier.colour = "blue") + | ||
12 | + ylim(0, 40) + | ||
13 | + ylab("Obtained summary ratio (word count)") + | ||
14 | + xlab("Requested summary ratio (word count)") + | ||
15 | + theme(text = element_text(size=15)) | ||
16 | +} | ||
17 | + | ||
18 | +ploto = function(d) { | ||
19 | + p = gpl(d) | ||
20 | +} | ||
21 | + | ||
22 | +histo = function(d) { | ||
23 | + p = ggplot(d, aes(abs(d$SumRealRatio*100/d$SumRatio))) + | ||
24 | + geom_histogram(binwidth = 1) + | ||
25 | + xlim(80, 120) + | ||
26 | + ylab("Number of summaries") + | ||
27 | + xlab("Obtained summary ratio as percent of requested ratio (20%)") + | ||
28 | + theme(text = element_text(size=15)) | ||
29 | +} | ||
30 | + | ||
31 | +######################### automatic summaries | ||
32 | +data = read.csv(paste(DATA_DIR, "summary-lengths.tsv", sep=""), sep = "\t") | ||
33 | + | ||
34 | +names = list("Swietl", "nicolas", "nicolas-zero", "BASELINE") | ||
35 | +titles = list("Świetlicka", "Nicolas", "Nicolas-zero", "Baseline") | ||
36 | +plots = list() | ||
37 | +hists = list() | ||
38 | +i = 1 | ||
39 | +for (n in names) { | ||
40 | + print(n) | ||
41 | + title = titles[[i]] | ||
42 | + i = i + 1 | ||
43 | + | ||
44 | + d = data[data$SumAuthor==n,] | ||
45 | + print(mean(d$SumRealRatio)) | ||
46 | + | ||
47 | + p = ploto(d) | ||
48 | + p = p + ggtitle(title) | ||
49 | + plots = c(plots, list(p)) | ||
50 | + | ||
51 | + hi = histo(d) | ||
52 | + hi = hi + ggtitle(title) | ||
53 | + hists = c(hists, list(hi)) | ||
54 | +} | ||
55 | + | ||
56 | +pdf(file=paste(DATA_DIR, "summary-length-plots.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) | ||
57 | +grid.arrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol=2, nrow=2) | ||
58 | +dev.off() | ||
59 | + | ||
60 | +pdf(file=paste(DATA_DIR, "summary-length-hists.pdf", sep=""), encoding="ISOLatin2", width=11.69, height=8.27) | ||
61 | +grid.arrange(hists[[1]], hists[[2]], hists[[3]], hists[[4]], ncol=2, nrow=2) | ||
62 | +dev.off() |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
@@ -18,7 +18,7 @@ public class CorpusHelper { | @@ -18,7 +18,7 @@ public class CorpusHelper { | ||
18 | private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; | 18 | private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; |
19 | private static final String EXTRACT_SUMMARY_TYPE = "extract"; | 19 | private static final String EXTRACT_SUMMARY_TYPE = "extract"; |
20 | 20 | ||
21 | - private static final int SUMMARY_RATIO = 20; | 21 | + public static final int SUMMARY_RATIO = 20; |
22 | 22 | ||
23 | private CorpusHelper() { | 23 | private CorpusHelper() { |
24 | } | 24 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
@@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter; | @@ -6,6 +6,7 @@ import org.apache.commons.csv.CSVPrinter; | ||
6 | import org.slf4j.Logger; | 6 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 7 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | ||
9 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | 10 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
10 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | 11 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
11 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | 12 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
@@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths { | @@ -26,7 +27,7 @@ public class CalculateSystemSummaryLengths { | ||
26 | private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); | 27 | private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); |
27 | 28 | ||
28 | private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", | 29 | private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", |
29 | - "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); | 30 | + "TextWC", "SumType", "SumAuthor", "SumRatio", "SumWC", "SumRealRatio").withDelimiter('\t'); |
30 | 31 | ||
31 | private CalculateSystemSummaryLengths() { | 32 | private CalculateSystemSummaryLengths() { |
32 | } | 33 | } |
@@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths { | @@ -61,9 +62,10 @@ public class CalculateSystemSummaryLengths { | ||
61 | record.add(textWC); | 62 | record.add(textWC); |
62 | record.add("automatic"); | 63 | record.add("automatic"); |
63 | record.add(systemName); | 64 | record.add(systemName); |
65 | + record.add(CorpusHelper.SUMMARY_RATIO); | ||
64 | int sumWC = TextUtils.tokenize(body).size(); | 66 | int sumWC = TextUtils.tokenize(body).size(); |
65 | record.add(sumWC); | 67 | record.add(sumWC); |
66 | - record.add(sumWC * 1.0 / textWC); | 68 | + record.add(sumWC * 100.0 / textWC); |
67 | printer.printRecord(record); | 69 | printer.printRecord(record); |
68 | } | 70 | } |
69 | 71 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -29,6 +29,8 @@ public class SummarizeTestCorpus { | @@ -29,6 +29,8 @@ public class SummarizeTestCorpus { | ||
29 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); | 29 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); |
30 | 30 | ||
31 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; | 31 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; |
32 | + private static final String SUMMARY_WITH_ZERO_FILE_SUFFIX = "_nicolas-zero.txt"; | ||
33 | + | ||
32 | private static final double SUMMARY_RATIO = 0.2; | 34 | private static final double SUMMARY_RATIO = 0.2; |
33 | 35 | ||
34 | private SummarizeTestCorpus() { | 36 | private SummarizeTestCorpus() { |
@@ -41,16 +43,20 @@ public class SummarizeTestCorpus { | @@ -41,16 +43,20 @@ public class SummarizeTestCorpus { | ||
41 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); | 43 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); |
42 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); | 44 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); |
43 | 45 | ||
44 | - Map<String, String> id2summary = summarizeTexts(id2preprocessedText); | 46 | + summarize(new Nicolas(false), id2preprocessedText, SUMMARY_FILE_SUFFIX); |
47 | + summarize(new Nicolas(), id2preprocessedText, SUMMARY_WITH_ZERO_FILE_SUFFIX); | ||
48 | + } | ||
49 | + | ||
50 | + private static void summarize(Nicolas nicolas, Map<String, TText> id2preprocessedText, String fileSuffix) throws NicolasException, IOException { | ||
51 | + Map<String, String> id2summary = summarizeTexts(id2preprocessedText, nicolas); | ||
45 | LOG.info("Texts summarized."); | 52 | LOG.info("Texts summarized."); |
46 | 53 | ||
47 | - saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); | 54 | + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR, fileSuffix); |
48 | LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); | 55 | LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); |
49 | } | 56 | } |
50 | 57 | ||
51 | - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { | 58 | + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText, Nicolas nicolas) throws NicolasException { |
52 | Map<String, String> id2summary = Maps.newHashMap(); | 59 | Map<String, String> id2summary = Maps.newHashMap(); |
53 | - Nicolas nicolas = new Nicolas(); | ||
54 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 60 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
55 | TText text = entry.getValue(); | 61 | TText text = entry.getValue(); |
56 | int targetSize = calculateTargetSize(text); | 62 | int targetSize = calculateTargetSize(text); |
@@ -70,11 +76,11 @@ public class SummarizeTestCorpus { | @@ -70,11 +76,11 @@ public class SummarizeTestCorpus { | ||
70 | return (int) (SUMMARY_RATIO * tokenCount); | 76 | return (int) (SUMMARY_RATIO * tokenCount); |
71 | } | 77 | } |
72 | 78 | ||
73 | - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { | 79 | + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir, String fileSuffix) throws IOException { |
74 | for (Map.Entry<String, String> entry : id2summary.entrySet()) { | 80 | for (Map.Entry<String, String> entry : id2summary.entrySet()) { |
75 | String textId = entry.getKey(); | 81 | String textId = entry.getKey(); |
76 | String summary = entry.getValue(); | 82 | String summary = entry.getValue(); |
77 | - String targetFileName = textId + SUMMARY_FILE_SUFFIX; | 83 | + String targetFileName = textId + fileSuffix; |
78 | try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { | 84 | try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { |
79 | writer.write(summary); | 85 | writer.write(summary); |
80 | } | 86 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train; |
2 | 2 | ||
3 | import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*; | 3 | import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*; |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractMostFrequentMentions; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.resources.ExtractStopwords; | ||
4 | 6 | ||
5 | public class Main { | 7 | public class Main { |
6 | 8 | ||
@@ -12,6 +14,8 @@ public class Main { | @@ -12,6 +14,8 @@ public class Main { | ||
12 | DownloadTrainingResources.main(args); | 14 | DownloadTrainingResources.main(args); |
13 | ExtractGoldSummaries.main(args); | 15 | ExtractGoldSummaries.main(args); |
14 | CreateOptimalSummaries.main(args); | 16 | CreateOptimalSummaries.main(args); |
17 | + ExtractStopwords.main(args); | ||
18 | + ExtractMostFrequentMentions.main(args); | ||
15 | PrepareTrainingData.main(args); | 19 | PrepareTrainingData.main(args); |
16 | TrainAllModels.main(args); | 20 | TrainAllModels.main(args); |
17 | } | 21 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
@@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord; | @@ -7,7 +7,7 @@ import org.apache.commons.csv.CSVRecord; | ||
7 | import org.apache.commons.csv.QuoteMode; | 7 | import org.apache.commons.csv.QuoteMode; |
8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 8 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | 10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; |
11 | 11 | ||
12 | import java.io.*; | 12 | import java.io.*; |
13 | import java.util.List; | 13 | import java.util.List; |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; | @@ -19,10 +19,9 @@ import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; | ||
19 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; | 19 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; |
20 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | 20 | import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
21 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | 21 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
22 | -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
23 | -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | ||
24 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | 22 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
25 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | 23 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.CandidateFinder; |
24 | +import pl.waw.ipipan.zil.summ.nicolas.zero.candidate.ZeroSubjectCandidate; | ||
26 | import weka.core.Instance; | 25 | import weka.core.Instance; |
27 | import weka.core.Instances; | 26 | import weka.core.Instances; |
28 | import weka.core.converters.ArffSaver; | 27 | import weka.core.converters.ArffSaver; |
@@ -152,7 +151,7 @@ public class PrepareTrainingData { | @@ -152,7 +151,7 @@ public class PrepareTrainingData { | ||
152 | FeatureHelper featureHelper = new FeatureHelper(text); | 151 | FeatureHelper featureHelper = new FeatureHelper(text); |
153 | 152 | ||
154 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | 153 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); |
155 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | 154 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceUtils.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); |
156 | 155 | ||
157 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | 156 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { |
158 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | 157 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractMostFrequentMentions.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.resources; | ||
2 | + | ||
3 | +import com.google.common.collect.*; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
7 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | ||
8 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
9 | + | ||
10 | +import javax.xml.bind.JAXBException; | ||
11 | +import java.io.File; | ||
12 | +import java.io.IOException; | ||
13 | +import java.util.Comparator; | ||
14 | +import java.util.List; | ||
15 | +import java.util.Map; | ||
16 | +import java.util.Set; | ||
17 | +import java.util.stream.Collectors; | ||
18 | + | ||
19 | +public class ExtractMostFrequentMentions { | ||
20 | + | ||
21 | + public static final String GOLD_DATA_PATH = "/home/me2/Dropbox/3_nauka/3_doktorat/3_korpus_streszczen/dist/src/data/"; | ||
22 | + | ||
23 | + public static final String THRIFTED_PREFIX = "/home/me2/Desktop/thrifted_texts/thrifted_all/"; | ||
24 | + public static final String THRIFTED_SUFFIX = "/original"; | ||
25 | + | ||
26 | + public static void main(String[] args) throws IOException, JAXBException { | ||
27 | + | ||
28 | + Set<String> devIds = Sets.newHashSet(); | ||
29 | + | ||
30 | + File goldDir = new File(GOLD_DATA_PATH); | ||
31 | + for (File file : goldDir.listFiles()) { | ||
32 | + Text goldText = PSC_IO.readText(file); | ||
33 | + if (goldText.getSummaries().getSummary().stream().anyMatch(s -> s.getType().equals("abstract"))) | ||
34 | + continue; | ||
35 | + | ||
36 | + devIds.add(file.getName().replace(".xml", "")); | ||
37 | + } | ||
38 | + | ||
39 | + | ||
40 | + System.out.println(devIds.size()); | ||
41 | + | ||
42 | + Multiset<String> mentionCounts = HashMultiset.create(); | ||
43 | + for (String id : devIds) { | ||
44 | + Set<String> distinctTextMentions = Sets.newHashSet(); | ||
45 | + File input = new File(THRIFTED_PREFIX + id + THRIFTED_SUFFIX); | ||
46 | + TText thrifted = ThriftUtils.loadThriftTextFromFile(input); | ||
47 | + List<TSentence> sents = thrifted.getParagraphs().stream() | ||
48 | + .flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | ||
49 | + | ||
50 | + Map<String, String> tokenId2base = Maps.newHashMap(); | ||
51 | + sents.stream() | ||
52 | + .flatMap(s -> s.getTokens().stream()) | ||
53 | + .forEach(token -> tokenId2base.put(token.getId(), token.getChosenInterpretation().getBase())); | ||
54 | + | ||
55 | + sents.stream().flatMap(s -> s.getMentions().stream()).forEach(m -> { | ||
56 | + StringBuffer sb = new StringBuffer(); | ||
57 | + for (String tokId : m.getChildIds()) { | ||
58 | + sb.append(tokenId2base.get(tokId) + " "); | ||
59 | + } | ||
60 | + distinctTextMentions.add(sb.toString().trim().toLowerCase()); | ||
61 | + }); | ||
62 | + | ||
63 | + mentionCounts.addAll(distinctTextMentions); | ||
64 | + } | ||
65 | + | ||
66 | + System.out.println(mentionCounts.elementSet().size()); | ||
67 | + List<String> sorted = Lists.newArrayList(); | ||
68 | + sorted.addAll(mentionCounts.elementSet()); | ||
69 | + sorted.sort(Comparator.comparing(mentionCounts::count).reversed()); | ||
70 | + int i = 0; | ||
71 | + for (String mention : sorted) { | ||
72 | + if (mentionCounts.count(mention) < 50) | ||
73 | + break; | ||
74 | + System.out.println(mention); | ||
75 | + } | ||
76 | + | ||
77 | + } | ||
78 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/resources/ExtractStopwords.java
0 → 100644