Commit eac83d24d0d460300033f920fafbc7fa3d5ecdbb
1 parent
b41f6532
refactor
Showing
24 changed files
with
639 additions
and
169 deletions
nicolas-common/pom.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <modelVersion>4.0.0</modelVersion> | ||
6 | + <parent> | ||
7 | + <artifactId>nicolas-container</artifactId> | ||
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | + <version>1.0-SNAPSHOT</version> | ||
10 | + </parent> | ||
11 | + | ||
12 | + <artifactId>nicolas-common</artifactId> | ||
13 | + | ||
14 | + <dependencies> | ||
15 | + <!-- internal --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>pscapi</artifactId> | ||
19 | + </dependency> | ||
20 | + <dependency> | ||
21 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
22 | + <artifactId>utils</artifactId> | ||
23 | + </dependency> | ||
24 | + | ||
25 | + <!-- third party --> | ||
26 | + <dependency> | ||
27 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
28 | + <artifactId>weka-dev</artifactId> | ||
29 | + </dependency> | ||
30 | + | ||
31 | + <!-- logging --> | ||
32 | + <dependency> | ||
33 | + <groupId>org.slf4j</groupId> | ||
34 | + <artifactId>slf4j-api</artifactId> | ||
35 | + </dependency> | ||
36 | + | ||
37 | + </dependencies> | ||
38 | + | ||
39 | +</project> | ||
0 | \ No newline at end of file | 40 | \ No newline at end of file |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.common; |
2 | 2 | ||
3 | -import com.google.common.base.Charsets; | ||
4 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
5 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
6 | import com.google.common.collect.Sets; | 5 | import com.google.common.collect.Sets; |
7 | -import com.google.common.io.Files; | ||
8 | import org.slf4j.Logger; | 6 | import org.slf4j.Logger; |
9 | import org.slf4j.LoggerFactory; | 7 | import org.slf4j.LoggerFactory; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
14 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
15 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | ||
16 | -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | ||
17 | import weka.classifiers.Classifier; | 12 | import weka.classifiers.Classifier; |
18 | import weka.core.Attribute; | 13 | import weka.core.Attribute; |
19 | -import weka.core.DenseInstance; | ||
20 | -import weka.core.Instance; | ||
21 | import weka.core.Instances; | 14 | import weka.core.Instances; |
22 | 15 | ||
23 | -import java.io.File; | ||
24 | -import java.io.FileInputStream; | ||
25 | -import java.io.IOException; | ||
26 | -import java.io.ObjectInputStream; | 16 | +import java.io.*; |
27 | import java.util.*; | 17 | import java.util.*; |
28 | import java.util.function.Function; | 18 | import java.util.function.Function; |
29 | import java.util.stream.Collectors; | 19 | import java.util.stream.Collectors; |
30 | 20 | ||
31 | -import static java.util.stream.Collectors.toList; | ||
32 | - | ||
33 | public class Utils { | 21 | public class Utils { |
34 | 22 | ||
35 | private static final Logger LOG = LoggerFactory.getLogger(Utils.class); | 23 | private static final Logger LOG = LoggerFactory.getLogger(Utils.class); |
36 | 24 | ||
37 | private static final String DATASET_NAME = "Dataset"; | 25 | private static final String DATASET_NAME = "Dataset"; |
38 | 26 | ||
39 | - public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | ||
40 | - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
41 | - Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | ||
42 | - | ||
43 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | ||
44 | - Map<TMention, Instance> mention2instance = Maps.newHashMap(); | ||
45 | - for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | ||
46 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
47 | - Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | ||
48 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
49 | - instance.setValue(attribute, mentionFeatures.get(attribute)); | ||
50 | - } | ||
51 | - mention2instance.put(tMention, instance); | ||
52 | - } | ||
53 | - return mention2instance; | ||
54 | - } | ||
55 | - | ||
56 | - public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | ||
57 | - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
58 | - Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | ||
59 | - | ||
60 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | ||
61 | - Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | ||
62 | - for (TSentence sentence : sentences) { | ||
63 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
64 | - Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | ||
65 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
66 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
67 | - } | ||
68 | - sentence2instance.put(sentence, instance); | ||
69 | - } | ||
70 | - return sentence2instance; | ||
71 | - } | ||
72 | - | ||
73 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | 27 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
74 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); | 28 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
75 | instances.setClassIndex(0); | 29 | instances.setClassIndex(0); |
@@ -97,7 +51,16 @@ public class Utils { | @@ -97,7 +51,16 @@ public class Utils { | ||
97 | 51 | ||
98 | 52 | ||
99 | public static TText loadThrifted(File originalFile) { | 53 | public static TText loadThrifted(File originalFile) { |
100 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) { | 54 | + try (FileInputStream inputStream = new FileInputStream(originalFile)) { |
55 | + return loadThrifted(inputStream); | ||
56 | + } catch (IOException e) { | ||
57 | + LOG.error("Error reading serialized file: " + e); | ||
58 | + return null; | ||
59 | + } | ||
60 | + } | ||
61 | + | ||
62 | + public static TText loadThrifted(InputStream stream) { | ||
63 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | ||
101 | return (TText) ois.readObject(); | 64 | return (TText) ois.readObject(); |
102 | } catch (ClassNotFoundException | IOException e) { | 65 | } catch (ClassNotFoundException | IOException e) { |
103 | LOG.error("Error reading serialized file: " + e); | 66 | LOG.error("Error reading serialized file: " + e); |
@@ -188,13 +151,5 @@ public class Utils { | @@ -188,13 +151,5 @@ public class Utils { | ||
188 | return sb.toString().trim(); | 151 | return sb.toString().trim(); |
189 | } | 152 | } |
190 | 153 | ||
191 | - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | ||
192 | - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | ||
193 | 154 | ||
194 | - MentionScorer scorer = new MentionScorer(); | ||
195 | - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | ||
196 | - | ||
197 | - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | ||
198 | - return mention2score.keySet(); | ||
199 | - } | ||
200 | } | 155 | } |
201 | \ No newline at end of file | 156 | \ No newline at end of file |
nicolas-core/pom.xml
@@ -12,10 +12,14 @@ | @@ -12,10 +12,14 @@ | ||
12 | <artifactId>nicolas</artifactId> | 12 | <artifactId>nicolas</artifactId> |
13 | 13 | ||
14 | <dependencies> | 14 | <dependencies> |
15 | + <!-- project --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>nicolas-common</artifactId> | ||
19 | + </dependency> | ||
15 | <dependency> | 20 | <dependency> |
16 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
17 | <artifactId>nicolas-model</artifactId> | 22 | <artifactId>nicolas-model</artifactId> |
18 | - <scope>runtime</scope> | ||
19 | </dependency> | 23 | </dependency> |
20 | 24 | ||
21 | <dependency> | 25 | <dependency> |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -6,6 +6,7 @@ import com.google.common.collect.Sets; | @@ -6,6 +6,7 @@ import com.google.common.collect.Sets; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
10 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
11 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
@@ -53,7 +54,7 @@ public class Nicolas { | @@ -53,7 +54,7 @@ public class Nicolas { | ||
53 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 54 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
54 | 55 | ||
55 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | 56 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
56 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | 57 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
57 | 58 | ||
58 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 59 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
59 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 60 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | ||
2 | + | ||
3 | +import com.google.common.base.Charsets; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.io.Files; | ||
6 | +import org.slf4j.Logger; | ||
7 | +import org.slf4j.LoggerFactory; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | ||
13 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | ||
14 | +import weka.core.Attribute; | ||
15 | +import weka.core.DenseInstance; | ||
16 | +import weka.core.Instance; | ||
17 | + | ||
18 | +import java.io.File; | ||
19 | +import java.io.IOException; | ||
20 | +import java.util.List; | ||
21 | +import java.util.Map; | ||
22 | +import java.util.Set; | ||
23 | + | ||
24 | +import static java.util.stream.Collectors.toList; | ||
25 | + | ||
26 | +public class ThriftUtils { | ||
27 | + | ||
28 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | ||
29 | + | ||
30 | + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | ||
31 | + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | ||
32 | + | ||
33 | + MentionScorer scorer = new MentionScorer(); | ||
34 | + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | ||
35 | + | ||
36 | + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | ||
37 | + return mention2score.keySet(); | ||
38 | + } | ||
39 | + | ||
40 | + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | ||
41 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
42 | + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | ||
43 | + | ||
44 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | ||
45 | + Map<TMention, Instance> mention2instance = Maps.newHashMap(); | ||
46 | + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | ||
47 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
48 | + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | ||
49 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
50 | + instance.setValue(attribute, mentionFeatures.get(attribute)); | ||
51 | + } | ||
52 | + mention2instance.put(tMention, instance); | ||
53 | + } | ||
54 | + return mention2instance; | ||
55 | + } | ||
56 | + | ||
57 | + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | ||
58 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
59 | + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | ||
60 | + | ||
61 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | ||
62 | + Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | ||
63 | + for (TSentence sentence : sentences) { | ||
64 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
65 | + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | ||
66 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
67 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
68 | + } | ||
69 | + sentence2instance.put(sentence, instance); | ||
70 | + } | ||
71 | + return sentence2instance; | ||
72 | + } | ||
73 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
@@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 12 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
15 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
@@ -85,7 +86,7 @@ public class ApplyModel2 { | @@ -85,7 +86,7 @@ public class ApplyModel2 { | ||
85 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 86 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
86 | 87 | ||
87 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | 88 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
88 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | 89 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
89 | 90 | ||
90 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 91 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
91 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 92 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features; | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
7 | 7 | ||
8 | import java.util.List; | 8 | import java.util.List; |
9 | import java.util.Map; | 9 | import java.util.Map; |
@@ -14,9 +14,7 @@ import java.util.stream.Collectors; | @@ -14,9 +14,7 @@ import java.util.stream.Collectors; | ||
14 | import static java.util.stream.Collectors.toList; | 14 | import static java.util.stream.Collectors.toList; |
15 | import static java.util.stream.Collectors.toMap; | 15 | import static java.util.stream.Collectors.toMap; |
16 | 16 | ||
17 | -/** | ||
18 | - * Created by me2 on 04.04.16. | ||
19 | - */ | 17 | + |
20 | public class FeatureHelper { | 18 | public class FeatureHelper { |
21 | 19 | ||
22 | private final List<TMention> mentions; | 20 | private final List<TMention> mentions; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,7 +5,8 @@ import org.slf4j.Logger; | @@ -5,7 +5,8 @@ import org.slf4j.Logger; | ||
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
9 | import weka.classifiers.Classifier; | 10 | import weka.classifiers.Classifier; |
10 | import weka.core.Instance; | 11 | import weka.core.Instance; |
11 | import weka.core.Instances; | 12 | import weka.core.Instances; |
@@ -21,7 +22,7 @@ public class MentionModel { | @@ -21,7 +22,7 @@ public class MentionModel { | ||
21 | Set<TMention> goodMentions = Sets.newHashSet(); | 22 | Set<TMention> goodMentions = Sets.newHashSet(); |
22 | 23 | ||
23 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 24 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
24 | - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor); | 25 | + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor); |
25 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | 26 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
26 | Instance instance = entry.getValue(); | 27 | Instance instance = entry.getValue(); |
27 | instance.setDataset(instances); | 28 | instance.setDataset(instances); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
@@ -6,9 +6,8 @@ import com.google.common.collect.Multiset; | @@ -6,9 +6,8 @@ import com.google.common.collect.Multiset; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | 10 | ||
11 | -import java.util.Collection; | ||
12 | import java.util.List; | 11 | import java.util.List; |
13 | import java.util.Map; | 12 | import java.util.Map; |
14 | import java.util.stream.Collectors; | 13 | import java.util.stream.Collectors; |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
@@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory; | @@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
12 | import weka.core.Instance; | 13 | import weka.core.Instance; |
13 | import weka.core.Instances; | 14 | import weka.core.Instances; |
14 | import weka.core.converters.ArffSaver; | 15 | import weka.core.converters.ArffSaver; |
@@ -45,7 +46,7 @@ public class PrepareTrainingData { | @@ -45,7 +46,7 @@ public class PrepareTrainingData { | ||
45 | continue; | 46 | continue; |
46 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | 47 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); |
47 | 48 | ||
48 | - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | 49 | + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); |
49 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | 50 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
50 | TMention mention = entry.getKey(); | 51 | TMention mention = entry.getKey(); |
51 | Instance instance = entry.getValue(); | 52 | Instance instance = entry.getValue(); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
@@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 12 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
15 | import weka.classifiers.Classifier; | 16 | import weka.classifiers.Classifier; |
@@ -58,7 +59,7 @@ public class PrepareTrainingData { | @@ -58,7 +59,7 @@ public class PrepareTrainingData { | ||
58 | // Set<TMention> goodMentions | 59 | // Set<TMention> goodMentions |
59 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | 60 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); |
60 | 61 | ||
61 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | 62 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); |
62 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 63 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
63 | TSentence sentence = entry.getKey(); | 64 | TSentence sentence = entry.getKey(); |
64 | Instance instance = entry.getValue(); | 65 | Instance instance = entry.getValue(); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
@@ -6,7 +6,7 @@ import com.google.common.collect.Multiset; | @@ -6,7 +6,7 @@ import com.google.common.collect.Multiset; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | 10 | ||
11 | import java.util.List; | 11 | import java.util.List; |
12 | import java.util.Map; | 12 | import java.util.Map; |
nicolas-train/pom.xml
@@ -11,4 +11,21 @@ | @@ -11,4 +11,21 @@ | ||
11 | 11 | ||
12 | <artifactId>nicolas-train</artifactId> | 12 | <artifactId>nicolas-train</artifactId> |
13 | 13 | ||
14 | + <dependencies> | ||
15 | + <!-- internal --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>pscapi</artifactId> | ||
19 | + </dependency> | ||
20 | + <dependency> | ||
21 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
22 | + <artifactId>utils</artifactId> | ||
23 | + </dependency> | ||
24 | + | ||
25 | + <!-- logging --> | ||
26 | + <dependency> | ||
27 | + <groupId>org.slf4j</groupId> | ||
28 | + <artifactId>slf4j-api</artifactId> | ||
29 | + </dependency> | ||
30 | + </dependencies> | ||
14 | </project> | 31 | </project> |
15 | \ No newline at end of file | 32 | \ No newline at end of file |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java
0 → 100644
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | ||
2 | + | ||
3 | +import org.apache.thrift.TException; | ||
4 | +import org.apache.thrift.protocol.TBinaryProtocol; | ||
5 | +import org.apache.thrift.protocol.TProtocol; | ||
6 | +import org.apache.thrift.transport.TSocket; | ||
7 | +import org.apache.thrift.transport.TTransport; | ||
8 | +import org.slf4j.Logger; | ||
9 | +import org.slf4j.LoggerFactory; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.Multiservice; | ||
11 | +import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest; | ||
12 | +import pl.waw.ipipan.zil.multiservice.thrift.RequestPart; | ||
13 | +import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus; | ||
14 | +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | ||
15 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
16 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
17 | + | ||
18 | +import java.util.ArrayList; | ||
19 | +import java.util.HashMap; | ||
20 | +import java.util.List; | ||
21 | +import java.util.Map; | ||
22 | + | ||
23 | +public class MultiserviceProxy { | ||
24 | + | ||
25 | + private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class); | ||
26 | + | ||
27 | + private int port; | ||
28 | + private String host; | ||
29 | + | ||
30 | + public MultiserviceProxy(String host, int port) { | ||
31 | + this.host = host; | ||
32 | + this.port = port; | ||
33 | + LOG.info("Multiservice at " + host + ":" + port); | ||
34 | + } | ||
35 | + | ||
36 | + public TText process(String text, List<String> services) throws Exception { | ||
37 | + List<Map<String, String>> options = new ArrayList<>(); | ||
38 | + for (int i = 0; i < services.size(); i++) | ||
39 | + options.add(new HashMap<>()); | ||
40 | + return process(text, "", services, options); | ||
41 | + } | ||
42 | + | ||
43 | + public TText process(String text, String title, List<String> services, List<Map<String, String>> options) | ||
44 | + throws Exception { | ||
45 | + TTransport transport = new TSocket(host, port); | ||
46 | + ObjectRequest objectRequest = createRequest(text, title, services, options); | ||
47 | + | ||
48 | + try { | ||
49 | + transport.open(); | ||
50 | + | ||
51 | + TProtocol protocol = new TBinaryProtocol(transport); | ||
52 | + Multiservice.Client client = new Multiservice.Client(protocol); | ||
53 | + | ||
54 | + LOG.debug("Sending Multservice request..."); | ||
55 | + TText responseText = request(objectRequest, client); | ||
56 | + LOG.debug("...done"); | ||
57 | + | ||
58 | + return responseText; | ||
59 | + | ||
60 | + } catch (TException e) { | ||
61 | + LOG.error("Error processing request:" + e); | ||
62 | + throw new Exception(e); | ||
63 | + | ||
64 | + } finally { | ||
65 | + transport.close(); | ||
66 | + } | ||
67 | + } | ||
68 | + | ||
69 | + private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException { | ||
70 | + | ||
71 | + String requestToken = client.putObjectRequest(objectRequest); | ||
72 | + while (true) { | ||
73 | + RequestStatus status = client.getRequestStatus(requestToken); | ||
74 | + if (RequestStatus.DONE.equals(status)) { | ||
75 | + TText result = client.getResultObject(requestToken); | ||
76 | + return result; | ||
77 | + } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { | ||
78 | + try { | ||
79 | + MultiserviceException exception = client.getException(requestToken); | ||
80 | + throw exception; | ||
81 | + } catch (TException e) { | ||
82 | + throw e; | ||
83 | + } | ||
84 | + } | ||
85 | + } | ||
86 | + } | ||
87 | + | ||
88 | + private ObjectRequest createRequest(String textBody, String textTitle, List<String> services, | ||
89 | + List<Map<String, String>> options) { | ||
90 | + TText text = new TText(); | ||
91 | + | ||
92 | + TParagraph par = new TParagraph(); | ||
93 | + par.setText(textTitle); | ||
94 | + text.addToParagraphs(par); | ||
95 | + | ||
96 | + for (String p : textBody.split("\n\n")) { | ||
97 | + par = new TParagraph(); | ||
98 | + par.setText(p); | ||
99 | + text.addToParagraphs(par); | ||
100 | + } | ||
101 | + | ||
102 | + List<RequestPart> processingChain = new ArrayList<>(); | ||
103 | + int i = 0; | ||
104 | + for (String serviceName : services) | ||
105 | + processingChain.add(new RequestPart(serviceName, options.get(i++))); | ||
106 | + | ||
107 | + return new ObjectRequest(text, processingChain); | ||
108 | + } | ||
109 | + | ||
110 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
6 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | ||
7 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.io.FileOutputStream; | ||
11 | +import java.io.IOException; | ||
12 | +import java.io.ObjectOutputStream; | ||
13 | +import java.util.Arrays; | ||
14 | +import java.util.List; | ||
15 | + | ||
16 | +public class NLPProcess { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class); | ||
19 | + | ||
20 | + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", | ||
21 | + "Bartek"); | ||
22 | + private static final int PORT = 20000; | ||
23 | + private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; | ||
24 | + | ||
25 | + private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); | ||
26 | + | ||
27 | + private NLPProcess() { | ||
28 | + } | ||
29 | + | ||
30 | + public static void main(String[] args) { | ||
31 | + if (args.length != 2) { | ||
32 | + LOG.error("Wrong usage! Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | ||
33 | + return; | ||
34 | + } | ||
35 | + File corpusDir = new File(args[0]); | ||
36 | + if (!corpusDir.isDirectory()) { | ||
37 | + LOG.error("Corpus directory does not exist: " + corpusDir); | ||
38 | + return; | ||
39 | + } | ||
40 | + File targetDir = new File(args[1]); | ||
41 | + if (!targetDir.isDirectory()) { | ||
42 | + LOG.error("Target directory does not exist: " + targetDir); | ||
43 | + return; | ||
44 | + } | ||
45 | + | ||
46 | + int ok = 0; | ||
47 | + int err = 0; | ||
48 | + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml")); | ||
49 | + Arrays.sort(files); | ||
50 | + for (File file : files) { | ||
51 | + try { | ||
52 | + Text text = PSC_IO.readText(file); | ||
53 | + File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin")); | ||
54 | + annotateNLP(text, targetFile); | ||
55 | + ok++; | ||
56 | + } catch (Exception e) { | ||
57 | + err++; | ||
58 | + LOG.error("Problem with text in " + file + ", " + e); | ||
59 | + } | ||
60 | + } | ||
61 | + LOG.info(ok + " texts processed successfully."); | ||
62 | + LOG.info(err + " texts with errors."); | ||
63 | + } | ||
64 | + | ||
65 | + private static void annotateNLP(Text text, File targetFile) throws Exception { | ||
66 | + annotate(text.getBody(), targetFile); | ||
67 | + } | ||
68 | + | ||
69 | + private static void annotate(String body, File targetFile) throws Exception { | ||
70 | + if (targetFile.exists()) { | ||
71 | + LOG.debug("Skipping existing file.."); | ||
72 | + return; | ||
73 | + } | ||
74 | + LOG.info("Processing text into " + targetFile.getPath()); | ||
75 | + TText ttext = MSPROXY.process(body, SERVICES); | ||
76 | + serialize(ttext, targetFile); | ||
77 | + } | ||
78 | + | ||
79 | + public static void serialize(TText ttext, File targetFile) throws IOException { | ||
80 | + try (FileOutputStream fout = new FileOutputStream(targetFile); | ||
81 | + ObjectOutputStream oos = new ObjectOutputStream(fout)) { | ||
82 | + oos.writeObject(ttext); | ||
83 | + } | ||
84 | + } | ||
85 | + | ||
86 | + public static TText annotate(String body) throws Exception { | ||
87 | + return MSPROXY.process(body, SERVICES); | ||
88 | + } | ||
89 | + | ||
90 | +} |
nicolas-zero/pom.xml
@@ -11,4 +11,34 @@ | @@ -11,4 +11,34 @@ | ||
11 | 11 | ||
12 | <artifactId>nicolas-zero</artifactId> | 12 | <artifactId>nicolas-zero</artifactId> |
13 | 13 | ||
14 | + <dependencies> | ||
15 | + <!-- project --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>nicolas-common</artifactId> | ||
19 | + </dependency> | ||
20 | + | ||
21 | + <!-- third party --> | ||
22 | + <dependency> | ||
23 | + <groupId>org.apache.commons</groupId> | ||
24 | + <artifactId>commons-csv</artifactId> | ||
25 | + </dependency> | ||
26 | + <dependency> | ||
27 | + <groupId>commons-io</groupId> | ||
28 | + <artifactId>commons-io</artifactId> | ||
29 | + </dependency> | ||
30 | + | ||
31 | + <!-- logging --> | ||
32 | + <dependency> | ||
33 | + <groupId>org.slf4j</groupId> | ||
34 | + <artifactId>slf4j-api</artifactId> | ||
35 | + </dependency> | ||
36 | + | ||
37 | + <!-- test --> | ||
38 | + <dependency> | ||
39 | + <groupId>junit</groupId> | ||
40 | + <artifactId>junit</artifactId> | ||
41 | + </dependency> | ||
42 | + </dependencies> | ||
43 | + | ||
14 | </project> | 44 | </project> |
15 | \ No newline at end of file | 45 | \ No newline at end of file |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
@@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | @@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
3 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | import com.google.common.collect.Sets; | 5 | import com.google.common.collect.Sets; |
6 | -import org.apache.commons.csv.CSVFormat; | ||
7 | -import org.apache.commons.csv.CSVPrinter; | ||
8 | -import org.apache.commons.csv.QuoteMode; | ||
9 | -import org.apache.commons.io.IOUtils; | ||
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | ||
12 | 7 | ||
13 | -import java.io.File; | ||
14 | -import java.io.FileReader; | ||
15 | -import java.io.FileWriter; | ||
16 | -import java.io.IOException; | ||
17 | import java.util.Arrays; | 8 | import java.util.Arrays; |
18 | import java.util.List; | 9 | import java.util.List; |
19 | import java.util.Map; | 10 | import java.util.Map; |
20 | import java.util.Set; | 11 | import java.util.Set; |
21 | 12 | ||
22 | -/** | ||
23 | - * Created by me2 on 26.07.16. | ||
24 | - */ | ||
25 | -public class Zero { | 13 | +public class CandidateFinder { |
26 | 14 | ||
27 | - private static final String IDS_PATH = "summaries_dev"; | ||
28 | - private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | 15 | + public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { |
16 | + List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); | ||
29 | 17 | ||
30 | - public static void main(String[] args) throws IOException { | ||
31 | - | ||
32 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | ||
33 | - Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | ||
34 | - | ||
35 | - int mentionCount = 0; | ||
36 | - int mentionInNom = 0; | ||
37 | - int mentionInNomSequential = 0; | ||
38 | - | ||
39 | - List<List<Object>> rows = Lists.newArrayList(); | ||
40 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
41 | - String textId = entry.getKey(); | ||
42 | -// System.out.println(id); | ||
43 | - | ||
44 | - TText text = entry.getValue(); | ||
45 | - List<String> sentenceIds = id2sentIds.get(textId); | ||
46 | -// System.out.println(sentenceIds); | ||
47 | - | ||
48 | - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | ||
49 | - for (TCoreference coreference : text.getCoreferences()) { | ||
50 | - for (String mentionId : coreference.getMentionIds()) { | ||
51 | - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | ||
52 | - } | 18 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); |
19 | + for (TCoreference coreference : text.getCoreferences()) { | ||
20 | + for (String mentionId : coreference.getMentionIds()) { | ||
21 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | ||
53 | } | 22 | } |
23 | + } | ||
54 | 24 | ||
55 | - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | ||
56 | - TSentence prevSentence = null; | ||
57 | - for (TParagraph p : text.getParagraphs()) { | ||
58 | - Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | ||
59 | - | ||
60 | - for (TSentence sentence : p.getSentences()) { | ||
61 | - if (!sentenceIds.contains(sentence.getId())) | ||
62 | - continue; | ||
63 | - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
64 | - | ||
65 | - Map<String, TToken> tokenId2Token = Maps.newHashMap(); | ||
66 | - for (TToken t : sentence.getTokens()) | ||
67 | - tokenId2Token.put(t.getId(), t); | 25 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); |
26 | + TSentence prevSentence = null; | ||
27 | + for (TParagraph p : text.getParagraphs()) { | ||
28 | + for (TSentence sentence : p.getSentences()) { | ||
29 | + if (!summarySentenceIds.contains(sentence.getId())) | ||
30 | + continue; | ||
31 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | ||
68 | 32 | ||
69 | - for (TMention mention : sentence.getMentions()) { | ||
70 | - mentionCount++; | 33 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); |
34 | + for (TToken t : sentence.getTokens()) | ||
35 | + tokenId2Token.put(t.getId(), t); | ||
71 | 36 | ||
72 | - for (String tokenId : mention.getHeadIds()) { | ||
73 | - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
74 | - if (isInNominative(interp)) { | ||
75 | - mentionInNom++; | 37 | + for (TMention mention : sentence.getMentions()) { |
76 | 38 | ||
77 | - currentSentenceNominativeMentionIds.add(mention.getId()); | ||
78 | - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | ||
79 | - mentionInNomSequential++; | ||
80 | - System.out.println(tMentionStringMap.get(mention) | ||
81 | - + "\n\t" + Utils.loadSentence2Orth(prevSentence) | ||
82 | - + "\n\t" + Utils.loadSentence2Orth(sentence)); | 39 | + for (String tokenId : mention.getHeadIds()) { |
40 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | ||
41 | + if (isInNominative(interp)) { | ||
83 | 42 | ||
84 | - List<Object> row = Lists.newArrayList(); | ||
85 | - row.add("C"); | ||
86 | - row.add(textId); | ||
87 | - row.add(tMentionStringMap.get(mention)); | ||
88 | - row.add(Utils.loadSentence2Orth(prevSentence)); | ||
89 | - row.add(Utils.loadSentence2Orth(sentence)); | ||
90 | - rows.add(row); | ||
91 | - } | ||
92 | - break; | 43 | + currentSentenceNominativeMentionIds.add(mention.getId()); |
44 | + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | ||
45 | + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention); | ||
46 | + candidates.add(candidate); | ||
93 | } | 47 | } |
48 | + break; | ||
94 | } | 49 | } |
95 | } | 50 | } |
96 | - | ||
97 | - prevSentence = sentence; | ||
98 | - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | ||
99 | } | 51 | } |
100 | - } | ||
101 | - } | ||
102 | - | ||
103 | - System.out.println(mentionCount + " mentions"); | ||
104 | - System.out.println(mentionInNom + " mention in nom"); | ||
105 | - System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | ||
106 | 52 | ||
107 | - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | ||
108 | - for (List<Object> row : rows) { | ||
109 | - csvPrinter.printRecord(row); | 53 | + prevSentence = sentence; |
54 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | ||
110 | } | 55 | } |
111 | } | 56 | } |
112 | - | 57 | + return candidates; |
113 | } | 58 | } |
114 | 59 | ||
115 | private static boolean isInNominative(TInterpretation interp) { | 60 | private static boolean isInNominative(TInterpretation interp) { |
116 | return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | 61 | return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); |
117 | } | 62 | } |
118 | - | ||
119 | - private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | ||
120 | - Map<String, List<String>> result = Maps.newHashMap(); | ||
121 | - for (File f : new File(idsPath).listFiles()) { | ||
122 | - String id = f.getName().split("_")[0]; | ||
123 | - List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | ||
124 | - result.put(id, sentenceIds); | ||
125 | - } | ||
126 | - return result; | ||
127 | - } | ||
128 | } | 63 | } |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Lists; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import com.google.common.collect.Sets; | ||
6 | +import org.apache.commons.csv.CSVFormat; | ||
7 | +import org.apache.commons.csv.CSVPrinter; | ||
8 | +import org.apache.commons.csv.QuoteMode; | ||
9 | +import org.apache.commons.io.IOUtils; | ||
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | ||
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
13 | + | ||
14 | +import java.io.File; | ||
15 | +import java.io.FileReader; | ||
16 | +import java.io.FileWriter; | ||
17 | +import java.io.IOException; | ||
18 | +import java.util.List; | ||
19 | +import java.util.Map; | ||
20 | +import java.util.Set; | ||
21 | + | ||
22 | +public class Zero { | ||
23 | + | ||
24 | + private static final String IDS_PATH = "corpora/summaries_dev"; | ||
25 | + private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | ||
26 | + | ||
27 | + private Zero() { | ||
28 | + } | ||
29 | + | ||
30 | + public static void main(String[] args) throws IOException { | ||
31 | + | ||
32 | + CandidateFinder candidateFinder = new CandidateFinder(); | ||
33 | + | ||
34 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | ||
35 | + Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); | ||
36 | + | ||
37 | + List<List<Object>> rows = Lists.newArrayList(); | ||
38 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
39 | + String textId = entry.getKey(); | ||
40 | + | ||
41 | + TText text = entry.getValue(); | ||
42 | + ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text); | ||
43 | + | ||
44 | + Set<String> sentenceIds = id2sentIds.get(textId); | ||
45 | + | ||
46 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds); | ||
47 | + | ||
48 | + for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) { | ||
49 | + List<Object> row = Lists.newArrayList(); | ||
50 | + row.add("C"); | ||
51 | + row.add(textId); | ||
52 | + row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention())); | ||
53 | + row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence())); | ||
54 | + row.add(thriftTextHelper.getSentenceText(candidate.getSentence())); | ||
55 | + rows.add(row); | ||
56 | + } | ||
57 | + } | ||
58 | + | ||
59 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | ||
60 | + for (List<Object> row : rows) { | ||
61 | + csvPrinter.printRecord(row); | ||
62 | + } | ||
63 | + } | ||
64 | + | ||
65 | + } | ||
66 | + | ||
67 | + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | ||
68 | + Map<String, Set<String>> result = Maps.newHashMap(); | ||
69 | + for (File f : new File(idsPath).listFiles()) { | ||
70 | + String id = f.getName().split("_")[0]; | ||
71 | + List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | ||
72 | + result.put(id, Sets.newHashSet(sentenceIds)); | ||
73 | + } | ||
74 | + return result; | ||
75 | + } | ||
76 | +} |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
5 | + | ||
6 | +public class ZeroSubjectCandidate { | ||
7 | + | ||
8 | + private final TSentence previousSentence; | ||
9 | + private final TSentence sentence; | ||
10 | + private final TMention zeroCandidateMention; | ||
11 | + | ||
12 | + public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { | ||
13 | + this.previousSentence = previousSentence; | ||
14 | + this.sentence = sentence; | ||
15 | + this.zeroCandidateMention = zeroCandidateMention; | ||
16 | + } | ||
17 | + | ||
18 | + public TSentence getPreviousSentence() { | ||
19 | + return previousSentence; | ||
20 | + } | ||
21 | + | ||
22 | + public TSentence getSentence() { | ||
23 | + return sentence; | ||
24 | + } | ||
25 | + | ||
26 | + public TMention getZeroCandidateMention() { | ||
27 | + return zeroCandidateMention; | ||
28 | + } | ||
29 | +} |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
0 → 100644
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Sets; | ||
4 | +import org.apache.commons.io.IOUtils; | ||
5 | +import org.junit.BeforeClass; | ||
6 | +import org.junit.Test; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
11 | + | ||
12 | +import java.io.IOException; | ||
13 | +import java.io.InputStream; | ||
14 | +import java.io.InputStreamReader; | ||
15 | +import java.util.List; | ||
16 | +import java.util.Set; | ||
17 | + | ||
18 | +import static org.junit.Assert.assertEquals; | ||
19 | + | ||
20 | +public class CandidateFinderTest { | ||
21 | + | ||
22 | + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; | ||
23 | + private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; | ||
24 | + | ||
25 | + private static CandidateFinder candidateFinder; | ||
26 | + | ||
27 | + @BeforeClass | ||
28 | + public static void init() { | ||
29 | + candidateFinder = new CandidateFinder(); | ||
30 | + } | ||
31 | + | ||
32 | + @Test | ||
33 | + public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { | ||
34 | + ThriftTextHelper sampleTextHelper = loadSampleTextHelper(); | ||
35 | + Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); | ||
36 | + List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); | ||
37 | + assertEquals(1, candidates.size()); | ||
38 | + | ||
39 | + ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); | ||
40 | + TSentence firstSentence = zeroSubjectCandidate.getPreviousSentence(); | ||
41 | + TSentence secondSentence = zeroSubjectCandidate.getSentence(); | ||
42 | + TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); | ||
43 | + | ||
44 | + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence)); | ||
45 | + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence)); | ||
46 | + assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate)); | ||
47 | + } | ||
48 | + | ||
49 | + private Set<String> loadSampleTextSummarySentenceIds() throws IOException { | ||
50 | + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_SUMMARY_IDS_PATH); | ||
51 | + InputStreamReader reader = new InputStreamReader(stream)) { | ||
52 | + return Sets.newHashSet(IOUtils.readLines(reader)); | ||
53 | + } | ||
54 | + } | ||
55 | + | ||
56 | + private ThriftTextHelper loadSampleTextHelper() throws IOException { | ||
57 | + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | ||
58 | + return new ThriftTextHelper(Utils.loadThrifted(stream)); | ||
59 | + } | ||
60 | + } | ||
61 | +} | ||
0 | \ No newline at end of file | 62 | \ No newline at end of file |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import org.junit.Test; | ||
4 | + | ||
5 | +public class ZeroSubjectInjectorTest { | ||
6 | + | ||
7 | + @Test | ||
8 | + public void shouldInit() throws Exception { | ||
9 | + ZeroSubjectInjector injector = new ZeroSubjectInjector(); | ||
10 | + } | ||
11 | +} | ||
0 | \ No newline at end of file | 12 | \ No newline at end of file |
pom.xml
@@ -16,6 +16,7 @@ | @@ -16,6 +16,7 @@ | ||
16 | <module>nicolas-model</module> | 16 | <module>nicolas-model</module> |
17 | <module>nicolas-train</module> | 17 | <module>nicolas-train</module> |
18 | <module>nicolas-zero</module> | 18 | <module>nicolas-zero</module> |
19 | + <module>nicolas-common</module> | ||
19 | </modules> | 20 | </modules> |
20 | 21 | ||
21 | <properties> | 22 | <properties> |
@@ -30,6 +31,8 @@ | @@ -30,6 +31,8 @@ | ||
30 | <weka-dev.version>3.9.0</weka-dev.version> | 31 | <weka-dev.version>3.9.0</weka-dev.version> |
31 | <commons-lang3.version>3.5</commons-lang3.version> | 32 | <commons-lang3.version>3.5</commons-lang3.version> |
32 | <commons-io.version>2.5</commons-io.version> | 33 | <commons-io.version>2.5</commons-io.version> |
34 | + <slf4j-api.version>1.7.12</slf4j-api.version> | ||
35 | + <junit.version>4.12</junit.version> | ||
33 | </properties> | 36 | </properties> |
34 | 37 | ||
35 | <prerequisites> | 38 | <prerequisites> |
@@ -46,13 +49,20 @@ | @@ -46,13 +49,20 @@ | ||
46 | 49 | ||
47 | <dependencyManagement> | 50 | <dependencyManagement> |
48 | <dependencies> | 51 | <dependencies> |
52 | + <!-- project --> | ||
49 | <dependency> | 53 | <dependency> |
50 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 54 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
51 | <artifactId>nicolas-model</artifactId> | 55 | <artifactId>nicolas-model</artifactId> |
52 | <version>${project.version}</version> | 56 | <version>${project.version}</version> |
53 | <scope>runtime</scope> | 57 | <scope>runtime</scope> |
54 | </dependency> | 58 | </dependency> |
59 | + <dependency> | ||
60 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
61 | + <artifactId>nicolas-common</artifactId> | ||
62 | + <version>${project.version}</version> | ||
63 | + </dependency> | ||
55 | 64 | ||
65 | + <!-- internal --> | ||
56 | <dependency> | 66 | <dependency> |
57 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 67 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
58 | <artifactId>pscapi</artifactId> | 68 | <artifactId>pscapi</artifactId> |
@@ -64,6 +74,7 @@ | @@ -64,6 +74,7 @@ | ||
64 | <version>${utils.version}</version> | 74 | <version>${utils.version}</version> |
65 | </dependency> | 75 | </dependency> |
66 | 76 | ||
77 | + <!-- third party --> | ||
67 | <dependency> | 78 | <dependency> |
68 | <groupId>org.apache.commons</groupId> | 79 | <groupId>org.apache.commons</groupId> |
69 | <artifactId>commons-csv</artifactId> | 80 | <artifactId>commons-csv</artifactId> |
@@ -89,6 +100,20 @@ | @@ -89,6 +100,20 @@ | ||
89 | <artifactId>commons-io</artifactId> | 100 | <artifactId>commons-io</artifactId> |
90 | <version>${commons-io.version}</version> | 101 | <version>${commons-io.version}</version> |
91 | </dependency> | 102 | </dependency> |
103 | + | ||
104 | + <!-- logging --> | ||
105 | + <dependency> | ||
106 | + <groupId>org.slf4j</groupId> | ||
107 | + <artifactId>slf4j-api</artifactId> | ||
108 | + <version>${slf4j-api.version}</version> | ||
109 | + </dependency> | ||
110 | + | ||
111 | + <!-- test --> | ||
112 | + <dependency> | ||
113 | + <groupId>junit</groupId> | ||
114 | + <artifactId>junit</artifactId> | ||
115 | + <version>${junit.version}</version> | ||
116 | + </dependency> | ||
92 | </dependencies> | 117 | </dependencies> |
93 | </dependencyManagement> | 118 | </dependencyManagement> |
94 | 119 |