Commit eac83d24d0d460300033f920fafbc7fa3d5ecdbb
1 parent
b41f6532
refactor
Showing
24 changed files
with
639 additions
and
169 deletions
nicolas-common/pom.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <modelVersion>4.0.0</modelVersion> | |
6 | + <parent> | |
7 | + <artifactId>nicolas-container</artifactId> | |
8 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | + <version>1.0-SNAPSHOT</version> | |
10 | + </parent> | |
11 | + | |
12 | + <artifactId>nicolas-common</artifactId> | |
13 | + | |
14 | + <dependencies> | |
15 | + <!-- internal --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>pscapi</artifactId> | |
19 | + </dependency> | |
20 | + <dependency> | |
21 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
22 | + <artifactId>utils</artifactId> | |
23 | + </dependency> | |
24 | + | |
25 | + <!-- third party --> | |
26 | + <dependency> | |
27 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
28 | + <artifactId>weka-dev</artifactId> | |
29 | + </dependency> | |
30 | + | |
31 | + <!-- logging --> | |
32 | + <dependency> | |
33 | + <groupId>org.slf4j</groupId> | |
34 | + <artifactId>slf4j-api</artifactId> | |
35 | + </dependency> | |
36 | + | |
37 | + </dependencies> | |
38 | + | |
39 | +</project> | |
0 | 40 | \ No newline at end of file |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | |
2 | 2 | |
3 | -import com.google.common.base.Charsets; | |
4 | 3 | import com.google.common.collect.Lists; |
5 | 4 | import com.google.common.collect.Maps; |
6 | 5 | import com.google.common.collect.Sets; |
7 | -import com.google.common.io.Files; | |
8 | 6 | import org.slf4j.Logger; |
9 | 7 | import org.slf4j.LoggerFactory; |
10 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
14 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
15 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | |
16 | -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
17 | 12 | import weka.classifiers.Classifier; |
18 | 13 | import weka.core.Attribute; |
19 | -import weka.core.DenseInstance; | |
20 | -import weka.core.Instance; | |
21 | 14 | import weka.core.Instances; |
22 | 15 | |
23 | -import java.io.File; | |
24 | -import java.io.FileInputStream; | |
25 | -import java.io.IOException; | |
26 | -import java.io.ObjectInputStream; | |
16 | +import java.io.*; | |
27 | 17 | import java.util.*; |
28 | 18 | import java.util.function.Function; |
29 | 19 | import java.util.stream.Collectors; |
30 | 20 | |
31 | -import static java.util.stream.Collectors.toList; | |
32 | - | |
33 | 21 | public class Utils { |
34 | 22 | |
35 | 23 | private static final Logger LOG = LoggerFactory.getLogger(Utils.class); |
36 | 24 | |
37 | 25 | private static final String DATASET_NAME = "Dataset"; |
38 | 26 | |
39 | - public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | |
40 | - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
41 | - Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | |
42 | - | |
43 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | |
44 | - Map<TMention, Instance> mention2instance = Maps.newHashMap(); | |
45 | - for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | |
46 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
47 | - Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | |
48 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | |
49 | - instance.setValue(attribute, mentionFeatures.get(attribute)); | |
50 | - } | |
51 | - mention2instance.put(tMention, instance); | |
52 | - } | |
53 | - return mention2instance; | |
54 | - } | |
55 | - | |
56 | - public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | |
57 | - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
58 | - Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | |
59 | - | |
60 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | |
61 | - Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | |
62 | - for (TSentence sentence : sentences) { | |
63 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
64 | - Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | |
65 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | |
66 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
67 | - } | |
68 | - sentence2instance.put(sentence, instance); | |
69 | - } | |
70 | - return sentence2instance; | |
71 | - } | |
72 | - | |
73 | 27 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
74 | 28 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
75 | 29 | instances.setClassIndex(0); |
... | ... | @@ -97,7 +51,16 @@ public class Utils { |
97 | 51 | |
98 | 52 | |
99 | 53 | public static TText loadThrifted(File originalFile) { |
100 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) { | |
54 | + try (FileInputStream inputStream = new FileInputStream(originalFile)) { | |
55 | + return loadThrifted(inputStream); | |
56 | + } catch (IOException e) { | |
57 | + LOG.error("Error reading serialized file: " + e); | |
58 | + return null; | |
59 | + } | |
60 | + } | |
61 | + | |
62 | + public static TText loadThrifted(InputStream stream) { | |
63 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | |
101 | 64 | return (TText) ois.readObject(); |
102 | 65 | } catch (ClassNotFoundException | IOException e) { |
103 | 66 | LOG.error("Error reading serialized file: " + e); |
... | ... | @@ -188,13 +151,5 @@ public class Utils { |
188 | 151 | return sb.toString().trim(); |
189 | 152 | } |
190 | 153 | |
191 | - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | |
192 | - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | |
193 | 154 | |
194 | - MentionScorer scorer = new MentionScorer(); | |
195 | - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | |
196 | - | |
197 | - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | |
198 | - return mention2score.keySet(); | |
199 | - } | |
200 | 155 | } |
201 | 156 | \ No newline at end of file |
... | ... |
nicolas-core/pom.xml
... | ... | @@ -12,10 +12,14 @@ |
12 | 12 | <artifactId>nicolas</artifactId> |
13 | 13 | |
14 | 14 | <dependencies> |
15 | + <!-- project --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>nicolas-common</artifactId> | |
19 | + </dependency> | |
15 | 20 | <dependency> |
16 | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
17 | 22 | <artifactId>nicolas-model</artifactId> |
18 | - <scope>runtime</scope> | |
19 | 23 | </dependency> |
20 | 24 | |
21 | 25 | <dependency> |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -6,6 +6,7 @@ import com.google.common.collect.Sets; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
10 | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
11 | 12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
... | ... | @@ -53,7 +54,7 @@ public class Nicolas { |
53 | 54 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
54 | 55 | |
55 | 56 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
56 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
57 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
57 | 58 | |
58 | 59 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
59 | 60 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import com.google.common.base.Charsets; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.io.Files; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
14 | +import weka.core.Attribute; | |
15 | +import weka.core.DenseInstance; | |
16 | +import weka.core.Instance; | |
17 | + | |
18 | +import java.io.File; | |
19 | +import java.io.IOException; | |
20 | +import java.util.List; | |
21 | +import java.util.Map; | |
22 | +import java.util.Set; | |
23 | + | |
24 | +import static java.util.stream.Collectors.toList; | |
25 | + | |
26 | +public class ThriftUtils { | |
27 | + | |
28 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | |
29 | + | |
30 | + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | |
31 | + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | |
32 | + | |
33 | + MentionScorer scorer = new MentionScorer(); | |
34 | + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | |
35 | + | |
36 | + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | |
37 | + return mention2score.keySet(); | |
38 | + } | |
39 | + | |
40 | + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | |
41 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
42 | + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | |
43 | + | |
44 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | |
45 | + Map<TMention, Instance> mention2instance = Maps.newHashMap(); | |
46 | + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | |
47 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
48 | + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); | |
49 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
50 | + instance.setValue(attribute, mentionFeatures.get(attribute)); | |
51 | + } | |
52 | + mention2instance.put(tMention, instance); | |
53 | + } | |
54 | + return mention2instance; | |
55 | + } | |
56 | + | |
57 | + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { | |
58 | + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
59 | + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | |
60 | + | |
61 | + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | |
62 | + Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | |
63 | + for (TSentence sentence : sentences) { | |
64 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
65 | + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); | |
66 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
67 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
68 | + } | |
69 | + sentence2instance.put(sentence, instance); | |
70 | + } | |
71 | + return sentence2instance; | |
72 | + } | |
73 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
... | ... | @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
13 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
14 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
15 | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
... | ... | @@ -85,7 +86,7 @@ public class ApplyModel2 { |
85 | 86 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
86 | 87 | |
87 | 88 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
88 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
89 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
89 | 90 | |
90 | 91 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
91 | 92 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features; |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import com.google.common.collect.Sets; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
7 | 7 | |
8 | 8 | import java.util.List; |
9 | 9 | import java.util.Map; |
... | ... | @@ -14,9 +14,7 @@ import java.util.stream.Collectors; |
14 | 14 | import static java.util.stream.Collectors.toList; |
15 | 15 | import static java.util.stream.Collectors.toMap; |
16 | 16 | |
17 | -/** | |
18 | - * Created by me2 on 04.04.16. | |
19 | - */ | |
17 | + | |
20 | 18 | public class FeatureHelper { |
21 | 19 | |
22 | 20 | private final List<TMention> mentions; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... | ... | @@ -5,7 +5,8 @@ import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | 10 | import weka.classifiers.Classifier; |
10 | 11 | import weka.core.Instance; |
11 | 12 | import weka.core.Instances; |
... | ... | @@ -21,7 +22,7 @@ public class MentionModel { |
21 | 22 | Set<TMention> goodMentions = Sets.newHashSet(); |
22 | 23 | |
23 | 24 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
24 | - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor); | |
25 | + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor); | |
25 | 26 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
26 | 27 | Instance instance = entry.getValue(); |
27 | 28 | instance.setDataset(instances); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
... | ... | @@ -6,9 +6,8 @@ import com.google.common.collect.Multiset; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | 10 | |
11 | -import java.util.Collection; | |
12 | 11 | import java.util.List; |
13 | 12 | import java.util.Map; |
14 | 13 | import java.util.stream.Collectors; |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
... | ... | @@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
12 | 13 | import weka.core.Instance; |
13 | 14 | import weka.core.Instances; |
14 | 15 | import weka.core.converters.ArffSaver; |
... | ... | @@ -45,7 +46,7 @@ public class PrepareTrainingData { |
45 | 46 | continue; |
46 | 47 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); |
47 | 48 | |
48 | - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
49 | + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
49 | 50 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
50 | 51 | TMention mention = entry.getKey(); |
51 | 52 | Instance instance = entry.getValue(); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
... | ... | @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
13 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
14 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
15 | 16 | import weka.classifiers.Classifier; |
... | ... | @@ -58,7 +59,7 @@ public class PrepareTrainingData { |
58 | 59 | // Set<TMention> goodMentions |
59 | 60 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); |
60 | 61 | |
61 | - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
62 | + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
62 | 63 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
63 | 64 | TSentence sentence = entry.getKey(); |
64 | 65 | Instance instance = entry.getValue(); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
... | ... | @@ -6,7 +6,7 @@ import com.google.common.collect.Multiset; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | 10 | |
11 | 11 | import java.util.List; |
12 | 12 | import java.util.Map; |
... | ... |
nicolas-train/pom.xml
... | ... | @@ -11,4 +11,21 @@ |
11 | 11 | |
12 | 12 | <artifactId>nicolas-train</artifactId> |
13 | 13 | |
14 | + <dependencies> | |
15 | + <!-- internal --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>pscapi</artifactId> | |
19 | + </dependency> | |
20 | + <dependency> | |
21 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
22 | + <artifactId>utils</artifactId> | |
23 | + </dependency> | |
24 | + | |
25 | + <!-- logging --> | |
26 | + <dependency> | |
27 | + <groupId>org.slf4j</groupId> | |
28 | + <artifactId>slf4j-api</artifactId> | |
29 | + </dependency> | |
30 | + </dependencies> | |
14 | 31 | </project> |
15 | 32 | \ No newline at end of file |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java
0 → 100644
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | |
2 | + | |
3 | +import org.apache.thrift.TException; | |
4 | +import org.apache.thrift.protocol.TBinaryProtocol; | |
5 | +import org.apache.thrift.protocol.TProtocol; | |
6 | +import org.apache.thrift.transport.TSocket; | |
7 | +import org.apache.thrift.transport.TTransport; | |
8 | +import org.slf4j.Logger; | |
9 | +import org.slf4j.LoggerFactory; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.Multiservice; | |
11 | +import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest; | |
12 | +import pl.waw.ipipan.zil.multiservice.thrift.RequestPart; | |
13 | +import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus; | |
14 | +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | |
15 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | |
16 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
17 | + | |
18 | +import java.util.ArrayList; | |
19 | +import java.util.HashMap; | |
20 | +import java.util.List; | |
21 | +import java.util.Map; | |
22 | + | |
23 | +public class MultiserviceProxy { | |
24 | + | |
25 | + private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class); | |
26 | + | |
27 | + private int port; | |
28 | + private String host; | |
29 | + | |
30 | + public MultiserviceProxy(String host, int port) { | |
31 | + this.host = host; | |
32 | + this.port = port; | |
33 | + LOG.info("Multiservice at " + host + ":" + port); | |
34 | + } | |
35 | + | |
36 | + public TText process(String text, List<String> services) throws Exception { | |
37 | + List<Map<String, String>> options = new ArrayList<>(); | |
38 | + for (int i = 0; i < services.size(); i++) | |
39 | + options.add(new HashMap<>()); | |
40 | + return process(text, "", services, options); | |
41 | + } | |
42 | + | |
43 | + public TText process(String text, String title, List<String> services, List<Map<String, String>> options) | |
44 | + throws Exception { | |
45 | + TTransport transport = new TSocket(host, port); | |
46 | + ObjectRequest objectRequest = createRequest(text, title, services, options); | |
47 | + | |
48 | + try { | |
49 | + transport.open(); | |
50 | + | |
51 | + TProtocol protocol = new TBinaryProtocol(transport); | |
52 | + Multiservice.Client client = new Multiservice.Client(protocol); | |
53 | + | |
54 | + LOG.debug("Sending Multservice request..."); | |
55 | + TText responseText = request(objectRequest, client); | |
56 | + LOG.debug("...done"); | |
57 | + | |
58 | + return responseText; | |
59 | + | |
60 | + } catch (TException e) { | |
61 | + LOG.error("Error processing request:" + e); | |
62 | + throw new Exception(e); | |
63 | + | |
64 | + } finally { | |
65 | + transport.close(); | |
66 | + } | |
67 | + } | |
68 | + | |
69 | + private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException { | |
70 | + | |
71 | + String requestToken = client.putObjectRequest(objectRequest); | |
72 | + while (true) { | |
73 | + RequestStatus status = client.getRequestStatus(requestToken); | |
74 | + if (RequestStatus.DONE.equals(status)) { | |
75 | + TText result = client.getResultObject(requestToken); | |
76 | + return result; | |
77 | + } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { | |
78 | + try { | |
79 | + MultiserviceException exception = client.getException(requestToken); | |
80 | + throw exception; | |
81 | + } catch (TException e) { | |
82 | + throw e; | |
83 | + } | |
84 | + } | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + private ObjectRequest createRequest(String textBody, String textTitle, List<String> services, | |
89 | + List<Map<String, String>> options) { | |
90 | + TText text = new TText(); | |
91 | + | |
92 | + TParagraph par = new TParagraph(); | |
93 | + par.setText(textTitle); | |
94 | + text.addToParagraphs(par); | |
95 | + | |
96 | + for (String p : textBody.split("\n\n")) { | |
97 | + par = new TParagraph(); | |
98 | + par.setText(p); | |
99 | + text.addToParagraphs(par); | |
100 | + } | |
101 | + | |
102 | + List<RequestPart> processingChain = new ArrayList<>(); | |
103 | + int i = 0; | |
104 | + for (String serviceName : services) | |
105 | + processingChain.add(new RequestPart(serviceName, options.get(i++))); | |
106 | + | |
107 | + return new ObjectRequest(text, processingChain); | |
108 | + } | |
109 | + | |
110 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | |
2 | + | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
6 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | |
7 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | |
8 | + | |
9 | +import java.io.File; | |
10 | +import java.io.FileOutputStream; | |
11 | +import java.io.IOException; | |
12 | +import java.io.ObjectOutputStream; | |
13 | +import java.util.Arrays; | |
14 | +import java.util.List; | |
15 | + | |
16 | +public class NLPProcess { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class); | |
19 | + | |
20 | + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", | |
21 | + "Bartek"); | |
22 | + private static final int PORT = 20000; | |
23 | + private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; | |
24 | + | |
25 | + private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); | |
26 | + | |
27 | + private NLPProcess() { | |
28 | + } | |
29 | + | |
30 | + public static void main(String[] args) { | |
31 | + if (args.length != 2) { | |
32 | + LOG.error("Wrong usage! Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | |
33 | + return; | |
34 | + } | |
35 | + File corpusDir = new File(args[0]); | |
36 | + if (!corpusDir.isDirectory()) { | |
37 | + LOG.error("Corpus directory does not exist: " + corpusDir); | |
38 | + return; | |
39 | + } | |
40 | + File targetDir = new File(args[1]); | |
41 | + if (!targetDir.isDirectory()) { | |
42 | + LOG.error("Target directory does not exist: " + targetDir); | |
43 | + return; | |
44 | + } | |
45 | + | |
46 | + int ok = 0; | |
47 | + int err = 0; | |
48 | + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml")); | |
49 | + Arrays.sort(files); | |
50 | + for (File file : files) { | |
51 | + try { | |
52 | + Text text = PSC_IO.readText(file); | |
53 | + File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin")); | |
54 | + annotateNLP(text, targetFile); | |
55 | + ok++; | |
56 | + } catch (Exception e) { | |
57 | + err++; | |
58 | + LOG.error("Problem with text in " + file + ", " + e); | |
59 | + } | |
60 | + } | |
61 | + LOG.info(ok + " texts processed successfully."); | |
62 | + LOG.info(err + " texts with errors."); | |
63 | + } | |
64 | + | |
65 | + private static void annotateNLP(Text text, File targetFile) throws Exception { | |
66 | + annotate(text.getBody(), targetFile); | |
67 | + } | |
68 | + | |
69 | + private static void annotate(String body, File targetFile) throws Exception { | |
70 | + if (targetFile.exists()) { | |
71 | + LOG.debug("Skipping existing file.."); | |
72 | + return; | |
73 | + } | |
74 | + LOG.info("Processing text into " + targetFile.getPath()); | |
75 | + TText ttext = MSPROXY.process(body, SERVICES); | |
76 | + serialize(ttext, targetFile); | |
77 | + } | |
78 | + | |
79 | + public static void serialize(TText ttext, File targetFile) throws IOException { | |
80 | + try (FileOutputStream fout = new FileOutputStream(targetFile); | |
81 | + ObjectOutputStream oos = new ObjectOutputStream(fout)) { | |
82 | + oos.writeObject(ttext); | |
83 | + } | |
84 | + } | |
85 | + | |
86 | + public static TText annotate(String body) throws Exception { | |
87 | + return MSPROXY.process(body, SERVICES); | |
88 | + } | |
89 | + | |
90 | +} | |
... | ... |
nicolas-zero/pom.xml
... | ... | @@ -11,4 +11,34 @@ |
11 | 11 | |
12 | 12 | <artifactId>nicolas-zero</artifactId> |
13 | 13 | |
14 | + <dependencies> | |
15 | + <!-- project --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>nicolas-common</artifactId> | |
19 | + </dependency> | |
20 | + | |
21 | + <!-- third party --> | |
22 | + <dependency> | |
23 | + <groupId>org.apache.commons</groupId> | |
24 | + <artifactId>commons-csv</artifactId> | |
25 | + </dependency> | |
26 | + <dependency> | |
27 | + <groupId>commons-io</groupId> | |
28 | + <artifactId>commons-io</artifactId> | |
29 | + </dependency> | |
30 | + | |
31 | + <!-- logging --> | |
32 | + <dependency> | |
33 | + <groupId>org.slf4j</groupId> | |
34 | + <artifactId>slf4j-api</artifactId> | |
35 | + </dependency> | |
36 | + | |
37 | + <!-- test --> | |
38 | + <dependency> | |
39 | + <groupId>junit</groupId> | |
40 | + <artifactId>junit</artifactId> | |
41 | + </dependency> | |
42 | + </dependencies> | |
43 | + | |
14 | 44 | </project> |
15 | 45 | \ No newline at end of file |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
... | ... | @@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; |
3 | 3 | import com.google.common.collect.Lists; |
4 | 4 | import com.google.common.collect.Maps; |
5 | 5 | import com.google.common.collect.Sets; |
6 | -import org.apache.commons.csv.CSVFormat; | |
7 | -import org.apache.commons.csv.CSVPrinter; | |
8 | -import org.apache.commons.csv.QuoteMode; | |
9 | -import org.apache.commons.io.IOUtils; | |
10 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.Utils; | |
12 | 7 | |
13 | -import java.io.File; | |
14 | -import java.io.FileReader; | |
15 | -import java.io.FileWriter; | |
16 | -import java.io.IOException; | |
17 | 8 | import java.util.Arrays; |
18 | 9 | import java.util.List; |
19 | 10 | import java.util.Map; |
20 | 11 | import java.util.Set; |
21 | 12 | |
22 | -/** | |
23 | - * Created by me2 on 26.07.16. | |
24 | - */ | |
25 | -public class Zero { | |
13 | +public class CandidateFinder { | |
26 | 14 | |
27 | - private static final String IDS_PATH = "summaries_dev"; | |
28 | - private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; | |
15 | + public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) { | |
16 | + List<ZeroSubjectCandidate> candidates = Lists.newArrayList(); | |
29 | 17 | |
30 | - public static void main(String[] args) throws IOException { | |
31 | - | |
32 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | |
33 | - Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH); | |
34 | - | |
35 | - int mentionCount = 0; | |
36 | - int mentionInNom = 0; | |
37 | - int mentionInNomSequential = 0; | |
38 | - | |
39 | - List<List<Object>> rows = Lists.newArrayList(); | |
40 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
41 | - String textId = entry.getKey(); | |
42 | -// System.out.println(id); | |
43 | - | |
44 | - TText text = entry.getValue(); | |
45 | - List<String> sentenceIds = id2sentIds.get(textId); | |
46 | -// System.out.println(sentenceIds); | |
47 | - | |
48 | - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
49 | - for (TCoreference coreference : text.getCoreferences()) { | |
50 | - for (String mentionId : coreference.getMentionIds()) { | |
51 | - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
52 | - } | |
18 | + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap(); | |
19 | + for (TCoreference coreference : text.getCoreferences()) { | |
20 | + for (String mentionId : coreference.getMentionIds()) { | |
21 | + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds())); | |
53 | 22 | } |
23 | + } | |
54 | 24 | |
55 | - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | |
56 | - TSentence prevSentence = null; | |
57 | - for (TParagraph p : text.getParagraphs()) { | |
58 | - Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences()); | |
59 | - | |
60 | - for (TSentence sentence : p.getSentences()) { | |
61 | - if (!sentenceIds.contains(sentence.getId())) | |
62 | - continue; | |
63 | - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
64 | - | |
65 | - Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
66 | - for (TToken t : sentence.getTokens()) | |
67 | - tokenId2Token.put(t.getId(), t); | |
25 | + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet(); | |
26 | + TSentence prevSentence = null; | |
27 | + for (TParagraph p : text.getParagraphs()) { | |
28 | + for (TSentence sentence : p.getSentences()) { | |
29 | + if (!summarySentenceIds.contains(sentence.getId())) | |
30 | + continue; | |
31 | + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet(); | |
68 | 32 | |
69 | - for (TMention mention : sentence.getMentions()) { | |
70 | - mentionCount++; | |
33 | + Map<String, TToken> tokenId2Token = Maps.newHashMap(); | |
34 | + for (TToken t : sentence.getTokens()) | |
35 | + tokenId2Token.put(t.getId(), t); | |
71 | 36 | |
72 | - for (String tokenId : mention.getHeadIds()) { | |
73 | - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
74 | - if (isInNominative(interp)) { | |
75 | - mentionInNom++; | |
37 | + for (TMention mention : sentence.getMentions()) { | |
76 | 38 | |
77 | - currentSentenceNominativeMentionIds.add(mention.getId()); | |
78 | - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | |
79 | - mentionInNomSequential++; | |
80 | - System.out.println(tMentionStringMap.get(mention) | |
81 | - + "\n\t" + Utils.loadSentence2Orth(prevSentence) | |
82 | - + "\n\t" + Utils.loadSentence2Orth(sentence)); | |
39 | + for (String tokenId : mention.getHeadIds()) { | |
40 | + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation(); | |
41 | + if (isInNominative(interp)) { | |
83 | 42 | |
84 | - List<Object> row = Lists.newArrayList(); | |
85 | - row.add("C"); | |
86 | - row.add(textId); | |
87 | - row.add(tMentionStringMap.get(mention)); | |
88 | - row.add(Utils.loadSentence2Orth(prevSentence)); | |
89 | - row.add(Utils.loadSentence2Orth(sentence)); | |
90 | - rows.add(row); | |
91 | - } | |
92 | - break; | |
43 | + currentSentenceNominativeMentionIds.add(mention.getId()); | |
44 | + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) { | |
45 | + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention); | |
46 | + candidates.add(candidate); | |
93 | 47 | } |
48 | + break; | |
94 | 49 | } |
95 | 50 | } |
96 | - | |
97 | - prevSentence = sentence; | |
98 | - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | |
99 | 51 | } |
100 | - } | |
101 | - } | |
102 | - | |
103 | - System.out.println(mentionCount + " mentions"); | |
104 | - System.out.println(mentionInNom + " mention in nom"); | |
105 | - System.out.println(mentionInNomSequential + " mention in nom with previous in nom"); | |
106 | 52 | |
107 | - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | |
108 | - for (List<Object> row : rows) { | |
109 | - csvPrinter.printRecord(row); | |
53 | + prevSentence = sentence; | |
54 | + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds; | |
110 | 55 | } |
111 | 56 | } |
112 | - | |
57 | + return candidates; | |
113 | 58 | } |
114 | 59 | |
115 | 60 | private static boolean isInNominative(TInterpretation interp) { |
116 | 61 | return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); |
117 | 62 | } |
118 | - | |
119 | - private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException { | |
120 | - Map<String, List<String>> result = Maps.newHashMap(); | |
121 | - for (File f : new File(idsPath).listFiles()) { | |
122 | - String id = f.getName().split("_")[0]; | |
123 | - List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | |
124 | - result.put(id, sentenceIds); | |
125 | - } | |
126 | - return result; | |
127 | - } | |
128 | 63 | } |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.collect.Lists; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import com.google.common.collect.Sets; | |
6 | +import org.apache.commons.csv.CSVFormat; | |
7 | +import org.apache.commons.csv.CSVPrinter; | |
8 | +import org.apache.commons.csv.QuoteMode; | |
9 | +import org.apache.commons.io.IOUtils; | |
10 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
13 | + | |
14 | +import java.io.File; | |
15 | +import java.io.FileReader; | |
16 | +import java.io.FileWriter; | |
17 | +import java.io.IOException; | |
18 | +import java.util.List; | |
19 | +import java.util.Map; | |
20 | +import java.util.Set; | |
21 | + | |
22 | +public class Zero { | |
23 | + | |
24 | + private static final String IDS_PATH = "corpora/summaries_dev"; | |
25 | + private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | |
26 | + | |
27 | + private Zero() { | |
28 | + } | |
29 | + | |
30 | + public static void main(String[] args) throws IOException { | |
31 | + | |
32 | + CandidateFinder candidateFinder = new CandidateFinder(); | |
33 | + | |
34 | + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | |
35 | + Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); | |
36 | + | |
37 | + List<List<Object>> rows = Lists.newArrayList(); | |
38 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
39 | + String textId = entry.getKey(); | |
40 | + | |
41 | + TText text = entry.getValue(); | |
42 | + ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text); | |
43 | + | |
44 | + Set<String> sentenceIds = id2sentIds.get(textId); | |
45 | + | |
46 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds); | |
47 | + | |
48 | + for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) { | |
49 | + List<Object> row = Lists.newArrayList(); | |
50 | + row.add("C"); | |
51 | + row.add(textId); | |
52 | + row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention())); | |
53 | + row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence())); | |
54 | + row.add(thriftTextHelper.getSentenceText(candidate.getSentence())); | |
55 | + rows.add(row); | |
56 | + } | |
57 | + } | |
58 | + | |
59 | + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) { | |
60 | + for (List<Object> row : rows) { | |
61 | + csvPrinter.printRecord(row); | |
62 | + } | |
63 | + } | |
64 | + | |
65 | + } | |
66 | + | |
67 | + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | |
68 | + Map<String, Set<String>> result = Maps.newHashMap(); | |
69 | + for (File f : new File(idsPath).listFiles()) { | |
70 | + String id = f.getName().split("_")[0]; | |
71 | + List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | |
72 | + result.put(id, Sets.newHashSet(sentenceIds)); | |
73 | + } | |
74 | + return result; | |
75 | + } | |
76 | +} | |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
5 | + | |
6 | +public class ZeroSubjectCandidate { | |
7 | + | |
8 | + private final TSentence previousSentence; | |
9 | + private final TSentence sentence; | |
10 | + private final TMention zeroCandidateMention; | |
11 | + | |
12 | + public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) { | |
13 | + this.previousSentence = previousSentence; | |
14 | + this.sentence = sentence; | |
15 | + this.zeroCandidateMention = zeroCandidateMention; | |
16 | + } | |
17 | + | |
18 | + public TSentence getPreviousSentence() { | |
19 | + return previousSentence; | |
20 | + } | |
21 | + | |
22 | + public TSentence getSentence() { | |
23 | + return sentence; | |
24 | + } | |
25 | + | |
26 | + public TMention getZeroCandidateMention() { | |
27 | + return zeroCandidateMention; | |
28 | + } | |
29 | +} | |
... | ... |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
0 → 100644
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.collect.Sets; | |
4 | +import org.apache.commons.io.IOUtils; | |
5 | +import org.junit.BeforeClass; | |
6 | +import org.junit.Test; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
11 | + | |
12 | +import java.io.IOException; | |
13 | +import java.io.InputStream; | |
14 | +import java.io.InputStreamReader; | |
15 | +import java.util.List; | |
16 | +import java.util.Set; | |
17 | + | |
18 | +import static org.junit.Assert.assertEquals; | |
19 | + | |
20 | +public class CandidateFinderTest { | |
21 | + | |
22 | + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; | |
23 | + private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; | |
24 | + | |
25 | + private static CandidateFinder candidateFinder; | |
26 | + | |
27 | + @BeforeClass | |
28 | + public static void init() { | |
29 | + candidateFinder = new CandidateFinder(); | |
30 | + } | |
31 | + | |
32 | + @Test | |
33 | + public void shouldFindZeroSubjectCandidateInSampleText() throws Exception { | |
34 | + ThriftTextHelper sampleTextHelper = loadSampleTextHelper(); | |
35 | + Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds(); | |
36 | + List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds); | |
37 | + assertEquals(1, candidates.size()); | |
38 | + | |
39 | + ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0); | |
40 | + TSentence firstSentence = zeroSubjectCandidate.getPreviousSentence(); | |
41 | + TSentence secondSentence = zeroSubjectCandidate.getSentence(); | |
42 | + TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention(); | |
43 | + | |
44 | + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence)); | |
45 | + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence)); | |
46 | + assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate)); | |
47 | + } | |
48 | + | |
49 | + private Set<String> loadSampleTextSummarySentenceIds() throws IOException { | |
50 | + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_SUMMARY_IDS_PATH); | |
51 | + InputStreamReader reader = new InputStreamReader(stream)) { | |
52 | + return Sets.newHashSet(IOUtils.readLines(reader)); | |
53 | + } | |
54 | + } | |
55 | + | |
56 | + private ThriftTextHelper loadSampleTextHelper() throws IOException { | |
57 | + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | |
58 | + return new ThriftTextHelper(Utils.loadThrifted(stream)); | |
59 | + } | |
60 | + } | |
61 | +} | |
0 | 62 | \ No newline at end of file |
... | ... |
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import org.junit.Test; | |
4 | + | |
5 | +public class ZeroSubjectInjectorTest { | |
6 | + | |
7 | + @Test | |
8 | + public void shouldInit() throws Exception { | |
9 | + ZeroSubjectInjector injector = new ZeroSubjectInjector(); | |
10 | + } | |
11 | +} | |
0 | 12 | \ No newline at end of file |
... | ... |
pom.xml
... | ... | @@ -16,6 +16,7 @@ |
16 | 16 | <module>nicolas-model</module> |
17 | 17 | <module>nicolas-train</module> |
18 | 18 | <module>nicolas-zero</module> |
19 | + <module>nicolas-common</module> | |
19 | 20 | </modules> |
20 | 21 | |
21 | 22 | <properties> |
... | ... | @@ -30,6 +31,8 @@ |
30 | 31 | <weka-dev.version>3.9.0</weka-dev.version> |
31 | 32 | <commons-lang3.version>3.5</commons-lang3.version> |
32 | 33 | <commons-io.version>2.5</commons-io.version> |
34 | + <slf4j-api.version>1.7.12</slf4j-api.version> | |
35 | + <junit.version>4.12</junit.version> | |
33 | 36 | </properties> |
34 | 37 | |
35 | 38 | <prerequisites> |
... | ... | @@ -46,13 +49,20 @@ |
46 | 49 | |
47 | 50 | <dependencyManagement> |
48 | 51 | <dependencies> |
52 | + <!-- project --> | |
49 | 53 | <dependency> |
50 | 54 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
51 | 55 | <artifactId>nicolas-model</artifactId> |
52 | 56 | <version>${project.version}</version> |
53 | 57 | <scope>runtime</scope> |
54 | 58 | </dependency> |
59 | + <dependency> | |
60 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
61 | + <artifactId>nicolas-common</artifactId> | |
62 | + <version>${project.version}</version> | |
63 | + </dependency> | |
55 | 64 | |
65 | + <!-- internal --> | |
56 | 66 | <dependency> |
57 | 67 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
58 | 68 | <artifactId>pscapi</artifactId> |
... | ... | @@ -64,6 +74,7 @@ |
64 | 74 | <version>${utils.version}</version> |
65 | 75 | </dependency> |
66 | 76 | |
77 | + <!-- third party --> | |
67 | 78 | <dependency> |
68 | 79 | <groupId>org.apache.commons</groupId> |
69 | 80 | <artifactId>commons-csv</artifactId> |
... | ... | @@ -89,6 +100,20 @@ |
89 | 100 | <artifactId>commons-io</artifactId> |
90 | 101 | <version>${commons-io.version}</version> |
91 | 102 | </dependency> |
103 | + | |
104 | + <!-- logging --> | |
105 | + <dependency> | |
106 | + <groupId>org.slf4j</groupId> | |
107 | + <artifactId>slf4j-api</artifactId> | |
108 | + <version>${slf4j-api.version}</version> | |
109 | + </dependency> | |
110 | + | |
111 | + <!-- test --> | |
<dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>${junit.version}</version>
    <!-- managed as test scope so modules inherit it and JUnit stays off their compile/runtime classpath -->
    <scope>test</scope>
</dependency>
92 | 117 | </dependencies> |
93 | 118 | </dependencyManagement> |
94 | 119 | |
... | ... |