Commit eac83d24d0d460300033f920fafbc7fa3d5ecdbb

Authored by Mateusz Kopeć
1 parent b41f6532

refactor

Showing 24 changed files with 639 additions and 169 deletions
nicolas-common/pom.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-common</artifactId>
  13 +
  14 + <dependencies>
  15 + <!-- internal -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>pscapi</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  22 + <artifactId>utils</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- third party -->
  26 + <dependency>
  27 + <groupId>nz.ac.waikato.cms.weka</groupId>
  28 + <artifactId>weka-dev</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- logging -->
  32 + <dependency>
  33 + <groupId>org.slf4j</groupId>
  34 + <artifactId>slf4j-api</artifactId>
  35 + </dependency>
  36 +
  37 + </dependencies>
  38 +
  39 +</project>
0 40 \ No newline at end of file
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
1   -package pl.waw.ipipan.zil.summ.nicolas;
  1 +package pl.waw.ipipan.zil.summ.nicolas.common;
2 2  
3   -import com.google.common.base.Charsets;
4 3 import com.google.common.collect.Lists;
5 4 import com.google.common.collect.Maps;
6 5 import com.google.common.collect.Sets;
7   -import com.google.common.io.Files;
8 6 import org.slf4j.Logger;
9 7 import org.slf4j.LoggerFactory;
10 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
13 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
14   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
16   -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
17 12 import weka.classifiers.Classifier;
18 13 import weka.core.Attribute;
19   -import weka.core.DenseInstance;
20   -import weka.core.Instance;
21 14 import weka.core.Instances;
22 15  
23   -import java.io.File;
24   -import java.io.FileInputStream;
25   -import java.io.IOException;
26   -import java.io.ObjectInputStream;
  16 +import java.io.*;
27 17 import java.util.*;
28 18 import java.util.function.Function;
29 19 import java.util.stream.Collectors;
30 20  
31   -import static java.util.stream.Collectors.toList;
32   -
33 21 public class Utils {
34 22  
35 23 private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
36 24  
37 25 private static final String DATASET_NAME = "Dataset";
38 26  
39   - public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
40   - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
41   - Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
42   -
43   - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
44   - Map<TMention, Instance> mention2instance = Maps.newHashMap();
45   - for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
46   - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
47   - Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
48   - for (Attribute attribute : featureExtractor.getAttributesList()) {
49   - instance.setValue(attribute, mentionFeatures.get(attribute));
50   - }
51   - mention2instance.put(tMention, instance);
52   - }
53   - return mention2instance;
54   - }
55   -
56   - public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
57   - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
58   - Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
59   -
60   - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
61   - Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
62   - for (TSentence sentence : sentences) {
63   - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
64   - Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
65   - for (Attribute attribute : featureExtractor.getAttributesList()) {
66   - instance.setValue(attribute, sentenceFeatures.get(attribute));
67   - }
68   - sentence2instance.put(sentence, instance);
69   - }
70   - return sentence2instance;
71   - }
72   -
73 27 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
74 28 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
75 29 instances.setClassIndex(0);
... ... @@ -97,7 +51,16 @@ public class Utils {
97 51  
98 52  
99 53 public static TText loadThrifted(File originalFile) {
100   - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) {
  54 + try (FileInputStream inputStream = new FileInputStream(originalFile)) {
  55 + return loadThrifted(inputStream);
  56 + } catch (IOException e) {
  57 + LOG.error("Error reading serialized file: " + e);
  58 + return null;
  59 + }
  60 + }
  61 +
  62 + public static TText loadThrifted(InputStream stream) {
  63 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
101 64 return (TText) ois.readObject();
102 65 } catch (ClassNotFoundException | IOException e) {
103 66 LOG.error("Error reading serialized file: " + e);
... ... @@ -188,13 +151,5 @@ public class Utils {
188 151 return sb.toString().trim();
189 152 }
190 153  
191   - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
192   - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
193 154  
194   - MentionScorer scorer = new MentionScorer();
195   - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
196   -
197   - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
198   - return mention2score.keySet();
199   - }
200 155 }
201 156 \ No newline at end of file
... ...
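Note: the new loadThrifted(InputStream) overload above relies on a VersionIgnoringObjectInputStream class that is not included in this diff. Below is a minimal sketch of what such a stream typically looks like (an assumption, not the project's actual implementation): it overrides readClassDescriptor so that serialVersionUID mismatches in previously serialized TText files do not break deserialization.

package pl.waw.ipipan.zil.summ.nicolas.common;

import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectStreamClass;

// Hypothetical sketch: an ObjectInputStream that prefers the locally loaded class
// descriptor when serialVersionUIDs differ from those recorded in the stream.
public class VersionIgnoringObjectInputStream extends ObjectInputStream {

    public VersionIgnoringObjectInputStream(InputStream in) throws IOException {
        super(in);
    }

    @Override
    protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException {
        ObjectStreamClass streamDescriptor = super.readClassDescriptor();
        try {
            Class<?> localClass = Class.forName(streamDescriptor.getName());
            ObjectStreamClass localDescriptor = ObjectStreamClass.lookup(localClass);
            // When the serialVersionUIDs differ, trust the local class so that
            // previously serialized files remain readable.
            if (localDescriptor != null
                    && localDescriptor.getSerialVersionUID() != streamDescriptor.getSerialVersionUID()) {
                return localDescriptor;
            }
        } catch (ClassNotFoundException e) {
            // Class not present locally; fall back to the descriptor from the stream.
        }
        return streamDescriptor;
    }
}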
nicolas-core/pom.xml
... ... @@ -12,10 +12,14 @@
12 12 <artifactId>nicolas</artifactId>
13 13  
14 14 <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
15 20 <dependency>
16 21 <groupId>pl.waw.ipipan.zil.summ</groupId>
17 22 <artifactId>nicolas-model</artifactId>
18   - <scope>runtime</scope>
19 23 </dependency>
20 24  
21 25 <dependency>
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... ... @@ -6,6 +6,7 @@ import com.google.common.collect.Sets;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
9 10 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
10 11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
11 12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
... ... @@ -53,7 +54,7 @@ public class Nicolas {
53 54 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
54 55  
55 56 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
56   - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
  57 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
57 58  
58 59 Map<TSentence, Double> sentence2score = Maps.newHashMap();
59 60 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import com.google.common.base.Charsets;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.io.Files;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
  13 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  14 +import weka.core.Attribute;
  15 +import weka.core.DenseInstance;
  16 +import weka.core.Instance;
  17 +
  18 +import java.io.File;
  19 +import java.io.IOException;
  20 +import java.util.List;
  21 +import java.util.Map;
  22 +import java.util.Set;
  23 +
  24 +import static java.util.stream.Collectors.toList;
  25 +
  26 +public class ThriftUtils {
  27 +
  28 + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class);
  29 +
  30 + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
  31 + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
  32 +
  33 + MentionScorer scorer = new MentionScorer();
  34 + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
  35 +
  36 + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
  37 + return mention2score.keySet();
  38 + }
  39 +
  40 + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
  41 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  42 + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
  43 +
  44 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
  45 + Map<TMention, Instance> mention2instance = Maps.newHashMap();
  46 + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
  47 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  48 + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
  49 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  50 + instance.setValue(attribute, mentionFeatures.get(attribute));
  51 + }
  52 + mention2instance.put(tMention, instance);
  53 + }
  54 + return mention2instance;
  55 + }
  56 +
  57 + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
  58 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  59 + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
  60 +
  61 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
  62 + Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
  63 + for (TSentence sentence : sentences) {
  64 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  65 + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
  66 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  67 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  68 + }
  69 + sentence2instance.put(sentence, instance);
  70 + }
  71 + return sentence2instance;
  72 + }
  73 +}
... ...
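The new ThriftUtils class centralizes the Weka instance extraction that previously lived in Utils. A usage sketch follows (assumed, mirroring the call pattern visible in MentionModel and Nicolas elsewhere in this commit; the class name MentionScoringExample and the scoreMentions method are illustrative only).

package pl.waw.ipipan.zil.summ.nicolas;

import com.google.common.collect.Maps;
import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
import weka.classifiers.Classifier;
import weka.core.Instance;
import weka.core.Instances;

import java.util.Map;

public class MentionScoringExample {

    public static Map<TMention, Double> scoreMentions(TText text, Classifier classifier,
                                                      MentionFeatureExtractor featureExtractor) throws Exception {
        // A dataset with the same attribute list is needed so Weka knows each instance's schema.
        Instances dataset = Utils.createNewInstances(featureExtractor.getAttributesList());
        Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor);

        Map<TMention, Double> mention2score = Maps.newHashMap();
        for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
            Instance instance = entry.getValue();
            instance.setDataset(dataset); // attach the schema before classification
            mention2score.put(entry.getKey(), classifier.classifyInstance(instance));
        }
        return mention2score;
    }
}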
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
... ... @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 11 import pl.waw.ipipan.zil.summ.nicolas.Constants;
12   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
14 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
15 16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
... ... @@ -85,7 +86,7 @@ public class ApplyModel2 {
85 86 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
86 87  
87 88 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
88   - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
  89 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
89 90  
90 91 Map<TSentence, Double> sentence2score = Maps.newHashMap();
91 92 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... ... @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features;
3 3 import com.google.common.collect.Maps;
4 4 import com.google.common.collect.Sets;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
7 7  
8 8 import java.util.List;
9 9 import java.util.Map;
... ... @@ -14,9 +14,7 @@ import java.util.stream.Collectors;
14 14 import static java.util.stream.Collectors.toList;
15 15 import static java.util.stream.Collectors.toMap;
16 16  
17   -/**
18   - * Created by me2 on 04.04.16.
19   - */
  17 +
20 18 public class FeatureHelper {
21 19  
22 20 private final List<TMention> mentions;
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... ... @@ -5,7 +5,8 @@ import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  8 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
9 10 import weka.classifiers.Classifier;
10 11 import weka.core.Instance;
11 12 import weka.core.Instances;
... ... @@ -21,7 +22,7 @@ public class MentionModel {
21 22 Set<TMention> goodMentions = Sets.newHashSet();
22 23  
23 24 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
24   - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor);
  25 + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor);
25 26 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
26 27 Instance instance = entry.getValue();
27 28 instance.setDataset(instances);
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
... ... @@ -6,9 +6,8 @@ import com.google.common.collect.Multiset;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 10  
11   -import java.util.Collection;
12 11 import java.util.List;
13 12 import java.util.Map;
14 13 import java.util.stream.Collectors;
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
... ... @@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10 10 import pl.waw.ipipan.zil.summ.nicolas.Constants;
11   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
12 13 import weka.core.Instance;
13 14 import weka.core.Instances;
14 15 import weka.core.converters.ArffSaver;
... ... @@ -45,7 +46,7 @@ public class PrepareTrainingData {
45 46 continue;
46 47 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
47 48  
48   - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor);
  49 + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
49 50 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
50 51 TMention mention = entry.getKey();
51 52 Instance instance = entry.getValue();
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
... ... @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 11 import pl.waw.ipipan.zil.summ.nicolas.Constants;
12   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
14 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
15 16 import weka.classifiers.Classifier;
... ... @@ -58,7 +59,7 @@ public class PrepareTrainingData {
58 59 // Set<TMention> goodMentions
59 60 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true);
60 61  
61   - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
  62 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
62 63 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
63 64 TSentence sentence = entry.getKey();
64 65 Instance instance = entry.getValue();
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
... ... @@ -6,7 +6,7 @@ import com.google.common.collect.Multiset;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 10  
11 11 import java.util.List;
12 12 import java.util.Map;
... ...
nicolas-train/pom.xml
... ... @@ -11,4 +11,21 @@
11 11  
12 12 <artifactId>nicolas-train</artifactId>
13 13  
  14 + <dependencies>
  15 + <!-- internal -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>pscapi</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  22 + <artifactId>utils</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- logging -->
  26 + <dependency>
  27 + <groupId>org.slf4j</groupId>
  28 + <artifactId>slf4j-api</artifactId>
  29 + </dependency>
  30 + </dependencies>
14 31 </project>
15 32 \ No newline at end of file
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +public class Trainer {
  4 +
  5 + public static void main(String[] args) {
  6 +
  7 + }
  8 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
  2 +
  3 +import org.apache.thrift.TException;
  4 +import org.apache.thrift.protocol.TBinaryProtocol;
  5 +import org.apache.thrift.protocol.TProtocol;
  6 +import org.apache.thrift.transport.TSocket;
  7 +import org.apache.thrift.transport.TTransport;
  8 +import org.slf4j.Logger;
  9 +import org.slf4j.LoggerFactory;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.Multiservice;
  11 +import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest;
  12 +import pl.waw.ipipan.zil.multiservice.thrift.RequestPart;
  13 +import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus;
  14 +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
  15 +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
  16 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  17 +
  18 +import java.util.ArrayList;
  19 +import java.util.HashMap;
  20 +import java.util.List;
  21 +import java.util.Map;
  22 +
  23 +public class MultiserviceProxy {
  24 +
  25 + private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class);
  26 +
  27 + private int port;
  28 + private String host;
  29 +
  30 + public MultiserviceProxy(String host, int port) {
  31 + this.host = host;
  32 + this.port = port;
  33 + LOG.info("Multiservice at " + host + ":" + port);
  34 + }
  35 +
  36 + public TText process(String text, List<String> services) throws Exception {
  37 + List<Map<String, String>> options = new ArrayList<>();
  38 + for (int i = 0; i < services.size(); i++)
  39 + options.add(new HashMap<>());
  40 + return process(text, "", services, options);
  41 + }
  42 +
  43 + public TText process(String text, String title, List<String> services, List<Map<String, String>> options)
  44 + throws Exception {
  45 + TTransport transport = new TSocket(host, port);
  46 + ObjectRequest objectRequest = createRequest(text, title, services, options);
  47 +
  48 + try {
  49 + transport.open();
  50 +
  51 + TProtocol protocol = new TBinaryProtocol(transport);
  52 + Multiservice.Client client = new Multiservice.Client(protocol);
  53 +
  54 + LOG.debug("Sending Multservice request...");
  55 + TText responseText = request(objectRequest, client);
  56 + LOG.debug("...done");
  57 +
  58 + return responseText;
  59 +
  60 + } catch (TException e) {
  61 + LOG.error("Error processing request:" + e);
  62 + throw new Exception(e);
  63 +
  64 + } finally {
  65 + transport.close();
  66 + }
  67 + }
  68 +
  69 + private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException {
  70 +
  71 + String requestToken = client.putObjectRequest(objectRequest);
  72 + while (true) {
  73 + RequestStatus status = client.getRequestStatus(requestToken);
  74 + if (RequestStatus.DONE.equals(status)) {
  75 + TText result = client.getResultObject(requestToken);
  76 + return result;
  77 + } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) {
  78 + try {
  79 + MultiserviceException exception = client.getException(requestToken);
  80 + throw exception;
  81 + } catch (TException e) {
  82 + throw e;
  83 + }
  84 + }
  85 + }
  86 + }
  87 +
  88 + private ObjectRequest createRequest(String textBody, String textTitle, List<String> services,
  89 + List<Map<String, String>> options) {
  90 + TText text = new TText();
  91 +
  92 + TParagraph par = new TParagraph();
  93 + par.setText(textTitle);
  94 + text.addToParagraphs(par);
  95 +
  96 + for (String p : textBody.split("\n\n")) {
  97 + par = new TParagraph();
  98 + par.setText(p);
  99 + text.addToParagraphs(par);
  100 + }
  101 +
  102 + List<RequestPart> processingChain = new ArrayList<>();
  103 + int i = 0;
  104 + for (String serviceName : services)
  105 + processingChain.add(new RequestPart(serviceName, options.get(i++)));
  106 +
  107 + return new ObjectRequest(text, processingChain);
  108 + }
  109 +
  110 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  7 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  8 +
  9 +import java.io.File;
  10 +import java.io.FileOutputStream;
  11 +import java.io.IOException;
  12 +import java.io.ObjectOutputStream;
  13 +import java.util.Arrays;
  14 +import java.util.List;
  15 +
  16 +public class NLPProcess {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class);
  19 +
  20 + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector",
  21 + "Bartek");
  22 + private static final int PORT = 20000;
  23 + private static final String HOST = "multiservice.nlp.ipipan.waw.pl";
  24 +
  25 + private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT);
  26 +
  27 + private NLPProcess() {
  28 + }
  29 +
  30 + public static void main(String[] args) {
  31 + if (args.length != 2) {
  32 + LOG.error("Wrong usage! Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir");
  33 + return;
  34 + }
  35 + File corpusDir = new File(args[0]);
  36 + if (!corpusDir.isDirectory()) {
  37 + LOG.error("Corpus directory does not exist: " + corpusDir);
  38 + return;
  39 + }
  40 + File targetDir = new File(args[1]);
  41 + if (!targetDir.isDirectory()) {
  42 + LOG.error("Target directory does not exist: " + targetDir);
  43 + return;
  44 + }
  45 +
  46 + int ok = 0;
  47 + int err = 0;
  48 + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml"));
  49 + Arrays.sort(files);
  50 + for (File file : files) {
  51 + try {
  52 + Text text = PSC_IO.readText(file);
  53 + File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin"));
  54 + annotateNLP(text, targetFile);
  55 + ok++;
  56 + } catch (Exception e) {
  57 + err++;
  58 + LOG.error("Problem with text in " + file + ", " + e);
  59 + }
  60 + }
  61 + LOG.info(ok + " texts processed successfully.");
  62 + LOG.info(err + " texts with errors.");
  63 + }
  64 +
  65 + private static void annotateNLP(Text text, File targetFile) throws Exception {
  66 + annotate(text.getBody(), targetFile);
  67 + }
  68 +
  69 + private static void annotate(String body, File targetFile) throws Exception {
  70 + if (targetFile.exists()) {
  71 + LOG.debug("Skipping existing file..");
  72 + return;
  73 + }
  74 + LOG.info("Processing text into " + targetFile.getPath());
  75 + TText ttext = MSPROXY.process(body, SERVICES);
  76 + serialize(ttext, targetFile);
  77 + }
  78 +
  79 + public static void serialize(TText ttext, File targetFile) throws IOException {
  80 + try (FileOutputStream fout = new FileOutputStream(targetFile);
  81 + ObjectOutputStream oos = new ObjectOutputStream(fout)) {
  82 + oos.writeObject(ttext);
  83 + }
  84 + }
  85 +
  86 + public static TText annotate(String body) throws Exception {
  87 + return MSPROXY.process(body, SERVICES);
  88 + }
  89 +
  90 +}
... ...
nicolas-zero/pom.xml
... ... @@ -11,4 +11,34 @@
11 11  
12 12 <artifactId>nicolas-zero</artifactId>
13 13  
  14 + <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
  20 +
  21 + <!-- third party -->
  22 + <dependency>
  23 + <groupId>org.apache.commons</groupId>
  24 + <artifactId>commons-csv</artifactId>
  25 + </dependency>
  26 + <dependency>
  27 + <groupId>commons-io</groupId>
  28 + <artifactId>commons-io</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- logging -->
  32 + <dependency>
  33 + <groupId>org.slf4j</groupId>
  34 + <artifactId>slf4j-api</artifactId>
  35 + </dependency>
  36 +
  37 + <!-- test -->
  38 + <dependency>
  39 + <groupId>junit</groupId>
  40 + <artifactId>junit</artifactId>
  41 + </dependency>
  42 + </dependencies>
  43 +
14 44 </project>
15 45 \ No newline at end of file
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
... ... @@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 3 import com.google.common.collect.Lists;
4 4 import com.google.common.collect.Maps;
5 5 import com.google.common.collect.Sets;
6   -import org.apache.commons.csv.CSVFormat;
7   -import org.apache.commons.csv.CSVPrinter;
8   -import org.apache.commons.csv.QuoteMode;
9   -import org.apache.commons.io.IOUtils;
10 6 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
11   -import pl.waw.ipipan.zil.summ.nicolas.Utils;
12 7  
13   -import java.io.File;
14   -import java.io.FileReader;
15   -import java.io.FileWriter;
16   -import java.io.IOException;
17 8 import java.util.Arrays;
18 9 import java.util.List;
19 10 import java.util.Map;
20 11 import java.util.Set;
21 12  
22   -/**
23   - * Created by me2 on 26.07.16.
24   - */
25   -public class Zero {
  13 +public class CandidateFinder {
26 14  
27   - private static final String IDS_PATH = "summaries_dev";
28   - private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/";
  15 + public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) {
  16 + List<ZeroSubjectCandidate> candidates = Lists.newArrayList();
29 17  
30   - public static void main(String[] args) throws IOException {
31   -
32   - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
33   - Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH);
34   -
35   - int mentionCount = 0;
36   - int mentionInNom = 0;
37   - int mentionInNomSequential = 0;
38   -
39   - List<List<Object>> rows = Lists.newArrayList();
40   - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
41   - String textId = entry.getKey();
42   -// System.out.println(id);
43   -
44   - TText text = entry.getValue();
45   - List<String> sentenceIds = id2sentIds.get(textId);
46   -// System.out.println(sentenceIds);
47   -
48   - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
49   - for (TCoreference coreference : text.getCoreferences()) {
50   - for (String mentionId : coreference.getMentionIds()) {
51   - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
52   - }
  18 + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
  19 + for (TCoreference coreference : text.getCoreferences()) {
  20 + for (String mentionId : coreference.getMentionIds()) {
  21 + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
53 22 }
  23 + }
54 24  
55   - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();
56   - TSentence prevSentence = null;
57   - for (TParagraph p : text.getParagraphs()) {
58   - Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences());
59   -
60   - for (TSentence sentence : p.getSentences()) {
61   - if (!sentenceIds.contains(sentence.getId()))
62   - continue;
63   - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
64   -
65   - Map<String, TToken> tokenId2Token = Maps.newHashMap();
66   - for (TToken t : sentence.getTokens())
67   - tokenId2Token.put(t.getId(), t);
  25 + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();
  26 + TSentence prevSentence = null;
  27 + for (TParagraph p : text.getParagraphs()) {
  28 + for (TSentence sentence : p.getSentences()) {
  29 + if (!summarySentenceIds.contains(sentence.getId()))
  30 + continue;
  31 + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
68 32  
69   - for (TMention mention : sentence.getMentions()) {
70   - mentionCount++;
  33 + Map<String, TToken> tokenId2Token = Maps.newHashMap();
  34 + for (TToken t : sentence.getTokens())
  35 + tokenId2Token.put(t.getId(), t);
71 36  
72   - for (String tokenId : mention.getHeadIds()) {
73   - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
74   - if (isInNominative(interp)) {
75   - mentionInNom++;
  37 + for (TMention mention : sentence.getMentions()) {
76 38  
77   - currentSentenceNominativeMentionIds.add(mention.getId());
78   - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {
79   - mentionInNomSequential++;
80   - System.out.println(tMentionStringMap.get(mention)
81   - + "\n\t" + Utils.loadSentence2Orth(prevSentence)
82   - + "\n\t" + Utils.loadSentence2Orth(sentence));
  39 + for (String tokenId : mention.getHeadIds()) {
  40 + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
  41 + if (isInNominative(interp)) {
83 42  
84   - List<Object> row = Lists.newArrayList();
85   - row.add("C");
86   - row.add(textId);
87   - row.add(tMentionStringMap.get(mention));
88   - row.add(Utils.loadSentence2Orth(prevSentence));
89   - row.add(Utils.loadSentence2Orth(sentence));
90   - rows.add(row);
91   - }
92   - break;
  43 + currentSentenceNominativeMentionIds.add(mention.getId());
  44 + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {
  45 + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention);
  46 + candidates.add(candidate);
93 47 }
  48 + break;
94 49 }
95 50 }
96   -
97   - prevSentence = sentence;
98   - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;
99 51 }
100   - }
101   - }
102   -
103   - System.out.println(mentionCount + " mentions");
104   - System.out.println(mentionInNom + " mention in nom");
105   - System.out.println(mentionInNomSequential + " mention in nom with previous in nom");
106 52  
107   - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {
108   - for (List<Object> row : rows) {
109   - csvPrinter.printRecord(row);
  53 + prevSentence = sentence;
  54 + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;
110 55 }
111 56 }
112   -
  57 + return candidates;
113 58 }
114 59  
115 60 private static boolean isInNominative(TInterpretation interp) {
116 61 return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom"));
117 62 }
118   -
119   - private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException {
120   - Map<String, List<String>> result = Maps.newHashMap();
121   - for (File f : new File(idsPath).listFiles()) {
122   - String id = f.getName().split("_")[0];
123   - List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
124   - result.put(id, sentenceIds);
125   - }
126   - return result;
127   - }
128 63 }
... ...
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
  6 +import org.apache.commons.csv.CSVFormat;
  7 +import org.apache.commons.csv.CSVPrinter;
  8 +import org.apache.commons.csv.QuoteMode;
  9 +import org.apache.commons.io.IOUtils;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  13 +
  14 +import java.io.File;
  15 +import java.io.FileReader;
  16 +import java.io.FileWriter;
  17 +import java.io.IOException;
  18 +import java.util.List;
  19 +import java.util.Map;
  20 +import java.util.Set;
  21 +
  22 +public class Zero {
  23 +
  24 + private static final String IDS_PATH = "corpora/summaries_dev";
  25 + private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/";
  26 +
  27 + private Zero() {
  28 + }
  29 +
  30 + public static void main(String[] args) throws IOException {
  31 +
  32 + CandidateFinder candidateFinder = new CandidateFinder();
  33 +
  34 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
  35 + Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH);
  36 +
  37 + List<List<Object>> rows = Lists.newArrayList();
  38 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  39 + String textId = entry.getKey();
  40 +
  41 + TText text = entry.getValue();
  42 + ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text);
  43 +
  44 + Set<String> sentenceIds = id2sentIds.get(textId);
  45 +
  46 + List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds);
  47 +
  48 + for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) {
  49 + List<Object> row = Lists.newArrayList();
  50 + row.add("C");
  51 + row.add(textId);
  52 + row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention()));
  53 + row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence()));
  54 + row.add(thriftTextHelper.getSentenceText(candidate.getSentence()));
  55 + rows.add(row);
  56 + }
  57 + }
  58 +
  59 + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {
  60 + for (List<Object> row : rows) {
  61 + csvPrinter.printRecord(row);
  62 + }
  63 + }
  64 +
  65 + }
  66 +
  67 + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
  68 + Map<String, Set<String>> result = Maps.newHashMap();
  69 + for (File f : new File(idsPath).listFiles()) {
  70 + String id = f.getName().split("_")[0];
  71 + List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
  72 + result.put(id, Sets.newHashSet(sentenceIds));
  73 + }
  74 + return result;
  75 + }
  76 +}
... ...
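The new Zero class above (and CandidateFinderTest below) refer to a ThriftTextHelper in nicolas-common that is not part of this diff. Below is a minimal sketch of the assumed helper, presumably built on the Utils.loadSentence2Orth and Utils.loadMention2Orth calls that the old Zero class used directly; treat the field and method bodies as illustrative.

package pl.waw.ipipan.zil.summ.nicolas.common;

import com.google.common.collect.Maps;
import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;

import java.util.Map;

// Hypothetical sketch: a thin wrapper around TText exposing the orthographic text
// of sentences and mentions.
public class ThriftTextHelper {

    private final TText text;
    private final Map<TMention, String> mention2orth = Maps.newHashMap();

    public ThriftTextHelper(TText text) {
        this.text = text;
        for (TParagraph p : text.getParagraphs()) {
            mention2orth.putAll(Utils.loadMention2Orth(p.getSentences()));
        }
    }

    public TText getText() {
        return text;
    }

    public String getSentenceText(TSentence sentence) {
        return Utils.loadSentence2Orth(sentence);
    }

    public String getMentionText(TMention mention) {
        return mention2orth.get(mention);
    }
}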
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +
  6 +public class ZeroSubjectCandidate {
  7 +
  8 + private final TSentence previousSentence;
  9 + private final TSentence sentence;
  10 + private final TMention zeroCandidateMention;
  11 +
  12 + public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) {
  13 + this.previousSentence = previousSentence;
  14 + this.sentence = sentence;
  15 + this.zeroCandidateMention = zeroCandidateMention;
  16 + }
  17 +
  18 + public TSentence getPreviousSentence() {
  19 + return previousSentence;
  20 + }
  21 +
  22 + public TSentence getSentence() {
  23 + return sentence;
  24 + }
  25 +
  26 + public TMention getZeroCandidateMention() {
  27 + return zeroCandidateMention;
  28 + }
  29 +}
... ...
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +
  4 +public class ZeroSubjectInjector {
  5 +}
... ...
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Sets;
  4 +import org.apache.commons.io.IOUtils;
  5 +import org.junit.BeforeClass;
  6 +import org.junit.Test;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  11 +
  12 +import java.io.IOException;
  13 +import java.io.InputStream;
  14 +import java.io.InputStreamReader;
  15 +import java.util.List;
  16 +import java.util.Set;
  17 +
  18 +import static org.junit.Assert.assertEquals;
  19 +
  20 +public class CandidateFinderTest {
  21 +
  22 + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin";
  23 + private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt";
  24 +
  25 + private static CandidateFinder candidateFinder;
  26 +
  27 + @BeforeClass
  28 + public static void init() {
  29 + candidateFinder = new CandidateFinder();
  30 + }
  31 +
  32 + @Test
  33 + public void shouldFindZeroSubjectCandidateInSampleText() throws Exception {
  34 + ThriftTextHelper sampleTextHelper = loadSampleTextHelper();
  35 + Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds();
  36 + List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds);
  37 + assertEquals(1, candidates.size());
  38 +
  39 + ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0);
  40 + TSentence firstSentence = zeroSubjectCandidate.getPreviousSentence();
  41 + TSentence secondSentence = zeroSubjectCandidate.getSentence();
  42 + TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention();
  43 +
  44 + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence));
  45 + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence));
  46 + assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate));
  47 + }
  48 +
  49 + private Set<String> loadSampleTextSummarySentenceIds() throws IOException {
  50 + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_SUMMARY_IDS_PATH);
  51 + InputStreamReader reader = new InputStreamReader(stream)) {
  52 + return Sets.newHashSet(IOUtils.readLines(reader));
  53 + }
  54 + }
  55 +
  56 + private ThriftTextHelper loadSampleTextHelper() throws IOException {
  57 + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
  58 + return new ThriftTextHelper(Utils.loadThrifted(stream));
  59 + }
  60 + }
  61 +}
0 62 \ No newline at end of file
... ...
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import org.junit.Test;
  4 +
  5 +public class ZeroSubjectInjectorTest {
  6 +
  7 + @Test
  8 + public void shouldInit() throws Exception {
  9 + ZeroSubjectInjector injector = new ZeroSubjectInjector();
  10 + }
  11 +}
0 12 \ No newline at end of file
... ...
pom.xml
... ... @@ -16,6 +16,7 @@
16 16 <module>nicolas-model</module>
17 17 <module>nicolas-train</module>
18 18 <module>nicolas-zero</module>
  19 + <module>nicolas-common</module>
19 20 </modules>
20 21  
21 22 <properties>
... ... @@ -30,6 +31,8 @@
30 31 <weka-dev.version>3.9.0</weka-dev.version>
31 32 <commons-lang3.version>3.5</commons-lang3.version>
32 33 <commons-io.version>2.5</commons-io.version>
  34 + <slf4j-api.version>1.7.12</slf4j-api.version>
  35 + <junit.version>4.12</junit.version>
33 36 </properties>
34 37  
35 38 <prerequisites>
... ... @@ -46,13 +49,20 @@
46 49  
47 50 <dependencyManagement>
48 51 <dependencies>
  52 + <!-- project -->
49 53 <dependency>
50 54 <groupId>pl.waw.ipipan.zil.summ</groupId>
51 55 <artifactId>nicolas-model</artifactId>
52 56 <version>${project.version}</version>
53 57 <scope>runtime</scope>
54 58 </dependency>
  59 + <dependency>
  60 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  61 + <artifactId>nicolas-common</artifactId>
  62 + <version>${project.version}</version>
  63 + </dependency>
55 64  
  65 + <!-- internal -->
56 66 <dependency>
57 67 <groupId>pl.waw.ipipan.zil.summ</groupId>
58 68 <artifactId>pscapi</artifactId>
... ... @@ -64,6 +74,7 @@
64 74 <version>${utils.version}</version>
65 75 </dependency>
66 76  
  77 + <!-- third party -->
67 78 <dependency>
68 79 <groupId>org.apache.commons</groupId>
69 80 <artifactId>commons-csv</artifactId>
... ... @@ -89,6 +100,20 @@
89 100 <artifactId>commons-io</artifactId>
90 101 <version>${commons-io.version}</version>
91 102 </dependency>
  103 +
  104 + <!-- logging -->
  105 + <dependency>
  106 + <groupId>org.slf4j</groupId>
  107 + <artifactId>slf4j-api</artifactId>
  108 + <version>${slf4j-api.version}</version>
  109 + </dependency>
  110 +
  111 + <!-- test -->
  112 + <dependency>
  113 + <groupId>junit</groupId>
  114 + <artifactId>junit</artifactId>
  115 + <version>${junit.version}</version>
  116 + </dependency>
92 117 </dependencies>
93 118 </dependencyManagement>
94 119  
... ...