Commit eac83d24d0d460300033f920fafbc7fa3d5ecdbb

Authored by Mateusz Kopeć
1 parent b41f6532

refactor

Showing 24 changed files with 639 additions and 169 deletions
nicolas-common/pom.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <modelVersion>4.0.0</modelVersion>
  6 + <parent>
  7 + <artifactId>nicolas-container</artifactId>
  8 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  9 + <version>1.0-SNAPSHOT</version>
  10 + </parent>
  11 +
  12 + <artifactId>nicolas-common</artifactId>
  13 +
  14 + <dependencies>
  15 + <!-- internal -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>pscapi</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  22 + <artifactId>utils</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- third party -->
  26 + <dependency>
  27 + <groupId>nz.ac.waikato.cms.weka</groupId>
  28 + <artifactId>weka-dev</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- logging -->
  32 + <dependency>
  33 + <groupId>org.slf4j</groupId>
  34 + <artifactId>slf4j-api</artifactId>
  35 + </dependency>
  36 +
  37 + </dependencies>
  38 +
  39 +</project>
0 \ No newline at end of file 40 \ No newline at end of file
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Utils.java renamed to nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
1 -package pl.waw.ipipan.zil.summ.nicolas; 1 +package pl.waw.ipipan.zil.summ.nicolas.common;
2 2
3 -import com.google.common.base.Charsets;  
4 import com.google.common.collect.Lists; 3 import com.google.common.collect.Lists;
5 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
6 import com.google.common.collect.Sets; 5 import com.google.common.collect.Sets;
7 -import com.google.common.io.Files;  
8 import org.slf4j.Logger; 6 import org.slf4j.Logger;
9 import org.slf4j.LoggerFactory; 7 import org.slf4j.LoggerFactory;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
13 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
14 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;  
15 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;  
16 -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;  
17 import weka.classifiers.Classifier; 12 import weka.classifiers.Classifier;
18 import weka.core.Attribute; 13 import weka.core.Attribute;
19 -import weka.core.DenseInstance;  
20 -import weka.core.Instance;  
21 import weka.core.Instances; 14 import weka.core.Instances;
22 15
23 -import java.io.File;  
24 -import java.io.FileInputStream;  
25 -import java.io.IOException;  
26 -import java.io.ObjectInputStream; 16 +import java.io.*;
27 import java.util.*; 17 import java.util.*;
28 import java.util.function.Function; 18 import java.util.function.Function;
29 import java.util.stream.Collectors; 19 import java.util.stream.Collectors;
30 20
31 -import static java.util.stream.Collectors.toList;  
32 -  
33 public class Utils { 21 public class Utils {
34 22
35 private static final Logger LOG = LoggerFactory.getLogger(Utils.class); 23 private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
36 24
37 private static final String DATASET_NAME = "Dataset"; 25 private static final String DATASET_NAME = "Dataset";
38 26
39 - public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {  
40 - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());  
41 - Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);  
42 -  
43 - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");  
44 - Map<TMention, Instance> mention2instance = Maps.newHashMap();  
45 - for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {  
46 - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());  
47 - Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);  
48 - for (Attribute attribute : featureExtractor.getAttributesList()) {  
49 - instance.setValue(attribute, mentionFeatures.get(attribute));  
50 - }  
51 - mention2instance.put(tMention, instance);  
52 - }  
53 - return mention2instance;  
54 - }  
55 -  
56 - public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {  
57 - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());  
58 - Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);  
59 -  
60 - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");  
61 - Map<TSentence, Instance> sentence2instance = Maps.newHashMap();  
62 - for (TSentence sentence : sentences) {  
63 - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());  
64 - Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);  
65 - for (Attribute attribute : featureExtractor.getAttributesList()) {  
66 - instance.setValue(attribute, sentenceFeatures.get(attribute));  
67 - }  
68 - sentence2instance.put(sentence, instance);  
69 - }  
70 - return sentence2instance;  
71 - }  
72 -  
73 public static Instances createNewInstances(ArrayList<Attribute> attributesList) { 27 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
74 Instances instances = new Instances(DATASET_NAME, attributesList, 0); 28 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
75 instances.setClassIndex(0); 29 instances.setClassIndex(0);
@@ -97,7 +51,16 @@ public class Utils { @@ -97,7 +51,16 @@ public class Utils {
97 51
98 52
99 public static TText loadThrifted(File originalFile) { 53 public static TText loadThrifted(File originalFile) {
100 - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(originalFile))) { 54 + try (FileInputStream inputStream = new FileInputStream(originalFile)) {
  55 + return loadThrifted(inputStream);
  56 + } catch (IOException e) {
  57 + LOG.error("Error reading serialized file: " + e);
  58 + return null;
  59 + }
  60 + }
  61 +
  62 + public static TText loadThrifted(InputStream stream) {
  63 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
101 return (TText) ois.readObject(); 64 return (TText) ois.readObject();
102 } catch (ClassNotFoundException | IOException e) { 65 } catch (ClassNotFoundException | IOException e) {
103 LOG.error("Error reading serialized file: " + e); 66 LOG.error("Error reading serialized file: " + e);
@@ -188,13 +151,5 @@ public class Utils { @@ -188,13 +151,5 @@ public class Utils {
188 return sb.toString().trim(); 151 return sb.toString().trim();
189 } 152 }
190 153
191 - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {  
192 - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);  
193 154
194 - MentionScorer scorer = new MentionScorer();  
195 - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);  
196 -  
197 - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);  
198 - return mention2score.keySet();  
199 - }  
200 } 155 }
201 \ No newline at end of file 156 \ No newline at end of file
nicolas-core/pom.xml
@@ -12,10 +12,14 @@ @@ -12,10 +12,14 @@
12 <artifactId>nicolas</artifactId> 12 <artifactId>nicolas</artifactId>
13 13
14 <dependencies> 14 <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
15 <dependency> 20 <dependency>
16 <groupId>pl.waw.ipipan.zil.summ</groupId> 21 <groupId>pl.waw.ipipan.zil.summ</groupId>
17 <artifactId>nicolas-model</artifactId> 22 <artifactId>nicolas-model</artifactId>
18 - <scope>runtime</scope>  
19 </dependency> 23 </dependency>
20 24
21 <dependency> 25 <dependency>
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -6,6 +6,7 @@ import com.google.common.collect.Sets; @@ -6,6 +6,7 @@ import com.google.common.collect.Sets;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 10 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
10 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
11 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
@@ -53,7 +54,7 @@ public class Nicolas { @@ -53,7 +54,7 @@ public class Nicolas {
53 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 54 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
54 55
55 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); 56 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
56 - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); 57 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
57 58
58 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 59 Map<TSentence, Double> sentence2score = Maps.newHashMap();
59 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 60 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import com.google.common.base.Charsets;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.io.Files;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
  13 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  14 +import weka.core.Attribute;
  15 +import weka.core.DenseInstance;
  16 +import weka.core.Instance;
  17 +
  18 +import java.io.File;
  19 +import java.io.IOException;
  20 +import java.util.List;
  21 +import java.util.Map;
  22 +import java.util.Set;
  23 +
  24 +import static java.util.stream.Collectors.toList;
  25 +
  26 +public class ThriftUtils {
  27 +
  28 + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class);
  29 +
  30 + public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
  31 + String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
  32 +
  33 + MentionScorer scorer = new MentionScorer();
  34 + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
  35 +
  36 + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
  37 + return mention2score.keySet();
  38 + }
  39 +
  40 + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
  41 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  42 + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
  43 +
  44 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
  45 + Map<TMention, Instance> mention2instance = Maps.newHashMap();
  46 + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
  47 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  48 + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention);
  49 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  50 + instance.setValue(attribute, mentionFeatures.get(attribute));
  51 + }
  52 + mention2instance.put(tMention, instance);
  53 + }
  54 + return mention2instance;
  55 + }
  56 +
  57 + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) {
  58 + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  59 + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
  60 +
  61 + LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
  62 + Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
  63 + for (TSentence sentence : sentences) {
  64 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  65 + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence);
  66 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  67 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  68 + }
  69 + sentence2instance.put(sentence, instance);
  70 + }
  71 + return sentence2instance;
  72 + }
  73 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java
@@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 import pl.waw.ipipan.zil.summ.nicolas.Constants; 11 import pl.waw.ipipan.zil.summ.nicolas.Constants;
12 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 12 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
15 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
@@ -85,7 +86,7 @@ public class ApplyModel2 { @@ -85,7 +86,7 @@ public class ApplyModel2 {
85 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 86 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
86 87
87 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); 88 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
88 - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); 89 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
89 90
90 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 91 Map<TSentence, Double> sentence2score = Maps.newHashMap();
91 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 92 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.features;
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 6 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
7 7
8 import java.util.List; 8 import java.util.List;
9 import java.util.Map; 9 import java.util.Map;
@@ -14,9 +14,7 @@ import java.util.stream.Collectors; @@ -14,9 +14,7 @@ import java.util.stream.Collectors;
14 import static java.util.stream.Collectors.toList; 14 import static java.util.stream.Collectors.toList;
15 import static java.util.stream.Collectors.toMap; 15 import static java.util.stream.Collectors.toMap;
16 16
17 -/**  
18 - * Created by me2 on 04.04.16.  
19 - */ 17 +
20 public class FeatureHelper { 18 public class FeatureHelper {
21 19
22 private final List<TMention> mentions; 20 private final List<TMention> mentions;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,7 +5,8 @@ import org.slf4j.Logger; @@ -5,7 +5,8 @@ import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 8 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
9 import weka.classifiers.Classifier; 10 import weka.classifiers.Classifier;
10 import weka.core.Instance; 11 import weka.core.Instance;
11 import weka.core.Instances; 12 import weka.core.Instances;
@@ -21,7 +22,7 @@ public class MentionModel { @@ -21,7 +22,7 @@ public class MentionModel {
21 Set<TMention> goodMentions = Sets.newHashSet(); 22 Set<TMention> goodMentions = Sets.newHashSet();
22 23
23 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 24 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
24 - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(text, featureExtractor); 25 + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor);
25 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { 26 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
26 Instance instance = entry.getValue(); 27 Instance instance = entry.getValue();
27 instance.setDataset(instances); 28 instance.setDataset(instances);
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java
@@ -6,9 +6,8 @@ import com.google.common.collect.Multiset; @@ -6,9 +6,8 @@ import com.google.common.collect.Multiset;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 10
11 -import java.util.Collection;  
12 import java.util.List; 11 import java.util.List;
13 import java.util.Map; 12 import java.util.Map;
14 import java.util.stream.Collectors; 13 import java.util.stream.Collectors;
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java
@@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory; @@ -8,7 +8,8 @@ import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10 import pl.waw.ipipan.zil.summ.nicolas.Constants; 10 import pl.waw.ipipan.zil.summ.nicolas.Constants;
11 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 11 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
12 import weka.core.Instance; 13 import weka.core.Instance;
13 import weka.core.Instances; 14 import weka.core.Instances;
14 import weka.core.converters.ArffSaver; 15 import weka.core.converters.ArffSaver;
@@ -45,7 +46,7 @@ public class PrepareTrainingData { @@ -45,7 +46,7 @@ public class PrepareTrainingData {
45 continue; 46 continue;
46 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); 47 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
47 48
48 - Map<TMention, Instance> mention2instance = Utils.extractInstancesFromMentions(preprocessedText, featureExtractor); 49 + Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
49 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { 50 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
50 TMention mention = entry.getKey(); 51 TMention mention = entry.getKey();
51 Instance instance = entry.getValue(); 52 Instance instance = entry.getValue();
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java
@@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; @@ -9,7 +9,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 import pl.waw.ipipan.zil.summ.nicolas.Constants; 11 import pl.waw.ipipan.zil.summ.nicolas.Constants;
12 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 12 +import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
15 import weka.classifiers.Classifier; 16 import weka.classifiers.Classifier;
@@ -58,7 +59,7 @@ public class PrepareTrainingData { @@ -58,7 +59,7 @@ public class PrepareTrainingData {
58 // Set<TMention> goodMentions 59 // Set<TMention> goodMentions
59 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); 60 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true);
60 61
61 - Map<TSentence, Instance> sentence2instance = Utils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); 62 + Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
62 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 63 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
63 TSentence sentence = entry.getKey(); 64 TSentence sentence = entry.getKey();
64 Instance instance = entry.getValue(); 65 Instance instance = entry.getValue();
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java
@@ -6,7 +6,7 @@ import com.google.common.collect.Multiset; @@ -6,7 +6,7 @@ import com.google.common.collect.Multiset;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.Utils; 9 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 10
11 import java.util.List; 11 import java.util.List;
12 import java.util.Map; 12 import java.util.Map;
nicolas-train/pom.xml
@@ -11,4 +11,21 @@ @@ -11,4 +11,21 @@
11 11
12 <artifactId>nicolas-train</artifactId> 12 <artifactId>nicolas-train</artifactId>
13 13
  14 + <dependencies>
  15 + <!-- internal -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>pscapi</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  22 + <artifactId>utils</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- logging -->
  26 + <dependency>
  27 + <groupId>org.slf4j</groupId>
  28 + <artifactId>slf4j-api</artifactId>
  29 + </dependency>
  30 + </dependencies>
14 </project> 31 </project>
15 \ No newline at end of file 32 \ No newline at end of file
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +public class Trainer {
  4 +    // Placeholder entry point of the training module.
  5 + public static void main(String[] args) {
  6 +    // Stub: the body is empty, so running this class does nothing.
  7 + }
  8 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
  2 +
  3 +import org.apache.thrift.TException;
  4 +import org.apache.thrift.protocol.TBinaryProtocol;
  5 +import org.apache.thrift.protocol.TProtocol;
  6 +import org.apache.thrift.transport.TSocket;
  7 +import org.apache.thrift.transport.TTransport;
  8 +import org.slf4j.Logger;
  9 +import org.slf4j.LoggerFactory;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.Multiservice;
  11 +import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest;
  12 +import pl.waw.ipipan.zil.multiservice.thrift.RequestPart;
  13 +import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus;
  14 +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
  15 +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
  16 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  17 +
  18 +import java.util.ArrayList;
  19 +import java.util.HashMap;
  20 +import java.util.List;
  21 +import java.util.Map;
  22 +
  23 +public class MultiserviceProxy {
  24 +
  25 + private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class);
  26 +
  27 + private int port;
  28 + private String host;
  29 +
  30 + public MultiserviceProxy(String host, int port) {
  31 + this.host = host;
  32 + this.port = port;
  33 + LOG.info("Multiservice at " + host + ":" + port);
  34 + }
  35 +
  36 + public TText process(String text, List<String> services) throws Exception {
  37 + List<Map<String, String>> options = new ArrayList<>();
  38 + for (int i = 0; i < services.size(); i++)
  39 + options.add(new HashMap<>());
  40 + return process(text, "", services, options);
  41 + }
  42 +
  43 + public TText process(String text, String title, List<String> services, List<Map<String, String>> options)
  44 + throws Exception {
  45 + TTransport transport = new TSocket(host, port);
  46 + ObjectRequest objectRequest = createRequest(text, title, services, options);
  47 +
  48 + try {
  49 + transport.open();
  50 +
  51 + TProtocol protocol = new TBinaryProtocol(transport);
  52 + Multiservice.Client client = new Multiservice.Client(protocol);
  53 +
  54 + LOG.debug("Sending Multservice request...");
  55 + TText responseText = request(objectRequest, client);
  56 + LOG.debug("...done");
  57 +
  58 + return responseText;
  59 +
  60 + } catch (TException e) {
  61 + LOG.error("Error processing request:" + e);
  62 + throw new Exception(e);
  63 +
  64 + } finally {
  65 + transport.close();
  66 + }
  67 + }
  68 +
  69 + private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException {
  70 +
  71 + String requestToken = client.putObjectRequest(objectRequest);
  72 + while (true) {
  73 + RequestStatus status = client.getRequestStatus(requestToken);
  74 + if (RequestStatus.DONE.equals(status)) {
  75 + TText result = client.getResultObject(requestToken);
  76 + return result;
  77 + } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) {
  78 + try {
  79 + MultiserviceException exception = client.getException(requestToken);
  80 + throw exception;
  81 + } catch (TException e) {
  82 + throw e;
  83 + }
  84 + }
  85 + }
  86 + }
  87 +
  88 + private ObjectRequest createRequest(String textBody, String textTitle, List<String> services,
  89 + List<Map<String, String>> options) {
  90 + TText text = new TText();
  91 +
  92 + TParagraph par = new TParagraph();
  93 + par.setText(textTitle);
  94 + text.addToParagraphs(par);
  95 +
  96 + for (String p : textBody.split("\n\n")) {
  97 + par = new TParagraph();
  98 + par.setText(p);
  99 + text.addToParagraphs(par);
  100 + }
  101 +
  102 + List<RequestPart> processingChain = new ArrayList<>();
  103 + int i = 0;
  104 + for (String serviceName : services)
  105 + processingChain.add(new RequestPart(serviceName, options.get(i++)));
  106 +
  107 + return new ObjectRequest(text, processingChain);
  108 + }
  109 +
  110 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  7 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  8 +
  9 +import java.io.File;
  10 +import java.io.FileOutputStream;
  11 +import java.io.IOException;
  12 +import java.io.ObjectOutputStream;
  13 +import java.util.Arrays;
  14 +import java.util.List;
  15 +
  16 +public class NLPProcess {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class);
  19 +
  20 + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector",
  21 + "Bartek");
  22 + private static final int PORT = 20000;
  23 + private static final String HOST = "multiservice.nlp.ipipan.waw.pl";
  24 +
  25 + private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT);
  26 +
  27 + private NLPProcess() {
  28 + }
  29 +
  30 + public static void main(String[] args) {
  31 + if (args.length != 2) {
  32 + LOG.error("Wrong usage! Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir");
  33 + return;
  34 + }
  35 + File corpusDir = new File(args[0]);
  36 + if (!corpusDir.isDirectory()) {
  37 + LOG.error("Corpus directory does not exist: " + corpusDir);
  38 + return;
  39 + }
  40 + File targetDir = new File(args[1]);
  41 + if (!targetDir.isDirectory()) {
  42 + LOG.error("Target directory does not exist: " + targetDir);
  43 + return;
  44 + }
  45 +
  46 + int ok = 0;
  47 + int err = 0;
  48 + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml"));
  49 + Arrays.sort(files);
  50 + for (File file : files) {
  51 + try {
  52 + Text text = PSC_IO.readText(file);
  53 + File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin"));
  54 + annotateNLP(text, targetFile);
  55 + ok++;
  56 + } catch (Exception e) {
  57 + err++;
  58 + LOG.error("Problem with text in " + file + ", " + e);
  59 + }
  60 + }
  61 + LOG.info(ok + " texts processed successfully.");
  62 + LOG.info(err + " texts with errors.");
  63 + }
  64 +
  65 + private static void annotateNLP(Text text, File targetFile) throws Exception {
  66 + annotate(text.getBody(), targetFile);
  67 + }
  68 +
  69 + private static void annotate(String body, File targetFile) throws Exception {
  70 + if (targetFile.exists()) {
  71 + LOG.debug("Skipping existing file..");
  72 + return;
  73 + }
  74 + LOG.info("Processing text into " + targetFile.getPath());
  75 + TText ttext = MSPROXY.process(body, SERVICES);
  76 + serialize(ttext, targetFile);
  77 + }
  78 +
  79 + public static void serialize(TText ttext, File targetFile) throws IOException {
  80 + try (FileOutputStream fout = new FileOutputStream(targetFile);
  81 + ObjectOutputStream oos = new ObjectOutputStream(fout)) {
  82 + oos.writeObject(ttext);
  83 + }
  84 + }
  85 +
  86 + public static TText annotate(String body) throws Exception {
  87 + return MSPROXY.process(body, SERVICES);
  88 + }
  89 +
  90 +}
nicolas-zero/pom.xml
@@ -11,4 +11,34 @@ @@ -11,4 +11,34 @@
11 11
12 <artifactId>nicolas-zero</artifactId> 12 <artifactId>nicolas-zero</artifactId>
13 13
  14 + <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
  20 +
  21 + <!-- third party -->
  22 + <dependency>
  23 + <groupId>org.apache.commons</groupId>
  24 + <artifactId>commons-csv</artifactId>
  25 + </dependency>
  26 + <dependency>
  27 + <groupId>commons-io</groupId>
  28 + <artifactId>commons-io</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- logging -->
  32 + <dependency>
  33 + <groupId>org.slf4j</groupId>
  34 + <artifactId>slf4j-api</artifactId>
  35 + </dependency>
  36 +
  37 + <!-- test -->
  38 + <dependency>
  39 + <groupId>junit</groupId>
  40 + <artifactId>junit</artifactId>
  41 + </dependency>
  42 + </dependencies>
  43 +
14 </project> 44 </project>
15 \ No newline at end of file 45 \ No newline at end of file
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java renamed to nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
@@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; @@ -3,126 +3,61 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 import com.google.common.collect.Lists; 3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import com.google.common.collect.Sets; 5 import com.google.common.collect.Sets;
6 -import org.apache.commons.csv.CSVFormat;  
7 -import org.apache.commons.csv.CSVPrinter;  
8 -import org.apache.commons.csv.QuoteMode;  
9 -import org.apache.commons.io.IOUtils;  
10 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
11 -import pl.waw.ipipan.zil.summ.nicolas.Utils;  
12 7
13 -import java.io.File;  
14 -import java.io.FileReader;  
15 -import java.io.FileWriter;  
16 -import java.io.IOException;  
17 import java.util.Arrays; 8 import java.util.Arrays;
18 import java.util.List; 9 import java.util.List;
19 import java.util.Map; 10 import java.util.Map;
20 import java.util.Set; 11 import java.util.Set;
21 12
22 -/**  
23 - * Created by me2 on 26.07.16.  
24 - */  
25 -public class Zero { 13 +public class CandidateFinder {
26 14
27 - private static final String IDS_PATH = "summaries_dev";  
28 - private static final String THRIFTED_PATH = "src/main/resources/preprocessed_full_texts/dev/"; 15 + public List<ZeroSubjectCandidate> findZeroSubjectCandidates(TText text, Set<String> summarySentenceIds) {
  16 + List<ZeroSubjectCandidate> candidates = Lists.newArrayList();
29 17
30 - public static void main(String[] args) throws IOException {  
31 -  
32 - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);  
33 - Map<String, List<String>> id2sentIds = loadSentenceIds(IDS_PATH);  
34 -  
35 - int mentionCount = 0;  
36 - int mentionInNom = 0;  
37 - int mentionInNomSequential = 0;  
38 -  
39 - List<List<Object>> rows = Lists.newArrayList();  
40 - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {  
41 - String textId = entry.getKey();  
42 -// System.out.println(id);  
43 -  
44 - TText text = entry.getValue();  
45 - List<String> sentenceIds = id2sentIds.get(textId);  
46 -// System.out.println(sentenceIds);  
47 -  
48 - Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();  
49 - for (TCoreference coreference : text.getCoreferences()) {  
50 - for (String mentionId : coreference.getMentionIds()) {  
51 - mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));  
52 - } 18 + Map<String, Set<String>> mentionId2Cluster = Maps.newHashMap();
  19 + for (TCoreference coreference : text.getCoreferences()) {
  20 + for (String mentionId : coreference.getMentionIds()) {
  21 + mentionId2Cluster.put(mentionId, Sets.newHashSet(coreference.getMentionIds()));
53 } 22 }
  23 + }
54 24
55 - Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();  
56 - TSentence prevSentence = null;  
57 - for (TParagraph p : text.getParagraphs()) {  
58 - Map<TMention, String> tMentionStringMap = Utils.loadMention2Orth(p.getSentences());  
59 -  
60 - for (TSentence sentence : p.getSentences()) {  
61 - if (!sentenceIds.contains(sentence.getId()))  
62 - continue;  
63 - Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();  
64 -  
65 - Map<String, TToken> tokenId2Token = Maps.newHashMap();  
66 - for (TToken t : sentence.getTokens())  
67 - tokenId2Token.put(t.getId(), t); 25 + Set<String> prevSentenceNominativeMentionIds = Sets.newHashSet();
  26 + TSentence prevSentence = null;
  27 + for (TParagraph p : text.getParagraphs()) {
  28 + for (TSentence sentence : p.getSentences()) {
  29 + if (!summarySentenceIds.contains(sentence.getId()))
  30 + continue;
  31 + Set<String> currentSentenceNominativeMentionIds = Sets.newHashSet();
68 32
69 - for (TMention mention : sentence.getMentions()) {  
70 - mentionCount++; 33 + Map<String, TToken> tokenId2Token = Maps.newHashMap();
  34 + for (TToken t : sentence.getTokens())
  35 + tokenId2Token.put(t.getId(), t);
71 36
72 - for (String tokenId : mention.getHeadIds()) {  
73 - TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();  
74 - if (isInNominative(interp)) {  
75 - mentionInNom++; 37 + for (TMention mention : sentence.getMentions()) {
76 38
77 - currentSentenceNominativeMentionIds.add(mention.getId());  
78 - if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {  
79 - mentionInNomSequential++;  
80 - System.out.println(tMentionStringMap.get(mention)  
81 - + "\n\t" + Utils.loadSentence2Orth(prevSentence)  
82 - + "\n\t" + Utils.loadSentence2Orth(sentence)); 39 + for (String tokenId : mention.getHeadIds()) {
  40 + TInterpretation interp = tokenId2Token.get(tokenId).getChosenInterpretation();
  41 + if (isInNominative(interp)) {
83 42
84 - List<Object> row = Lists.newArrayList();  
85 - row.add("C");  
86 - row.add(textId);  
87 - row.add(tMentionStringMap.get(mention));  
88 - row.add(Utils.loadSentence2Orth(prevSentence));  
89 - row.add(Utils.loadSentence2Orth(sentence));  
90 - rows.add(row);  
91 - }  
92 - break; 43 + currentSentenceNominativeMentionIds.add(mention.getId());
  44 + if (mentionId2Cluster.get(mention.getId()).stream().anyMatch(prevSentenceNominativeMentionIds::contains)) {
  45 + ZeroSubjectCandidate candidate = new ZeroSubjectCandidate(prevSentence, sentence, mention);
  46 + candidates.add(candidate);
93 } 47 }
  48 + break;
94 } 49 }
95 } 50 }
96 -  
97 - prevSentence = sentence;  
98 - prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;  
99 } 51 }
100 - }  
101 - }  
102 -  
103 - System.out.println(mentionCount + " mentions");  
104 - System.out.println(mentionInNom + " mention in nom");  
105 - System.out.println(mentionInNomSequential + " mention in nom with previous in nom");  
106 52
107 - try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {  
108 - for (List<Object> row : rows) {  
109 - csvPrinter.printRecord(row); 53 + prevSentence = sentence;
  54 + prevSentenceNominativeMentionIds = currentSentenceNominativeMentionIds;
110 } 55 }
111 } 56 }
112 - 57 + return candidates;
113 } 58 }
114 59
115 private static boolean isInNominative(TInterpretation interp) { 60 private static boolean isInNominative(TInterpretation interp) {
116 return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); 61 return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom"));
117 } 62 }
118 -  
119 - private static Map<String, List<String>> loadSentenceIds(String idsPath) throws IOException {  
120 - Map<String, List<String>> result = Maps.newHashMap();  
121 - for (File f : new File(idsPath).listFiles()) {  
122 - String id = f.getName().split("_")[0];  
123 - List<String> sentenceIds = IOUtils.readLines(new FileReader(f));  
124 - result.put(id, sentenceIds);  
125 - }  
126 - return result;  
127 - }  
128 } 63 }
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/Zero.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
  6 +import org.apache.commons.csv.CSVFormat;
  7 +import org.apache.commons.csv.CSVPrinter;
  8 +import org.apache.commons.csv.QuoteMode;
  9 +import org.apache.commons.io.IOUtils;
  10 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  13 +
  14 +import java.io.File;
  15 +import java.io.FileReader;
  16 +import java.io.FileWriter;
  17 +import java.io.IOException;
  18 +import java.util.List;
  19 +import java.util.Map;
  20 +import java.util.Set;
  21 +
  22 +public class Zero {
  23 +
  24 + private static final String IDS_PATH = "corpora/summaries_dev";
  25 + private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/";
  26 +
  27 + private Zero() {
  28 + }
  29 +
  30 + public static void main(String[] args) throws IOException {
  31 +
  32 + CandidateFinder candidateFinder = new CandidateFinder();
  33 +
  34 + Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
  35 + Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH);
  36 +
  37 + List<List<Object>> rows = Lists.newArrayList();
  38 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  39 + String textId = entry.getKey();
  40 +
  41 + TText text = entry.getValue();
  42 + ThriftTextHelper thriftTextHelper = new ThriftTextHelper(text);
  43 +
  44 + Set<String> sentenceIds = id2sentIds.get(textId);
  45 +
  46 + List<ZeroSubjectCandidate> zeroSubjectCandidates = candidateFinder.findZeroSubjectCandidates(text, sentenceIds);
  47 +
  48 + for (ZeroSubjectCandidate candidate : zeroSubjectCandidates) {
  49 + List<Object> row = Lists.newArrayList();
  50 + row.add("C");
  51 + row.add(textId);
  52 + row.add(thriftTextHelper.getMentionText(candidate.getZeroCandidateMention()));
  53 + row.add(thriftTextHelper.getSentenceText(candidate.getPreviousSentence()));
  54 + row.add(thriftTextHelper.getSentenceText(candidate.getSentence()));
  55 + rows.add(row);
  56 + }
  57 + }
  58 +
  59 + try (CSVPrinter csvPrinter = new CSVPrinter(new FileWriter("zeros.tsv"), CSVFormat.DEFAULT.withDelimiter('\t').withEscape('\\').withQuoteMode(QuoteMode.NONE).withQuote('"'))) {
  60 + for (List<Object> row : rows) {
  61 + csvPrinter.printRecord(row);
  62 + }
  63 + }
  64 +
  65 + }
  66 +
  67 + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
  68 + Map<String, Set<String>> result = Maps.newHashMap();
  69 + for (File f : new File(idsPath).listFiles()) {
  70 + String id = f.getName().split("_")[0];
  71 + List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
  72 + result.put(id, Sets.newHashSet(sentenceIds));
  73 + }
  74 + return result;
  75 + }
  76 +}
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +
  6 +public class ZeroSubjectCandidate {
  7 +
  8 + private final TSentence previousSentence;
  9 + private final TSentence sentence;
  10 + private final TMention zeroCandidateMention;
  11 +
  12 + public ZeroSubjectCandidate(TSentence previousSentence, TSentence sentence, TMention zeroCandidateMention) {
  13 + this.previousSentence = previousSentence;
  14 + this.sentence = sentence;
  15 + this.zeroCandidateMention = zeroCandidateMention;
  16 + }
  17 +
  18 + public TSentence getPreviousSentence() {
  19 + return previousSentence;
  20 + }
  21 +
  22 + public TSentence getSentence() {
  23 + return sentence;
  24 + }
  25 +
  26 + public TMention getZeroCandidateMention() {
  27 + return zeroCandidateMention;
  28 + }
  29 +}
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +
/**
 * Placeholder with no members yet; currently it can only be instantiated.
 */
public class ZeroSubjectInjector {

}
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Sets;
  4 +import org.apache.commons.io.IOUtils;
  5 +import org.junit.BeforeClass;
  6 +import org.junit.Test;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  8 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  9 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftTextHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  11 +
  12 +import java.io.IOException;
  13 +import java.io.InputStream;
  14 +import java.io.InputStreamReader;
  15 +import java.util.List;
  16 +import java.util.Set;
  17 +
  18 +import static org.junit.Assert.assertEquals;
  19 +
  20 +public class CandidateFinderTest {
  21 +
  22 + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin";
  23 + private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt";
  24 +
  25 + private static CandidateFinder candidateFinder;
  26 +
  27 + @BeforeClass
  28 + public static void init() {
  29 + candidateFinder = new CandidateFinder();
  30 + }
  31 +
  32 + @Test
  33 + public void shouldFindZeroSubjectCandidateInSampleText() throws Exception {
  34 + ThriftTextHelper sampleTextHelper = loadSampleTextHelper();
  35 + Set<String> summarySentenceIds = loadSampleTextSummarySentenceIds();
  36 + List<ZeroSubjectCandidate> candidates = candidateFinder.findZeroSubjectCandidates(sampleTextHelper.getText(), summarySentenceIds);
  37 + assertEquals(1, candidates.size());
  38 +
  39 + ZeroSubjectCandidate zeroSubjectCandidate = candidates.get(0);
  40 + TSentence firstSentence = zeroSubjectCandidate.getPreviousSentence();
  41 + TSentence secondSentence = zeroSubjectCandidate.getSentence();
  42 + TMention zeroCandidate = zeroSubjectCandidate.getZeroCandidateMention();
  43 +
  44 + assertEquals("Ala ma kota.", sampleTextHelper.getSentenceText(firstSentence));
  45 + assertEquals("Ala ma też psa.", sampleTextHelper.getSentenceText(secondSentence));
  46 + assertEquals("Ala", sampleTextHelper.getMentionText(zeroCandidate));
  47 + }
  48 +
  49 + private Set<String> loadSampleTextSummarySentenceIds() throws IOException {
  50 + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_SUMMARY_IDS_PATH);
  51 + InputStreamReader reader = new InputStreamReader(stream)) {
  52 + return Sets.newHashSet(IOUtils.readLines(reader));
  53 + }
  54 + }
  55 +
  56 + private ThriftTextHelper loadSampleTextHelper() throws IOException {
  57 + try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
  58 + return new ThriftTextHelper(Utils.loadThrifted(stream));
  59 + }
  60 + }
  61 +}
0 \ No newline at end of file 62 \ No newline at end of file
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjectorTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import org.junit.Test;
  4 +
  5 +public class ZeroSubjectInjectorTest {
  6 +
  7 + @Test
  8 + public void shouldInit() throws Exception {
  9 + ZeroSubjectInjector injector = new ZeroSubjectInjector();
  10 + }
  11 +}
0 \ No newline at end of file 12 \ No newline at end of file
@@ -16,6 +16,7 @@ @@ -16,6 +16,7 @@
16 <module>nicolas-model</module> 16 <module>nicolas-model</module>
17 <module>nicolas-train</module> 17 <module>nicolas-train</module>
18 <module>nicolas-zero</module> 18 <module>nicolas-zero</module>
  19 + <module>nicolas-common</module>
19 </modules> 20 </modules>
20 21
21 <properties> 22 <properties>
@@ -30,6 +31,8 @@ @@ -30,6 +31,8 @@
30 <weka-dev.version>3.9.0</weka-dev.version> 31 <weka-dev.version>3.9.0</weka-dev.version>
31 <commons-lang3.version>3.5</commons-lang3.version> 32 <commons-lang3.version>3.5</commons-lang3.version>
32 <commons-io.version>2.5</commons-io.version> 33 <commons-io.version>2.5</commons-io.version>
  34 + <slf4j-api.version>1.7.12</slf4j-api.version>
  35 + <junit.version>4.12</junit.version>
33 </properties> 36 </properties>
34 37
35 <prerequisites> 38 <prerequisites>
@@ -46,13 +49,20 @@ @@ -46,13 +49,20 @@
46 49
47 <dependencyManagement> 50 <dependencyManagement>
48 <dependencies> 51 <dependencies>
  52 + <!-- project -->
49 <dependency> 53 <dependency>
50 <groupId>pl.waw.ipipan.zil.summ</groupId> 54 <groupId>pl.waw.ipipan.zil.summ</groupId>
51 <artifactId>nicolas-model</artifactId> 55 <artifactId>nicolas-model</artifactId>
52 <version>${project.version}</version> 56 <version>${project.version}</version>
53 <scope>runtime</scope> 57 <scope>runtime</scope>
54 </dependency> 58 </dependency>
  59 + <dependency>
  60 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  61 + <artifactId>nicolas-common</artifactId>
  62 + <version>${project.version}</version>
  63 + </dependency>
55 64
  65 + <!-- internal -->
56 <dependency> 66 <dependency>
57 <groupId>pl.waw.ipipan.zil.summ</groupId> 67 <groupId>pl.waw.ipipan.zil.summ</groupId>
58 <artifactId>pscapi</artifactId> 68 <artifactId>pscapi</artifactId>
@@ -64,6 +74,7 @@ @@ -64,6 +74,7 @@
64 <version>${utils.version}</version> 74 <version>${utils.version}</version>
65 </dependency> 75 </dependency>
66 76
  77 + <!-- third party -->
67 <dependency> 78 <dependency>
68 <groupId>org.apache.commons</groupId> 79 <groupId>org.apache.commons</groupId>
69 <artifactId>commons-csv</artifactId> 80 <artifactId>commons-csv</artifactId>
@@ -89,6 +100,20 @@ @@ -89,6 +100,20 @@
89 <artifactId>commons-io</artifactId> 100 <artifactId>commons-io</artifactId>
90 <version>${commons-io.version}</version> 101 <version>${commons-io.version}</version>
91 </dependency> 102 </dependency>
  103 +
  104 + <!-- logging -->
  105 + <dependency>
  106 + <groupId>org.slf4j</groupId>
  107 + <artifactId>slf4j-api</artifactId>
  108 + <version>${slf4j-api.version}</version>
  109 + </dependency>
  110 +
  111 + <!-- test -->
  112 + <dependency>
  113 + <groupId>junit</groupId>
  114 + <artifactId>junit</artifactId>
  115 + <version>${junit.version}</version>
  116 + </dependency>
92 </dependencies> 117 </dependencies>
93 </dependencyManagement> 118 </dependencyManagement>
94 119