Commit 2169abf847196b768600339e9e18d7ab3fe70f3d

Authored by Mateusz Kopeć
1 parent cb490cab

WIP

Showing 44 changed files with 529 additions and 351 deletions
.gitignore
@@ -16,3 +16,5 @@ hs_err_pid* @@ -16,3 +16,5 @@ hs_err_pid*
16 16
17 .idea 17 .idea
18 *.iml 18 *.iml
  19 +
  20 +/data
19 \ No newline at end of file 21 \ No newline at end of file
nicolas-cli/README.md
@@ -3,6 +3,8 @@ @@ -3,6 +3,8 @@
3 This module contains a sample command-line application, which uses Nicolas library to summarize chosen input text file. 3 This module contains a sample command-line application, which uses Nicolas library to summarize chosen input text file.
4 Summary is written to target output file. Additionally, user needs to specify desired number of tokens in the summary. 4 Summary is written to target output file. Additionally, user needs to specify desired number of tokens in the summary.
5 5
  6 +Be aware that summarizer requires internet access and working Multiservice (multiservice.nlp.ipipan.waw.pl).
  7 +
6 ## Installation 8 ## Installation
7 9
8 mvn clean install 10 mvn clean install
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java
@@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli; @@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli;
3 import org.slf4j.Logger; 3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory; 4 import org.slf4j.LoggerFactory;
5 import pl.waw.ipipan.zil.summ.nicolas.Nicolas; 5 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
  6 +import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
6 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; 7 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
7 8
8 -import java.io.IOException;  
9 -  
10 public class Main { 9 public class Main {
11 10
12 private static final Logger LOG = LoggerFactory.getLogger(Main.class); 11 private static final Logger LOG = LoggerFactory.getLogger(Main.class);
@@ -26,7 +25,7 @@ public class Main { @@ -26,7 +25,7 @@ public class Main {
26 try { 25 try {
27 nicolas = new Nicolas(); 26 nicolas = new Nicolas();
28 preprocessor = new Preprocessor(); 27 preprocessor = new Preprocessor();
29 - } catch (IOException | ClassNotFoundException e) { 28 + } catch (NicolasException e) {
30 LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); 29 LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit.");
31 return; 30 return;
32 } 31 }
nicolas-common/pom.xml
@@ -25,7 +25,7 @@ @@ -25,7 +25,7 @@
25 <!-- third party --> 25 <!-- third party -->
26 <dependency> 26 <dependency>
27 <groupId>nz.ac.waikato.cms.weka</groupId> 27 <groupId>nz.ac.waikato.cms.weka</groupId>
28 - <artifactId>weka-dev</artifactId> 28 + <artifactId>weka-stable</artifactId>
29 </dependency> 29 </dependency>
30 <dependency> 30 <dependency>
31 <groupId>commons-io</groupId> 31 <groupId>commons-io</groupId>
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.common;
  2 +
  3 +import com.google.common.base.Predicates;
  4 +import com.google.common.collect.Maps;
  5 +import org.slf4j.Logger;
  6 +import org.slf4j.LoggerFactory;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +
  9 +import java.io.File;
  10 +import java.io.FileInputStream;
  11 +import java.io.IOException;
  12 +import java.io.InputStream;
  13 +import java.util.Map;
  14 +import java.util.function.Predicate;
  15 +
  16 +public class ThriftUtils {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class);
  19 +
  20 + private ThriftUtils() {
  21 + }
  22 +
  23 + public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) {
  24 + Map<String, TText> id2text = Maps.newHashMap();
  25 + File[] files = folder.listFiles();
  26 + if (files != null) {
  27 + for (File processedFullTextFile : files) {
  28 + String textId = processedFullTextFile.getName().split("\\.")[0];
  29 + if (!idFilter.test(textId))
  30 + continue;
  31 + TText processedFullText = loadThriftTextFromFile(processedFullTextFile);
  32 + id2text.put(textId, processedFullText);
  33 + }
  34 + }
  35 + LOG.info("{} preprocessed texts found.", id2text.size());
  36 + return id2text;
  37 + }
  38 +
  39 + public static Map<String, TText> loadThriftTextsFromFolder(File folder) {
  40 + return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue());
  41 + }
  42 +
  43 + public static TText loadThriftTextFromFile(File originalFile) {
  44 + try (FileInputStream inputStream = new FileInputStream(originalFile)) {
  45 + return loadThriftTextFromStream(inputStream);
  46 + } catch (IOException e) {
  47 + LOG.error("Error reading serialized Thrift file", e);
  48 + return null;
  49 + }
  50 + }
  51 +
  52 + public static TText loadThriftTextFromStream(InputStream stream) {
  53 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
  54 + return (TText) ois.readObject();
  55 + } catch (ClassNotFoundException | IOException e) {
  56 + LOG.error("Error reading serialized Thrift stream", e);
  57 + return null;
  58 + }
  59 + }
  60 +
  61 +}
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -28,6 +28,12 @@ public class Utils { @@ -28,6 +28,12 @@ public class Utils {
28 private Utils() { 28 private Utils() {
29 } 29 }
30 30
  31 + public static void writeStringToFile(String string, File file) throws IOException {
  32 + try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
  33 + bw.append(string);
  34 + }
  35 + }
  36 +
31 public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { 37 public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
32 LOG.info("Loading classifier from path: {}...", modelResourcePath); 38 LOG.info("Loading classifier from path: {}...", modelResourcePath);
33 try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { 39 try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {
@@ -76,44 +82,15 @@ public class Utils { @@ -76,44 +82,15 @@ public class Utils {
76 return instances; 82 return instances;
77 } 83 }
78 84
79 - public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException { 85 + public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException {
80 LOG.info("Loading classifier..."); 86 LOG.info("Loading classifier...");
81 - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) { 87 + try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) {
82 Classifier classifier = (Classifier) ois.readObject(); 88 Classifier classifier = (Classifier) ois.readObject();
83 LOG.info("Done. " + classifier.toString()); 89 LOG.info("Done. " + classifier.toString());
84 return classifier; 90 return classifier;
85 } 91 }
86 } 92 }
87 93
88 - public static Map<String, TText> loadPreprocessedTexts(String path) {  
89 - Map<String, TText> id2text = Maps.newHashMap();  
90 - for (File processedFullTextFile : new File(path).listFiles()) {  
91 - TText processedFullText = loadThrifted(processedFullTextFile);  
92 - id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText);  
93 - }  
94 - LOG.info(id2text.size() + " preprocessed texts found.");  
95 - return id2text;  
96 - }  
97 -  
98 -  
99 - public static TText loadThrifted(File originalFile) {  
100 - try (FileInputStream inputStream = new FileInputStream(originalFile)) {  
101 - return loadThrifted(inputStream);  
102 - } catch (IOException e) {  
103 - LOG.error("Error reading serialized file: " + e);  
104 - return null;  
105 - }  
106 - }  
107 -  
108 - public static TText loadThrifted(InputStream stream) {  
109 - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {  
110 - return (TText) ois.readObject();  
111 - } catch (ClassNotFoundException | IOException e) {  
112 - LOG.error("Error reading serialized file: " + e);  
113 - return null;  
114 - }  
115 - }  
116 -  
117 public static List<String> tokenize(String text) { 94 public static List<String> tokenize(String text) {
118 return Arrays.asList(text.split("[^\\p{L}0-9]+")); 95 return Arrays.asList(text.split("[^\\p{L}0-9]+"));
119 } 96 }
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java
@@ -14,7 +14,7 @@ public class UtilsTest { @@ -14,7 +14,7 @@ public class UtilsTest {
14 @Test 14 @Test
15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { 15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception {
16 try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { 16 try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
17 - TText text = Utils.loadThrifted(stream); 17 + TText text = ThriftUtils.loadThriftTextFromStream(stream);
18 assertEquals(26, text.getParagraphs().size()); 18 assertEquals(26, text.getParagraphs().size());
19 assertEquals(2, text.getParagraphs().get(4).getSentences().size()); 19 assertEquals(2, text.getParagraphs().get(4).getSentences().size());
20 } 20 }
nicolas-eval/pom.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <parent>
  6 + <artifactId>nicolas-container</artifactId>
  7 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  8 + <version>1.0-SNAPSHOT</version>
  9 + </parent>
  10 + <modelVersion>4.0.0</modelVersion>
  11 +
  12 + <artifactId>nicolas-eval</artifactId>
  13 +
  14 + <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-lib</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  22 + <artifactId>nicolas-common</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- internal -->
  26 + <dependency>
  27 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  28 + <artifactId>eval</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- third party -->
  32 + <dependency>
  33 + <groupId>nz.ac.waikato.cms.weka</groupId>
  34 + <artifactId>weka-stable</artifactId>
  35 + </dependency>
  36 + <dependency>
  37 + <groupId>org.apache.commons</groupId>
  38 + <artifactId>commons-lang3</artifactId>
  39 + </dependency>
  40 + <dependency>
  41 + <groupId>com.google.guava</groupId>
  42 + <artifactId>guava</artifactId>
  43 + </dependency>
  44 +
  45 + <!-- logging -->
  46 + <dependency>
  47 + <groupId>org.slf4j</groupId>
  48 + <artifactId>slf4j-api</artifactId>
  49 + </dependency>
  50 + <dependency>
  51 + <groupId>org.slf4j</groupId>
  52 + <artifactId>slf4j-simple</artifactId>
  53 + </dependency>
  54 +
  55 + </dependencies>
  56 +</project>
0 \ No newline at end of file 57 \ No newline at end of file
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import org.apache.commons.io.IOUtils;
  4 +
  5 +import java.io.IOException;
  6 +import java.io.InputStream;
  7 +import java.util.List;
  8 +import java.util.Set;
  9 +import java.util.stream.Collectors;
  10 +
  11 +public class Constants {
  12 +
  13 + private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt";
  14 +
  15 + private Constants() {
  16 + }
  17 +
  18 + public static Set<String> loadTestTextIds() throws IOException {
  19 + try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {
  20 + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING);
  21 + return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
  22 + }
  23 + }
  24 +}
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import pl.waw.ipipan.zil.summ.eval.Main;
  4 +
  5 +public class Evaluate {
  6 +
  7 + private Evaluate() {
  8 + }
  9 +
  10 + public static void main(String[] args) {
  11 + String goldDirPath = "data/summaries-gold";
  12 + String systemDirPath = "data/summaries";
  13 + Main.main(new String[]{goldDirPath, systemDirPath});
  14 + }
  15 +}
0 \ No newline at end of file 16 \ No newline at end of file
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/ExtractGoldSummaries.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  4 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  5 +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
  6 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  7 +
  8 +import javax.xml.bind.JAXBException;
  9 +import java.io.File;
  10 +import java.io.IOException;
  11 +import java.util.List;
  12 +import java.util.Set;
  13 +import java.util.stream.Collectors;
  14 +
  15 +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds;
  16 +
  17 +public class ExtractGoldSummaries {
  18 +
  19 + private ExtractGoldSummaries() {
  20 + }
  21 +
  22 + public static void main(String[] args) throws IOException, JAXBException {
  23 + File corpusDir = new File("data/corpus/PSC_1.0/data");
  24 + File targetDir = new File("data/summaries-gold");
  25 + targetDir.mkdir();
  26 +
  27 + Set<String> testTextIds = loadTestTextIds();
  28 + File[] files = corpusDir.listFiles();
  29 + if (files != null) {
  30 + for (File file : files) {
  31 + Text text = PSC_IO.readText(file);
  32 + if (!testTextIds.contains(text.getId()))
  33 + continue;
  34 +
  35 + List<Summary> goldSummaries = text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals("abstract") && summary.getRatio().equals(20)).collect(Collectors.toList());
  36 +
  37 + for (Summary summary : goldSummaries) {
  38 + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
  39 + Utils.writeStringToFile(summary.getBody(), targetFile);
  40 + }
  41 + }
  42 + }
  43 + }
  44 +
  45 +}
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
  9 +import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
  10 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  12 +
  13 +import java.io.File;
  14 +import java.io.IOException;
  15 +import java.util.List;
  16 +import java.util.Map;
  17 +import java.util.Set;
  18 +
  19 +import static java.util.stream.Collectors.toList;
  20 +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds;
  21 +
  22 +public class SummarizeTestCorpus {
  23 +
  24 + private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
  25 +
  26 +
  27 + private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
  28 + private static final double SUMMARY_RATIO = 0.2;
  29 +
  30 + private SummarizeTestCorpus() {
  31 + }
  32 +
  33 + public static void main(String[] args) throws IOException, NicolasException {
  34 + File thriftedCorpusDir = new File("data/preprocessed");
  35 + File targetDir = new File("data/summaries");
  36 + targetDir.mkdir();
  37 +
  38 + Set<String> testTextIds = loadTestTextIds();
  39 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains);
  40 + LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
  41 +
  42 + Map<String, String> id2summary = summarizeTexts(id2preprocessedText);
  43 + LOG.info("Texts summarized.");
  44 +
  45 + saveSummariesToFolder(id2summary, targetDir);
  46 + LOG.info("Texts saved to {} folder.", targetDir);
  47 + }
  48 +
  49 + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException {
  50 + Map<String, String> id2summary = Maps.newHashMap();
  51 + Nicolas nicolas = new Nicolas();
  52 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  53 + TText text = entry.getValue();
  54 + int targetSize = calculateTargetSize(text);
  55 + String summary = nicolas.summarizeThrift(text, targetSize);
  56 + id2summary.put(entry.getKey(), summary);
  57 + }
  58 + return id2summary;
  59 + }
  60 +
  61 + private static int calculateTargetSize(TText text) {
  62 + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  63 + StringBuilder body = new StringBuilder();
  64 + for (TSentence sentence : sentences)
  65 + body.append(Utils.loadSentence2Orth(sentence)).append(" ");
  66 +
  67 + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
  68 + return (int) (SUMMARY_RATIO * tokenCount);
  69 + }
  70 +
  71 + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException {
  72 + for (Map.Entry<String, String> entry : id2summary.entrySet()) {
  73 + String textId = entry.getKey();
  74 + String summary = entry.getValue();
  75 + String targetFileName = textId + SUMMARY_FILE_SUFFIX;
  76 + Utils.writeStringToFile(summary, new File(targetDir, targetFileName));
  77 + }
  78 + }
  79 +
  80 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java renamed to nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.search; 1 +package pl.waw.ipipan.zil.summ.nicolas.eval.search;
2 2
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.apache.commons.lang3.tuple.Pair; 4 import org.apache.commons.lang3.tuple.Pair;
@@ -35,13 +35,13 @@ import java.util.Random; @@ -35,13 +35,13 @@ import java.util.Random;
35 import java.util.logging.LogManager; 35 import java.util.logging.LogManager;
36 36
37 37
38 -class CrossvalidateCommon { 38 +class Crossvalidate {
39 39
40 - private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class); 40 + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
41 41
42 private static final int NUM_FOLDS = 10; 42 private static final int NUM_FOLDS = 10;
43 43
44 - private CrossvalidateCommon() { 44 + private Crossvalidate() {
45 } 45 }
46 46
47 static void crossvalidateClassifiers(String datasetPath) throws IOException { 47 static void crossvalidateClassifiers(String datasetPath) throws IOException {
@@ -77,7 +77,7 @@ class CrossvalidateCommon { @@ -77,7 +77,7 @@ class CrossvalidateCommon {
77 new DecisionTable(), new JRip(), new PART(), 77 new DecisionTable(), new JRip(), new PART(),
78 createAttributeSelectedClassifier()}).parallel().map(cls -> { 78 createAttributeSelectedClassifier()}).parallel().map(cls -> {
79 String name = cls.getClass().getSimpleName(); 79 String name = cls.getClass().getSimpleName();
80 - double acc = 0; 80 + double acc;
81 Evaluation eval; 81 Evaluation eval;
82 try { 82 try {
83 eval = new Evaluation(instances); 83 eval = new Evaluation(instances);
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt renamed to nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt
nicolas-lib/pom.xml
@@ -35,7 +35,7 @@ @@ -35,7 +35,7 @@
35 <!-- third party --> 35 <!-- third party -->
36 <dependency> 36 <dependency>
37 <groupId>nz.ac.waikato.cms.weka</groupId> 37 <groupId>nz.ac.waikato.cms.weka</groupId>
38 - <artifactId>weka-dev</artifactId> 38 + <artifactId>weka-stable</artifactId>
39 </dependency> 39 </dependency>
40 <dependency> 40 <dependency>
41 <groupId>org.apache.commons</groupId> 41 <groupId>org.apache.commons</groupId>
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java
@@ -18,18 +18,18 @@ import java.util.Set; @@ -18,18 +18,18 @@ import java.util.Set;
18 18
19 import static java.util.stream.Collectors.toList; 19 import static java.util.stream.Collectors.toList;
20 20
21 -public class ThriftUtils { 21 +public class InstanceUtils {
22 22
23 - private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); 23 + private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class);
24 24
25 - private ThriftUtils() { 25 + private InstanceUtils() {
26 } 26 }
27 27
28 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { 28 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
29 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 29 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
30 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); 30 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
31 31
32 - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); 32 + LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size());
33 Map<TMention, Instance> mention2instance = Maps.newHashMap(); 33 Map<TMention, Instance> mention2instance = Maps.newHashMap();
34 for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { 34 for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
35 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); 35 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
@@ -39,7 +39,7 @@ public class ThriftUtils { @@ -39,7 +39,7 @@ public class ThriftUtils {
39 } 39 }
40 mention2instance.put(tMention, instance); 40 mention2instance.put(tMention, instance);
41 } 41 }
42 - LOG.info("Extracted features of " + mention2instance.size() + " mentions."); 42 + LOG.info("Extracted features of {} mentions.", mention2instance.size());
43 return mention2instance; 43 return mention2instance;
44 } 44 }
45 45
@@ -47,7 +47,7 @@ public class ThriftUtils { @@ -47,7 +47,7 @@ public class ThriftUtils {
47 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 47 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
48 Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); 48 Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
49 49
50 - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); 50 + LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size());
51 Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); 51 Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
52 for (TSentence sentence : sentences) { 52 for (TSentence sentence : sentences) {
53 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); 53 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
@@ -57,7 +57,7 @@ public class ThriftUtils { @@ -57,7 +57,7 @@ public class ThriftUtils {
57 } 57 }
58 sentence2instance.put(sentence, instance); 58 sentence2instance.put(sentence, instance);
59 } 59 }
60 - LOG.info("Extracted features of " + sentence2instance.size() + " sentences."); 60 + LOG.info("Extracted features of {} sentences.", sentence2instance.size());
61 return sentence2instance; 61 return sentence2instance;
62 } 62 }
63 } 63 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -29,14 +29,18 @@ public class Nicolas { @@ -29,14 +29,18 @@ public class Nicolas {
29 private final SentenceFeatureExtractor sentenceFeatureExtractor; 29 private final SentenceFeatureExtractor sentenceFeatureExtractor;
30 private final ZeroFeatureExtractor zeroFeatureExtractor; 30 private final ZeroFeatureExtractor zeroFeatureExtractor;
31 31
32 - public Nicolas() throws IOException, ClassNotFoundException {  
33 - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);  
34 - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);  
35 - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);  
36 -  
37 - mentionFeatureExtractor = new MentionFeatureExtractor();  
38 - sentenceFeatureExtractor = new SentenceFeatureExtractor();  
39 - zeroFeatureExtractor = new ZeroFeatureExtractor(); 32 + public Nicolas() throws NicolasException {
  33 + try {
  34 + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  35 + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  36 + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
  37 +
  38 + mentionFeatureExtractor = new MentionFeatureExtractor();
  39 + sentenceFeatureExtractor = new SentenceFeatureExtractor();
  40 + zeroFeatureExtractor = new ZeroFeatureExtractor();
  41 + } catch (IOException e) {
  42 + throw new NicolasException(e);
  43 + }
40 } 44 }
41 45
42 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { 46 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException {
@@ -59,17 +63,17 @@ public class Nicolas { @@ -59,17 +63,17 @@ public class Nicolas {
59 } 63 }
60 64
61 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { 65 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {
62 - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 66 + List<TSentence> sentences = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
63 67
64 Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); 68 Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor);
65 69
66 - List<TSentence> sortedSents = Lists.newArrayList(sents);  
67 - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); 70 + List<TSentence> sortedSentences = Lists.newArrayList(sentences);
  71 + sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed());
68 72
69 int size = 0; 73 int size = 0;
70 Random r = new Random(1); 74 Random r = new Random(1);
71 Set<TSentence> summary = Sets.newHashSet(); 75 Set<TSentence> summary = Sets.newHashSet();
72 - for (TSentence sent : sortedSents) { 76 + for (TSentence sent : sortedSentences) {
73 size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); 77 size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
74 if (r.nextDouble() > 0.4 && size > targetSize) 78 if (r.nextDouble() > 0.4 && size > targetSize)
75 break; 79 break;
@@ -78,7 +82,7 @@ public class Nicolas { @@ -78,7 +82,7 @@ public class Nicolas {
78 break; 82 break;
79 } 83 }
80 List<TSentence> selectedSentences = Lists.newArrayList(); 84 List<TSentence> selectedSentences = Lists.newArrayList();
81 - for (TSentence sent : sents) { 85 + for (TSentence sent : sentences) {
82 if (summary.contains(sent)) 86 if (summary.contains(sent))
83 selectedSentences.add(sent); 87 selectedSentences.add(sent);
84 } 88 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +public class NicolasException extends Exception {
  4 + public NicolasException(Exception e) {
  5 + super(e);
  6 + }
  7 +}
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
@@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory; @@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 11 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
12 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 12 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 14 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 16 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
@@ -34,15 +35,15 @@ public class ApplyModel { @@ -34,15 +35,15 @@ public class ApplyModel {
34 private static final String TARGET_DIR = "corpora/summaries"; 35 private static final String TARGET_DIR = "corpora/summaries";
35 36
36 public static void main(String[] args) throws Exception { 37 public static void main(String[] args) throws Exception {
37 - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); 38 + Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
38 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); 39 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
39 40
40 - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH); 41 + Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
41 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); 42 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
42 43
43 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); 44 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
44 45
45 - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); 46 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH));
46 int i = 1; 47 int i = 1;
47 double avgSize = 0; 48 double avgSize = 0;
48 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 49 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
@@ -91,7 +92,7 @@ public class ApplyModel { @@ -91,7 +92,7 @@ public class ApplyModel {
91 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 92 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
92 93
93 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); 94 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
94 - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); 95 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
95 96
96 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 97 Map<TSentence, Double> sentence2score = Maps.newHashMap();
97 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 98 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2
3 import com.google.common.collect.*; 3 import com.google.common.collect.*;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common.features; 1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2
3 import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; 3 import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
4 4
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -5,9 +5,9 @@ import com.google.common.collect.Maps; @@ -5,9 +5,9 @@ import com.google.common.collect.Maps;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 6 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 7 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
8 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;  
9 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;  
10 -import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; 8 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
11 import weka.core.Attribute; 11 import weka.core.Attribute;
12 12
13 import java.io.IOException; 13 import java.io.IOException;
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,7 +5,7 @@ import org.slf4j.Logger; @@ -5,7 +5,7 @@ import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 8 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 import weka.classifiers.Classifier; 10 import weka.classifiers.Classifier;
11 import weka.core.Instance; 11 import weka.core.Instance;
@@ -25,7 +25,7 @@ public class MentionModel { @@ -25,7 +25,7 @@ public class MentionModel {
25 Set<TMention> goodMentions = Sets.newHashSet(); 25 Set<TMention> goodMentions = Sets.newHashSet();
26 26
27 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 27 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
28 - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor); 28 + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor);
29 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { 29 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
30 Instance instance = entry.getValue(); 30 Instance instance = entry.getValue();
31 instance.setDataset(instances); 31 instance.setDataset(instances);
@@ -34,7 +34,7 @@ public class MentionModel { @@ -34,7 +34,7 @@ public class MentionModel {
34 if (good) 34 if (good)
35 goodMentions.add(entry.getKey()); 35 goodMentions.add(entry.getKey());
36 } 36 }
37 - LOG.info("Classified " + goodMentions.size() + " mentions as good."); 37 + LOG.info("Classified {} mentions as good.", goodMentions.size());
38 return goodMentions; 38 return goodMentions;
39 } 39 }
40 40
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;  
6 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 5 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
7 import weka.core.Attribute; 7 import weka.core.Attribute;
8 8
9 import java.util.List; 9 import java.util.List;
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory; @@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 9 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
11 import weka.classifiers.Classifier; 11 import weka.classifiers.Classifier;
12 import weka.core.Instance; 12 import weka.core.Instance;
@@ -24,7 +24,7 @@ public class SentenceModel { @@ -24,7 +24,7 @@ public class SentenceModel {
24 24
25 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { 25 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
26 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); 26 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
27 - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); 27 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
28 28
29 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 29 Map<TSentence, Double> sentence2score = Maps.newHashMap();
30 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 30 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 10 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
11 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;  
12 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 11 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
13 import weka.core.Attribute; 13 import weka.core.Attribute;
14 14
15 import java.util.List; 15 import java.util.List;
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils;
5 import org.junit.Test; 5 import org.junit.Test;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
9 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 8 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 10
11 import java.io.IOException; 11 import java.io.IOException;
12 import java.io.InputStream; 12 import java.io.InputStream;
@@ -47,7 +47,7 @@ public class CandidateFinderTest { @@ -47,7 +47,7 @@ public class CandidateFinderTest {
47 47
48 private FeatureHelper loadSampleTextHelper() throws IOException { 48 private FeatureHelper loadSampleTextHelper() throws IOException {
49 try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { 49 try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
50 - return new FeatureHelper(Utils.loadThrifted(stream)); 50 + return new FeatureHelper(ThriftUtils.loadThriftTextFromStream(stream));
51 } 51 }
52 } 52 }
53 } 53 }
54 \ No newline at end of file 54 \ No newline at end of file
nicolas-train/pom.xml
@@ -25,6 +25,11 @@ @@ -25,6 +25,11 @@
25 <groupId>pl.waw.ipipan.zil.summ</groupId> 25 <groupId>pl.waw.ipipan.zil.summ</groupId>
26 <artifactId>nicolas-multiservice</artifactId> 26 <artifactId>nicolas-multiservice</artifactId>
27 </dependency> 27 </dependency>
  28 + <dependency>
  29 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  30 + <artifactId>nicolas-model</artifactId>
  31 + <scope>runtime</scope>
  32 + </dependency>
28 33
29 <!-- internal --> 34 <!-- internal -->
30 <dependency> 35 <dependency>
@@ -39,7 +44,7 @@ @@ -39,7 +44,7 @@
39 <!-- third party --> 44 <!-- third party -->
40 <dependency> 45 <dependency>
41 <groupId>nz.ac.waikato.cms.weka</groupId> 46 <groupId>nz.ac.waikato.cms.weka</groupId>
42 - <artifactId>weka-dev</artifactId> 47 + <artifactId>weka-stable</artifactId>
43 </dependency> 48 </dependency>
44 <dependency> 49 <dependency>
45 <groupId>org.apache.commons</groupId> 50 <groupId>org.apache.commons</groupId>
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
@@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest; @@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest;
5 5
6 public class ModelConstants { 6 public class ModelConstants {
7 7
8 - public static final String MENTION_DATASET_PATH = "mentions_train.arff";  
9 - public static final String SENTENCE_DATASET_PATH = "sentences_train.arff";  
10 - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; 8 + public static final String MENTION_DATASET_PATH = "data/arff/mentions_train.arff";
  9 + public static final String SENTENCE_DATASET_PATH = "data/arff/sentences_train.arff";
  10 + public static final String ZERO_DATASET_PATH = "data/arff/zeros_train.arff";
11 11
12 - private static final int NUM_ITERATIONS = 16; 12 + private static final int NUM_ITERATIONS = 250;
13 private static final int NUM_EXECUTION_SLOTS = 8; 13 private static final int NUM_EXECUTION_SLOTS = 8;
14 private static final int SEED = 0; 14 private static final int SEED = 0;
15 15
@@ -26,17 +26,17 @@ public class ModelConstants { @@ -26,17 +26,17 @@ public class ModelConstants {
26 26
27 public static Classifier getSentenceClassifier() { 27 public static Classifier getSentenceClassifier() {
28 RandomForest classifier = new RandomForest(); 28 RandomForest classifier = new RandomForest();
29 - classifier.setNumIterations(16);  
30 - classifier.setSeed(0);  
31 - classifier.setNumExecutionSlots(8); 29 + classifier.setNumIterations(NUM_ITERATIONS);
  30 + classifier.setSeed(SEED);
  31 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
32 return classifier; 32 return classifier;
33 } 33 }
34 34
35 public static Classifier getZeroClassifier() { 35 public static Classifier getZeroClassifier() {
36 RandomForest classifier = new RandomForest(); 36 RandomForest classifier = new RandomForest();
37 - classifier.setNumIterations(16);  
38 - classifier.setSeed(0);  
39 - classifier.setNumExecutionSlots(8); 37 + classifier.setNumIterations(NUM_ITERATIONS);
  38 + classifier.setSeed(SEED);
  39 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
40 return classifier; 40 return classifier;
41 } 41 }
42 42
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
@@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common; @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;  
7 import weka.classifiers.Classifier; 6 import weka.classifiers.Classifier;
8 import weka.core.Instances; 7 import weka.core.Instances;
9 import weka.core.converters.ArffLoader; 8 import weka.core.converters.ArffLoader;
@@ -16,7 +15,7 @@ import java.util.logging.LogManager; @@ -16,7 +15,7 @@ import java.util.logging.LogManager;
16 @SuppressWarnings("squid:S2118") 15 @SuppressWarnings("squid:S2118")
17 public class TrainModelCommon { 16 public class TrainModelCommon {
18 17
19 - private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class); 18 + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class);
20 19
21 private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; 20 private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";
22 21
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;  
2 -  
3 -import com.google.common.base.Charsets;  
4 -import com.google.common.collect.Maps;  
5 -import com.google.common.io.Files;  
6 -import org.slf4j.Logger;  
7 -import org.slf4j.LoggerFactory;  
8 -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;  
9 -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;  
10 -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;  
11 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
12 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
13 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;  
14 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
15 -import weka.core.Instance;  
16 -import weka.core.Instances;  
17 -import weka.core.converters.ArffSaver;  
18 -  
19 -import java.io.File;  
20 -import java.io.IOException;  
21 -import java.util.Map;  
22 -  
23 -  
24 -public class PrepareTrainingData {  
25 -  
26 - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);  
27 -  
28 - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";  
29 - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";  
30 -  
31 - private PrepareTrainingData() {  
32 - }  
33 -  
34 - public static void main(String[] args) throws IOException {  
35 -  
36 - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);  
37 - Map<String, String> id2optimalSummary = loadOptimalSummaries();  
38 -  
39 - MentionScorer mentionScorer = new MentionScorer();  
40 - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();  
41 -  
42 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());  
43 -  
44 - int i = 1;  
45 - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {  
46 - LOG.info(i++ + "/" + id2preprocessedText.size());  
47 -  
48 - String id = entry.getKey();  
49 - TText preprocessedText = entry.getValue();  
50 - String optimalSummary = id2optimalSummary.get(id);  
51 - if (optimalSummary == null)  
52 - continue;  
53 - Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);  
54 -  
55 - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);  
56 - for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {  
57 - TMention mention = entry2.getKey();  
58 - Instance instance = entry2.getValue();  
59 - instance.setDataset(instances);  
60 - instance.setClassValue(mention2score.get(mention));  
61 - instances.add(instance);  
62 - }  
63 - }  
64 - saveInstancesToFile(instances);  
65 - }  
66 -  
67 - private static void saveInstancesToFile(Instances instances) throws IOException {  
68 - ArffSaver saver = new ArffSaver();  
69 - saver.setInstances(instances);  
70 - saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH));  
71 - saver.writeBatch();  
72 - }  
73 -  
74 - private static Map<String, String> loadOptimalSummaries() throws IOException {  
75 - Map<String, String> id2optimalSummary = Maps.newHashMap();  
76 - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {  
77 - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);  
78 - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);  
79 - }  
80 - LOG.info(id2optimalSummary.size() + " optimal summaries found.");  
81 - return id2optimalSummary;  
82 - }  
83 -  
84 -  
85 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;  
2 -  
3 -import com.google.common.collect.Maps;  
4 -import com.google.common.collect.Sets;  
5 -import org.apache.commons.io.IOUtils;  
6 -import org.slf4j.Logger;  
7 -import org.slf4j.LoggerFactory;  
8 -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;  
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
10 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;  
11 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
12 -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;  
13 -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;  
14 -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;  
15 -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;  
16 -import weka.core.Instance;  
17 -import weka.core.Instances;  
18 -import weka.core.converters.ArffSaver;  
19 -  
20 -import java.io.File;  
21 -import java.io.FileReader;  
22 -import java.io.IOException;  
23 -import java.util.List;  
24 -import java.util.Map;  
25 -import java.util.Set;  
26 -  
27 -public class PrepareTrainingData {  
28 -  
29 - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);  
30 -  
31 - private static final String IDS_PATH = "corpora/summaries_dev";  
32 - private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/";  
33 - private static final String GOLD_ZEROS_PATH = "/zeros.tsv";  
34 -  
35 - private PrepareTrainingData() {  
36 - }  
37 -  
38 - public static void main(String[] args) throws IOException {  
39 -  
40 - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);  
41 - Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH);  
42 -  
43 - ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH);  
44 - ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();  
45 -  
46 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());  
47 -  
48 - int i = 1;  
49 - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {  
50 - LOG.info(i++ + "/" + id2preprocessedText.size());  
51 -  
52 - String textId = entry.getKey();  
53 -  
54 - TText text = entry.getValue();  
55 - Set<String> sentenceIds = id2sentIds.get(textId);  
56 - FeatureHelper featureHelper = new FeatureHelper(text);  
57 -  
58 - List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);  
59 - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);  
60 -  
61 - for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {  
62 - boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);  
63 - Instance instance = entry2.getValue();  
64 - instance.setDataset(instances);  
65 - instance.setClassValue(good ? 1 : 0);  
66 - instances.add(instance);  
67 - }  
68 - }  
69 -  
70 - saveInstancesToFile(instances);  
71 - }  
72 -  
73 -  
74 - private static void saveInstancesToFile(Instances instances) throws IOException {  
75 - ArffSaver saver = new ArffSaver();  
76 - saver.setInstances(instances);  
77 - saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH));  
78 - saver.writeBatch();  
79 - }  
80 -  
81 - private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {  
82 - Map<String, Set<String>> result = Maps.newHashMap();  
83 - for (File f : new File(idsPath).listFiles()) {  
84 - String id = f.getName().split("_")[0];  
85 - List<String> sentenceIds = IOUtils.readLines(new FileReader(f));  
86 - result.put(id, Sets.newHashSet(sentenceIds));  
87 - }  
88 - return result;  
89 - }  
90 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
@@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser; @@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser;
6 import org.apache.commons.csv.CSVRecord; 6 import org.apache.commons.csv.CSVRecord;
7 import org.apache.commons.csv.QuoteMode; 7 import org.apache.commons.csv.QuoteMode;
8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
11 11
12 import java.io.IOException; 12 import java.io.IOException;
@@ -21,8 +21,8 @@ public class ZeroScorer { @@ -21,8 +21,8 @@ public class ZeroScorer {
21 21
22 private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); 22 private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap();
23 23
24 - public ZeroScorer(String goldZerosPath) throws IOException {  
25 - try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath); 24 + public ZeroScorer(String goldZerosResourcePath) throws IOException {
  25 + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosResourcePath);
26 InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); 26 InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING);
27 CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { 27 CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) {
28 List<CSVRecord> records = parser.getRecords(); 28 List<CSVRecord> records = parser.getRecords();
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadAndPreprocessCorpus.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import net.lingala.zip4j.core.ZipFile; 3 import net.lingala.zip4j.core.ZipFile;
4 import org.apache.commons.io.FileUtils; 4 import org.apache.commons.io.FileUtils;
5 import org.slf4j.Logger; 5 import org.slf4j.Logger;
6 import org.slf4j.LoggerFactory; 6 import org.slf4j.LoggerFactory;
7 -import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main;  
8 7
9 import java.io.File; 8 import java.io.File;
10 import java.net.URL; 9 import java.net.URL;
@@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus { @@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus {
45 44
46 File preprocessed = new File(WORKING_DIR, "preprocessed"); 45 File preprocessed = new File(WORKING_DIR, "preprocessed");
47 createFolder(preprocessed.getPath()); 46 createFolder(preprocessed.getPath());
48 - Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); 47 + Preprocess.main(new String[]{dataDir.getPath(), preprocessed.getPath()});
49 } 48 }
50 49
51 private static File createFolder(String path) { 50 private static File createFolder(String path) {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import com.google.common.base.Charsets; 3 import com.google.common.base.Charsets;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
5 import com.google.common.io.Files; 6 import com.google.common.io.Files;
  7 +import org.apache.commons.io.IOUtils;
6 import org.slf4j.Logger; 8 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 9 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 13 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
12 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 14 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  15 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 16 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  17 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 18 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 19 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 20 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
17 import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; 21 import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  22 +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.MentionScorer;
  23 +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.SentenceScorer;
  24 +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.ZeroScorer;
  25 +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
  26 +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
  27 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  28 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
18 import weka.classifiers.Classifier; 29 import weka.classifiers.Classifier;
19 import weka.core.Instance; 30 import weka.core.Instance;
20 import weka.core.Instances; 31 import weka.core.Instances;
21 import weka.core.converters.ArffSaver; 32 import weka.core.converters.ArffSaver;
22 33
23 import java.io.File; 34 import java.io.File;
  35 +import java.io.FileReader;
24 import java.io.IOException; 36 import java.io.IOException;
  37 +import java.io.InputStream;
  38 +import java.util.List;
25 import java.util.Map; 39 import java.util.Map;
26 import java.util.Set; 40 import java.util.Set;
27 - 41 +import java.util.function.Predicate;
  42 +import java.util.stream.Collectors;
28 43
29 public class PrepareTrainingData { 44 public class PrepareTrainingData {
30 45
31 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); 46 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
32 47
33 - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";  
34 - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; 48 + private static final String THRIFT_TEXTS_PATH = "data/preprocessed";
  49 + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "data/summaries-optimal";
  50 + private static final String SUMMARY_SENTENCE_IDS = "data/summaries-sentence-ids";
  51 +
  52 + private static final String ZERO_TRAINING_DATA_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv";
  53 + private static final String TRAIN_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt";
35 54
36 private PrepareTrainingData() { 55 private PrepareTrainingData() {
37 } 56 }
38 57
39 public static void main(String[] args) throws Exception { 58 public static void main(String[] args) throws Exception {
  59 + Set<String> trainTextIds = loadTrainTextIds();
  60 +
  61 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(THRIFT_TEXTS_PATH), trainTextIds::contains);
  62 + Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains);
  63 +
  64 + prepareMentionsDataset(id2preprocessedText, id2optimalSummary);
  65 + prepareSentencesDataset(id2preprocessedText, id2optimalSummary);
  66 + prepareZerosDataset(id2preprocessedText);
  67 + }
  68 +
  69 + public static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException {
  70 + MentionScorer mentionScorer = new MentionScorer();
  71 + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
  72 +
  73 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  74 +
  75 + int i = 1;
  76 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  77 + LOG.info("{}/{}", i++, id2preprocessedText.size());
  78 +
  79 + String id = entry.getKey();
  80 + TText preprocessedText = entry.getValue();
  81 + String optimalSummary = id2optimalSummary.get(id);
  82 + if (optimalSummary == null)
  83 + continue;
  84 + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
  85 +
  86 + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
  87 + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {
  88 + TMention mention = entry2.getKey();
  89 + Instance instance = entry2.getValue();
  90 + instance.setDataset(instances);
  91 + instance.setClassValue(mention2score.get(mention));
  92 + instances.add(instance);
  93 + }
  94 + }
  95 + saveInstancesToFile(instances, new File(ModelConstants.MENTION_DATASET_PATH));
  96 + }
  97 +
  98 + private static Set<String> loadTrainTextIds() throws IOException {
  99 + try (InputStream inputStream = PrepareTrainingData.class.getResourceAsStream(TRAIN_TEXT_IDS_RESOURCE_PATH)) {
  100 + List<String> testTextIds = IOUtils.readLines(inputStream, Constants.ENCODING);
  101 + return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
  102 + }
  103 + }
40 104
41 - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);  
42 - Map<String, String> id2optimalSummary = loadOptimalSummaries(); 105 + public static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
43 106
44 SentenceScorer sentenceScorer = new SentenceScorer(); 107 SentenceScorer sentenceScorer = new SentenceScorer();
45 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); 108 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
46 109
47 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 110 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
48 111
49 - Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); 112 + Classifier classifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
50 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); 113 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();
51 114
52 int i = 1; 115 int i = 1;
53 for (String textId : id2preprocessedText.keySet()) { 116 for (String textId : id2preprocessedText.keySet()) {
54 - LOG.info(i++ + "/" + id2preprocessedText.size()); 117 + LOG.info("{}/{}", i++, id2preprocessedText.size());
55 118
56 TText preprocessedText = id2preprocessedText.get(textId); 119 TText preprocessedText = id2preprocessedText.get(textId);
57 String optimalSummary = id2optimalSummary.get(textId); 120 String optimalSummary = id2optimalSummary.get(textId);
@@ -64,7 +127,7 @@ public class PrepareTrainingData { @@ -64,7 +127,7 @@ public class PrepareTrainingData {
64 // Set<TMention> goodMentions 127 // Set<TMention> goodMentions
65 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); 128 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true);
66 129
67 - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); 130 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
68 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 131 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
69 TSentence sentence = entry.getKey(); 132 TSentence sentence = entry.getKey();
70 Instance instance = entry.getValue(); 133 Instance instance = entry.getValue();
@@ -73,25 +136,74 @@ public class PrepareTrainingData { @@ -73,25 +136,74 @@ public class PrepareTrainingData {
73 instances.add(instance); 136 instances.add(instance);
74 } 137 }
75 } 138 }
76 - saveInstancesToFile(instances); 139 + saveInstancesToFile(instances, new File(ModelConstants.SENTENCE_DATASET_PATH));
  140 + }
  141 +
  142 + public static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException {
  143 +
  144 + Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS);
  145 +
  146 + ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_DATA_RESOURCE_PATH);
  147 + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
  148 +
  149 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  150 +
  151 + int i = 1;
  152 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  153 + LOG.info(i++ + "/" + id2preprocessedText.size());
  154 +
  155 + String textId = entry.getKey();
  156 +
  157 + TText text = entry.getValue();
  158 + Set<String> sentenceIds = id2sentIds.get(textId);
  159 + FeatureHelper featureHelper = new FeatureHelper(text);
  160 +
  161 + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
  162 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  163 +
  164 + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
  165 + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
  166 + Instance instance = entry2.getValue();
  167 + instance.setDataset(instances);
  168 + instance.setClassValue(good ? 1 : 0);
  169 + instances.add(instance);
  170 + }
  171 + }
  172 +
  173 + saveInstancesToFile(instances, new File(ModelConstants.ZERO_DATASET_PATH));
  174 + }
  175 +
  176 + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
  177 + Map<String, Set<String>> result = Maps.newHashMap();
  178 + File[] files = new File(idsPath).listFiles();
  179 + if (files != null)
  180 + for (File f : files) {
  181 + String id = f.getName().split("_")[0];
  182 + List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
  183 + result.put(id, Sets.newHashSet(sentenceIds));
  184 + }
  185 + return result;
77 } 186 }
78 187
79 - private static void saveInstancesToFile(Instances instances) throws IOException { 188 + private static void saveInstancesToFile(Instances instances, File targetFile) throws IOException {
80 ArffSaver saver = new ArffSaver(); 189 ArffSaver saver = new ArffSaver();
81 saver.setInstances(instances); 190 saver.setInstances(instances);
82 - saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH)); 191 + saver.setFile(targetFile);
83 saver.writeBatch(); 192 saver.writeBatch();
84 } 193 }
85 194
86 - private static Map<String, String> loadOptimalSummaries() throws IOException { 195 + private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException {
87 Map<String, String> id2optimalSummary = Maps.newHashMap(); 196 Map<String, String> id2optimalSummary = Maps.newHashMap();
88 - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {  
89 - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);  
90 - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);  
91 - }  
92 - LOG.info(id2optimalSummary.size() + " optimal summaries found."); 197 + File[] files = new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles();
  198 + if (files != null)
  199 + for (File optimalSummaryFile : files) {
  200 + String textId = optimalSummaryFile.getName().split("_")[0];
  201 + if (!idFilter.test(textId))
  202 + continue;
  203 + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
  204 + id2optimalSummary.put(textId, optimalSummary);
  205 + }
  206 + LOG.info("{} optimal summaries found.", id2optimalSummary.size());
93 return id2optimalSummary; 207 return id2optimalSummary;
94 } 208 }
95 -  
96 -  
97 } 209 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/Preprocess.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.preprocess; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import org.slf4j.Logger; 3 import org.slf4j.Logger;
4 import org.slf4j.LoggerFactory; 4 import org.slf4j.LoggerFactory;
@@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; @@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
9 import java.io.File; 9 import java.io.File;
10 import java.util.Arrays; 10 import java.util.Arrays;
11 11
12 -public class Main { 12 +public class Preprocess {
13 13
14 - private static final Logger LOG = LoggerFactory.getLogger(Main.class); 14 + private static final Logger LOG = LoggerFactory.getLogger(Preprocess.class);
15 15
16 private static final String CORPUS_FILE_SUFFIX = ".xml"; 16 private static final String CORPUS_FILE_SUFFIX = ".xml";
17 private static final String OUTPUT_FILE_SUFFIX = ".thrift"; 17 private static final String OUTPUT_FILE_SUFFIX = ".thrift";
18 18
19 - private Main() { 19 + private Preprocess() {
20 } 20 }
21 21
22 public static void main(String[] args) { 22 public static void main(String[] args) {
23 if (args.length != 2) { 23 if (args.length != 2) {
24 - LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir"); 24 + LOG.error("Wrong usage! Try " + Preprocess.class.getSimpleName() + " dirWithCorpusFiles targetDir");
25 return; 25 return;
26 } 26 }
27 File corpusDir = new File(args[0]); 27 File corpusDir = new File(args[0]);
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel; 3 import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel;
4 import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel; 4 import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.search;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
4 -  
5 -  
6 -public class CrossvalidateMention {  
7 -  
8 - private CrossvalidateMention() {  
9 - }  
10 -  
11 - public static void main(String[] args) throws Exception {  
12 - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH);  
13 - }  
14 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.search;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
4 -  
5 -  
6 -public class CrossvalidateSentence {  
7 -  
8 - private CrossvalidateSentence() {  
9 - }  
10 -  
11 - public static void main(String[] args) throws Exception {  
12 - CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH);  
13 - }  
14 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.search;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
4 -  
5 -  
6 -public class CrossvalidateZero {  
7 -  
8 - private CrossvalidateZero() {  
9 - }  
10 -  
11 - public static void main(String[] args) throws Exception {  
12 - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH);  
13 - }  
14 -}  
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv
@@ -18,6 +18,7 @@ @@ -18,6 +18,7 @@
18 <module>nicolas-train</module> 18 <module>nicolas-train</module>
19 <module>nicolas-common</module> 19 <module>nicolas-common</module>
20 <module>nicolas-multiservice</module> 20 <module>nicolas-multiservice</module>
  21 + <module>nicolas-eval</module>
21 </modules> 22 </modules>
22 23
23 <properties> 24 <properties>
@@ -27,10 +28,11 @@ @@ -27,10 +28,11 @@
27 28
28 <pscapi.version>1.0</pscapi.version> 29 <pscapi.version>1.0</pscapi.version>
29 <utils.version>1.0</utils.version> 30 <utils.version>1.0</utils.version>
  31 + <eval.version>1.0</eval.version>
30 32
31 <commons-csv.version>1.4</commons-csv.version> 33 <commons-csv.version>1.4</commons-csv.version>
32 <guava.version>21.0</guava.version> 34 <guava.version>21.0</guava.version>
33 - <weka-dev.version>3.9.1</weka-dev.version> 35 + <weka-stable.version>3.8.1</weka-stable.version>
34 <commons-lang3.version>3.5</commons-lang3.version> 36 <commons-lang3.version>3.5</commons-lang3.version>
35 <commons-io.version>2.5</commons-io.version> 37 <commons-io.version>2.5</commons-io.version>
36 <slf4j-api.version>1.7.22</slf4j-api.version> 38 <slf4j-api.version>1.7.22</slf4j-api.version>
@@ -98,6 +100,11 @@ @@ -98,6 +100,11 @@
98 <artifactId>utils</artifactId> 100 <artifactId>utils</artifactId>
99 <version>${utils.version}</version> 101 <version>${utils.version}</version>
100 </dependency> 102 </dependency>
  103 + <dependency>
  104 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  105 + <artifactId>eval</artifactId>
  106 + <version>${eval.version}</version>
  107 + </dependency>
101 108
102 <!-- third party --> 109 <!-- third party -->
103 <dependency> 110 <dependency>
@@ -112,8 +119,8 @@ @@ -112,8 +119,8 @@
112 </dependency> 119 </dependency>
113 <dependency> 120 <dependency>
114 <groupId>nz.ac.waikato.cms.weka</groupId> 121 <groupId>nz.ac.waikato.cms.weka</groupId>
115 - <artifactId>weka-dev</artifactId>  
116 - <version>${weka-dev.version}</version> 122 + <artifactId>weka-stable</artifactId>
  123 + <version>${weka-stable.version}</version>
117 <exclusions> 124 <exclusions>
118 <exclusion> 125 <exclusion>
119 <groupId>org.slf4j</groupId> 126 <groupId>org.slf4j</groupId>