Commit 89870bd0ed20b4282b3bd02c08df70ef087bee30

Authored by Mateusz Kopeć
1 parent 08d128a5

finished evaluation script

1 #!/usr/bin/env bash 1 #!/usr/bin/env bash
2 2
  3 +mvn install -Dmaven.test.skip=true
  4 +mvn -pl nicolas-train exec:java -Dexec.mainClass="pl.waw.ipipan.zil.summ.nicolas.eval.Main"
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
  4 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  5 +
  6 +import java.io.File;
  7 +import java.io.IOException;
  8 +import java.util.Arrays;
  9 +import java.util.List;
  10 +import java.util.Set;
  11 +import java.util.stream.Collectors;
  12 +
  13 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR;
  14 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.OPTIMAL_SUMMARIES_DIR;
  15 +
  16 +public class CorpusHelper {
  17 +
  18 + private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
  19 + private static final String EXTRACT_SUMMARY_TYPE = "extract";
  20 +
  21 + private static final int SUMMARY_RATIO = 20;
  22 +
  23 + private CorpusHelper() {
  24 + }
  25 +
  26 + public static boolean isTest(Text text) {
  27 + return text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
  28 + }
  29 +
  30 + public static List<Summary> getExtractSummaries(Text text) {
  31 + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(EXTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  32 + }
  33 +
  34 + public static List<Summary> getAbstractSummaries(Text text) {
  35 + return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  36 + }
  37 +
  38 + public static Set<String> loadTrainTextIds() throws IOException {
  39 + return collectIdsFromFolder(OPTIMAL_SUMMARIES_DIR);
  40 + }
  41 +
  42 + public static Set<String> loadTestTextIds() throws IOException {
  43 + return collectIdsFromFolder(GOLD_TEST_SUMMARIES_DIR);
  44 + }
  45 +
  46 + private static Set<String> collectIdsFromFolder(File folder) throws IOException {
  47 + File[] optimalSummaries = folder.listFiles();
  48 + if (optimalSummaries == null)
  49 + throw new IOException("No summaries at " + folder);
  50 +
  51 + return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet());
  52 + }
  53 +
  54 +
  55 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
@@ -18,18 +18,20 @@ public class PathConstants { @@ -18,18 +18,20 @@ public class PathConstants {
18 public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; 18 public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip";
19 public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; 19 public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip";
20 public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; 20 public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv";
  21 + public static final String COMPETITOR_SUMMARIES_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=competitor-summaries.zip";
21 22
22 public static final File WORKING_DIR = new File("data"); 23 public static final File WORKING_DIR = new File("data");
23 -  
24 public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); 24 public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip");
25 public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); 25 public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip");
26 public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); 26 public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip");
  27 + public static final File ZIPPED_COMPETITOR_SUMMARIES_FILE = new File(WORKING_DIR, "competitor-summaries.zip");
27 28
28 public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); 29 public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus");
29 public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); 30 public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data");
30 public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); 31 public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids");
31 public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); 32 public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed");
32 public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); 33 public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold");
  34 + public static final File SYSTEM_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-system");
33 public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); 35 public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold");
34 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); 36 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal");
35 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); 37 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv");
@@ -39,6 +41,8 @@ public class PathConstants { @@ -39,6 +41,8 @@ public class PathConstants {
39 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); 41 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff");
40 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); 42 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff");
41 43
  44 + public static final File SUMMARY_LENGTHS_FILE = new File(WORKING_DIR, "summary-lengths.tsv");
  45 +
42 private PathConstants() { 46 private PathConstants() {
43 } 47 }
44 48
@@ -67,13 +71,9 @@ public class PathConstants { @@ -67,13 +71,9 @@ public class PathConstants {
67 } 71 }
68 72
69 private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { 73 private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException {
70 - if (targetDir.exists()) {  
71 - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir);  
72 - } else {  
73 - createFolder(targetDir);  
74 - ZipFile zipFile = new ZipFile(targetZipFile);  
75 - zipFile.extractAll(targetDir.getPath());  
76 - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);  
77 - } 74 + createFolder(targetDir);
  75 + ZipFile zipFile = new ZipFile(targetZipFile);
  76 + zipFile.extractAll(targetDir.getPath());
  77 + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);
78 } 78 }
79 } 79 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import com.google.common.io.Files;
  4 +import org.apache.commons.csv.CSVFormat;
  5 +import org.apache.commons.csv.CSVPrinter;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  10 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  11 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  12 +
  13 +import javax.xml.bind.JAXBException;
  14 +import java.io.File;
  15 +import java.io.FileWriter;
  16 +import java.io.IOException;
  17 +import java.util.ArrayList;
  18 +import java.util.HashMap;
  19 +import java.util.List;
  20 +import java.util.Map;
  21 +
  22 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
  23 +
  24 +public class CalculateSystemSummaryLengths {
  25 +
  26 + private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class);
  27 +
  28 + private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId",
  29 + "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t');
  30 +
  31 + private CalculateSystemSummaryLengths() {
  32 + }
  33 +
  34 + public static void main(String[] args) throws IOException {
  35 + Map<String, Integer> textId2wordCount = loadTextLengths(EXTRACTED_CORPUS_DATA_DIR);
  36 +
  37 + try (CSVPrinter printer = new CSVPrinter(new FileWriter(SUMMARY_LENGTHS_FILE), CSV_FORMAT)) {
  38 + File[] files = SYSTEM_TEST_SUMMARIES_DIR.listFiles();
  39 + if (files == null) {
  40 + throw new IOException("No summaries in " + SYSTEM_TEST_SUMMARIES_DIR);
  41 + }
  42 + for (File summaryFile : files) {
  43 + writeLengths(textId2wordCount, printer, summaryFile);
  44 + }
  45 +
  46 + } catch (IOException ex) {
  47 + LOG.error("Error creating target file: " + ex);
  48 + }
  49 + }
  50 +
  51 + private static void writeLengths(Map<String, Integer> textId2wordCount, CSVPrinter printer,
  52 + File summaryFile) throws IOException {
  53 + String[] split = summaryFile.getName().split("[._]");
  54 + String textId = split[0];
  55 + String systemName = split[1];
  56 + String body = Files.toString(summaryFile, Constants.ENCODING);
  57 +
  58 + List<Object> record = new ArrayList<>();
  59 + record.add(textId);
  60 + int textWC = textId2wordCount.get(textId);
  61 + record.add(textWC);
  62 + record.add("automatic");
  63 + record.add(systemName);
  64 + int sumWC = TextUtils.tokenize(body).size();
  65 + record.add(sumWC);
  66 + record.add(sumWC * 1.0 / textWC);
  67 + printer.printRecord(record);
  68 + }
  69 +
  70 + private static Map<String, Integer> loadTextLengths(File manualCorpusDir) throws IOException {
  71 + Map<String, Integer> textId2wordCount = new HashMap<>();
  72 + File[] files = manualCorpusDir.listFiles();
  73 + if (files == null) {
  74 + throw new IOException("No summaries in " + manualCorpusDir);
  75 + }
  76 + for (File file : files) {
  77 + try {
  78 + Text text = PSC_IO.readText(file);
  79 + textId2wordCount.put(text.getId(), TextUtils.tokenize(text.getBody()).size());
  80 + } catch (IOException | JAXBException e) {
  81 + LOG.error("Error reading manual summaries: " + e);
  82 + }
  83 + }
  84 + return textId2wordCount;
  85 + }
  86 +
  87 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.eval;  
2 -  
3 -import org.apache.commons.io.IOUtils;  
4 -  
5 -import java.io.IOException;  
6 -import java.io.InputStream;  
7 -import java.util.List;  
8 -import java.util.Set;  
9 -import java.util.stream.Collectors;  
10 -  
11 -public class Constants {  
12 -  
13 - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt";  
14 -  
15 - private Constants() {  
16 - }  
17 -  
18 - public static Set<String> loadTestTextIds() throws IOException {  
19 - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {  
20 - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING);  
21 - return testTextIds.stream().map(String::trim).collect(Collectors.toSet());  
22 - }  
23 - }  
24 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
  4 +
  5 +public class DownloadCompetingSummaries {
  6 +
  7 + private DownloadCompetingSummaries() {
  8 + }
  9 +
  10 + public static void main(String[] args) throws Exception {
  11 + downloadFileAndExtract(COMPETITOR_SUMMARIES_DOWNLOAD_URL, ZIPPED_COMPETITOR_SUMMARIES_FILE, SYSTEM_TEST_SUMMARIES_DIR);
  12 + }
  13 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
@@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval; @@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval;
2 2
3 import pl.waw.ipipan.zil.summ.eval.Main; 3 import pl.waw.ipipan.zil.summ.eval.Main;
4 4
  5 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR;
  6 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR;
  7 +
5 public class Evaluate { 8 public class Evaluate {
6 9
7 private Evaluate() { 10 private Evaluate() {
8 } 11 }
9 12
10 public static void main(String[] args) { 13 public static void main(String[] args) {
11 - String goldDirPath = "data/summaries-gold";  
12 - String systemDirPath = "data/summaries"; 14 + String goldDirPath = GOLD_TEST_SUMMARIES_DIR.getAbsolutePath();
  15 + String systemDirPath = SYSTEM_TEST_SUMMARIES_DIR.getAbsolutePath();
13 Main.main(new String[]{goldDirPath, systemDirPath}); 16 Main.main(new String[]{goldDirPath, systemDirPath});
14 } 17 }
15 } 18 }
16 \ No newline at end of file 19 \ No newline at end of file
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +public class Main {
  4 +
  5 + private Main() {
  6 + }
  7 +
  8 + public static void main(String[] args) throws Exception {
  9 + SummarizeTestCorpus.main(args);
  10 + DownloadCompetingSummaries.main(args);
  11 + CalculateSystemSummaryLengths.main(args);
  12 + Evaluate.main(args);
  13 + }
  14 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -5,8 +5,10 @@ import org.slf4j.Logger; @@ -5,8 +5,10 @@ import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas; 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 import pl.waw.ipipan.zil.summ.nicolas.NicolasException; 10 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
  11 +import pl.waw.ipipan.zil.summ.nicolas.PathConstants;
10 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; 12 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
11 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; 13 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 14
@@ -19,7 +21,8 @@ import java.util.Map; @@ -19,7 +21,8 @@ import java.util.Map;
19 import java.util.Set; 21 import java.util.Set;
20 22
21 import static java.util.stream.Collectors.toList; 23 import static java.util.stream.Collectors.toList;
22 -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; 24 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.PREPROCESSED_CORPUS_DIR;
  25 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR;
23 26
24 public class SummarizeTestCorpus { 27 public class SummarizeTestCorpus {
25 28
@@ -32,19 +35,17 @@ public class SummarizeTestCorpus { @@ -32,19 +35,17 @@ public class SummarizeTestCorpus {
32 } 35 }
33 36
34 public static void main(String[] args) throws IOException, NicolasException { 37 public static void main(String[] args) throws IOException, NicolasException {
35 - File thriftedCorpusDir = new File("data/all-preprocessed");  
36 - File targetDir = new File("data/test-system");  
37 - targetDir.mkdir(); 38 + PathConstants.createFolder(SYSTEM_TEST_SUMMARIES_DIR);
38 39
39 - Set<String> testTextIds = loadTestTextIds();  
40 - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); 40 + Set<String> testTextIds = CorpusHelper.loadTestTextIds();
  41 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains);
41 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); 42 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
42 43
43 Map<String, String> id2summary = summarizeTexts(id2preprocessedText); 44 Map<String, String> id2summary = summarizeTexts(id2preprocessedText);
44 LOG.info("Texts summarized."); 45 LOG.info("Texts summarized.");
45 46
46 - saveSummariesToFolder(id2summary, targetDir);  
47 - LOG.info("Texts saved to {} folder.", targetDir); 47 + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR);
  48 + LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR);
48 } 49 }
49 50
50 private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { 51 private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import pl.waw.ipipan.zil.summ.nicolas.Constants; 3 import pl.waw.ipipan.zil.summ.nicolas.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; 5 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
5 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; 6 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
6 import pl.waw.ipipan.zil.summ.pscapi.xml.Text; 7 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
@@ -11,18 +12,11 @@ import java.io.FileOutputStream; @@ -11,18 +12,11 @@ import java.io.FileOutputStream;
11 import java.io.IOException; 12 import java.io.IOException;
12 import java.io.OutputStreamWriter; 13 import java.io.OutputStreamWriter;
13 import java.util.List; 14 import java.util.List;
14 -import java.util.function.Predicate;  
15 -import java.util.stream.Collectors;  
16 -import java.util.stream.Stream;  
17 15
18 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; 16 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
19 17
20 public class ExtractGoldSummaries { 18 public class ExtractGoldSummaries {
21 19
22 - private static final String ABSTRACT_SUMMARY_TYPE = "abstract";  
23 - private static final int SUMMARY_RATIO = 20;  
24 -  
25 - private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));  
26 20
27 private ExtractGoldSummaries() { 21 private ExtractGoldSummaries() {
28 } 22 }
@@ -37,12 +31,12 @@ public class ExtractGoldSummaries { @@ -37,12 +31,12 @@ public class ExtractGoldSummaries {
37 Text text = PSC_IO.readText(file); 31 Text text = PSC_IO.readText(file);
38 32
39 List<Summary> goldSummaries; 33 List<Summary> goldSummaries;
40 - Stream<Summary> stream = text.getSummaries().getSummary().stream();  
41 - boolean isTest = IS_TEST.test(text); 34 +
  35 + boolean isTest = CorpusHelper.isTest(text);
42 if (isTest) { 36 if (isTest) {
43 - goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); 37 + goldSummaries = CorpusHelper.getAbstractSummaries(text);
44 } else { 38 } else {
45 - goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); 39 + goldSummaries = CorpusHelper.getExtractSummaries(text);
46 } 40 }
47 41
48 for (Summary summary : goldSummaries) { 42 for (Summary summary : goldSummaries) {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  13 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
13 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 14 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
@@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver; @@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver;
29 import java.io.File; 30 import java.io.File;
30 import java.io.FileReader; 31 import java.io.FileReader;
31 import java.io.IOException; 32 import java.io.IOException;
32 -import java.util.Arrays;  
33 import java.util.List; 33 import java.util.List;
34 import java.util.Map; 34 import java.util.Map;
35 import java.util.Set; 35 import java.util.Set;
36 import java.util.function.Predicate; 36 import java.util.function.Predicate;
37 -import java.util.stream.Collectors;  
38 37
39 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; 38 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
40 39
@@ -46,7 +45,7 @@ public class PrepareTrainingData { @@ -46,7 +45,7 @@ public class PrepareTrainingData {
46 } 45 }
47 46
48 public static void main(String[] args) throws Exception { 47 public static void main(String[] args) throws Exception {
49 - Set<String> trainTextIds = loadTrainTextIds(); 48 + Set<String> trainTextIds = CorpusHelper.loadTrainTextIds();
50 49
51 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains); 50 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains);
52 Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); 51 Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains);
@@ -91,14 +90,6 @@ public class PrepareTrainingData { @@ -91,14 +90,6 @@ public class PrepareTrainingData {
91 } 90 }
92 } 91 }
93 92
94 - private static Set<String> loadTrainTextIds() throws IOException {  
95 - File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles();  
96 - if (optimalSummaries == null)  
97 - throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR);  
98 -  
99 - return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet());  
100 - }  
101 -  
102 private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { 93 private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
103 94
104 SentenceScorer sentenceScorer = new SentenceScorer(); 95 SentenceScorer sentenceScorer = new SentenceScorer();