Commit 89870bd0ed20b4282b3bd02c08df70ef087bee30

Authored by Mateusz Kopeć
1 parent 08d128a5

finished evaluation script

#!/usr/bin/env bash
#
# Builds all modules (tests skipped) and then runs the evaluation entry point
# of the nicolas-train module.

# Stop at the first failing command so the evaluation never runs against a
# build that did not install successfully.
set -euo pipefail

mvn install -Dmaven.test.skip=true
mvn -pl nicolas-train exec:java -Dexec.mainClass="pl.waw.ipipan.zil.summ.nicolas.eval.Main"
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
  4 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  5 +
  6 +import java.io.File;
  7 +import java.io.IOException;
  8 +import java.util.Arrays;
  9 +import java.util.List;
  10 +import java.util.Set;
  11 +import java.util.stream.Collectors;
  12 +
  13 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR;
  14 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.OPTIMAL_SUMMARIES_DIR;
  15 +
  16 +public class CorpusHelper {
  17 +
  18 + private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
  19 + private static final String EXTRACT_SUMMARY_TYPE = "extract";
  20 +
  21 + private static final int SUMMARY_RATIO = 20;
  22 +
  23 + private CorpusHelper() {
  24 + }
  25 +
  26 + public static boolean isTest(Text text) {
  27 + return text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
  28 + }
  29 +
  30 + public static List<Summary> getExtractSummaries(Text text) {
  31 + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(EXTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  32 + }
  33 +
  34 + public static List<Summary> getAbstractSummaries(Text text) {
  35 + return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  36 + }
  37 +
  38 + public static Set<String> loadTrainTextIds() throws IOException {
  39 + return collectIdsFromFolder(OPTIMAL_SUMMARIES_DIR);
  40 + }
  41 +
  42 + public static Set<String> loadTestTextIds() throws IOException {
  43 + return collectIdsFromFolder(GOLD_TEST_SUMMARIES_DIR);
  44 + }
  45 +
  46 + private static Set<String> collectIdsFromFolder(File folder) throws IOException {
  47 + File[] optimalSummaries = folder.listFiles();
  48 + if (optimalSummaries == null)
  49 + throw new IOException("No summaries at " + folder);
  50 +
  51 + return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet());
  52 + }
  53 +
  54 +
  55 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
... ... @@ -18,18 +18,20 @@ public class PathConstants {
18 18 public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip";
19 19 public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip";
20 20 public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv";
  21 + public static final String COMPETITOR_SUMMARIES_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=competitor-summaries.zip";
21 22  
22 23 public static final File WORKING_DIR = new File("data");
23   -
24 24 public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip");
25 25 public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip");
26 26 public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip");
  27 + public static final File ZIPPED_COMPETITOR_SUMMARIES_FILE = new File(WORKING_DIR, "competitor-summaries.zip");
27 28  
28 29 public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus");
29 30 public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data");
30 31 public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids");
31 32 public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed");
32 33 public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold");
  34 + public static final File SYSTEM_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-system");
33 35 public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold");
34 36 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal");
35 37 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv");
... ... @@ -39,6 +41,8 @@ public class PathConstants {
39 41 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff");
40 42 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff");
41 43  
  44 + public static final File SUMMARY_LENGTHS_FILE = new File(WORKING_DIR, "summary-lengths.tsv");
  45 +
42 46 private PathConstants() {
43 47 }
44 48  
... ... @@ -67,13 +71,9 @@ public class PathConstants {
67 71 }
68 72  
69 73 private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException {
70   - if (targetDir.exists()) {
71   - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir);
72   - } else {
73   - createFolder(targetDir);
74   - ZipFile zipFile = new ZipFile(targetZipFile);
75   - zipFile.extractAll(targetDir.getPath());
76   - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);
77   - }
  74 + createFolder(targetDir);
  75 + ZipFile zipFile = new ZipFile(targetZipFile);
  76 + zipFile.extractAll(targetDir.getPath());
  77 + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);
78 78 }
79 79 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import com.google.common.io.Files;
  4 +import org.apache.commons.csv.CSVFormat;
  5 +import org.apache.commons.csv.CSVPrinter;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  10 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  11 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  12 +
  13 +import javax.xml.bind.JAXBException;
  14 +import java.io.File;
  15 +import java.io.FileWriter;
  16 +import java.io.IOException;
  17 +import java.util.ArrayList;
  18 +import java.util.HashMap;
  19 +import java.util.List;
  20 +import java.util.Map;
  21 +
  22 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
  23 +
  24 +public class CalculateSystemSummaryLengths {
  25 +
  26 + private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class);
  27 +
  28 + private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId",
  29 + "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t');
  30 +
  31 + private CalculateSystemSummaryLengths() {
  32 + }
  33 +
  34 + public static void main(String[] args) throws IOException {
  35 + Map<String, Integer> textId2wordCount = loadTextLengths(EXTRACTED_CORPUS_DATA_DIR);
  36 +
  37 + try (CSVPrinter printer = new CSVPrinter(new FileWriter(SUMMARY_LENGTHS_FILE), CSV_FORMAT)) {
  38 + File[] files = SYSTEM_TEST_SUMMARIES_DIR.listFiles();
  39 + if (files == null) {
  40 + throw new IOException("No summaries in " + SYSTEM_TEST_SUMMARIES_DIR);
  41 + }
  42 + for (File summaryFile : files) {
  43 + writeLengths(textId2wordCount, printer, summaryFile);
  44 + }
  45 +
  46 + } catch (IOException ex) {
  47 + LOG.error("Error creating target file: " + ex);
  48 + }
  49 + }
  50 +
  51 + private static void writeLengths(Map<String, Integer> textId2wordCount, CSVPrinter printer,
  52 + File summaryFile) throws IOException {
  53 + String[] split = summaryFile.getName().split("[._]");
  54 + String textId = split[0];
  55 + String systemName = split[1];
  56 + String body = Files.toString(summaryFile, Constants.ENCODING);
  57 +
  58 + List<Object> record = new ArrayList<>();
  59 + record.add(textId);
  60 + int textWC = textId2wordCount.get(textId);
  61 + record.add(textWC);
  62 + record.add("automatic");
  63 + record.add(systemName);
  64 + int sumWC = TextUtils.tokenize(body).size();
  65 + record.add(sumWC);
  66 + record.add(sumWC * 1.0 / textWC);
  67 + printer.printRecord(record);
  68 + }
  69 +
  70 + private static Map<String, Integer> loadTextLengths(File manualCorpusDir) throws IOException {
  71 + Map<String, Integer> textId2wordCount = new HashMap<>();
  72 + File[] files = manualCorpusDir.listFiles();
  73 + if (files == null) {
  74 + throw new IOException("No summaries in " + manualCorpusDir);
  75 + }
  76 + for (File file : files) {
  77 + try {
  78 + Text text = PSC_IO.readText(file);
  79 + textId2wordCount.put(text.getId(), TextUtils.tokenize(text.getBody()).size());
  80 + } catch (IOException | JAXBException e) {
  81 + LOG.error("Error reading manual summaries: " + e);
  82 + }
  83 + }
  84 + return textId2wordCount;
  85 + }
  86 +
  87 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.eval;
2   -
3   -import org.apache.commons.io.IOUtils;
4   -
5   -import java.io.IOException;
6   -import java.io.InputStream;
7   -import java.util.List;
8   -import java.util.Set;
9   -import java.util.stream.Collectors;
10   -
11   -public class Constants {
12   -
13   - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt";
14   -
15   - private Constants() {
16   - }
17   -
18   - public static Set<String> loadTestTextIds() throws IOException {
19   - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {
20   - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING);
21   - return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
22   - }
23   - }
24   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
  4 +
  5 +public class DownloadCompetingSummaries {
  6 +
  7 + private DownloadCompetingSummaries() {
  8 + }
  9 +
  10 + public static void main(String[] args) throws Exception {
  11 + downloadFileAndExtract(COMPETITOR_SUMMARIES_DOWNLOAD_URL, ZIPPED_COMPETITOR_SUMMARIES_FILE, SYSTEM_TEST_SUMMARIES_DIR);
  12 + }
  13 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
... ... @@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval;
2 2  
3 3 import pl.waw.ipipan.zil.summ.eval.Main;
4 4  
  5 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR;
  6 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR;
  7 +
5 8 public class Evaluate {
6 9  
7 10 private Evaluate() {
8 11 }
9 12  
10 13 public static void main(String[] args) {
11   - String goldDirPath = "data/summaries-gold";
12   - String systemDirPath = "data/summaries";
  14 + String goldDirPath = GOLD_TEST_SUMMARIES_DIR.getAbsolutePath();
  15 + String systemDirPath = SYSTEM_TEST_SUMMARIES_DIR.getAbsolutePath();
13 16 Main.main(new String[]{goldDirPath, systemDirPath});
14 17 }
15 18 }
16 19 \ No newline at end of file
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +public class Main {
  4 +
  5 + private Main() {
  6 + }
  7 +
  8 + public static void main(String[] args) throws Exception {
  9 + SummarizeTestCorpus.main(args);
  10 + DownloadCompetingSummaries.main(args);
  11 + CalculateSystemSummaryLengths.main(args);
  12 + Evaluate.main(args);
  13 + }
  14 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... ... @@ -5,8 +5,10 @@ import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
8 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 10 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
  11 +import pl.waw.ipipan.zil.summ.nicolas.PathConstants;
10 12 import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
11 13 import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 14  
... ... @@ -19,7 +21,8 @@ import java.util.Map;
19 21 import java.util.Set;
20 22  
21 23 import static java.util.stream.Collectors.toList;
22   -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds;
  24 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.PREPROCESSED_CORPUS_DIR;
  25 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR;
23 26  
24 27 public class SummarizeTestCorpus {
25 28  
... ... @@ -32,19 +35,17 @@ public class SummarizeTestCorpus {
32 35 }
33 36  
34 37 public static void main(String[] args) throws IOException, NicolasException {
35   - File thriftedCorpusDir = new File("data/all-preprocessed");
36   - File targetDir = new File("data/test-system");
37   - targetDir.mkdir();
  38 + PathConstants.createFolder(SYSTEM_TEST_SUMMARIES_DIR);
38 39  
39   - Set<String> testTextIds = loadTestTextIds();
40   - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains);
  40 + Set<String> testTextIds = CorpusHelper.loadTestTextIds();
  41 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains);
41 42 LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
42 43  
43 44 Map<String, String> id2summary = summarizeTexts(id2preprocessedText);
44 45 LOG.info("Texts summarized.");
45 46  
46   - saveSummariesToFolder(id2summary, targetDir);
47   - LOG.info("Texts saved to {} folder.", targetDir);
  47 + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR);
  48 + LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR);
48 49 }
49 50  
50 51 private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3 3 import pl.waw.ipipan.zil.summ.nicolas.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
4 5 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
5 6 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
6 7 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
... ... @@ -11,18 +12,11 @@ import java.io.FileOutputStream;
11 12 import java.io.IOException;
12 13 import java.io.OutputStreamWriter;
13 14 import java.util.List;
14   -import java.util.function.Predicate;
15   -import java.util.stream.Collectors;
16   -import java.util.stream.Stream;
17 15  
18 16 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
19 17  
20 18 public class ExtractGoldSummaries {
21 19  
22   - private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
23   - private static final int SUMMARY_RATIO = 20;
24   -
25   - private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
26 20  
27 21 private ExtractGoldSummaries() {
28 22 }
... ... @@ -37,12 +31,12 @@ public class ExtractGoldSummaries {
37 31 Text text = PSC_IO.readText(file);
38 32  
39 33 List<Summary> goldSummaries;
40   - Stream<Summary> stream = text.getSummaries().getSummary().stream();
41   - boolean isTest = IS_TEST.test(text);
  34 +
  35 + boolean isTest = CorpusHelper.isTest(text);
42 36 if (isTest) {
43   - goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  37 + goldSummaries = CorpusHelper.getAbstractSummaries(text);
44 38 } else {
45   - goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  39 + goldSummaries = CorpusHelper.getExtractSummaries(text);
46 40 }
47 41  
48 42 for (Summary summary : goldSummaries) {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... ... @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  13 +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper;
13 14 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
14 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 16 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
... ... @@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver;
29 30 import java.io.File;
30 31 import java.io.FileReader;
31 32 import java.io.IOException;
32   -import java.util.Arrays;
33 33 import java.util.List;
34 34 import java.util.Map;
35 35 import java.util.Set;
36 36 import java.util.function.Predicate;
37   -import java.util.stream.Collectors;
38 37  
39 38 import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
40 39  
... ... @@ -46,7 +45,7 @@ public class PrepareTrainingData {
46 45 }
47 46  
48 47 public static void main(String[] args) throws Exception {
49   - Set<String> trainTextIds = loadTrainTextIds();
  48 + Set<String> trainTextIds = CorpusHelper.loadTrainTextIds();
50 49  
51 50 Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains);
52 51 Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains);
... ... @@ -91,14 +90,6 @@ public class PrepareTrainingData {
91 90 }
92 91 }
93 92  
94   - private static Set<String> loadTrainTextIds() throws IOException {
95   - File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles();
96   - if (optimalSummaries == null)
97   - throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR);
98   -
99   - return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet());
100   - }
101   -
102 93 private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
103 94  
104 95 SentenceScorer sentenceScorer = new SentenceScorer();
... ...