diff --git a/eval.sh b/eval.sh index 20d602b..2b150f5 100755 --- a/eval.sh +++ b/eval.sh @@ -1,2 +1,4 @@ #!/usr/bin/env bash +mvn install -Dmaven.test.skip=true +mvn -pl nicolas-train exec:java -Dexec.mainClass="pl.waw.ipipan.zil.summ.nicolas.eval.Main"
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
new file mode 100644
index 0000000..d71b4fa
--- /dev/null
+++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
@@ -0,0 +1,83 @@
+package pl.waw.ipipan.zil.summ.nicolas;
+
+import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
+import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Set;
+import java.util.stream.Collectors;
+
+import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR;
+import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.OPTIMAL_SUMMARIES_DIR;
+
+/**
+ * Helpers for picking summaries of a corpus text and for collecting text ids
+ * from the file names in the summary folders.
+ */
+public class CorpusHelper {
+
+    private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
+    private static final String EXTRACT_SUMMARY_TYPE = "extract";
+
+    private static final int SUMMARY_RATIO = 20;
+
+    private CorpusHelper() {
+    }
+
+    /** Returns true when the text has at least one summary of type "abstract". */
+    public static boolean isTest(Text text) {
+        return text.getSummaries().getSummary().stream()
+                .anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
+    }
+
+    /** Returns the summaries of type "extract" whose ratio equals SUMMARY_RATIO. */
+    public static List<Summary> getExtractSummaries(Text text) {
+        return getSummariesOfType(text, EXTRACT_SUMMARY_TYPE);
+    }
+
+    /**
+     * Returns the summaries of type "abstract" whose ratio equals SUMMARY_RATIO.
+     * The type test is deliberately not negated: a negated test returns every
+     * non-abstract summary instead.
+     */
+    public static List<Summary> getAbstractSummaries(Text text) {
+        return getSummariesOfType(text, ABSTRACT_SUMMARY_TYPE);
+    }
+
+    /** Collects the text ids from the file names in OPTIMAL_SUMMARIES_DIR. */
+    public static Set<String> loadTrainTextIds() throws IOException {
+        return collectIdsFromFolder(OPTIMAL_SUMMARIES_DIR);
+    }
+
+    /** Collects the text ids from the file names in GOLD_TEST_SUMMARIES_DIR. */
+    public static Set<String> loadTestTextIds() throws IOException {
+        return collectIdsFromFolder(GOLD_TEST_SUMMARIES_DIR);
+    }
+
+    /** Selects the summaries of the given type whose ratio equals SUMMARY_RATIO. */
+    private static List<Summary> getSummariesOfType(Text text, String type) {
+        return text.getSummaries().getSummary().stream()
+                .filter(summary -> summary.getType().equals(type) && summary.getRatio().equals(SUMMARY_RATIO))
+                .collect(Collectors.toList());
+    }
+
+    /**
+     * Collects the part of each file name before the first underscore.
+     *
+     * @throws IOException when the folder cannot be listed
+     */
+    private static Set<String> collectIdsFromFolder(File folder) throws IOException {
+        File[] files = folder.listFiles();
+        if (files == null) {
+            throw new IOException("No summaries at " + folder);
+        }
+
+        return Arrays.stream(files)
+                .map(file -> file.getName().split("_")[0])
+                .collect(Collectors.toSet());
+    }
+
+}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java index 7af55cf..9a3a514 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java @@ -18,18 +18,20 @@ public class PathConstants { public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; + public static final String COMPETITOR_SUMMARIES_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=competitor-summaries.zip"; public static final File WORKING_DIR = new File("data"); - public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); + public static
final File ZIPPED_COMPETITOR_SUMMARIES_FILE = new File(WORKING_DIR, "competitor-summaries.zip"); public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); + public static final File SYSTEM_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-system"); public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); @@ -39,6 +41,8 @@ public class PathConstants { public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); + public static final File SUMMARY_LENGTHS_FILE = new File(WORKING_DIR, "summary-lengths.tsv"); + private PathConstants() { } @@ -67,13 +71,9 @@ public class PathConstants { } private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { - if (targetDir.exists()) { - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir); - } else { - createFolder(targetDir); - ZipFile zipFile = new ZipFile(targetZipFile); - zipFile.extractAll(targetDir.getPath()); - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); - } + createFolder(targetDir); + ZipFile zipFile = new ZipFile(targetZipFile); + zipFile.extractAll(targetDir.getPath()); + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); } } diff --git 
a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
new file mode 100644
index 0000000..c09c928
--- /dev/null
+++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
@@ -0,0 +1,123 @@
+package pl.waw.ipipan.zil.summ.nicolas.eval;
+
+import com.google.common.io.Files;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVPrinter;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import pl.waw.ipipan.zil.summ.nicolas.Constants;
+import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
+import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
+import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
+
+import javax.xml.bind.JAXBException;
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
+
+/**
+ * Writes one tab-separated row per system summary file in SYSTEM_TEST_SUMMARIES_DIR
+ * to SUMMARY_LENGTHS_FILE: the summary word count, the word count of the summarized
+ * text and the ratio of the two.
+ */
+public class CalculateSystemSummaryLengths {
+
+    private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class);
+
+    private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId",
+            "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t');
+
+    private CalculateSystemSummaryLengths() {
+    }
+
+    /**
+     * Loads the corpus text lengths and writes the summary length table.
+     *
+     * @param args not used
+     * @throws IOException when the corpus directory cannot be listed
+     */
+    public static void main(String[] args) throws IOException {
+        Map<String, Integer> textId2wordCount = loadTextLengths(EXTRACTED_CORPUS_DATA_DIR);
+
+        // Guava's Files.newWriter takes an explicit charset; FileWriter would use the platform default.
+        try (CSVPrinter printer = new CSVPrinter(Files.newWriter(SUMMARY_LENGTHS_FILE, Constants.ENCODING), CSV_FORMAT)) {
+            File[] files = SYSTEM_TEST_SUMMARIES_DIR.listFiles();
+            if (files == null) {
+                throw new IOException("No summaries in " + SYSTEM_TEST_SUMMARIES_DIR);
+            }
+            for (File summaryFile : files) {
+                writeLengths(textId2wordCount, printer, summaryFile);
+            }
+
+        } catch (IOException ex) {
+            // The exception is the last argument so that SLF4J logs its stack trace.
+            LOG.error("Error creating target file: {}", SUMMARY_LENGTHS_FILE, ex);
+        }
+    }
+
+    /**
+     * Appends the record of one system summary file to the printer. The file name is split
+     * on '.' and '_': the first part is used as the text id, the second as the system name.
+     * Files with fewer than two name parts or with an unknown text id are logged and skipped.
+     *
+     * @throws IOException when the summary cannot be read or the record cannot be written
+     */
+    private static void writeLengths(Map<String, Integer> textId2wordCount, CSVPrinter printer,
+                                     File summaryFile) throws IOException {
+        String[] split = summaryFile.getName().split("[._]");
+        if (split.length < 2) {
+            LOG.warn("Skipping file with unexpected name: {}", summaryFile);
+            return;
+        }
+        String textId = split[0];
+        String systemName = split[1];
+
+        // Unboxing the lookup straight into an int throws a NullPointerException for unknown ids.
+        Integer textWC = textId2wordCount.get(textId);
+        if (textWC == null) {
+            LOG.warn("Skipping summary {}: no corpus text with id {}", summaryFile, textId);
+            return;
+        }
+
+        String body = Files.toString(summaryFile, Constants.ENCODING);
+        int sumWC = TextUtils.tokenize(body).size();
+
+        List<Object> record = new ArrayList<>();
+        record.add(textId);
+        record.add(textWC);
+        record.add("automatic");
+        record.add(systemName);
+        record.add(sumWC);
+        record.add(sumWC * 1.0 / textWC);
+        printer.printRecord(record);
+    }
+
+    /**
+     * Reads every file of the corpus directory and maps the text id to the word count of
+     * the text body. Files that cannot be read are logged and left out of the map.
+     *
+     * @throws IOException when the directory cannot be listed
+     */
+    private static Map<String, Integer> loadTextLengths(File manualCorpusDir) throws IOException {
+        Map<String, Integer> textId2wordCount = new HashMap<>();
+        File[] files = manualCorpusDir.listFiles();
+        if (files == null) {
+            throw new IOException("No summaries in " + manualCorpusDir);
+        }
+        for (File file : files) {
+            try {
+                Text text = PSC_IO.readText(file);
+                textId2wordCount.put(text.getId(), TextUtils.tokenize(text.getBody()).size());
+            } catch (IOException | JAXBException e) {
+                LOG.error("Error reading manual summaries: {}", file, e);
+            }
+        }
+        return textId2wordCount;
+    }
+
+}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted file mode 100644 index 17981e5..0000000 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java +++ /dev/null @@ -1,24 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.eval; - -import org.apache.commons.io.IOUtils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -public class Constants { - - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; - - private Constants() { - } - - public static Set<String> loadTestTextIds() throws IOException { - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); - return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); - } - } -}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java
new file mode 100644
index 0000000..eccccb4
--- /dev/null
+++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java
@@ -0,0 +1,17 @@
+package pl.waw.ipipan.zil.summ.nicolas.eval;
+
+import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
+
+/**
+ * Entry point that calls downloadFileAndExtract for the competitor summaries archive,
+ * using SYSTEM_TEST_SUMMARIES_DIR as the target directory.
+ */
+public class DownloadCompetingSummaries {
+
+    private DownloadCompetingSummaries() {
+    }
+
+    public static void main(String[] args) throws Exception {
+        downloadFileAndExtract(COMPETITOR_SUMMARIES_DOWNLOAD_URL, ZIPPED_COMPETITOR_SUMMARIES_FILE, SYSTEM_TEST_SUMMARIES_DIR);
+    }
+}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java index de33cae..8846b8a 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java @@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval; import pl.waw.ipipan.zil.summ.eval.Main; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; + public class Evaluate { private Evaluate() { } public static void main(String[] args) { - String goldDirPath = "data/summaries-gold"; - String systemDirPath = "data/summaries"; + String goldDirPath = GOLD_TEST_SUMMARIES_DIR.getAbsolutePath(); + String systemDirPath = SYSTEM_TEST_SUMMARIES_DIR.getAbsolutePath(); Main.main(new String[]{goldDirPath, systemDirPath}); } } \ No newline at end of file
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java
new file mode 100644
index 0000000..18dec29
--- /dev/null
+++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java
@@ -0,0 +1,18 @@
+package pl.waw.ipipan.zil.summ.nicolas.eval;
+
+/**
+ * Runs the evaluation steps in order: SummarizeTestCorpus, DownloadCompetingSummaries,
+ * CalculateSystemSummaryLengths and Evaluate. The same arguments are passed to each step.
+ */
+public class Main {
+
+    private Main() {
+    }
+
+    public static void main(String[] args) throws Exception {
+        SummarizeTestCorpus.main(args);
+        DownloadCompetingSummaries.main(args);
+        CalculateSystemSummaryLengths.main(args);
+        Evaluate.main(args);
+    }
+}
diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java index c5dee6c..852ce47 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java @@ -5,8 +5,10 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; import pl.waw.ipipan.zil.summ.nicolas.NicolasException; +import pl.waw.ipipan.zil.summ.nicolas.PathConstants; import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; @@ -19,7 +21,8 @@ import java.util.Map; import java.util.Set; import static java.util.stream.Collectors.toList; -import static
pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.PREPROCESSED_CORPUS_DIR; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; public class SummarizeTestCorpus { @@ -32,19 +35,17 @@ public class SummarizeTestCorpus { } public static void main(String[] args) throws IOException, NicolasException { - File thriftedCorpusDir = new File("data/all-preprocessed"); - File targetDir = new File("data/test-system"); - targetDir.mkdir(); + PathConstants.createFolder(SYSTEM_TEST_SUMMARIES_DIR); - Set<String> testTextIds = loadTestTextIds(); - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); + Set<String> testTextIds = CorpusHelper.loadTestTextIds(); + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); Map<String, String> id2summary = summarizeTexts(id2preprocessedText); LOG.info("Texts summarized."); - saveSummariesToFolder(id2summary, targetDir); - LOG.info("Texts saved to {} folder.", targetDir); + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); + LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); } private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java index 05fbe5f..2b311f0 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java @@ -1,6 +1,7 @@ package 
pl.waw.ipipan.zil.summ.nicolas.train.pipeline; import pl.waw.ipipan.zil.summ.nicolas.Constants; +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; import pl.waw.ipipan.zil.summ.pscapi.xml.Text; @@ -11,18 +12,11 @@ import java.io.FileOutputStream; import java.io.IOException; import java.io.OutputStreamWriter; import java.util.List; -import java.util.function.Predicate; -import java.util.stream.Collectors; -import java.util.stream.Stream; import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class ExtractGoldSummaries { - private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; - private static final int SUMMARY_RATIO = 20; - - private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); private ExtractGoldSummaries() { } @@ -37,12 +31,12 @@ public class ExtractGoldSummaries { Text text = PSC_IO.readText(file); List<Summary> goldSummaries; - Stream<Summary> stream = text.getSummaries().getSummary().stream(); - boolean isTest = IS_TEST.test(text); + + boolean isTest = CorpusHelper.isTest(text); if (isTest) { - goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); + goldSummaries = CorpusHelper.getAbstractSummaries(text); } else { - goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); + goldSummaries = CorpusHelper.getExtractSummaries(text); } for (Summary summary : goldSummaries) { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java index b5b0c09..808e45d 100644 --- 
a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; @@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver; import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.Arrays; import java.util.List; import java.util.Map; import java.util.Set; import java.util.function.Predicate; -import java.util.stream.Collectors; import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; @@ -46,7 +45,7 @@ public class PrepareTrainingData { } public static void main(String[] args) throws Exception { - Set<String> trainTextIds = loadTrainTextIds(); + Set<String> trainTextIds = CorpusHelper.loadTrainTextIds(); Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains); Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); @@ -91,14 +90,6 @@ public class PrepareTrainingData { } } - private static Set<String> loadTrainTextIds() throws IOException { - File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles(); - if (optimalSummaries == null) - throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR); - - return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet()); - } - private static void prepareSentencesDataset(Map<String, TText> 
id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { SentenceScorer sentenceScorer = new SentenceScorer();