Commit 89870bd0ed20b4282b3bd02c08df70ef087bee30
1 parent
08d128a5
finished evaluation script
Showing
11 changed files
with
201 additions
and
65 deletions
eval.sh
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | ||
4 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
5 | + | ||
6 | +import java.io.File; | ||
7 | +import java.io.IOException; | ||
8 | +import java.util.Arrays; | ||
9 | +import java.util.List; | ||
10 | +import java.util.Set; | ||
11 | +import java.util.stream.Collectors; | ||
12 | + | ||
13 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR; | ||
14 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.OPTIMAL_SUMMARIES_DIR; | ||
15 | + | ||
16 | +public class CorpusHelper { | ||
17 | + | ||
18 | + private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; | ||
19 | + private static final String EXTRACT_SUMMARY_TYPE = "extract"; | ||
20 | + | ||
21 | + private static final int SUMMARY_RATIO = 20; | ||
22 | + | ||
23 | + private CorpusHelper() { | ||
24 | + } | ||
25 | + | ||
26 | + public static boolean isTest(Text text) { | ||
27 | + return text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); | ||
28 | + } | ||
29 | + | ||
30 | + public static List<Summary> getExtractSummaries(Text text) { | ||
31 | + return text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals(EXTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | ||
32 | + } | ||
33 | + | ||
34 | + public static List<Summary> getAbstractSummaries(Text text) { | ||
35 | + return text.getSummaries().getSummary().stream().filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | ||
36 | + } | ||
37 | + | ||
38 | + public static Set<String> loadTrainTextIds() throws IOException { | ||
39 | + return collectIdsFromFolder(OPTIMAL_SUMMARIES_DIR); | ||
40 | + } | ||
41 | + | ||
42 | + public static Set<String> loadTestTextIds() throws IOException { | ||
43 | + return collectIdsFromFolder(GOLD_TEST_SUMMARIES_DIR); | ||
44 | + } | ||
45 | + | ||
46 | + private static Set<String> collectIdsFromFolder(File folder) throws IOException { | ||
47 | + File[] optimalSummaries = folder.listFiles(); | ||
48 | + if (optimalSummaries == null) | ||
49 | + throw new IOException("No summaries at " + folder); | ||
50 | + | ||
51 | + return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet()); | ||
52 | + } | ||
53 | + | ||
54 | + | ||
55 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
@@ -18,18 +18,20 @@ public class PathConstants { | @@ -18,18 +18,20 @@ public class PathConstants { | ||
18 | public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; | 18 | public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; |
19 | public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; | 19 | public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; |
20 | public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; | 20 | public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; |
21 | + public static final String COMPETITOR_SUMMARIES_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=competitor-summaries.zip"; | ||
21 | 22 | ||
22 | public static final File WORKING_DIR = new File("data"); | 23 | public static final File WORKING_DIR = new File("data"); |
23 | - | ||
24 | public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); | 24 | public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); |
25 | public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); | 25 | public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); |
26 | public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); | 26 | public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); |
27 | + public static final File ZIPPED_COMPETITOR_SUMMARIES_FILE = new File(WORKING_DIR, "competitor-summaries.zip"); | ||
27 | 28 | ||
28 | public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); | 29 | public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); |
29 | public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); | 30 | public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); |
30 | public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); | 31 | public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); |
31 | public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); | 32 | public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); |
32 | public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); | 33 | public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); |
34 | + public static final File SYSTEM_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-system"); | ||
33 | public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); | 35 | public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); |
34 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); | 36 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); |
35 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); | 37 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); |
@@ -39,6 +41,8 @@ public class PathConstants { | @@ -39,6 +41,8 @@ public class PathConstants { | ||
39 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); | 41 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); |
40 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); | 42 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); |
41 | 43 | ||
44 | + public static final File SUMMARY_LENGTHS_FILE = new File(WORKING_DIR, "summary-lengths.tsv"); | ||
45 | + | ||
42 | private PathConstants() { | 46 | private PathConstants() { |
43 | } | 47 | } |
44 | 48 | ||
@@ -67,13 +71,9 @@ public class PathConstants { | @@ -67,13 +71,9 @@ public class PathConstants { | ||
67 | } | 71 | } |
68 | 72 | ||
69 | private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { | 73 | private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { |
70 | - if (targetDir.exists()) { | ||
71 | - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir); | ||
72 | - } else { | ||
73 | - createFolder(targetDir); | ||
74 | - ZipFile zipFile = new ZipFile(targetZipFile); | ||
75 | - zipFile.extractAll(targetDir.getPath()); | ||
76 | - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); | ||
77 | - } | 74 | + createFolder(targetDir); |
75 | + ZipFile zipFile = new ZipFile(targetZipFile); | ||
76 | + zipFile.extractAll(targetDir.getPath()); | ||
77 | + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); | ||
78 | } | 78 | } |
79 | } | 79 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import com.google.common.io.Files; | ||
4 | +import org.apache.commons.csv.CSVFormat; | ||
5 | +import org.apache.commons.csv.CSVPrinter; | ||
6 | +import org.slf4j.Logger; | ||
7 | +import org.slf4j.LoggerFactory; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | ||
10 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | ||
11 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
12 | + | ||
13 | +import javax.xml.bind.JAXBException; | ||
14 | +import java.io.File; | ||
15 | +import java.io.FileWriter; | ||
16 | +import java.io.IOException; | ||
17 | +import java.util.ArrayList; | ||
18 | +import java.util.HashMap; | ||
19 | +import java.util.List; | ||
20 | +import java.util.Map; | ||
21 | + | ||
22 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | ||
23 | + | ||
24 | +public class CalculateSystemSummaryLengths { | ||
25 | + | ||
26 | + private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); | ||
27 | + | ||
28 | + private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", | ||
29 | + "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); | ||
30 | + | ||
31 | + private CalculateSystemSummaryLengths() { | ||
32 | + } | ||
33 | + | ||
34 | + public static void main(String[] args) throws IOException { | ||
35 | + Map<String, Integer> textId2wordCount = loadTextLengths(EXTRACTED_CORPUS_DATA_DIR); | ||
36 | + | ||
37 | + try (CSVPrinter printer = new CSVPrinter(new FileWriter(SUMMARY_LENGTHS_FILE), CSV_FORMAT)) { | ||
38 | + File[] files = SYSTEM_TEST_SUMMARIES_DIR.listFiles(); | ||
39 | + if (files == null) { | ||
40 | + throw new IOException("No summaries in " + SYSTEM_TEST_SUMMARIES_DIR); | ||
41 | + } | ||
42 | + for (File summaryFile : files) { | ||
43 | + writeLengths(textId2wordCount, printer, summaryFile); | ||
44 | + } | ||
45 | + | ||
46 | + } catch (IOException ex) { | ||
47 | + LOG.error("Error creating target file: " + ex); | ||
48 | + } | ||
49 | + } | ||
50 | + | ||
51 | + private static void writeLengths(Map<String, Integer> textId2wordCount, CSVPrinter printer, | ||
52 | + File summaryFile) throws IOException { | ||
53 | + String[] split = summaryFile.getName().split("[._]"); | ||
54 | + String textId = split[0]; | ||
55 | + String systemName = split[1]; | ||
56 | + String body = Files.toString(summaryFile, Constants.ENCODING); | ||
57 | + | ||
58 | + List<Object> record = new ArrayList<>(); | ||
59 | + record.add(textId); | ||
60 | + int textWC = textId2wordCount.get(textId); | ||
61 | + record.add(textWC); | ||
62 | + record.add("automatic"); | ||
63 | + record.add(systemName); | ||
64 | + int sumWC = TextUtils.tokenize(body).size(); | ||
65 | + record.add(sumWC); | ||
66 | + record.add(sumWC * 1.0 / textWC); | ||
67 | + printer.printRecord(record); | ||
68 | + } | ||
69 | + | ||
70 | + private static Map<String, Integer> loadTextLengths(File manualCorpusDir) throws IOException { | ||
71 | + Map<String, Integer> textId2wordCount = new HashMap<>(); | ||
72 | + File[] files = manualCorpusDir.listFiles(); | ||
73 | + if (files == null) { | ||
74 | + throw new IOException("No summaries in " + manualCorpusDir); | ||
75 | + } | ||
76 | + for (File file : files) { | ||
77 | + try { | ||
78 | + Text text = PSC_IO.readText(file); | ||
79 | + textId2wordCount.put(text.getId(), TextUtils.tokenize(text.getBody()).size()); | ||
80 | + } catch (IOException | JAXBException e) { | ||
81 | + LOG.error("Error reading manual summaries: " + e); | ||
82 | + } | ||
83 | + } | ||
84 | + return textId2wordCount; | ||
85 | + } | ||
86 | + | ||
87 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | - | ||
3 | -import org.apache.commons.io.IOUtils; | ||
4 | - | ||
5 | -import java.io.IOException; | ||
6 | -import java.io.InputStream; | ||
7 | -import java.util.List; | ||
8 | -import java.util.Set; | ||
9 | -import java.util.stream.Collectors; | ||
10 | - | ||
11 | -public class Constants { | ||
12 | - | ||
13 | - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; | ||
14 | - | ||
15 | - private Constants() { | ||
16 | - } | ||
17 | - | ||
18 | - public static Set<String> loadTestTextIds() throws IOException { | ||
19 | - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { | ||
20 | - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); | ||
21 | - return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | ||
22 | - } | ||
23 | - } | ||
24 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | ||
4 | + | ||
5 | +public class DownloadCompetingSummaries { | ||
6 | + | ||
7 | + private DownloadCompetingSummaries() { | ||
8 | + } | ||
9 | + | ||
10 | + public static void main(String[] args) throws Exception { | ||
11 | + downloadFileAndExtract(COMPETITOR_SUMMARIES_DOWNLOAD_URL, ZIPPED_COMPETITOR_SUMMARIES_FILE, SYSTEM_TEST_SUMMARIES_DIR); | ||
12 | + } | ||
13 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
@@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval; | @@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | 2 | ||
3 | import pl.waw.ipipan.zil.summ.eval.Main; | 3 | import pl.waw.ipipan.zil.summ.eval.Main; |
4 | 4 | ||
5 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR; | ||
6 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; | ||
7 | + | ||
5 | public class Evaluate { | 8 | public class Evaluate { |
6 | 9 | ||
7 | private Evaluate() { | 10 | private Evaluate() { |
8 | } | 11 | } |
9 | 12 | ||
10 | public static void main(String[] args) { | 13 | public static void main(String[] args) { |
11 | - String goldDirPath = "data/summaries-gold"; | ||
12 | - String systemDirPath = "data/summaries"; | 14 | + String goldDirPath = GOLD_TEST_SUMMARIES_DIR.getAbsolutePath(); |
15 | + String systemDirPath = SYSTEM_TEST_SUMMARIES_DIR.getAbsolutePath(); | ||
13 | Main.main(new String[]{goldDirPath, systemDirPath}); | 16 | Main.main(new String[]{goldDirPath, systemDirPath}); |
14 | } | 17 | } |
15 | } | 18 | } |
16 | \ No newline at end of file | 19 | \ No newline at end of file |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +public class Main { | ||
4 | + | ||
5 | + private Main() { | ||
6 | + } | ||
7 | + | ||
8 | + public static void main(String[] args) throws Exception { | ||
9 | + SummarizeTestCorpus.main(args); | ||
10 | + DownloadCompetingSummaries.main(args); | ||
11 | + CalculateSystemSummaryLengths.main(args); | ||
12 | + Evaluate.main(args); | ||
13 | + } | ||
14 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -5,8 +5,10 @@ import org.slf4j.Logger; | @@ -5,8 +5,10 @@ import org.slf4j.Logger; | ||
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | ||
8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | 10 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.PathConstants; | ||
10 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | 12 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
11 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | 13 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
12 | 14 | ||
@@ -19,7 +21,8 @@ import java.util.Map; | @@ -19,7 +21,8 @@ import java.util.Map; | ||
19 | import java.util.Set; | 21 | import java.util.Set; |
20 | 22 | ||
21 | import static java.util.stream.Collectors.toList; | 23 | import static java.util.stream.Collectors.toList; |
22 | -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | 24 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.PREPROCESSED_CORPUS_DIR; |
25 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; | ||
23 | 26 | ||
24 | public class SummarizeTestCorpus { | 27 | public class SummarizeTestCorpus { |
25 | 28 | ||
@@ -32,19 +35,17 @@ public class SummarizeTestCorpus { | @@ -32,19 +35,17 @@ public class SummarizeTestCorpus { | ||
32 | } | 35 | } |
33 | 36 | ||
34 | public static void main(String[] args) throws IOException, NicolasException { | 37 | public static void main(String[] args) throws IOException, NicolasException { |
35 | - File thriftedCorpusDir = new File("data/all-preprocessed"); | ||
36 | - File targetDir = new File("data/test-system"); | ||
37 | - targetDir.mkdir(); | 38 | + PathConstants.createFolder(SYSTEM_TEST_SUMMARIES_DIR); |
38 | 39 | ||
39 | - Set<String> testTextIds = loadTestTextIds(); | ||
40 | - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); | 40 | + Set<String> testTextIds = CorpusHelper.loadTestTextIds(); |
41 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); | ||
41 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); | 42 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); |
42 | 43 | ||
43 | Map<String, String> id2summary = summarizeTexts(id2preprocessedText); | 44 | Map<String, String> id2summary = summarizeTexts(id2preprocessedText); |
44 | LOG.info("Texts summarized."); | 45 | LOG.info("Texts summarized."); |
45 | 46 | ||
46 | - saveSummariesToFolder(id2summary, targetDir); | ||
47 | - LOG.info("Texts saved to {} folder.", targetDir); | 47 | + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); |
48 | + LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); | ||
48 | } | 49 | } |
49 | 50 | ||
50 | private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { | 51 | private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | import pl.waw.ipipan.zil.summ.nicolas.Constants; | 3 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
4 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | ||
4 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | 5 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
5 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | 6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; |
6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | 7 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
@@ -11,18 +12,11 @@ import java.io.FileOutputStream; | @@ -11,18 +12,11 @@ import java.io.FileOutputStream; | ||
11 | import java.io.IOException; | 12 | import java.io.IOException; |
12 | import java.io.OutputStreamWriter; | 13 | import java.io.OutputStreamWriter; |
13 | import java.util.List; | 14 | import java.util.List; |
14 | -import java.util.function.Predicate; | ||
15 | -import java.util.stream.Collectors; | ||
16 | -import java.util.stream.Stream; | ||
17 | 15 | ||
18 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | 16 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
19 | 17 | ||
20 | public class ExtractGoldSummaries { | 18 | public class ExtractGoldSummaries { |
21 | 19 | ||
22 | - private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; | ||
23 | - private static final int SUMMARY_RATIO = 20; | ||
24 | - | ||
25 | - private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); | ||
26 | 20 | ||
27 | private ExtractGoldSummaries() { | 21 | private ExtractGoldSummaries() { |
28 | } | 22 | } |
@@ -37,12 +31,12 @@ public class ExtractGoldSummaries { | @@ -37,12 +31,12 @@ public class ExtractGoldSummaries { | ||
37 | Text text = PSC_IO.readText(file); | 31 | Text text = PSC_IO.readText(file); |
38 | 32 | ||
39 | List<Summary> goldSummaries; | 33 | List<Summary> goldSummaries; |
40 | - Stream<Summary> stream = text.getSummaries().getSummary().stream(); | ||
41 | - boolean isTest = IS_TEST.test(text); | 34 | + |
35 | + boolean isTest = CorpusHelper.isTest(text); | ||
42 | if (isTest) { | 36 | if (isTest) { |
43 | - goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | 37 | + goldSummaries = CorpusHelper.getAbstractSummaries(text); |
44 | } else { | 38 | } else { |
45 | - goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | 39 | + goldSummaries = CorpusHelper.getExtractSummaries(text); |
46 | } | 40 | } |
47 | 41 | ||
48 | for (Summary summary : goldSummaries) { | 42 | for (Summary summary : goldSummaries) { |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; | @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; | ||
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 14 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
@@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver; | @@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver; | ||
29 | import java.io.File; | 30 | import java.io.File; |
30 | import java.io.FileReader; | 31 | import java.io.FileReader; |
31 | import java.io.IOException; | 32 | import java.io.IOException; |
32 | -import java.util.Arrays; | ||
33 | import java.util.List; | 33 | import java.util.List; |
34 | import java.util.Map; | 34 | import java.util.Map; |
35 | import java.util.Set; | 35 | import java.util.Set; |
36 | import java.util.function.Predicate; | 36 | import java.util.function.Predicate; |
37 | -import java.util.stream.Collectors; | ||
38 | 37 | ||
39 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | 38 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
40 | 39 | ||
@@ -46,7 +45,7 @@ public class PrepareTrainingData { | @@ -46,7 +45,7 @@ public class PrepareTrainingData { | ||
46 | } | 45 | } |
47 | 46 | ||
48 | public static void main(String[] args) throws Exception { | 47 | public static void main(String[] args) throws Exception { |
49 | - Set<String> trainTextIds = loadTrainTextIds(); | 48 | + Set<String> trainTextIds = CorpusHelper.loadTrainTextIds(); |
50 | 49 | ||
51 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains); | 50 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains); |
52 | Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); | 51 | Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); |
@@ -91,14 +90,6 @@ public class PrepareTrainingData { | @@ -91,14 +90,6 @@ public class PrepareTrainingData { | ||
91 | } | 90 | } |
92 | } | 91 | } |
93 | 92 | ||
94 | - private static Set<String> loadTrainTextIds() throws IOException { | ||
95 | - File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles(); | ||
96 | - if (optimalSummaries == null) | ||
97 | - throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR); | ||
98 | - | ||
99 | - return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet()); | ||
100 | - } | ||
101 | - | ||
102 | private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { | 93 | private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { |
103 | 94 | ||
104 | SentenceScorer sentenceScorer = new SentenceScorer(); | 95 | SentenceScorer sentenceScorer = new SentenceScorer(); |