Commit 89870bd0ed20b4282b3bd02c08df70ef087bee30
1 parent
08d128a5
finished evaluation script
Showing
11 changed files
with
201 additions
and
65 deletions
eval.sh
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/CorpusHelper.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | |
4 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | |
5 | + | |
6 | +import java.io.File; | |
7 | +import java.io.IOException; | |
8 | +import java.util.Arrays; | |
9 | +import java.util.List; | |
10 | +import java.util.Set; | |
11 | +import java.util.stream.Collectors; | |
12 | + | |
13 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR; | |
14 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.OPTIMAL_SUMMARIES_DIR; | |
15 | + | |
16 | +/** | |
17 | + * Static helpers for selecting summaries of corpus texts and for listing the | |
18 | + * ids of the texts that have summary files in the train and test folders. | |
19 | + */ | |
20 | +public class CorpusHelper { | |
21 | + | |
22 | +    private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; | |
23 | +    private static final String EXTRACT_SUMMARY_TYPE = "extract"; | |
24 | + | |
25 | +    private static final int SUMMARY_RATIO = 20; | |
26 | + | |
27 | +    private CorpusHelper() { | |
28 | +    } | |
29 | + | |
30 | +    /** Returns {@code true} when at least one summary of the text has the "abstract" type. */ | |
31 | +    public static boolean isTest(Text text) { | |
32 | +        return text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); | |
33 | +    } | |
34 | + | |
35 | +    /** Returns the summaries of the text that have the "extract" type and a ratio equal to {@code SUMMARY_RATIO}. */ | |
36 | +    public static List<Summary> getExtractSummaries(Text text) { | |
37 | +        return getSummariesOfType(text, EXTRACT_SUMMARY_TYPE); | |
38 | +    } | |
39 | + | |
40 | +    /** | |
41 | +     * Returns the summaries of the text that have the "abstract" type and a ratio equal to {@code SUMMARY_RATIO}. | |
42 | +     * The type test has to be positive: a negated test would return every non-abstract summary instead. | |
43 | +     */ | |
44 | +    public static List<Summary> getAbstractSummaries(Text text) { | |
45 | +        return getSummariesOfType(text, ABSTRACT_SUMMARY_TYPE); | |
46 | +    } | |
47 | + | |
48 | +    /** Collects the summaries of the text whose type equals {@code type} and whose ratio equals {@code SUMMARY_RATIO}. */ | |
49 | +    private static List<Summary> getSummariesOfType(Text text, String type) { | |
50 | +        return text.getSummaries().getSummary().stream() | |
51 | +                .filter(summary -> summary.getType().equals(type) && summary.getRatio().equals(SUMMARY_RATIO)) | |
52 | +                .collect(Collectors.toList()); | |
53 | +    } | |
54 | + | |
55 | +    /** Returns the text ids taken from the file names found in {@code OPTIMAL_SUMMARIES_DIR}. */ | |
56 | +    public static Set<String> loadTrainTextIds() throws IOException { | |
57 | +        return collectIdsFromFolder(OPTIMAL_SUMMARIES_DIR); | |
58 | +    } | |
59 | + | |
60 | +    /** Returns the text ids taken from the file names found in {@code GOLD_TEST_SUMMARIES_DIR}. */ | |
61 | +    public static Set<String> loadTestTextIds() throws IOException { | |
62 | +        return collectIdsFromFolder(GOLD_TEST_SUMMARIES_DIR); | |
63 | +    } | |
64 | + | |
65 | +    /** | |
66 | +     * Maps every entry of the folder to the part of its name before the first underscore | |
67 | +     * (the whole name when there is no underscore) and returns the distinct values. | |
68 | +     * | |
69 | +     * @throws IOException if the folder cannot be listed | |
70 | +     */ | |
71 | +    private static Set<String> collectIdsFromFolder(File folder) throws IOException { | |
72 | +        File[] files = folder.listFiles(); | |
73 | +        if (files == null) { | |
74 | +            throw new IOException("No summaries at " + folder); | |
75 | +        } | |
76 | + | |
77 | +        return Arrays.stream(files).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet()); | |
78 | +    } | |
79 | + | |
80 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
... | ... | @@ -18,18 +18,20 @@ public class PathConstants { |
18 | 18 | public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; |
19 | 19 | public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; |
20 | 20 | public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; |
21 | + public static final String COMPETITOR_SUMMARIES_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=competitor-summaries.zip"; | |
21 | 22 | |
22 | 23 | public static final File WORKING_DIR = new File("data"); |
23 | - | |
24 | 24 | public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); |
25 | 25 | public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); |
26 | 26 | public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); |
27 | + public static final File ZIPPED_COMPETITOR_SUMMARIES_FILE = new File(WORKING_DIR, "competitor-summaries.zip"); | |
27 | 28 | |
28 | 29 | public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); |
29 | 30 | public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); |
30 | 31 | public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); |
31 | 32 | public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); |
32 | 33 | public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); |
34 | + public static final File SYSTEM_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-system"); | |
33 | 35 | public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); |
34 | 36 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); |
35 | 37 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); |
... | ... | @@ -39,6 +41,8 @@ public class PathConstants { |
39 | 41 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); |
40 | 42 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); |
41 | 43 | |
44 | + public static final File SUMMARY_LENGTHS_FILE = new File(WORKING_DIR, "summary-lengths.tsv"); | |
45 | + | |
42 | 46 | private PathConstants() { |
43 | 47 | } |
44 | 48 | |
... | ... | @@ -67,13 +71,9 @@ public class PathConstants { |
67 | 71 | } |
68 | 72 | |
69 | 73 | private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { |
70 | - if (targetDir.exists()) { | |
71 | - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir); | |
72 | - } else { | |
73 | - createFolder(targetDir); | |
74 | - ZipFile zipFile = new ZipFile(targetZipFile); | |
75 | - zipFile.extractAll(targetDir.getPath()); | |
76 | - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); | |
77 | - } | |
74 | + createFolder(targetDir); | |
75 | + ZipFile zipFile = new ZipFile(targetZipFile); | |
76 | + zipFile.extractAll(targetDir.getPath()); | |
77 | + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); | |
78 | 78 | } |
79 | 79 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/CalculateSystemSummaryLengths.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import com.google.common.io.Files; | |
4 | +import org.apache.commons.csv.CSVFormat; | |
5 | +import org.apache.commons.csv.CSVPrinter; | |
6 | +import org.slf4j.Logger; | |
7 | +import org.slf4j.LoggerFactory; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
10 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | |
11 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | |
12 | + | |
13 | +import javax.xml.bind.JAXBException; | |
14 | +import java.io.File; | |
15 | +import java.io.FileWriter; | |
16 | +import java.io.IOException; | |
17 | +import java.util.ArrayList; | |
18 | +import java.util.HashMap; | |
19 | +import java.util.List; | |
20 | +import java.util.Map; | |
21 | + | |
22 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
23 | + | |
24 | +/** | |
25 | + * Writes a tab-separated report with one row per file found in {@code SYSTEM_TEST_SUMMARIES_DIR}: | |
26 | + * the token count of the source text, the token count of the summary and the ratio of the two. | |
27 | + */ | |
28 | +public class CalculateSystemSummaryLengths { | |
29 | + | |
30 | +    private static final Logger LOG = LoggerFactory.getLogger(CalculateSystemSummaryLengths.class); | |
31 | + | |
32 | +    private static final CSVFormat CSV_FORMAT = CSVFormat.DEFAULT.withHeader("TextId", | |
33 | +            "TextWC", "SumType", "SumAuthor", "SumWC", "SumRealRatio").withDelimiter('\t'); | |
34 | + | |
35 | +    private CalculateSystemSummaryLengths() { | |
36 | +    } | |
37 | + | |
38 | +    /** | |
39 | +     * Loads the token counts of the corpus texts and writes one row per system summary to {@code SUMMARY_LENGTHS_FILE}. | |
40 | +     * | |
41 | +     * @param args not used | |
42 | +     * @throws IOException if the corpus data directory cannot be listed | |
43 | +     */ | |
44 | +    public static void main(String[] args) throws IOException { | |
45 | +        Map<String, Integer> textId2wordCount = loadTextLengths(EXTRACTED_CORPUS_DATA_DIR); | |
46 | + | |
47 | +        try (CSVPrinter printer = new CSVPrinter(new FileWriter(SUMMARY_LENGTHS_FILE), CSV_FORMAT)) { | |
48 | +            File[] files = SYSTEM_TEST_SUMMARIES_DIR.listFiles(); | |
49 | +            if (files == null) { | |
50 | +                throw new IOException("No summaries in " + SYSTEM_TEST_SUMMARIES_DIR); | |
51 | +            } | |
52 | +            for (File summaryFile : files) { | |
53 | +                writeLengths(textId2wordCount, printer, summaryFile); | |
54 | +            } | |
55 | +        } catch (IOException ex) { | |
56 | +            // This handler sees every I/O failure of the block above (missing summaries directory, | |
57 | +            // unreadable summary, failed write), so the message is generic and the exception is | |
58 | +            // passed as the last argument to keep its stack trace in the log. | |
59 | +            LOG.error("Error writing summary lengths to {}", SUMMARY_LENGTHS_FILE, ex); | |
60 | +        } | |
61 | +    } | |
62 | + | |
63 | +    /** | |
64 | +     * Appends the report row of a single summary file. | |
65 | +     * The file name is split on '.' and '_': the first part is used as text id, the second as system name. | |
66 | +     * A file whose name has fewer than two parts, or whose text id has no loaded token count, | |
67 | +     * is skipped with a warning instead of failing with an unchecked exception. | |
68 | +     * | |
69 | +     * @throws IOException if the summary cannot be read or the row cannot be written | |
70 | +     */ | |
71 | +    private static void writeLengths(Map<String, Integer> textId2wordCount, CSVPrinter printer, | |
72 | +                                     File summaryFile) throws IOException { | |
73 | +        String[] split = summaryFile.getName().split("[._]"); | |
74 | +        if (split.length < 2) { | |
75 | +            LOG.warn("Skipping {}: file name does not contain a text id and a system name.", summaryFile); | |
76 | +            return; | |
77 | +        } | |
78 | +        String textId = split[0]; | |
79 | +        String systemName = split[1]; | |
80 | + | |
81 | +        // Unboxing a missing map entry straight into an int would throw NullPointerException. | |
82 | +        Integer textWC = textId2wordCount.get(textId); | |
83 | +        if (textWC == null) { | |
84 | +            LOG.warn("Skipping {}: no corpus text with id {} was loaded.", summaryFile, textId); | |
85 | +            return; | |
86 | +        } | |
87 | + | |
88 | +        String body = Files.toString(summaryFile, Constants.ENCODING); | |
89 | +        int sumWC = TextUtils.tokenize(body).size(); | |
90 | + | |
91 | +        List<Object> record = new ArrayList<>(); | |
92 | +        record.add(textId); | |
93 | +        record.add(textWC); | |
94 | +        record.add("automatic"); | |
95 | +        record.add(systemName); | |
96 | +        record.add(sumWC); | |
97 | +        record.add(sumWC * 1.0 / textWC); | |
98 | +        printer.printRecord(record); | |
99 | +    } | |
100 | + | |
101 | +    /** | |
102 | +     * Reads every file of the directory as a corpus text and maps the text id to the token count of its body. | |
103 | +     * Files that cannot be read are logged and left out of the map. | |
104 | +     * | |
105 | +     * @throws IOException if the directory cannot be listed | |
106 | +     */ | |
107 | +    private static Map<String, Integer> loadTextLengths(File manualCorpusDir) throws IOException { | |
108 | +        Map<String, Integer> textId2wordCount = new HashMap<>(); | |
109 | +        File[] files = manualCorpusDir.listFiles(); | |
110 | +        if (files == null) { | |
111 | +            throw new IOException("No summaries in " + manualCorpusDir); | |
112 | +        } | |
113 | +        for (File file : files) { | |
114 | +            try { | |
115 | +                Text text = PSC_IO.readText(file); | |
116 | +                textId2wordCount.put(text.getId(), TextUtils.tokenize(text.getBody()).size()); | |
117 | +            } catch (IOException | JAXBException e) { | |
118 | +                LOG.error("Error reading manual summaries from {}", file, e); | |
119 | +            } | |
120 | +        } | |
121 | +        return textId2wordCount; | |
122 | +    } | |
123 | + | |
124 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | - | |
3 | -import org.apache.commons.io.IOUtils; | |
4 | - | |
5 | -import java.io.IOException; | |
6 | -import java.io.InputStream; | |
7 | -import java.util.List; | |
8 | -import java.util.Set; | |
9 | -import java.util.stream.Collectors; | |
10 | - | |
11 | -public class Constants { | |
12 | - | |
13 | - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; | |
14 | - | |
15 | - private Constants() { | |
16 | - } | |
17 | - | |
18 | - public static Set<String> loadTestTextIds() throws IOException { | |
19 | - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { | |
20 | - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); | |
21 | - return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | |
22 | - } | |
23 | - } | |
24 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/DownloadCompetingSummaries.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
4 | + | |
5 | +/** | |
6 | + * Command-line entry point for obtaining the competitor summaries. | |
7 | + */ | |
8 | +public class DownloadCompetingSummaries { | |
9 | + | |
10 | +    /** | |
11 | +     * Passes the competitor summaries URL, the local zip file and the system test summaries | |
12 | +     * directory to {@code downloadFileAndExtract}. | |
13 | +     * NOTE(review): the helper comes from the PathConstants static import and is not visible here; | |
14 | +     * presumably it downloads the archive and unpacks it into the directory - confirm. | |
15 | +     * | |
16 | +     * @param args not used | |
17 | +     */ | |
18 | +    public static void main(String[] args) throws Exception { | |
19 | +        downloadFileAndExtract( | |
20 | +                COMPETITOR_SUMMARIES_DOWNLOAD_URL, | |
21 | +                ZIPPED_COMPETITOR_SUMMARIES_FILE, | |
22 | +                SYSTEM_TEST_SUMMARIES_DIR); | |
23 | +    } | |
24 | + | |
25 | +    // Only the static entry point is used; instances are never created. | |
26 | +    private DownloadCompetingSummaries() { | |
27 | +    } | |
28 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
... | ... | @@ -2,14 +2,17 @@ package pl.waw.ipipan.zil.summ.nicolas.eval; |
2 | 2 | |
3 | 3 | import pl.waw.ipipan.zil.summ.eval.Main; |
4 | 4 | |
5 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.GOLD_TEST_SUMMARIES_DIR; | |
6 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; | |
7 | + | |
5 | 8 | public class Evaluate { |
6 | 9 | |
7 | 10 | private Evaluate() { |
8 | 11 | } |
9 | 12 | |
10 | 13 | public static void main(String[] args) { |
11 | - String goldDirPath = "data/summaries-gold"; | |
12 | - String systemDirPath = "data/summaries"; | |
14 | + String goldDirPath = GOLD_TEST_SUMMARIES_DIR.getAbsolutePath(); | |
15 | + String systemDirPath = SYSTEM_TEST_SUMMARIES_DIR.getAbsolutePath(); | |
13 | 16 | Main.main(new String[]{goldDirPath, systemDirPath}); |
14 | 17 | } |
15 | 18 | } |
16 | 19 | \ No newline at end of file |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Main.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +/** | |
4 | + * Runs the evaluation steps one after another, handing the same arguments to each of them. | |
5 | + */ | |
6 | +public class Main { | |
7 | + | |
8 | +    /** | |
9 | +     * Calls, in this order: {@code SummarizeTestCorpus}, {@code DownloadCompetingSummaries}, | |
10 | +     * {@code CalculateSystemSummaryLengths} and {@code Evaluate}. | |
11 | +     * An exception thrown by one step propagates and the remaining steps are not run. | |
12 | +     * | |
13 | +     * @param args forwarded unchanged to every step | |
14 | +     */ | |
15 | +    public static void main(String[] args) throws Exception { | |
16 | +        SummarizeTestCorpus.main(args); | |
17 | +        DownloadCompetingSummaries.main(args); | |
18 | +        CalculateSystemSummaryLengths.main(args); | |
19 | +        Evaluate.main(args); | |
20 | +    } | |
21 | + | |
22 | +    // Only the static entry point is used; instances are never created. | |
23 | +    private Main() { | |
24 | +    } | |
25 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... | ... | @@ -5,8 +5,10 @@ import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | |
8 | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.PathConstants; | |
10 | 12 | import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
11 | 13 | import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
12 | 14 | |
... | ... | @@ -19,7 +21,8 @@ import java.util.Map; |
19 | 21 | import java.util.Set; |
20 | 22 | |
21 | 23 | import static java.util.stream.Collectors.toList; |
22 | -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | |
24 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.PREPROCESSED_CORPUS_DIR; | |
25 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.SYSTEM_TEST_SUMMARIES_DIR; | |
23 | 26 | |
24 | 27 | public class SummarizeTestCorpus { |
25 | 28 | |
... | ... | @@ -32,19 +35,17 @@ public class SummarizeTestCorpus { |
32 | 35 | } |
33 | 36 | |
34 | 37 | public static void main(String[] args) throws IOException, NicolasException { |
35 | - File thriftedCorpusDir = new File("data/all-preprocessed"); | |
36 | - File targetDir = new File("data/test-system"); | |
37 | - targetDir.mkdir(); | |
38 | + PathConstants.createFolder(SYSTEM_TEST_SUMMARIES_DIR); | |
38 | 39 | |
39 | - Set<String> testTextIds = loadTestTextIds(); | |
40 | - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); | |
40 | + Set<String> testTextIds = CorpusHelper.loadTestTextIds(); | |
41 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, testTextIds::contains); | |
41 | 42 | LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); |
42 | 43 | |
43 | 44 | Map<String, String> id2summary = summarizeTexts(id2preprocessedText); |
44 | 45 | LOG.info("Texts summarized."); |
45 | 46 | |
46 | - saveSummariesToFolder(id2summary, targetDir); | |
47 | - LOG.info("Texts saved to {} folder.", targetDir); | |
47 | + saveSummariesToFolder(id2summary, SYSTEM_TEST_SUMMARIES_DIR); | |
48 | + LOG.info("Texts saved to {} folder.", SYSTEM_TEST_SUMMARIES_DIR); | |
48 | 49 | } |
49 | 50 | |
50 | 51 | private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | |
3 | 3 | import pl.waw.ipipan.zil.summ.nicolas.Constants; |
4 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | |
4 | 5 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
5 | 6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; |
6 | 7 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
... | ... | @@ -11,18 +12,11 @@ import java.io.FileOutputStream; |
11 | 12 | import java.io.IOException; |
12 | 13 | import java.io.OutputStreamWriter; |
13 | 14 | import java.util.List; |
14 | -import java.util.function.Predicate; | |
15 | -import java.util.stream.Collectors; | |
16 | -import java.util.stream.Stream; | |
17 | 15 | |
18 | 16 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
19 | 17 | |
20 | 18 | public class ExtractGoldSummaries { |
21 | 19 | |
22 | - private static final String ABSTRACT_SUMMARY_TYPE = "abstract"; | |
23 | - private static final int SUMMARY_RATIO = 20; | |
24 | - | |
25 | - private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); | |
26 | 20 | |
27 | 21 | private ExtractGoldSummaries() { |
28 | 22 | } |
... | ... | @@ -37,12 +31,12 @@ public class ExtractGoldSummaries { |
37 | 31 | Text text = PSC_IO.readText(file); |
38 | 32 | |
39 | 33 | List<Summary> goldSummaries; |
40 | - Stream<Summary> stream = text.getSummaries().getSummary().stream(); | |
41 | - boolean isTest = IS_TEST.test(text); | |
34 | + | |
35 | + boolean isTest = CorpusHelper.isTest(text); | |
42 | 36 | if (isTest) { |
43 | - goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
37 | + goldSummaries = CorpusHelper.getAbstractSummaries(text); | |
44 | 38 | } else { |
45 | - goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList()); | |
39 | + goldSummaries = CorpusHelper.getExtractSummaries(text); | |
46 | 40 | } |
47 | 41 | |
48 | 42 | for (Summary summary : goldSummaries) { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... | ... | @@ -10,6 +10,7 @@ import org.slf4j.LoggerFactory; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.CorpusHelper; | |
13 | 14 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
14 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | 16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
... | ... | @@ -29,12 +30,10 @@ import weka.core.converters.ArffSaver; |
29 | 30 | import java.io.File; |
30 | 31 | import java.io.FileReader; |
31 | 32 | import java.io.IOException; |
32 | -import java.util.Arrays; | |
33 | 33 | import java.util.List; |
34 | 34 | import java.util.Map; |
35 | 35 | import java.util.Set; |
36 | 36 | import java.util.function.Predicate; |
37 | -import java.util.stream.Collectors; | |
38 | 37 | |
39 | 38 | import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
40 | 39 | |
... | ... | @@ -46,7 +45,7 @@ public class PrepareTrainingData { |
46 | 45 | } |
47 | 46 | |
48 | 47 | public static void main(String[] args) throws Exception { |
49 | - Set<String> trainTextIds = loadTrainTextIds(); | |
48 | + Set<String> trainTextIds = CorpusHelper.loadTrainTextIds(); | |
50 | 49 | |
51 | 50 | Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains); |
52 | 51 | Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); |
... | ... | @@ -91,14 +90,6 @@ public class PrepareTrainingData { |
91 | 90 | } |
92 | 91 | } |
93 | 92 | |
94 | - private static Set<String> loadTrainTextIds() throws IOException { | |
95 | - File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles(); | |
96 | - if (optimalSummaries == null) | |
97 | - throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR); | |
98 | - | |
99 | - return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet()); | |
100 | - } | |
101 | - | |
102 | 93 | private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { |
103 | 94 | |
104 | 95 | SentenceScorer sentenceScorer = new SentenceScorer(); |
... | ... |