Commit 7e387f1cdc557ac810c9c4118ddff9e36c78a776

Authored by Mateusz Kopeć
1 parent 2169abf8

training code

Showing 25 changed files with 397 additions and 696 deletions
.gitignore
@@ -17,4 +17,5 @@ hs_err_pid* @@ -17,4 +17,5 @@ hs_err_pid*
17 .idea 17 .idea
18 *.iml 18 *.iml
19 19
20 -/data  
21 \ No newline at end of file 20 \ No newline at end of file
  21 +/data
  22 +/summaries
nicolas-cli/pom.xml
@@ -53,28 +53,31 @@ @@ -53,28 +53,31 @@
53 <build> 53 <build>
54 <plugins> 54 <plugins>
55 <plugin> 55 <plugin>
  56 + <groupId>org.apache.maven.plugins</groupId>
56 <artifactId>maven-assembly-plugin</artifactId> 57 <artifactId>maven-assembly-plugin</artifactId>
57 - <configuration>  
58 - <appendAssemblyId>false</appendAssemblyId>  
59 - <archive>  
60 - <manifest>  
61 - <mainClass>pl.waw.ipipan.zil.summ.nicolas.cli.Main</mainClass>  
62 - </manifest>  
63 - </archive>  
64 - <descriptorRefs>  
65 - <descriptorRef>jar-with-dependencies</descriptorRef>  
66 - </descriptorRefs>  
67 - </configuration>  
68 <executions> 58 <executions>
69 <execution> 59 <execution>
70 - <id>make-assembly</id> 60 + <id>jar-with-dependencies</id>
71 <phase>package</phase> 61 <phase>package</phase>
72 <goals> 62 <goals>
73 <goal>single</goal> 63 <goal>single</goal>
74 </goals> 64 </goals>
  65 + <configuration>
  66 + <descriptorRefs>
  67 + <descriptorRef>jar-with-dependencies</descriptorRef>
  68 + </descriptorRefs>
  69 + <appendAssemblyId>false</appendAssemblyId>
  70 + <finalName>nicolas-cli</finalName>
  71 + <archive>
  72 + <manifest>
  73 + <mainClass>pl.waw.ipipan.zil.summ.nicolas.cli.Main</mainClass>
  74 + </manifest>
  75 + </archive>
  76 + </configuration>
75 </execution> 77 </execution>
76 </executions> 78 </executions>
77 </plugin> 79 </plugin>
  80 +
78 </plugins> 81 </plugins>
79 </build> 82 </build>
80 </project> 83 </project>
81 \ No newline at end of file 84 \ No newline at end of file
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java
@@ -10,18 +10,19 @@ import org.slf4j.Logger; @@ -10,18 +10,19 @@ import org.slf4j.Logger;
10 import org.slf4j.LoggerFactory; 10 import org.slf4j.LoggerFactory;
11 11
12 import java.io.File; 12 import java.io.File;
  13 +import java.io.IOException;
13 14
14 class Cli { 15 class Cli {
15 16
16 private static final Logger LOG = LoggerFactory.getLogger(Cli.class); 17 private static final Logger LOG = LoggerFactory.getLogger(Cli.class);
17 18
18 - @Parameter(names = {"-help", "-h"}, description = "Print help") 19 + @Parameter(names = {"-help", "-h"}, description = "Print help", help = true)
19 private boolean help = false; 20 private boolean help = false;
20 21
21 @Parameter(names = {"-input", "-i"}, description = "Input text file to summarize", required = true, validateWith = FileValidator.class, converter = FileConverter.class) 22 @Parameter(names = {"-input", "-i"}, description = "Input text file to summarize", required = true, validateWith = FileValidator.class, converter = FileConverter.class)
22 private File inputFile; 23 private File inputFile;
23 24
24 - @Parameter(names = {"-output", "-o"}, description = "Output file path for summary", required = true, validateWith = FileValidator.class, converter = FileConverter.class) 25 + @Parameter(names = {"-output", "-o"}, description = "Output file path for summary", required = true, validateWith = OutputFileValidator.class, converter = FileConverter.class)
25 private File outputFile; 26 private File outputFile;
26 27
27 @Parameter(names = {"-target", "-t"}, description = "Target summary token count", required = true, validateWith = PositiveInteger.class) 28 @Parameter(names = {"-target", "-t"}, description = "Target summary token count", required = true, validateWith = PositiveInteger.class)
@@ -84,4 +85,19 @@ class Cli { @@ -84,4 +85,19 @@ class Cli {
84 } 85 }
85 86
86 } 87 }
  88 +
  89 + public static class OutputFileValidator implements IParameterValidator {
  90 +
  91 + @Override
  92 + public void validate(String name, String value) {
  93 + File file = new File(value);
  94 + try {
  95 + file.createNewFile();
  96 + } catch (IOException ex) {
  97 + throw new ParameterException("Parameter " + name
  98 + + " should be a valid file path (found " + value + ")", ex);
  99 + }
  100 + }
  101 +
  102 + }
87 } 103 }
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -182,5 +182,4 @@ public class Utils { @@ -182,5 +182,4 @@ public class Utils {
182 return sb.toString().trim(); 182 return sb.toString().trim();
183 } 183 }
184 184
185 -  
186 } 185 }
187 \ No newline at end of file 186 \ No newline at end of file
nicolas-train/pom.xml
@@ -40,6 +40,10 @@ @@ -40,6 +40,10 @@
40 <groupId>pl.waw.ipipan.zil.multiservice</groupId> 40 <groupId>pl.waw.ipipan.zil.multiservice</groupId>
41 <artifactId>utils</artifactId> 41 <artifactId>utils</artifactId>
42 </dependency> 42 </dependency>
  43 + <dependency>
  44 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  45 + <artifactId>eval</artifactId>
  46 + </dependency>
43 47
44 <!-- third party --> 48 <!-- third party -->
45 <dependency> 49 <dependency>
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Main.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.pipeline.*;
  4 +
  5 +public class Main {
  6 +
  7 + private Main() {
  8 + }
  9 +
  10 + public static void main(String[] args) throws Exception {
  11 + DownloadCorpus.main(args);
  12 + DownloadTrainingResources.main(args);
  13 + ExtractGoldSummaries.main(args);
  14 + CreateOptimalSummaries.main(args);
  15 + PrepareTrainingData.main(args);
  16 + TrainAllModels.main(args);
  17 + }
  18 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +import net.lingala.zip4j.core.ZipFile;
  4 +import net.lingala.zip4j.exception.ZipException;
  5 +import org.apache.commons.io.FileUtils;
  6 +import org.slf4j.Logger;
  7 +import org.slf4j.LoggerFactory;
  8 +
  9 +import java.io.File;
  10 +import java.io.IOException;
  11 +import java.net.URL;
  12 +
  13 +public class PathConstants {
  14 +
  15 + private static final Logger LOG = LoggerFactory.getLogger(PathConstants.class);
  16 +
  17 + public static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip";
  18 + public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip";
  19 + public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip";
  20 + public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv";
  21 +
  22 + public static final File WORKING_DIR = new File("data");
  23 +
  24 + public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip");
  25 + public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip");
  26 + public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip");
  27 +
  28 + public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus");
  29 + public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data");
  30 + public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids");
  31 + public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed");
  32 + public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold");
  33 + public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold");
  34 + public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal");
  35 + public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv");
  36 +
  37 + public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff");
  38 + public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff");
  39 + public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff");
  40 + public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff");
  41 +
  42 + private PathConstants() {
  43 + }
  44 +
  45 + public static File createFolder(File folder) {
  46 + if (folder.mkdir()) {
  47 + LOG.info("Created directory at: {}.", folder.getPath());
  48 + } else {
  49 + LOG.info("Directory already present at: {}.", folder.getPath());
  50 + }
  51 + return folder;
  52 + }
  53 +
  54 + public static void downloadFile(String fileUrl, File targetFile) throws IOException {
  55 + if (!targetFile.exists()) {
  56 + LOG.info("Downloading file from url {} to file {} ...", fileUrl, targetFile);
  57 + FileUtils.copyURLToFile(new URL(fileUrl), targetFile);
  58 + LOG.info("done.");
  59 + } else {
  60 + LOG.info("File {} already downloaded.", targetFile);
  61 + }
  62 + }
  63 +
  64 + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException {
  65 + downloadFile(url, targetZipFile);
  66 + extractZipFile(targetZipFile, targetDir);
  67 + }
  68 +
  69 + private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException {
  70 + if (targetDir.exists()) {
  71 + LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir);
  72 + } else {
  73 + createFolder(targetDir);
  74 + ZipFile zipFile = new ZipFile(targetZipFile);
  75 + zipFile.extractAll(targetDir.getPath());
  76 + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir);
  77 + }
  78 + }
  79 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/MentionScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model;
2 2
3 import com.google.common.collect.HashMultiset; 3 import com.google.common.collect.HashMultiset;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/SentenceScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model;
2 2
3 import com.google.common.collect.HashMultiset; 3 import com.google.common.collect.HashMultiset;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import com.google.common.collect.Multiset; 5 import com.google.common.collect.Multiset;
6 -import com.google.common.collect.Sets;  
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/Settings.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.common; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model;
2 2
3 import weka.classifiers.Classifier; 3 import weka.classifiers.Classifier;
4 import weka.classifiers.trees.RandomForest; 4 import weka.classifiers.trees.RandomForest;
5 5
6 -public class ModelConstants { 6 +public class Settings {
7 7
8 - public static final String MENTION_DATASET_PATH = "data/arff/mentions_train.arff";  
9 - public static final String SENTENCE_DATASET_PATH = "data/arff/sentences_train.arff";  
10 - public static final String ZERO_DATASET_PATH = "data/arff/zeros_train.arff";  
11 -  
12 - private static final int NUM_ITERATIONS = 250; 8 + private static final int NUM_ITERATIONS = 20;
13 private static final int NUM_EXECUTION_SLOTS = 8; 9 private static final int NUM_EXECUTION_SLOTS = 8;
14 private static final int SEED = 0; 10 private static final int SEED = 0;
15 11
16 - private ModelConstants() { 12 + private Settings() {
17 } 13 }
18 14
19 public static Classifier getMentionClassifier() { 15 public static Classifier getMentionClassifier() {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import org.apache.commons.csv.CSVFormat; 4 import org.apache.commons.csv.CSVFormat;
@@ -9,9 +9,7 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Constants; @@ -9,9 +9,7 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
11 11
12 -import java.io.IOException;  
13 -import java.io.InputStream;  
14 -import java.io.InputStreamReader; 12 +import java.io.*;
15 import java.util.List; 13 import java.util.List;
16 import java.util.Map; 14 import java.util.Map;
17 15
@@ -21,8 +19,8 @@ public class ZeroScorer { @@ -21,8 +19,8 @@ public class ZeroScorer {
21 19
22 private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); 20 private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap();
23 21
24 - public ZeroScorer(String goldZerosResourcePath) throws IOException {  
25 - try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosResourcePath); 22 + public ZeroScorer(File zeroTrainingCorpusFile) throws IOException {
  23 + try (InputStream stream = new FileInputStream(zeroTrainingCorpusFile);
26 InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); 24 InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING);
27 CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { 25 CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) {
28 List<CSVRecord> records = parser.getRecords(); 26 List<CSVRecord> records = parser.getRecords();
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.common;  
2 -  
3 -import org.apache.commons.lang3.time.StopWatch;  
4 -import org.slf4j.Logger;  
5 -import org.slf4j.LoggerFactory;  
6 -import weka.classifiers.Classifier;  
7 -import weka.core.Instances;  
8 -import weka.core.converters.ArffLoader;  
9 -  
10 -import java.io.File;  
11 -import java.io.FileOutputStream;  
12 -import java.io.ObjectOutputStream;  
13 -import java.util.logging.LogManager;  
14 -  
15 -@SuppressWarnings("squid:S2118")  
16 -public class TrainModelCommon {  
17 -  
18 - private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class);  
19 -  
20 - private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";  
21 -  
22 - private TrainModelCommon() {  
23 - }  
24 -  
25 - public static void trainAndSaveModel(String datasetPath, Classifier classifier, String targetPath) throws Exception {  
26 - LogManager.getLogManager().reset(); // disable WEKA logging  
27 -  
28 - ArffLoader loader = new ArffLoader();  
29 - loader.setFile(new File(datasetPath));  
30 - Instances instances = loader.getDataSet();  
31 - instances.setClassIndex(0);  
32 - LOG.info("{} instances loaded.", instances.size());  
33 - LOG.info("{} attributes for each instance.", instances.numAttributes());  
34 -  
35 - StopWatch watch = new StopWatch();  
36 - watch.start();  
37 -  
38 - LOG.info("Building classifier...");  
39 - classifier.buildClassifier(instances);  
40 - LOG.info("...done. Build classifier: {}", classifier);  
41 -  
42 - String target = TARGET_MODEL_DIR + targetPath;  
43 - LOG.info("Saving classifier at: {}", target);  
44 - try (ObjectOutputStream oos = new ObjectOutputStream(  
45 - new FileOutputStream(target))) {  
46 - oos.writeObject(classifier);  
47 - }  
48 -  
49 - watch.stop();  
50 - LOG.info("Elapsed time: {}", watch);  
51 - }  
52 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/TrainMentionModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
4 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
5 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;  
6 -import weka.classifiers.Classifier;  
7 -  
8 -public class TrainMentionModel {  
9 -  
10 - private TrainMentionModel() {  
11 - }  
12 -  
13 - public static void main(String[] args) throws Exception {  
14 - Classifier classifier = ModelConstants.getMentionClassifier();  
15 - String datasetPath = ModelConstants.MENTION_DATASET_PATH;  
16 - String targetPath = Constants.MENTION_MODEL_RESOURCE_PATH;  
17 - TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);  
18 - }  
19 -  
20 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/TrainSentenceModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
4 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
5 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;  
6 -import weka.classifiers.Classifier;  
7 -  
8 -public class TrainSentenceModel {  
9 -  
10 - private TrainSentenceModel() {  
11 - }  
12 -  
13 - public static void main(String[] args) throws Exception {  
14 - Classifier classifier = ModelConstants.getSentenceClassifier();  
15 - String datasetPath = ModelConstants.SENTENCE_DATASET_PATH;  
16 - String targetPath = Constants.SENTENCE_MODEL_RESOURCE_PATH;  
17 - TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);  
18 - }  
19 -  
20 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/TrainZeroModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;  
2 -  
3 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
4 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
5 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;  
6 -import weka.classifiers.Classifier;  
7 -  
8 -public class TrainZeroModel {  
9 -  
10 - private TrainZeroModel() {  
11 - }  
12 -  
13 - public static void main(String[] args) throws Exception {  
14 - Classifier classifier = ModelConstants.getZeroClassifier();  
15 - String datasetPath = ModelConstants.ZERO_DATASET_PATH;  
16 - String targetPath = Constants.ZERO_MODEL_RESOURCE_PATH;  
17 - TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);  
18 - }  
19 -  
20 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
  2 +
  3 +
  4 +import com.google.common.collect.HashMultiset;
  5 +import com.google.common.collect.Lists;
  6 +import com.google.common.collect.Maps;
  7 +import com.google.common.collect.Multiset;
  8 +import org.apache.commons.io.FileUtils;
  9 +import pl.waw.ipipan.zil.summ.eval.Main;
  10 +import pl.waw.ipipan.zil.summ.eval.rouge.RougeN;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  12 +
  13 +import java.io.File;
  14 +import java.io.IOException;
  15 +import java.util.*;
  16 +import java.util.stream.Collectors;
  17 +
  18 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  19 +
  20 +public class CreateOptimalSummaries {
  21 +
  22 + private static final int ROUGE_N = 1;
  23 + private static final String SUMMARY_SUFFIX = "_optimal.txt";
  24 +
  25 + private CreateOptimalSummaries() {
  26 + }
  27 +
  28 + public static void main(String[] args) throws IOException {
  29 + createFolder(OPTIMAL_SUMMARIES_DIR);
  30 + Map<String, Set<File>> id2goldSummaryFiles = Main.loadFiles(GOLD_TRAIN_SUMMARIES_DIR);
  31 + createUpperBoundSummaries(id2goldSummaryFiles, OPTIMAL_SUMMARIES_DIR);
  32 + }
  33 +
  34 + private static void createUpperBoundSummaries(Map<String, Set<File>> id2goldSummaryFiles, File targetDir)
  35 + throws IOException {
  36 + for (Map.Entry<String, Set<File>> entry : id2goldSummaryFiles.entrySet()) {
  37 + String id = entry.getKey();
  38 + Set<File> goldSummaryFiles = entry.getValue();
  39 + Set<String> goldSummaries = goldSummaryFiles.stream().map(Main::loadSummaryFromFile)
  40 + .collect(Collectors.toSet());
  41 + int averageGoldWordCount = (int) goldSummaries.stream().mapToDouble(s -> RougeN.tokenize(s).size())
  42 + .average().orElse(0);
  43 + String optimalSummary = createOptimalSummary(goldSummaries, ROUGE_N, averageGoldWordCount);
  44 + File targetFile = new File(targetDir, id + SUMMARY_SUFFIX);
  45 + FileUtils.writeStringToFile(targetFile, optimalSummary, Constants.ENCODING);
  46 + }
  47 + }
  48 +
  49 + private static String createOptimalSummary(Set<String> goldSummaries, int i, int averageGoldWordCount) {
  50 + Map<List<String>, List<Integer>> ngram2counts = Maps.newHashMap();
  51 + for (String goldSummary : goldSummaries) {
  52 + Multiset<List<String>> goldNgrams = HashMultiset.create();
  53 + RougeN.countNgrams(goldNgrams, i, goldSummary);
  54 + for (List<String> ngram : goldNgrams.elementSet()) {
  55 + ngram2counts.putIfAbsent(ngram, Lists.newArrayList());
  56 + ngram2counts.get(ngram).add(goldNgrams.count(ngram));
  57 + }
  58 + }
  59 +
  60 + int summaryWordCount = 0;
  61 + StringBuilder summary = new StringBuilder();
  62 + while (averageGoldWordCount >= summaryWordCount) {
  63 + List<String> ngram = pickBestNgram(ngram2counts);
  64 + summary.append(" ").append(String.join(" ", ngram));
  65 + summaryWordCount += ngram.size();
  66 + }
  67 + return summary.toString().trim();
  68 + }
  69 +
  70 + private static List<String> pickBestNgram(Map<List<String>, List<Integer>> ngram2counts) {
  71 + Optional<List<String>> optional = ngram2counts.keySet().stream()
  72 + .sorted(Comparator.comparing((List<String> ngram) -> ngram2counts.get(ngram).size()).reversed()).findFirst();
  73 + if (!optional.isPresent()) {
  74 + throw new IllegalArgumentException("No more ngrams to pick!");
  75 + }
  76 + List<String> optimalNgram = optional.get();
  77 + List<Integer> counts = ngram2counts.get(optimalNgram);
  78 + List<Integer> newCounts = Lists.newArrayList();
  79 + for (Integer c : counts)
  80 + if (c > 1)
  81 + newCounts.add(c - 1);
  82 + if (newCounts.isEmpty())
  83 + ngram2counts.remove(optimalNgram);
  84 + else
  85 + ngram2counts.put(optimalNgram, newCounts);
  86 + return optimalNgram;
  87 + }
  88 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadAndPreprocessCorpus.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;  
2 -  
3 -import net.lingala.zip4j.core.ZipFile;  
4 -import org.apache.commons.io.FileUtils;  
5 -import org.slf4j.Logger;  
6 -import org.slf4j.LoggerFactory;  
7 -  
8 -import java.io.File;  
9 -import java.net.URL;  
10 -  
11 -public class DownloadAndPreprocessCorpus {  
12 -  
13 - private static final Logger LOG = LoggerFactory.getLogger(DownloadAndPreprocessCorpus.class);  
14 -  
15 - private static final String WORKING_DIR = "data";  
16 - private static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip";  
17 -  
18 - private DownloadAndPreprocessCorpus() {  
19 - }  
20 -  
21 - public static void main(String[] args) throws Exception {  
22 - File workDir = createFolder(WORKING_DIR);  
23 -  
24 - File corpusFile = new File(workDir, "corpus.zip");  
25 - if (!corpusFile.exists()) {  
26 - LOG.info("Downloading corpus file...");  
27 - FileUtils.copyURLToFile(new URL(CORPUS_DOWNLOAD_URL), corpusFile);  
28 - LOG.info("done.");  
29 - } else {  
30 - LOG.info("Corpus file already downloaded.");  
31 - }  
32 -  
33 - File extractedCorpusDir = new File(workDir, "corpus");  
34 - if (extractedCorpusDir.exists()) {  
35 - LOG.info("Corpus file already extracted.");  
36 - } else {  
37 - ZipFile zipFile = new ZipFile(corpusFile);  
38 - zipFile.extractAll(extractedCorpusDir.getPath());  
39 - LOG.info("Extracted corpus file.");  
40 - }  
41 -  
42 - File pscDir = new File(extractedCorpusDir, "PSC_1.0");  
43 - File dataDir = new File(pscDir, "data");  
44 -  
45 - File preprocessed = new File(WORKING_DIR, "preprocessed");  
46 - createFolder(preprocessed.getPath());  
47 - Preprocess.main(new String[]{dataDir.getPath(), preprocessed.getPath()});  
48 - }  
49 -  
50 - private static File createFolder(String path) {  
51 - File folder = new File(path);  
52 - if (folder.mkdir()) {  
53 - LOG.info("Created directory at: {}.", path);  
54 - } else {  
55 - LOG.info("Directory already present at: {}.", path);  
56 - }  
57 - return folder;  
58 - }  
59 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
  2 +
  3 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  4 +
  5 +public class DownloadCorpus {
  6 +
  7 + private DownloadCorpus() {
  8 + }
  9 +
  10 + public static void main(String[] args) throws Exception {
  11 + createFolder(WORKING_DIR);
  12 + downloadFileAndExtract(CORPUS_DOWNLOAD_URL, ZIPPED_CORPUS_FILE, EXTRACTED_CORPUS_DIR);
  13 + }
  14 +
  15 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
  2 +
  3 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  4 +
  5 +public class DownloadTrainingResources {
  6 +
  7 + private DownloadTrainingResources() {
  8 + }
  9 +
  10 + public static void main(String[] args) throws Exception {
  11 + createFolder(WORKING_DIR);
  12 + downloadFileAndExtract(PREPROCESSED_CORPUS_DOWNLOAD_URL, ZIPPED_PREPROCESSED_CORPUS_FILE, PREPROCESSED_CORPUS_DIR);
  13 + downloadFileAndExtract(SUMMARY_SENTENCE_IDS_DOWNLOAD_URL, ZIPPED_SUMMARY_SENTENCE_IDS_FILE, SUMMARY_SENTENCE_IDS_DIR);
  14 + downloadFile(ZERO_TRAINING_CORPUS_URL, ZERO_TRAINING_CORPUS);
  15 + }
  16 +
  17 +
  18 +}
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/ExtractGoldSummaries.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 -package pl.waw.ipipan.zil.summ.nicolas.eval; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 3 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; 4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
@@ -9,32 +9,43 @@ import javax.xml.bind.JAXBException; @@ -9,32 +9,43 @@ import javax.xml.bind.JAXBException;
9 import java.io.File; 9 import java.io.File;
10 import java.io.IOException; 10 import java.io.IOException;
11 import java.util.List; 11 import java.util.List;
12 -import java.util.Set; 12 +import java.util.function.Predicate;
13 import java.util.stream.Collectors; 13 import java.util.stream.Collectors;
  14 +import java.util.stream.Stream;
14 15
15 -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; 16 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
16 17
17 public class ExtractGoldSummaries { 18 public class ExtractGoldSummaries {
18 19
  20 + private static final String ABSTRACT_SUMMARY_TYPE = "abstract";
  21 + private static final int SUMMARY_RATIO = 20;
  22 +
  23 + private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
  24 +
  25 +
19 private ExtractGoldSummaries() { 26 private ExtractGoldSummaries() {
20 } 27 }
21 28
22 public static void main(String[] args) throws IOException, JAXBException { 29 public static void main(String[] args) throws IOException, JAXBException {
23 - File corpusDir = new File("data/corpus/PSC_1.0/data");  
24 - File targetDir = new File("data/summaries-gold");  
25 - targetDir.mkdir(); 30 + createFolder(GOLD_TEST_SUMMARIES_DIR);
  31 + createFolder(GOLD_TRAIN_SUMMARIES_DIR);
26 32
27 - Set<String> testTextIds = loadTestTextIds();  
28 - File[] files = corpusDir.listFiles(); 33 + File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles();
29 if (files != null) { 34 if (files != null) {
30 for (File file : files) { 35 for (File file : files) {
31 Text text = PSC_IO.readText(file); 36 Text text = PSC_IO.readText(file);
32 - if (!testTextIds.contains(text.getId()))  
33 - continue;  
34 37
35 - List<Summary> goldSummaries = text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals("abstract") && summary.getRatio().equals(20)).collect(Collectors.toList()); 38 + List<Summary> goldSummaries;
  39 + Stream<Summary> stream = text.getSummaries().getSummary().stream();
  40 + boolean isTest = IS_TEST.test(text);
  41 + if (isTest) {
  42 + goldSummaries = stream.filter(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  43 + } else {
  44 + goldSummaries = stream.filter(summary -> !summary.getType().equals(ABSTRACT_SUMMARY_TYPE) && summary.getRatio().equals(SUMMARY_RATIO)).collect(Collectors.toList());
  45 + }
36 46
37 for (Summary summary : goldSummaries) { 47 for (Summary summary : goldSummaries) {
  48 + File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR;
38 File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); 49 File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
39 Utils.writeStringToFile(summary.getBody(), targetFile); 50 Utils.writeStringToFile(summary.getBody(), targetFile);
40 } 51 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -11,22 +11,18 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; @@ -11,22 +11,18 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
13 import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; 13 import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
14 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
15 import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; 14 import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
16 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 15 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
17 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 16 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
18 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 17 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
19 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;  
20 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 18 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
21 -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;  
22 -import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.MentionScorer;  
23 -import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.SentenceScorer;  
24 -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.ZeroScorer; 19 +import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer;
  20 +import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer;
  21 +import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer;
25 import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; 22 import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
26 import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; 23 import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
27 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; 24 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
28 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 25 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
29 -import weka.classifiers.Classifier;  
30 import weka.core.Instance; 26 import weka.core.Instance;
31 import weka.core.Instances; 27 import weka.core.Instances;
32 import weka.core.converters.ArffSaver; 28 import weka.core.converters.ArffSaver;
@@ -34,31 +30,26 @@ import weka.core.converters.ArffSaver; @@ -34,31 +30,26 @@ import weka.core.converters.ArffSaver;
34 import java.io.File; 30 import java.io.File;
35 import java.io.FileReader; 31 import java.io.FileReader;
36 import java.io.IOException; 32 import java.io.IOException;
37 -import java.io.InputStream; 33 +import java.util.Arrays;
38 import java.util.List; 34 import java.util.List;
39 import java.util.Map; 35 import java.util.Map;
40 import java.util.Set; 36 import java.util.Set;
41 import java.util.function.Predicate; 37 import java.util.function.Predicate;
42 import java.util.stream.Collectors; 38 import java.util.stream.Collectors;
43 39
  40 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  41 +
44 public class PrepareTrainingData { 42 public class PrepareTrainingData {
45 43
46 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); 44 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
47 45
48 - private static final String THRIFT_TEXTS_PATH = "data/preprocessed";  
49 - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "data/summaries-optimal";  
50 - private static final String SUMMARY_SENTENCE_IDS = "data/summaries-sentence-ids";  
51 -  
52 - private static final String ZERO_TRAINING_DATA_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv";  
53 - private static final String TRAIN_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt";  
54 -  
55 private PrepareTrainingData() { 46 private PrepareTrainingData() {
56 } 47 }
57 48
58 public static void main(String[] args) throws Exception { 49 public static void main(String[] args) throws Exception {
59 Set<String> trainTextIds = loadTrainTextIds(); 50 Set<String> trainTextIds = loadTrainTextIds();
60 51
61 - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(THRIFT_TEXTS_PATH), trainTextIds::contains); 52 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(PREPROCESSED_CORPUS_DIR, trainTextIds::contains);
62 Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); 53 Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains);
63 54
64 prepareMentionsDataset(id2preprocessedText, id2optimalSummary); 55 prepareMentionsDataset(id2preprocessedText, id2optimalSummary);
@@ -66,7 +57,7 @@ public class PrepareTrainingData { @@ -66,7 +57,7 @@ public class PrepareTrainingData {
66 prepareZerosDataset(id2preprocessedText); 57 prepareZerosDataset(id2preprocessedText);
67 } 58 }
68 59
69 - public static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { 60 + private static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException {
70 MentionScorer mentionScorer = new MentionScorer(); 61 MentionScorer mentionScorer = new MentionScorer();
71 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); 62 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
72 63
@@ -74,7 +65,7 @@ public class PrepareTrainingData { @@ -74,7 +65,7 @@ public class PrepareTrainingData {
74 65
75 int i = 1; 66 int i = 1;
76 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 67 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
77 - LOG.info("{}/{}", i++, id2preprocessedText.size()); 68 + logProgress(id2preprocessedText, i++);
78 69
79 String id = entry.getKey(); 70 String id = entry.getKey();
80 TText preprocessedText = entry.getValue(); 71 TText preprocessedText = entry.getValue();
@@ -92,29 +83,33 @@ public class PrepareTrainingData { @@ -92,29 +83,33 @@ public class PrepareTrainingData {
92 instances.add(instance); 83 instances.add(instance);
93 } 84 }
94 } 85 }
95 - saveInstancesToFile(instances, new File(ModelConstants.MENTION_DATASET_PATH)); 86 + saveInstancesToFile(instances, MENTION_ARFF);
96 } 87 }
97 88
98 - private static Set<String> loadTrainTextIds() throws IOException {  
99 - try (InputStream inputStream = PrepareTrainingData.class.getResourceAsStream(TRAIN_TEXT_IDS_RESOURCE_PATH)) {  
100 - List<String> testTextIds = IOUtils.readLines(inputStream, Constants.ENCODING);  
101 - return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); 89 + private static void logProgress(Map<String, TText> id2preprocessedText, int i) {
  90 + if (i % 10 == 0) {
  91 + LOG.info("{}/{}", i, id2preprocessedText.size());
102 } 92 }
103 } 93 }
104 94
105 - public static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { 95 + private static Set<String> loadTrainTextIds() throws IOException {
  96 + File[] optimalSummaries = OPTIMAL_SUMMARIES_DIR.listFiles();
  97 + if (optimalSummaries == null)
  98 + throw new IOException("No optimal summaries at " + OPTIMAL_SUMMARIES_DIR);
  99 +
  100 + return Arrays.stream(optimalSummaries).map(file -> file.getName().split("_")[0]).collect(Collectors.toSet());
  101 + }
  102 +
  103 + private static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
106 104
107 SentenceScorer sentenceScorer = new SentenceScorer(); 105 SentenceScorer sentenceScorer = new SentenceScorer();
108 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); 106 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
109 107
110 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 108 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
111 109
112 - Classifier classifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);  
113 - MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();  
114 -  
115 int i = 1; 110 int i = 1;
116 for (String textId : id2preprocessedText.keySet()) { 111 for (String textId : id2preprocessedText.keySet()) {
117 - LOG.info("{}/{}", i++, id2preprocessedText.size()); 112 + logProgress(id2preprocessedText, i++);
118 113
119 TText preprocessedText = id2preprocessedText.get(textId); 114 TText preprocessedText = id2preprocessedText.get(textId);
120 String optimalSummary = id2optimalSummary.get(textId); 115 String optimalSummary = id2optimalSummary.get(textId);
@@ -123,9 +118,7 @@ public class PrepareTrainingData { @@ -123,9 +118,7 @@ public class PrepareTrainingData {
123 Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText); 118 Map<TSentence, Double> sentence2score = sentenceScorer.calculateSentenceScores(optimalSummary, preprocessedText);
124 119
125 Set<TMention> goodMentions 120 Set<TMention> goodMentions
126 - = MentionModel.detectGoodMentions(classifier, mentionFeatureExtractor, preprocessedText);  
127 -// Set<TMention> goodMentions  
128 -// = Utils.loadGoldGoodMentions(textId, preprocessedText, true); 121 + = loadGoldGoodMentions(textId, preprocessedText, id2optimalSummary);
129 122
130 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); 123 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
131 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { 124 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
@@ -136,21 +129,31 @@ public class PrepareTrainingData { @@ -136,21 +129,31 @@ public class PrepareTrainingData {
136 instances.add(instance); 129 instances.add(instance);
137 } 130 }
138 } 131 }
139 - saveInstancesToFile(instances, new File(ModelConstants.SENTENCE_DATASET_PATH)); 132 + saveInstancesToFile(instances, SENTENCE_ARFF);
  133 + }
  134 +
  135 + private static Set<TMention> loadGoldGoodMentions(String id, TText text, Map<String, String> id2optimalSummary) throws IOException {
  136 + String optimalSummary = id2optimalSummary.get(id);
  137 +
  138 + MentionScorer scorer = new MentionScorer();
  139 + Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
  140 +
  141 + mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) < 1);
  142 + return mention2score.keySet();
140 } 143 }
141 144
142 - public static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException { 145 + private static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException {
143 146
144 - Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS); 147 + Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS_DIR);
145 148
146 - ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_DATA_RESOURCE_PATH); 149 + ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS);
147 ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); 150 ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
148 151
149 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 152 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
150 153
151 int i = 1; 154 int i = 1;
152 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 155 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
153 - LOG.info(i++ + "/" + id2preprocessedText.size()); 156 + logProgress(id2preprocessedText, i++);
154 157
155 String textId = entry.getKey(); 158 String textId = entry.getKey();
156 159
@@ -170,12 +173,12 @@ public class PrepareTrainingData { @@ -170,12 +173,12 @@ public class PrepareTrainingData {
170 } 173 }
171 } 174 }
172 175
173 - saveInstancesToFile(instances, new File(ModelConstants.ZERO_DATASET_PATH)); 176 + saveInstancesToFile(instances, ZERO_ARFF);
174 } 177 }
175 178
176 - private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { 179 + private static Map<String, Set<String>> loadSentenceIds(File idsFolder) throws IOException {
177 Map<String, Set<String>> result = Maps.newHashMap(); 180 Map<String, Set<String>> result = Maps.newHashMap();
178 - File[] files = new File(idsPath).listFiles(); 181 + File[] files = idsFolder.listFiles();
179 if (files != null) 182 if (files != null)
180 for (File f : files) { 183 for (File f : files) {
181 String id = f.getName().split("_")[0]; 184 String id = f.getName().split("_")[0];
@@ -194,7 +197,7 @@ public class PrepareTrainingData { @@ -194,7 +197,7 @@ public class PrepareTrainingData {
194 197
195 private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException { 198 private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException {
196 Map<String, String> id2optimalSummary = Maps.newHashMap(); 199 Map<String, String> id2optimalSummary = Maps.newHashMap();
197 - File[] files = new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles(); 200 + File[] files = OPTIMAL_SUMMARIES_DIR.listFiles();
198 if (files != null) 201 if (files != null)
199 for (File optimalSummaryFile : files) { 202 for (File optimalSummaryFile : files) {
200 String textId = optimalSummaryFile.getName().split("_")[0]; 203 String textId = optimalSummaryFile.getName().split("_")[0];
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/Preprocess.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java
@@ -9,37 +9,27 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; @@ -9,37 +9,27 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
9 import java.io.File; 9 import java.io.File;
10 import java.util.Arrays; 10 import java.util.Arrays;
11 11
12 -public class Preprocess { 12 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
13 13
14 - private static final Logger LOG = LoggerFactory.getLogger(Preprocess.class); 14 +public class PreprocessCorpus {
  15 +
  16 + private static final Logger LOG = LoggerFactory.getLogger(PreprocessCorpus.class);
15 17
16 private static final String CORPUS_FILE_SUFFIX = ".xml"; 18 private static final String CORPUS_FILE_SUFFIX = ".xml";
17 private static final String OUTPUT_FILE_SUFFIX = ".thrift"; 19 private static final String OUTPUT_FILE_SUFFIX = ".thrift";
18 20
19 - private Preprocess() { 21 + private PreprocessCorpus() {
20 } 22 }
21 23
22 public static void main(String[] args) { 24 public static void main(String[] args) {
23 - if (args.length != 2) {  
24 - LOG.error("Wrong usage! Try " + Preprocess.class.getSimpleName() + " dirWithCorpusFiles targetDir");  
25 - return;  
26 - }  
27 - File corpusDir = new File(args[0]);  
28 - if (!corpusDir.isDirectory()) {  
29 - LOG.error("Corpus directory does not exist: {}", corpusDir);  
30 - return;  
31 - }  
32 - File targetDir = new File(args[1]);  
33 - if (!targetDir.isDirectory()) {  
34 - LOG.error("Target directory does not exist: {}", targetDir);  
35 - return;  
36 - } 25 +
  26 + createFolder(PREPROCESSED_CORPUS_DIR);
37 27
38 int ok = 0; 28 int ok = 0;
39 int err = 0; 29 int err = 0;
40 - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX)); 30 + File[] files = EXTRACTED_CORPUS_DATA_DIR.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX));
41 if (files == null || files.length == 0) { 31 if (files == null || files.length == 0) {
42 - LOG.error("No corpus files found at: {}", corpusDir); 32 + LOG.error("No corpus files found at: {}", EXTRACTED_CORPUS_DATA_DIR);
43 return; 33 return;
44 } 34 }
45 Arrays.sort(files); 35 Arrays.sort(files);
@@ -49,7 +39,7 @@ public class Preprocess { @@ -49,7 +39,7 @@ public class Preprocess {
49 for (File file : files) { 39 for (File file : files) {
50 try { 40 try {
51 Text text = PSC_IO.readText(file); 41 Text text = PSC_IO.readText(file);
52 - File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); 42 + File targetFile = new File(PREPROCESSED_CORPUS_DIR, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX));
53 processor.preprocessToFile(text.getBody(), targetFile); 43 processor.preprocessToFile(text.getBody(), targetFile);
54 ok++; 44 ok++;
55 } catch (Exception e) { 45 } catch (Exception e) {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 -import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel;  
4 -import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel;  
5 -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; 3 +import org.apache.commons.lang3.time.StopWatch;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings;
  8 +import weka.classifiers.Classifier;
  9 +import weka.core.Instances;
  10 +import weka.core.converters.ArffLoader;
  11 +
  12 +import java.io.File;
  13 +import java.io.FileOutputStream;
  14 +import java.io.ObjectOutputStream;
  15 +import java.util.logging.LogManager;
  16 +
  17 +import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
6 18
7 public class TrainAllModels { 19 public class TrainAllModels {
8 20
  21 + private static final Logger LOG = LoggerFactory.getLogger(TrainAllModels.class);
  22 +
  23 + private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";
  24 +
9 private TrainAllModels() { 25 private TrainAllModels() {
10 } 26 }
11 27
12 public static void main(String[] args) throws Exception { 28 public static void main(String[] args) throws Exception {
13 - TrainMentionModel.main(args);  
14 - TrainSentenceModel.main(args);  
15 - TrainZeroModel.main(args); 29 + trainAndSaveModel(MENTION_ARFF, Settings.getMentionClassifier(), Constants.MENTION_MODEL_RESOURCE_PATH);
  30 + trainAndSaveModel(SENTENCE_ARFF, Settings.getSentenceClassifier(), Constants.SENTENCE_MODEL_RESOURCE_PATH);
  31 + trainAndSaveModel(ZERO_ARFF, Settings.getZeroClassifier(), Constants.ZERO_MODEL_RESOURCE_PATH);
  32 + }
  33 +
  34 + private static void trainAndSaveModel(File dataset, Classifier classifier, String targetPath) throws Exception {
  35 + LogManager.getLogManager().reset(); // disable WEKA logging
  36 +
  37 + ArffLoader loader = new ArffLoader();
  38 + loader.setFile(dataset);
  39 + Instances instances = loader.getDataSet();
  40 + instances.setClassIndex(0);
  41 + LOG.info("{} instances loaded.", instances.size());
  42 + LOG.info("{} attributes for each instance.", instances.numAttributes());
  43 +
  44 + StopWatch watch = new StopWatch();
  45 + watch.start();
  46 +
  47 + LOG.info("Building classifier...");
  48 + classifier.buildClassifier(instances);
  49 + LOG.info("...done. Build classifier: {}", classifier);
  50 +
  51 + String target = TARGET_MODEL_DIR + targetPath;
  52 + LOG.info("Saving classifier at: {}", target);
  53 + try (ObjectOutputStream oos = new ObjectOutputStream(
  54 + new FileOutputStream(target))) {
  55 + oos.writeObject(classifier);
  56 + }
  57 +
  58 + watch.stop();
  59 + LOG.info("Elapsed time: {}", watch);
16 } 60 }
17 } 61 }
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt deleted
1 -199704210011  
2 -199704210013  
3 -199704250031  
4 -199704260017  
5 -199801030156  
6 -199801100009  
7 -199801150038  
8 -199801150133  
9 -199801170001  
10 -199801170129  
11 -199801170130  
12 -199801200002  
13 -199801200132  
14 -199801210007  
15 -199801220030  
16 -199801220127  
17 -199801230001  
18 -199801230095  
19 -199801240116  
20 -199801240123  
21 -199801260113  
22 -199801270108  
23 -199801280128  
24 -199801290020  
25 -199801310032  
26 -199802040201  
27 -199901180149  
28 -199901190049  
29 -199901230088  
30 -199901250006  
31 -199901250008  
32 -199901250111  
33 -199901250113  
34 -199901300064  
35 -199901300098  
36 -199902240123  
37 -199906220027  
38 -199906220037  
39 -199906220038  
40 -199906220056  
41 -199906220065  
42 -199906230040  
43 -199906230052  
44 -199906240040  
45 -199906240088  
46 -199906250007  
47 -199906250091  
48 -199906260015  
49 -199906260018  
50 -199906260038  
51 -199907030016  
52 -199907030018  
53 -199907030042  
54 -199907030059  
55 -199907050032  
56 -199907050040  
57 -199907050047  
58 -199907050071  
59 -199907270095  
60 -199907270137  
61 -199907270145  
62 -199909210045  
63 -199909250054  
64 -199909300064  
65 -199909300065  
66 -199909300066  
67 -199910020049  
68 -199910020050  
69 -199910090047  
70 -199910090049  
71 -199910090051  
72 -199910110055  
73 -199910110057  
74 -199910210058  
75 -199910210059  
76 -199910270041  
77 -199910280054  
78 -199910280055  
79 -199910280057  
80 -199910300026  
81 -199911030039  
82 -199911030040  
83 -199911030041  
84 -199911060031  
85 -199911060042  
86 -199911060043  
87 -199911080054  
88 -199911080055  
89 -199911080056  
90 -199911100061  
91 -199911100062  
92 -199911100063  
93 -199911130036  
94 -199911130037  
95 -199911130038  
96 -199911180042  
97 -199911180043  
98 -199911180044  
99 -199911220059  
100 -199911220061  
101 -199911220066  
102 -199911230041  
103 -199911240035  
104 -199911240037  
105 -199911240038  
106 -199911250055  
107 -199911250057  
108 -199912020059  
109 -199912090045  
110 -199912090047  
111 -199912090061  
112 -199912110041  
113 -199912110042  
114 -199912130055  
115 -199912130057  
116 -199912170065  
117 -199912180052  
118 -199912210018  
119 -199912210037  
120 -199912210040  
121 -199912220045  
122 -199912220046  
123 -199912220047  
124 -199912230058  
125 -199912230059  
126 -199912230097  
127 -199912280028  
128 -199912280044  
129 -199912280045  
130 -199912310085  
131 -199912310087  
132 -200001030047  
133 -200001030106  
134 -200001040030  
135 -200001040031  
136 -200001060052  
137 -200001060053  
138 -200001060055  
139 -200001070062  
140 -200001070066  
141 -200001080040  
142 -200001080041  
143 -200001140061  
144 -200001140064  
145 -200001170049  
146 -200001170051  
147 -200001170052  
148 -200001170053  
149 -200001180040  
150 -200001200056  
151 -200001220023  
152 -200001220118  
153 -200001240016  
154 -200001290042  
155 -200001310048  
156 -200001310049  
157 -200001310050  
158 -200001310054  
159 -200002090042  
160 -200002090043  
161 -200002120045  
162 -200002120046  
163 -200002160046  
164 -200002160047  
165 -200002250063  
166 -200002250065  
167 -200002250066  
168 -200002290044  
169 -200002290045  
170 -200002290046  
171 -200002290047  
172 -200002290048  
173 -200003010058  
174 -200003010059  
175 -200003060054  
176 -200003060055  
177 -200003060057  
178 -200003110047  
179 -200003110048  
180 -200003110049  
181 -200003210044  
182 -200003210045  
183 -200004120021  
184 -200004120022  
185 -200004120023  
186 -200004150048  
187 -200004150049  
188 -200004150050  
189 -200004170026  
190 -200004170065  
191 -200004220044  
192 -200004220045  
193 -200004220046  
194 -200004220047  
195 -200004220048  
196 -200005060030  
197 -200005150055  
198 -200005150059  
199 -200005300045  
200 -200005300047  
201 -200005300048  
202 -200006010065  
203 -200006010066  
204 -200006010067  
205 -200006050056  
206 -200006050057  
207 -200006050058  
208 -200006050059  
209 -200006050061  
210 -200006050068  
211 -200006070056  
212 -200006080033  
213 -200006120031  
214 -200006130055  
215 -200006130057  
216 -200006130059  
217 -200006260069  
218 -200006260071  
219 -200006270059  
220 -200007120068  
221 -200007120070  
222 -200007120072  
223 -200007170026  
224 -200007180051  
225 -200007240034  
226 -200007270050  
227 -200007280033  
228 -200008040071  
229 -200008040073  
230 -200008250077  
231 -200008250079  
232 -200008260055  
233 -200008310046  
234 -200010120066  
235 -200010120074  
236 -200010130063  
237 -200010140048  
238 -200010140049  
239 -200010160039  
240 -200010160048  
241 -200010160049  
242 -200010180059  
243 -200010180063  
244 -200010190066  
245 -200010190068  
246 -200011210063  
247 -200011210064  
248 -200011210066  
249 -200012050066  
250 -200012050067  
251 -200012050068  
252 -200012050069  
253 -200012050070  
254 -200012050071  
255 -200012080134  
256 -200012080137  
257 -200012110069  
258 -200012110070  
259 -200012110071  
260 -200012110075  
261 -200012120028  
262 -200012120068  
263 -200012120072  
264 -200012130056  
265 -200012130100  
266 -200012130102  
267 -200012130103  
268 -200012140095  
269 -200012140096  
270 -200012140097  
271 -200012140098  
272 -200012140099  
273 -200012140100  
274 -200012150076  
275 -200012160048  
276 -200012160049  
277 -200012180083  
278 -200012180084  
279 -200012180088  
280 -200012230028  
281 -200012230045  
282 -200012230046  
283 -200012230047  
284 -200012230048  
285 -200012230050  
286 -200012270055  
287 -200012270056  
288 -200101020059  
289 -200101020062  
290 -200101020063  
291 -200101020075  
292 -200101130048  
293 -200101130050  
294 -200101130051  
295 -200101130055  
296 -200101150043  
297 -200101150045  
298 -200101180050  
299 -200101180051  
300 -200101180052  
301 -200101200048  
302 -200101220047  
303 -200101220053  
304 -200102070011  
305 -200102070016  
306 -200102120034  
307 -200102120057  
308 -200102130014  
309 -200102150001  
310 -200102150014  
311 -200102160011  
312 -200102190016  
313 -200102220001  
314 -200102220013  
315 -200102270041  
316 -200102270062  
317 -200102280169  
318 -200103010049  
319 -200103060022  
320 -200103060032  
321 -200103060057  
322 -200103080026  
323 -200103080030  
324 -200103080036  
325 -200103100019  
326 -200103100021  
327 -200103100058  
328 -200103100062  
329 -200103130008  
330 -200103130023  
331 -200103130069  
332 -200103200066  
333 -200103200080  
334 -200103270069  
335 -200103310092  
336 -200104020007  
337 -200104050011  
338 -200104100021  
339 -200104100023  
340 -200104170015  
341 -200104170040  
342 -200104170055  
343 -200104170057  
344 -200104190039  
345 -200104190066  
346 -200104230031  
347 -200104230069  
348 -200104260051  
349 -200104260053  
350 -200104300213  
351 -200104300215  
352 -200104300217  
353 -200105020092  
354 -200105050042  
355 -200105050043  
356 -200105050046  
357 -200105050048  
358 -200105070017  
359 -200105140050  
360 -200105140052  
361 -200105220096  
362 -200105290074  
363 -200105290075  
364 -200106120068  
365 -200106120069  
366 -200106180051  
367 -200106180053  
368 -200106200064  
369 -200106220086  
370 -200106220087  
371 -200106220088  
372 -200106220090  
373 -200106250050  
374 -200107120071  
375 -200107120073  
376 -200107210129  
377 -200107240070  
378 -200107250080  
379 -200108060051  
380 -200108060155  
381 -200108060156  
382 -200108060157  
383 -200108070038  
384 -200108160040  
385 -200108180123  
386 -200108200033  
387 -200108210066  
388 -200108210074  
389 -200108270077  
390 -200108280064  
391 -200109060061  
392 -200109130091  
393 -200109250092  
394 -200109260097  
395 -200109270116  
396 -200110020075  
397 -200110150056  
398 -200110150062  
399 -200110200070  
400 -200110200071  
401 -200110220068  
402 -200111080086  
403 -200111140055  
404 -200111210078  
405 -200111240060  
406 -200112040031  
407 -200112040077  
408 -200112050063  
409 -200112100041  
410 -200112190067  
411 -200201280011  
412 -200201290029  
413 -200202280078  
414 -200203280057  
415 -200203290107  
train.sh 0 → 100755
  1 +#!/usr/bin/env bash
  2 +
  3 +mvn clean install -Dmaven.test.skip=true
  4 +mvn -pl nicolas-train exec:java -Dexec.mainClass="pl.waw.ipipan.zil.summ.nicolas.train.Main"
  5 +mvn install -Dmaven.test.skip=true