Commit 2169abf847196b768600339e9e18d7ab3fe70f3d

Authored by Mateusz Kopeć
1 parent cb490cab

WIP

Showing 44 changed files with 529 additions and 351 deletions
.gitignore
... ... @@ -16,3 +16,5 @@ hs_err_pid*
16 16  
17 17 .idea
18 18 *.iml
  19 +
  20 +/data
19 21 \ No newline at end of file
... ...
nicolas-cli/README.md
... ... @@ -3,6 +3,8 @@
3 3 This module contains a sample command-line application, which uses the Nicolas library to summarize a chosen input text file.
4 4 The summary is written to a target output file. Additionally, the user needs to specify the desired number of tokens in the summary.
5 5  
  6 +Be aware that the summarizer requires internet access and a working Multiservice (multiservice.nlp.ipipan.waw.pl).
  7 +
6 8 ## Installation
7 9  
8 10 mvn clean install
... ...
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java
... ... @@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli;
3 3 import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
5 5 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
  6 +import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
6 7 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
7 8  
8   -import java.io.IOException;
9   -
10 9 public class Main {
11 10  
12 11 private static final Logger LOG = LoggerFactory.getLogger(Main.class);
... ... @@ -26,7 +25,7 @@ public class Main {
26 25 try {
27 26 nicolas = new Nicolas();
28 27 preprocessor = new Preprocessor();
29   - } catch (IOException | ClassNotFoundException e) {
  28 + } catch (NicolasException e) {
30 29 LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit.");
31 30 return;
32 31 }
... ...
nicolas-common/pom.xml
... ... @@ -25,7 +25,7 @@
25 25 <!-- third party -->
26 26 <dependency>
27 27 <groupId>nz.ac.waikato.cms.weka</groupId>
28   - <artifactId>weka-dev</artifactId>
  28 + <artifactId>weka-stable</artifactId>
29 29 </dependency>
30 30 <dependency>
31 31 <groupId>commons-io</groupId>
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.common;
  2 +
  3 +import com.google.common.base.Predicates;
  4 +import com.google.common.collect.Maps;
  5 +import org.slf4j.Logger;
  6 +import org.slf4j.LoggerFactory;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +
  9 +import java.io.File;
  10 +import java.io.FileInputStream;
  11 +import java.io.IOException;
  12 +import java.io.InputStream;
  13 +import java.util.Map;
  14 +import java.util.function.Predicate;
  15 +
  16 +public class ThriftUtils {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class);
  19 +
  20 + private ThriftUtils() {
  21 + }
  22 +
  23 + public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) {
  24 + Map<String, TText> id2text = Maps.newHashMap();
  25 + File[] files = folder.listFiles();
  26 + if (files != null) {
  27 + for (File processedFullTextFile : files) {
  28 + String textId = processedFullTextFile.getName().split("\\.")[0];
  29 + if (!idFilter.test(textId))
  30 + continue;
  31 + TText processedFullText = loadThriftTextFromFile(processedFullTextFile);
  32 + id2text.put(textId, processedFullText);
  33 + }
  34 + }
  35 + LOG.info("{} preprocessed texts found.", id2text.size());
  36 + return id2text;
  37 + }
  38 +
  39 + public static Map<String, TText> loadThriftTextsFromFolder(File folder) {
  40 + return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue());
  41 + }
  42 +
  43 + public static TText loadThriftTextFromFile(File originalFile) {
  44 + try (FileInputStream inputStream = new FileInputStream(originalFile)) {
  45 + return loadThriftTextFromStream(inputStream);
  46 + } catch (IOException e) {
  47 + LOG.error("Error reading serialized Thrift file", e);
  48 + return null;
  49 + }
  50 + }
  51 +
  52 + public static TText loadThriftTextFromStream(InputStream stream) {
  53 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
  54 + return (TText) ois.readObject();
  55 + } catch (ClassNotFoundException | IOException e) {
  56 + LOG.error("Error reading serialized Thrift stream", e);
  57 + return null;
  58 + }
  59 + }
  60 +
  61 +}
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
... ... @@ -28,6 +28,12 @@ public class Utils {
28 28 private Utils() {
29 29 }
30 30  
  31 + public static void writeStringToFile(String string, File file) throws IOException {
  32 + try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
  33 + bw.append(string);
  34 + }
  35 + }
  36 +
31 37 public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
32 38 LOG.info("Loading classifier from path: {}...", modelResourcePath);
33 39 try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {
... ... @@ -76,44 +82,15 @@ public class Utils {
76 82 return instances;
77 83 }
78 84  
79   - public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException {
  85 + public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException {
80 86 LOG.info("Loading classifier...");
81   - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) {
  87 + try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) {
82 88 Classifier classifier = (Classifier) ois.readObject();
83 89 LOG.info("Done. " + classifier.toString());
84 90 return classifier;
85 91 }
86 92 }
87 93  
88   - public static Map<String, TText> loadPreprocessedTexts(String path) {
89   - Map<String, TText> id2text = Maps.newHashMap();
90   - for (File processedFullTextFile : new File(path).listFiles()) {
91   - TText processedFullText = loadThrifted(processedFullTextFile);
92   - id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText);
93   - }
94   - LOG.info(id2text.size() + " preprocessed texts found.");
95   - return id2text;
96   - }
97   -
98   -
99   - public static TText loadThrifted(File originalFile) {
100   - try (FileInputStream inputStream = new FileInputStream(originalFile)) {
101   - return loadThrifted(inputStream);
102   - } catch (IOException e) {
103   - LOG.error("Error reading serialized file: " + e);
104   - return null;
105   - }
106   - }
107   -
108   - public static TText loadThrifted(InputStream stream) {
109   - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
110   - return (TText) ois.readObject();
111   - } catch (ClassNotFoundException | IOException e) {
112   - LOG.error("Error reading serialized file: " + e);
113   - return null;
114   - }
115   - }
116   -
117 94 public static List<String> tokenize(String text) {
118 95 return Arrays.asList(text.split("[^\\p{L}0-9]+"));
119 96 }
... ...
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java
... ... @@ -14,7 +14,7 @@ public class UtilsTest {
14 14 @Test
15 15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception {
16 16 try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
17   - TText text = Utils.loadThrifted(stream);
  17 + TText text = ThriftUtils.loadThriftTextFromStream(stream);
18 18 assertEquals(26, text.getParagraphs().size());
19 19 assertEquals(2, text.getParagraphs().get(4).getSentences().size());
20 20 }
... ...
nicolas-eval/pom.xml 0 → 100644
  1 +<?xml version="1.0" encoding="UTF-8"?>
  2 +<project xmlns="http://maven.apache.org/POM/4.0.0"
  3 + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
  4 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  5 + <parent>
  6 + <artifactId>nicolas-container</artifactId>
  7 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  8 + <version>1.0-SNAPSHOT</version>
  9 + </parent>
  10 + <modelVersion>4.0.0</modelVersion>
  11 +
  12 + <artifactId>nicolas-eval</artifactId>
  13 +
  14 + <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-lib</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  22 + <artifactId>nicolas-common</artifactId>
  23 + </dependency>
  24 +
  25 + <!-- internal -->
  26 + <dependency>
  27 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  28 + <artifactId>eval</artifactId>
  29 + </dependency>
  30 +
  31 + <!-- third party -->
  32 + <dependency>
  33 + <groupId>nz.ac.waikato.cms.weka</groupId>
  34 + <artifactId>weka-stable</artifactId>
  35 + </dependency>
  36 + <dependency>
  37 + <groupId>org.apache.commons</groupId>
  38 + <artifactId>commons-lang3</artifactId>
  39 + </dependency>
  40 + <dependency>
  41 + <groupId>com.google.guava</groupId>
  42 + <artifactId>guava</artifactId>
  43 + </dependency>
  44 +
  45 + <!-- logging -->
  46 + <dependency>
  47 + <groupId>org.slf4j</groupId>
  48 + <artifactId>slf4j-api</artifactId>
  49 + </dependency>
  50 + <dependency>
  51 + <groupId>org.slf4j</groupId>
  52 + <artifactId>slf4j-simple</artifactId>
  53 + </dependency>
  54 +
  55 + </dependencies>
  56 +</project>
0 57 \ No newline at end of file
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import org.apache.commons.io.IOUtils;
  4 +
  5 +import java.io.IOException;
  6 +import java.io.InputStream;
  7 +import java.util.List;
  8 +import java.util.Set;
  9 +import java.util.stream.Collectors;
  10 +
  11 +public class Constants {
  12 +
  13 + private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt";
  14 +
  15 + private Constants() {
  16 + }
  17 +
  18 + public static Set<String> loadTestTextIds() throws IOException {
  19 + try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {
  20 + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING);
  21 + return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
  22 + }
  23 + }
  24 +}
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import pl.waw.ipipan.zil.summ.eval.Main;
  4 +
  5 +public class Evaluate {
  6 +
  7 + private Evaluate() {
  8 + }
  9 +
  10 + public static void main(String[] args) {
  11 + String goldDirPath = "data/summaries-gold";
  12 + String systemDirPath = "data/summaries";
  13 + Main.main(new String[]{goldDirPath, systemDirPath});
  14 + }
  15 +}
0 16 \ No newline at end of file
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/ExtractGoldSummaries.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  4 +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
  5 +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
  6 +import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
  7 +
  8 +import javax.xml.bind.JAXBException;
  9 +import java.io.File;
  10 +import java.io.IOException;
  11 +import java.util.List;
  12 +import java.util.Set;
  13 +import java.util.stream.Collectors;
  14 +
  15 +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds;
  16 +
  17 +public class ExtractGoldSummaries {
  18 +
  19 + private ExtractGoldSummaries() {
  20 + }
  21 +
  22 + public static void main(String[] args) throws IOException, JAXBException {
  23 + File corpusDir = new File("data/corpus/PSC_1.0/data");
  24 + File targetDir = new File("data/summaries-gold");
  25 + targetDir.mkdir();
  26 +
  27 + Set<String> testTextIds = loadTestTextIds();
  28 + File[] files = corpusDir.listFiles();
  29 + if (files != null) {
  30 + for (File file : files) {
  31 + Text text = PSC_IO.readText(file);
  32 + if (!testTextIds.contains(text.getId()))
  33 + continue;
  34 +
  35 + List<Summary> goldSummaries = text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals("abstract") && summary.getRatio().equals(20)).collect(Collectors.toList());
  36 +
  37 + for (Summary summary : goldSummaries) {
  38 + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
  39 + Utils.writeStringToFile(summary.getBody(), targetFile);
  40 + }
  41 + }
  42 + }
  43 + }
  44 +
  45 +}
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
  9 +import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
  10 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  12 +
  13 +import java.io.File;
  14 +import java.io.IOException;
  15 +import java.util.List;
  16 +import java.util.Map;
  17 +import java.util.Set;
  18 +
  19 +import static java.util.stream.Collectors.toList;
  20 +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds;
  21 +
  22 +public class SummarizeTestCorpus {
  23 +
  24 + private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
  25 +
  26 +
  27 + private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
  28 + private static final double SUMMARY_RATIO = 0.2;
  29 +
  30 + private SummarizeTestCorpus() {
  31 + }
  32 +
  33 + public static void main(String[] args) throws IOException, NicolasException {
  34 + File thriftedCorpusDir = new File("data/preprocessed");
  35 + File targetDir = new File("data/summaries");
  36 + targetDir.mkdir();
  37 +
  38 + Set<String> testTextIds = loadTestTextIds();
  39 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains);
  40 + LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size());
  41 +
  42 + Map<String, String> id2summary = summarizeTexts(id2preprocessedText);
  43 + LOG.info("Texts summarized.");
  44 +
  45 + saveSummariesToFolder(id2summary, targetDir);
  46 + LOG.info("Texts saved to {} folder.", targetDir);
  47 + }
  48 +
  49 + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException {
  50 + Map<String, String> id2summary = Maps.newHashMap();
  51 + Nicolas nicolas = new Nicolas();
  52 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  53 + TText text = entry.getValue();
  54 + int targetSize = calculateTargetSize(text);
  55 + String summary = nicolas.summarizeThrift(text, targetSize);
  56 + id2summary.put(entry.getKey(), summary);
  57 + }
  58 + return id2summary;
  59 + }
  60 +
  61 + private static int calculateTargetSize(TText text) {
  62 + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  63 + StringBuilder body = new StringBuilder();
  64 + for (TSentence sentence : sentences)
  65 + body.append(Utils.loadSentence2Orth(sentence)).append(" ");
  66 +
  67 + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
  68 + return (int) (SUMMARY_RATIO * tokenCount);
  69 + }
  70 +
  71 + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException {
  72 + for (Map.Entry<String, String> entry : id2summary.entrySet()) {
  73 + String textId = entry.getKey();
  74 + String summary = entry.getValue();
  75 + String targetFileName = textId + SUMMARY_FILE_SUFFIX;
  76 + Utils.writeStringToFile(summary, new File(targetDir, targetFileName));
  77 + }
  78 + }
  79 +
  80 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java renamed to nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train.search;
  1 +package pl.waw.ipipan.zil.summ.nicolas.eval.search;
2 2  
3 3 import org.apache.commons.lang3.time.StopWatch;
4 4 import org.apache.commons.lang3.tuple.Pair;
... ... @@ -35,13 +35,13 @@ import java.util.Random;
35 35 import java.util.logging.LogManager;
36 36  
37 37  
38   -class CrossvalidateCommon {
  38 +class Crossvalidate {
39 39  
40   - private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class);
  40 + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
41 41  
42 42 private static final int NUM_FOLDS = 10;
43 43  
44   - private CrossvalidateCommon() {
  44 + private Crossvalidate() {
45 45 }
46 46  
47 47 static void crossvalidateClassifiers(String datasetPath) throws IOException {
... ... @@ -77,7 +77,7 @@ class CrossvalidateCommon {
77 77 new DecisionTable(), new JRip(), new PART(),
78 78 createAttributeSelectedClassifier()}).parallel().map(cls -> {
79 79 String name = cls.getClass().getSimpleName();
80   - double acc = 0;
  80 + double acc;
81 81 Evaluation eval;
82 82 try {
83 83 eval = new Evaluation(instances);
... ...
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt renamed to nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt
nicolas-lib/pom.xml
... ... @@ -35,7 +35,7 @@
35 35 <!-- third party -->
36 36 <dependency>
37 37 <groupId>nz.ac.waikato.cms.weka</groupId>
38   - <artifactId>weka-dev</artifactId>
  38 + <artifactId>weka-stable</artifactId>
39 39 </dependency>
40 40 <dependency>
41 41 <groupId>org.apache.commons</groupId>
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java
... ... @@ -18,18 +18,18 @@ import java.util.Set;
18 18  
19 19 import static java.util.stream.Collectors.toList;
20 20  
21   -public class ThriftUtils {
  21 +public class InstanceUtils {
22 22  
23   - private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class);
  23 + private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class);
24 24  
25   - private ThriftUtils() {
  25 + private InstanceUtils() {
26 26 }
27 27  
28 28 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
29 29 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
30 30 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
31 31  
32   - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention.");
  32 + LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size());
33 33 Map<TMention, Instance> mention2instance = Maps.newHashMap();
34 34 for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) {
35 35 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
... ... @@ -39,7 +39,7 @@ public class ThriftUtils {
39 39 }
40 40 mention2instance.put(tMention, instance);
41 41 }
42   - LOG.info("Extracted features of " + mention2instance.size() + " mentions.");
  42 + LOG.info("Extracted features of {} mentions.", mention2instance.size());
43 43 return mention2instance;
44 44 }
45 45  
... ... @@ -47,7 +47,7 @@ public class ThriftUtils {
47 47 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
48 48 Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions);
49 49  
50   - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence.");
  50 + LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size());
51 51 Map<TSentence, Instance> sentence2instance = Maps.newHashMap();
52 52 for (TSentence sentence : sentences) {
53 53 Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
... ... @@ -57,7 +57,7 @@ public class ThriftUtils {
57 57 }
58 58 sentence2instance.put(sentence, instance);
59 59 }
60   - LOG.info("Extracted features of " + sentence2instance.size() + " sentences.");
  60 + LOG.info("Extracted features of {} sentences.", sentence2instance.size());
61 61 return sentence2instance;
62 62 }
63 63 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... ... @@ -29,14 +29,18 @@ public class Nicolas {
29 29 private final SentenceFeatureExtractor sentenceFeatureExtractor;
30 30 private final ZeroFeatureExtractor zeroFeatureExtractor;
31 31  
32   - public Nicolas() throws IOException, ClassNotFoundException {
33   - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
34   - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
35   - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
36   -
37   - mentionFeatureExtractor = new MentionFeatureExtractor();
38   - sentenceFeatureExtractor = new SentenceFeatureExtractor();
39   - zeroFeatureExtractor = new ZeroFeatureExtractor();
  32 + public Nicolas() throws NicolasException {
  33 + try {
  34 + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  35 + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  36 + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
  37 +
  38 + mentionFeatureExtractor = new MentionFeatureExtractor();
  39 + sentenceFeatureExtractor = new SentenceFeatureExtractor();
  40 + zeroFeatureExtractor = new ZeroFeatureExtractor();
  41 + } catch (IOException e) {
  42 + throw new NicolasException(e);
  43 + }
40 44 }
41 45  
42 46 public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException {
... ... @@ -59,17 +63,17 @@ public class Nicolas {
59 63 }
60 64  
61 65 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {
62   - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
  66 + List<TSentence> sentences = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
63 67  
64 68 Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor);
65 69  
66   - List<TSentence> sortedSents = Lists.newArrayList(sents);
67   - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
  70 + List<TSentence> sortedSentences = Lists.newArrayList(sentences);
  71 + sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed());
68 72  
69 73 int size = 0;
70 74 Random r = new Random(1);
71 75 Set<TSentence> summary = Sets.newHashSet();
72   - for (TSentence sent : sortedSents) {
  76 + for (TSentence sent : sortedSentences) {
73 77 size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
74 78 if (r.nextDouble() > 0.4 && size > targetSize)
75 79 break;
... ... @@ -78,7 +82,7 @@ public class Nicolas {
78 82 break;
79 83 }
80 84 List<TSentence> selectedSentences = Lists.newArrayList();
81   - for (TSentence sent : sents) {
  85 + for (TSentence sent : sentences) {
82 86 if (summary.contains(sent))
83 87 selectedSentences.add(sent);
84 88 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +public class NicolasException extends Exception {
  4 + public NicolasException(Exception e) {
  5 + super(e);
  6 + }
  7 +}
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
... ... @@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11   -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
12 12 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  13 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
13 14 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 16 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
... ... @@ -34,15 +35,15 @@ public class ApplyModel {
34 35 private static final String TARGET_DIR = "corpora/summaries";
35 36  
36 37 public static void main(String[] args) throws Exception {
37   - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
  38 + Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
38 39 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
39 40  
40   - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  41 + Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
41 42 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
42 43  
43 44 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
44 45  
45   - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH);
  46 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH));
46 47 int i = 1;
47 48 double avgSize = 0;
48 49 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
... ... @@ -91,7 +92,7 @@ public class ApplyModel {
91 92 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
92 93  
93 94 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
94   - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
  95 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
95 96  
96 97 Map<TSentence, Double> sentence2score = Maps.newHashMap();
97 98 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common.features;
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2  
3 3 import com.google.common.collect.*;
4 4 import org.slf4j.Logger;
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common.features;
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2  
3 3 import com.google.common.collect.Maps;
4 4 import com.google.common.collect.Sets;
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common.features;
  1 +package pl.waw.ipipan.zil.summ.nicolas.features;
2 2  
3 3 import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
4 4  
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... ... @@ -5,9 +5,9 @@ import com.google.common.collect.Maps;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6 6 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7 7 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
8   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation;
  8 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
11 11 import weka.core.Attribute;
12 12  
13 13 import java.io.IOException;
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... ... @@ -5,7 +5,7 @@ import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8   -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  8 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
9 9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 10 import weka.classifiers.Classifier;
11 11 import weka.core.Instance;
... ... @@ -25,7 +25,7 @@ public class MentionModel {
25 25 Set<TMention> goodMentions = Sets.newHashSet();
26 26  
27 27 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
28   - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor);
  28 + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor);
29 29 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
30 30 Instance instance = entry.getValue();
31 31 instance.setDataset(instances);
... ... @@ -34,7 +34,7 @@ public class MentionModel {
34 34 if (good)
35 35 goodMentions.add(entry.getKey());
36 36 }
37   - LOG.info("Classified " + goodMentions.size() + " mentions as good.");
  37 + LOG.info("Classified {} mentions as good.", goodMentions.size());
38 38 return goodMentions;
39 39 }
40 40  
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... ... @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence;
2 2  
3 3 import com.google.common.collect.Maps;
4 4 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  5 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  6 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
7 7 import weka.core.Attribute;
8 8  
9 9 import java.util.List;
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... ... @@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
10 10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
11 11 import weka.classifiers.Classifier;
12 12 import weka.core.Instance;
... ... @@ -24,7 +24,7 @@ public class SentenceModel {
24 24  
25 25 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
26 26 Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
27   - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
  27 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
28 28  
29 29 Map<TSentence, Double> sentence2score = Maps.newHashMap();
30 30 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... ... @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10 10 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
11   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
12   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  11 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
  12 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
13 13 import weka.core.Attribute;
14 14  
15 15 import java.util.List;
... ...
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... ... @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils;
5 5 import org.junit.Test;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  8 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 10  
11 11 import java.io.IOException;
12 12 import java.io.InputStream;
... ... @@ -47,7 +47,7 @@ public class CandidateFinderTest {
47 47  
48 48 private FeatureHelper loadSampleTextHelper() throws IOException {
49 49 try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
50   - return new FeatureHelper(Utils.loadThrifted(stream));
  50 + return new FeatureHelper(ThriftUtils.loadThriftTextFromStream(stream));
51 51 }
52 52 }
53 53 }
54 54 \ No newline at end of file
... ...
nicolas-train/pom.xml
... ... @@ -25,6 +25,11 @@
25 25 <groupId>pl.waw.ipipan.zil.summ</groupId>
26 26 <artifactId>nicolas-multiservice</artifactId>
27 27 </dependency>
  28 + <dependency>
  29 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  30 + <artifactId>nicolas-model</artifactId>
  31 + <scope>runtime</scope>
  32 + </dependency>
28 33  
29 34 <!-- internal -->
30 35 <dependency>
... ... @@ -39,7 +44,7 @@
39 44 <!-- third party -->
40 45 <dependency>
41 46 <groupId>nz.ac.waikato.cms.weka</groupId>
42   - <artifactId>weka-dev</artifactId>
  47 + <artifactId>weka-stable</artifactId>
43 48 </dependency>
44 49 <dependency>
45 50 <groupId>org.apache.commons</groupId>
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
... ... @@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest;
5 5  
6 6 public class ModelConstants {
7 7  
8   - public static final String MENTION_DATASET_PATH = "mentions_train.arff";
9   - public static final String SENTENCE_DATASET_PATH = "sentences_train.arff";
10   - public static final String ZERO_DATASET_PATH = "zeros_train.arff";
  8 + public static final String MENTION_DATASET_PATH = "data/arff/mentions_train.arff";
  9 + public static final String SENTENCE_DATASET_PATH = "data/arff/sentences_train.arff";
  10 + public static final String ZERO_DATASET_PATH = "data/arff/zeros_train.arff";
11 11  
12   - private static final int NUM_ITERATIONS = 16;
  12 + private static final int NUM_ITERATIONS = 250;
13 13 private static final int NUM_EXECUTION_SLOTS = 8;
14 14 private static final int SEED = 0;
15 15  
... ... @@ -26,17 +26,17 @@ public class ModelConstants {
26 26  
27 27 public static Classifier getSentenceClassifier() {
28 28 RandomForest classifier = new RandomForest();
29   - classifier.setNumIterations(16);
30   - classifier.setSeed(0);
31   - classifier.setNumExecutionSlots(8);
  29 + classifier.setNumIterations(NUM_ITERATIONS);
  30 + classifier.setSeed(SEED);
  31 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
32 32 return classifier;
33 33 }
34 34  
35 35 public static Classifier getZeroClassifier() {
36 36 RandomForest classifier = new RandomForest();
37   - classifier.setNumIterations(16);
38   - classifier.setSeed(0);
39   - classifier.setNumExecutionSlots(8);
  37 + classifier.setNumIterations(NUM_ITERATIONS);
  38 + classifier.setSeed(SEED);
  39 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
40 40 return classifier;
41 41 }
42 42  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
... ... @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
3 3 import org.apache.commons.lang3.time.StopWatch;
4 4 import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;
7 6 import weka.classifiers.Classifier;
8 7 import weka.core.Instances;
9 8 import weka.core.converters.ArffLoader;
... ... @@ -16,7 +15,7 @@ import java.util.logging.LogManager;
16 15 @SuppressWarnings("squid:S2118")
17 16 public class TrainModelCommon {
18 17  
19   - private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class);
  18 + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class);
20 19  
21 20 private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";
22 21  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
2   -
3   -import com.google.common.base.Charsets;
4   -import com.google.common.collect.Maps;
5   -import com.google.common.io.Files;
6   -import org.slf4j.Logger;
7   -import org.slf4j.LoggerFactory;
8   -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10   -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
11   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
13   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
14   -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
15   -import weka.core.Instance;
16   -import weka.core.Instances;
17   -import weka.core.converters.ArffSaver;
18   -
19   -import java.io.File;
20   -import java.io.IOException;
21   -import java.util.Map;
22   -
23   -
24   -public class PrepareTrainingData {
25   -
26   - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
27   -
28   - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
29   - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
30   -
31   - private PrepareTrainingData() {
32   - }
33   -
34   - public static void main(String[] args) throws IOException {
35   -
36   - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
37   - Map<String, String> id2optimalSummary = loadOptimalSummaries();
38   -
39   - MentionScorer mentionScorer = new MentionScorer();
40   - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
41   -
42   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
43   -
44   - int i = 1;
45   - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
46   - LOG.info(i++ + "/" + id2preprocessedText.size());
47   -
48   - String id = entry.getKey();
49   - TText preprocessedText = entry.getValue();
50   - String optimalSummary = id2optimalSummary.get(id);
51   - if (optimalSummary == null)
52   - continue;
53   - Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
54   -
55   - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
56   - for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {
57   - TMention mention = entry2.getKey();
58   - Instance instance = entry2.getValue();
59   - instance.setDataset(instances);
60   - instance.setClassValue(mention2score.get(mention));
61   - instances.add(instance);
62   - }
63   - }
64   - saveInstancesToFile(instances);
65   - }
66   -
67   - private static void saveInstancesToFile(Instances instances) throws IOException {
68   - ArffSaver saver = new ArffSaver();
69   - saver.setInstances(instances);
70   - saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH));
71   - saver.writeBatch();
72   - }
73   -
74   - private static Map<String, String> loadOptimalSummaries() throws IOException {
75   - Map<String, String> id2optimalSummary = Maps.newHashMap();
76   - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {
77   - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
78   - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);
79   - }
80   - LOG.info(id2optimalSummary.size() + " optimal summaries found.");
81   - return id2optimalSummary;
82   - }
83   -
84   -
85   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
2   -
3   -import com.google.common.collect.Maps;
4   -import com.google.common.collect.Sets;
5   -import org.apache.commons.io.IOUtils;
6   -import org.slf4j.Logger;
7   -import org.slf4j.LoggerFactory;
8   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
11   -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
12   -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
13   -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
14   -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
15   -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
16   -import weka.core.Instance;
17   -import weka.core.Instances;
18   -import weka.core.converters.ArffSaver;
19   -
20   -import java.io.File;
21   -import java.io.FileReader;
22   -import java.io.IOException;
23   -import java.util.List;
24   -import java.util.Map;
25   -import java.util.Set;
26   -
27   -public class PrepareTrainingData {
28   -
29   - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
30   -
31   - private static final String IDS_PATH = "corpora/summaries_dev";
32   - private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/";
33   - private static final String GOLD_ZEROS_PATH = "/zeros.tsv";
34   -
35   - private PrepareTrainingData() {
36   - }
37   -
38   - public static void main(String[] args) throws IOException {
39   -
40   - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH);
41   - Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH);
42   -
43   - ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH);
44   - ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
45   -
46   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
47   -
48   - int i = 1;
49   - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
50   - LOG.info(i++ + "/" + id2preprocessedText.size());
51   -
52   - String textId = entry.getKey();
53   -
54   - TText text = entry.getValue();
55   - Set<String> sentenceIds = id2sentIds.get(textId);
56   - FeatureHelper featureHelper = new FeatureHelper(text);
57   -
58   - List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
59   - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
60   -
61   - for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
62   - boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
63   - Instance instance = entry2.getValue();
64   - instance.setDataset(instances);
65   - instance.setClassValue(good ? 1 : 0);
66   - instances.add(instance);
67   - }
68   - }
69   -
70   - saveInstancesToFile(instances);
71   - }
72   -
73   -
74   - private static void saveInstancesToFile(Instances instances) throws IOException {
75   - ArffSaver saver = new ArffSaver();
76   - saver.setInstances(instances);
77   - saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH));
78   - saver.writeBatch();
79   - }
80   -
81   - private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
82   - Map<String, Set<String>> result = Maps.newHashMap();
83   - for (File f : new File(idsPath).listFiles()) {
84   - String id = f.getName().split("_")[0];
85   - List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
86   - result.put(id, Sets.newHashSet(sentenceIds));
87   - }
88   - return result;
89   - }
90   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
... ... @@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser;
6 6 import org.apache.commons.csv.CSVRecord;
7 7 import org.apache.commons.csv.QuoteMode;
8 8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  9 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
11 11  
12 12 import java.io.IOException;
... ... @@ -21,8 +21,8 @@ public class ZeroScorer {
21 21  
22 22 private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap();
23 23  
24   - public ZeroScorer(String goldZerosPath) throws IOException {
25   - try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath);
  24 + public ZeroScorer(String goldZerosResourcePath) throws IOException {
  25 + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosResourcePath);
26 26 InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING);
27 27 CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) {
28 28 List<CSVRecord> records = parser.getRecords();
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadAndPreprocessCorpus.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3 3 import net.lingala.zip4j.core.ZipFile;
4 4 import org.apache.commons.io.FileUtils;
5 5 import org.slf4j.Logger;
6 6 import org.slf4j.LoggerFactory;
7   -import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main;
8 7  
9 8 import java.io.File;
10 9 import java.net.URL;
... ... @@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus {
45 44  
46 45 File preprocessed = new File(WORKING_DIR, "preprocessed");
47 46 createFolder(preprocessed.getPath());
48   - Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()});
  47 + Preprocess.main(new String[]{dataDir.getPath(), preprocessed.getPath()});
49 48 }
50 49  
51 50 private static File createFolder(String path) {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3 3 import com.google.common.base.Charsets;
4 4 import com.google.common.collect.Maps;
  5 +import com.google.common.collect.Sets;
5 6 import com.google.common.io.Files;
  7 +import org.apache.commons.io.IOUtils;
6 8 import org.slf4j.Logger;
7 9 import org.slf4j.LoggerFactory;
8 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11   -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
12 14 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  15 +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
13 16 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  17 +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
14 18 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 19 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
16 20 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
17 21 import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  22 +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.MentionScorer;
  23 +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.SentenceScorer;
  24 +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.ZeroScorer;
  25 +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
  26 +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
  27 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  28 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
18 29 import weka.classifiers.Classifier;
19 30 import weka.core.Instance;
20 31 import weka.core.Instances;
21 32 import weka.core.converters.ArffSaver;
22 33  
23 34 import java.io.File;
  35 +import java.io.FileReader;
24 36 import java.io.IOException;
  37 +import java.io.InputStream;
  38 +import java.util.List;
25 39 import java.util.Map;
26 40 import java.util.Set;
27   -
  41 +import java.util.function.Predicate;
  42 +import java.util.stream.Collectors;
28 43  
29 44 public class PrepareTrainingData {
30 45  
31 46 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
32 47  
33   - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
34   - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  48 + private static final String THRIFT_TEXTS_PATH = "data/preprocessed";
  49 + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "data/summaries-optimal";
  50 + private static final String SUMMARY_SENTENCE_IDS = "data/summaries-sentence-ids";
  51 +
  52 + private static final String ZERO_TRAINING_DATA_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv";
  53 + private static final String TRAIN_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt";
35 54  
36 55 private PrepareTrainingData() {
37 56 }
38 57  
39 58 public static void main(String[] args) throws Exception {
  59 + Set<String> trainTextIds = loadTrainTextIds();
  60 +
  61 + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(THRIFT_TEXTS_PATH), trainTextIds::contains);
  62 + Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains);
  63 +
  64 + prepareMentionsDataset(id2preprocessedText, id2optimalSummary);
  65 + prepareSentencesDataset(id2preprocessedText, id2optimalSummary);
  66 + prepareZerosDataset(id2preprocessedText);
  67 + }
  68 +
  69 + public static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException {
  70 + MentionScorer mentionScorer = new MentionScorer();
  71 + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
  72 +
  73 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  74 +
  75 + int i = 1;
  76 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  77 + LOG.info("{}/{}", i++, id2preprocessedText.size());
  78 +
  79 + String id = entry.getKey();
  80 + TText preprocessedText = entry.getValue();
  81 + String optimalSummary = id2optimalSummary.get(id);
  82 + if (optimalSummary == null)
  83 + continue;
  84 + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
  85 +
  86 + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
  87 + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {
  88 + TMention mention = entry2.getKey();
  89 + Instance instance = entry2.getValue();
  90 + instance.setDataset(instances);
  91 + instance.setClassValue(mention2score.get(mention));
  92 + instances.add(instance);
  93 + }
  94 + }
  95 + saveInstancesToFile(instances, new File(ModelConstants.MENTION_DATASET_PATH));
  96 + }
  97 +
  98 + private static Set<String> loadTrainTextIds() throws IOException {
  99 + try (InputStream inputStream = PrepareTrainingData.class.getResourceAsStream(TRAIN_TEXT_IDS_RESOURCE_PATH)) {
  100 + List<String> testTextIds = IOUtils.readLines(inputStream, Constants.ENCODING);
  101 + return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
  102 + }
  103 + }
40 104  
41   - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
42   - Map<String, String> id2optimalSummary = loadOptimalSummaries();
  105 + public static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception {
43 106  
44 107 SentenceScorer sentenceScorer = new SentenceScorer();
45 108 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
46 109  
47 110 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
48 111  
49   - Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
  112 + Classifier classifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
50 113 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();
51 114  
52 115 int i = 1;
53 116 for (String textId : id2preprocessedText.keySet()) {
54   - LOG.info(i++ + "/" + id2preprocessedText.size());
  117 + LOG.info("{}/{}", i++, id2preprocessedText.size());
55 118  
56 119 TText preprocessedText = id2preprocessedText.get(textId);
57 120 String optimalSummary = id2optimalSummary.get(textId);
... ... @@ -64,7 +127,7 @@ public class PrepareTrainingData {
64 127 // Set<TMention> goodMentions
65 128 // = Utils.loadGoldGoodMentions(textId, preprocessedText, true);
66 129  
67   - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
  130 + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions);
68 131 for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
69 132 TSentence sentence = entry.getKey();
70 133 Instance instance = entry.getValue();
... ... @@ -73,25 +136,74 @@ public class PrepareTrainingData {
73 136 instances.add(instance);
74 137 }
75 138 }
76   - saveInstancesToFile(instances);
  139 + saveInstancesToFile(instances, new File(ModelConstants.SENTENCE_DATASET_PATH));
  140 + }
  141 +
  142 + public static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException {
  143 +
  144 + Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS);
  145 +
  146 + ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_DATA_RESOURCE_PATH);
  147 + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
  148 +
  149 + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  150 +
  151 + int i = 1;
  152 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
  153 + LOG.info(i++ + "/" + id2preprocessedText.size());
  154 +
  155 + String textId = entry.getKey();
  156 +
  157 + TText text = entry.getValue();
  158 + Set<String> sentenceIds = id2sentIds.get(textId);
  159 + FeatureHelper featureHelper = new FeatureHelper(text);
  160 +
  161 + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
  162 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  163 +
  164 + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
  165 + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
  166 + Instance instance = entry2.getValue();
  167 + instance.setDataset(instances);
  168 + instance.setClassValue(good ? 1 : 0);
  169 + instances.add(instance);
  170 + }
  171 + }
  172 +
  173 + saveInstancesToFile(instances, new File(ModelConstants.ZERO_DATASET_PATH));
  174 + }
  175 +
  176 + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException {
  177 + Map<String, Set<String>> result = Maps.newHashMap();
  178 + File[] files = new File(idsPath).listFiles();
  179 + if (files != null)
  180 + for (File f : files) {
  181 + String id = f.getName().split("_")[0];
  182 + List<String> sentenceIds = IOUtils.readLines(new FileReader(f));
  183 + result.put(id, Sets.newHashSet(sentenceIds));
  184 + }
  185 + return result;
77 186 }
78 187  
79   - private static void saveInstancesToFile(Instances instances) throws IOException {
  188 + private static void saveInstancesToFile(Instances instances, File targetFile) throws IOException {
80 189 ArffSaver saver = new ArffSaver();
81 190 saver.setInstances(instances);
82   - saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH));
  191 + saver.setFile(targetFile);
83 192 saver.writeBatch();
84 193 }
85 194  
86   - private static Map<String, String> loadOptimalSummaries() throws IOException {
  195 + private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException {
87 196 Map<String, String> id2optimalSummary = Maps.newHashMap();
88   - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) {
89   - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
90   - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary);
91   - }
92   - LOG.info(id2optimalSummary.size() + " optimal summaries found.");
  197 + File[] files = new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles();
  198 + if (files != null)
  199 + for (File optimalSummaryFile : files) {
  200 + String textId = optimalSummaryFile.getName().split("_")[0];
  201 + if (!idFilter.test(textId))
  202 + continue;
  203 + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8);
  204 + id2optimalSummary.put(textId, optimalSummary);
  205 + }
  206 + LOG.info("{} optimal summaries found.", id2optimalSummary.size());
93 207 return id2optimalSummary;
94 208 }
95   -
96   -
97 209 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/Preprocess.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train.preprocess;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3 3 import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
... ... @@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
9 9 import java.io.File;
10 10 import java.util.Arrays;
11 11  
12   -public class Main {
  12 +public class Preprocess {
13 13  
14   - private static final Logger LOG = LoggerFactory.getLogger(Main.class);
  14 + private static final Logger LOG = LoggerFactory.getLogger(Preprocess.class);
15 15  
16 16 private static final String CORPUS_FILE_SUFFIX = ".xml";
17 17 private static final String OUTPUT_FILE_SUFFIX = ".thrift";
18 18  
19   - private Main() {
  19 + private Preprocess() {
20 20 }
21 21  
22 22 public static void main(String[] args) {
23 23 if (args.length != 2) {
24   - LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir");
  24 + LOG.error("Wrong usage! Try " + Preprocess.class.getSimpleName() + " dirWithCorpusFiles targetDir");
25 25 return;
26 26 }
27 27 File corpusDir = new File(args[0]);
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3 3 import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel;
4 4 import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel;
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train.search;
2   -
3   -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
4   -
5   -
6   -public class CrossvalidateMention {
7   -
8   - private CrossvalidateMention() {
9   - }
10   -
11   - public static void main(String[] args) throws Exception {
12   - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH);
13   - }
14   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train.search;
2   -
3   -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
4   -
5   -
6   -public class CrossvalidateSentence {
7   -
8   - private CrossvalidateSentence() {
9   - }
10   -
11   - public static void main(String[] args) throws Exception {
12   - CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH);
13   - }
14   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train.search;
2   -
3   -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
4   -
5   -
6   -public class CrossvalidateZero {
7   -
8   - private CrossvalidateZero() {
9   - }
10   -
11   - public static void main(String[] args) throws Exception {
12   - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH);
13   - }
14   -}
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv
... ... @@ -18,6 +18,7 @@
18 18 <module>nicolas-train</module>
19 19 <module>nicolas-common</module>
20 20 <module>nicolas-multiservice</module>
  21 + <module>nicolas-eval</module>
21 22 </modules>
22 23  
23 24 <properties>
... ... @@ -27,10 +28,11 @@
27 28  
28 29 <pscapi.version>1.0</pscapi.version>
29 30 <utils.version>1.0</utils.version>
  31 + <eval.version>1.0</eval.version>
30 32  
31 33 <commons-csv.version>1.4</commons-csv.version>
32 34 <guava.version>21.0</guava.version>
33   - <weka-dev.version>3.9.1</weka-dev.version>
  35 + <weka-stable.version>3.8.1</weka-stable.version>
34 36 <commons-lang3.version>3.5</commons-lang3.version>
35 37 <commons-io.version>2.5</commons-io.version>
36 38 <slf4j-api.version>1.7.22</slf4j-api.version>
... ... @@ -98,6 +100,11 @@
98 100 <artifactId>utils</artifactId>
99 101 <version>${utils.version}</version>
100 102 </dependency>
  103 + <dependency>
  104 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  105 + <artifactId>eval</artifactId>
  106 + <version>${eval.version}</version>
  107 + </dependency>
101 108  
102 109 <!-- third party -->
103 110 <dependency>
... ... @@ -112,8 +119,8 @@
112 119 </dependency>
113 120 <dependency>
114 121 <groupId>nz.ac.waikato.cms.weka</groupId>
115   - <artifactId>weka-dev</artifactId>
116   - <version>${weka-dev.version}</version>
  122 + <artifactId>weka-stable</artifactId>
  123 + <version>${weka-stable.version}</version>
117 124 <exclusions>
118 125 <exclusion>
119 126 <groupId>org.slf4j</groupId>
... ...