Commit 76eeceb70c81d7fdfe3860db5a8576f0e4234daf

Authored by Mateusz Kopeć
1 parent f04fcb1a

large refactor

Showing 60 changed files with 1238 additions and 478 deletions
.gitignore
... ... @@ -15,4 +15,4 @@ target/
15 15 hs_err_pid*
16 16  
17 17 .idea
18   -*.iml
19 18 \ No newline at end of file
  19 +*.iml
... ...
nicolas-common/pom.xml
... ... @@ -27,6 +27,10 @@
27 27 <groupId>nz.ac.waikato.cms.weka</groupId>
28 28 <artifactId>weka-dev</artifactId>
29 29 </dependency>
  30 + <dependency>
  31 + <groupId>commons-io</groupId>
  32 + <artifactId>commons-io</artifactId>
  33 + </dependency>
30 34  
31 35 <!-- logging -->
32 36 <dependency>
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
... ... @@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common;
2 2  
3 3 import com.google.common.base.Charsets;
4 4 import com.google.common.collect.ImmutableList;
5   -import weka.classifiers.Classifier;
6   -import weka.classifiers.functions.SMO;
7   -import weka.classifiers.meta.AdaBoostM1;
8   -import weka.classifiers.meta.AttributeSelectedClassifier;
9   -import weka.classifiers.rules.JRip;
10   -import weka.classifiers.trees.J48;
11   -import weka.classifiers.trees.RandomForest;
12 5  
13 6 import java.nio.charset.Charset;
14 7  
15 8  
16 9 public class Constants {
17 10  
18   - public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";
19   - public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";
20   - public static final String ZERO_MODEL_PATH = "zeros_model.bin";
  11 + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/";
21 12  
22   - public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";
23   - public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";
24   - public static final String ZERO_DATASET_PATH = "zeros_train.arff";
  13 + private static final String MODELS_PATH = ROOT_PATH + "models/";
  14 + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin";
  15 + public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin";
  16 + public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin";
  17 +
  18 + private static final String RESOURCES_PATH = ROOT_PATH + "resources/";
  19 + public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt";
25 20  
26 21 public static final Charset ENCODING = Charsets.UTF_8;
27 22  
... ... @@ -30,24 +25,4 @@ public class Constants {
30 25 private Constants() {
31 26 }
32 27  
33   - public static Classifier getMentionClassifier() {
34   - RandomForest classifier = new RandomForest();
35   - classifier.setNumIterations(250);
36   - classifier.setSeed(0);
37   - classifier.setNumExecutionSlots(8);
38   - return classifier;
39   - }
40   -
41   - public static Classifier getSentencesClassifier() {
42   - RandomForest classifier = new RandomForest();
43   - classifier.setNumIterations(10);
44   - classifier.setSeed(0);
45   - classifier.setNumExecutionSlots(8);
46   - return classifier;
47   - }
48   -
49   - public static Classifier getZerosClassifier() {
50   - Classifier classifier = new J48();
51   - return classifier;
52   - }
53 28 }
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
... ... @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common;
3 3 import com.google.common.collect.Lists;
4 4 import com.google.common.collect.Maps;
5 5 import com.google.common.collect.Sets;
  6 +import org.apache.commons.io.IOUtils;
6 7 import org.slf4j.Logger;
7 8 import org.slf4j.LoggerFactory;
8 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
... ... @@ -24,6 +25,47 @@ public class Utils {
24 25  
25 26 private static final String DATASET_NAME = "Dataset";
26 27  
  28 + private Utils() {
  29 + }
  30 +
  31 + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
  32 + LOG.info("Loading classifier from path: {}...", modelResourcePath);
  33 + try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {
  34 + if (stream == null) {
  35 + throw new IOException("Model not found at: " + modelResourcePath);
  36 + }
  37 + try (ObjectInputStream ois = new ObjectInputStream(stream)) {
  38 + Classifier classifier = (Classifier) ois.readObject();
  39 + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());
  40 + return classifier;
  41 + } catch (ClassNotFoundException e) {
  42 + LOG.error("Error loading serialized classifier, class not found.", e);
  43 + throw new IOException(e);
  44 + }
  45 + }
  46 + }
  47 +
  48 + public static TText loadThriftTextFromResource(String textResourcePath) throws IOException {
  49 + try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) {
  50 + if (stream == null) {
  51 + throw new IOException("Resource not found at: " + textResourcePath);
  52 + }
  53 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
  54 + return (TText) ois.readObject();
  55 + } catch (ClassNotFoundException e) {
  56 + LOG.error("Error reading serialized thrift text file, class not found.", e);
  57 + throw new IOException(e);
  58 + }
  59 + }
  60 + }
  61 +
  62 + public static List<String> loadLinesFromResource(String resourcePath) throws IOException {
  63 + try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) {
  64 + return IOUtils.readLines(stream, Constants.ENCODING);
  65 + }
  66 + }
  67 +
  68 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
27 69 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
28 70 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
29 71 instances.setClassIndex(0);
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
... ... @@ -8,10 +8,12 @@ import java.io.ObjectStreamClass;
8 8  
9 9 public class VersionIgnoringObjectInputStream extends ObjectInputStream {
10 10  
11   - public VersionIgnoringObjectInputStream(InputStream in) throws IOException {
  11 + VersionIgnoringObjectInputStream(InputStream in) throws IOException {
12 12 super(in);
13 13 }
14 14  
  15 + @Override
  16 + @SuppressWarnings("squid:S1166")
15 17 protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException {
16 18 ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor
17 19 Class localClass; // the class in the local JVM that this descriptor represents.
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
... ... @@ -17,6 +17,7 @@ public class FeatureExtractor {
17 17  
18 18 private final Set<String> normalizedAttributes = Sets.newHashSet();
19 19  
  20 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
20 21 public ArrayList<Attribute> getAttributesList() {
21 22 return Lists.newArrayList(sortedAttributes);
22 23 }
... ... @@ -46,15 +47,14 @@ public class FeatureExtractor {
46 47 protected void fillSortedAttributes(String scoreAttName) {
47 48 sortedAttributes.addAll(name2attribute.values());
48 49 sortedAttributes.remove(getAttributeByName(scoreAttName));
49   - Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2)));
  50 + sortedAttributes.sort(Comparator.comparing(name2attribute.inverse()::get));
50 51 sortedAttributes.add(0, getAttributeByName(scoreAttName));
51 52 }
52 53  
53 54 protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) {
54 55 Map<Attribute, Double> attribute2max = Maps.newHashMap();
55 56 Map<Attribute, Double> attribute2min = Maps.newHashMap();
56   - for (T entity : entity2attributes.keySet()) {
57   - Map<Attribute, Double> entityAttributes = entity2attributes.get(entity);
  57 + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) {
58 58 for (String attributeName : normalizedAttributes) {
59 59 Attribute attribute = getAttributeByName(attributeName);
60 60 Double value = entityAttributes.get(attribute);
... ... @@ -66,8 +66,7 @@ public class FeatureExtractor {
66 66 attribute2min.compute(attribute, (k, v) -> Math.min(v, value));
67 67 }
68 68 }
69   - for (T mention : entity2attributes.keySet()) {
70   - Map<Attribute, Double> entityAttributes = entity2attributes.get(mention);
  69 + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) {
71 70 for (Attribute attribute : attribute2max.keySet()) {
72 71 Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
73 72 entityAttributes.put(normalizedAttribute,
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
... ... @@ -174,11 +174,11 @@ public class FeatureHelper {
174 174 }
175 175  
176 176 public boolean isNested(TMention mention) {
177   - return mentions.stream().anyMatch(m -> m.getChildIds().containsAll(mention.getChildIds()));
  177 + return mentions.stream().anyMatch(m -> !m.equals(mention) && m.getChildIds().containsAll(mention.getChildIds()));
178 178 }
179 179  
180 180 public boolean isNesting(TMention mention) {
181   - return mentions.stream().anyMatch(m -> mention.getChildIds().containsAll(m.getChildIds()));
  181 + return mentions.stream().anyMatch(m -> !m.equals(mention) && mention.getChildIds().containsAll(m.getChildIds()));
182 182 }
183 183  
184 184 public Set<TCoreference> getClusters() {
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
... ... @@ -33,6 +33,7 @@ public class Interpretation {
33 33 person = split[3];
34 34 break;
35 35 case "siebie":
  36 + case "prep":
36 37 casee = split[0];
37 38 break;
38 39 case "fin":
... ... @@ -47,9 +48,6 @@ public class Interpretation {
47 48 number = split[0];
48 49 gender = split[1];
49 50 break;
50   - case "prep":
51   - casee = split[0];
52   - break;
53 51 default:
54 52 break;
55 53 }
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.mention;
2   -
3   -import org.apache.commons.lang3.time.StopWatch;
4   -import org.slf4j.Logger;
5   -import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7   -import weka.classifiers.Classifier;
8   -import weka.core.Instances;
9   -import weka.core.converters.ArffLoader;
10   -
11   -import java.io.File;
12   -import java.io.FileOutputStream;
13   -import java.io.ObjectOutputStream;
14   -
15   -
16   -public class TrainModel {
17   - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
18   -
19   - public static void main(String[] args) throws Exception {
20   -
21   - ArffLoader loader = new ArffLoader();
22   - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
23   - Instances instances = loader.getDataSet();
24   - instances.setClassIndex(0);
25   - LOG.info(instances.size() + " instances loaded.");
26   - LOG.info(instances.numAttributes() + " attributes for each instance.");
27   -
28   - StopWatch watch = new StopWatch();
29   - watch.start();
30   -
31   - Classifier classifier = Constants.getMentionClassifier();
32   -
33   - LOG.info("Building classifier...");
34   - classifier.buildClassifier(instances);
35   - LOG.info("...done.");
36   -
37   - try (ObjectOutputStream oos = new ObjectOutputStream(
38   - new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) {
39   - oos.writeObject(classifier);
40   - }
41   -
42   - watch.stop();
43   - LOG.info("Elapsed time: " + watch);
44   -
45   - LOG.info(classifier.toString());
46   - }
47   -}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.mention.test;
2   -
3   -import org.slf4j.Logger;
4   -import org.slf4j.LoggerFactory;
5   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
6   -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;
7   -import weka.core.Instances;
8   -import weka.core.converters.ArffLoader;
9   -
10   -import java.io.File;
11   -
12   -
13   -public class Crossvalidate {
14   -
15   - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
16   -
17   - private Crossvalidate() {
18   - }
19   -
20   - public static void main(String[] args) throws Exception {
21   - ArffLoader loader = new ArffLoader();
22   - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
23   - Instances instances = loader.getDataSet();
24   - instances.setClassIndex(0);
25   - LOG.info(instances.size() + " instances loaded.");
26   - LOG.info(instances.numAttributes() + " attributes for each instance.");
27   -
28   - EvalUtils.crossvalidateClassification(instances);
29   - }
30   -}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.mention.test;
2   -
3   -import org.apache.commons.lang3.time.StopWatch;
4   -import org.slf4j.Logger;
5   -import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7   -import weka.classifiers.Classifier;
8   -import weka.classifiers.evaluation.Evaluation;
9   -import weka.core.Instances;
10   -import weka.core.converters.ArffLoader;
11   -
12   -import java.io.File;
13   -import java.io.FileInputStream;
14   -import java.io.IOException;
15   -import java.io.ObjectInputStream;
16   -
17   -
18   -public class Validate {
19   - private static final Logger LOG = LoggerFactory.getLogger(Validate.class);
20   -
21   - public static void main(String[] args) throws Exception {
22   -
23   - ArffLoader loader = new ArffLoader();
24   - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));
25   - Instances instances = loader.getDataSet();
26   - instances.setClassIndex(0);
27   - LOG.info(instances.size() + " instances loaded.");
28   - LOG.info(instances.numAttributes() + " attributes for each instance.");
29   -
30   - Classifier classifier = loadClassifier();
31   -
32   - StopWatch watch = new StopWatch();
33   - watch.start();
34   -
35   - Evaluation eval = new Evaluation(instances);
36   - eval.evaluateModel(classifier, instances);
37   -
38   - LOG.info(eval.toSummaryString());
39   -
40   - watch.stop();
41   - LOG.info("Elapsed time: " + watch);
42   - }
43   -
44   - private static Classifier loadClassifier() throws IOException, ClassNotFoundException {
45   - LOG.info("Loading classifier...");
46   - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) {
47   - Classifier classifier = (Classifier) ois.readObject();
48   - LOG.info("Done. " + classifier.toString());
49   - return classifier;
50   - }
51   - }
52   -}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.sentence;
2   -
3   -import org.apache.commons.lang3.time.StopWatch;
4   -import org.slf4j.Logger;
5   -import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7   -import weka.classifiers.Classifier;
8   -import weka.core.Instances;
9   -import weka.core.converters.ArffLoader;
10   -
11   -import java.io.File;
12   -import java.io.FileOutputStream;
13   -import java.io.ObjectOutputStream;
14   -
15   -
16   -public class TrainModel {
17   - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
18   -
19   - public static void main(String[] args) throws Exception {
20   -
21   - ArffLoader loader = new ArffLoader();
22   - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));
23   - Instances instances = loader.getDataSet();
24   - instances.setClassIndex(0);
25   - LOG.info(instances.size() + " instances loaded.");
26   - LOG.info(instances.numAttributes() + " attributes for each instance.");
27   -
28   - StopWatch watch = new StopWatch();
29   - watch.start();
30   -
31   - Classifier classifier = Constants.getSentencesClassifier();
32   -
33   - LOG.info("Building classifier...");
34   - classifier.buildClassifier(instances);
35   - LOG.info("...done.");
36   -
37   - try (ObjectOutputStream oos = new ObjectOutputStream(
38   - new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) {
39   - oos.writeObject(classifier);
40   - }
41   -
42   - watch.stop();
43   - LOG.info("Elapsed time: " + watch);
44   -
45   - LOG.info(classifier.toString());
46   - }
47   -}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.sentence.test;
2   -
3   -import org.slf4j.Logger;
4   -import org.slf4j.LoggerFactory;
5   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
6   -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;
7   -import weka.core.Instances;
8   -import weka.core.converters.ArffLoader;
9   -
10   -import java.io.File;
11   -
12   -
13   -public class Crossvalidate {
14   -
15   - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
16   -
17   - private Crossvalidate() {
18   - }
19   -
20   - public static void main(String[] args) throws Exception {
21   -
22   - ArffLoader loader = new ArffLoader();
23   - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));
24   - Instances instances = loader.getDataSet();
25   - instances.setClassIndex(0);
26   - LOG.info(instances.size() + " instances loaded.");
27   - LOG.info(instances.numAttributes() + " attributes for each instance.");
28   -
29   - EvalUtils.crossvalidateRegression(instances);
30   - }
31   -}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/test/Crossvalidate.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.zero.test;
2   -
3   -import org.slf4j.Logger;
4   -import org.slf4j.LoggerFactory;
5   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
6   -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;
7   -import weka.core.Instances;
8   -import weka.core.converters.ArffLoader;
9   -
10   -import java.io.File;
11   -
12   -
13   -public class Crossvalidate {
14   -
15   - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);
16   -
17   - private Crossvalidate() {
18   - }
19   -
20   - public static void main(String[] args) throws Exception {
21   -
22   - ArffLoader loader = new ArffLoader();
23   - loader.setFile(new File(Constants.ZERO_DATASET_PATH));
24   - Instances instances = loader.getDataSet();
25   - instances.setClassIndex(0);
26   - LOG.info(instances.size() + " instances loaded.");
27   - LOG.info(instances.numAttributes() + " attributes for each instance.");
28   -
29   - EvalUtils.crossvalidateClassification(instances);
30   - }
31   -}
nicolas-core/pom.xml renamed to nicolas-lib/pom.xml
... ... @@ -9,7 +9,7 @@
9 9 <version>1.0-SNAPSHOT</version>
10 10 </parent>
11 11  
12   - <artifactId>nicolas</artifactId>
  12 + <artifactId>nicolas-lib</artifactId>
13 13  
14 14 <dependencies>
15 15 <!-- project -->
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... ... @@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
11 11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
12 12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
13 13 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
  14 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
14 15 import weka.classifiers.Classifier;
15 16  
16 17 import java.io.IOException;
... ... @@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList;
20 21  
21 22 public class Nicolas {
22 23  
23   - private final Classifier sentenceClassifier;
24   - private final Classifier mentionClassifier;
25   - private final MentionFeatureExtractor featureExtractor;
  24 + private final Classifier mentionModel;
  25 + private final Classifier sentenceModel;
  26 + private final Classifier zeroModel;
  27 +
  28 + private final MentionFeatureExtractor mentionFeatureExtractor;
26 29 private final SentenceFeatureExtractor sentenceFeatureExtractor;
  30 + private final ZeroFeatureExtractor zeroFeatureExtractor;
27 31  
28 32 public Nicolas() throws IOException, ClassNotFoundException {
29   - mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
30   - featureExtractor = new MentionFeatureExtractor();
  33 + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  34 + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  35 + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
31 36  
32   - sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
  37 + mentionFeatureExtractor = new MentionFeatureExtractor();
33 38 sentenceFeatureExtractor = new SentenceFeatureExtractor();
  39 + zeroFeatureExtractor = new ZeroFeatureExtractor();
34 40 }
35 41  
36 42 public String summarizeThrift(TText text, int targetTokenCount) throws Exception {
37 43 Set<TMention> goodMentions
38   - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
  44 + = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text);
39 45 return calculateSummary(text, goodMentions, targetTokenCount);
40 46 }
41 47  
... ... @@ -52,10 +58,10 @@ public class Nicolas {
52 58 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {
53 59 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
54 60  
55   - Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceClassifier, sentenceFeatureExtractor);
  61 + Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor);
56 62  
57 63 List<TSentence> sortedSents = Lists.newArrayList(sents);
58   - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed());
  64 + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
59 65  
60 66 int size = 0;
61 67 Random r = new Random(1);
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
1 1 package pl.waw.ipipan.zil.summ.nicolas;
2 2  
3   -import com.google.common.base.Charsets;
4 3 import com.google.common.collect.Maps;
5   -import com.google.common.io.Files;
6 4 import org.slf4j.Logger;
7 5 import org.slf4j.LoggerFactory;
8 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
12   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;
13 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
14 11 import weka.core.Attribute;
15 12 import weka.core.DenseInstance;
16 13 import weka.core.Instance;
17 14  
18   -import java.io.File;
19   -import java.io.IOException;
20 15 import java.util.List;
21 16 import java.util.Map;
22 17 import java.util.Set;
... ... @@ -30,16 +25,6 @@ public class ThriftUtils {
30 25 private ThriftUtils() {
31 26 }
32 27  
33   - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {
34   - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);
35   -
36   - MentionScorer scorer = new MentionScorer();
37   - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);
38   -
39   - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);
40   - return mention2score.keySet();
41   - }
42   -
43 28 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
44 29 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
45 30 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
... ... @@ -26,18 +26,18 @@ import java.util.*;
26 26  
27 27 import static java.util.stream.Collectors.toList;
28 28  
29   -public class ApplyModel2 {
  29 +public class ApplyModel {
30 30  
31   - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class);
  31 + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class);
32 32  
33 33 private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test";
34 34 private static final String TARGET_DIR = "corpora/summaries";
35 35  
36 36 public static void main(String[] args) throws Exception {
37   - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
  37 + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
38 38 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
39 39  
40   - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH);
  40 + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH);
41 41 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
42 42  
43 43 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
... ... @@ -102,7 +102,7 @@ public class ApplyModel2 {
102 102 }
103 103  
104 104 List<TSentence> sortedSents = Lists.newArrayList(sents);
105   - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed());
  105 + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
106 106  
107 107 int size = 0;
108 108 Random r = new Random(1);
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.mention;
2 2  
3   -import com.google.common.collect.*;
  3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
4 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5 6 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
6 8 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
7 9 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
8 10 import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation;
9 11 import weka.core.Attribute;
10 12  
11   -import java.io.File;
12 13 import java.io.IOException;
13   -import java.nio.file.Files;
14   -import java.util.*;
  14 +import java.util.List;
  15 +import java.util.Map;
15 16 import java.util.stream.Collectors;
16   -import java.util.stream.Stream;
17 17  
18 18  
19 19 public class MentionFeatureExtractor extends FeatureExtractor {
20 20  
21   - private final List<String> frequentBases = Lists.newArrayList();
  21 + private final List<String> frequentBases;
22 22  
23   - public MentionFeatureExtractor() {
  23 + public MentionFeatureExtractor() throws IOException {
  24 + frequentBases = loadFrequentBases();
24 25  
25 26 //coref
26 27 addNumericAttributeNormalized("chain_length");
... ... @@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
70 71 addBinaryAttribute(prefix + "_sent_ends_with_questionmark");
71 72  
72 73 // frequent bases
73   - loadFrequentBases();
74 74 for (String base : frequentBases) {
75 75 addBinaryAttribute(prefix + "_" + encodeBase(base));
76 76 }
... ... @@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor {
80 80 fillSortedAttributes("score");
81 81 }
82 82  
83   - private String encodeBase(String base) {
84   - return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
  83 + private List<String> loadFrequentBases() throws IOException {
  84 + return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList());
85 85 }
86 86  
87   - private void loadFrequentBases() {
88   - try {
89   - Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath());
90   - this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList()));
91   - } catch (IOException e) {
92   - e.printStackTrace();
93   - }
  87 + private String encodeBase(String base) {
  88 + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
94 89 }
95 90  
96 91 public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) {
... ... @@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
123 118 attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size());
124 119 attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
125 120 attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
126   -
127   - assert (attribute2value.size() == getAttributesList().size());
128 121 }
129 122 addNormalizedAttributeValues(result);
130 123  
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... ... @@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor {
87 87 feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue());
88 88  
89 89 feature2value.remove(null);
90   - assert (feature2value.size() == getAttributesList().size());
91 90  
92 91 sentence2features.put(sentence, feature2value);
93 92  
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.zero;
  2 +
  3 +import com.google.common.collect.Maps;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  5 +import weka.core.Attribute;
  6 +import weka.core.DenseInstance;
  7 +import weka.core.Instance;
  8 +
  9 +import java.util.List;
  10 +import java.util.Map;
  11 +
  12 +public class InstanceCreator {
  13 +
  14 + private InstanceCreator() {
  15 + }
  16 +
  17 + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
  18 + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
  19 + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
  20 + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
  21 + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
  22 + Map<Attribute, Double> sentenceFeatures = entry.getValue();
  23 + for (Attribute attribute : featureExtractor.getAttributesList()) {
  24 + instance.setValue(attribute, sentenceFeatures.get(attribute));
  25 + }
  26 + candidate2instance.put(entry.getKey(), instance);
  27 + }
  28 + return candidate2instance;
  29 + }
  30 +
  31 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... ... @@ -4,6 +4,7 @@ import com.google.common.collect.Lists;
4 4 import com.google.common.collect.Maps;
5 5 import com.google.common.collect.Sets;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
9 10 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
... ... @@ -18,18 +19,56 @@ import java.util.Set;
18 19  
19 20 public class ZeroFeatureExtractor extends FeatureExtractor {
20 21  
  22 + private static final String SCORE = "score";
  23 +
  24 + private static final String ANTECEDENT_PREFIX = "antecedent";
  25 + private static final String CANDIDATE_PREFIX = "candidate";
  26 +
  27 + private static final String SENTENCE_ENDS_WITH_QUESTION_MARK = "_sentence_ends_with_question_mark";
  28 + private static final String IS_NAMED = "_is_named";
  29 + private static final String TOKEN_COUNT = "_token_count";
  30 + private static final String FIRST_TOKEN_INDEX_IN_SENT = "_first_token_index_in_sent";
  31 + private static final String INDEX_IN_SENT = "_index_in_sent";
  32 + private static final String PREV_TOKEN_POS = "_prev_token_pos";
  33 + private static final String NEXT_TOKEN_POS = "_next_token_pos";
  34 + private static final String IS_NESTING = "_is_nesting";
  35 + private static final String IS_NESTED = "_is_nested";
  36 + private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count";
  37 + private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length";
  38 + private static final String IS_PAN_OR_PANI = "_is_pan_or_pani";
  39 +
  40 + // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet(
  41 +// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ",");
  42 + private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy");
  43 +
  44 + private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet();
  45 +// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(
  46 +// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ",");
  47 +
  48 + private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_";
  49 + private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_";
  50 +
21 51 public ZeroFeatureExtractor() {
22 52  
23   - for (String prefix : new String[]{"antecedent", "candidate"}) {
24   - addNumericAttribute(prefix + "_index_in_sent");
25   - addNumericAttribute(prefix + "_first_token_index_in_sent");
26   - addNumericAttribute(prefix + "_token_count");
27   - addBinaryAttribute(prefix + "_is_named");
28   - addNumericAttribute(prefix + "_sentence_mention_count");
29   - addNominalAttribute(prefix + "_next_token_pos", Constants.POS_TAGS);
30   - addNominalAttribute(prefix + "_prev_token_pos", Constants.POS_TAGS);
31   - addBinaryAttribute(prefix + "_is_nested");
32   - addBinaryAttribute(prefix + "_is_nesting");
  53 + for (String prefix : new String[]{ANTECEDENT_PREFIX, CANDIDATE_PREFIX}) {
  54 + addNumericAttribute(prefix + INDEX_IN_SENT);
  55 + addNumericAttribute(prefix + FIRST_TOKEN_INDEX_IN_SENT);
  56 + addNumericAttribute(prefix + TOKEN_COUNT);
  57 + addBinaryAttribute(prefix + IS_NAMED);
  58 + addBinaryAttribute(prefix + IS_PAN_OR_PANI);
  59 + addNominalAttribute(prefix + NEXT_TOKEN_POS, Constants.POS_TAGS);
  60 + addNominalAttribute(prefix + PREV_TOKEN_POS, Constants.POS_TAGS);
  61 + for (String prevLemma : PREV_TOKEN_LEMMAS) {
  62 + addBinaryAttribute(prefix + PREV_TOKEN_LEMMA + prevLemma);
  63 + }
  64 + for (String nextLemma : NEXT_TOKEN_LEMMAS) {
  65 + addBinaryAttribute(prefix + NEXT_TOKEN_LEMMA + nextLemma);
  66 + }
  67 + addBinaryAttribute(prefix + IS_NESTED);
  68 + addBinaryAttribute(prefix + IS_NESTING);
  69 + addNumericAttribute(prefix + SENTENCE_MENTION_COUNT);
  70 + addNumericAttribute(prefix + SENTENCE_TOKEN_LENGTH);
  71 + addBinaryAttribute(prefix + SENTENCE_ENDS_WITH_QUESTION_MARK);
33 72 }
34 73  
35 74 addNumericAttribute("chain_length");
... ... @@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
43 82 addNumericAttribute("pair_sent_distance");
44 83 addNumericAttribute("pair_par_distance");
45 84  
46   - addNominalAttribute("score", Lists.newArrayList("bad", "good"));
47   - fillSortedAttributes("score");
  85 + addNominalAttribute(SCORE, Lists.newArrayList("bad", "good"));
  86 + fillSortedAttributes(SCORE);
48 87 }
49 88  
50 89 public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) {
... ... @@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
62 101 private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) {
63 102  
64 103 Map<Attribute, Double> candidateFeatures = Maps.newHashMap();
65   - candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue());
  104 + candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue());
66 105  
67 106 TMention mention = candidate.getZeroCandidateMention();
68 107 TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get();
69 108  
70   - addMentionFeatures(helper, candidateFeatures, mention, "candidate");
71   - addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent");
  109 + addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX);
  110 + addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX);
72 111  
73 112 candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent))));
74 113 candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent))));
... ... @@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
98 137 }
99 138  
100 139 private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) {
101   - candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));
102   - candidateFeatures.put(getAttributeByName(attributePrefix + "_first_token_index_in_sent"), (double) helper.getMentionFirstTokenIndex(mention));
  140 + candidateFeatures.put(getAttributeByName(attributePrefix + INDEX_IN_SENT), (double) helper.getMentionIndexInSent(mention));
  141 + candidateFeatures.put(getAttributeByName(attributePrefix + FIRST_TOKEN_INDEX_IN_SENT), (double) helper.getMentionFirstTokenIndex(mention));
103 142  
104   - candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());
105   - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));
106   - candidateFeatures.put(getAttributeByName(attributePrefix + "_sentence_mention_count"), (double) helper.getMentionSentence(mention).getMentions().size());
  143 + candidateFeatures.put(getAttributeByName(attributePrefix + TOKEN_COUNT), (double) mention.getChildIdsSize());
  144 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NAMED), toBinary(helper.isMentionNamedEntity(mention)));
  145 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_PAN_OR_PANI), toBinary(helper.getMentionBase(mention).matches("(pan)|(pani)")));
107 146  
108 147 TToken nextToken = helper.getTokenAfterMention(mention);
109   - addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_next_token_pos");
  148 + addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + NEXT_TOKEN_POS);
  149 + String nextTokenLemma = nextToken == null ? "" : nextToken.getChosenInterpretation().getBase();
  150 + for (String nextLemma : NEXT_TOKEN_LEMMAS) {
  151 + candidateFeatures.put(getAttributeByName(attributePrefix + NEXT_TOKEN_LEMMA + nextLemma), toBinary(nextTokenLemma.equalsIgnoreCase(nextLemma)));
  152 + }
  153 +
110 154 TToken prevToken = helper.getTokenBeforeMention(mention);
111   - addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_prev_token_pos");
  155 + addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + PREV_TOKEN_POS);
  156 + String prevTokenLemma = prevToken == null ? "" : prevToken.getChosenInterpretation().getBase();
  157 + for (String prevLemma : PREV_TOKEN_LEMMAS) {
  158 + candidateFeatures.put(getAttributeByName(attributePrefix + PREV_TOKEN_LEMMA + prevLemma), toBinary(prevTokenLemma.equalsIgnoreCase(prevLemma)));
  159 + }
112 160  
113   - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention)));
114   - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention)));
  161 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTED), toBinary(helper.isNested(mention)));
  162 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTING), toBinary(helper.isNesting(mention)));
115 163  
  164 + TSentence mentionSentence = helper.getMentionSentence(mention);
  165 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size());
  166 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size());
  167 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?")));
116 168 }
117 169  
118 170 private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
119 171 Attribute att = getAttributeByName(attributeName);
120 172 int index = att.indexOfValue(value);
121 173 if (index == -1)
122   - LOG.warn(value + " not found for attribute " + attributeName);
  174 + LOG.warn(value + " not found for attribute " + attributeName);
123 175 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
124 176 }
125 177 }
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
... ... @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
8 8 import weka.classifiers.Classifier;
9 9 import weka.core.Instance;
10 10 import weka.core.Instances;
  11 +import weka.core.SerializationHelper;
11 12  
12   -import java.io.IOException;
13 13 import java.util.List;
14 14 import java.util.Map;
15 15 import java.util.Set;
... ... @@ -21,8 +21,8 @@ public class ZeroSubjectInjector {
21 21 private final Classifier classifier;
22 22 private final Instances instances;
23 23  
24   - public ZeroSubjectInjector() throws IOException, ClassNotFoundException {
25   - classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH);
  24 + public ZeroSubjectInjector() throws Exception {
  25 + classifier = (Classifier) SerializationHelper.read(ZeroSubjectInjector.class.getResourceAsStream(Constants.ZERO_MODEL_RESOURCE_PATH));
26 26 featureExtractor = new ZeroFeatureExtractor();
27 27 instances = Utils.createNewInstances(featureExtractor.getAttributesList());
28 28 }
... ... @@ -31,7 +31,7 @@ public class ZeroSubjectInjector {
31 31 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet());
32 32 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
33 33 Map<ZeroSubjectCandidate, Instance> candidate2instance =
34   - PrepareTrainingData.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  34 + InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
35 35  
36 36 Set<String> result = Sets.newHashSet();
37 37 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) {
... ...
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import org.junit.BeforeClass;
  4 +import org.junit.Test;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  7 +
  8 +import static org.junit.Assert.assertTrue;
  9 +
  10 +public class NicolasTest {
  11 +
  12 + private static final String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift";
  13 +
  14 + private static Nicolas nicolas;
  15 +
  16 + @BeforeClass
  17 + public static void shouldLoadModels() throws Exception {
  18 + nicolas = new Nicolas();
  19 + }
  20 +
  21 + @Test
  22 + public void shouldSummarizeThriftText() throws Exception {
  23 + TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
  24 + String summary = nicolas.summarizeThrift(thriftText, 5);
  25 + int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size();
  26 + assertTrue(summaryTokensCount > 0);
  27 + assertTrue(summaryTokensCount < 10);
  28 + }
  29 +
  30 +}
0 31 \ No newline at end of file
... ...
nicolas-core/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... ... @@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals;
18 18  
19 19 public class CandidateFinderTest {
20 20  
21   - private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin";
  21 + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift";
22 22 private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt";
23 23  
24 24 @Test
... ...
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift
No preview for this file type
nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift 0 → 100644
No preview for this file type
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/.gitignore 0 → 100644
  1 +*.bin
0 2 \ No newline at end of file
... ...
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/README.md 0 → 100644
  1 +To generate models in this folder, use the nicolas-train module.
0 2 \ No newline at end of file
... ...
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt renamed to nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/frequent_bases.txt
nicolas-train/pom.xml
... ... @@ -12,6 +12,16 @@
12 12 <artifactId>nicolas-train</artifactId>
13 13  
14 14 <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  22 + <artifactId>nicolas-lib</artifactId>
  23 + </dependency>
  24 +
15 25 <!-- internal -->
16 26 <dependency>
17 27 <groupId>pl.waw.ipipan.zil.summ</groupId>
... ... @@ -22,10 +32,28 @@
22 32 <artifactId>utils</artifactId>
23 33 </dependency>
24 34  
  35 + <!-- third party -->
  36 + <dependency>
  37 + <groupId>nz.ac.waikato.cms.weka</groupId>
  38 + <artifactId>weka-dev</artifactId>
  39 + </dependency>
  40 + <dependency>
  41 + <groupId>org.apache.commons</groupId>
  42 + <artifactId>commons-lang3</artifactId>
  43 + </dependency>
  44 + <dependency>
  45 + <groupId>net.lingala.zip4j</groupId>
  46 + <artifactId>zip4j</artifactId>
  47 + </dependency>
  48 +
25 49 <!-- logging -->
26 50 <dependency>
27 51 <groupId>org.slf4j</groupId>
28 52 <artifactId>slf4j-api</artifactId>
29 53 </dependency>
  54 + <dependency>
  55 + <groupId>org.slf4j</groupId>
  56 + <artifactId>slf4j-simple</artifactId>
  57 + </dependency>
30 58 </dependencies>
31 59 </project>
32 60 \ No newline at end of file
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +import net.lingala.zip4j.core.ZipFile;
  4 +import org.apache.commons.io.FileUtils;
  5 +import org.slf4j.Logger;
  6 +import org.slf4j.LoggerFactory;
  7 +import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess;
  8 +
  9 +import java.io.File;
  10 +import java.net.URL;
  11 +
  12 +public class DownloadAndPreprocessCorpus {
  13 +
  14 + private static final Logger LOG = LoggerFactory.getLogger(DownloadAndPreprocessCorpus.class);
  15 +
  16 + private static final String WORKING_DIR = "data";
  17 + private static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip";
  18 +
  19 + private DownloadAndPreprocessCorpus() {
  20 + }
  21 +
  22 + public static void main(String[] args) throws Exception {
  23 + File workDir = createFolder(WORKING_DIR);
  24 +
  25 + File corpusFile = new File(workDir, "corpus.zip");
  26 + if (!corpusFile.exists()) {
  27 + LOG.info("Downloading corpus file...");
  28 + FileUtils.copyURLToFile(new URL(CORPUS_DOWNLOAD_URL), corpusFile);
  29 + LOG.info("done.");
  30 + } else {
  31 + LOG.info("Corpus file already downloaded.");
  32 + }
  33 +
  34 + File extractedCorpusDir = new File(workDir, "corpus");
  35 + if (extractedCorpusDir.exists()) {
  36 + LOG.info("Corpus file already extracted.");
  37 + } else {
  38 + ZipFile zipFile = new ZipFile(corpusFile);
  39 + zipFile.extractAll(extractedCorpusDir.getPath());
  40 + LOG.info("Extracted corpus file.");
  41 + }
  42 +
  43 + File pscDir = new File(extractedCorpusDir, "PSC_1.0");
  44 + File dataDir = new File(pscDir, "data");
  45 +
  46 + File preprocessed = new File(WORKING_DIR, "preprocessed");
  47 + createFolder(preprocessed.getPath());
  48 + NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()});
  49 + }
  50 +
  51 + private static File createFolder(String path) {
  52 + File folder = new File(path);
  53 + if (folder.mkdir()) {
  54 + LOG.info("Created directory at: {}.", path);
  55 + } else {
  56 + LOG.info("Directory already present at: {}.", path);
  57 + }
  58 + return folder;
  59 + }
  60 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java 0 → 100644
package pl.waw.ipipan.zil.summ.nicolas.train;

import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel;
import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel;
import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;

/**
 * Entry point that trains all three NICOLAS models (mention, sentence,
 * zero-subject) by delegating to the individual trainers in sequence.
 */
public class TrainAllModels {

    private TrainAllModels() {
        // utility entry point, no instances
    }

    public static void main(String[] args) throws Exception {
        // each trainer reads its ARFF dataset and serializes its model on its own
        TrainMentionModel.main(args);
        TrainSentenceModel.main(args);
        TrainZeroModel.main(args);
    }
}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.train;
2   -
3   -public class Trainer {
4   -
5   - public static void main(String[] args) {
6   -
7   - }
8   -}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
  2 +
  3 +import weka.classifiers.Classifier;
  4 +import weka.classifiers.trees.RandomForest;
  5 +
  6 +public class ModelConstants {
  7 +
  8 + public static final String MENTION_DATASET_PATH = "mentions_train.arff";
  9 + public static final String SENTENCE_DATASET_PATH = "sentences_train.arff";
  10 + public static final String ZERO_DATASET_PATH = "zeros_train.arff";
  11 +
  12 + private static final int NUM_ITERATIONS = 16;
  13 + private static final int NUM_EXECUTION_SLOTS = 8;
  14 + private static final int SEED = 0;
  15 +
  16 + private ModelConstants() {
  17 + }
  18 +
  19 + public static Classifier getMentionClassifier() {
  20 + RandomForest classifier = new RandomForest();
  21 + classifier.setNumIterations(NUM_ITERATIONS);
  22 + classifier.setSeed(SEED);
  23 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
  24 + return classifier;
  25 + }
  26 +
  27 + public static Classifier getSentenceClassifier() {
  28 + RandomForest classifier = new RandomForest();
  29 + classifier.setNumIterations(16);
  30 + classifier.setSeed(0);
  31 + classifier.setNumExecutionSlots(8);
  32 + return classifier;
  33 + }
  34 +
  35 + public static Classifier getZeroClassifier() {
  36 + RandomForest classifier = new RandomForest();
  37 + classifier.setNumIterations(16);
  38 + classifier.setSeed(0);
  39 + classifier.setNumExecutionSlots(8);
  40 + return classifier;
  41 + }
  42 +
  43 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/TrainModel.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
2 2  
3 3 import org.apache.commons.lang3.time.StopWatch;
4 4 import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  6 +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;
7 7 import weka.classifiers.Classifier;
8 8 import weka.core.Instances;
9 9 import weka.core.converters.ArffLoader;
... ... @@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader;
11 11 import java.io.File;
12 12 import java.io.FileOutputStream;
13 13 import java.io.ObjectOutputStream;
  14 +import java.util.logging.LogManager;
14 15  
  16 +@SuppressWarnings("squid:S2118")
  17 +public class TrainModelCommon {
15 18  
16   -public class TrainModel {
  19 + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class);
17 20  
18   - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);
  21 + private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";
19 22  
20   - private TrainModel() {
  23 + private TrainModelCommon() {
21 24 }
22 25  
23   - public static void main(String[] args) throws Exception {
  26 + public static void trainAndSaveModel(String datasetPath, Classifier classifier, String targetPath) throws Exception {
  27 + LogManager.getLogManager().reset(); // disable WEKA logging
24 28  
25 29 ArffLoader loader = new ArffLoader();
26   - loader.setFile(new File(Constants.ZERO_DATASET_PATH));
  30 + loader.setFile(new File(datasetPath));
27 31 Instances instances = loader.getDataSet();
28 32 instances.setClassIndex(0);
29   - LOG.info(instances.size() + " instances loaded.");
30   - LOG.info(instances.numAttributes() + " attributes for each instance.");
  33 + LOG.info("{} instances loaded.", instances.size());
  34 + LOG.info("{} attributes for each instance.", instances.numAttributes());
31 35  
32 36 StopWatch watch = new StopWatch();
33 37 watch.start();
34 38  
35   - Classifier classifier = Constants.getZerosClassifier();
36   -
37 39 LOG.info("Building classifier...");
38 40 classifier.buildClassifier(instances);
39   - LOG.info("...done.");
  41 + LOG.info("...done. Build classifier: {}", classifier);
40 42  
  43 + String target = TARGET_MODEL_DIR + targetPath;
  44 + LOG.info("Saving classifier at: {}", target);
41 45 try (ObjectOutputStream oos = new ObjectOutputStream(
42   - new FileOutputStream(Constants.ZERO_MODEL_PATH))) {
  46 + new FileOutputStream(target))) {
43 47 oos.writeObject(classifier);
44 48 }
45 49  
46 50 watch.stop();
47   - LOG.info("Elapsed time: " + watch);
48   -
49   - LOG.info(classifier.toString());
  51 + LOG.info("Elapsed time: {}", watch);
50 52 }
51 53 }
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/MentionScorer.java
1   -package pl.waw.ipipan.zil.summ.nicolas.mention;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
2 2  
3 3 import com.google.common.collect.HashMultiset;
4 4 import com.google.common.collect.Maps;
... ... @@ -14,7 +14,6 @@ import java.util.stream.Collectors;
14 14  
15 15 public class MentionScorer {
16 16  
17   -
18 17 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
19 18 Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
20 19  
... ... @@ -39,20 +38,4 @@ public class MentionScorer {
39 38 }
40 39 return mention2score;
41 40 }
42   -
43   - private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
44   - Map<TMention, Double> mention2score = Maps.newHashMap();
45   - for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
46   - TMention mention = entry.getKey();
47   - String mentionOrth = mention2Orth.get(mention);
48   - int present = 0;
49   - for (String token : Utils.tokenize(mentionOrth)) {
50   - if (tokenCounts.contains(token.toLowerCase())) {
51   - present++;
52   - }
53   - }
54   - mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0);
55   - }
56   - return mention2score;
57   - }
58 41 }
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java
1   -package pl.waw.ipipan.zil.summ.nicolas.mention;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
2 2  
3 3 import com.google.common.base.Charsets;
4 4 import com.google.common.collect.Maps;
... ... @@ -7,9 +7,11 @@ import org.slf4j.Logger;
7 7 import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
11 10 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12 12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  14 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
13 15 import weka.core.Instance;
14 16 import weka.core.Instances;
15 17 import weka.core.converters.ArffSaver;
... ... @@ -23,8 +25,11 @@ public class PrepareTrainingData {
23 25  
24 26 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
25 27  
26   - public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
27   - public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  28 + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
  29 + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  30 +
  31 + private PrepareTrainingData() {
  32 + }
28 33  
29 34 public static void main(String[] args) throws IOException {
30 35  
... ... @@ -37,19 +42,20 @@ public class PrepareTrainingData {
37 42 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
38 43  
39 44 int i = 1;
40   - for (String textId : id2preprocessedText.keySet()) {
  45 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
41 46 LOG.info(i++ + "/" + id2preprocessedText.size());
42 47  
43   - TText preprocessedText = id2preprocessedText.get(textId);
44   - String optimalSummary = id2optimalSummary.get(textId);
  48 + String id = entry.getKey();
  49 + TText preprocessedText = entry.getValue();
  50 + String optimalSummary = id2optimalSummary.get(id);
45 51 if (optimalSummary == null)
46 52 continue;
47 53 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
48 54  
49 55 Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
50   - for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
51   - TMention mention = entry.getKey();
52   - Instance instance = entry.getValue();
  56 + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {
  57 + TMention mention = entry2.getKey();
  58 + Instance instance = entry2.getValue();
53 59 instance.setDataset(instances);
54 60 instance.setClassValue(mention2score.get(mention));
55 61 instances.add(instance);
... ... @@ -61,7 +67,7 @@ public class PrepareTrainingData {
61 67 private static void saveInstancesToFile(Instances instances) throws IOException {
62 68 ArffSaver saver = new ArffSaver();
63 69 saver.setInstances(instances);
64   - saver.setFile(new File(Constants.MENTIONS_DATASET_PATH));
  70 + saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH));
65 71 saver.writeBatch();
66 72 }
67 73  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/TrainMentionModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainMentionModel {
  9 +
  10 + private TrainMentionModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getMentionClassifier();
  15 + String datasetPath = ModelConstants.MENTION_DATASET_PATH;
  16 + String targetPath = Constants.MENTION_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java
1   -package pl.waw.ipipan.zil.summ.nicolas.sentence;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
2 2  
3 3 import com.google.common.base.Charsets;
4 4 import com.google.common.collect.Maps;
... ... @@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12 11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
13 13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
  16 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  17 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
16 18 import weka.classifiers.Classifier;
17 19 import weka.core.Instance;
18 20 import weka.core.Instances;
... ... @@ -31,6 +33,9 @@ public class PrepareTrainingData {
31 33 private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
32 34 private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
33 35  
  36 + private PrepareTrainingData() {
  37 + }
  38 +
34 39 public static void main(String[] args) throws Exception {
35 40  
36 41 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
... ... @@ -41,7 +46,7 @@ public class PrepareTrainingData {
41 46  
42 47 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
43 48  
44   - Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);
  49 + Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
45 50 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();
46 51  
47 52 int i = 1;
... ... @@ -74,7 +79,7 @@ public class PrepareTrainingData {
74 79 private static void saveInstancesToFile(Instances instances) throws IOException {
75 80 ArffSaver saver = new ArffSaver();
76 81 saver.setInstances(instances);
77   - saver.setFile(new File(Constants.SENTENCES_DATASET_PATH));
  82 + saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH));
78 83 saver.writeBatch();
79 84 }
80 85  
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/SentenceScorer.java
1   -package pl.waw.ipipan.zil.summ.nicolas.sentence;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
2 2  
3 3 import com.google.common.collect.HashMultiset;
4 4 import com.google.common.collect.Maps;
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/TrainSentenceModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainSentenceModel {
  9 +
  10 + private TrainSentenceModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getSentenceClassifier();
  15 + String datasetPath = ModelConstants.SENTENCE_DATASET_PATH;
  16 + String targetPath = Constants.SENTENCE_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
2 2  
3 3 import com.google.common.collect.Maps;
4 4 import com.google.common.collect.Sets;
... ... @@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils;
6 6 import org.slf4j.Logger;
7 7 import org.slf4j.LoggerFactory;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
10 9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
11 10 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
12   -import weka.core.Attribute;
13   -import weka.core.DenseInstance;
  11 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  12 +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
  13 +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
  14 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  15 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
14 16 import weka.core.Instance;
15 17 import weka.core.Instances;
16 18 import weka.core.converters.ArffSaver;
... ... @@ -54,7 +56,7 @@ public class PrepareTrainingData {
54 56 FeatureHelper featureHelper = new FeatureHelper(text);
55 57  
56 58 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
57   - Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
  59 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
58 60  
59 61 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
60 62 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
... ... @@ -68,24 +70,11 @@ public class PrepareTrainingData {
68 70 saveInstancesToFile(instances);
69 71 }
70 72  
71   - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
72   - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
73   - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();
74   - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {
75   - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
76   - Map<Attribute, Double> sentenceFeatures = entry.getValue();
77   - for (Attribute attribute : featureExtractor.getAttributesList()) {
78   - instance.setValue(attribute, sentenceFeatures.get(attribute));
79   - }
80   - candidate2instance.put(entry.getKey(), instance);
81   - }
82   - return candidate2instance;
83   - }
84 73  
85 74 private static void saveInstancesToFile(Instances instances) throws IOException {
86 75 ArffSaver saver = new ArffSaver();
87 76 saver.setInstances(instances);
88   - saver.setFile(new File(Constants.ZERO_DATASET_PATH));
  77 + saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH));
89 78 saver.writeBatch();
90 79 }
91 80  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/TrainZeroModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainZeroModel {
  9 +
  10 + private TrainZeroModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getZeroClassifier();
  15 + String datasetPath = ModelConstants.ZERO_DATASET_PATH;
  16 + String targetPath = Constants.ZERO_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
1   -package pl.waw.ipipan.zil.summ.nicolas.zero;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
2 2  
3 3 import com.google.common.collect.Maps;
4 4 import org.apache.commons.csv.CSVFormat;
... ... @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord;
7 7 import org.apache.commons.csv.QuoteMode;
8 8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9 9 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
10 11  
11 12 import java.io.IOException;
12 13 import java.io.InputStream;
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
... ... @@ -24,6 +24,9 @@ public class NLPProcess {
24 24  
25 25 private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT);
26 26  
  27 + private static final String CORPUS_FILE_SUFFIX = ".xml";
  28 + private static final String OUTPUT_FILE_SUFFIX = ".thrift";
  29 +
27 30 private NLPProcess() {
28 31 }
29 32  
... ... @@ -34,23 +37,27 @@ public class NLPProcess {
34 37 }
35 38 File corpusDir = new File(args[0]);
36 39 if (!corpusDir.isDirectory()) {
37   - LOG.error("Corpus directory does not exist: " + corpusDir);
  40 + LOG.error("Corpus directory does not exist: {}", corpusDir);
38 41 return;
39 42 }
40 43 File targetDir = new File(args[1]);
41 44 if (!targetDir.isDirectory()) {
42   - LOG.error("Target directory does not exist: " + targetDir);
  45 + LOG.error("Target directory does not exist: {}", targetDir);
43 46 return;
44 47 }
45 48  
46 49 int ok = 0;
47 50 int err = 0;
48   - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml"));
  51 + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX));
  52 + if (files == null || files.length == 0) {
  53 + LOG.error("No corpus files found at: {}", corpusDir);
  54 + return;
  55 + }
49 56 Arrays.sort(files);
50 57 for (File file : files) {
51 58 try {
52 59 Text text = PSC_IO.readText(file);
53   - File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin"));
  60 + File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX));
54 61 annotateNLP(text, targetFile);
55 62 ok++;
56 63 } catch (Exception e) {
... ... @@ -58,8 +65,8 @@ public class NLPProcess {
58 65 LOG.error("Problem with text in " + file + ", " + e);
59 66 }
60 67 }
61   - LOG.info(ok + " texts processed successfully.");
62   - LOG.info(err + " texts with errors.");
  68 + LOG.info("{} texts processed successfully.", ok);
  69 + LOG.info("{} texts with errors.", err);
63 70 }
64 71  
65 72 private static void annotateNLP(Text text, File targetFile) throws Exception {
... ... @@ -77,8 +84,8 @@ public class NLPProcess {
77 84 }
78 85  
79 86 public static void serialize(TText ttext, File targetFile) throws IOException {
80   - try (FileOutputStream fout = new FileOutputStream(targetFile);
81   - ObjectOutputStream oos = new ObjectOutputStream(fout)) {
  87 + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile);
  88 + ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) {
82 89 oos.writeObject(ttext);
83 90 }
84 91 }
... ...
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/EvalUtils.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java
1   -package pl.waw.ipipan.zil.summ.nicolas.eval;
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
2 2  
3 3 import org.apache.commons.lang3.time.StopWatch;
4 4 import org.apache.commons.lang3.tuple.Pair;
... ... @@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic;
14 14 import weka.classifiers.lazy.IBk;
15 15 import weka.classifiers.lazy.KStar;
16 16 import weka.classifiers.lazy.LWL;
  17 +import weka.classifiers.meta.AttributeSelectedClassifier;
17 18 import weka.classifiers.rules.DecisionTable;
18 19 import weka.classifiers.rules.JRip;
19 20 import weka.classifiers.rules.PART;
... ... @@ -23,21 +24,49 @@ import weka.classifiers.trees.J48;
23 24 import weka.classifiers.trees.LMT;
24 25 import weka.classifiers.trees.RandomForest;
25 26 import weka.core.Instances;
  27 +import weka.core.converters.ArffLoader;
26 28  
  29 +import java.io.File;
  30 +import java.io.IOException;
27 31 import java.util.Arrays;
28 32 import java.util.Comparator;
29 33 import java.util.Optional;
30 34 import java.util.Random;
  35 +import java.util.logging.LogManager;
31 36  
32   -public class EvalUtils {
33 37  
34   - private static final Logger LOG = LoggerFactory.getLogger(EvalUtils.class);
35   - public static final int NUM_FOLDS = 10;
  38 +class CrossvalidateCommon {
36 39  
37   - private EvalUtils() {
  40 + private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class);
  41 +
  42 + private static final int NUM_FOLDS = 10;
  43 +
  44 + private CrossvalidateCommon() {
  45 + }
  46 +
  47 + static void crossvalidateClassifiers(String datasetPath) throws IOException {
  48 + Instances instances = loadInstances(datasetPath);
  49 + crossvalidateClassification(instances);
  50 + }
  51 +
  52 + static void crossvalidateRegressors(String datasetPath) throws IOException {
  53 + Instances instances = loadInstances(datasetPath);
  54 + crossvalidateRegression(instances);
38 55 }
39 56  
40   - public static void crossvalidateClassification(Instances instances) throws Exception {
  57 + private static Instances loadInstances(String datasetPath) throws IOException {
  58 + LogManager.getLogManager().reset(); // disable WEKA logging
  59 +
  60 + ArffLoader loader = new ArffLoader();
  61 + loader.setFile(new File(datasetPath));
  62 + Instances instances = loader.getDataSet();
  63 + instances.setClassIndex(0);
  64 + LOG.info("{} instances loaded.", instances.size());
  65 + LOG.info("{} attributes for each instance.", instances.numAttributes());
  66 + return instances;
  67 + }
  68 +
  69 + private static void crossvalidateClassification(Instances instances) throws IOException {
41 70 StopWatch watch = new StopWatch();
42 71 watch.start();
43 72  
... ... @@ -45,52 +74,58 @@ public class EvalUtils {
45 74 new Logistic(), new ZeroR(),
46 75 new SimpleLogistic(), new BayesNet(), new NaiveBayes(),
47 76 new KStar(), new IBk(), new LWL(),
48   - new DecisionTable(), new JRip(), new PART()}).parallel().map(cls -> {
49   - Evaluation eval = null;
  77 + new DecisionTable(), new JRip(), new PART(),
  78 + createAttributeSelectedClassifier()}).parallel().map(cls -> {
  79 + String name = cls.getClass().getSimpleName();
  80 + double acc = 0;
  81 + Evaluation eval;
50 82 try {
51 83 eval = new Evaluation(instances);
52 84 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1));
53 85 } catch (Exception e) {
54   - e.printStackTrace();
  86 + LOG.error("Error evaluating model", e);
  87 + return Pair.of(0.0, name);
55 88 }
56   - double acc = eval.correct() / eval.numInstances();
57   - String name = cls.getClass().getSimpleName();
  89 + acc = eval.correct() / eval.numInstances();
58 90 LOG.info(name + " : " + acc);
59   -
60 91 return Pair.of(acc, name);
61 92 }).max(Comparator.comparingDouble(Pair::getLeft));
62 93 LOG.info("#########");
63 94 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
64 95  
65 96 watch.stop();
66   - LOG.info("Elapsed time: " + watch);
  97 + LOG.info("Elapsed time: {}", watch);
  98 + }
  99 +
  100 +
  101 + private static Classifier createAttributeSelectedClassifier() {
  102 + AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier();
  103 + attributeSelectedClassifier.setClassifier(new LMT());
  104 + return attributeSelectedClassifier;
67 105 }
68 106  
69   - public static void crossvalidateRegression(Instances instances) {
  107 + private static void crossvalidateRegression(Instances instances) {
70 108 StopWatch watch = new StopWatch();
71 109 watch.start();
72 110  
73 111 Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{
74 112 new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> {
75   - Evaluation eval = null;
76 113 double acc = 0;
  114 + String name = cls.getClass().getSimpleName();
77 115 try {
78   - eval = new Evaluation(instances);
  116 + Evaluation eval = new Evaluation(instances);
79 117 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1));
80 118 acc = eval.correlationCoefficient();
81   -
82 119 } catch (Exception e) {
83   - e.printStackTrace();
  120 + LOG.error("Error evaluating model", e);
84 121 }
85   - String name = cls.getClass().getSimpleName();
86 122 LOG.info(name + " : " + acc);
87   -
88 123 return Pair.of(acc, name);
89 124 }).max(Comparator.comparingDouble(Pair::getLeft));
90 125 LOG.info("#########");
91 126 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
92 127  
93 128 watch.stop();
94   - LOG.info("Elapsed time: " + watch);
  129 + LOG.info("Elapsed time: {}", watch);
95 130 }
96   -}
97 131 \ No newline at end of file
  132 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateMention {
  7 +
  8 + private CrossvalidateMention() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH);
  13 + }
  14 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateSentence {
  7 +
  8 + private CrossvalidateSentence() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH);
  13 + }
  14 +}
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateZero {
  7 +
  8 + private CrossvalidateZero() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH);
  13 + }
  14 +}
... ...
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt 0 → 100644
  1 +199704210011
  2 +199704210013
  3 +199704250031
  4 +199704260017
  5 +199801030156
  6 +199801100009
  7 +199801150038
  8 +199801150133
  9 +199801170001
  10 +199801170129
  11 +199801170130
  12 +199801200002
  13 +199801200132
  14 +199801210007
  15 +199801220030
  16 +199801220127
  17 +199801230001
  18 +199801230095
  19 +199801240116
  20 +199801240123
  21 +199801260113
  22 +199801270108
  23 +199801280128
  24 +199801290020
  25 +199801310032
  26 +199802040201
  27 +199901180149
  28 +199901190049
  29 +199901230088
  30 +199901250006
  31 +199901250008
  32 +199901250111
  33 +199901250113
  34 +199901300064
  35 +199901300098
  36 +199902240123
  37 +199906220027
  38 +199906220037
  39 +199906220038
  40 +199906220056
  41 +199906220065
  42 +199906230040
  43 +199906230052
  44 +199906240040
  45 +199906240088
  46 +199906250007
  47 +199906250091
  48 +199906260015
  49 +199906260018
  50 +199906260038
  51 +199907030016
  52 +199907030018
  53 +199907030042
  54 +199907030059
  55 +199907050032
  56 +199907050040
  57 +199907050047
  58 +199907050071
  59 +199907270095
  60 +199907270137
  61 +199907270145
  62 +199909210045
  63 +199909250054
  64 +199909300064
  65 +199909300065
  66 +199909300066
  67 +199910020049
  68 +199910020050
  69 +199910090047
  70 +199910090049
  71 +199910090051
  72 +199910110055
  73 +199910110057
  74 +199910210058
  75 +199910210059
  76 +199910270041
  77 +199910280054
  78 +199910280055
  79 +199910280057
  80 +199910300026
  81 +199911030039
  82 +199911030040
  83 +199911030041
  84 +199911060031
  85 +199911060042
  86 +199911060043
  87 +199911080054
  88 +199911080055
  89 +199911080056
  90 +199911100061
  91 +199911100062
  92 +199911100063
  93 +199911130036
  94 +199911130037
  95 +199911130038
  96 +199911180042
  97 +199911180043
  98 +199911180044
  99 +199911220059
  100 +199911220061
  101 +199911220066
  102 +199911230041
  103 +199911240035
  104 +199911240037
  105 +199911240038
  106 +199911250055
  107 +199911250057
  108 +199912020059
  109 +199912090045
  110 +199912090047
  111 +199912090061
  112 +199912110041
  113 +199912110042
  114 +199912130055
  115 +199912130057
  116 +199912170065
  117 +199912180052
  118 +199912210018
  119 +199912210037
  120 +199912210040
  121 +199912220045
  122 +199912220046
  123 +199912220047
  124 +199912230058
  125 +199912230059
  126 +199912230097
  127 +199912280028
  128 +199912280044
  129 +199912280045
  130 +199912310085
  131 +199912310087
  132 +200001030047
  133 +200001030106
  134 +200001040030
  135 +200001040031
  136 +200001060052
  137 +200001060053
  138 +200001060055
  139 +200001070062
  140 +200001070066
  141 +200001080040
  142 +200001080041
  143 +200001140061
  144 +200001140064
  145 +200001170049
  146 +200001170051
  147 +200001170052
  148 +200001170053
  149 +200001180040
  150 +200001200056
  151 +200001220023
  152 +200001220118
  153 +200001240016
  154 +200001290042
  155 +200001310048
  156 +200001310049
  157 +200001310050
  158 +200001310054
  159 +200002090042
  160 +200002090043
  161 +200002120045
  162 +200002120046
  163 +200002160046
  164 +200002160047
  165 +200002250063
  166 +200002250065
  167 +200002250066
  168 +200002290044
  169 +200002290045
  170 +200002290046
  171 +200002290047
  172 +200002290048
  173 +200003010058
  174 +200003010059
  175 +200003060054
  176 +200003060055
  177 +200003060057
  178 +200003110047
  179 +200003110048
  180 +200003110049
  181 +200003210044
  182 +200003210045
  183 +200004120021
  184 +200004120022
  185 +200004120023
  186 +200004150048
  187 +200004150049
  188 +200004150050
  189 +200004170026
  190 +200004170065
  191 +200004220044
  192 +200004220045
  193 +200004220046
  194 +200004220047
  195 +200004220048
  196 +200005060030
  197 +200005150055
  198 +200005150059
  199 +200005300045
  200 +200005300047
  201 +200005300048
  202 +200006010065
  203 +200006010066
  204 +200006010067
  205 +200006050056
  206 +200006050057
  207 +200006050058
  208 +200006050059
  209 +200006050061
  210 +200006050068
  211 +200006070056
  212 +200006080033
  213 +200006120031
  214 +200006130055
  215 +200006130057
  216 +200006130059
  217 +200006260069
  218 +200006260071
  219 +200006270059
  220 +200007120068
  221 +200007120070
  222 +200007120072
  223 +200007170026
  224 +200007180051
  225 +200007240034
  226 +200007270050
  227 +200007280033
  228 +200008040071
  229 +200008040073
  230 +200008250077
  231 +200008250079
  232 +200008260055
  233 +200008310046
  234 +200010120066
  235 +200010120074
  236 +200010130063
  237 +200010140048
  238 +200010140049
  239 +200010160039
  240 +200010160048
  241 +200010160049
  242 +200010180059
  243 +200010180063
  244 +200010190066
  245 +200010190068
  246 +200011210063
  247 +200011210064
  248 +200011210066
  249 +200012050066
  250 +200012050067
  251 +200012050068
  252 +200012050069
  253 +200012050070
  254 +200012050071
  255 +200012080134
  256 +200012080137
  257 +200012110069
  258 +200012110070
  259 +200012110071
  260 +200012110075
  261 +200012120028
  262 +200012120068
  263 +200012120072
  264 +200012130056
  265 +200012130100
  266 +200012130102
  267 +200012130103
  268 +200012140095
  269 +200012140096
  270 +200012140097
  271 +200012140098
  272 +200012140099
  273 +200012140100
  274 +200012150076
  275 +200012160048
  276 +200012160049
  277 +200012180083
  278 +200012180084
  279 +200012180088
  280 +200012230028
  281 +200012230045
  282 +200012230046
  283 +200012230047
  284 +200012230048
  285 +200012230050
  286 +200012270055
  287 +200012270056
  288 +200101020059
  289 +200101020062
  290 +200101020063
  291 +200101020075
  292 +200101130048
  293 +200101130050
  294 +200101130051
  295 +200101130055
  296 +200101150043
  297 +200101150045
  298 +200101180050
  299 +200101180051
  300 +200101180052
  301 +200101200048
  302 +200101220047
  303 +200101220053
  304 +200102070011
  305 +200102070016
  306 +200102120034
  307 +200102120057
  308 +200102130014
  309 +200102150001
  310 +200102150014
  311 +200102160011
  312 +200102190016
  313 +200102220001
  314 +200102220013
  315 +200102270041
  316 +200102270062
  317 +200102280169
  318 +200103010049
  319 +200103060022
  320 +200103060032
  321 +200103060057
  322 +200103080026
  323 +200103080030
  324 +200103080036
  325 +200103100019
  326 +200103100021
  327 +200103100058
  328 +200103100062
  329 +200103130008
  330 +200103130023
  331 +200103130069
  332 +200103200066
  333 +200103200080
  334 +200103270069
  335 +200103310092
  336 +200104020007
  337 +200104050011
  338 +200104100021
  339 +200104100023
  340 +200104170015
  341 +200104170040
  342 +200104170055
  343 +200104170057
  344 +200104190039
  345 +200104190066
  346 +200104230031
  347 +200104230069
  348 +200104260051
  349 +200104260053
  350 +200104300213
  351 +200104300215
  352 +200104300217
  353 +200105020092
  354 +200105050042
  355 +200105050043
  356 +200105050046
  357 +200105050048
  358 +200105070017
  359 +200105140050
  360 +200105140052
  361 +200105220096
  362 +200105290074
  363 +200105290075
  364 +200106120068
  365 +200106120069
  366 +200106180051
  367 +200106180053
  368 +200106200064
  369 +200106220086
  370 +200106220087
  371 +200106220088
  372 +200106220090
  373 +200106250050
  374 +200107120071
  375 +200107120073
  376 +200107210129
  377 +200107240070
  378 +200107250080
  379 +200108060051
  380 +200108060155
  381 +200108060156
  382 +200108060157
  383 +200108070038
  384 +200108160040
  385 +200108180123
  386 +200108200033
  387 +200108210066
  388 +200108210074
  389 +200108270077
  390 +200108280064
  391 +200109060061
  392 +200109130091
  393 +200109250092
  394 +200109260097
  395 +200109270116
  396 +200110020075
  397 +200110150056
  398 +200110150062
  399 +200110200070
  400 +200110200071
  401 +200110220068
  402 +200111080086
  403 +200111140055
  404 +200111210078
  405 +200111240060
  406 +200112040031
  407 +200112040077
  408 +200112050063
  409 +200112100041
  410 +200112190067
  411 +200201280011
  412 +200201290029
  413 +200202280078
  414 +200203280057
  415 +200203290107
... ...
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt 0 → 100644
  1 +199704210012
  2 +199704210042
  3 +199704220007
  4 +199704220018
  5 +199704220021
  6 +199704220044
  7 +199704230006
  8 +199704230014
  9 +199704230029
  10 +199704230043
  11 +199704240008
  12 +199704240019
  13 +199704240020
  14 +199704240021
  15 +199704250018
  16 +199704250022
  17 +199704260014
  18 +199704260015
  19 +199704260016
  20 +199704280023
  21 +199704280025
  22 +199704280027
  23 +199704280031
  24 +199704300031
  25 +199704300042
  26 +199704300046
  27 +199801020010
  28 +199801020031
  29 +199801020035
  30 +199801020070
  31 +199801020076
  32 +199801020079
  33 +199801030068
  34 +199801030090
  35 +199801030091
  36 +199801030129
  37 +199801030148
  38 +199801030158
  39 +199801050023
  40 +199801050059
  41 +199801130087
  42 +199801130129
  43 +199801140182
  44 +199801160119
  45 +199801200106
  46 +199801220140
  47 +199801240061
  48 +199801240096
  49 +199801260047
  50 +199801260070
  51 +199801270055
  52 +199801270110
  53 +199801280123
  54 +199801280158
  55 +199801280159
  56 +199801280241
  57 +199801290022
  58 +199801310003
  59 +199801310037
  60 +199802030127
  61 +199802040159
  62 +199802040182
  63 +199802040202
  64 +199805220133
  65 +199808280158
  66 +199901190073
  67 +199901190115
  68 +199901250112
  69 +199901250117
  70 +199901270103
  71 +199901270120
  72 +199901270122
  73 +199901290095
  74 +199901300101
  75 +199902240095
  76 +199906220029
  77 +199906230024
  78 +199906240084
  79 +199906260027
  80 +199907050045
  81 +199907050076
  82 +199907140166
  83 +199907200002
  84 +199907270004
  85 +199908260001
  86 +199909090036
  87 +199909250018
  88 +199909270029
  89 +199910020027
  90 +199910020029
  91 +199910270011
  92 +199911060044
  93 +199911100038
  94 +199911100064
  95 +199911200030
  96 +199911220063
  97 +199912020060
  98 +199912180026
  99 +199912180034
  100 +199912220030
  101 +199912280024
  102 +199912280046
  103 +199912300021
  104 +199912300029
  105 +200001030029
  106 +200001030053
  107 +200001060034
  108 +200001100035
  109 +200001100046
  110 +200001170029
  111 +200001170033
  112 +200001170060
  113 +200001290045
  114 +200002220027
  115 +200002240034
  116 +200002250031
  117 +200003060062
  118 +200003110050
  119 +200004280047
  120 +200004290022
  121 +200006050119
  122 +200006260079
  123 +200006290045
  124 +200007150033
  125 +200008040076
  126 +200008220042
  127 +200008220046
  128 +200010130049
  129 +200010160054
  130 +200012130034
  131 +200012140084
  132 +200012290046
  133 +200104040019
  134 +200106050035
  135 +200108180109
  136 +200108300032
  137 +200111120045
  138 +200111150042
  139 +200111150047
  140 +200111200036
  141 +200111270049
  142 +200112030055
  143 +200112280057
  144 +200201220038
  145 +200201220050
  146 +200202020036
  147 +200202200032
  148 +200202210054
  149 +200202270044
  150 +200203010070
  151 +200203190026
  152 +200203260050
  153 +200203280017
  154 +200203290078
... ...
nicolas-core/src/main/resources/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessTest.java renamed to nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
2 2  
  3 +import com.google.common.collect.Lists;
  4 +import org.junit.ClassRule;
3 5 import org.junit.Test;
  6 +import org.junit.rules.TemporaryFolder;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
4 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
5 9  
6 10 import java.io.File;
  11 +import java.util.List;
  12 +import java.util.stream.Collectors;
  13 +
  14 +import static junit.framework.TestCase.assertEquals;
  15 +
  16 +public class NLPProcessIT {
  17 +
  18 + @ClassRule
  19 + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder();
7 20  
8   -public class NLPProcessTest {
9 21 @Test
10 22 public void shouldProcessSampleText() throws Exception {
11 23 String text = "Ala ma kota. Ala ma też psa.";
12 24 TText processed = NLPProcess.annotate(text);
13   - processed.getParagraphs().stream().flatMap(p->p.getSentences().stream()).forEach(s->System.out.println(s.getId()));
14   - File targetFile = new File("sample_serialized_text.bin");
  25 + List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList());
  26 + assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids);
  27 +
  28 + File targetFile = TEMPORARY_FOLDER.newFile();
15 29 NLPProcess.serialize(processed, targetFile);
16 30 }
17 31 }
18 32 \ No newline at end of file
... ...
... ... @@ -11,7 +11,7 @@
11 11 <packaging>pom</packaging>
12 12  
13 13 <modules>
14   - <module>nicolas-core</module>
  14 + <module>nicolas-lib</module>
15 15 <module>nicolas-cli</module>
16 16 <module>nicolas-model</module>
17 17 <module>nicolas-train</module>
... ... @@ -26,12 +26,13 @@
26 26 <utils.version>1.0</utils.version>
27 27  
28 28 <commons-csv.version>1.4</commons-csv.version>
29   - <guava.version>19.0</guava.version>
30   - <weka-dev.version>3.9.0</weka-dev.version>
  29 + <guava.version>20.0</guava.version>
  30 + <weka-dev.version>3.9.1</weka-dev.version>
31 31 <commons-lang3.version>3.5</commons-lang3.version>
32 32 <commons-io.version>2.5</commons-io.version>
33   - <slf4j-api.version>1.7.12</slf4j-api.version>
  33 + <slf4j-api.version>1.7.22</slf4j-api.version>
34 34 <junit.version>4.12</junit.version>
  35 + <zip4j.version>1.3.2</zip4j.version>
35 36 </properties>
36 37  
37 38 <prerequisites>
... ... @@ -65,6 +66,16 @@
65 66 <artifactId>nicolas-zero</artifactId>
66 67 <version>${project.version}</version>
67 68 </dependency>
  69 + <dependency>
  70 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  71 + <artifactId>nicolas-lib</artifactId>
  72 + <version>${project.version}</version>
  73 + </dependency>
  74 + <dependency>
  75 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  76 + <artifactId>nicolas-train</artifactId>
  77 + <version>${project.version}</version>
  78 + </dependency>
68 79  
69 80 <!-- internal -->
70 81 <dependency>
... ... @@ -93,6 +104,12 @@
93 104 <groupId>nz.ac.waikato.cms.weka</groupId>
94 105 <artifactId>weka-dev</artifactId>
95 106 <version>${weka-dev.version}</version>
  107 + <exclusions>
  108 + <exclusion>
  109 + <groupId>org.slf4j</groupId>
  110 + <artifactId>slf4j-simple</artifactId>
  111 + </exclusion>
  112 + </exclusions>
96 113 </dependency>
97 114 <dependency>
98 115 <groupId>org.apache.commons</groupId>
... ... @@ -104,6 +121,11 @@
104 121 <artifactId>commons-io</artifactId>
105 122 <version>${commons-io.version}</version>
106 123 </dependency>
  124 + <dependency>
  125 + <groupId>net.lingala.zip4j</groupId>
  126 + <artifactId>zip4j</artifactId>
  127 + <version>${zip4j.version}</version>
  128 + </dependency>
107 129  
108 130 <!-- logging -->
109 131 <dependency>
... ... @@ -111,6 +133,11 @@
111 133 <artifactId>slf4j-api</artifactId>
112 134 <version>${slf4j-api.version}</version>
113 135 </dependency>
  136 + <dependency>
  137 + <groupId>org.slf4j</groupId>
  138 + <artifactId>slf4j-simple</artifactId>
  139 + <version>${slf4j-api.version}</version>
  140 + </dependency>
114 141  
115 142 <!-- test -->
116 143 <dependency>
... ...