Commit 76eeceb70c81d7fdfe3860db5a8576f0e4234daf

Authored by Mateusz Kopeć
1 parent f04fcb1a

large refactor

Showing 60 changed files with 1238 additions and 478 deletions
.gitignore
@@ -15,4 +15,4 @@ target/ @@ -15,4 +15,4 @@ target/
15 hs_err_pid* 15 hs_err_pid*
16 16
17 .idea 17 .idea
18 -*.iml  
19 \ No newline at end of file 18 \ No newline at end of file
  19 +*.iml
nicolas-common/pom.xml
@@ -27,6 +27,10 @@ @@ -27,6 +27,10 @@
27 <groupId>nz.ac.waikato.cms.weka</groupId> 27 <groupId>nz.ac.waikato.cms.weka</groupId>
28 <artifactId>weka-dev</artifactId> 28 <artifactId>weka-dev</artifactId>
29 </dependency> 29 </dependency>
  30 + <dependency>
  31 + <groupId>commons-io</groupId>
  32 + <artifactId>commons-io</artifactId>
  33 + </dependency>
30 34
31 <!-- logging --> 35 <!-- logging -->
32 <dependency> 36 <dependency>
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
@@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common; @@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common;
2 2
3 import com.google.common.base.Charsets; 3 import com.google.common.base.Charsets;
4 import com.google.common.collect.ImmutableList; 4 import com.google.common.collect.ImmutableList;
5 -import weka.classifiers.Classifier;  
6 -import weka.classifiers.functions.SMO;  
7 -import weka.classifiers.meta.AdaBoostM1;  
8 -import weka.classifiers.meta.AttributeSelectedClassifier;  
9 -import weka.classifiers.rules.JRip;  
10 -import weka.classifiers.trees.J48;  
11 -import weka.classifiers.trees.RandomForest;  
12 5
13 import java.nio.charset.Charset; 6 import java.nio.charset.Charset;
14 7
15 8
16 public class Constants { 9 public class Constants {
17 10
18 - public static final String MENTIONS_MODEL_PATH = "mentions_model.bin";  
19 - public static final String SENTENCES_MODEL_PATH = "sentences_model.bin";  
20 - public static final String ZERO_MODEL_PATH = "zeros_model.bin"; 11 + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/";
21 12
22 - public static final String MENTIONS_DATASET_PATH = "mentions_train.arff";  
23 - public static final String SENTENCES_DATASET_PATH = "sentences_train.arff";  
24 - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; 13 + private static final String MODELS_PATH = ROOT_PATH + "models/";
  14 + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin";
  15 + public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin";
  16 + public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin";
  17 +
  18 + private static final String RESOURCES_PATH = ROOT_PATH + "resources/";
  19 + public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt";
25 20
26 public static final Charset ENCODING = Charsets.UTF_8; 21 public static final Charset ENCODING = Charsets.UTF_8;
27 22
@@ -30,24 +25,4 @@ public class Constants { @@ -30,24 +25,4 @@ public class Constants {
30 private Constants() { 25 private Constants() {
31 } 26 }
32 27
33 - public static Classifier getMentionClassifier() {  
34 - RandomForest classifier = new RandomForest();  
35 - classifier.setNumIterations(250);  
36 - classifier.setSeed(0);  
37 - classifier.setNumExecutionSlots(8);  
38 - return classifier;  
39 - }  
40 -  
41 - public static Classifier getSentencesClassifier() {  
42 - RandomForest classifier = new RandomForest();  
43 - classifier.setNumIterations(10);  
44 - classifier.setSeed(0);  
45 - classifier.setNumExecutionSlots(8);  
46 - return classifier;  
47 - }  
48 -  
49 - public static Classifier getZerosClassifier() {  
50 - Classifier classifier = new J48();  
51 - return classifier;  
52 - }  
53 } 28 }
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common; @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common;
3 import com.google.common.collect.Lists; 3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import com.google.common.collect.Sets; 5 import com.google.common.collect.Sets;
  6 +import org.apache.commons.io.IOUtils;
6 import org.slf4j.Logger; 7 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 8 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
@@ -24,6 +25,47 @@ public class Utils { @@ -24,6 +25,47 @@ public class Utils {
24 25
25 private static final String DATASET_NAME = "Dataset"; 26 private static final String DATASET_NAME = "Dataset";
26 27
  28 + private Utils() {
  29 + }
  30 +
  31 + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
  32 + LOG.info("Loading classifier from path: {}...", modelResourcePath);
  33 + try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {
  34 + if (stream == null) {
  35 + throw new IOException("Model not found at: " + modelResourcePath);
  36 + }
  37 + try (ObjectInputStream ois = new ObjectInputStream(stream)) {
  38 + Classifier classifier = (Classifier) ois.readObject();
  39 + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());
  40 + return classifier;
  41 + } catch (ClassNotFoundException e) {
  42 + LOG.error("Error loading serialized classifier, class not found.", e);
  43 + throw new IOException(e);
  44 + }
  45 + }
  46 + }
  47 +
  48 + public static TText loadThriftTextFromResource(String textResourcePath) throws IOException {
  49 + try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) {
  50 + if (stream == null) {
  51 + throw new IOException("Resource not found at: " + textResourcePath);
  52 + }
  53 + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) {
  54 + return (TText) ois.readObject();
  55 + } catch (ClassNotFoundException e) {
  56 + LOG.error("Error reading serialized thrift text file, class not found.", e);
  57 + throw new IOException(e);
  58 + }
  59 + }
  60 + }
  61 +
  62 + public static List<String> loadLinesFromResource(String resourcePath) throws IOException {
  63 + try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) {
  64 + return IOUtils.readLines(stream, Constants.ENCODING);
  65 + }
  66 + }
  67 +
  68 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
27 public static Instances createNewInstances(ArrayList<Attribute> attributesList) { 69 public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
28 Instances instances = new Instances(DATASET_NAME, attributesList, 0); 70 Instances instances = new Instances(DATASET_NAME, attributesList, 0);
29 instances.setClassIndex(0); 71 instances.setClassIndex(0);
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
@@ -8,10 +8,12 @@ import java.io.ObjectStreamClass; @@ -8,10 +8,12 @@ import java.io.ObjectStreamClass;
8 8
9 public class VersionIgnoringObjectInputStream extends ObjectInputStream { 9 public class VersionIgnoringObjectInputStream extends ObjectInputStream {
10 10
11 - public VersionIgnoringObjectInputStream(InputStream in) throws IOException { 11 + VersionIgnoringObjectInputStream(InputStream in) throws IOException {
12 super(in); 12 super(in);
13 } 13 }
14 14
  15 + @Override
  16 + @SuppressWarnings("squid:S1166")
15 protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { 17 protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException {
16 ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor 18 ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor
17 Class localClass; // the class in the local JVM that this descriptor represents. 19 Class localClass; // the class in the local JVM that this descriptor represents.
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
@@ -17,6 +17,7 @@ public class FeatureExtractor { @@ -17,6 +17,7 @@ public class FeatureExtractor {
17 17
18 private final Set<String> normalizedAttributes = Sets.newHashSet(); 18 private final Set<String> normalizedAttributes = Sets.newHashSet();
19 19
  20 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
20 public ArrayList<Attribute> getAttributesList() { 21 public ArrayList<Attribute> getAttributesList() {
21 return Lists.newArrayList(sortedAttributes); 22 return Lists.newArrayList(sortedAttributes);
22 } 23 }
@@ -46,15 +47,14 @@ public class FeatureExtractor { @@ -46,15 +47,14 @@ public class FeatureExtractor {
46 protected void fillSortedAttributes(String scoreAttName) { 47 protected void fillSortedAttributes(String scoreAttName) {
47 sortedAttributes.addAll(name2attribute.values()); 48 sortedAttributes.addAll(name2attribute.values());
48 sortedAttributes.remove(getAttributeByName(scoreAttName)); 49 sortedAttributes.remove(getAttributeByName(scoreAttName));
49 - Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2))); 50 + sortedAttributes.sort(Comparator.comparing(name2attribute.inverse()::get));
50 sortedAttributes.add(0, getAttributeByName(scoreAttName)); 51 sortedAttributes.add(0, getAttributeByName(scoreAttName));
51 } 52 }
52 53
53 protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { 54 protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) {
54 Map<Attribute, Double> attribute2max = Maps.newHashMap(); 55 Map<Attribute, Double> attribute2max = Maps.newHashMap();
55 Map<Attribute, Double> attribute2min = Maps.newHashMap(); 56 Map<Attribute, Double> attribute2min = Maps.newHashMap();
56 - for (T entity : entity2attributes.keySet()) {  
57 - Map<Attribute, Double> entityAttributes = entity2attributes.get(entity); 57 + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) {
58 for (String attributeName : normalizedAttributes) { 58 for (String attributeName : normalizedAttributes) {
59 Attribute attribute = getAttributeByName(attributeName); 59 Attribute attribute = getAttributeByName(attributeName);
60 Double value = entityAttributes.get(attribute); 60 Double value = entityAttributes.get(attribute);
@@ -66,8 +66,7 @@ public class FeatureExtractor { @@ -66,8 +66,7 @@ public class FeatureExtractor {
66 attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); 66 attribute2min.compute(attribute, (k, v) -> Math.min(v, value));
67 } 67 }
68 } 68 }
69 - for (T mention : entity2attributes.keySet()) {  
70 - Map<Attribute, Double> entityAttributes = entity2attributes.get(mention); 69 + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) {
71 for (Attribute attribute : attribute2max.keySet()) { 70 for (Attribute attribute : attribute2max.keySet()) {
72 Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); 71 Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized");
73 entityAttributes.put(normalizedAttribute, 72 entityAttributes.put(normalizedAttribute,
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
@@ -174,11 +174,11 @@ public class FeatureHelper { @@ -174,11 +174,11 @@ public class FeatureHelper {
174 } 174 }
175 175
176 public boolean isNested(TMention mention) { 176 public boolean isNested(TMention mention) {
177 - return mentions.stream().anyMatch(m -> m.getChildIds().containsAll(mention.getChildIds())); 177 + return mentions.stream().anyMatch(m -> !m.equals(mention) && m.getChildIds().containsAll(mention.getChildIds()));
178 } 178 }
179 179
180 public boolean isNesting(TMention mention) { 180 public boolean isNesting(TMention mention) {
181 - return mentions.stream().anyMatch(m -> mention.getChildIds().containsAll(m.getChildIds())); 181 + return mentions.stream().anyMatch(m -> !m.equals(mention) && mention.getChildIds().containsAll(m.getChildIds()));
182 } 182 }
183 183
184 public Set<TCoreference> getClusters() { 184 public Set<TCoreference> getClusters() {
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
@@ -33,6 +33,7 @@ public class Interpretation { @@ -33,6 +33,7 @@ public class Interpretation {
33 person = split[3]; 33 person = split[3];
34 break; 34 break;
35 case "siebie": 35 case "siebie":
  36 + case "prep":
36 casee = split[0]; 37 casee = split[0];
37 break; 38 break;
38 case "fin": 39 case "fin":
@@ -47,9 +48,6 @@ public class Interpretation { @@ -47,9 +48,6 @@ public class Interpretation {
47 number = split[0]; 48 number = split[0];
48 gender = split[1]; 49 gender = split[1];
49 break; 50 break;
50 - case "prep":  
51 - casee = split[0];  
52 - break;  
53 default: 51 default:
54 break; 52 break;
55 } 53 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.mention;  
2 -  
3 -import org.apache.commons.lang3.time.StopWatch;  
4 -import org.slf4j.Logger;  
5 -import org.slf4j.LoggerFactory;  
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
7 -import weka.classifiers.Classifier;  
8 -import weka.core.Instances;  
9 -import weka.core.converters.ArffLoader;  
10 -  
11 -import java.io.File;  
12 -import java.io.FileOutputStream;  
13 -import java.io.ObjectOutputStream;  
14 -  
15 -  
16 -public class TrainModel {  
17 - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);  
18 -  
19 - public static void main(String[] args) throws Exception {  
20 -  
21 - ArffLoader loader = new ArffLoader();  
22 - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));  
23 - Instances instances = loader.getDataSet();  
24 - instances.setClassIndex(0);  
25 - LOG.info(instances.size() + " instances loaded.");  
26 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
27 -  
28 - StopWatch watch = new StopWatch();  
29 - watch.start();  
30 -  
31 - Classifier classifier = Constants.getMentionClassifier();  
32 -  
33 - LOG.info("Building classifier...");  
34 - classifier.buildClassifier(instances);  
35 - LOG.info("...done.");  
36 -  
37 - try (ObjectOutputStream oos = new ObjectOutputStream(  
38 - new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) {  
39 - oos.writeObject(classifier);  
40 - }  
41 -  
42 - watch.stop();  
43 - LOG.info("Elapsed time: " + watch);  
44 -  
45 - LOG.info(classifier.toString());  
46 - }  
47 -}  
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.mention.test;  
2 -  
3 -import org.slf4j.Logger;  
4 -import org.slf4j.LoggerFactory;  
5 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
6 -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;  
7 -import weka.core.Instances;  
8 -import weka.core.converters.ArffLoader;  
9 -  
10 -import java.io.File;  
11 -  
12 -  
13 -public class Crossvalidate {  
14 -  
15 - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);  
16 -  
17 - private Crossvalidate() {  
18 - }  
19 -  
20 - public static void main(String[] args) throws Exception {  
21 - ArffLoader loader = new ArffLoader();  
22 - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));  
23 - Instances instances = loader.getDataSet();  
24 - instances.setClassIndex(0);  
25 - LOG.info(instances.size() + " instances loaded.");  
26 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
27 -  
28 - EvalUtils.crossvalidateClassification(instances);  
29 - }  
30 -}  
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.mention.test;  
2 -  
3 -import org.apache.commons.lang3.time.StopWatch;  
4 -import org.slf4j.Logger;  
5 -import org.slf4j.LoggerFactory;  
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
7 -import weka.classifiers.Classifier;  
8 -import weka.classifiers.evaluation.Evaluation;  
9 -import weka.core.Instances;  
10 -import weka.core.converters.ArffLoader;  
11 -  
12 -import java.io.File;  
13 -import java.io.FileInputStream;  
14 -import java.io.IOException;  
15 -import java.io.ObjectInputStream;  
16 -  
17 -  
18 -public class Validate {  
19 - private static final Logger LOG = LoggerFactory.getLogger(Validate.class);  
20 -  
21 - public static void main(String[] args) throws Exception {  
22 -  
23 - ArffLoader loader = new ArffLoader();  
24 - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH));  
25 - Instances instances = loader.getDataSet();  
26 - instances.setClassIndex(0);  
27 - LOG.info(instances.size() + " instances loaded.");  
28 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
29 -  
30 - Classifier classifier = loadClassifier();  
31 -  
32 - StopWatch watch = new StopWatch();  
33 - watch.start();  
34 -  
35 - Evaluation eval = new Evaluation(instances);  
36 - eval.evaluateModel(classifier, instances);  
37 -  
38 - LOG.info(eval.toSummaryString());  
39 -  
40 - watch.stop();  
41 - LOG.info("Elapsed time: " + watch);  
42 - }  
43 -  
44 - private static Classifier loadClassifier() throws IOException, ClassNotFoundException {  
45 - LOG.info("Loading classifier...");  
46 - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) {  
47 - Classifier classifier = (Classifier) ois.readObject();  
48 - LOG.info("Done. " + classifier.toString());  
49 - return classifier;  
50 - }  
51 - }  
52 -}  
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.sentence;  
2 -  
3 -import org.apache.commons.lang3.time.StopWatch;  
4 -import org.slf4j.Logger;  
5 -import org.slf4j.LoggerFactory;  
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
7 -import weka.classifiers.Classifier;  
8 -import weka.core.Instances;  
9 -import weka.core.converters.ArffLoader;  
10 -  
11 -import java.io.File;  
12 -import java.io.FileOutputStream;  
13 -import java.io.ObjectOutputStream;  
14 -  
15 -  
16 -public class TrainModel {  
17 - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class);  
18 -  
19 - public static void main(String[] args) throws Exception {  
20 -  
21 - ArffLoader loader = new ArffLoader();  
22 - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));  
23 - Instances instances = loader.getDataSet();  
24 - instances.setClassIndex(0);  
25 - LOG.info(instances.size() + " instances loaded.");  
26 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
27 -  
28 - StopWatch watch = new StopWatch();  
29 - watch.start();  
30 -  
31 - Classifier classifier = Constants.getSentencesClassifier();  
32 -  
33 - LOG.info("Building classifier...");  
34 - classifier.buildClassifier(instances);  
35 - LOG.info("...done.");  
36 -  
37 - try (ObjectOutputStream oos = new ObjectOutputStream(  
38 - new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) {  
39 - oos.writeObject(classifier);  
40 - }  
41 -  
42 - watch.stop();  
43 - LOG.info("Elapsed time: " + watch);  
44 -  
45 - LOG.info(classifier.toString());  
46 - }  
47 -}  
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.sentence.test;  
2 -  
3 -import org.slf4j.Logger;  
4 -import org.slf4j.LoggerFactory;  
5 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
6 -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;  
7 -import weka.core.Instances;  
8 -import weka.core.converters.ArffLoader;  
9 -  
10 -import java.io.File;  
11 -  
12 -  
13 -public class Crossvalidate {  
14 -  
15 - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);  
16 -  
17 - private Crossvalidate() {  
18 - }  
19 -  
20 - public static void main(String[] args) throws Exception {  
21 -  
22 - ArffLoader loader = new ArffLoader();  
23 - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH));  
24 - Instances instances = loader.getDataSet();  
25 - instances.setClassIndex(0);  
26 - LOG.info(instances.size() + " instances loaded.");  
27 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
28 -  
29 - EvalUtils.crossvalidateRegression(instances);  
30 - }  
31 -}  
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/test/Crossvalidate.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.zero.test;  
2 -  
3 -import org.slf4j.Logger;  
4 -import org.slf4j.LoggerFactory;  
5 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
6 -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils;  
7 -import weka.core.Instances;  
8 -import weka.core.converters.ArffLoader;  
9 -  
10 -import java.io.File;  
11 -  
12 -  
13 -public class Crossvalidate {  
14 -  
15 - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class);  
16 -  
17 - private Crossvalidate() {  
18 - }  
19 -  
20 - public static void main(String[] args) throws Exception {  
21 -  
22 - ArffLoader loader = new ArffLoader();  
23 - loader.setFile(new File(Constants.ZERO_DATASET_PATH));  
24 - Instances instances = loader.getDataSet();  
25 - instances.setClassIndex(0);  
26 - LOG.info(instances.size() + " instances loaded.");  
27 - LOG.info(instances.numAttributes() + " attributes for each instance.");  
28 -  
29 - EvalUtils.crossvalidateClassification(instances);  
30 - }  
31 -}  
nicolas-core/pom.xml renamed to nicolas-lib/pom.xml
@@ -9,7 +9,7 @@ @@ -9,7 +9,7 @@
9 <version>1.0-SNAPSHOT</version> 9 <version>1.0-SNAPSHOT</version>
10 </parent> 10 </parent>
11 11
12 - <artifactId>nicolas</artifactId> 12 + <artifactId>nicolas-lib</artifactId>
13 13
14 <dependencies> 14 <dependencies>
15 <!-- project --> 15 <!-- project -->
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; @@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
13 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; 13 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
  14 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
14 import weka.classifiers.Classifier; 15 import weka.classifiers.Classifier;
15 16
16 import java.io.IOException; 17 import java.io.IOException;
@@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList; @@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList;
20 21
21 public class Nicolas { 22 public class Nicolas {
22 23
23 - private final Classifier sentenceClassifier;  
24 - private final Classifier mentionClassifier;  
25 - private final MentionFeatureExtractor featureExtractor; 24 + private final Classifier mentionModel;
  25 + private final Classifier sentenceModel;
  26 + private final Classifier zeroModel;
  27 +
  28 + private final MentionFeatureExtractor mentionFeatureExtractor;
26 private final SentenceFeatureExtractor sentenceFeatureExtractor; 29 private final SentenceFeatureExtractor sentenceFeatureExtractor;
  30 + private final ZeroFeatureExtractor zeroFeatureExtractor;
27 31
28 public Nicolas() throws IOException, ClassNotFoundException { 32 public Nicolas() throws IOException, ClassNotFoundException {
29 - mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH);  
30 - featureExtractor = new MentionFeatureExtractor(); 33 + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  34 + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  35 + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
31 36
32 - sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); 37 + mentionFeatureExtractor = new MentionFeatureExtractor();
33 sentenceFeatureExtractor = new SentenceFeatureExtractor(); 38 sentenceFeatureExtractor = new SentenceFeatureExtractor();
  39 + zeroFeatureExtractor = new ZeroFeatureExtractor();
34 } 40 }
35 41
36 public String summarizeThrift(TText text, int targetTokenCount) throws Exception { 42 public String summarizeThrift(TText text, int targetTokenCount) throws Exception {
37 Set<TMention> goodMentions 43 Set<TMention> goodMentions
38 - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); 44 + = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text);
39 return calculateSummary(text, goodMentions, targetTokenCount); 45 return calculateSummary(text, goodMentions, targetTokenCount);
40 } 46 }
41 47
@@ -52,10 +58,10 @@ public class Nicolas { @@ -52,10 +58,10 @@ public class Nicolas {
52 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { 58 private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception {
53 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 59 List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
54 60
55 - Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceClassifier, sentenceFeatureExtractor); 61 + Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor);
56 62
57 List<TSentence> sortedSents = Lists.newArrayList(sents); 63 List<TSentence> sortedSents = Lists.newArrayList(sents);
58 - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); 64 + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
59 65
60 int size = 0; 66 int size = 0;
61 Random r = new Random(1); 67 Random r = new Random(1);
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
1 package pl.waw.ipipan.zil.summ.nicolas; 1 package pl.waw.ipipan.zil.summ.nicolas;
2 2
3 -import com.google.common.base.Charsets;  
4 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
5 -import com.google.common.io.Files;  
6 import org.slf4j.Logger; 4 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
12 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer;  
13 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
14 import weka.core.Attribute; 11 import weka.core.Attribute;
15 import weka.core.DenseInstance; 12 import weka.core.DenseInstance;
16 import weka.core.Instance; 13 import weka.core.Instance;
17 14
18 -import java.io.File;  
19 -import java.io.IOException;  
20 import java.util.List; 15 import java.util.List;
21 import java.util.Map; 16 import java.util.Map;
22 import java.util.Set; 17 import java.util.Set;
@@ -30,16 +25,6 @@ public class ThriftUtils { @@ -30,16 +25,6 @@ public class ThriftUtils {
30 private ThriftUtils() { 25 private ThriftUtils() {
31 } 26 }
32 27
33 - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException {  
34 - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8);  
35 -  
36 - MentionScorer scorer = new MentionScorer();  
37 - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text);  
38 -  
39 - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0);  
40 - return mention2score.keySet();  
41 - }  
42 -  
43 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { 28 public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) {
44 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 29 List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
45 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); 30 Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText);
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
@@ -26,18 +26,18 @@ import java.util.*; @@ -26,18 +26,18 @@ import java.util.*;
26 26
27 import static java.util.stream.Collectors.toList; 27 import static java.util.stream.Collectors.toList;
28 28
29 -public class ApplyModel2 { 29 +public class ApplyModel {
30 30
31 - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); 31 + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class);
32 32
33 private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; 33 private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test";
34 private static final String TARGET_DIR = "corpora/summaries"; 34 private static final String TARGET_DIR = "corpora/summaries";
35 35
36 public static void main(String[] args) throws Exception { 36 public static void main(String[] args) throws Exception {
37 - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); 37 + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
38 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); 38 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
39 39
40 - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); 40 + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH);
41 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); 41 SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
42 42
43 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); 43 ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
@@ -102,7 +102,7 @@ public class ApplyModel2 { @@ -102,7 +102,7 @@ public class ApplyModel2 {
102 } 102 }
103 103
104 List<TSentence> sortedSents = Lists.newArrayList(sents); 104 List<TSentence> sortedSents = Lists.newArrayList(sents);
105 - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); 105 + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
106 106
107 int size = 0; 107 int size = 0;
108 Random r = new Random(1); 108 Random r = new Random(1);
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
1 package pl.waw.ipipan.zil.summ.nicolas.mention; 1 package pl.waw.ipipan.zil.summ.nicolas.mention;
2 2
3 -import com.google.common.collect.*; 3 +import com.google.common.collect.Lists;
  4 +import com.google.common.collect.Maps;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
5 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 6 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
6 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; 8 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor;
7 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 9 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
8 import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; 10 import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation;
9 import weka.core.Attribute; 11 import weka.core.Attribute;
10 12
11 -import java.io.File;  
12 import java.io.IOException; 13 import java.io.IOException;
13 -import java.nio.file.Files;  
14 -import java.util.*; 14 +import java.util.List;
  15 +import java.util.Map;
15 import java.util.stream.Collectors; 16 import java.util.stream.Collectors;
16 -import java.util.stream.Stream;  
17 17
18 18
19 public class MentionFeatureExtractor extends FeatureExtractor { 19 public class MentionFeatureExtractor extends FeatureExtractor {
20 20
21 - private final List<String> frequentBases = Lists.newArrayList(); 21 + private final List<String> frequentBases;
22 22
23 - public MentionFeatureExtractor() { 23 + public MentionFeatureExtractor() throws IOException {
  24 + frequentBases = loadFrequentBases();
24 25
25 //coref 26 //coref
26 addNumericAttributeNormalized("chain_length"); 27 addNumericAttributeNormalized("chain_length");
@@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
70 addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); 71 addBinaryAttribute(prefix + "_sent_ends_with_questionmark");
71 72
72 // frequent bases 73 // frequent bases
73 - loadFrequentBases();  
74 for (String base : frequentBases) { 74 for (String base : frequentBases) {
75 addBinaryAttribute(prefix + "_" + encodeBase(base)); 75 addBinaryAttribute(prefix + "_" + encodeBase(base));
76 } 76 }
@@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor {
80 fillSortedAttributes("score"); 80 fillSortedAttributes("score");
81 } 81 }
82 82
83 - private String encodeBase(String base) {  
84 - return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); 83 + private List<String> loadFrequentBases() throws IOException {
  84 + return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList());
85 } 85 }
86 86
87 - private void loadFrequentBases() {  
88 - try {  
89 - Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath());  
90 - this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList()));  
91 - } catch (IOException e) {  
92 - e.printStackTrace();  
93 - } 87 + private String encodeBase(String base) {
  88 + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
94 } 89 }
95 90
96 public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { 91 public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) {
@@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
123 attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); 118 attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size());
124 attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); 119 attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size());
125 attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); 120 attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size());
126 -  
127 - assert (attribute2value.size() == getAttributesList().size());  
128 } 121 }
129 addNormalizedAttributeValues(result); 122 addNormalizedAttributeValues(result);
130 123
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor { @@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor {
87 feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); 87 feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue());
88 88
89 feature2value.remove(null); 89 feature2value.remove(null);
90 - assert (feature2value.size() == getAttributesList().size());  
91 90
92 sentence2features.put(sentence, feature2value); 91 sentence2features.put(sentence, feature2value);
93 92
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java 0 → 100644
package pl.waw.ipipan.zil.summ.nicolas.zero;

import com.google.common.collect.Maps;
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import weka.core.Attribute;
import weka.core.DenseInstance;
import weka.core.Instance;

import java.util.List;
import java.util.Map;

/**
 * Turns zero-subject candidates into WEKA {@link Instance} objects, using the
 * attribute set and feature values supplied by a {@link ZeroFeatureExtractor}.
 */
public class InstanceCreator {

    private InstanceCreator() {
        // static helpers only — never instantiated
    }

    /**
     * Builds one dense WEKA instance per candidate.
     *
     * @param candidates       zero-subject candidates to convert
     * @param text             preprocessed text the candidates come from
     * @param featureExtractor extractor defining both the attribute list and the per-candidate values
     * @return map from each candidate to its feature-filled instance
     */
    public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {
        Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);
        Map<ZeroSubjectCandidate, Instance> result = Maps.newHashMap();
        candidate2features.forEach((candidate, attribute2value) -> {
            Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());
            for (Attribute attribute : featureExtractor.getAttributesList()) {
                instance.setValue(attribute, attribute2value.get(attribute));
            }
            result.put(candidate, instance);
        });
        return result;
    }

}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -4,6 +4,7 @@ import com.google.common.collect.Lists; @@ -4,6 +4,7 @@ import com.google.common.collect.Lists;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import com.google.common.collect.Sets; 5 import com.google.common.collect.Sets;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
9 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 10 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
@@ -18,18 +19,56 @@ import java.util.Set; @@ -18,18 +19,56 @@ import java.util.Set;
18 19
19 public class ZeroFeatureExtractor extends FeatureExtractor { 20 public class ZeroFeatureExtractor extends FeatureExtractor {
20 21
  22 + private static final String SCORE = "score";
  23 +
  24 + private static final String ANTECEDENT_PREFIX = "antecedent";
  25 + private static final String CANDIDATE_PREFIX = "candidate";
  26 +
  27 + private static final String SENTENCE_ENDS_WITH_QUESTION_MARK = "_sentence_ends_with_question_mark";
  28 + private static final String IS_NAMED = "_is_named";
  29 + private static final String TOKEN_COUNT = "_token_count";
  30 + private static final String FIRST_TOKEN_INDEX_IN_SENT = "_first_token_index_in_sent";
  31 + private static final String INDEX_IN_SENT = "_index_in_sent";
  32 + private static final String PREV_TOKEN_POS = "_prev_token_pos";
  33 + private static final String NEXT_TOKEN_POS = "_next_token_pos";
  34 + private static final String IS_NESTING = "_is_nesting";
  35 + private static final String IS_NESTED = "_is_nested";
  36 + private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count";
  37 + private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length";
  38 + private static final String IS_PAN_OR_PANI = "_is_pan_or_pani";
  39 +
  40 + // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet(
  41 +// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ",");
  42 + private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy");
  43 +
  44 + private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet();
  45 +// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(
  46 +// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ",");
  47 +
  48 + private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_";
  49 + private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_";
  50 +
21 public ZeroFeatureExtractor() { 51 public ZeroFeatureExtractor() {
22 52
23 - for (String prefix : new String[]{"antecedent", "candidate"}) {  
24 - addNumericAttribute(prefix + "_index_in_sent");  
25 - addNumericAttribute(prefix + "_first_token_index_in_sent");  
26 - addNumericAttribute(prefix + "_token_count");  
27 - addBinaryAttribute(prefix + "_is_named");  
28 - addNumericAttribute(prefix + "_sentence_mention_count");  
29 - addNominalAttribute(prefix + "_next_token_pos", Constants.POS_TAGS);  
30 - addNominalAttribute(prefix + "_prev_token_pos", Constants.POS_TAGS);  
31 - addBinaryAttribute(prefix + "_is_nested");  
32 - addBinaryAttribute(prefix + "_is_nesting"); 53 + for (String prefix : new String[]{ANTECEDENT_PREFIX, CANDIDATE_PREFIX}) {
  54 + addNumericAttribute(prefix + INDEX_IN_SENT);
  55 + addNumericAttribute(prefix + FIRST_TOKEN_INDEX_IN_SENT);
  56 + addNumericAttribute(prefix + TOKEN_COUNT);
  57 + addBinaryAttribute(prefix + IS_NAMED);
  58 + addBinaryAttribute(prefix + IS_PAN_OR_PANI);
  59 + addNominalAttribute(prefix + NEXT_TOKEN_POS, Constants.POS_TAGS);
  60 + addNominalAttribute(prefix + PREV_TOKEN_POS, Constants.POS_TAGS);
  61 + for (String prevLemma : PREV_TOKEN_LEMMAS) {
  62 + addBinaryAttribute(prefix + PREV_TOKEN_LEMMA + prevLemma);
  63 + }
  64 + for (String nextLemma : NEXT_TOKEN_LEMMAS) {
  65 + addBinaryAttribute(prefix + NEXT_TOKEN_LEMMA + nextLemma);
  66 + }
  67 + addBinaryAttribute(prefix + IS_NESTED);
  68 + addBinaryAttribute(prefix + IS_NESTING);
  69 + addNumericAttribute(prefix + SENTENCE_MENTION_COUNT);
  70 + addNumericAttribute(prefix + SENTENCE_TOKEN_LENGTH);
  71 + addBinaryAttribute(prefix + SENTENCE_ENDS_WITH_QUESTION_MARK);
33 } 72 }
34 73
35 addNumericAttribute("chain_length"); 74 addNumericAttribute("chain_length");
@@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { @@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
43 addNumericAttribute("pair_sent_distance"); 82 addNumericAttribute("pair_sent_distance");
44 addNumericAttribute("pair_par_distance"); 83 addNumericAttribute("pair_par_distance");
45 84
46 - addNominalAttribute("score", Lists.newArrayList("bad", "good"));  
47 - fillSortedAttributes("score"); 85 + addNominalAttribute(SCORE, Lists.newArrayList("bad", "good"));
  86 + fillSortedAttributes(SCORE);
48 } 87 }
49 88
50 public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { 89 public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) {
@@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor { @@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
62 private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { 101 private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) {
63 102
64 Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); 103 Map<Attribute, Double> candidateFeatures = Maps.newHashMap();
65 - candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue()); 104 + candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue());
66 105
67 TMention mention = candidate.getZeroCandidateMention(); 106 TMention mention = candidate.getZeroCandidateMention();
68 TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); 107 TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get();
69 108
70 - addMentionFeatures(helper, candidateFeatures, mention, "candidate");  
71 - addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); 109 + addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX);
  110 + addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX);
72 111
73 candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent)))); 112 candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent))));
74 candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent)))); 113 candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent))));
@@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor { @@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor {
98 } 137 }
99 138
100 private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { 139 private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) {
101 - candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention));  
102 - candidateFeatures.put(getAttributeByName(attributePrefix + "_first_token_index_in_sent"), (double) helper.getMentionFirstTokenIndex(mention)); 140 + candidateFeatures.put(getAttributeByName(attributePrefix + INDEX_IN_SENT), (double) helper.getMentionIndexInSent(mention));
  141 + candidateFeatures.put(getAttributeByName(attributePrefix + FIRST_TOKEN_INDEX_IN_SENT), (double) helper.getMentionFirstTokenIndex(mention));
103 142
104 - candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize());  
105 - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention)));  
106 - candidateFeatures.put(getAttributeByName(attributePrefix + "_sentence_mention_count"), (double) helper.getMentionSentence(mention).getMentions().size()); 143 + candidateFeatures.put(getAttributeByName(attributePrefix + TOKEN_COUNT), (double) mention.getChildIdsSize());
  144 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NAMED), toBinary(helper.isMentionNamedEntity(mention)));
  145 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_PAN_OR_PANI), toBinary(helper.getMentionBase(mention).matches("(pan)|(pani)")));
107 146
108 TToken nextToken = helper.getTokenAfterMention(mention); 147 TToken nextToken = helper.getTokenAfterMention(mention);
109 - addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_next_token_pos"); 148 + addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + NEXT_TOKEN_POS);
  149 + String nextTokenLemma = nextToken == null ? "" : nextToken.getChosenInterpretation().getBase();
  150 + for (String nextLemma : NEXT_TOKEN_LEMMAS) {
  151 + candidateFeatures.put(getAttributeByName(attributePrefix + NEXT_TOKEN_LEMMA + nextLemma), toBinary(nextTokenLemma.equalsIgnoreCase(nextLemma)));
  152 + }
  153 +
110 TToken prevToken = helper.getTokenBeforeMention(mention); 154 TToken prevToken = helper.getTokenBeforeMention(mention);
111 - addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_prev_token_pos"); 155 + addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + PREV_TOKEN_POS);
  156 + String prevTokenLemma = prevToken == null ? "" : prevToken.getChosenInterpretation().getBase();
  157 + for (String prevLemma : PREV_TOKEN_LEMMAS) {
  158 + candidateFeatures.put(getAttributeByName(attributePrefix + PREV_TOKEN_LEMMA + prevLemma), toBinary(prevTokenLemma.equalsIgnoreCase(prevLemma)));
  159 + }
112 160
113 - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention)));  
114 - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); 161 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTED), toBinary(helper.isNested(mention)));
  162 + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTING), toBinary(helper.isNesting(mention)));
115 163
  164 + TSentence mentionSentence = helper.getMentionSentence(mention);
  165 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size());
  166 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size());
  167 + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?")));
116 } 168 }
117 169
118 private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { 170 private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) {
119 Attribute att = getAttributeByName(attributeName); 171 Attribute att = getAttributeByName(attributeName);
120 int index = att.indexOfValue(value); 172 int index = att.indexOfValue(value);
121 if (index == -1) 173 if (index == -1)
122 - LOG.warn(value + " not found for attribute " + attributeName); 174 + LOG.warn(value + " not found for attribute " + attributeName);
123 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); 175 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
124 } 176 }
125 } 177 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
@@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils; @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
8 import weka.classifiers.Classifier; 8 import weka.classifiers.Classifier;
9 import weka.core.Instance; 9 import weka.core.Instance;
10 import weka.core.Instances; 10 import weka.core.Instances;
  11 +import weka.core.SerializationHelper;
11 12
12 -import java.io.IOException;  
13 import java.util.List; 13 import java.util.List;
14 import java.util.Map; 14 import java.util.Map;
15 import java.util.Set; 15 import java.util.Set;
@@ -21,8 +21,8 @@ public class ZeroSubjectInjector { @@ -21,8 +21,8 @@ public class ZeroSubjectInjector {
21 private final Classifier classifier; 21 private final Classifier classifier;
22 private final Instances instances; 22 private final Instances instances;
23 23
24 - public ZeroSubjectInjector() throws IOException, ClassNotFoundException {  
25 - classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH); 24 + public ZeroSubjectInjector() throws Exception {
  25 + classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH);
26 featureExtractor = new ZeroFeatureExtractor(); 26 featureExtractor = new ZeroFeatureExtractor();
27 instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 27 instances = Utils.createNewInstances(featureExtractor.getAttributesList());
28 } 28 }
@@ -31,7 +31,7 @@ public class ZeroSubjectInjector { @@ -31,7 +31,7 @@ public class ZeroSubjectInjector {
31 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); 31 Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet());
32 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); 32 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
33 Map<ZeroSubjectCandidate, Instance> candidate2instance = 33 Map<ZeroSubjectCandidate, Instance> candidate2instance =
34 - PrepareTrainingData.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); 34 + InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
35 35
36 Set<String> result = Sets.newHashSet(); 36 Set<String> result = Sets.newHashSet();
37 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { 37 for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) {
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas;
  2 +
  3 +import org.junit.BeforeClass;
  4 +import org.junit.Test;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  6 +import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  7 +
  8 +import static org.junit.Assert.assertTrue;
  9 +
  10 +public class NicolasTest {
  11 +
  12 + private static final String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift";
  13 +
  14 + private static Nicolas nicolas;
  15 +
  16 + @BeforeClass
  17 + public static void shouldLoadModels() throws Exception {
  18 + nicolas = new Nicolas();
  19 + }
  20 +
  21 + @Test
  22 + public void shouldSummarizeThriftText() throws Exception {
  23 + TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
  24 + String summary = nicolas.summarizeThrift(thriftText, 5);
  25 + int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size();
  26 + assertTrue(summaryTokensCount > 0);
  27 + assertTrue(summaryTokensCount < 10);
  28 + }
  29 +
  30 +}
0 \ No newline at end of file 31 \ No newline at end of file
nicolas-core/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals; @@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals;
18 18
19 public class CandidateFinderTest { 19 public class CandidateFinderTest {
20 20
21 - private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; 21 + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift";
22 private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; 22 private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt";
23 23
24 @Test 24 @Test
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift
No preview for this file type
nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift 0 → 100644
No preview for this file type
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/.gitignore 0 → 100644
  1 +*.bin
0 \ No newline at end of file 2 \ No newline at end of file
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/README.md 0 → 100644
  1 +To generate models in this folder, use nicolas-trainer module.
0 \ No newline at end of file 2 \ No newline at end of file
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt renamed to nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/frequent_bases.txt
nicolas-train/pom.xml
@@ -12,6 +12,16 @@ @@ -12,6 +12,16 @@
12 <artifactId>nicolas-train</artifactId> 12 <artifactId>nicolas-train</artifactId>
13 13
14 <dependencies> 14 <dependencies>
  15 + <!-- project -->
  16 + <dependency>
  17 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  18 + <artifactId>nicolas-common</artifactId>
  19 + </dependency>
  20 + <dependency>
  21 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  22 + <artifactId>nicolas-lib</artifactId>
  23 + </dependency>
  24 +
15 <!-- internal --> 25 <!-- internal -->
16 <dependency> 26 <dependency>
17 <groupId>pl.waw.ipipan.zil.summ</groupId> 27 <groupId>pl.waw.ipipan.zil.summ</groupId>
@@ -22,10 +32,28 @@ @@ -22,10 +32,28 @@
22 <artifactId>utils</artifactId> 32 <artifactId>utils</artifactId>
23 </dependency> 33 </dependency>
24 34
  35 + <!-- third party -->
  36 + <dependency>
  37 + <groupId>nz.ac.waikato.cms.weka</groupId>
  38 + <artifactId>weka-dev</artifactId>
  39 + </dependency>
  40 + <dependency>
  41 + <groupId>org.apache.commons</groupId>
  42 + <artifactId>commons-lang3</artifactId>
  43 + </dependency>
  44 + <dependency>
  45 + <groupId>net.lingala.zip4j</groupId>
  46 + <artifactId>zip4j</artifactId>
  47 + </dependency>
  48 +
25 <!-- logging --> 49 <!-- logging -->
26 <dependency> 50 <dependency>
27 <groupId>org.slf4j</groupId> 51 <groupId>org.slf4j</groupId>
28 <artifactId>slf4j-api</artifactId> 52 <artifactId>slf4j-api</artifactId>
29 </dependency> 53 </dependency>
  54 + <dependency>
  55 + <groupId>org.slf4j</groupId>
  56 + <artifactId>slf4j-simple</artifactId>
  57 + </dependency>
30 </dependencies> 58 </dependencies>
31 </project> 59 </project>
32 \ No newline at end of file 60 \ No newline at end of file
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java 0 → 100644
package pl.waw.ipipan.zil.summ.nicolas.train;

import net.lingala.zip4j.core.ZipFile;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess;

import java.io.File;
import java.net.URL;

/**
 * Downloads the Polish Summaries Corpus archive, extracts it, and runs the
 * multiservice NLP preprocessing over its data directory. Both the downloaded
 * archive and the extracted corpus are cached on disk, so reruns skip the
 * expensive steps.
 */
public class DownloadAndPreprocessCorpus {

    private static final Logger LOG = LoggerFactory.getLogger(DownloadAndPreprocessCorpus.class);

    private static final String WORKING_DIR = "data";
    private static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip";

    private DownloadAndPreprocessCorpus() {
    }

    /**
     * Entry point: download (if absent), extract (if absent), then preprocess.
     *
     * @param args unused
     * @throws Exception on download, extraction, or preprocessing failure
     */
    public static void main(String[] args) throws Exception {
        File workDir = createFolder(WORKING_DIR);

        // Download once; subsequent runs reuse the cached archive.
        File corpusFile = new File(workDir, "corpus.zip");
        if (!corpusFile.exists()) {
            LOG.info("Downloading corpus file...");
            FileUtils.copyURLToFile(new URL(CORPUS_DOWNLOAD_URL), corpusFile);
            LOG.info("done.");
        } else {
            LOG.info("Corpus file already downloaded.");
        }

        // Extract once; the extracted directory doubles as the cache marker.
        File extractedCorpusDir = new File(workDir, "corpus");
        if (extractedCorpusDir.exists()) {
            LOG.info("Corpus file already extracted.");
        } else {
            ZipFile zipFile = new ZipFile(corpusFile);
            zipFile.extractAll(extractedCorpusDir.getPath());
            LOG.info("Extracted corpus file.");
        }

        File pscDir = new File(extractedCorpusDir, "PSC_1.0");
        File dataDir = new File(pscDir, "data");

        // Use the workDir handle consistently (was: raw WORKING_DIR string).
        File preprocessed = new File(workDir, "preprocessed");
        createFolder(preprocessed.getPath());
        NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()});
    }

    /**
     * Ensures a directory exists at the given path, creating missing parents.
     *
     * @param path directory path to create
     * @return a {@link File} handle for the directory
     */
    private static File createFolder(String path) {
        File folder = new File(path);
        // mkdirs (not mkdir) so nested working paths are created as well.
        if (folder.mkdirs()) {
            LOG.info("Created directory at: {}.", path);
        } else {
            LOG.info("Directory already present at: {}.", path);
        }
        return folder;
    }
}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;
  6 +
  7 +public class TrainAllModels {
  8 +
  9 + private TrainAllModels() {
  10 + }
  11 +
  12 + public static void main(String[] args) throws Exception {
  13 + TrainMentionModel.main(args);
  14 + TrainSentenceModel.main(args);
  15 + TrainZeroModel.main(args);
  16 + }
  17 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.train;  
2 -  
3 -public class Trainer {  
4 -  
5 - public static void main(String[] args) {  
6 -  
7 - }  
8 -}  
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
  2 +
  3 +import weka.classifiers.Classifier;
  4 +import weka.classifiers.trees.RandomForest;
  5 +
  6 +public class ModelConstants {
  7 +
  8 + public static final String MENTION_DATASET_PATH = "mentions_train.arff";
  9 + public static final String SENTENCE_DATASET_PATH = "sentences_train.arff";
  10 + public static final String ZERO_DATASET_PATH = "zeros_train.arff";
  11 +
  12 + private static final int NUM_ITERATIONS = 16;
  13 + private static final int NUM_EXECUTION_SLOTS = 8;
  14 + private static final int SEED = 0;
  15 +
  16 + private ModelConstants() {
  17 + }
  18 +
  19 + public static Classifier getMentionClassifier() {
  20 + RandomForest classifier = new RandomForest();
  21 + classifier.setNumIterations(NUM_ITERATIONS);
  22 + classifier.setSeed(SEED);
  23 + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS);
  24 + return classifier;
  25 + }
  26 +
  27 + public static Classifier getSentenceClassifier() {
  28 + RandomForest classifier = new RandomForest();
  29 + classifier.setNumIterations(16);
  30 + classifier.setSeed(0);
  31 + classifier.setNumExecutionSlots(8);
  32 + return classifier;
  33 + }
  34 +
  35 + public static Classifier getZeroClassifier() {
  36 + RandomForest classifier = new RandomForest();
  37 + classifier.setNumIterations(16);
  38 + classifier.setSeed(0);
  39 + classifier.setNumExecutionSlots(8);
  40 + return classifier;
  41 + }
  42 +
  43 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/TrainModel.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.common;
2 2
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel;
7 import weka.classifiers.Classifier; 7 import weka.classifiers.Classifier;
8 import weka.core.Instances; 8 import weka.core.Instances;
9 import weka.core.converters.ArffLoader; 9 import weka.core.converters.ArffLoader;
@@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader; @@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader;
11 import java.io.File; 11 import java.io.File;
12 import java.io.FileOutputStream; 12 import java.io.FileOutputStream;
13 import java.io.ObjectOutputStream; 13 import java.io.ObjectOutputStream;
  14 +import java.util.logging.LogManager;
14 15
  16 +@SuppressWarnings("squid:S2118")
  17 +public class TrainModelCommon {
15 18
16 -public class TrainModel { 19 + private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class);
17 20
18 - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); 21 + private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources";
19 22
20 - private TrainModel() { 23 + private TrainModelCommon() {
21 } 24 }
22 25
23 - public static void main(String[] args) throws Exception { 26 + public static void trainAndSaveModel(String datasetPath, Classifier classifier, String targetPath) throws Exception {
  27 + LogManager.getLogManager().reset(); // disable WEKA logging
24 28
25 ArffLoader loader = new ArffLoader(); 29 ArffLoader loader = new ArffLoader();
26 - loader.setFile(new File(Constants.ZERO_DATASET_PATH)); 30 + loader.setFile(new File(datasetPath));
27 Instances instances = loader.getDataSet(); 31 Instances instances = loader.getDataSet();
28 instances.setClassIndex(0); 32 instances.setClassIndex(0);
29 - LOG.info(instances.size() + " instances loaded.");  
30 - LOG.info(instances.numAttributes() + " attributes for each instance."); 33 + LOG.info("{} instances loaded.", instances.size());
  34 + LOG.info("{} attributes for each instance.", instances.numAttributes());
31 35
32 StopWatch watch = new StopWatch(); 36 StopWatch watch = new StopWatch();
33 watch.start(); 37 watch.start();
34 38
35 - Classifier classifier = Constants.getZerosClassifier();  
36 -  
37 LOG.info("Building classifier..."); 39 LOG.info("Building classifier...");
38 classifier.buildClassifier(instances); 40 classifier.buildClassifier(instances);
39 - LOG.info("...done."); 41 + LOG.info("...done. Build classifier: {}", classifier);
40 42
  43 + String target = TARGET_MODEL_DIR + targetPath;
  44 + LOG.info("Saving classifier at: {}", target);
41 try (ObjectOutputStream oos = new ObjectOutputStream( 45 try (ObjectOutputStream oos = new ObjectOutputStream(
42 - new FileOutputStream(Constants.ZERO_MODEL_PATH))) { 46 + new FileOutputStream(target))) {
43 oos.writeObject(classifier); 47 oos.writeObject(classifier);
44 } 48 }
45 49
46 watch.stop(); 50 watch.stop();
47 - LOG.info("Elapsed time: " + watch);  
48 -  
49 - LOG.info(classifier.toString()); 51 + LOG.info("Elapsed time: {}", watch);
50 } 52 }
51 } 53 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/MentionScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.mention; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
2 2
3 import com.google.common.collect.HashMultiset; 3 import com.google.common.collect.HashMultiset;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
@@ -14,7 +14,6 @@ import java.util.stream.Collectors; @@ -14,7 +14,6 @@ import java.util.stream.Collectors;
14 14
15 public class MentionScorer { 15 public class MentionScorer {
16 16
17 -  
18 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { 17 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
19 Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); 18 Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
20 19
@@ -39,20 +38,4 @@ public class MentionScorer { @@ -39,20 +38,4 @@ public class MentionScorer {
39 } 38 }
40 return mention2score; 39 return mention2score;
41 } 40 }
42 -  
43 - private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {  
44 - Map<TMention, Double> mention2score = Maps.newHashMap();  
45 - for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {  
46 - TMention mention = entry.getKey();  
47 - String mentionOrth = mention2Orth.get(mention);  
48 - int present = 0;  
49 - for (String token : Utils.tokenize(mentionOrth)) {  
50 - if (tokenCounts.contains(token.toLowerCase())) {  
51 - present++;  
52 - }  
53 - }  
54 - mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0);  
55 - }  
56 - return mention2score;  
57 - }  
58 } 41 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java
1 -package pl.waw.ipipan.zil.summ.nicolas.mention; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
2 2
3 import com.google.common.base.Charsets; 3 import com.google.common.base.Charsets;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
@@ -7,9 +7,11 @@ import org.slf4j.Logger; @@ -7,9 +7,11 @@ import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 7 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 10 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 12 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
  14 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
13 import weka.core.Instance; 15 import weka.core.Instance;
14 import weka.core.Instances; 16 import weka.core.Instances;
15 import weka.core.converters.ArffSaver; 17 import weka.core.converters.ArffSaver;
@@ -23,8 +25,11 @@ public class PrepareTrainingData { @@ -23,8 +25,11 @@ public class PrepareTrainingData {
23 25
24 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); 26 private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class);
25 27
26 - public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";  
27 - public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; 28 + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
  29 + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
  30 +
  31 + private PrepareTrainingData() {
  32 + }
28 33
29 public static void main(String[] args) throws IOException { 34 public static void main(String[] args) throws IOException {
30 35
@@ -37,19 +42,20 @@ public class PrepareTrainingData { @@ -37,19 +42,20 @@ public class PrepareTrainingData {
37 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 42 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
38 43
39 int i = 1; 44 int i = 1;
40 - for (String textId : id2preprocessedText.keySet()) { 45 + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
41 LOG.info(i++ + "/" + id2preprocessedText.size()); 46 LOG.info(i++ + "/" + id2preprocessedText.size());
42 47
43 - TText preprocessedText = id2preprocessedText.get(textId);  
44 - String optimalSummary = id2optimalSummary.get(textId); 48 + String id = entry.getKey();
  49 + TText preprocessedText = entry.getValue();
  50 + String optimalSummary = id2optimalSummary.get(id);
45 if (optimalSummary == null) 51 if (optimalSummary == null)
46 continue; 52 continue;
47 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); 53 Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText);
48 54
49 Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); 55 Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor);
50 - for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {  
51 - TMention mention = entry.getKey();  
52 - Instance instance = entry.getValue(); 56 + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) {
  57 + TMention mention = entry2.getKey();
  58 + Instance instance = entry2.getValue();
53 instance.setDataset(instances); 59 instance.setDataset(instances);
54 instance.setClassValue(mention2score.get(mention)); 60 instance.setClassValue(mention2score.get(mention));
55 instances.add(instance); 61 instances.add(instance);
@@ -61,7 +67,7 @@ public class PrepareTrainingData { @@ -61,7 +67,7 @@ public class PrepareTrainingData {
61 private static void saveInstancesToFile(Instances instances) throws IOException { 67 private static void saveInstancesToFile(Instances instances) throws IOException {
62 ArffSaver saver = new ArffSaver(); 68 ArffSaver saver = new ArffSaver();
63 saver.setInstances(instances); 69 saver.setInstances(instances);
64 - saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); 70 + saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH));
65 saver.writeBatch(); 71 saver.writeBatch();
66 } 72 }
67 73
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/TrainMentionModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainMentionModel {
  9 +
  10 + private TrainMentionModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getMentionClassifier();
  15 + String datasetPath = ModelConstants.MENTION_DATASET_PATH;
  16 + String targetPath = Constants.MENTION_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java
1 -package pl.waw.ipipan.zil.summ.nicolas.sentence; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
2 2
3 import com.google.common.base.Charsets; 3 import com.google.common.base.Charsets;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
@@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory; @@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
12 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; 11 import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils;
  12 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 13 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 15 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
  16 +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
  17 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
16 import weka.classifiers.Classifier; 18 import weka.classifiers.Classifier;
17 import weka.core.Instance; 19 import weka.core.Instance;
18 import weka.core.Instances; 20 import weka.core.Instances;
@@ -31,6 +33,9 @@ public class PrepareTrainingData { @@ -31,6 +33,9 @@ public class PrepareTrainingData {
31 private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; 33 private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev";
32 private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; 34 private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev";
33 35
  36 + private PrepareTrainingData() {
  37 + }
  38 +
34 public static void main(String[] args) throws Exception { 39 public static void main(String[] args) throws Exception {
35 40
36 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); 41 Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH);
@@ -41,7 +46,7 @@ public class PrepareTrainingData { @@ -41,7 +46,7 @@ public class PrepareTrainingData {
41 46
42 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 47 Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
43 48
44 - Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); 49 + Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH);
45 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); 50 MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor();
46 51
47 int i = 1; 52 int i = 1;
@@ -74,7 +79,7 @@ public class PrepareTrainingData { @@ -74,7 +79,7 @@ public class PrepareTrainingData {
74 private static void saveInstancesToFile(Instances instances) throws IOException { 79 private static void saveInstancesToFile(Instances instances) throws IOException {
75 ArffSaver saver = new ArffSaver(); 80 ArffSaver saver = new ArffSaver();
76 saver.setInstances(instances); 81 saver.setInstances(instances);
77 - saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); 82 + saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH));
78 saver.writeBatch(); 83 saver.writeBatch();
79 } 84 }
80 85
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/SentenceScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.sentence; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
2 2
3 import com.google.common.collect.HashMultiset; 3 import com.google.common.collect.HashMultiset;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/TrainSentenceModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainSentenceModel {
  9 +
  10 + private TrainSentenceModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getSentenceClassifier();
  15 + String datasetPath = ModelConstants.SENTENCE_DATASET_PATH;
  16 + String targetPath = Constants.SENTENCE_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
@@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils; @@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils;
6 import org.slf4j.Logger; 6 import org.slf4j.Logger;
7 import org.slf4j.LoggerFactory; 7 import org.slf4j.LoggerFactory;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
10 import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 9 import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
11 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 10 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
12 -import weka.core.Attribute;  
13 -import weka.core.DenseInstance; 11 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  12 +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
  13 +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
  14 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
  15 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
14 import weka.core.Instance; 16 import weka.core.Instance;
15 import weka.core.Instances; 17 import weka.core.Instances;
16 import weka.core.converters.ArffSaver; 18 import weka.core.converters.ArffSaver;
@@ -54,7 +56,7 @@ public class PrepareTrainingData { @@ -54,7 +56,7 @@ public class PrepareTrainingData {
54 FeatureHelper featureHelper = new FeatureHelper(text); 56 FeatureHelper featureHelper = new FeatureHelper(text);
55 57
56 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); 58 List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds);
57 - Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); 59 + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor);
58 60
59 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { 61 for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) {
60 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); 62 boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper);
@@ -68,24 +70,11 @@ public class PrepareTrainingData { @@ -68,24 +70,11 @@ public class PrepareTrainingData {
68 saveInstancesToFile(instances); 70 saveInstancesToFile(instances);
69 } 71 }
70 72
71 - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) {  
72 - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text);  
73 - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap();  
74 - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) {  
75 - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size());  
76 - Map<Attribute, Double> sentenceFeatures = entry.getValue();  
77 - for (Attribute attribute : featureExtractor.getAttributesList()) {  
78 - instance.setValue(attribute, sentenceFeatures.get(attribute));  
79 - }  
80 - candidate2instance.put(entry.getKey(), instance);  
81 - }  
82 - return candidate2instance;  
83 - }  
84 73
85 private static void saveInstancesToFile(Instances instances) throws IOException { 74 private static void saveInstancesToFile(Instances instances) throws IOException {
86 ArffSaver saver = new ArffSaver(); 75 ArffSaver saver = new ArffSaver();
87 saver.setInstances(instances); 76 saver.setInstances(instances);
88 - saver.setFile(new File(Constants.ZERO_DATASET_PATH)); 77 + saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH));
89 saver.writeBatch(); 78 saver.writeBatch();
90 } 79 }
91 80
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/TrainZeroModel.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  4 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  5 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon;
  6 +import weka.classifiers.Classifier;
  7 +
  8 +public class TrainZeroModel {
  9 +
  10 + private TrainZeroModel() {
  11 + }
  12 +
  13 + public static void main(String[] args) throws Exception {
  14 + Classifier classifier = ModelConstants.getZeroClassifier();
  15 + String datasetPath = ModelConstants.ZERO_DATASET_PATH;
  16 + String targetPath = Constants.ZERO_MODEL_RESOURCE_PATH;
  17 + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath);
  18 + }
  19 +
  20 +}
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
1 -package pl.waw.ipipan.zil.summ.nicolas.zero; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import org.apache.commons.csv.CSVFormat; 4 import org.apache.commons.csv.CSVFormat;
@@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord; @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord;
7 import org.apache.commons.csv.QuoteMode; 7 import org.apache.commons.csv.QuoteMode;
8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 8 import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; 9 import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper;
  10 +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
10 11
11 import java.io.IOException; 12 import java.io.IOException;
12 import java.io.InputStream; 13 import java.io.InputStream;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
@@ -24,6 +24,9 @@ public class NLPProcess { @@ -24,6 +24,9 @@ public class NLPProcess {
24 24
25 private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); 25 private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT);
26 26
  27 + private static final String CORPUS_FILE_SUFFIX = ".xml";
  28 + private static final String OUTPUT_FILE_SUFFIX = ".thrift";
  29 +
27 private NLPProcess() { 30 private NLPProcess() {
28 } 31 }
29 32
@@ -34,23 +37,27 @@ public class NLPProcess { @@ -34,23 +37,27 @@ public class NLPProcess {
34 } 37 }
35 File corpusDir = new File(args[0]); 38 File corpusDir = new File(args[0]);
36 if (!corpusDir.isDirectory()) { 39 if (!corpusDir.isDirectory()) {
37 - LOG.error("Corpus directory does not exist: " + corpusDir); 40 + LOG.error("Corpus directory does not exist: {}", corpusDir);
38 return; 41 return;
39 } 42 }
40 File targetDir = new File(args[1]); 43 File targetDir = new File(args[1]);
41 if (!targetDir.isDirectory()) { 44 if (!targetDir.isDirectory()) {
42 - LOG.error("Target directory does not exist: " + targetDir); 45 + LOG.error("Target directory does not exist: {}", targetDir);
43 return; 46 return;
44 } 47 }
45 48
46 int ok = 0; 49 int ok = 0;
47 int err = 0; 50 int err = 0;
48 - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml")); 51 + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX));
  52 + if (files == null || files.length == 0) {
  53 + LOG.error("No corpus files found at: {}", corpusDir);
  54 + return;
  55 + }
49 Arrays.sort(files); 56 Arrays.sort(files);
50 for (File file : files) { 57 for (File file : files) {
51 try { 58 try {
52 Text text = PSC_IO.readText(file); 59 Text text = PSC_IO.readText(file);
53 - File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin")); 60 + File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX));
54 annotateNLP(text, targetFile); 61 annotateNLP(text, targetFile);
55 ok++; 62 ok++;
56 } catch (Exception e) { 63 } catch (Exception e) {
@@ -58,8 +65,8 @@ public class NLPProcess { @@ -58,8 +65,8 @@ public class NLPProcess {
58 LOG.error("Problem with text in " + file + ", " + e); 65 LOG.error("Problem with text in " + file + ", " + e);
59 } 66 }
60 } 67 }
61 - LOG.info(ok + " texts processed successfully.");  
62 - LOG.info(err + " texts with errors."); 68 + LOG.info("{} texts processed successfully.", ok);
  69 + LOG.info("{} texts with errors.", err);
63 } 70 }
64 71
65 private static void annotateNLP(Text text, File targetFile) throws Exception { 72 private static void annotateNLP(Text text, File targetFile) throws Exception {
@@ -77,8 +84,8 @@ public class NLPProcess { @@ -77,8 +84,8 @@ public class NLPProcess {
77 } 84 }
78 85
79 public static void serialize(TText ttext, File targetFile) throws IOException { 86 public static void serialize(TText ttext, File targetFile) throws IOException {
80 - try (FileOutputStream fout = new FileOutputStream(targetFile);  
81 - ObjectOutputStream oos = new ObjectOutputStream(fout)) { 87 + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile);
  88 + ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) {
82 oos.writeObject(ttext); 89 oos.writeObject(ttext);
83 } 90 }
84 } 91 }
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/EvalUtils.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java
1 -package pl.waw.ipipan.zil.summ.nicolas.eval; 1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
2 2
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.apache.commons.lang3.tuple.Pair; 4 import org.apache.commons.lang3.tuple.Pair;
@@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic; @@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic;
14 import weka.classifiers.lazy.IBk; 14 import weka.classifiers.lazy.IBk;
15 import weka.classifiers.lazy.KStar; 15 import weka.classifiers.lazy.KStar;
16 import weka.classifiers.lazy.LWL; 16 import weka.classifiers.lazy.LWL;
  17 +import weka.classifiers.meta.AttributeSelectedClassifier;
17 import weka.classifiers.rules.DecisionTable; 18 import weka.classifiers.rules.DecisionTable;
18 import weka.classifiers.rules.JRip; 19 import weka.classifiers.rules.JRip;
19 import weka.classifiers.rules.PART; 20 import weka.classifiers.rules.PART;
@@ -23,21 +24,49 @@ import weka.classifiers.trees.J48; @@ -23,21 +24,49 @@ import weka.classifiers.trees.J48;
23 import weka.classifiers.trees.LMT; 24 import weka.classifiers.trees.LMT;
24 import weka.classifiers.trees.RandomForest; 25 import weka.classifiers.trees.RandomForest;
25 import weka.core.Instances; 26 import weka.core.Instances;
  27 +import weka.core.converters.ArffLoader;
26 28
  29 +import java.io.File;
  30 +import java.io.IOException;
27 import java.util.Arrays; 31 import java.util.Arrays;
28 import java.util.Comparator; 32 import java.util.Comparator;
29 import java.util.Optional; 33 import java.util.Optional;
30 import java.util.Random; 34 import java.util.Random;
  35 +import java.util.logging.LogManager;
31 36
32 -public class EvalUtils {  
33 37
34 - private static final Logger LOG = LoggerFactory.getLogger(EvalUtils.class);  
35 - public static final int NUM_FOLDS = 10; 38 +class CrossvalidateCommon {
36 39
37 - private EvalUtils() { 40 + private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class);
  41 +
  42 + private static final int NUM_FOLDS = 10;
  43 +
  44 + private CrossvalidateCommon() {
  45 + }
  46 +
  47 + static void crossvalidateClassifiers(String datasetPath) throws IOException {
  48 + Instances instances = loadInstances(datasetPath);
  49 + crossvalidateClassification(instances);
  50 + }
  51 +
  52 + static void crossvalidateRegressors(String datasetPath) throws IOException {
  53 + Instances instances = loadInstances(datasetPath);
  54 + crossvalidateRegression(instances);
38 } 55 }
39 56
40 - public static void crossvalidateClassification(Instances instances) throws Exception { 57 + private static Instances loadInstances(String datasetPath) throws IOException {
  58 + LogManager.getLogManager().reset(); // disable WEKA logging
  59 +
  60 + ArffLoader loader = new ArffLoader();
  61 + loader.setFile(new File(datasetPath));
  62 + Instances instances = loader.getDataSet();
  63 + instances.setClassIndex(0);
  64 + LOG.info("{} instances loaded.", instances.size());
  65 + LOG.info("{} attributes for each instance.", instances.numAttributes());
  66 + return instances;
  67 + }
  68 +
  69 + private static void crossvalidateClassification(Instances instances) throws IOException {
41 StopWatch watch = new StopWatch(); 70 StopWatch watch = new StopWatch();
42 watch.start(); 71 watch.start();
43 72
@@ -45,52 +74,58 @@ public class EvalUtils { @@ -45,52 +74,58 @@ public class EvalUtils {
45 new Logistic(), new ZeroR(), 74 new Logistic(), new ZeroR(),
46 new SimpleLogistic(), new BayesNet(), new NaiveBayes(), 75 new SimpleLogistic(), new BayesNet(), new NaiveBayes(),
47 new KStar(), new IBk(), new LWL(), 76 new KStar(), new IBk(), new LWL(),
48 - new DecisionTable(), new JRip(), new PART()}).parallel().map(cls -> {  
49 - Evaluation eval = null; 77 + new DecisionTable(), new JRip(), new PART(),
  78 + createAttributeSelectedClassifier()}).parallel().map(cls -> {
  79 + String name = cls.getClass().getSimpleName();
  80 + double acc = 0;
  81 + Evaluation eval;
50 try { 82 try {
51 eval = new Evaluation(instances); 83 eval = new Evaluation(instances);
52 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); 84 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1));
53 } catch (Exception e) { 85 } catch (Exception e) {
54 - e.printStackTrace(); 86 + LOG.error("Error evaluating model", e);
  87 + return Pair.of(0.0, name);
55 } 88 }
56 - double acc = eval.correct() / eval.numInstances();  
57 - String name = cls.getClass().getSimpleName(); 89 + acc = eval.correct() / eval.numInstances();
58 LOG.info(name + " : " + acc); 90 LOG.info(name + " : " + acc);
59 -  
60 return Pair.of(acc, name); 91 return Pair.of(acc, name);
61 }).max(Comparator.comparingDouble(Pair::getLeft)); 92 }).max(Comparator.comparingDouble(Pair::getLeft));
62 LOG.info("#########"); 93 LOG.info("#########");
63 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); 94 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
64 95
65 watch.stop(); 96 watch.stop();
66 - LOG.info("Elapsed time: " + watch); 97 + LOG.info("Elapsed time: {}", watch);
  98 + }
  99 +
  100 +
  101 + private static Classifier createAttributeSelectedClassifier() {
  102 + AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier();
  103 + attributeSelectedClassifier.setClassifier(new LMT());
  104 + return attributeSelectedClassifier;
67 } 105 }
68 106
69 - public static void crossvalidateRegression(Instances instances) { 107 + private static void crossvalidateRegression(Instances instances) {
70 StopWatch watch = new StopWatch(); 108 StopWatch watch = new StopWatch();
71 watch.start(); 109 watch.start();
72 110
73 Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ 111 Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{
74 new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { 112 new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> {
75 - Evaluation eval = null;  
76 double acc = 0; 113 double acc = 0;
  114 + String name = cls.getClass().getSimpleName();
77 try { 115 try {
78 - eval = new Evaluation(instances); 116 + Evaluation eval = new Evaluation(instances);
79 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); 117 eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1));
80 acc = eval.correlationCoefficient(); 118 acc = eval.correlationCoefficient();
81 -  
82 } catch (Exception e) { 119 } catch (Exception e) {
83 - e.printStackTrace(); 120 + LOG.error("Error evaluating model", e);
84 } 121 }
85 - String name = cls.getClass().getSimpleName();  
86 LOG.info(name + " : " + acc); 122 LOG.info(name + " : " + acc);
87 -  
88 return Pair.of(acc, name); 123 return Pair.of(acc, name);
89 }).max(Comparator.comparingDouble(Pair::getLeft)); 124 }).max(Comparator.comparingDouble(Pair::getLeft));
90 LOG.info("#########"); 125 LOG.info("#########");
91 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); 126 LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft());
92 127
93 watch.stop(); 128 watch.stop();
94 - LOG.info("Elapsed time: " + watch); 129 + LOG.info("Elapsed time: {}", watch);
95 } 130 }
96 -}  
97 \ No newline at end of file 131 \ No newline at end of file
  132 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateMention {
  7 +
  8 + private CrossvalidateMention() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH);
  13 + }
  14 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateSentence {
  7 +
  8 + private CrossvalidateSentence() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH);
  13 + }
  14 +}
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.train.search;
  2 +
  3 +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants;
  4 +
  5 +
  6 +public class CrossvalidateZero {
  7 +
  8 + private CrossvalidateZero() {
  9 + }
  10 +
  11 + public static void main(String[] args) throws Exception {
  12 + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH);
  13 + }
  14 +}
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt 0 → 100644
  1 +199704210011
  2 +199704210013
  3 +199704250031
  4 +199704260017
  5 +199801030156
  6 +199801100009
  7 +199801150038
  8 +199801150133
  9 +199801170001
  10 +199801170129
  11 +199801170130
  12 +199801200002
  13 +199801200132
  14 +199801210007
  15 +199801220030
  16 +199801220127
  17 +199801230001
  18 +199801230095
  19 +199801240116
  20 +199801240123
  21 +199801260113
  22 +199801270108
  23 +199801280128
  24 +199801290020
  25 +199801310032
  26 +199802040201
  27 +199901180149
  28 +199901190049
  29 +199901230088
  30 +199901250006
  31 +199901250008
  32 +199901250111
  33 +199901250113
  34 +199901300064
  35 +199901300098
  36 +199902240123
  37 +199906220027
  38 +199906220037
  39 +199906220038
  40 +199906220056
  41 +199906220065
  42 +199906230040
  43 +199906230052
  44 +199906240040
  45 +199906240088
  46 +199906250007
  47 +199906250091
  48 +199906260015
  49 +199906260018
  50 +199906260038
  51 +199907030016
  52 +199907030018
  53 +199907030042
  54 +199907030059
  55 +199907050032
  56 +199907050040
  57 +199907050047
  58 +199907050071
  59 +199907270095
  60 +199907270137
  61 +199907270145
  62 +199909210045
  63 +199909250054
  64 +199909300064
  65 +199909300065
  66 +199909300066
  67 +199910020049
  68 +199910020050
  69 +199910090047
  70 +199910090049
  71 +199910090051
  72 +199910110055
  73 +199910110057
  74 +199910210058
  75 +199910210059
  76 +199910270041
  77 +199910280054
  78 +199910280055
  79 +199910280057
  80 +199910300026
  81 +199911030039
  82 +199911030040
  83 +199911030041
  84 +199911060031
  85 +199911060042
  86 +199911060043
  87 +199911080054
  88 +199911080055
  89 +199911080056
  90 +199911100061
  91 +199911100062
  92 +199911100063
  93 +199911130036
  94 +199911130037
  95 +199911130038
  96 +199911180042
  97 +199911180043
  98 +199911180044
  99 +199911220059
  100 +199911220061
  101 +199911220066
  102 +199911230041
  103 +199911240035
  104 +199911240037
  105 +199911240038
  106 +199911250055
  107 +199911250057
  108 +199912020059
  109 +199912090045
  110 +199912090047
  111 +199912090061
  112 +199912110041
  113 +199912110042
  114 +199912130055
  115 +199912130057
  116 +199912170065
  117 +199912180052
  118 +199912210018
  119 +199912210037
  120 +199912210040
  121 +199912220045
  122 +199912220046
  123 +199912220047
  124 +199912230058
  125 +199912230059
  126 +199912230097
  127 +199912280028
  128 +199912280044
  129 +199912280045
  130 +199912310085
  131 +199912310087
  132 +200001030047
  133 +200001030106
  134 +200001040030
  135 +200001040031
  136 +200001060052
  137 +200001060053
  138 +200001060055
  139 +200001070062
  140 +200001070066
  141 +200001080040
  142 +200001080041
  143 +200001140061
  144 +200001140064
  145 +200001170049
  146 +200001170051
  147 +200001170052
  148 +200001170053
  149 +200001180040
  150 +200001200056
  151 +200001220023
  152 +200001220118
  153 +200001240016
  154 +200001290042
  155 +200001310048
  156 +200001310049
  157 +200001310050
  158 +200001310054
  159 +200002090042
  160 +200002090043
  161 +200002120045
  162 +200002120046
  163 +200002160046
  164 +200002160047
  165 +200002250063
  166 +200002250065
  167 +200002250066
  168 +200002290044
  169 +200002290045
  170 +200002290046
  171 +200002290047
  172 +200002290048
  173 +200003010058
  174 +200003010059
  175 +200003060054
  176 +200003060055
  177 +200003060057
  178 +200003110047
  179 +200003110048
  180 +200003110049
  181 +200003210044
  182 +200003210045
  183 +200004120021
  184 +200004120022
  185 +200004120023
  186 +200004150048
  187 +200004150049
  188 +200004150050
  189 +200004170026
  190 +200004170065
  191 +200004220044
  192 +200004220045
  193 +200004220046
  194 +200004220047
  195 +200004220048
  196 +200005060030
  197 +200005150055
  198 +200005150059
  199 +200005300045
  200 +200005300047
  201 +200005300048
  202 +200006010065
  203 +200006010066
  204 +200006010067
  205 +200006050056
  206 +200006050057
  207 +200006050058
  208 +200006050059
  209 +200006050061
  210 +200006050068
  211 +200006070056
  212 +200006080033
  213 +200006120031
  214 +200006130055
  215 +200006130057
  216 +200006130059
  217 +200006260069
  218 +200006260071
  219 +200006270059
  220 +200007120068
  221 +200007120070
  222 +200007120072
  223 +200007170026
  224 +200007180051
  225 +200007240034
  226 +200007270050
  227 +200007280033
  228 +200008040071
  229 +200008040073
  230 +200008250077
  231 +200008250079
  232 +200008260055
  233 +200008310046
  234 +200010120066
  235 +200010120074
  236 +200010130063
  237 +200010140048
  238 +200010140049
  239 +200010160039
  240 +200010160048
  241 +200010160049
  242 +200010180059
  243 +200010180063
  244 +200010190066
  245 +200010190068
  246 +200011210063
  247 +200011210064
  248 +200011210066
  249 +200012050066
  250 +200012050067
  251 +200012050068
  252 +200012050069
  253 +200012050070
  254 +200012050071
  255 +200012080134
  256 +200012080137
  257 +200012110069
  258 +200012110070
  259 +200012110071
  260 +200012110075
  261 +200012120028
  262 +200012120068
  263 +200012120072
  264 +200012130056
  265 +200012130100
  266 +200012130102
  267 +200012130103
  268 +200012140095
  269 +200012140096
  270 +200012140097
  271 +200012140098
  272 +200012140099
  273 +200012140100
  274 +200012150076
  275 +200012160048
  276 +200012160049
  277 +200012180083
  278 +200012180084
  279 +200012180088
  280 +200012230028
  281 +200012230045
  282 +200012230046
  283 +200012230047
  284 +200012230048
  285 +200012230050
  286 +200012270055
  287 +200012270056
  288 +200101020059
  289 +200101020062
  290 +200101020063
  291 +200101020075
  292 +200101130048
  293 +200101130050
  294 +200101130051
  295 +200101130055
  296 +200101150043
  297 +200101150045
  298 +200101180050
  299 +200101180051
  300 +200101180052
  301 +200101200048
  302 +200101220047
  303 +200101220053
  304 +200102070011
  305 +200102070016
  306 +200102120034
  307 +200102120057
  308 +200102130014
  309 +200102150001
  310 +200102150014
  311 +200102160011
  312 +200102190016
  313 +200102220001
  314 +200102220013
  315 +200102270041
  316 +200102270062
  317 +200102280169
  318 +200103010049
  319 +200103060022
  320 +200103060032
  321 +200103060057
  322 +200103080026
  323 +200103080030
  324 +200103080036
  325 +200103100019
  326 +200103100021
  327 +200103100058
  328 +200103100062
  329 +200103130008
  330 +200103130023
  331 +200103130069
  332 +200103200066
  333 +200103200080
  334 +200103270069
  335 +200103310092
  336 +200104020007
  337 +200104050011
  338 +200104100021
  339 +200104100023
  340 +200104170015
  341 +200104170040
  342 +200104170055
  343 +200104170057
  344 +200104190039
  345 +200104190066
  346 +200104230031
  347 +200104230069
  348 +200104260051
  349 +200104260053
  350 +200104300213
  351 +200104300215
  352 +200104300217
  353 +200105020092
  354 +200105050042
  355 +200105050043
  356 +200105050046
  357 +200105050048
  358 +200105070017
  359 +200105140050
  360 +200105140052
  361 +200105220096
  362 +200105290074
  363 +200105290075
  364 +200106120068
  365 +200106120069
  366 +200106180051
  367 +200106180053
  368 +200106200064
  369 +200106220086
  370 +200106220087
  371 +200106220088
  372 +200106220090
  373 +200106250050
  374 +200107120071
  375 +200107120073
  376 +200107210129
  377 +200107240070
  378 +200107250080
  379 +200108060051
  380 +200108060155
  381 +200108060156
  382 +200108060157
  383 +200108070038
  384 +200108160040
  385 +200108180123
  386 +200108200033
  387 +200108210066
  388 +200108210074
  389 +200108270077
  390 +200108280064
  391 +200109060061
  392 +200109130091
  393 +200109250092
  394 +200109260097
  395 +200109270116
  396 +200110020075
  397 +200110150056
  398 +200110150062
  399 +200110200070
  400 +200110200071
  401 +200110220068
  402 +200111080086
  403 +200111140055
  404 +200111210078
  405 +200111240060
  406 +200112040031
  407 +200112040077
  408 +200112050063
  409 +200112100041
  410 +200112190067
  411 +200201280011
  412 +200201290029
  413 +200202280078
  414 +200203280057
  415 +200203290107
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt 0 → 100644
  1 +199704210012
  2 +199704210042
  3 +199704220007
  4 +199704220018
  5 +199704220021
  6 +199704220044
  7 +199704230006
  8 +199704230014
  9 +199704230029
  10 +199704230043
  11 +199704240008
  12 +199704240019
  13 +199704240020
  14 +199704240021
  15 +199704250018
  16 +199704250022
  17 +199704260014
  18 +199704260015
  19 +199704260016
  20 +199704280023
  21 +199704280025
  22 +199704280027
  23 +199704280031
  24 +199704300031
  25 +199704300042
  26 +199704300046
  27 +199801020010
  28 +199801020031
  29 +199801020035
  30 +199801020070
  31 +199801020076
  32 +199801020079
  33 +199801030068
  34 +199801030090
  35 +199801030091
  36 +199801030129
  37 +199801030148
  38 +199801030158
  39 +199801050023
  40 +199801050059
  41 +199801130087
  42 +199801130129
  43 +199801140182
  44 +199801160119
  45 +199801200106
  46 +199801220140
  47 +199801240061
  48 +199801240096
  49 +199801260047
  50 +199801260070
  51 +199801270055
  52 +199801270110
  53 +199801280123
  54 +199801280158
  55 +199801280159
  56 +199801280241
  57 +199801290022
  58 +199801310003
  59 +199801310037
  60 +199802030127
  61 +199802040159
  62 +199802040182
  63 +199802040202
  64 +199805220133
  65 +199808280158
  66 +199901190073
  67 +199901190115
  68 +199901250112
  69 +199901250117
  70 +199901270103
  71 +199901270120
  72 +199901270122
  73 +199901290095
  74 +199901300101
  75 +199902240095
  76 +199906220029
  77 +199906230024
  78 +199906240084
  79 +199906260027
  80 +199907050045
  81 +199907050076
  82 +199907140166
  83 +199907200002
  84 +199907270004
  85 +199908260001
  86 +199909090036
  87 +199909250018
  88 +199909270029
  89 +199910020027
  90 +199910020029
  91 +199910270011
  92 +199911060044
  93 +199911100038
  94 +199911100064
  95 +199911200030
  96 +199911220063
  97 +199912020060
  98 +199912180026
  99 +199912180034
  100 +199912220030
  101 +199912280024
  102 +199912280046
  103 +199912300021
  104 +199912300029
  105 +200001030029
  106 +200001030053
  107 +200001060034
  108 +200001100035
  109 +200001100046
  110 +200001170029
  111 +200001170033
  112 +200001170060
  113 +200001290045
  114 +200002220027
  115 +200002240034
  116 +200002250031
  117 +200003060062
  118 +200003110050
  119 +200004280047
  120 +200004290022
  121 +200006050119
  122 +200006260079
  123 +200006290045
  124 +200007150033
  125 +200008040076
  126 +200008220042
  127 +200008220046
  128 +200010130049
  129 +200010160054
  130 +200012130034
  131 +200012140084
  132 +200012290046
  133 +200104040019
  134 +200106050035
  135 +200108180109
  136 +200108300032
  137 +200111120045
  138 +200111150042
  139 +200111150047
  140 +200111200036
  141 +200111270049
  142 +200112030055
  143 +200112280057
  144 +200201220038
  145 +200201220050
  146 +200202020036
  147 +200202200032
  148 +200202210054
  149 +200202270044
  150 +200203010070
  151 +200203190026
  152 +200203260050
  153 +200203280017
  154 +200203290078
nicolas-core/src/main/resources/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessTest.java renamed to nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; 1 package pl.waw.ipipan.zil.summ.nicolas.train.multiservice;
2 2
  3 +import com.google.common.collect.Lists;
  4 +import org.junit.ClassRule;
3 import org.junit.Test; 5 import org.junit.Test;
  6 +import org.junit.rules.TemporaryFolder;
  7 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
5 9
6 import java.io.File; 10 import java.io.File;
  11 +import java.util.List;
  12 +import java.util.stream.Collectors;
  13 +
  14 +import static junit.framework.TestCase.assertEquals;
  15 +
  16 +public class NLPProcessIT {
  17 +
  18 + @ClassRule
  19 + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder();
7 20
8 -public class NLPProcessTest {  
9 @Test 21 @Test
10 public void shouldProcessSampleText() throws Exception { 22 public void shouldProcessSampleText() throws Exception {
11 String text = "Ala ma kota. Ala ma też psa."; 23 String text = "Ala ma kota. Ala ma też psa.";
12 TText processed = NLPProcess.annotate(text); 24 TText processed = NLPProcess.annotate(text);
13 - processed.getParagraphs().stream().flatMap(p->p.getSentences().stream()).forEach(s->System.out.println(s.getId()));  
14 - File targetFile = new File("sample_serialized_text.bin"); 25 + List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList());
  26 + assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids);
  27 +
  28 + File targetFile = TEMPORARY_FOLDER.newFile();
15 NLPProcess.serialize(processed, targetFile); 29 NLPProcess.serialize(processed, targetFile);
16 } 30 }
17 } 31 }
18 \ No newline at end of file 32 \ No newline at end of file
@@ -11,7 +11,7 @@ @@ -11,7 +11,7 @@
11 <packaging>pom</packaging> 11 <packaging>pom</packaging>
12 12
13 <modules> 13 <modules>
14 - <module>nicolas-core</module> 14 + <module>nicolas-lib</module>
15 <module>nicolas-cli</module> 15 <module>nicolas-cli</module>
16 <module>nicolas-model</module> 16 <module>nicolas-model</module>
17 <module>nicolas-train</module> 17 <module>nicolas-train</module>
@@ -26,12 +26,13 @@ @@ -26,12 +26,13 @@
26 <utils.version>1.0</utils.version> 26 <utils.version>1.0</utils.version>
27 27
28 <commons-csv.version>1.4</commons-csv.version> 28 <commons-csv.version>1.4</commons-csv.version>
29 - <guava.version>19.0</guava.version>  
30 - <weka-dev.version>3.9.0</weka-dev.version> 29 + <guava.version>20.0</guava.version>
  30 + <weka-dev.version>3.9.1</weka-dev.version>
31 <commons-lang3.version>3.5</commons-lang3.version> 31 <commons-lang3.version>3.5</commons-lang3.version>
32 <commons-io.version>2.5</commons-io.version> 32 <commons-io.version>2.5</commons-io.version>
33 - <slf4j-api.version>1.7.12</slf4j-api.version> 33 + <slf4j-api.version>1.7.22</slf4j-api.version>
34 <junit.version>4.12</junit.version> 34 <junit.version>4.12</junit.version>
  35 + <zip4j.version>1.3.2</zip4j.version>
35 </properties> 36 </properties>
36 37
37 <prerequisites> 38 <prerequisites>
@@ -65,6 +66,16 @@ @@ -65,6 +66,16 @@
65 <artifactId>nicolas-zero</artifactId> 66 <artifactId>nicolas-zero</artifactId>
66 <version>${project.version}</version> 67 <version>${project.version}</version>
67 </dependency> 68 </dependency>
  69 + <dependency>
  70 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  71 + <artifactId>nicolas-lib</artifactId>
  72 + <version>${project.version}</version>
  73 + </dependency>
  74 + <dependency>
  75 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  76 + <artifactId>nicolas-train</artifactId>
  77 + <version>${project.version}</version>
  78 + </dependency>
68 79
69 <!-- internal --> 80 <!-- internal -->
70 <dependency> 81 <dependency>
@@ -93,6 +104,12 @@ @@ -93,6 +104,12 @@
93 <groupId>nz.ac.waikato.cms.weka</groupId> 104 <groupId>nz.ac.waikato.cms.weka</groupId>
94 <artifactId>weka-dev</artifactId> 105 <artifactId>weka-dev</artifactId>
95 <version>${weka-dev.version}</version> 106 <version>${weka-dev.version}</version>
  107 + <exclusions>
  108 + <exclusion>
  109 + <groupId>org.slf4j</groupId>
  110 + <artifactId>slf4j-simple</artifactId>
  111 + </exclusion>
  112 + </exclusions>
96 </dependency> 113 </dependency>
97 <dependency> 114 <dependency>
98 <groupId>org.apache.commons</groupId> 115 <groupId>org.apache.commons</groupId>
@@ -104,6 +121,11 @@ @@ -104,6 +121,11 @@
104 <artifactId>commons-io</artifactId> 121 <artifactId>commons-io</artifactId>
105 <version>${commons-io.version}</version> 122 <version>${commons-io.version}</version>
106 </dependency> 123 </dependency>
  124 + <dependency>
  125 + <groupId>net.lingala.zip4j</groupId>
  126 + <artifactId>zip4j</artifactId>
  127 + <version>${zip4j.version}</version>
  128 + </dependency>
107 129
108 <!-- logging --> 130 <!-- logging -->
109 <dependency> 131 <dependency>
@@ -111,6 +133,11 @@ @@ -111,6 +133,11 @@
111 <artifactId>slf4j-api</artifactId> 133 <artifactId>slf4j-api</artifactId>
112 <version>${slf4j-api.version}</version> 134 <version>${slf4j-api.version}</version>
113 </dependency> 135 </dependency>
  136 + <dependency>
  137 + <groupId>org.slf4j</groupId>
  138 + <artifactId>slf4j-simple</artifactId>
  139 + <version>${slf4j-api.version}</version>
  140 + </dependency>
114 141
115 <!-- test --> 142 <!-- test -->
116 <dependency> 143 <dependency>