Commit 76eeceb70c81d7fdfe3860db5a8576f0e4234daf
1 parent
f04fcb1a
large refactor
Showing
60 changed files
with
1238 additions
and
478 deletions
.gitignore
nicolas-common/pom.xml
@@ -27,6 +27,10 @@ | @@ -27,6 +27,10 @@ | ||
27 | <groupId>nz.ac.waikato.cms.weka</groupId> | 27 | <groupId>nz.ac.waikato.cms.weka</groupId> |
28 | <artifactId>weka-dev</artifactId> | 28 | <artifactId>weka-dev</artifactId> |
29 | </dependency> | 29 | </dependency> |
30 | + <dependency> | ||
31 | + <groupId>commons-io</groupId> | ||
32 | + <artifactId>commons-io</artifactId> | ||
33 | + </dependency> | ||
30 | 34 | ||
31 | <!-- logging --> | 35 | <!-- logging --> |
32 | <dependency> | 36 | <dependency> |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
@@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common; | @@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common; | ||
2 | 2 | ||
3 | import com.google.common.base.Charsets; | 3 | import com.google.common.base.Charsets; |
4 | import com.google.common.collect.ImmutableList; | 4 | import com.google.common.collect.ImmutableList; |
5 | -import weka.classifiers.Classifier; | ||
6 | -import weka.classifiers.functions.SMO; | ||
7 | -import weka.classifiers.meta.AdaBoostM1; | ||
8 | -import weka.classifiers.meta.AttributeSelectedClassifier; | ||
9 | -import weka.classifiers.rules.JRip; | ||
10 | -import weka.classifiers.trees.J48; | ||
11 | -import weka.classifiers.trees.RandomForest; | ||
12 | 5 | ||
13 | import java.nio.charset.Charset; | 6 | import java.nio.charset.Charset; |
14 | 7 | ||
15 | 8 | ||
16 | public class Constants { | 9 | public class Constants { |
17 | 10 | ||
18 | - public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; | ||
19 | - public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; | ||
20 | - public static final String ZERO_MODEL_PATH = "zeros_model.bin"; | 11 | + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; |
21 | 12 | ||
22 | - public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; | ||
23 | - public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; | ||
24 | - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | 13 | + private static final String MODELS_PATH = ROOT_PATH + "models/"; |
14 | + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; | ||
15 | + public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; | ||
16 | + public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; | ||
17 | + | ||
18 | + private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; | ||
19 | + public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; | ||
25 | 20 | ||
26 | public static final Charset ENCODING = Charsets.UTF_8; | 21 | public static final Charset ENCODING = Charsets.UTF_8; |
27 | 22 | ||
@@ -30,24 +25,4 @@ public class Constants { | @@ -30,24 +25,4 @@ public class Constants { | ||
30 | private Constants() { | 25 | private Constants() { |
31 | } | 26 | } |
32 | 27 | ||
33 | - public static Classifier getMentionClassifier() { | ||
34 | - RandomForest classifier = new RandomForest(); | ||
35 | - classifier.setNumIterations(250); | ||
36 | - classifier.setSeed(0); | ||
37 | - classifier.setNumExecutionSlots(8); | ||
38 | - return classifier; | ||
39 | - } | ||
40 | - | ||
41 | - public static Classifier getSentencesClassifier() { | ||
42 | - RandomForest classifier = new RandomForest(); | ||
43 | - classifier.setNumIterations(10); | ||
44 | - classifier.setSeed(0); | ||
45 | - classifier.setNumExecutionSlots(8); | ||
46 | - return classifier; | ||
47 | - } | ||
48 | - | ||
49 | - public static Classifier getZerosClassifier() { | ||
50 | - Classifier classifier = new J48(); | ||
51 | - return classifier; | ||
52 | - } | ||
53 | } | 28 | } |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common; | @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common; | ||
3 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | import com.google.common.collect.Sets; | 5 | import com.google.common.collect.Sets; |
6 | +import org.apache.commons.io.IOUtils; | ||
6 | import org.slf4j.Logger; | 7 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 8 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
@@ -24,6 +25,47 @@ public class Utils { | @@ -24,6 +25,47 @@ public class Utils { | ||
24 | 25 | ||
25 | private static final String DATASET_NAME = "Dataset"; | 26 | private static final String DATASET_NAME = "Dataset"; |
26 | 27 | ||
28 | + private Utils() { | ||
29 | + } | ||
30 | + | ||
31 | + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | ||
32 | + LOG.info("Loading classifier from path: {}...", modelResourcePath); | ||
33 | + try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { | ||
34 | + if (stream == null) { | ||
35 | + throw new IOException("Model not found at: " + modelResourcePath); | ||
36 | + } | ||
37 | + try (ObjectInputStream ois = new ObjectInputStream(stream)) { | ||
38 | + Classifier classifier = (Classifier) ois.readObject(); | ||
39 | + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | ||
40 | + return classifier; | ||
41 | + } catch (ClassNotFoundException e) { | ||
42 | + LOG.error("Error loading serialized classifier, class not found.", e); | ||
43 | + throw new IOException(e); | ||
44 | + } | ||
45 | + } | ||
46 | + } | ||
47 | + | ||
48 | + public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { | ||
49 | + try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { | ||
50 | + if (stream == null) { | ||
51 | + throw new IOException("Resource not found at: " + textResourcePath); | ||
52 | + } | ||
53 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | ||
54 | + return (TText) ois.readObject(); | ||
55 | + } catch (ClassNotFoundException e) { | ||
56 | + LOG.error("Error reading serialized thrift text file, class not found.", e); | ||
57 | + throw new IOException(e); | ||
58 | + } | ||
59 | + } | ||
60 | + } | ||
61 | + | ||
62 | + public static List<String> loadLinesFromResource(String resourcePath) throws IOException { | ||
63 | + try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { | ||
64 | + return IOUtils.readLines(stream, Constants.ENCODING); | ||
65 | + } | ||
66 | + } | ||
67 | + | ||
68 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | ||
27 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | 69 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
28 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); | 70 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
29 | instances.setClassIndex(0); | 71 | instances.setClassIndex(0); |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
@@ -8,10 +8,12 @@ import java.io.ObjectStreamClass; | @@ -8,10 +8,12 @@ import java.io.ObjectStreamClass; | ||
8 | 8 | ||
9 | public class VersionIgnoringObjectInputStream extends ObjectInputStream { | 9 | public class VersionIgnoringObjectInputStream extends ObjectInputStream { |
10 | 10 | ||
11 | - public VersionIgnoringObjectInputStream(InputStream in) throws IOException { | 11 | + VersionIgnoringObjectInputStream(InputStream in) throws IOException { |
12 | super(in); | 12 | super(in); |
13 | } | 13 | } |
14 | 14 | ||
15 | + @Override | ||
16 | + @SuppressWarnings("squid:S1166") | ||
15 | protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { | 17 | protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { |
16 | ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor | 18 | ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor |
17 | Class localClass; // the class in the local JVM that this descriptor represents. | 19 | Class localClass; // the class in the local JVM that this descriptor represents. |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
@@ -17,6 +17,7 @@ public class FeatureExtractor { | @@ -17,6 +17,7 @@ public class FeatureExtractor { | ||
17 | 17 | ||
18 | private final Set<String> normalizedAttributes = Sets.newHashSet(); | 18 | private final Set<String> normalizedAttributes = Sets.newHashSet(); |
19 | 19 | ||
20 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | ||
20 | public ArrayList<Attribute> getAttributesList() { | 21 | public ArrayList<Attribute> getAttributesList() { |
21 | return Lists.newArrayList(sortedAttributes); | 22 | return Lists.newArrayList(sortedAttributes); |
22 | } | 23 | } |
@@ -46,15 +47,14 @@ public class FeatureExtractor { | @@ -46,15 +47,14 @@ public class FeatureExtractor { | ||
46 | protected void fillSortedAttributes(String scoreAttName) { | 47 | protected void fillSortedAttributes(String scoreAttName) { |
47 | sortedAttributes.addAll(name2attribute.values()); | 48 | sortedAttributes.addAll(name2attribute.values()); |
48 | sortedAttributes.remove(getAttributeByName(scoreAttName)); | 49 | sortedAttributes.remove(getAttributeByName(scoreAttName)); |
49 | - Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2))); | 50 | + sortedAttributes.sort(Comparator.comparing(name2attribute.inverse()::get)); |
50 | sortedAttributes.add(0, getAttributeByName(scoreAttName)); | 51 | sortedAttributes.add(0, getAttributeByName(scoreAttName)); |
51 | } | 52 | } |
52 | 53 | ||
53 | protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { | 54 | protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { |
54 | Map<Attribute, Double> attribute2max = Maps.newHashMap(); | 55 | Map<Attribute, Double> attribute2max = Maps.newHashMap(); |
55 | Map<Attribute, Double> attribute2min = Maps.newHashMap(); | 56 | Map<Attribute, Double> attribute2min = Maps.newHashMap(); |
56 | - for (T entity : entity2attributes.keySet()) { | ||
57 | - Map<Attribute, Double> entityAttributes = entity2attributes.get(entity); | 57 | + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { |
58 | for (String attributeName : normalizedAttributes) { | 58 | for (String attributeName : normalizedAttributes) { |
59 | Attribute attribute = getAttributeByName(attributeName); | 59 | Attribute attribute = getAttributeByName(attributeName); |
60 | Double value = entityAttributes.get(attribute); | 60 | Double value = entityAttributes.get(attribute); |
@@ -66,8 +66,7 @@ public class FeatureExtractor { | @@ -66,8 +66,7 @@ public class FeatureExtractor { | ||
66 | attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); | 66 | attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); |
67 | } | 67 | } |
68 | } | 68 | } |
69 | - for (T mention : entity2attributes.keySet()) { | ||
70 | - Map<Attribute, Double> entityAttributes = entity2attributes.get(mention); | 69 | + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { |
71 | for (Attribute attribute : attribute2max.keySet()) { | 70 | for (Attribute attribute : attribute2max.keySet()) { |
72 | Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); | 71 | Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); |
73 | entityAttributes.put(normalizedAttribute, | 72 | entityAttributes.put(normalizedAttribute, |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
@@ -174,11 +174,11 @@ public class FeatureHelper { | @@ -174,11 +174,11 @@ public class FeatureHelper { | ||
174 | } | 174 | } |
175 | 175 | ||
176 | public boolean isNested(TMention mention) { | 176 | public boolean isNested(TMention mention) { |
177 | - return mentions.stream().anyMatch(m -> m.getChildIds().containsAll(mention.getChildIds())); | 177 | + return mentions.stream().anyMatch(m -> !m.equals(mention) && m.getChildIds().containsAll(mention.getChildIds())); |
178 | } | 178 | } |
179 | 179 | ||
180 | public boolean isNesting(TMention mention) { | 180 | public boolean isNesting(TMention mention) { |
181 | - return mentions.stream().anyMatch(m -> mention.getChildIds().containsAll(m.getChildIds())); | 181 | + return mentions.stream().anyMatch(m -> !m.equals(mention) && mention.getChildIds().containsAll(m.getChildIds())); |
182 | } | 182 | } |
183 | 183 | ||
184 | public Set<TCoreference> getClusters() { | 184 | public Set<TCoreference> getClusters() { |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
@@ -33,6 +33,7 @@ public class Interpretation { | @@ -33,6 +33,7 @@ public class Interpretation { | ||
33 | person = split[3]; | 33 | person = split[3]; |
34 | break; | 34 | break; |
35 | case "siebie": | 35 | case "siebie": |
36 | + case "prep": | ||
36 | casee = split[0]; | 37 | casee = split[0]; |
37 | break; | 38 | break; |
38 | case "fin": | 39 | case "fin": |
@@ -47,9 +48,6 @@ public class Interpretation { | @@ -47,9 +48,6 @@ public class Interpretation { | ||
47 | number = split[0]; | 48 | number = split[0]; |
48 | gender = split[1]; | 49 | gender = split[1]; |
49 | break; | 50 | break; |
50 | - case "prep": | ||
51 | - casee = split[0]; | ||
52 | - break; | ||
53 | default: | 51 | default: |
54 | break; | 52 | break; |
55 | } | 53 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | - | ||
3 | -import org.apache.commons.lang3.time.StopWatch; | ||
4 | -import org.slf4j.Logger; | ||
5 | -import org.slf4j.LoggerFactory; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | -import weka.classifiers.Classifier; | ||
8 | -import weka.core.Instances; | ||
9 | -import weka.core.converters.ArffLoader; | ||
10 | - | ||
11 | -import java.io.File; | ||
12 | -import java.io.FileOutputStream; | ||
13 | -import java.io.ObjectOutputStream; | ||
14 | - | ||
15 | - | ||
16 | -public class TrainModel { | ||
17 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | - | ||
19 | - public static void main(String[] args) throws Exception { | ||
20 | - | ||
21 | - ArffLoader loader = new ArffLoader(); | ||
22 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
23 | - Instances instances = loader.getDataSet(); | ||
24 | - instances.setClassIndex(0); | ||
25 | - LOG.info(instances.size() + " instances loaded."); | ||
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | - | ||
28 | - StopWatch watch = new StopWatch(); | ||
29 | - watch.start(); | ||
30 | - | ||
31 | - Classifier classifier = Constants.getMentionClassifier(); | ||
32 | - | ||
33 | - LOG.info("Building classifier..."); | ||
34 | - classifier.buildClassifier(instances); | ||
35 | - LOG.info("...done."); | ||
36 | - | ||
37 | - try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | - new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
39 | - oos.writeObject(classifier); | ||
40 | - } | ||
41 | - | ||
42 | - watch.stop(); | ||
43 | - LOG.info("Elapsed time: " + watch); | ||
44 | - | ||
45 | - LOG.info(classifier.toString()); | ||
46 | - } | ||
47 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | - | ||
3 | -import org.slf4j.Logger; | ||
4 | -import org.slf4j.LoggerFactory; | ||
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | ||
7 | -import weka.core.Instances; | ||
8 | -import weka.core.converters.ArffLoader; | ||
9 | - | ||
10 | -import java.io.File; | ||
11 | - | ||
12 | - | ||
13 | -public class Crossvalidate { | ||
14 | - | ||
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
16 | - | ||
17 | - private Crossvalidate() { | ||
18 | - } | ||
19 | - | ||
20 | - public static void main(String[] args) throws Exception { | ||
21 | - ArffLoader loader = new ArffLoader(); | ||
22 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
23 | - Instances instances = loader.getDataSet(); | ||
24 | - instances.setClassIndex(0); | ||
25 | - LOG.info(instances.size() + " instances loaded."); | ||
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | - | ||
28 | - EvalUtils.crossvalidateClassification(instances); | ||
29 | - } | ||
30 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention.test; | ||
2 | - | ||
3 | -import org.apache.commons.lang3.time.StopWatch; | ||
4 | -import org.slf4j.Logger; | ||
5 | -import org.slf4j.LoggerFactory; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | -import weka.classifiers.Classifier; | ||
8 | -import weka.classifiers.evaluation.Evaluation; | ||
9 | -import weka.core.Instances; | ||
10 | -import weka.core.converters.ArffLoader; | ||
11 | - | ||
12 | -import java.io.File; | ||
13 | -import java.io.FileInputStream; | ||
14 | -import java.io.IOException; | ||
15 | -import java.io.ObjectInputStream; | ||
16 | - | ||
17 | - | ||
18 | -public class Validate { | ||
19 | - private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | ||
20 | - | ||
21 | - public static void main(String[] args) throws Exception { | ||
22 | - | ||
23 | - ArffLoader loader = new ArffLoader(); | ||
24 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | ||
25 | - Instances instances = loader.getDataSet(); | ||
26 | - instances.setClassIndex(0); | ||
27 | - LOG.info(instances.size() + " instances loaded."); | ||
28 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
29 | - | ||
30 | - Classifier classifier = loadClassifier(); | ||
31 | - | ||
32 | - StopWatch watch = new StopWatch(); | ||
33 | - watch.start(); | ||
34 | - | ||
35 | - Evaluation eval = new Evaluation(instances); | ||
36 | - eval.evaluateModel(classifier, instances); | ||
37 | - | ||
38 | - LOG.info(eval.toSummaryString()); | ||
39 | - | ||
40 | - watch.stop(); | ||
41 | - LOG.info("Elapsed time: " + watch); | ||
42 | - } | ||
43 | - | ||
44 | - private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | ||
45 | - LOG.info("Loading classifier..."); | ||
46 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | ||
47 | - Classifier classifier = (Classifier) ois.readObject(); | ||
48 | - LOG.info("Done. " + classifier.toString()); | ||
49 | - return classifier; | ||
50 | - } | ||
51 | - } | ||
52 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | - | ||
3 | -import org.apache.commons.lang3.time.StopWatch; | ||
4 | -import org.slf4j.Logger; | ||
5 | -import org.slf4j.LoggerFactory; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | -import weka.classifiers.Classifier; | ||
8 | -import weka.core.Instances; | ||
9 | -import weka.core.converters.ArffLoader; | ||
10 | - | ||
11 | -import java.io.File; | ||
12 | -import java.io.FileOutputStream; | ||
13 | -import java.io.ObjectOutputStream; | ||
14 | - | ||
15 | - | ||
16 | -public class TrainModel { | ||
17 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | ||
18 | - | ||
19 | - public static void main(String[] args) throws Exception { | ||
20 | - | ||
21 | - ArffLoader loader = new ArffLoader(); | ||
22 | - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
23 | - Instances instances = loader.getDataSet(); | ||
24 | - instances.setClassIndex(0); | ||
25 | - LOG.info(instances.size() + " instances loaded."); | ||
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
27 | - | ||
28 | - StopWatch watch = new StopWatch(); | ||
29 | - watch.start(); | ||
30 | - | ||
31 | - Classifier classifier = Constants.getSentencesClassifier(); | ||
32 | - | ||
33 | - LOG.info("Building classifier..."); | ||
34 | - classifier.buildClassifier(instances); | ||
35 | - LOG.info("...done."); | ||
36 | - | ||
37 | - try (ObjectOutputStream oos = new ObjectOutputStream( | ||
38 | - new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | ||
39 | - oos.writeObject(classifier); | ||
40 | - } | ||
41 | - | ||
42 | - watch.stop(); | ||
43 | - LOG.info("Elapsed time: " + watch); | ||
44 | - | ||
45 | - LOG.info(classifier.toString()); | ||
46 | - } | ||
47 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | ||
2 | - | ||
3 | -import org.slf4j.Logger; | ||
4 | -import org.slf4j.LoggerFactory; | ||
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | ||
7 | -import weka.core.Instances; | ||
8 | -import weka.core.converters.ArffLoader; | ||
9 | - | ||
10 | -import java.io.File; | ||
11 | - | ||
12 | - | ||
13 | -public class Crossvalidate { | ||
14 | - | ||
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
16 | - | ||
17 | - private Crossvalidate() { | ||
18 | - } | ||
19 | - | ||
20 | - public static void main(String[] args) throws Exception { | ||
21 | - | ||
22 | - ArffLoader loader = new ArffLoader(); | ||
23 | - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | ||
24 | - Instances instances = loader.getDataSet(); | ||
25 | - instances.setClassIndex(0); | ||
26 | - LOG.info(instances.size() + " instances loaded."); | ||
27 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | - | ||
29 | - EvalUtils.crossvalidateRegression(instances); | ||
30 | - } | ||
31 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero.test; | ||
2 | - | ||
3 | -import org.slf4j.Logger; | ||
4 | -import org.slf4j.LoggerFactory; | ||
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | ||
7 | -import weka.core.Instances; | ||
8 | -import weka.core.converters.ArffLoader; | ||
9 | - | ||
10 | -import java.io.File; | ||
11 | - | ||
12 | - | ||
13 | -public class Crossvalidate { | ||
14 | - | ||
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
16 | - | ||
17 | - private Crossvalidate() { | ||
18 | - } | ||
19 | - | ||
20 | - public static void main(String[] args) throws Exception { | ||
21 | - | ||
22 | - ArffLoader loader = new ArffLoader(); | ||
23 | - loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | ||
24 | - Instances instances = loader.getDataSet(); | ||
25 | - instances.setClassIndex(0); | ||
26 | - LOG.info(instances.size() + " instances loaded."); | ||
27 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | - | ||
29 | - EvalUtils.crossvalidateClassification(instances); | ||
30 | - } | ||
31 | -} |
nicolas-core/pom.xml renamed to nicolas-lib/pom.xml
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | @@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
13 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; | 13 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; |
14 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
14 | import weka.classifiers.Classifier; | 15 | import weka.classifiers.Classifier; |
15 | 16 | ||
16 | import java.io.IOException; | 17 | import java.io.IOException; |
@@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList; | @@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList; | ||
20 | 21 | ||
21 | public class Nicolas { | 22 | public class Nicolas { |
22 | 23 | ||
23 | - private final Classifier sentenceClassifier; | ||
24 | - private final Classifier mentionClassifier; | ||
25 | - private final MentionFeatureExtractor featureExtractor; | 24 | + private final Classifier mentionModel; |
25 | + private final Classifier sentenceModel; | ||
26 | + private final Classifier zeroModel; | ||
27 | + | ||
28 | + private final MentionFeatureExtractor mentionFeatureExtractor; | ||
26 | private final SentenceFeatureExtractor sentenceFeatureExtractor; | 29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | + private final ZeroFeatureExtractor zeroFeatureExtractor; | ||
27 | 31 | ||
28 | public Nicolas() throws IOException, ClassNotFoundException { | 32 | public Nicolas() throws IOException, ClassNotFoundException { |
29 | - mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | ||
30 | - featureExtractor = new MentionFeatureExtractor(); | 33 | + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
34 | + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
35 | + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
31 | 36 | ||
32 | - sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | 37 | + mentionFeatureExtractor = new MentionFeatureExtractor(); |
33 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 38 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
39 | + zeroFeatureExtractor = new ZeroFeatureExtractor(); | ||
34 | } | 40 | } |
35 | 41 | ||
36 | public String summarizeThrift(TText text, int targetTokenCount) throws Exception { | 42 | public String summarizeThrift(TText text, int targetTokenCount) throws Exception { |
37 | Set<TMention> goodMentions | 43 | Set<TMention> goodMentions |
38 | - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | 44 | + = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); |
39 | return calculateSummary(text, goodMentions, targetTokenCount); | 45 | return calculateSummary(text, goodMentions, targetTokenCount); |
40 | } | 46 | } |
41 | 47 | ||
@@ -52,10 +58,10 @@ public class Nicolas { | @@ -52,10 +58,10 @@ public class Nicolas { | ||
52 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { | 58 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { |
53 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 59 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
54 | 60 | ||
55 | - Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceClassifier, sentenceFeatureExtractor); | 61 | + Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); |
56 | 62 | ||
57 | List<TSentence> sortedSents = Lists.newArrayList(sents); | 63 | List<TSentence> sortedSents = Lists.newArrayList(sents); |
58 | - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | 64 | + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); |
59 | 65 | ||
60 | int size = 0; | 66 | int size = 0; |
61 | Random r = new Random(1); | 67 | Random r = new Random(1); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
1 | package pl.waw.ipipan.zil.summ.nicolas; | 1 | package pl.waw.ipipan.zil.summ.nicolas; |
2 | 2 | ||
3 | -import com.google.common.base.Charsets; | ||
4 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
5 | -import com.google.common.io.Files; | ||
6 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
14 | import weka.core.Attribute; | 11 | import weka.core.Attribute; |
15 | import weka.core.DenseInstance; | 12 | import weka.core.DenseInstance; |
16 | import weka.core.Instance; | 13 | import weka.core.Instance; |
17 | 14 | ||
18 | -import java.io.File; | ||
19 | -import java.io.IOException; | ||
20 | import java.util.List; | 15 | import java.util.List; |
21 | import java.util.Map; | 16 | import java.util.Map; |
22 | import java.util.Set; | 17 | import java.util.Set; |
@@ -30,16 +25,6 @@ public class ThriftUtils { | @@ -30,16 +25,6 @@ public class ThriftUtils { | ||
30 | private ThriftUtils() { | 25 | private ThriftUtils() { |
31 | } | 26 | } |
32 | 27 | ||
33 | - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | ||
34 | - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | ||
35 | - | ||
36 | - MentionScorer scorer = new MentionScorer(); | ||
37 | - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | ||
38 | - | ||
39 | - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | ||
40 | - return mention2score.keySet(); | ||
41 | - } | ||
42 | - | ||
43 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | 28 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { |
44 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 29 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
45 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | 30 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
@@ -26,18 +26,18 @@ import java.util.*; | @@ -26,18 +26,18 @@ import java.util.*; | ||
26 | 26 | ||
27 | import static java.util.stream.Collectors.toList; | 27 | import static java.util.stream.Collectors.toList; |
28 | 28 | ||
29 | -public class ApplyModel2 { | 29 | +public class ApplyModel { |
30 | 30 | ||
31 | - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); | 31 | + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); |
32 | 32 | ||
33 | private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; | 33 | private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; |
34 | private static final String TARGET_DIR = "corpora/summaries"; | 34 | private static final String TARGET_DIR = "corpora/summaries"; |
35 | 35 | ||
36 | public static void main(String[] args) throws Exception { | 36 | public static void main(String[] args) throws Exception { |
37 | - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | 37 | + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); |
38 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | 38 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
39 | 39 | ||
40 | - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | 40 | + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH); |
41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
42 | 42 | ||
43 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | 43 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); |
@@ -102,7 +102,7 @@ public class ApplyModel2 { | @@ -102,7 +102,7 @@ public class ApplyModel2 { | ||
102 | } | 102 | } |
103 | 103 | ||
104 | List<TSentence> sortedSents = Lists.newArrayList(sents); | 104 | List<TSentence> sortedSents = Lists.newArrayList(sents); |
105 | - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | 105 | + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); |
106 | 106 | ||
107 | int size = 0; | 107 | int size = 0; |
108 | Random r = new Random(1); | 108 | Random r = new Random(1); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
1 | package pl.waw.ipipan.zil.summ.nicolas.mention; | 1 | package pl.waw.ipipan.zil.summ.nicolas.mention; |
2 | 2 | ||
3 | -import com.google.common.collect.*; | 3 | +import com.google.common.collect.Lists; |
4 | +import com.google.common.collect.Maps; | ||
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
6 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
7 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; |
9 | import weka.core.Attribute; | 11 | import weka.core.Attribute; |
10 | 12 | ||
11 | -import java.io.File; | ||
12 | import java.io.IOException; | 13 | import java.io.IOException; |
13 | -import java.nio.file.Files; | ||
14 | -import java.util.*; | 14 | +import java.util.List; |
15 | +import java.util.Map; | ||
15 | import java.util.stream.Collectors; | 16 | import java.util.stream.Collectors; |
16 | -import java.util.stream.Stream; | ||
17 | 17 | ||
18 | 18 | ||
19 | public class MentionFeatureExtractor extends FeatureExtractor { | 19 | public class MentionFeatureExtractor extends FeatureExtractor { |
20 | 20 | ||
21 | - private final List<String> frequentBases = Lists.newArrayList(); | 21 | + private final List<String> frequentBases; |
22 | 22 | ||
23 | - public MentionFeatureExtractor() { | 23 | + public MentionFeatureExtractor() throws IOException { |
24 | + frequentBases = loadFrequentBases(); | ||
24 | 25 | ||
25 | //coref | 26 | //coref |
26 | addNumericAttributeNormalized("chain_length"); | 27 | addNumericAttributeNormalized("chain_length"); |
@@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
70 | addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); | 71 | addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); |
71 | 72 | ||
72 | // frequent bases | 73 | // frequent bases |
73 | - loadFrequentBases(); | ||
74 | for (String base : frequentBases) { | 74 | for (String base : frequentBases) { |
75 | addBinaryAttribute(prefix + "_" + encodeBase(base)); | 75 | addBinaryAttribute(prefix + "_" + encodeBase(base)); |
76 | } | 76 | } |
@@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
80 | fillSortedAttributes("score"); | 80 | fillSortedAttributes("score"); |
81 | } | 81 | } |
82 | 82 | ||
83 | - private String encodeBase(String base) { | ||
84 | - return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | 83 | + private List<String> loadFrequentBases() throws IOException { |
84 | + return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); | ||
85 | } | 85 | } |
86 | 86 | ||
87 | - private void loadFrequentBases() { | ||
88 | - try { | ||
89 | - Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath()); | ||
90 | - this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList())); | ||
91 | - } catch (IOException e) { | ||
92 | - e.printStackTrace(); | ||
93 | - } | 87 | + private String encodeBase(String base) { |
88 | + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | ||
94 | } | 89 | } |
95 | 90 | ||
96 | public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { | 91 | public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { |
@@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
123 | attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); | 118 | attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); |
124 | attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); | 119 | attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); |
125 | attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); | 120 | attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); |
126 | - | ||
127 | - assert (attribute2value.size() == getAttributesList().size()); | ||
128 | } | 121 | } |
129 | addNormalizedAttributeValues(result); | 122 | addNormalizedAttributeValues(result); |
130 | 123 |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor { | @@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor { | ||
87 | feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | 87 | feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); |
88 | 88 | ||
89 | feature2value.remove(null); | 89 | feature2value.remove(null); |
90 | - assert (feature2value.size() == getAttributesList().size()); | ||
91 | 90 | ||
92 | sentence2features.put(sentence, feature2value); | 91 | sentence2features.put(sentence, feature2value); |
93 | 92 |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | + | ||
3 | +import com.google.common.collect.Maps; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
5 | +import weka.core.Attribute; | ||
6 | +import weka.core.DenseInstance; | ||
7 | +import weka.core.Instance; | ||
8 | + | ||
9 | +import java.util.List; | ||
10 | +import java.util.Map; | ||
11 | + | ||
12 | +public class InstanceCreator { | ||
13 | + | ||
14 | + private InstanceCreator() { | ||
15 | + } | ||
16 | + | ||
17 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | ||
18 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | ||
19 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | ||
20 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | ||
21 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
22 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | ||
23 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
24 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
25 | + } | ||
26 | + candidate2instance.put(entry.getKey(), instance); | ||
27 | + } | ||
28 | + return candidate2instance; | ||
29 | + } | ||
30 | + | ||
31 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -4,6 +4,7 @@ import com.google.common.collect.Lists; | @@ -4,6 +4,7 @@ import com.google.common.collect.Lists; | ||
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | import com.google.common.collect.Sets; | 5 | import com.google.common.collect.Sets; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
9 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
@@ -18,18 +19,56 @@ import java.util.Set; | @@ -18,18 +19,56 @@ import java.util.Set; | ||
18 | 19 | ||
19 | public class ZeroFeatureExtractor extends FeatureExtractor { | 20 | public class ZeroFeatureExtractor extends FeatureExtractor { |
20 | 21 | ||
22 | + private static final String SCORE = "score"; | ||
23 | + | ||
24 | + private static final String ANTECEDENT_PREFIX = "antecedent"; | ||
25 | + private static final String CANDIDATE_PREFIX = "candidate"; | ||
26 | + | ||
27 | + private static final String SENTENCE_ENDS_WITH_QUESTION_MARK = "_sentence_ends_with_question_mark"; | ||
28 | + private static final String IS_NAMED = "_is_named"; | ||
29 | + private static final String TOKEN_COUNT = "_token_count"; | ||
30 | + private static final String FIRST_TOKEN_INDEX_IN_SENT = "_first_token_index_in_sent"; | ||
31 | + private static final String INDEX_IN_SENT = "_index_in_sent"; | ||
32 | + private static final String PREV_TOKEN_POS = "_prev_token_pos"; | ||
33 | + private static final String NEXT_TOKEN_POS = "_next_token_pos"; | ||
34 | + private static final String IS_NESTING = "_is_nesting"; | ||
35 | + private static final String IS_NESTED = "_is_nested"; | ||
36 | + private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count"; | ||
37 | + private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length"; | ||
38 | + private static final String IS_PAN_OR_PANI = "_is_pan_or_pani"; | ||
39 | + | ||
40 | + // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet( | ||
41 | +// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ","); | ||
42 | + private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy"); | ||
43 | + | ||
44 | + private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(); | ||
45 | +// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet( | ||
46 | +// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ","); | ||
47 | + | ||
48 | + private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_"; | ||
49 | + private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_"; | ||
50 | + | ||
21 | public ZeroFeatureExtractor() { | 51 | public ZeroFeatureExtractor() { |
22 | 52 | ||
23 | - for (String prefix : new String[]{"antecedent", "candidate"}) { | ||
24 | - addNumericAttribute(prefix + "_index_in_sent"); | ||
25 | - addNumericAttribute(prefix + "_first_token_index_in_sent"); | ||
26 | - addNumericAttribute(prefix + "_token_count"); | ||
27 | - addBinaryAttribute(prefix + "_is_named"); | ||
28 | - addNumericAttribute(prefix + "_sentence_mention_count"); | ||
29 | - addNominalAttribute(prefix + "_next_token_pos", Constants.POS_TAGS); | ||
30 | - addNominalAttribute(prefix + "_prev_token_pos", Constants.POS_TAGS); | ||
31 | - addBinaryAttribute(prefix + "_is_nested"); | ||
32 | - addBinaryAttribute(prefix + "_is_nesting"); | 53 | + for (String prefix : new String[]{ANTECEDENT_PREFIX, CANDIDATE_PREFIX}) { |
54 | + addNumericAttribute(prefix + INDEX_IN_SENT); | ||
55 | + addNumericAttribute(prefix + FIRST_TOKEN_INDEX_IN_SENT); | ||
56 | + addNumericAttribute(prefix + TOKEN_COUNT); | ||
57 | + addBinaryAttribute(prefix + IS_NAMED); | ||
58 | + addBinaryAttribute(prefix + IS_PAN_OR_PANI); | ||
59 | + addNominalAttribute(prefix + NEXT_TOKEN_POS, Constants.POS_TAGS); | ||
60 | + addNominalAttribute(prefix + PREV_TOKEN_POS, Constants.POS_TAGS); | ||
61 | + for (String prevLemma : PREV_TOKEN_LEMMAS) { | ||
62 | + addBinaryAttribute(prefix + PREV_TOKEN_LEMMA + prevLemma); | ||
63 | + } | ||
64 | + for (String nextLemma : NEXT_TOKEN_LEMMAS) { | ||
65 | + addBinaryAttribute(prefix + NEXT_TOKEN_LEMMA + nextLemma); | ||
66 | + } | ||
67 | + addBinaryAttribute(prefix + IS_NESTED); | ||
68 | + addBinaryAttribute(prefix + IS_NESTING); | ||
69 | + addNumericAttribute(prefix + SENTENCE_MENTION_COUNT); | ||
70 | + addNumericAttribute(prefix + SENTENCE_TOKEN_LENGTH); | ||
71 | + addBinaryAttribute(prefix + SENTENCE_ENDS_WITH_QUESTION_MARK); | ||
33 | } | 72 | } |
34 | 73 | ||
35 | addNumericAttribute("chain_length"); | 74 | addNumericAttribute("chain_length"); |
@@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | @@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | ||
43 | addNumericAttribute("pair_sent_distance"); | 82 | addNumericAttribute("pair_sent_distance"); |
44 | addNumericAttribute("pair_par_distance"); | 83 | addNumericAttribute("pair_par_distance"); |
45 | 84 | ||
46 | - addNominalAttribute("score", Lists.newArrayList("bad", "good")); | ||
47 | - fillSortedAttributes("score"); | 85 | + addNominalAttribute(SCORE, Lists.newArrayList("bad", "good")); |
86 | + fillSortedAttributes(SCORE); | ||
48 | } | 87 | } |
49 | 88 | ||
50 | public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { | 89 | public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { |
@@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | @@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | ||
62 | private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { | 101 | private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { |
63 | 102 | ||
64 | Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); | 103 | Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); |
65 | - candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | 104 | + candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue()); |
66 | 105 | ||
67 | TMention mention = candidate.getZeroCandidateMention(); | 106 | TMention mention = candidate.getZeroCandidateMention(); |
68 | TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); | 107 | TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); |
69 | 108 | ||
70 | - addMentionFeatures(helper, candidateFeatures, mention, "candidate"); | ||
71 | - addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); | 109 | + addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX); |
110 | + addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX); | ||
72 | 111 | ||
73 | candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent)))); | 112 | candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent)))); |
74 | candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent)))); | 113 | candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent)))); |
@@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | @@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | ||
98 | } | 137 | } |
99 | 138 | ||
100 | private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { | 139 | private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { |
101 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | ||
102 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_first_token_index_in_sent"), (double) helper.getMentionFirstTokenIndex(mention)); | 140 | + candidateFeatures.put(getAttributeByName(attributePrefix + INDEX_IN_SENT), (double) helper.getMentionIndexInSent(mention)); |
141 | + candidateFeatures.put(getAttributeByName(attributePrefix + FIRST_TOKEN_INDEX_IN_SENT), (double) helper.getMentionFirstTokenIndex(mention)); | ||
103 | 142 | ||
104 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | ||
105 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | ||
106 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_sentence_mention_count"), (double) helper.getMentionSentence(mention).getMentions().size()); | 143 | + candidateFeatures.put(getAttributeByName(attributePrefix + TOKEN_COUNT), (double) mention.getChildIdsSize()); |
144 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NAMED), toBinary(helper.isMentionNamedEntity(mention))); | ||
145 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_PAN_OR_PANI), toBinary(helper.getMentionBase(mention).matches("(pan)|(pani)"))); | ||
107 | 146 | ||
108 | TToken nextToken = helper.getTokenAfterMention(mention); | 147 | TToken nextToken = helper.getTokenAfterMention(mention); |
109 | - addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_next_token_pos"); | 148 | + addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + NEXT_TOKEN_POS); |
149 | + String nextTokenLemma = nextToken == null ? "" : nextToken.getChosenInterpretation().getBase(); | ||
150 | + for (String nextLemma : NEXT_TOKEN_LEMMAS) { | ||
151 | + candidateFeatures.put(getAttributeByName(attributePrefix + NEXT_TOKEN_LEMMA + nextLemma), toBinary(nextTokenLemma.equalsIgnoreCase(nextLemma))); | ||
152 | + } | ||
153 | + | ||
110 | TToken prevToken = helper.getTokenBeforeMention(mention); | 154 | TToken prevToken = helper.getTokenBeforeMention(mention); |
111 | - addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_prev_token_pos"); | 155 | + addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + PREV_TOKEN_POS); |
156 | + String prevTokenLemma = prevToken == null ? "" : prevToken.getChosenInterpretation().getBase(); | ||
157 | + for (String prevLemma : PREV_TOKEN_LEMMAS) { | ||
158 | + candidateFeatures.put(getAttributeByName(attributePrefix + PREV_TOKEN_LEMMA + prevLemma), toBinary(prevTokenLemma.equalsIgnoreCase(prevLemma))); | ||
159 | + } | ||
112 | 160 | ||
113 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention))); | ||
114 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); | 161 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTED), toBinary(helper.isNested(mention))); |
162 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTING), toBinary(helper.isNesting(mention))); | ||
115 | 163 | ||
164 | + TSentence mentionSentence = helper.getMentionSentence(mention); | ||
165 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size()); | ||
166 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size()); | ||
167 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?"))); | ||
116 | } | 168 | } |
117 | 169 | ||
118 | private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { | 170 | private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { |
119 | Attribute att = getAttributeByName(attributeName); | 171 | Attribute att = getAttributeByName(attributeName); |
120 | int index = att.indexOfValue(value); | 172 | int index = att.indexOfValue(value); |
121 | if (index == -1) | 173 | if (index == -1) |
122 | - LOG.warn(value + " not found for attribute " + attributeName); | 174 | + LOG.warn(value + " not found for attribute " + attributeName); |
123 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | 175 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
124 | } | 176 | } |
125 | } | 177 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
@@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
8 | import weka.classifiers.Classifier; | 8 | import weka.classifiers.Classifier; |
9 | import weka.core.Instance; | 9 | import weka.core.Instance; |
10 | import weka.core.Instances; | 10 | import weka.core.Instances; |
11 | +import weka.core.SerializationHelper; | ||
11 | 12 | ||
12 | -import java.io.IOException; | ||
13 | import java.util.List; | 13 | import java.util.List; |
14 | import java.util.Map; | 14 | import java.util.Map; |
15 | import java.util.Set; | 15 | import java.util.Set; |
@@ -21,8 +21,8 @@ public class ZeroSubjectInjector { | @@ -21,8 +21,8 @@ public class ZeroSubjectInjector { | ||
21 | private final Classifier classifier; | 21 | private final Classifier classifier; |
22 | private final Instances instances; | 22 | private final Instances instances; |
23 | 23 | ||
24 | - public ZeroSubjectInjector() throws IOException, ClassNotFoundException { | ||
25 | - classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH); | 24 | + public ZeroSubjectInjector() throws Exception { |
25 | + classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
26 | featureExtractor = new ZeroFeatureExtractor(); | 26 | featureExtractor = new ZeroFeatureExtractor(); |
27 | instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 27 | instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
28 | } | 28 | } |
@@ -31,7 +31,7 @@ public class ZeroSubjectInjector { | @@ -31,7 +31,7 @@ public class ZeroSubjectInjector { | ||
31 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); | 31 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); |
32 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); | 32 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); |
33 | Map<ZeroSubjectCandidate, Instance> candidate2instance = | 33 | Map<ZeroSubjectCandidate, Instance> candidate2instance = |
34 | - PrepareTrainingData.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | 34 | + InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); |
35 | 35 | ||
36 | Set<String> result = Sets.newHashSet(); | 36 | Set<String> result = Sets.newHashSet(); |
37 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { | 37 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | ||
2 | + | ||
3 | +import org.junit.BeforeClass; | ||
4 | +import org.junit.Test; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
7 | + | ||
8 | +import static org.junit.Assert.assertTrue; | ||
9 | + | ||
10 | +public class NicolasTest { | ||
11 | + | ||
12 | + private static final String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift"; | ||
13 | + | ||
14 | + private static Nicolas nicolas; | ||
15 | + | ||
16 | + @BeforeClass | ||
17 | + public static void shouldLoadModels() throws Exception { | ||
18 | + nicolas = new Nicolas(); | ||
19 | + } | ||
20 | + | ||
21 | + @Test | ||
22 | + public void shouldSummarizeThriftText() throws Exception { | ||
23 | + TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | ||
24 | + String summary = nicolas.summarizeThrift(thriftText, 5); | ||
25 | + int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); | ||
26 | + assertTrue(summaryTokensCount > 0); | ||
27 | + assertTrue(summaryTokensCount < 10); | ||
28 | + } | ||
29 | + | ||
30 | +} | ||
0 | \ No newline at end of file | 31 | \ No newline at end of file |
nicolas-core/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals; | @@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals; | ||
18 | 18 | ||
19 | public class CandidateFinderTest { | 19 | public class CandidateFinderTest { |
20 | 20 | ||
21 | - private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; | 21 | + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift"; |
22 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; | 22 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; |
23 | 23 | ||
24 | @Test | 24 | @Test |
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift
No preview for this file type
nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift
0 → 100644
No preview for this file type
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/.gitignore
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/README.md
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt renamed to nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/frequent_bases.txt
nicolas-train/pom.xml
@@ -12,6 +12,16 @@ | @@ -12,6 +12,16 @@ | ||
12 | <artifactId>nicolas-train</artifactId> | 12 | <artifactId>nicolas-train</artifactId> |
13 | 13 | ||
14 | <dependencies> | 14 | <dependencies> |
15 | + <!-- project --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>nicolas-common</artifactId> | ||
19 | + </dependency> | ||
20 | + <dependency> | ||
21 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | + <artifactId>nicolas-lib</artifactId> | ||
23 | + </dependency> | ||
24 | + | ||
15 | <!-- internal --> | 25 | <!-- internal --> |
16 | <dependency> | 26 | <dependency> |
17 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 27 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
@@ -22,10 +32,28 @@ | @@ -22,10 +32,28 @@ | ||
22 | <artifactId>utils</artifactId> | 32 | <artifactId>utils</artifactId> |
23 | </dependency> | 33 | </dependency> |
24 | 34 | ||
35 | + <!-- third party --> | ||
36 | + <dependency> | ||
37 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
38 | + <artifactId>weka-dev</artifactId> | ||
39 | + </dependency> | ||
40 | + <dependency> | ||
41 | + <groupId>org.apache.commons</groupId> | ||
42 | + <artifactId>commons-lang3</artifactId> | ||
43 | + </dependency> | ||
44 | + <dependency> | ||
45 | + <groupId>net.lingala.zip4j</groupId> | ||
46 | + <artifactId>zip4j</artifactId> | ||
47 | + </dependency> | ||
48 | + | ||
25 | <!-- logging --> | 49 | <!-- logging --> |
26 | <dependency> | 50 | <dependency> |
27 | <groupId>org.slf4j</groupId> | 51 | <groupId>org.slf4j</groupId> |
28 | <artifactId>slf4j-api</artifactId> | 52 | <artifactId>slf4j-api</artifactId> |
29 | </dependency> | 53 | </dependency> |
54 | + <dependency> | ||
55 | + <groupId>org.slf4j</groupId> | ||
56 | + <artifactId>slf4j-simple</artifactId> | ||
57 | + </dependency> | ||
30 | </dependencies> | 58 | </dependencies> |
31 | </project> | 59 | </project> |
32 | \ No newline at end of file | 60 | \ No newline at end of file |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train; | ||
2 | + | ||
3 | +import net.lingala.zip4j.core.ZipFile; | ||
4 | +import org.apache.commons.io.FileUtils; | ||
5 | +import org.slf4j.Logger; | ||
6 | +import org.slf4j.LoggerFactory; | ||
7 | +import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.net.URL; | ||
11 | + | ||
12 | +public class DownloadAndPreprocessCorpus { | ||
13 | + | ||
14 | + private static final Logger LOG = LoggerFactory.getLogger(DownloadAndPreprocessCorpus.class); | ||
15 | + | ||
16 | + private static final String WORKING_DIR = "data"; | ||
17 | + private static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip"; | ||
18 | + | ||
19 | + private DownloadAndPreprocessCorpus() { | ||
20 | + } | ||
21 | + | ||
22 | + public static void main(String[] args) throws Exception { | ||
23 | + File workDir = createFolder(WORKING_DIR); | ||
24 | + | ||
25 | + File corpusFile = new File(workDir, "corpus.zip"); | ||
26 | + if (!corpusFile.exists()) { | ||
27 | + LOG.info("Downloading corpus file..."); | ||
28 | + FileUtils.copyURLToFile(new URL(CORPUS_DOWNLOAD_URL), corpusFile); | ||
29 | + LOG.info("done."); | ||
30 | + } else { | ||
31 | + LOG.info("Corpus file already downloaded."); | ||
32 | + } | ||
33 | + | ||
34 | + File extractedCorpusDir = new File(workDir, "corpus"); | ||
35 | + if (extractedCorpusDir.exists()) { | ||
36 | + LOG.info("Corpus file already extracted."); | ||
37 | + } else { | ||
38 | + ZipFile zipFile = new ZipFile(corpusFile); | ||
39 | + zipFile.extractAll(extractedCorpusDir.getPath()); | ||
40 | + LOG.info("Extracted corpus file."); | ||
41 | + } | ||
42 | + | ||
43 | + File pscDir = new File(extractedCorpusDir, "PSC_1.0"); | ||
44 | + File dataDir = new File(pscDir, "data"); | ||
45 | + | ||
46 | + File preprocessed = new File(WORKING_DIR, "preprocessed"); | ||
47 | + createFolder(preprocessed.getPath()); | ||
48 | + NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | ||
49 | + } | ||
50 | + | ||
51 | + private static File createFolder(String path) { | ||
52 | + File folder = new File(path); | ||
53 | + if (folder.mkdir()) { | ||
54 | + LOG.info("Created directory at: {}.", path); | ||
55 | + } else { | ||
56 | + LOG.info("Directory already present at: {}.", path); | ||
57 | + } | ||
58 | + return folder; | ||
59 | + } | ||
60 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel; | ||
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; | ||
6 | + | ||
7 | +public class TrainAllModels { | ||
8 | + | ||
9 | + private TrainAllModels() { | ||
10 | + } | ||
11 | + | ||
12 | + public static void main(String[] args) throws Exception { | ||
13 | + TrainMentionModel.main(args); | ||
14 | + TrainSentenceModel.main(args); | ||
15 | + TrainZeroModel.main(args); | ||
16 | + } | ||
17 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java deleted
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.common; | ||
2 | + | ||
3 | +import weka.classifiers.Classifier; | ||
4 | +import weka.classifiers.trees.RandomForest; | ||
5 | + | ||
6 | +public class ModelConstants { | ||
7 | + | ||
8 | + public static final String MENTION_DATASET_PATH = "mentions_train.arff"; | ||
9 | + public static final String SENTENCE_DATASET_PATH = "sentences_train.arff"; | ||
10 | + public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | ||
11 | + | ||
12 | + private static final int NUM_ITERATIONS = 16; | ||
13 | + private static final int NUM_EXECUTION_SLOTS = 8; | ||
14 | + private static final int SEED = 0; | ||
15 | + | ||
16 | + private ModelConstants() { | ||
17 | + } | ||
18 | + | ||
19 | + public static Classifier getMentionClassifier() { | ||
20 | + RandomForest classifier = new RandomForest(); | ||
21 | + classifier.setNumIterations(NUM_ITERATIONS); | ||
22 | + classifier.setSeed(SEED); | ||
23 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | ||
24 | + return classifier; | ||
25 | + } | ||
26 | + | ||
27 | + public static Classifier getSentenceClassifier() { | ||
28 | + RandomForest classifier = new RandomForest(); | ||
29 | + classifier.setNumIterations(16); | ||
30 | + classifier.setSeed(0); | ||
31 | + classifier.setNumExecutionSlots(8); | ||
32 | + return classifier; | ||
33 | + } | ||
34 | + | ||
35 | + public static Classifier getZeroClassifier() { | ||
36 | + RandomForest classifier = new RandomForest(); | ||
37 | + classifier.setNumIterations(16); | ||
38 | + classifier.setSeed(0); | ||
39 | + classifier.setNumExecutionSlots(8); | ||
40 | + return classifier; | ||
41 | + } | ||
42 | + | ||
43 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/TrainModel.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.common; |
2 | 2 | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; |
7 | import weka.classifiers.Classifier; | 7 | import weka.classifiers.Classifier; |
8 | import weka.core.Instances; | 8 | import weka.core.Instances; |
9 | import weka.core.converters.ArffLoader; | 9 | import weka.core.converters.ArffLoader; |
@@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader; | @@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader; | ||
11 | import java.io.File; | 11 | import java.io.File; |
12 | import java.io.FileOutputStream; | 12 | import java.io.FileOutputStream; |
13 | import java.io.ObjectOutputStream; | 13 | import java.io.ObjectOutputStream; |
14 | +import java.util.logging.LogManager; | ||
14 | 15 | ||
16 | +@SuppressWarnings("squid:S2118") | ||
17 | +public class TrainModelCommon { | ||
15 | 18 | ||
16 | -public class TrainModel { | 19 | + private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class); |
17 | 20 | ||
18 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | 21 | + private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; |
19 | 22 | ||
20 | - private TrainModel() { | 23 | + private TrainModelCommon() { |
21 | } | 24 | } |
22 | 25 | ||
23 | - public static void main(String[] args) throws Exception { | 26 | + public static void trainAndSaveModel(String datasetPath, Classifier classifier, String targetPath) throws Exception { |
27 | + LogManager.getLogManager().reset(); // disable WEKA logging | ||
24 | 28 | ||
25 | ArffLoader loader = new ArffLoader(); | 29 | ArffLoader loader = new ArffLoader(); |
26 | - loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | 30 | + loader.setFile(new File(datasetPath)); |
27 | Instances instances = loader.getDataSet(); | 31 | Instances instances = loader.getDataSet(); |
28 | instances.setClassIndex(0); | 32 | instances.setClassIndex(0); |
29 | - LOG.info(instances.size() + " instances loaded."); | ||
30 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | 33 | + LOG.info("{} instances loaded.", instances.size()); |
34 | + LOG.info("{} attributes for each instance.", instances.numAttributes()); | ||
31 | 35 | ||
32 | StopWatch watch = new StopWatch(); | 36 | StopWatch watch = new StopWatch(); |
33 | watch.start(); | 37 | watch.start(); |
34 | 38 | ||
35 | - Classifier classifier = Constants.getZerosClassifier(); | ||
36 | - | ||
37 | LOG.info("Building classifier..."); | 39 | LOG.info("Building classifier..."); |
38 | classifier.buildClassifier(instances); | 40 | classifier.buildClassifier(instances); |
39 | - LOG.info("...done."); | 41 | + LOG.info("...done. Build classifier: {}", classifier); |
40 | 42 | ||
43 | + String target = TARGET_MODEL_DIR + targetPath; | ||
44 | + LOG.info("Saving classifier at: {}", target); | ||
41 | try (ObjectOutputStream oos = new ObjectOutputStream( | 45 | try (ObjectOutputStream oos = new ObjectOutputStream( |
42 | - new FileOutputStream(Constants.ZERO_MODEL_PATH))) { | 46 | + new FileOutputStream(target))) { |
43 | oos.writeObject(classifier); | 47 | oos.writeObject(classifier); |
44 | } | 48 | } |
45 | 49 | ||
46 | watch.stop(); | 50 | watch.stop(); |
47 | - LOG.info("Elapsed time: " + watch); | ||
48 | - | ||
49 | - LOG.info(classifier.toString()); | 51 | + LOG.info("Elapsed time: {}", watch); |
50 | } | 52 | } |
51 | } | 53 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/MentionScorer.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; |
2 | 2 | ||
3 | import com.google.common.collect.HashMultiset; | 3 | import com.google.common.collect.HashMultiset; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
@@ -14,7 +14,6 @@ import java.util.stream.Collectors; | @@ -14,7 +14,6 @@ import java.util.stream.Collectors; | ||
14 | 14 | ||
15 | public class MentionScorer { | 15 | public class MentionScorer { |
16 | 16 | ||
17 | - | ||
18 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { | 17 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { |
19 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | 18 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); |
20 | 19 | ||
@@ -39,20 +38,4 @@ public class MentionScorer { | @@ -39,20 +38,4 @@ public class MentionScorer { | ||
39 | } | 38 | } |
40 | return mention2score; | 39 | return mention2score; |
41 | } | 40 | } |
42 | - | ||
43 | - private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | ||
44 | - Map<TMention, Double> mention2score = Maps.newHashMap(); | ||
45 | - for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | ||
46 | - TMention mention = entry.getKey(); | ||
47 | - String mentionOrth = mention2Orth.get(mention); | ||
48 | - int present = 0; | ||
49 | - for (String token : Utils.tokenize(mentionOrth)) { | ||
50 | - if (tokenCounts.contains(token.toLowerCase())) { | ||
51 | - present++; | ||
52 | - } | ||
53 | - } | ||
54 | - mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0); | ||
55 | - } | ||
56 | - return mention2score; | ||
57 | - } | ||
58 | } | 41 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; |
2 | 2 | ||
3 | import com.google.common.base.Charsets; | 3 | import com.google.common.base.Charsets; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
@@ -7,9 +7,11 @@ import org.slf4j.Logger; | @@ -7,9 +7,11 @@ import org.slf4j.Logger; | ||
7 | import org.slf4j.LoggerFactory; | 7 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 10 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
14 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
13 | import weka.core.Instance; | 15 | import weka.core.Instance; |
14 | import weka.core.Instances; | 16 | import weka.core.Instances; |
15 | import weka.core.converters.ArffSaver; | 17 | import weka.core.converters.ArffSaver; |
@@ -23,8 +25,11 @@ public class PrepareTrainingData { | @@ -23,8 +25,11 @@ public class PrepareTrainingData { | ||
23 | 25 | ||
24 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | 26 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); |
25 | 27 | ||
26 | - public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
27 | - public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | 28 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; |
29 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
30 | + | ||
31 | + private PrepareTrainingData() { | ||
32 | + } | ||
28 | 33 | ||
29 | public static void main(String[] args) throws IOException { | 34 | public static void main(String[] args) throws IOException { |
30 | 35 | ||
@@ -37,19 +42,20 @@ public class PrepareTrainingData { | @@ -37,19 +42,20 @@ public class PrepareTrainingData { | ||
37 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 42 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
38 | 43 | ||
39 | int i = 1; | 44 | int i = 1; |
40 | - for (String textId : id2preprocessedText.keySet()) { | 45 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
41 | LOG.info(i++ + "/" + id2preprocessedText.size()); | 46 | LOG.info(i++ + "/" + id2preprocessedText.size()); |
42 | 47 | ||
43 | - TText preprocessedText = id2preprocessedText.get(textId); | ||
44 | - String optimalSummary = id2optimalSummary.get(textId); | 48 | + String id = entry.getKey(); |
49 | + TText preprocessedText = entry.getValue(); | ||
50 | + String optimalSummary = id2optimalSummary.get(id); | ||
45 | if (optimalSummary == null) | 51 | if (optimalSummary == null) |
46 | continue; | 52 | continue; |
47 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | 53 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); |
48 | 54 | ||
49 | Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | 55 | Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); |
50 | - for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | ||
51 | - TMention mention = entry.getKey(); | ||
52 | - Instance instance = entry.getValue(); | 56 | + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { |
57 | + TMention mention = entry2.getKey(); | ||
58 | + Instance instance = entry2.getValue(); | ||
53 | instance.setDataset(instances); | 59 | instance.setDataset(instances); |
54 | instance.setClassValue(mention2score.get(mention)); | 60 | instance.setClassValue(mention2score.get(mention)); |
55 | instances.add(instance); | 61 | instances.add(instance); |
@@ -61,7 +67,7 @@ public class PrepareTrainingData { | @@ -61,7 +67,7 @@ public class PrepareTrainingData { | ||
61 | private static void saveInstancesToFile(Instances instances) throws IOException { | 67 | private static void saveInstancesToFile(Instances instances) throws IOException { |
62 | ArffSaver saver = new ArffSaver(); | 68 | ArffSaver saver = new ArffSaver(); |
63 | saver.setInstances(instances); | 69 | saver.setInstances(instances); |
64 | - saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | 70 | + saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH)); |
65 | saver.writeBatch(); | 71 | saver.writeBatch(); |
66 | } | 72 | } |
67 | 73 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/TrainMentionModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | + | ||
8 | +public class TrainMentionModel { | ||
9 | + | ||
10 | + private TrainMentionModel() { | ||
11 | + } | ||
12 | + | ||
13 | + public static void main(String[] args) throws Exception { | ||
14 | + Classifier classifier = ModelConstants.getMentionClassifier(); | ||
15 | + String datasetPath = ModelConstants.MENTION_DATASET_PATH; | ||
16 | + String targetPath = Constants.MENTION_MODEL_RESOURCE_PATH; | ||
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | ||
18 | + } | ||
19 | + | ||
20 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; |
2 | 2 | ||
3 | import com.google.common.base.Charsets; | 3 | import com.google.common.base.Charsets; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
@@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory; | @@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
12 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | ||
17 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
16 | import weka.classifiers.Classifier; | 18 | import weka.classifiers.Classifier; |
17 | import weka.core.Instance; | 19 | import weka.core.Instance; |
18 | import weka.core.Instances; | 20 | import weka.core.Instances; |
@@ -31,6 +33,9 @@ public class PrepareTrainingData { | @@ -31,6 +33,9 @@ public class PrepareTrainingData { | ||
31 | private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | 33 | private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; |
32 | private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | 34 | private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; |
33 | 35 | ||
36 | + private PrepareTrainingData() { | ||
37 | + } | ||
38 | + | ||
34 | public static void main(String[] args) throws Exception { | 39 | public static void main(String[] args) throws Exception { |
35 | 40 | ||
36 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | 41 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); |
@@ -41,7 +46,7 @@ public class PrepareTrainingData { | @@ -41,7 +46,7 @@ public class PrepareTrainingData { | ||
41 | 46 | ||
42 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 47 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
43 | 48 | ||
44 | - Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | 49 | + Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); |
45 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | 50 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); |
46 | 51 | ||
47 | int i = 1; | 52 | int i = 1; |
@@ -74,7 +79,7 @@ public class PrepareTrainingData { | @@ -74,7 +79,7 @@ public class PrepareTrainingData { | ||
74 | private static void saveInstancesToFile(Instances instances) throws IOException { | 79 | private static void saveInstancesToFile(Instances instances) throws IOException { |
75 | ArffSaver saver = new ArffSaver(); | 80 | ArffSaver saver = new ArffSaver(); |
76 | saver.setInstances(instances); | 81 | saver.setInstances(instances); |
77 | - saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | 82 | + saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH)); |
78 | saver.writeBatch(); | 83 | saver.writeBatch(); |
79 | } | 84 | } |
80 | 85 |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/SentenceScorer.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/TrainSentenceModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | + | ||
8 | +public class TrainSentenceModel { | ||
9 | + | ||
10 | + private TrainSentenceModel() { | ||
11 | + } | ||
12 | + | ||
13 | + public static void main(String[] args) throws Exception { | ||
14 | + Classifier classifier = ModelConstants.getSentenceClassifier(); | ||
15 | + String datasetPath = ModelConstants.SENTENCE_DATASET_PATH; | ||
16 | + String targetPath = Constants.SENTENCE_MODEL_RESOURCE_PATH; | ||
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | ||
18 | + } | ||
19 | + | ||
20 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
@@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils; | @@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils; | ||
6 | import org.slf4j.Logger; | 6 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 7 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
11 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
12 | -import weka.core.Attribute; | ||
13 | -import weka.core.DenseInstance; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
13 | +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | ||
14 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
15 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
14 | import weka.core.Instance; | 16 | import weka.core.Instance; |
15 | import weka.core.Instances; | 17 | import weka.core.Instances; |
16 | import weka.core.converters.ArffSaver; | 18 | import weka.core.converters.ArffSaver; |
@@ -54,7 +56,7 @@ public class PrepareTrainingData { | @@ -54,7 +56,7 @@ public class PrepareTrainingData { | ||
54 | FeatureHelper featureHelper = new FeatureHelper(text); | 56 | FeatureHelper featureHelper = new FeatureHelper(text); |
55 | 57 | ||
56 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | 58 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); |
57 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | 59 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); |
58 | 60 | ||
59 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | 61 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { |
60 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | 62 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); |
@@ -68,24 +70,11 @@ public class PrepareTrainingData { | @@ -68,24 +70,11 @@ public class PrepareTrainingData { | ||
68 | saveInstancesToFile(instances); | 70 | saveInstancesToFile(instances); |
69 | } | 71 | } |
70 | 72 | ||
71 | - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | ||
72 | - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | ||
73 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | ||
74 | - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | ||
75 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | ||
76 | - Map<Attribute, Double> sentenceFeatures = entry.getValue(); | ||
77 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | ||
78 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | ||
79 | - } | ||
80 | - candidate2instance.put(entry.getKey(), instance); | ||
81 | - } | ||
82 | - return candidate2instance; | ||
83 | - } | ||
84 | 73 | ||
85 | private static void saveInstancesToFile(Instances instances) throws IOException { | 74 | private static void saveInstancesToFile(Instances instances) throws IOException { |
86 | ArffSaver saver = new ArffSaver(); | 75 | ArffSaver saver = new ArffSaver(); |
87 | saver.setInstances(instances); | 76 | saver.setInstances(instances); |
88 | - saver.setFile(new File(Constants.ZERO_DATASET_PATH)); | 77 | + saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH)); |
89 | saver.writeBatch(); | 78 | saver.writeBatch(); |
90 | } | 79 | } |
91 | 80 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/TrainZeroModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | + | ||
8 | +public class TrainZeroModel { | ||
9 | + | ||
10 | + private TrainZeroModel() { | ||
11 | + } | ||
12 | + | ||
13 | + public static void main(String[] args) throws Exception { | ||
14 | + Classifier classifier = ModelConstants.getZeroClassifier(); | ||
15 | + String datasetPath = ModelConstants.ZERO_DATASET_PATH; | ||
16 | + String targetPath = Constants.ZERO_MODEL_RESOURCE_PATH; | ||
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | ||
18 | + } | ||
19 | + | ||
20 | +} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import org.apache.commons.csv.CSVFormat; | 4 | import org.apache.commons.csv.CSVFormat; |
@@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord; | @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord; | ||
7 | import org.apache.commons.csv.QuoteMode; | 7 | import org.apache.commons.csv.QuoteMode; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
10 | 11 | ||
11 | import java.io.IOException; | 12 | import java.io.IOException; |
12 | import java.io.InputStream; | 13 | import java.io.InputStream; |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
@@ -24,6 +24,9 @@ public class NLPProcess { | @@ -24,6 +24,9 @@ public class NLPProcess { | ||
24 | 24 | ||
25 | private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); | 25 | private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); |
26 | 26 | ||
27 | + private static final String CORPUS_FILE_SUFFIX = ".xml"; | ||
28 | + private static final String OUTPUT_FILE_SUFFIX = ".thrift"; | ||
29 | + | ||
27 | private NLPProcess() { | 30 | private NLPProcess() { |
28 | } | 31 | } |
29 | 32 | ||
@@ -34,23 +37,27 @@ public class NLPProcess { | @@ -34,23 +37,27 @@ public class NLPProcess { | ||
34 | } | 37 | } |
35 | File corpusDir = new File(args[0]); | 38 | File corpusDir = new File(args[0]); |
36 | if (!corpusDir.isDirectory()) { | 39 | if (!corpusDir.isDirectory()) { |
37 | - LOG.error("Corpus directory does not exist: " + corpusDir); | 40 | + LOG.error("Corpus directory does not exist: {}", corpusDir); |
38 | return; | 41 | return; |
39 | } | 42 | } |
40 | File targetDir = new File(args[1]); | 43 | File targetDir = new File(args[1]); |
41 | if (!targetDir.isDirectory()) { | 44 | if (!targetDir.isDirectory()) { |
42 | - LOG.error("Target directory does not exist: " + targetDir); | 45 | + LOG.error("Target directory does not exist: {}", targetDir); |
43 | return; | 46 | return; |
44 | } | 47 | } |
45 | 48 | ||
46 | int ok = 0; | 49 | int ok = 0; |
47 | int err = 0; | 50 | int err = 0; |
48 | - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml")); | 51 | + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX)); |
52 | + if (files == null || files.length == 0) { | ||
53 | + LOG.error("No corpus files found at: {}", corpusDir); | ||
54 | + return; | ||
55 | + } | ||
49 | Arrays.sort(files); | 56 | Arrays.sort(files); |
50 | for (File file : files) { | 57 | for (File file : files) { |
51 | try { | 58 | try { |
52 | Text text = PSC_IO.readText(file); | 59 | Text text = PSC_IO.readText(file); |
53 | - File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin")); | 60 | + File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); |
54 | annotateNLP(text, targetFile); | 61 | annotateNLP(text, targetFile); |
55 | ok++; | 62 | ok++; |
56 | } catch (Exception e) { | 63 | } catch (Exception e) { |
@@ -58,8 +65,8 @@ public class NLPProcess { | @@ -58,8 +65,8 @@ public class NLPProcess { | ||
58 | LOG.error("Problem with text in " + file + ", " + e); | 65 | LOG.error("Problem with text in " + file + ", " + e); |
59 | } | 66 | } |
60 | } | 67 | } |
61 | - LOG.info(ok + " texts processed successfully."); | ||
62 | - LOG.info(err + " texts with errors."); | 68 | + LOG.info("{} texts processed successfully.", ok); |
69 | + LOG.info("{} texts with errors.", err); | ||
63 | } | 70 | } |
64 | 71 | ||
65 | private static void annotateNLP(Text text, File targetFile) throws Exception { | 72 | private static void annotateNLP(Text text, File targetFile) throws Exception { |
@@ -77,8 +84,8 @@ public class NLPProcess { | @@ -77,8 +84,8 @@ public class NLPProcess { | ||
77 | } | 84 | } |
78 | 85 | ||
79 | public static void serialize(TText ttext, File targetFile) throws IOException { | 86 | public static void serialize(TText ttext, File targetFile) throws IOException { |
80 | - try (FileOutputStream fout = new FileOutputStream(targetFile); | ||
81 | - ObjectOutputStream oos = new ObjectOutputStream(fout)) { | 87 | + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); |
88 | + ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { | ||
82 | oos.writeObject(ttext); | 89 | oos.writeObject(ttext); |
83 | } | 90 | } |
84 | } | 91 | } |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/EvalUtils.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.eval; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; |
2 | 2 | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.apache.commons.lang3.tuple.Pair; | 4 | import org.apache.commons.lang3.tuple.Pair; |
@@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic; | @@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic; | ||
14 | import weka.classifiers.lazy.IBk; | 14 | import weka.classifiers.lazy.IBk; |
15 | import weka.classifiers.lazy.KStar; | 15 | import weka.classifiers.lazy.KStar; |
16 | import weka.classifiers.lazy.LWL; | 16 | import weka.classifiers.lazy.LWL; |
17 | +import weka.classifiers.meta.AttributeSelectedClassifier; | ||
17 | import weka.classifiers.rules.DecisionTable; | 18 | import weka.classifiers.rules.DecisionTable; |
18 | import weka.classifiers.rules.JRip; | 19 | import weka.classifiers.rules.JRip; |
19 | import weka.classifiers.rules.PART; | 20 | import weka.classifiers.rules.PART; |
@@ -23,21 +24,49 @@ import weka.classifiers.trees.J48; | @@ -23,21 +24,49 @@ import weka.classifiers.trees.J48; | ||
23 | import weka.classifiers.trees.LMT; | 24 | import weka.classifiers.trees.LMT; |
24 | import weka.classifiers.trees.RandomForest; | 25 | import weka.classifiers.trees.RandomForest; |
25 | import weka.core.Instances; | 26 | import weka.core.Instances; |
27 | +import weka.core.converters.ArffLoader; | ||
26 | 28 | ||
29 | +import java.io.File; | ||
30 | +import java.io.IOException; | ||
27 | import java.util.Arrays; | 31 | import java.util.Arrays; |
28 | import java.util.Comparator; | 32 | import java.util.Comparator; |
29 | import java.util.Optional; | 33 | import java.util.Optional; |
30 | import java.util.Random; | 34 | import java.util.Random; |
35 | +import java.util.logging.LogManager; | ||
31 | 36 | ||
32 | -public class EvalUtils { | ||
33 | 37 | ||
34 | - private static final Logger LOG = LoggerFactory.getLogger(EvalUtils.class); | ||
35 | - public static final int NUM_FOLDS = 10; | 38 | +class CrossvalidateCommon { |
36 | 39 | ||
37 | - private EvalUtils() { | 40 | + private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class); |
41 | + | ||
42 | + private static final int NUM_FOLDS = 10; | ||
43 | + | ||
44 | + private CrossvalidateCommon() { | ||
45 | + } | ||
46 | + | ||
47 | + static void crossvalidateClassifiers(String datasetPath) throws IOException { | ||
48 | + Instances instances = loadInstances(datasetPath); | ||
49 | + crossvalidateClassification(instances); | ||
50 | + } | ||
51 | + | ||
52 | + static void crossvalidateRegressors(String datasetPath) throws IOException { | ||
53 | + Instances instances = loadInstances(datasetPath); | ||
54 | + crossvalidateRegression(instances); | ||
38 | } | 55 | } |
39 | 56 | ||
40 | - public static void crossvalidateClassification(Instances instances) throws Exception { | 57 | + private static Instances loadInstances(String datasetPath) throws IOException { |
58 | + LogManager.getLogManager().reset(); // disable WEKA logging | ||
59 | + | ||
60 | + ArffLoader loader = new ArffLoader(); | ||
61 | + loader.setFile(new File(datasetPath)); | ||
62 | + Instances instances = loader.getDataSet(); | ||
63 | + instances.setClassIndex(0); | ||
64 | + LOG.info("{} instances loaded.", instances.size()); | ||
65 | + LOG.info("{} attributes for each instance.", instances.numAttributes()); | ||
66 | + return instances; | ||
67 | + } | ||
68 | + | ||
69 | + private static void crossvalidateClassification(Instances instances) throws IOException { | ||
41 | StopWatch watch = new StopWatch(); | 70 | StopWatch watch = new StopWatch(); |
42 | watch.start(); | 71 | watch.start(); |
43 | 72 | ||
@@ -45,52 +74,58 @@ public class EvalUtils { | @@ -45,52 +74,58 @@ public class EvalUtils { | ||
45 | new Logistic(), new ZeroR(), | 74 | new Logistic(), new ZeroR(), |
46 | new SimpleLogistic(), new BayesNet(), new NaiveBayes(), | 75 | new SimpleLogistic(), new BayesNet(), new NaiveBayes(), |
47 | new KStar(), new IBk(), new LWL(), | 76 | new KStar(), new IBk(), new LWL(), |
48 | - new DecisionTable(), new JRip(), new PART()}).parallel().map(cls -> { | ||
49 | - Evaluation eval = null; | 77 | + new DecisionTable(), new JRip(), new PART(), |
78 | + createAttributeSelectedClassifier()}).parallel().map(cls -> { | ||
79 | + String name = cls.getClass().getSimpleName(); | ||
80 | + double acc = 0; | ||
81 | + Evaluation eval; | ||
50 | try { | 82 | try { |
51 | eval = new Evaluation(instances); | 83 | eval = new Evaluation(instances); |
52 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); | 84 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); |
53 | } catch (Exception e) { | 85 | } catch (Exception e) { |
54 | - e.printStackTrace(); | 86 | + LOG.error("Error evaluating model", e); |
87 | + return Pair.of(0.0, name); | ||
55 | } | 88 | } |
56 | - double acc = eval.correct() / eval.numInstances(); | ||
57 | - String name = cls.getClass().getSimpleName(); | 89 | + acc = eval.correct() / eval.numInstances(); |
58 | LOG.info(name + " : " + acc); | 90 | LOG.info(name + " : " + acc); |
59 | - | ||
60 | return Pair.of(acc, name); | 91 | return Pair.of(acc, name); |
61 | }).max(Comparator.comparingDouble(Pair::getLeft)); | 92 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
62 | LOG.info("#########"); | 93 | LOG.info("#########"); |
63 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | 94 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
64 | 95 | ||
65 | watch.stop(); | 96 | watch.stop(); |
66 | - LOG.info("Elapsed time: " + watch); | 97 | + LOG.info("Elapsed time: {}", watch); |
98 | + } | ||
99 | + | ||
100 | + | ||
101 | + private static Classifier createAttributeSelectedClassifier() { | ||
102 | + AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier(); | ||
103 | + attributeSelectedClassifier.setClassifier(new LMT()); | ||
104 | + return attributeSelectedClassifier; | ||
67 | } | 105 | } |
68 | 106 | ||
69 | - public static void crossvalidateRegression(Instances instances) { | 107 | + private static void crossvalidateRegression(Instances instances) { |
70 | StopWatch watch = new StopWatch(); | 108 | StopWatch watch = new StopWatch(); |
71 | watch.start(); | 109 | watch.start(); |
72 | 110 | ||
73 | Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ | 111 | Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ |
74 | new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { | 112 | new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { |
75 | - Evaluation eval = null; | ||
76 | double acc = 0; | 113 | double acc = 0; |
114 | + String name = cls.getClass().getSimpleName(); | ||
77 | try { | 115 | try { |
78 | - eval = new Evaluation(instances); | 116 | + Evaluation eval = new Evaluation(instances); |
79 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); | 117 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); |
80 | acc = eval.correlationCoefficient(); | 118 | acc = eval.correlationCoefficient(); |
81 | - | ||
82 | } catch (Exception e) { | 119 | } catch (Exception e) { |
83 | - e.printStackTrace(); | 120 | + LOG.error("Error evaluating model", e); |
84 | } | 121 | } |
85 | - String name = cls.getClass().getSimpleName(); | ||
86 | LOG.info(name + " : " + acc); | 122 | LOG.info(name + " : " + acc); |
87 | - | ||
88 | return Pair.of(acc, name); | 123 | return Pair.of(acc, name); |
89 | }).max(Comparator.comparingDouble(Pair::getLeft)); | 124 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
90 | LOG.info("#########"); | 125 | LOG.info("#########"); |
91 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | 126 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
92 | 127 | ||
93 | watch.stop(); | 128 | watch.stop(); |
94 | - LOG.info("Elapsed time: " + watch); | 129 | + LOG.info("Elapsed time: {}", watch); |
95 | } | 130 | } |
96 | -} | ||
97 | \ No newline at end of file | 131 | \ No newline at end of file |
132 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | + | ||
5 | + | ||
6 | +public class CrossvalidateMention { | ||
7 | + | ||
8 | + private CrossvalidateMention() { | ||
9 | + } | ||
10 | + | ||
11 | + public static void main(String[] args) throws Exception { | ||
12 | + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH); | ||
13 | + } | ||
14 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | + | ||
5 | + | ||
6 | +public class CrossvalidateSentence { | ||
7 | + | ||
8 | + private CrossvalidateSentence() { | ||
9 | + } | ||
10 | + | ||
11 | + public static void main(String[] args) throws Exception { | ||
12 | + CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH); | ||
13 | + } | ||
14 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | + | ||
5 | + | ||
6 | +public class CrossvalidateZero { | ||
7 | + | ||
8 | + private CrossvalidateZero() { | ||
9 | + } | ||
10 | + | ||
11 | + public static void main(String[] args) throws Exception { | ||
12 | + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH); | ||
13 | + } | ||
14 | +} |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt
0 → 100644
1 | +199704210011 | ||
2 | +199704210013 | ||
3 | +199704250031 | ||
4 | +199704260017 | ||
5 | +199801030156 | ||
6 | +199801100009 | ||
7 | +199801150038 | ||
8 | +199801150133 | ||
9 | +199801170001 | ||
10 | +199801170129 | ||
11 | +199801170130 | ||
12 | +199801200002 | ||
13 | +199801200132 | ||
14 | +199801210007 | ||
15 | +199801220030 | ||
16 | +199801220127 | ||
17 | +199801230001 | ||
18 | +199801230095 | ||
19 | +199801240116 | ||
20 | +199801240123 | ||
21 | +199801260113 | ||
22 | +199801270108 | ||
23 | +199801280128 | ||
24 | +199801290020 | ||
25 | +199801310032 | ||
26 | +199802040201 | ||
27 | +199901180149 | ||
28 | +199901190049 | ||
29 | +199901230088 | ||
30 | +199901250006 | ||
31 | +199901250008 | ||
32 | +199901250111 | ||
33 | +199901250113 | ||
34 | +199901300064 | ||
35 | +199901300098 | ||
36 | +199902240123 | ||
37 | +199906220027 | ||
38 | +199906220037 | ||
39 | +199906220038 | ||
40 | +199906220056 | ||
41 | +199906220065 | ||
42 | +199906230040 | ||
43 | +199906230052 | ||
44 | +199906240040 | ||
45 | +199906240088 | ||
46 | +199906250007 | ||
47 | +199906250091 | ||
48 | +199906260015 | ||
49 | +199906260018 | ||
50 | +199906260038 | ||
51 | +199907030016 | ||
52 | +199907030018 | ||
53 | +199907030042 | ||
54 | +199907030059 | ||
55 | +199907050032 | ||
56 | +199907050040 | ||
57 | +199907050047 | ||
58 | +199907050071 | ||
59 | +199907270095 | ||
60 | +199907270137 | ||
61 | +199907270145 | ||
62 | +199909210045 | ||
63 | +199909250054 | ||
64 | +199909300064 | ||
65 | +199909300065 | ||
66 | +199909300066 | ||
67 | +199910020049 | ||
68 | +199910020050 | ||
69 | +199910090047 | ||
70 | +199910090049 | ||
71 | +199910090051 | ||
72 | +199910110055 | ||
73 | +199910110057 | ||
74 | +199910210058 | ||
75 | +199910210059 | ||
76 | +199910270041 | ||
77 | +199910280054 | ||
78 | +199910280055 | ||
79 | +199910280057 | ||
80 | +199910300026 | ||
81 | +199911030039 | ||
82 | +199911030040 | ||
83 | +199911030041 | ||
84 | +199911060031 | ||
85 | +199911060042 | ||
86 | +199911060043 | ||
87 | +199911080054 | ||
88 | +199911080055 | ||
89 | +199911080056 | ||
90 | +199911100061 | ||
91 | +199911100062 | ||
92 | +199911100063 | ||
93 | +199911130036 | ||
94 | +199911130037 | ||
95 | +199911130038 | ||
96 | +199911180042 | ||
97 | +199911180043 | ||
98 | +199911180044 | ||
99 | +199911220059 | ||
100 | +199911220061 | ||
101 | +199911220066 | ||
102 | +199911230041 | ||
103 | +199911240035 | ||
104 | +199911240037 | ||
105 | +199911240038 | ||
106 | +199911250055 | ||
107 | +199911250057 | ||
108 | +199912020059 | ||
109 | +199912090045 | ||
110 | +199912090047 | ||
111 | +199912090061 | ||
112 | +199912110041 | ||
113 | +199912110042 | ||
114 | +199912130055 | ||
115 | +199912130057 | ||
116 | +199912170065 | ||
117 | +199912180052 | ||
118 | +199912210018 | ||
119 | +199912210037 | ||
120 | +199912210040 | ||
121 | +199912220045 | ||
122 | +199912220046 | ||
123 | +199912220047 | ||
124 | +199912230058 | ||
125 | +199912230059 | ||
126 | +199912230097 | ||
127 | +199912280028 | ||
128 | +199912280044 | ||
129 | +199912280045 | ||
130 | +199912310085 | ||
131 | +199912310087 | ||
132 | +200001030047 | ||
133 | +200001030106 | ||
134 | +200001040030 | ||
135 | +200001040031 | ||
136 | +200001060052 | ||
137 | +200001060053 | ||
138 | +200001060055 | ||
139 | +200001070062 | ||
140 | +200001070066 | ||
141 | +200001080040 | ||
142 | +200001080041 | ||
143 | +200001140061 | ||
144 | +200001140064 | ||
145 | +200001170049 | ||
146 | +200001170051 | ||
147 | +200001170052 | ||
148 | +200001170053 | ||
149 | +200001180040 | ||
150 | +200001200056 | ||
151 | +200001220023 | ||
152 | +200001220118 | ||
153 | +200001240016 | ||
154 | +200001290042 | ||
155 | +200001310048 | ||
156 | +200001310049 | ||
157 | +200001310050 | ||
158 | +200001310054 | ||
159 | +200002090042 | ||
160 | +200002090043 | ||
161 | +200002120045 | ||
162 | +200002120046 | ||
163 | +200002160046 | ||
164 | +200002160047 | ||
165 | +200002250063 | ||
166 | +200002250065 | ||
167 | +200002250066 | ||
168 | +200002290044 | ||
169 | +200002290045 | ||
170 | +200002290046 | ||
171 | +200002290047 | ||
172 | +200002290048 | ||
173 | +200003010058 | ||
174 | +200003010059 | ||
175 | +200003060054 | ||
176 | +200003060055 | ||
177 | +200003060057 | ||
178 | +200003110047 | ||
179 | +200003110048 | ||
180 | +200003110049 | ||
181 | +200003210044 | ||
182 | +200003210045 | ||
183 | +200004120021 | ||
184 | +200004120022 | ||
185 | +200004120023 | ||
186 | +200004150048 | ||
187 | +200004150049 | ||
188 | +200004150050 | ||
189 | +200004170026 | ||
190 | +200004170065 | ||
191 | +200004220044 | ||
192 | +200004220045 | ||
193 | +200004220046 | ||
194 | +200004220047 | ||
195 | +200004220048 | ||
196 | +200005060030 | ||
197 | +200005150055 | ||
198 | +200005150059 | ||
199 | +200005300045 | ||
200 | +200005300047 | ||
201 | +200005300048 | ||
202 | +200006010065 | ||
203 | +200006010066 | ||
204 | +200006010067 | ||
205 | +200006050056 | ||
206 | +200006050057 | ||
207 | +200006050058 | ||
208 | +200006050059 | ||
209 | +200006050061 | ||
210 | +200006050068 | ||
211 | +200006070056 | ||
212 | +200006080033 | ||
213 | +200006120031 | ||
214 | +200006130055 | ||
215 | +200006130057 | ||
216 | +200006130059 | ||
217 | +200006260069 | ||
218 | +200006260071 | ||
219 | +200006270059 | ||
220 | +200007120068 | ||
221 | +200007120070 | ||
222 | +200007120072 | ||
223 | +200007170026 | ||
224 | +200007180051 | ||
225 | +200007240034 | ||
226 | +200007270050 | ||
227 | +200007280033 | ||
228 | +200008040071 | ||
229 | +200008040073 | ||
230 | +200008250077 | ||
231 | +200008250079 | ||
232 | +200008260055 | ||
233 | +200008310046 | ||
234 | +200010120066 | ||
235 | +200010120074 | ||
236 | +200010130063 | ||
237 | +200010140048 | ||
238 | +200010140049 | ||
239 | +200010160039 | ||
240 | +200010160048 | ||
241 | +200010160049 | ||
242 | +200010180059 | ||
243 | +200010180063 | ||
244 | +200010190066 | ||
245 | +200010190068 | ||
246 | +200011210063 | ||
247 | +200011210064 | ||
248 | +200011210066 | ||
249 | +200012050066 | ||
250 | +200012050067 | ||
251 | +200012050068 | ||
252 | +200012050069 | ||
253 | +200012050070 | ||
254 | +200012050071 | ||
255 | +200012080134 | ||
256 | +200012080137 | ||
257 | +200012110069 | ||
258 | +200012110070 | ||
259 | +200012110071 | ||
260 | +200012110075 | ||
261 | +200012120028 | ||
262 | +200012120068 | ||
263 | +200012120072 | ||
264 | +200012130056 | ||
265 | +200012130100 | ||
266 | +200012130102 | ||
267 | +200012130103 | ||
268 | +200012140095 | ||
269 | +200012140096 | ||
270 | +200012140097 | ||
271 | +200012140098 | ||
272 | +200012140099 | ||
273 | +200012140100 | ||
274 | +200012150076 | ||
275 | +200012160048 | ||
276 | +200012160049 | ||
277 | +200012180083 | ||
278 | +200012180084 | ||
279 | +200012180088 | ||
280 | +200012230028 | ||
281 | +200012230045 | ||
282 | +200012230046 | ||
283 | +200012230047 | ||
284 | +200012230048 | ||
285 | +200012230050 | ||
286 | +200012270055 | ||
287 | +200012270056 | ||
288 | +200101020059 | ||
289 | +200101020062 | ||
290 | +200101020063 | ||
291 | +200101020075 | ||
292 | +200101130048 | ||
293 | +200101130050 | ||
294 | +200101130051 | ||
295 | +200101130055 | ||
296 | +200101150043 | ||
297 | +200101150045 | ||
298 | +200101180050 | ||
299 | +200101180051 | ||
300 | +200101180052 | ||
301 | +200101200048 | ||
302 | +200101220047 | ||
303 | +200101220053 | ||
304 | +200102070011 | ||
305 | +200102070016 | ||
306 | +200102120034 | ||
307 | +200102120057 | ||
308 | +200102130014 | ||
309 | +200102150001 | ||
310 | +200102150014 | ||
311 | +200102160011 | ||
312 | +200102190016 | ||
313 | +200102220001 | ||
314 | +200102220013 | ||
315 | +200102270041 | ||
316 | +200102270062 | ||
317 | +200102280169 | ||
318 | +200103010049 | ||
319 | +200103060022 | ||
320 | +200103060032 | ||
321 | +200103060057 | ||
322 | +200103080026 | ||
323 | +200103080030 | ||
324 | +200103080036 | ||
325 | +200103100019 | ||
326 | +200103100021 | ||
327 | +200103100058 | ||
328 | +200103100062 | ||
329 | +200103130008 | ||
330 | +200103130023 | ||
331 | +200103130069 | ||
332 | +200103200066 | ||
333 | +200103200080 | ||
334 | +200103270069 | ||
335 | +200103310092 | ||
336 | +200104020007 | ||
337 | +200104050011 | ||
338 | +200104100021 | ||
339 | +200104100023 | ||
340 | +200104170015 | ||
341 | +200104170040 | ||
342 | +200104170055 | ||
343 | +200104170057 | ||
344 | +200104190039 | ||
345 | +200104190066 | ||
346 | +200104230031 | ||
347 | +200104230069 | ||
348 | +200104260051 | ||
349 | +200104260053 | ||
350 | +200104300213 | ||
351 | +200104300215 | ||
352 | +200104300217 | ||
353 | +200105020092 | ||
354 | +200105050042 | ||
355 | +200105050043 | ||
356 | +200105050046 | ||
357 | +200105050048 | ||
358 | +200105070017 | ||
359 | +200105140050 | ||
360 | +200105140052 | ||
361 | +200105220096 | ||
362 | +200105290074 | ||
363 | +200105290075 | ||
364 | +200106120068 | ||
365 | +200106120069 | ||
366 | +200106180051 | ||
367 | +200106180053 | ||
368 | +200106200064 | ||
369 | +200106220086 | ||
370 | +200106220087 | ||
371 | +200106220088 | ||
372 | +200106220090 | ||
373 | +200106250050 | ||
374 | +200107120071 | ||
375 | +200107120073 | ||
376 | +200107210129 | ||
377 | +200107240070 | ||
378 | +200107250080 | ||
379 | +200108060051 | ||
380 | +200108060155 | ||
381 | +200108060156 | ||
382 | +200108060157 | ||
383 | +200108070038 | ||
384 | +200108160040 | ||
385 | +200108180123 | ||
386 | +200108200033 | ||
387 | +200108210066 | ||
388 | +200108210074 | ||
389 | +200108270077 | ||
390 | +200108280064 | ||
391 | +200109060061 | ||
392 | +200109130091 | ||
393 | +200109250092 | ||
394 | +200109260097 | ||
395 | +200109270116 | ||
396 | +200110020075 | ||
397 | +200110150056 | ||
398 | +200110150062 | ||
399 | +200110200070 | ||
400 | +200110200071 | ||
401 | +200110220068 | ||
402 | +200111080086 | ||
403 | +200111140055 | ||
404 | +200111210078 | ||
405 | +200111240060 | ||
406 | +200112040031 | ||
407 | +200112040077 | ||
408 | +200112050063 | ||
409 | +200112100041 | ||
410 | +200112190067 | ||
411 | +200201280011 | ||
412 | +200201290029 | ||
413 | +200202280078 | ||
414 | +200203280057 | ||
415 | +200203290107 |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt
0 → 100644
1 | +199704210012 | ||
2 | +199704210042 | ||
3 | +199704220007 | ||
4 | +199704220018 | ||
5 | +199704220021 | ||
6 | +199704220044 | ||
7 | +199704230006 | ||
8 | +199704230014 | ||
9 | +199704230029 | ||
10 | +199704230043 | ||
11 | +199704240008 | ||
12 | +199704240019 | ||
13 | +199704240020 | ||
14 | +199704240021 | ||
15 | +199704250018 | ||
16 | +199704250022 | ||
17 | +199704260014 | ||
18 | +199704260015 | ||
19 | +199704260016 | ||
20 | +199704280023 | ||
21 | +199704280025 | ||
22 | +199704280027 | ||
23 | +199704280031 | ||
24 | +199704300031 | ||
25 | +199704300042 | ||
26 | +199704300046 | ||
27 | +199801020010 | ||
28 | +199801020031 | ||
29 | +199801020035 | ||
30 | +199801020070 | ||
31 | +199801020076 | ||
32 | +199801020079 | ||
33 | +199801030068 | ||
34 | +199801030090 | ||
35 | +199801030091 | ||
36 | +199801030129 | ||
37 | +199801030148 | ||
38 | +199801030158 | ||
39 | +199801050023 | ||
40 | +199801050059 | ||
41 | +199801130087 | ||
42 | +199801130129 | ||
43 | +199801140182 | ||
44 | +199801160119 | ||
45 | +199801200106 | ||
46 | +199801220140 | ||
47 | +199801240061 | ||
48 | +199801240096 | ||
49 | +199801260047 | ||
50 | +199801260070 | ||
51 | +199801270055 | ||
52 | +199801270110 | ||
53 | +199801280123 | ||
54 | +199801280158 | ||
55 | +199801280159 | ||
56 | +199801280241 | ||
57 | +199801290022 | ||
58 | +199801310003 | ||
59 | +199801310037 | ||
60 | +199802030127 | ||
61 | +199802040159 | ||
62 | +199802040182 | ||
63 | +199802040202 | ||
64 | +199805220133 | ||
65 | +199808280158 | ||
66 | +199901190073 | ||
67 | +199901190115 | ||
68 | +199901250112 | ||
69 | +199901250117 | ||
70 | +199901270103 | ||
71 | +199901270120 | ||
72 | +199901270122 | ||
73 | +199901290095 | ||
74 | +199901300101 | ||
75 | +199902240095 | ||
76 | +199906220029 | ||
77 | +199906230024 | ||
78 | +199906240084 | ||
79 | +199906260027 | ||
80 | +199907050045 | ||
81 | +199907050076 | ||
82 | +199907140166 | ||
83 | +199907200002 | ||
84 | +199907270004 | ||
85 | +199908260001 | ||
86 | +199909090036 | ||
87 | +199909250018 | ||
88 | +199909270029 | ||
89 | +199910020027 | ||
90 | +199910020029 | ||
91 | +199910270011 | ||
92 | +199911060044 | ||
93 | +199911100038 | ||
94 | +199911100064 | ||
95 | +199911200030 | ||
96 | +199911220063 | ||
97 | +199912020060 | ||
98 | +199912180026 | ||
99 | +199912180034 | ||
100 | +199912220030 | ||
101 | +199912280024 | ||
102 | +199912280046 | ||
103 | +199912300021 | ||
104 | +199912300029 | ||
105 | +200001030029 | ||
106 | +200001030053 | ||
107 | +200001060034 | ||
108 | +200001100035 | ||
109 | +200001100046 | ||
110 | +200001170029 | ||
111 | +200001170033 | ||
112 | +200001170060 | ||
113 | +200001290045 | ||
114 | +200002220027 | ||
115 | +200002240034 | ||
116 | +200002250031 | ||
117 | +200003060062 | ||
118 | +200003110050 | ||
119 | +200004280047 | ||
120 | +200004290022 | ||
121 | +200006050119 | ||
122 | +200006260079 | ||
123 | +200006290045 | ||
124 | +200007150033 | ||
125 | +200008040076 | ||
126 | +200008220042 | ||
127 | +200008220046 | ||
128 | +200010130049 | ||
129 | +200010160054 | ||
130 | +200012130034 | ||
131 | +200012140084 | ||
132 | +200012290046 | ||
133 | +200104040019 | ||
134 | +200106050035 | ||
135 | +200108180109 | ||
136 | +200108300032 | ||
137 | +200111120045 | ||
138 | +200111150042 | ||
139 | +200111150047 | ||
140 | +200111200036 | ||
141 | +200111270049 | ||
142 | +200112030055 | ||
143 | +200112280057 | ||
144 | +200201220038 | ||
145 | +200201220050 | ||
146 | +200202020036 | ||
147 | +200202200032 | ||
148 | +200202210054 | ||
149 | +200202270044 | ||
150 | +200203010070 | ||
151 | +200203190026 | ||
152 | +200203260050 | ||
153 | +200203280017 | ||
154 | +200203290078 |
nicolas-core/src/main/resources/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessTest.java renamed to nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; |
2 | 2 | ||
3 | +import com.google.common.collect.Lists; | ||
4 | +import org.junit.ClassRule; | ||
3 | import org.junit.Test; | 5 | import org.junit.Test; |
6 | +import org.junit.rules.TemporaryFolder; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
5 | 9 | ||
6 | import java.io.File; | 10 | import java.io.File; |
11 | +import java.util.List; | ||
12 | +import java.util.stream.Collectors; | ||
13 | + | ||
14 | +import static junit.framework.TestCase.assertEquals; | ||
15 | + | ||
16 | +public class NLPProcessIT { | ||
17 | + | ||
18 | + @ClassRule | ||
19 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
7 | 20 | ||
8 | -public class NLPProcessTest { | ||
9 | @Test | 21 | @Test |
10 | public void shouldProcessSampleText() throws Exception { | 22 | public void shouldProcessSampleText() throws Exception { |
11 | String text = "Ala ma kota. Ala ma też psa."; | 23 | String text = "Ala ma kota. Ala ma też psa."; |
12 | TText processed = NLPProcess.annotate(text); | 24 | TText processed = NLPProcess.annotate(text); |
13 | - processed.getParagraphs().stream().flatMap(p->p.getSentences().stream()).forEach(s->System.out.println(s.getId())); | ||
14 | - File targetFile = new File("sample_serialized_text.bin"); | 25 | + List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList()); |
26 | + assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids); | ||
27 | + | ||
28 | + File targetFile = TEMPORARY_FOLDER.newFile(); | ||
15 | NLPProcess.serialize(processed, targetFile); | 29 | NLPProcess.serialize(processed, targetFile); |
16 | } | 30 | } |
17 | } | 31 | } |
18 | \ No newline at end of file | 32 | \ No newline at end of file |
pom.xml
@@ -11,7 +11,7 @@ | @@ -11,7 +11,7 @@ | ||
11 | <packaging>pom</packaging> | 11 | <packaging>pom</packaging> |
12 | 12 | ||
13 | <modules> | 13 | <modules> |
14 | - <module>nicolas-core</module> | 14 | + <module>nicolas-lib</module> |
15 | <module>nicolas-cli</module> | 15 | <module>nicolas-cli</module> |
16 | <module>nicolas-model</module> | 16 | <module>nicolas-model</module> |
17 | <module>nicolas-train</module> | 17 | <module>nicolas-train</module> |
@@ -26,12 +26,13 @@ | @@ -26,12 +26,13 @@ | ||
26 | <utils.version>1.0</utils.version> | 26 | <utils.version>1.0</utils.version> |
27 | 27 | ||
28 | <commons-csv.version>1.4</commons-csv.version> | 28 | <commons-csv.version>1.4</commons-csv.version> |
29 | - <guava.version>19.0</guava.version> | ||
30 | - <weka-dev.version>3.9.0</weka-dev.version> | 29 | + <guava.version>20.0</guava.version> |
30 | + <weka-dev.version>3.9.1</weka-dev.version> | ||
31 | <commons-lang3.version>3.5</commons-lang3.version> | 31 | <commons-lang3.version>3.5</commons-lang3.version> |
32 | <commons-io.version>2.5</commons-io.version> | 32 | <commons-io.version>2.5</commons-io.version> |
33 | - <slf4j-api.version>1.7.12</slf4j-api.version> | 33 | + <slf4j-api.version>1.7.22</slf4j-api.version> |
34 | <junit.version>4.12</junit.version> | 34 | <junit.version>4.12</junit.version> |
35 | + <zip4j.version>1.3.2</zip4j.version> | ||
35 | </properties> | 36 | </properties> |
36 | 37 | ||
37 | <prerequisites> | 38 | <prerequisites> |
@@ -65,6 +66,16 @@ | @@ -65,6 +66,16 @@ | ||
65 | <artifactId>nicolas-zero</artifactId> | 66 | <artifactId>nicolas-zero</artifactId> |
66 | <version>${project.version}</version> | 67 | <version>${project.version}</version> |
67 | </dependency> | 68 | </dependency> |
69 | + <dependency> | ||
70 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
71 | + <artifactId>nicolas-lib</artifactId> | ||
72 | + <version>${project.version}</version> | ||
73 | + </dependency> | ||
74 | + <dependency> | ||
75 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
76 | + <artifactId>nicolas-train</artifactId> | ||
77 | + <version>${project.version}</version> | ||
78 | + </dependency> | ||
68 | 79 | ||
69 | <!-- internal --> | 80 | <!-- internal --> |
70 | <dependency> | 81 | <dependency> |
@@ -93,6 +104,12 @@ | @@ -93,6 +104,12 @@ | ||
93 | <groupId>nz.ac.waikato.cms.weka</groupId> | 104 | <groupId>nz.ac.waikato.cms.weka</groupId> |
94 | <artifactId>weka-dev</artifactId> | 105 | <artifactId>weka-dev</artifactId> |
95 | <version>${weka-dev.version}</version> | 106 | <version>${weka-dev.version}</version> |
107 | + <exclusions> | ||
108 | + <exclusion> | ||
109 | + <groupId>org.slf4j</groupId> | ||
110 | + <artifactId>slf4j-simple</artifactId> | ||
111 | + </exclusion> | ||
112 | + </exclusions> | ||
96 | </dependency> | 113 | </dependency> |
97 | <dependency> | 114 | <dependency> |
98 | <groupId>org.apache.commons</groupId> | 115 | <groupId>org.apache.commons</groupId> |
@@ -104,6 +121,11 @@ | @@ -104,6 +121,11 @@ | ||
104 | <artifactId>commons-io</artifactId> | 121 | <artifactId>commons-io</artifactId> |
105 | <version>${commons-io.version}</version> | 122 | <version>${commons-io.version}</version> |
106 | </dependency> | 123 | </dependency> |
124 | + <dependency> | ||
125 | + <groupId>net.lingala.zip4j</groupId> | ||
126 | + <artifactId>zip4j</artifactId> | ||
127 | + <version>${zip4j.version}</version> | ||
128 | + </dependency> | ||
107 | 129 | ||
108 | <!-- logging --> | 130 | <!-- logging --> |
109 | <dependency> | 131 | <dependency> |
@@ -111,6 +133,11 @@ | @@ -111,6 +133,11 @@ | ||
111 | <artifactId>slf4j-api</artifactId> | 133 | <artifactId>slf4j-api</artifactId> |
112 | <version>${slf4j-api.version}</version> | 134 | <version>${slf4j-api.version}</version> |
113 | </dependency> | 135 | </dependency> |
136 | + <dependency> | ||
137 | + <groupId>org.slf4j</groupId> | ||
138 | + <artifactId>slf4j-simple</artifactId> | ||
139 | + <version>${slf4j-api.version}</version> | ||
140 | + </dependency> | ||
114 | 141 | ||
115 | <!-- test --> | 142 | <!-- test --> |
116 | <dependency> | 143 | <dependency> |