Commit 76eeceb70c81d7fdfe3860db5a8576f0e4234daf
1 parent: f04fcb1a
large refactor
Showing 60 changed files with 1238 additions and 478 deletions.
.gitignore
nicolas-common/pom.xml
... | ... | @@ -27,6 +27,10 @@ |
27 | 27 | <groupId>nz.ac.waikato.cms.weka</groupId> |
28 | 28 | <artifactId>weka-dev</artifactId> |
29 | 29 | </dependency> |
30 | + <dependency> | |
31 | + <groupId>commons-io</groupId> | |
32 | + <artifactId>commons-io</artifactId> | |
33 | + </dependency> | |
30 | 34 | |
31 | 35 | <!-- logging --> |
32 | 36 | <dependency> |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
... | ... | @@ -2,26 +2,21 @@ package pl.waw.ipipan.zil.summ.nicolas.common; |
2 | 2 | |
3 | 3 | import com.google.common.base.Charsets; |
4 | 4 | import com.google.common.collect.ImmutableList; |
5 | -import weka.classifiers.Classifier; | |
6 | -import weka.classifiers.functions.SMO; | |
7 | -import weka.classifiers.meta.AdaBoostM1; | |
8 | -import weka.classifiers.meta.AttributeSelectedClassifier; | |
9 | -import weka.classifiers.rules.JRip; | |
10 | -import weka.classifiers.trees.J48; | |
11 | -import weka.classifiers.trees.RandomForest; | |
12 | 5 | |
13 | 6 | import java.nio.charset.Charset; |
14 | 7 | |
15 | 8 | |
16 | 9 | public class Constants { |
17 | 10 | |
18 | - public static final String MENTIONS_MODEL_PATH = "mentions_model.bin"; | |
19 | - public static final String SENTENCES_MODEL_PATH = "sentences_model.bin"; | |
20 | - public static final String ZERO_MODEL_PATH = "zeros_model.bin"; | |
11 | + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; | |
21 | 12 | |
22 | - public static final String MENTIONS_DATASET_PATH = "mentions_train.arff"; | |
23 | - public static final String SENTENCES_DATASET_PATH = "sentences_train.arff"; | |
24 | - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | |
13 | + private static final String MODELS_PATH = ROOT_PATH + "models/"; | |
14 | + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; | |
15 | + public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; | |
16 | + public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; | |
17 | + | |
18 | + private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; | |
19 | + public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; | |
25 | 20 | |
26 | 21 | public static final Charset ENCODING = Charsets.UTF_8; |
27 | 22 | |
... | ... | @@ -30,24 +25,4 @@ public class Constants { |
30 | 25 | private Constants() { |
31 | 26 | } |
32 | 27 | |
33 | - public static Classifier getMentionClassifier() { | |
34 | - RandomForest classifier = new RandomForest(); | |
35 | - classifier.setNumIterations(250); | |
36 | - classifier.setSeed(0); | |
37 | - classifier.setNumExecutionSlots(8); | |
38 | - return classifier; | |
39 | - } | |
40 | - | |
41 | - public static Classifier getSentencesClassifier() { | |
42 | - RandomForest classifier = new RandomForest(); | |
43 | - classifier.setNumIterations(10); | |
44 | - classifier.setSeed(0); | |
45 | - classifier.setNumExecutionSlots(8); | |
46 | - return classifier; | |
47 | - } | |
48 | - | |
49 | - public static Classifier getZerosClassifier() { | |
50 | - Classifier classifier = new J48(); | |
51 | - return classifier; | |
52 | - } | |
53 | 28 | } |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
... | ... | @@ -3,6 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.common; |
3 | 3 | import com.google.common.collect.Lists; |
4 | 4 | import com.google.common.collect.Maps; |
5 | 5 | import com.google.common.collect.Sets; |
6 | +import org.apache.commons.io.IOUtils; | |
6 | 7 | import org.slf4j.Logger; |
7 | 8 | import org.slf4j.LoggerFactory; |
8 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
... | ... | @@ -24,6 +25,47 @@ public class Utils { |
24 | 25 | |
25 | 26 | private static final String DATASET_NAME = "Dataset"; |
26 | 27 | |
28 | + private Utils() { | |
29 | + } | |
30 | + | |
31 | + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | |
32 | + LOG.info("Loading classifier from path: {}...", modelResourcePath); | |
33 | + try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { | |
34 | + if (stream == null) { | |
35 | + throw new IOException("Model not found at: " + modelResourcePath); | |
36 | + } | |
37 | + try (ObjectInputStream ois = new ObjectInputStream(stream)) { | |
38 | + Classifier classifier = (Classifier) ois.readObject(); | |
39 | + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | |
40 | + return classifier; | |
41 | + } catch (ClassNotFoundException e) { | |
42 | + LOG.error("Error loading serialized classifier, class not found.", e); | |
43 | + throw new IOException(e); | |
44 | + } | |
45 | + } | |
46 | + } | |
47 | + | |
48 | + public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { | |
49 | + try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { | |
50 | + if (stream == null) { | |
51 | + throw new IOException("Resource not found at: " + textResourcePath); | |
52 | + } | |
53 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | |
54 | + return (TText) ois.readObject(); | |
55 | + } catch (ClassNotFoundException e) { | |
56 | + LOG.error("Error reading serialized thrift text file, class not found.", e); | |
57 | + throw new IOException(e); | |
58 | + } | |
59 | + } | |
60 | + } | |
61 | + | |
62 | + public static List<String> loadLinesFromResource(String resourcePath) throws IOException { | |
63 | + try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { | |
64 | + return IOUtils.readLines(stream, Constants.ENCODING); | |
65 | + } | |
66 | + } | |
67 | + | |
68 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | |
27 | 69 | public static Instances createNewInstances(ArrayList<Attribute> attributesList) { |
28 | 70 | Instances instances = new Instances(DATASET_NAME, attributesList, 0); |
29 | 71 | instances.setClassIndex(0); |
... | ... |
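A short usage sketch of the new classpath-based loaders; it assumes the serialized mention model and the frequent-bases list are available on the classpath, e.g. via the nicolas-model module shown further down:

import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
import weka.classifiers.Classifier;

import java.util.List;

public class UtilsLoadersSketch {
    public static void main(String[] args) throws Exception {
        // Serialized weka model, read from the classpath instead of a working-directory file.
        Classifier mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
        // Plain-text resource read line by line via commons-io (the new dependency added above).
        List<String> frequentBases = Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH);
        System.out.println(mentionModel.getClass().getSimpleName() + " loaded, "
                + frequentBases.size() + " frequent bases");
    }
}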
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java
... | ... | @@ -8,10 +8,12 @@ import java.io.ObjectStreamClass; |
8 | 8 | |
9 | 9 | public class VersionIgnoringObjectInputStream extends ObjectInputStream { |
10 | 10 | |
11 | - public VersionIgnoringObjectInputStream(InputStream in) throws IOException { | |
11 | + VersionIgnoringObjectInputStream(InputStream in) throws IOException { | |
12 | 12 | super(in); |
13 | 13 | } |
14 | 14 | |
15 | + @Override | |
16 | + @SuppressWarnings("squid:S1166") | |
15 | 17 | protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { |
16 | 18 | ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor |
17 | 19 | Class localClass; // the class in the local JVM that this descriptor represents. |
... | ... |
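Only the first lines of the overridden readClassDescriptor are visible in this hunk; the remainder presumably follows the usual "ignore serialVersionUID mismatch" pattern, roughly as sketched here (not the commit's exact body):

@Override
@SuppressWarnings("squid:S1166")
protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException {
    ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // the stream's descriptor
    Class<?> localClass; // the class in the local JVM that this descriptor represents
    try {
        localClass = Class.forName(resultClassDescriptor.getName());
    } catch (ClassNotFoundException e) {
        return resultClassDescriptor; // no local version of the class, keep the stream's descriptor
    }
    ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass);
    if (localClassDescriptor != null
            && localClassDescriptor.getSerialVersionUID() != resultClassDescriptor.getSerialVersionUID()) {
        // version mismatch: trust the local class and ignore the serialized version
        resultClassDescriptor = localClassDescriptor;
    }
    return resultClassDescriptor;
}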
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java
... | ... | @@ -17,6 +17,7 @@ public class FeatureExtractor { |
17 | 17 | |
18 | 18 | private final Set<String> normalizedAttributes = Sets.newHashSet(); |
19 | 19 | |
20 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | |
20 | 21 | public ArrayList<Attribute> getAttributesList() { |
21 | 22 | return Lists.newArrayList(sortedAttributes); |
22 | 23 | } |
... | ... | @@ -46,15 +47,14 @@ public class FeatureExtractor { |
46 | 47 | protected void fillSortedAttributes(String scoreAttName) { |
47 | 48 | sortedAttributes.addAll(name2attribute.values()); |
48 | 49 | sortedAttributes.remove(getAttributeByName(scoreAttName)); |
49 | - Collections.sort(sortedAttributes, (o1, o2) -> name2attribute.inverse().get(o1).compareTo(name2attribute.inverse().get(o2))); | |
50 | + sortedAttributes.sort(Comparator.comparing(name2attribute.inverse()::get)); | |
50 | 51 | sortedAttributes.add(0, getAttributeByName(scoreAttName)); |
51 | 52 | } |
52 | 53 | |
53 | 54 | protected <T> void addNormalizedAttributeValues(Map<T, Map<Attribute, Double>> entity2attributes) { |
54 | 55 | Map<Attribute, Double> attribute2max = Maps.newHashMap(); |
55 | 56 | Map<Attribute, Double> attribute2min = Maps.newHashMap(); |
56 | - for (T entity : entity2attributes.keySet()) { | |
57 | - Map<Attribute, Double> entityAttributes = entity2attributes.get(entity); | |
57 | + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { | |
58 | 58 | for (String attributeName : normalizedAttributes) { |
59 | 59 | Attribute attribute = getAttributeByName(attributeName); |
60 | 60 | Double value = entityAttributes.get(attribute); |
... | ... | @@ -66,8 +66,7 @@ public class FeatureExtractor { |
66 | 66 | attribute2min.compute(attribute, (k, v) -> Math.min(v, value)); |
67 | 67 | } |
68 | 68 | } |
69 | - for (T mention : entity2attributes.keySet()) { | |
70 | - Map<Attribute, Double> entityAttributes = entity2attributes.get(mention); | |
69 | + for (Map<Attribute, Double> entityAttributes : entity2attributes.values()) { | |
71 | 70 | for (Attribute attribute : attribute2max.keySet()) { |
72 | 71 | Attribute normalizedAttribute = getAttributeByName(name2attribute.inverse().get(attribute) + "_normalized"); |
73 | 72 | entityAttributes.put(normalizedAttribute, |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
... | ... | @@ -174,11 +174,11 @@ public class FeatureHelper { |
174 | 174 | } |
175 | 175 | |
176 | 176 | public boolean isNested(TMention mention) { |
177 | - return mentions.stream().anyMatch(m -> m.getChildIds().containsAll(mention.getChildIds())); | |
177 | + return mentions.stream().anyMatch(m -> !m.equals(mention) && m.getChildIds().containsAll(mention.getChildIds())); | |
178 | 178 | } |
179 | 179 | |
180 | 180 | public boolean isNesting(TMention mention) { |
181 | - return mentions.stream().anyMatch(m -> mention.getChildIds().containsAll(m.getChildIds())); | |
181 | + return mentions.stream().anyMatch(m -> !m.equals(mention) && mention.getChildIds().containsAll(m.getChildIds())); | |
182 | 182 | } |
183 | 183 | |
184 | 184 | public Set<TCoreference> getClusters() { |
... | ... |
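The added !m.equals(mention) guard matters because a set always contains all of its own elements, so the unguarded stream matched the mention against itself and both predicates returned true for every mention. A tiny illustration with hypothetical token ids:

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

public class NestingGuardSketch {
    public static void main(String[] args) {
        Set<String> childIds = new HashSet<>(Arrays.asList("t1", "t2"));
        // Self-comparison always succeeds, which previously made every mention
        // count as both nested and nesting.
        System.out.println(childIds.containsAll(childIds)); // prints: true
    }
}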
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java
... | ... | @@ -33,6 +33,7 @@ public class Interpretation { |
33 | 33 | person = split[3]; |
34 | 34 | break; |
35 | 35 | case "siebie": |
36 | + case "prep": | |
36 | 37 | casee = split[0]; |
37 | 38 | break; |
38 | 39 | case "fin": |
... | ... | @@ -47,9 +48,6 @@ public class Interpretation { |
47 | 48 | number = split[0]; |
48 | 49 | gender = split[1]; |
49 | 50 | break; |
50 | - case "prep": | |
51 | - casee = split[0]; | |
52 | - break; | |
53 | 51 | default: |
54 | 52 | break; |
55 | 53 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/TrainModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | |
2 | - | |
3 | -import org.apache.commons.lang3.time.StopWatch; | |
4 | -import org.slf4j.Logger; | |
5 | -import org.slf4j.LoggerFactory; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | -import weka.classifiers.Classifier; | |
8 | -import weka.core.Instances; | |
9 | -import weka.core.converters.ArffLoader; | |
10 | - | |
11 | -import java.io.File; | |
12 | -import java.io.FileOutputStream; | |
13 | -import java.io.ObjectOutputStream; | |
14 | - | |
15 | - | |
16 | -public class TrainModel { | |
17 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | - | |
19 | - public static void main(String[] args) throws Exception { | |
20 | - | |
21 | - ArffLoader loader = new ArffLoader(); | |
22 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
23 | - Instances instances = loader.getDataSet(); | |
24 | - instances.setClassIndex(0); | |
25 | - LOG.info(instances.size() + " instances loaded."); | |
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | - | |
28 | - StopWatch watch = new StopWatch(); | |
29 | - watch.start(); | |
30 | - | |
31 | - Classifier classifier = Constants.getMentionClassifier(); | |
32 | - | |
33 | - LOG.info("Building classifier..."); | |
34 | - classifier.buildClassifier(instances); | |
35 | - LOG.info("...done."); | |
36 | - | |
37 | - try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | - new FileOutputStream(Constants.MENTIONS_MODEL_PATH))) { | |
39 | - oos.writeObject(classifier); | |
40 | - } | |
41 | - | |
42 | - watch.stop(); | |
43 | - LOG.info("Elapsed time: " + watch); | |
44 | - | |
45 | - LOG.info(classifier.toString()); | |
46 | - } | |
47 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | - | |
3 | -import org.slf4j.Logger; | |
4 | -import org.slf4j.LoggerFactory; | |
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | |
7 | -import weka.core.Instances; | |
8 | -import weka.core.converters.ArffLoader; | |
9 | - | |
10 | -import java.io.File; | |
11 | - | |
12 | - | |
13 | -public class Crossvalidate { | |
14 | - | |
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
16 | - | |
17 | - private Crossvalidate() { | |
18 | - } | |
19 | - | |
20 | - public static void main(String[] args) throws Exception { | |
21 | - ArffLoader loader = new ArffLoader(); | |
22 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
23 | - Instances instances = loader.getDataSet(); | |
24 | - instances.setClassIndex(0); | |
25 | - LOG.info(instances.size() + " instances loaded."); | |
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | - | |
28 | - EvalUtils.crossvalidateClassification(instances); | |
29 | - } | |
30 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention.test; | |
2 | - | |
3 | -import org.apache.commons.lang3.time.StopWatch; | |
4 | -import org.slf4j.Logger; | |
5 | -import org.slf4j.LoggerFactory; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | -import weka.classifiers.Classifier; | |
8 | -import weka.classifiers.evaluation.Evaluation; | |
9 | -import weka.core.Instances; | |
10 | -import weka.core.converters.ArffLoader; | |
11 | - | |
12 | -import java.io.File; | |
13 | -import java.io.FileInputStream; | |
14 | -import java.io.IOException; | |
15 | -import java.io.ObjectInputStream; | |
16 | - | |
17 | - | |
18 | -public class Validate { | |
19 | - private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | |
20 | - | |
21 | - public static void main(String[] args) throws Exception { | |
22 | - | |
23 | - ArffLoader loader = new ArffLoader(); | |
24 | - loader.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
25 | - Instances instances = loader.getDataSet(); | |
26 | - instances.setClassIndex(0); | |
27 | - LOG.info(instances.size() + " instances loaded."); | |
28 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
29 | - | |
30 | - Classifier classifier = loadClassifier(); | |
31 | - | |
32 | - StopWatch watch = new StopWatch(); | |
33 | - watch.start(); | |
34 | - | |
35 | - Evaluation eval = new Evaluation(instances); | |
36 | - eval.evaluateModel(classifier, instances); | |
37 | - | |
38 | - LOG.info(eval.toSummaryString()); | |
39 | - | |
40 | - watch.stop(); | |
41 | - LOG.info("Elapsed time: " + watch); | |
42 | - } | |
43 | - | |
44 | - private static Classifier loadClassifier() throws IOException, ClassNotFoundException { | |
45 | - LOG.info("Loading classifier..."); | |
46 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(Constants.MENTIONS_MODEL_PATH))) { | |
47 | - Classifier classifier = (Classifier) ois.readObject(); | |
48 | - LOG.info("Done. " + classifier.toString()); | |
49 | - return classifier; | |
50 | - } | |
51 | - } | |
52 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/TrainModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
2 | - | |
3 | -import org.apache.commons.lang3.time.StopWatch; | |
4 | -import org.slf4j.Logger; | |
5 | -import org.slf4j.LoggerFactory; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | -import weka.classifiers.Classifier; | |
8 | -import weka.core.Instances; | |
9 | -import weka.core.converters.ArffLoader; | |
10 | - | |
11 | -import java.io.File; | |
12 | -import java.io.FileOutputStream; | |
13 | -import java.io.ObjectOutputStream; | |
14 | - | |
15 | - | |
16 | -public class TrainModel { | |
17 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
18 | - | |
19 | - public static void main(String[] args) throws Exception { | |
20 | - | |
21 | - ArffLoader loader = new ArffLoader(); | |
22 | - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
23 | - Instances instances = loader.getDataSet(); | |
24 | - instances.setClassIndex(0); | |
25 | - LOG.info(instances.size() + " instances loaded."); | |
26 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
27 | - | |
28 | - StopWatch watch = new StopWatch(); | |
29 | - watch.start(); | |
30 | - | |
31 | - Classifier classifier = Constants.getSentencesClassifier(); | |
32 | - | |
33 | - LOG.info("Building classifier..."); | |
34 | - classifier.buildClassifier(instances); | |
35 | - LOG.info("...done."); | |
36 | - | |
37 | - try (ObjectOutputStream oos = new ObjectOutputStream( | |
38 | - new FileOutputStream(Constants.SENTENCES_MODEL_PATH))) { | |
39 | - oos.writeObject(classifier); | |
40 | - } | |
41 | - | |
42 | - watch.stop(); | |
43 | - LOG.info("Elapsed time: " + watch); | |
44 | - | |
45 | - LOG.info(classifier.toString()); | |
46 | - } | |
47 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | |
2 | - | |
3 | -import org.slf4j.Logger; | |
4 | -import org.slf4j.LoggerFactory; | |
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | |
7 | -import weka.core.Instances; | |
8 | -import weka.core.converters.ArffLoader; | |
9 | - | |
10 | -import java.io.File; | |
11 | - | |
12 | - | |
13 | -public class Crossvalidate { | |
14 | - | |
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
16 | - | |
17 | - private Crossvalidate() { | |
18 | - } | |
19 | - | |
20 | - public static void main(String[] args) throws Exception { | |
21 | - | |
22 | - ArffLoader loader = new ArffLoader(); | |
23 | - loader.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
24 | - Instances instances = loader.getDataSet(); | |
25 | - instances.setClassIndex(0); | |
26 | - LOG.info(instances.size() + " instances loaded."); | |
27 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | - | |
29 | - EvalUtils.crossvalidateRegression(instances); | |
30 | - } | |
31 | -} |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/test/Crossvalidate.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero.test; | |
2 | - | |
3 | -import org.slf4j.Logger; | |
4 | -import org.slf4j.LoggerFactory; | |
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | |
7 | -import weka.core.Instances; | |
8 | -import weka.core.converters.ArffLoader; | |
9 | - | |
10 | -import java.io.File; | |
11 | - | |
12 | - | |
13 | -public class Crossvalidate { | |
14 | - | |
15 | - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
16 | - | |
17 | - private Crossvalidate() { | |
18 | - } | |
19 | - | |
20 | - public static void main(String[] args) throws Exception { | |
21 | - | |
22 | - ArffLoader loader = new ArffLoader(); | |
23 | - loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | |
24 | - Instances instances = loader.getDataSet(); | |
25 | - instances.setClassIndex(0); | |
26 | - LOG.info(instances.size() + " instances loaded."); | |
27 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
28 | - | |
29 | - EvalUtils.crossvalidateClassification(instances); | |
30 | - } | |
31 | -} |
nicolas-core/pom.xml renamed to nicolas-lib/pom.xml
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -11,6 +11,7 @@ import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
13 | 13 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; |
14 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
14 | 15 | import weka.classifiers.Classifier; |
15 | 16 | |
16 | 17 | import java.io.IOException; |
... | ... | @@ -20,22 +21,27 @@ import static java.util.stream.Collectors.toList; |
20 | 21 | |
21 | 22 | public class Nicolas { |
22 | 23 | |
23 | - private final Classifier sentenceClassifier; | |
24 | - private final Classifier mentionClassifier; | |
25 | - private final MentionFeatureExtractor featureExtractor; | |
24 | + private final Classifier mentionModel; | |
25 | + private final Classifier sentenceModel; | |
26 | + private final Classifier zeroModel; | |
27 | + | |
28 | + private final MentionFeatureExtractor mentionFeatureExtractor; | |
26 | 29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | + private final ZeroFeatureExtractor zeroFeatureExtractor; | |
27 | 31 | |
28 | 32 | public Nicolas() throws IOException, ClassNotFoundException { |
29 | - mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
30 | - featureExtractor = new MentionFeatureExtractor(); | |
33 | + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
34 | + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
35 | + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
31 | 36 | |
32 | - sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | |
37 | + mentionFeatureExtractor = new MentionFeatureExtractor(); | |
33 | 38 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
39 | + zeroFeatureExtractor = new ZeroFeatureExtractor(); | |
34 | 40 | } |
35 | 41 | |
36 | 42 | public String summarizeThrift(TText text, int targetTokenCount) throws Exception { |
37 | 43 | Set<TMention> goodMentions |
38 | - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | |
44 | + = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); | |
39 | 45 | return calculateSummary(text, goodMentions, targetTokenCount); |
40 | 46 | } |
41 | 47 | |
... | ... | @@ -52,10 +58,10 @@ public class Nicolas { |
52 | 58 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { |
53 | 59 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
54 | 60 | |
55 | - Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceClassifier, sentenceFeatureExtractor); | |
61 | + Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); | |
56 | 62 | |
57 | 63 | List<TSentence> sortedSents = Lists.newArrayList(sents); |
58 | - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | |
64 | + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | |
59 | 65 | |
60 | 66 | int size = 0; |
61 | 67 | Random r = new Random(1); |
... | ... |
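A usage sketch of the refactored facade, mirroring the new NicolasTest added further down; the sample resource path is the one introduced under the test resources and the target token count is illustrative:

import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
import pl.waw.ipipan.zil.summ.nicolas.common.Utils;

public class NicolasUsageSketch {
    public static void main(String[] args) throws Exception {
        // Loads the mention, sentence and zero models from the classpath.
        Nicolas nicolas = new Nicolas();
        TText text = Utils.loadThriftTextFromResource(
                "/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift");
        String summary = nicolas.summarizeThrift(text, 100); // target summary length in tokens
        System.out.println(summary);
    }
}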
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas; |
2 | 2 | |
3 | -import com.google.common.base.Charsets; | |
4 | 3 | import com.google.common.collect.Maps; |
5 | -import com.google.common.io.Files; | |
6 | 4 | import org.slf4j.Logger; |
7 | 5 | import org.slf4j.LoggerFactory; |
8 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
12 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionScorer; | |
13 | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
14 | 11 | import weka.core.Attribute; |
15 | 12 | import weka.core.DenseInstance; |
16 | 13 | import weka.core.Instance; |
17 | 14 | |
18 | -import java.io.File; | |
19 | -import java.io.IOException; | |
20 | 15 | import java.util.List; |
21 | 16 | import java.util.Map; |
22 | 17 | import java.util.Set; |
... | ... | @@ -30,16 +25,6 @@ public class ThriftUtils { |
30 | 25 | private ThriftUtils() { |
31 | 26 | } |
32 | 27 | |
33 | - public static Set<TMention> loadGoldGoodMentions(String id, TText text, boolean dev) throws IOException { | |
34 | - String optimalSummary = Files.toString(new File("src/main/resources/optimal_summaries/" + (dev ? "dev" : "test") + "/" + id + "_theoretic_ub_rouge_1.txt"), Charsets.UTF_8); | |
35 | - | |
36 | - MentionScorer scorer = new MentionScorer(); | |
37 | - Map<TMention, Double> mention2score = scorer.calculateMentionScores(optimalSummary, text); | |
38 | - | |
39 | - mention2score.keySet().removeIf(tMention -> mention2score.get(tMention) != 1.0); | |
40 | - return mention2score.keySet(); | |
41 | - } | |
42 | - | |
43 | 28 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { |
44 | 29 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
45 | 30 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel2.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
... | ... | @@ -26,18 +26,18 @@ import java.util.*; |
26 | 26 | |
27 | 27 | import static java.util.stream.Collectors.toList; |
28 | 28 | |
29 | -public class ApplyModel2 { | |
29 | +public class ApplyModel { | |
30 | 30 | |
31 | - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel2.class); | |
31 | + private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); | |
32 | 32 | |
33 | 33 | private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; |
34 | 34 | private static final String TARGET_DIR = "corpora/summaries"; |
35 | 35 | |
36 | 36 | public static void main(String[] args) throws Exception { |
37 | - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
37 | + Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | |
38 | 38 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
39 | 39 | |
40 | - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCES_MODEL_PATH); | |
40 | + Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
41 | 41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
42 | 42 | |
43 | 43 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); |
... | ... | @@ -102,7 +102,7 @@ public class ApplyModel2 { |
102 | 102 | } |
103 | 103 | |
104 | 104 | List<TSentence> sortedSents = Lists.newArrayList(sents); |
105 | - Collections.sort(sortedSents, Comparator.comparing(sentence2score::get).reversed()); | |
105 | + sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | |
106 | 106 | |
107 | 107 | int size = 0; |
108 | 108 | Random r = new Random(1); |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.mention; |
2 | 2 | |
3 | -import com.google.common.collect.*; | |
3 | +import com.google.common.collect.Lists; | |
4 | +import com.google.common.collect.Maps; | |
4 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
6 | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
7 | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
8 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; |
9 | 11 | import weka.core.Attribute; |
10 | 12 | |
11 | -import java.io.File; | |
12 | 13 | import java.io.IOException; |
13 | -import java.nio.file.Files; | |
14 | -import java.util.*; | |
14 | +import java.util.List; | |
15 | +import java.util.Map; | |
15 | 16 | import java.util.stream.Collectors; |
16 | -import java.util.stream.Stream; | |
17 | 17 | |
18 | 18 | |
19 | 19 | public class MentionFeatureExtractor extends FeatureExtractor { |
20 | 20 | |
21 | - private final List<String> frequentBases = Lists.newArrayList(); | |
21 | + private final List<String> frequentBases; | |
22 | 22 | |
23 | - public MentionFeatureExtractor() { | |
23 | + public MentionFeatureExtractor() throws IOException { | |
24 | + frequentBases = loadFrequentBases(); | |
24 | 25 | |
25 | 26 | //coref |
26 | 27 | addNumericAttributeNormalized("chain_length"); |
... | ... | @@ -70,7 +71,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
70 | 71 | addBinaryAttribute(prefix + "_sent_ends_with_questionmark"); |
71 | 72 | |
72 | 73 | // frequent bases |
73 | - loadFrequentBases(); | |
74 | 74 | for (String base : frequentBases) { |
75 | 75 | addBinaryAttribute(prefix + "_" + encodeBase(base)); |
76 | 76 | } |
... | ... | @@ -80,17 +80,12 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
80 | 80 | fillSortedAttributes("score"); |
81 | 81 | } |
82 | 82 | |
83 | - private String encodeBase(String base) { | |
84 | - return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | |
83 | + private List<String> loadFrequentBases() throws IOException { | |
84 | + return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); | |
85 | 85 | } |
86 | 86 | |
87 | - private void loadFrequentBases() { | |
88 | - try { | |
89 | - Stream<String> lines = Files.lines(new File("frequent_bases.txt").toPath()); | |
90 | - this.frequentBases.addAll(lines.map(String::trim).collect(Collectors.toList())); | |
91 | - } catch (IOException e) { | |
92 | - e.printStackTrace(); | |
93 | - } | |
87 | + private String encodeBase(String base) { | |
88 | + return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | |
94 | 89 | } |
95 | 90 | |
96 | 91 | public Map<TMention, Map<Attribute, Double>> calculateFeatures(TText preprocessedText) { |
... | ... | @@ -123,8 +118,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
123 | 118 | attribute2value.put(getAttributeByName("text_par_count"), (double) pars.size()); |
124 | 119 | attribute2value.put(getAttributeByName("text_mention_count"), (double) helper.getMentions().size()); |
125 | 120 | attribute2value.put(getAttributeByName("text_cluster_count"), (double) helper.getClusters().size()); |
126 | - | |
127 | - assert (attribute2value.size() == getAttributesList().size()); | |
128 | 121 | } |
129 | 122 | addNormalizedAttributeValues(result); |
130 | 123 | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... | ... | @@ -87,7 +87,6 @@ public class SentenceFeatureExtractor extends FeatureExtractor { |
87 | 87 | feature2value.put(getAttributeByName("score"), weka.core.Utils.missingValue()); |
88 | 88 | |
89 | 89 | feature2value.remove(null); |
90 | - assert (feature2value.size() == getAttributesList().size()); | |
91 | 90 | |
92 | 91 | sentence2features.put(sentence, feature2value); |
93 | 92 | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
5 | +import weka.core.Attribute; | |
6 | +import weka.core.DenseInstance; | |
7 | +import weka.core.Instance; | |
8 | + | |
9 | +import java.util.List; | |
10 | +import java.util.Map; | |
11 | + | |
12 | +public class InstanceCreator { | |
13 | + | |
14 | + private InstanceCreator() { | |
15 | + } | |
16 | + | |
17 | + public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | |
18 | + Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | |
19 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | |
20 | + for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | |
21 | + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
22 | + Map<Attribute, Double> sentenceFeatures = entry.getValue(); | |
23 | + for (Attribute attribute : featureExtractor.getAttributesList()) { | |
24 | + instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
25 | + } | |
26 | + candidate2instance.put(entry.getKey(), instance); | |
27 | + } | |
28 | + return candidate2instance; | |
29 | + } | |
30 | + | |
31 | +} | |
... | ... |
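A sketch of how the new helper is consumed, mirroring the call in ZeroSubjectInjector below; the wrapper method and its arguments are illustrative:

import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
import weka.core.Instance;

import java.util.List;
import java.util.Map;
import java.util.Set;

public class InstanceCreatorUsageSketch {
    // text: a preprocessed thrift document; summarySentenceIds: ids of sentences already selected for the summary
    public static Map<ZeroSubjectCandidate, Instance> buildInstances(TText text, Set<String> summarySentenceIds) throws Exception {
        List<ZeroSubjectCandidate> candidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds);
        ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
        return InstanceCreator.extractInstancesFromZeroCandidates(candidates, text, featureExtractor);
    }
}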
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... | ... | @@ -4,6 +4,7 @@ import com.google.common.collect.Lists; |
4 | 4 | import com.google.common.collect.Maps; |
5 | 5 | import com.google.common.collect.Sets; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
7 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
... | ... | @@ -18,18 +19,56 @@ import java.util.Set; |
18 | 19 | |
19 | 20 | public class ZeroFeatureExtractor extends FeatureExtractor { |
20 | 21 | |
22 | + private static final String SCORE = "score"; | |
23 | + | |
24 | + private static final String ANTECEDENT_PREFIX = "antecedent"; | |
25 | + private static final String CANDIDATE_PREFIX = "candidate"; | |
26 | + | |
27 | + private static final String SENTENCE_ENDS_WITH_QUESTION_MARK = "_sentence_ends_with_question_mark"; | |
28 | + private static final String IS_NAMED = "_is_named"; | |
29 | + private static final String TOKEN_COUNT = "_token_count"; | |
30 | + private static final String FIRST_TOKEN_INDEX_IN_SENT = "_first_token_index_in_sent"; | |
31 | + private static final String INDEX_IN_SENT = "_index_in_sent"; | |
32 | + private static final String PREV_TOKEN_POS = "_prev_token_pos"; | |
33 | + private static final String NEXT_TOKEN_POS = "_next_token_pos"; | |
34 | + private static final String IS_NESTING = "_is_nesting"; | |
35 | + private static final String IS_NESTED = "_is_nested"; | |
36 | + private static final String SENTENCE_MENTION_COUNT = "_sentence_mention_count"; | |
37 | + private static final String SENTENCE_TOKEN_LENGTH = "_sentence_token_length"; | |
38 | + private static final String IS_PAN_OR_PANI = "_is_pan_or_pani"; | |
39 | + | |
40 | + // private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet( | |
41 | +// "zespół", "tylko", "gdy", ".", ":", "też", "kandydat", "do", "dziś", "bo", "by", "z", "a", "jednak", "jak", "który", "ale", "czy", "i", "się", "rok", "-", "\"", "to", "być", "że", ","); | |
42 | + private static final Set<String> PREV_TOKEN_LEMMAS = Sets.newHashSet("to", "z", "do", "o", "czyli", "nie", "\"", "też", "jak", "czy"); | |
43 | + | |
44 | + private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet(); | |
45 | +// private static final Set<String> NEXT_TOKEN_LEMMAS = Sets.newHashSet( | |
46 | +// "mówić", "ii", "twierdzić", "już", "(", "budzić", "stanowić", "powinien", "do", "stać", "musieć", "stanąć", "móc", "o", "chcieć", "się", "-", "zostać", ":", "?", "i", "na", "z", "mieć", "\"", "to", "w", "nie", "być", ".", ","); | |
47 | + | |
48 | + private static final String PREV_TOKEN_LEMMA = "_prev_token_lemma_equal_"; | |
49 | + private static final String NEXT_TOKEN_LEMMA = "_next_token_lemma_equal_"; | |
50 | + | |
21 | 51 | public ZeroFeatureExtractor() { |
22 | 52 | |
23 | - for (String prefix : new String[]{"antecedent", "candidate"}) { | |
24 | - addNumericAttribute(prefix + "_index_in_sent"); | |
25 | - addNumericAttribute(prefix + "_first_token_index_in_sent"); | |
26 | - addNumericAttribute(prefix + "_token_count"); | |
27 | - addBinaryAttribute(prefix + "_is_named"); | |
28 | - addNumericAttribute(prefix + "_sentence_mention_count"); | |
29 | - addNominalAttribute(prefix + "_next_token_pos", Constants.POS_TAGS); | |
30 | - addNominalAttribute(prefix + "_prev_token_pos", Constants.POS_TAGS); | |
31 | - addBinaryAttribute(prefix + "_is_nested"); | |
32 | - addBinaryAttribute(prefix + "_is_nesting"); | |
53 | + for (String prefix : new String[]{ANTECEDENT_PREFIX, CANDIDATE_PREFIX}) { | |
54 | + addNumericAttribute(prefix + INDEX_IN_SENT); | |
55 | + addNumericAttribute(prefix + FIRST_TOKEN_INDEX_IN_SENT); | |
56 | + addNumericAttribute(prefix + TOKEN_COUNT); | |
57 | + addBinaryAttribute(prefix + IS_NAMED); | |
58 | + addBinaryAttribute(prefix + IS_PAN_OR_PANI); | |
59 | + addNominalAttribute(prefix + NEXT_TOKEN_POS, Constants.POS_TAGS); | |
60 | + addNominalAttribute(prefix + PREV_TOKEN_POS, Constants.POS_TAGS); | |
61 | + for (String prevLemma : PREV_TOKEN_LEMMAS) { | |
62 | + addBinaryAttribute(prefix + PREV_TOKEN_LEMMA + prevLemma); | |
63 | + } | |
64 | + for (String nextLemma : NEXT_TOKEN_LEMMAS) { | |
65 | + addBinaryAttribute(prefix + NEXT_TOKEN_LEMMA + nextLemma); | |
66 | + } | |
67 | + addBinaryAttribute(prefix + IS_NESTED); | |
68 | + addBinaryAttribute(prefix + IS_NESTING); | |
69 | + addNumericAttribute(prefix + SENTENCE_MENTION_COUNT); | |
70 | + addNumericAttribute(prefix + SENTENCE_TOKEN_LENGTH); | |
71 | + addBinaryAttribute(prefix + SENTENCE_ENDS_WITH_QUESTION_MARK); | |
33 | 72 | } |
34 | 73 | |
35 | 74 | addNumericAttribute("chain_length"); |
... | ... | @@ -43,8 +82,8 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
43 | 82 | addNumericAttribute("pair_sent_distance"); |
44 | 83 | addNumericAttribute("pair_par_distance"); |
45 | 84 | |
46 | - addNominalAttribute("score", Lists.newArrayList("bad", "good")); | |
47 | - fillSortedAttributes("score"); | |
85 | + addNominalAttribute(SCORE, Lists.newArrayList("bad", "good")); | |
86 | + fillSortedAttributes(SCORE); | |
48 | 87 | } |
49 | 88 | |
50 | 89 | public Map<ZeroSubjectCandidate, Map<Attribute, Double>> calculateFeatures(List<ZeroSubjectCandidate> candidates, TText text) { |
... | ... | @@ -62,13 +101,13 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
62 | 101 | private Map<Attribute, Double> calculateFeatures(ZeroSubjectCandidate candidate, FeatureHelper helper) { |
63 | 102 | |
64 | 103 | Map<Attribute, Double> candidateFeatures = Maps.newHashMap(); |
65 | - candidateFeatures.put(getAttributeByName("score"), weka.core.Utils.missingValue()); | |
104 | + candidateFeatures.put(getAttributeByName(SCORE), weka.core.Utils.missingValue()); | |
66 | 105 | |
67 | 106 | TMention mention = candidate.getZeroCandidateMention(); |
68 | 107 | TMention antecedent = candidate.getPreviousSentence().getMentions().stream().filter(ante -> helper.getCoreferentMentions(mention).contains(ante)).findFirst().get(); |
69 | 108 | |
70 | - addMentionFeatures(helper, candidateFeatures, mention, "candidate"); | |
71 | - addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); | |
109 | + addMentionFeatures(helper, candidateFeatures, mention, CANDIDATE_PREFIX); | |
110 | + addMentionFeatures(helper, candidateFeatures, antecedent, ANTECEDENT_PREFIX); | |
72 | 111 | |
73 | 112 | candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent)))); |
74 | 113 | candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent)))); |
... | ... | @@ -98,28 +137,41 @@ public class ZeroFeatureExtractor extends FeatureExtractor { |
98 | 137 | } |
99 | 138 | |
100 | 139 | private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { |
101 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | |
102 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_first_token_index_in_sent"), (double) helper.getMentionFirstTokenIndex(mention)); | |
140 | + candidateFeatures.put(getAttributeByName(attributePrefix + INDEX_IN_SENT), (double) helper.getMentionIndexInSent(mention)); | |
141 | + candidateFeatures.put(getAttributeByName(attributePrefix + FIRST_TOKEN_INDEX_IN_SENT), (double) helper.getMentionFirstTokenIndex(mention)); | |
103 | 142 | |
104 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | |
105 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | |
106 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_sentence_mention_count"), (double) helper.getMentionSentence(mention).getMentions().size()); | |
143 | + candidateFeatures.put(getAttributeByName(attributePrefix + TOKEN_COUNT), (double) mention.getChildIdsSize()); | |
144 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NAMED), toBinary(helper.isMentionNamedEntity(mention))); | |
145 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_PAN_OR_PANI), toBinary(helper.getMentionBase(mention).matches("(pan)|(pani)"))); | |
107 | 146 | |
108 | 147 | TToken nextToken = helper.getTokenAfterMention(mention); |
109 | - addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_next_token_pos"); | |
148 | + addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + NEXT_TOKEN_POS); | |
149 | + String nextTokenLemma = nextToken == null ? "" : nextToken.getChosenInterpretation().getBase(); | |
150 | + for (String nextLemma : NEXT_TOKEN_LEMMAS) { | |
151 | + candidateFeatures.put(getAttributeByName(attributePrefix + NEXT_TOKEN_LEMMA + nextLemma), toBinary(nextTokenLemma.equalsIgnoreCase(nextLemma))); | |
152 | + } | |
153 | + | |
110 | 154 | TToken prevToken = helper.getTokenBeforeMention(mention); |
111 | - addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_prev_token_pos"); | |
155 | + addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + PREV_TOKEN_POS); | |
156 | + String prevTokenLemma = prevToken == null ? "" : prevToken.getChosenInterpretation().getBase(); | |
157 | + for (String prevLemma : PREV_TOKEN_LEMMAS) { | |
158 | + candidateFeatures.put(getAttributeByName(attributePrefix + PREV_TOKEN_LEMMA + prevLemma), toBinary(prevTokenLemma.equalsIgnoreCase(prevLemma))); | |
159 | + } | |
112 | 160 | |
113 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention))); | |
114 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); | |
161 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTED), toBinary(helper.isNested(mention))); | |
162 | + candidateFeatures.put(getAttributeByName(attributePrefix + IS_NESTING), toBinary(helper.isNesting(mention))); | |
115 | 163 | |
164 | + TSentence mentionSentence = helper.getMentionSentence(mention); | |
165 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_MENTION_COUNT), (double) mentionSentence.getMentions().size()); | |
166 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_TOKEN_LENGTH), (double) mentionSentence.getTokens().size()); | |
167 | + candidateFeatures.put(getAttributeByName(attributePrefix + SENTENCE_ENDS_WITH_QUESTION_MARK), toBinary(mentionSentence.getTokens().get(mentionSentence.getTokensSize() - 1).getOrth().equals("?"))); | |
116 | 168 | } |
117 | 169 | |
118 | 170 | private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { |
119 | 171 | Attribute att = getAttributeByName(attributeName); |
120 | 172 | int index = att.indexOfValue(value); |
121 | 173 | if (index == -1) |
122 | - LOG.warn(value + " not found for attribute " + attributeName); | |
174 | + LOG.warn(value + "not found for attribute " + attributeName); | |
123 | 175 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
124 | 176 | } |
125 | 177 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
... | ... | @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
8 | 8 | import weka.classifiers.Classifier; |
9 | 9 | import weka.core.Instance; |
10 | 10 | import weka.core.Instances; |
11 | +import weka.core.SerializationHelper; | |
11 | 12 | |
12 | -import java.io.IOException; | |
13 | 13 | import java.util.List; |
14 | 14 | import java.util.Map; |
15 | 15 | import java.util.Set; |
... | ... | @@ -21,8 +21,8 @@ public class ZeroSubjectInjector { |
21 | 21 | private final Classifier classifier; |
22 | 22 | private final Instances instances; |
23 | 23 | |
24 | - public ZeroSubjectInjector() throws IOException, ClassNotFoundException { | |
25 | - classifier = Utils.loadClassifier(Constants.ZERO_MODEL_PATH); | |
24 | + public ZeroSubjectInjector() throws Exception { | |
25 | + classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); | |
26 | 26 | featureExtractor = new ZeroFeatureExtractor(); |
27 | 27 | instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
28 | 28 | } |
... | ... | @@ -31,7 +31,7 @@ public class ZeroSubjectInjector { |
31 | 31 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); |
32 | 32 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); |
33 | 33 | Map<ZeroSubjectCandidate, Instance> candidate2instance = |
34 | - PrepareTrainingData.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
34 | + InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
35 | 35 | |
36 | 36 | Set<String> result = Sets.newHashSet(); |
37 | 37 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { |
... | ... |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | + | |
3 | +import org.junit.BeforeClass; | |
4 | +import org.junit.Test; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
7 | + | |
8 | +import static org.junit.Assert.assertTrue; | |
9 | + | |
10 | +public class NicolasTest { | |
11 | + | |
12 | + private static final String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift"; | |
13 | + | |
14 | + private static Nicolas nicolas; | |
15 | + | |
16 | + @BeforeClass | |
17 | + public static void shouldLoadModels() throws Exception { | |
18 | + nicolas = new Nicolas(); | |
19 | + } | |
20 | + | |
21 | + @Test | |
22 | + public void shouldSummarizeThriftText() throws Exception { | |
23 | + TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | |
24 | + String summary = nicolas.summarizeThrift(thriftText, 5); | |
25 | + int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); | |
26 | + assertTrue(summaryTokensCount > 0); | |
27 | + assertTrue(summaryTokensCount < 10); | |
28 | + } | |
29 | + | |
30 | +} | |
0 | 31 | \ No newline at end of file |
... | ... |
nicolas-core/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... | ... | @@ -18,7 +18,7 @@ import static org.junit.Assert.assertEquals; |
18 | 18 | |
19 | 19 | public class CandidateFinderTest { |
20 | 20 | |
21 | - private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin"; | |
21 | + private static final String SAMPLE_TEXT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift"; | |
22 | 22 | private static final String SAMPLE_TEXT_SUMMARY_IDS_PATH = "/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt"; |
23 | 23 | |
24 | 24 | @Test |
... | ... |
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/sample_serialized_text.thrift
No preview for this file type
nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.thrift
0 → 100644
No preview for this file type
nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt renamed to nicolas-lib/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/.gitignore
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/models/README.md
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/frequent_bases.txt renamed to nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/frequent_bases.txt
nicolas-train/pom.xml
... | ... | @@ -12,6 +12,16 @@ |
12 | 12 | <artifactId>nicolas-train</artifactId> |
13 | 13 | |
14 | 14 | <dependencies> |
15 | + <!-- project --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>nicolas-common</artifactId> | |
19 | + </dependency> | |
20 | + <dependency> | |
21 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
22 | + <artifactId>nicolas-lib</artifactId> | |
23 | + </dependency> | |
24 | + | |
15 | 25 | <!-- internal --> |
16 | 26 | <dependency> |
17 | 27 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
... | ... | @@ -22,10 +32,28 @@ |
22 | 32 | <artifactId>utils</artifactId> |
23 | 33 | </dependency> |
24 | 34 | |
35 | + <!-- third party --> | |
36 | + <dependency> | |
37 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
38 | + <artifactId>weka-dev</artifactId> | |
39 | + </dependency> | |
40 | + <dependency> | |
41 | + <groupId>org.apache.commons</groupId> | |
42 | + <artifactId>commons-lang3</artifactId> | |
43 | + </dependency> | |
44 | + <dependency> | |
45 | + <groupId>net.lingala.zip4j</groupId> | |
46 | + <artifactId>zip4j</artifactId> | |
47 | + </dependency> | |
48 | + | |
25 | 49 | <!-- logging --> |
26 | 50 | <dependency> |
27 | 51 | <groupId>org.slf4j</groupId> |
28 | 52 | <artifactId>slf4j-api</artifactId> |
29 | 53 | </dependency> |
54 | + <dependency> | |
55 | + <groupId>org.slf4j</groupId> | |
56 | + <artifactId>slf4j-simple</artifactId> | |
57 | + </dependency> | |
30 | 58 | </dependencies> |
31 | 59 | </project> |
32 | 60 | \ No newline at end of file |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train; | |
2 | + | |
3 | +import net.lingala.zip4j.core.ZipFile; | |
4 | +import org.apache.commons.io.FileUtils; | |
5 | +import org.slf4j.Logger; | |
6 | +import org.slf4j.LoggerFactory; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess; | |
8 | + | |
9 | +import java.io.File; | |
10 | +import java.net.URL; | |
11 | + | |
12 | +public class DownloadAndPreprocessCorpus { | |
13 | + | |
14 | + private static final Logger LOG = LoggerFactory.getLogger(DownloadAndPreprocessCorpus.class); | |
15 | + | |
16 | + private static final String WORKING_DIR = "data"; | |
17 | + private static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip"; | |
18 | + | |
19 | + private DownloadAndPreprocessCorpus() { | |
20 | + } | |
21 | + | |
22 | + public static void main(String[] args) throws Exception { | |
23 | + File workDir = createFolder(WORKING_DIR); | |
24 | + | |
25 | + File corpusFile = new File(workDir, "corpus.zip"); | |
26 | + if (!corpusFile.exists()) { | |
27 | + LOG.info("Downloading corpus file..."); | |
28 | + FileUtils.copyURLToFile(new URL(CORPUS_DOWNLOAD_URL), corpusFile); | |
29 | + LOG.info("done."); | |
30 | + } else { | |
31 | + LOG.info("Corpus file already downloaded."); | |
32 | + } | |
33 | + | |
34 | + File extractedCorpusDir = new File(workDir, "corpus"); | |
35 | + if (extractedCorpusDir.exists()) { | |
36 | + LOG.info("Corpus file already extracted."); | |
37 | + } else { | |
38 | + ZipFile zipFile = new ZipFile(corpusFile); | |
39 | + zipFile.extractAll(extractedCorpusDir.getPath()); | |
40 | + LOG.info("Extracted corpus file."); | |
41 | + } | |
42 | + | |
43 | + File pscDir = new File(extractedCorpusDir, "PSC_1.0"); | |
44 | + File dataDir = new File(pscDir, "data"); | |
45 | + | |
46 | + File preprocessed = new File(WORKING_DIR, "preprocessed"); | |
47 | + createFolder(preprocessed.getPath()); | |
48 | + NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | |
49 | + } | |
50 | + | |
51 | + private static File createFolder(String path) { | |
52 | + File folder = new File(path); | |
53 | + if (folder.mkdir()) { | |
54 | + LOG.info("Created directory at: {}.", path); | |
55 | + } else { | |
56 | + LOG.info("Directory already present at: {}.", path); | |
57 | + } | |
58 | + return folder; | |
59 | + } | |
60 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel; | |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; | |
6 | + | |
7 | +public class TrainAllModels { | |
8 | + | |
9 | + private TrainAllModels() { | |
10 | + } | |
11 | + | |
12 | + public static void main(String[] args) throws Exception { | |
13 | + TrainMentionModel.main(args); | |
14 | + TrainSentenceModel.main(args); | |
15 | + TrainZeroModel.main(args); | |
16 | + } | |
17 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/Trainer.java deleted
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.common; | |
2 | + | |
3 | +import weka.classifiers.Classifier; | |
4 | +import weka.classifiers.trees.RandomForest; | |
5 | + | |
6 | +public class ModelConstants { | |
7 | + | |
8 | + public static final String MENTION_DATASET_PATH = "mentions_train.arff"; | |
9 | + public static final String SENTENCE_DATASET_PATH = "sentences_train.arff"; | |
10 | + public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | |
11 | + | |
12 | + private static final int NUM_ITERATIONS = 16; | |
13 | + private static final int NUM_EXECUTION_SLOTS = 8; | |
14 | + private static final int SEED = 0; | |
15 | + | |
16 | + private ModelConstants() { | |
17 | + } | |
18 | + | |
19 | + public static Classifier getMentionClassifier() { | |
20 | + RandomForest classifier = new RandomForest(); | |
21 | + classifier.setNumIterations(NUM_ITERATIONS); | |
22 | + classifier.setSeed(SEED); | |
23 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | |
24 | + return classifier; | |
25 | + } | |
26 | + | |
27 | + public static Classifier getSentenceClassifier() { | |
28 | + RandomForest classifier = new RandomForest(); | |
29 | + classifier.setNumIterations(NUM_ITERATIONS); | |
30 | + classifier.setSeed(SEED); | |
31 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | |
32 | + return classifier; | |
33 | + } | |
34 | + | |
35 | + public static Classifier getZeroClassifier() { | |
36 | + RandomForest classifier = new RandomForest(); | |
37 | + classifier.setNumIterations(NUM_ITERATIONS); | |
38 | + classifier.setSeed(SEED); | |
39 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | |
40 | + return classifier; | |
41 | + } | |
42 | + | |
43 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/TrainModel.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.common; | |
2 | 2 | |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; | |
7 | 7 | import weka.classifiers.Classifier; |
8 | 8 | import weka.core.Instances; |
9 | 9 | import weka.core.converters.ArffLoader; |
... | ... | @@ -11,41 +11,43 @@ import weka.core.converters.ArffLoader; |
11 | 11 | import java.io.File; |
12 | 12 | import java.io.FileOutputStream; |
13 | 13 | import java.io.ObjectOutputStream; |
14 | +import java.util.logging.LogManager; | |
14 | 15 | |
16 | +@SuppressWarnings("squid:S2118") | |
17 | +public class TrainModelCommon { | |
15 | 18 | |
16 | -public class TrainModel { | |
19 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class); | |
17 | 20 | |
18 | - private static final Logger LOG = LoggerFactory.getLogger(TrainModel.class); | |
21 | + private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; | |
19 | 22 | |
20 | - private TrainModel() { | |
23 | + private TrainModelCommon() { | |
21 | 24 | } |
22 | 25 | |
23 | - public static void main(String[] args) throws Exception { | |
26 | + public static void trainAndSaveModel(String datasetPath, Classifier classifier, String targetPath) throws Exception { | |
27 | + LogManager.getLogManager().reset(); // disable WEKA logging | |
24 | 28 | |
25 | 29 | ArffLoader loader = new ArffLoader(); |
26 | - loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | |
30 | + loader.setFile(new File(datasetPath)); | |
27 | 31 | Instances instances = loader.getDataSet(); |
28 | 32 | instances.setClassIndex(0); |
29 | - LOG.info(instances.size() + " instances loaded."); | |
30 | - LOG.info(instances.numAttributes() + " attributes for each instance."); | |
33 | + LOG.info("{} instances loaded.", instances.size()); | |
34 | + LOG.info("{} attributes for each instance.", instances.numAttributes()); | |
31 | 35 | |
32 | 36 | StopWatch watch = new StopWatch(); |
33 | 37 | watch.start(); |
34 | 38 | |
35 | - Classifier classifier = Constants.getZerosClassifier(); | |
36 | - | |
37 | 39 | LOG.info("Building classifier..."); |
38 | 40 | classifier.buildClassifier(instances); |
39 | - LOG.info("...done."); | |
41 | + LOG.info("...done. Build classifier: {}", classifier); | |
40 | 42 | |
43 | + String target = TARGET_MODEL_DIR + targetPath; | |
44 | + LOG.info("Saving classifier at: {}", target); | |
41 | 45 | try (ObjectOutputStream oos = new ObjectOutputStream( |
42 | - new FileOutputStream(Constants.ZERO_MODEL_PATH))) { | |
46 | + new FileOutputStream(target))) { | |
43 | 47 | oos.writeObject(classifier); |
44 | 48 | } |
45 | 49 | |
46 | 50 | watch.stop(); |
47 | - LOG.info("Elapsed time: " + watch); | |
48 | - | |
49 | - LOG.info(classifier.toString()); | |
51 | + LOG.info("Elapsed time: {}", watch); | |
50 | 52 | } |
51 | 53 | } |
... | ... |
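A model saved by TrainModelCommon is a plain Java-serialized weka Classifier written into the nicolas-model resources directory. As a minimal sketch (not part of this commit), assuming the trained file ends up on the classpath under the resource path defined in Constants, it can be read back as follows; the class name LoadModelSketch is hypothetical:

import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
import weka.classifiers.Classifier;

import java.io.InputStream;
import java.io.ObjectInputStream;

public class LoadModelSketch {

    // Minimal sketch, not part of this commit: read back a classifier that
    // TrainModelCommon serialized under the resource path from Constants.
    public static Classifier loadMentionModel() throws Exception {
        try (InputStream is = LoadModelSketch.class
                .getResourceAsStream(Constants.MENTION_MODEL_RESOURCE_PATH);
             ObjectInputStream ois = new ObjectInputStream(is)) {
            return (Classifier) ois.readObject();
        }
    }
}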
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/MentionScorer.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.HashMultiset; |
4 | 4 | import com.google.common.collect.Maps; |
... | ... | @@ -14,7 +14,6 @@ import java.util.stream.Collectors; |
14 | 14 | |
15 | 15 | public class MentionScorer { |
16 | 16 | |
17 | - | |
18 | 17 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { |
19 | 18 | Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); |
20 | 19 | |
... | ... | @@ -39,20 +38,4 @@ public class MentionScorer { |
39 | 38 | } |
40 | 39 | return mention2score; |
41 | 40 | } |
42 | - | |
43 | - private static Map<TMention, Double> booleanTokenInclusion(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | |
44 | - Map<TMention, Double> mention2score = Maps.newHashMap(); | |
45 | - for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | |
46 | - TMention mention = entry.getKey(); | |
47 | - String mentionOrth = mention2Orth.get(mention); | |
48 | - int present = 0; | |
49 | - for (String token : Utils.tokenize(mentionOrth)) { | |
50 | - if (tokenCounts.contains(token.toLowerCase())) { | |
51 | - present++; | |
52 | - } | |
53 | - } | |
54 | - mention2score.putIfAbsent(mention, ((present * 2) >= Utils.tokenize(mentionOrth).size()) ? 1.0 : 0.0); | |
55 | - } | |
56 | - return mention2score; | |
57 | - } | |
58 | 41 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.mention; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | |
2 | 2 | |
3 | 3 | import com.google.common.base.Charsets; |
4 | 4 | import com.google.common.collect.Maps; |
... | ... | @@ -7,9 +7,11 @@ import org.slf4j.Logger; |
7 | 7 | import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
11 | 10 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
13 | 15 | import weka.core.Instance; |
14 | 16 | import weka.core.Instances; |
15 | 17 | import weka.core.converters.ArffSaver; |
... | ... | @@ -23,8 +25,11 @@ public class PrepareTrainingData { |
23 | 25 | |
24 | 26 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); |
25 | 27 | |
26 | - public static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
27 | - public static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
28 | + private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
29 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
30 | + | |
31 | + private PrepareTrainingData() { | |
32 | + } | |
28 | 33 | |
29 | 34 | public static void main(String[] args) throws IOException { |
30 | 35 | |
... | ... | @@ -37,19 +42,20 @@ public class PrepareTrainingData { |
37 | 42 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
38 | 43 | |
39 | 44 | int i = 1; |
40 | - for (String textId : id2preprocessedText.keySet()) { | |
45 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
41 | 46 | LOG.info(i++ + "/" + id2preprocessedText.size()); |
42 | 47 | |
43 | - TText preprocessedText = id2preprocessedText.get(textId); | |
44 | - String optimalSummary = id2optimalSummary.get(textId); | |
48 | + String id = entry.getKey(); | |
49 | + TText preprocessedText = entry.getValue(); | |
50 | + String optimalSummary = id2optimalSummary.get(id); | |
45 | 51 | if (optimalSummary == null) |
46 | 52 | continue; |
47 | 53 | Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); |
48 | 54 | |
49 | 55 | Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); |
50 | - for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | |
51 | - TMention mention = entry.getKey(); | |
52 | - Instance instance = entry.getValue(); | |
56 | + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { | |
57 | + TMention mention = entry2.getKey(); | |
58 | + Instance instance = entry2.getValue(); | |
53 | 59 | instance.setDataset(instances); |
54 | 60 | instance.setClassValue(mention2score.get(mention)); |
55 | 61 | instances.add(instance); |
... | ... | @@ -61,7 +67,7 @@ public class PrepareTrainingData { |
61 | 67 | private static void saveInstancesToFile(Instances instances) throws IOException { |
62 | 68 | ArffSaver saver = new ArffSaver(); |
63 | 69 | saver.setInstances(instances); |
64 | - saver.setFile(new File(Constants.MENTIONS_DATASET_PATH)); | |
70 | + saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH)); | |
65 | 71 | saver.writeBatch(); |
66 | 72 | } |
67 | 73 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/TrainMentionModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | |
6 | +import weka.classifiers.Classifier; | |
7 | + | |
8 | +public class TrainMentionModel { | |
9 | + | |
10 | + private TrainMentionModel() { | |
11 | + } | |
12 | + | |
13 | + public static void main(String[] args) throws Exception { | |
14 | + Classifier classifier = ModelConstants.getMentionClassifier(); | |
15 | + String datasetPath = ModelConstants.MENTION_DATASET_PATH; | |
16 | + String targetPath = Constants.MENTION_MODEL_RESOURCE_PATH; | |
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | |
18 | + } | |
19 | + | |
20 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.sentence; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; | |
2 | 2 | |
3 | 3 | import com.google.common.base.Charsets; |
4 | 4 | import com.google.common.collect.Maps; |
... | ... | @@ -8,11 +8,13 @@ import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
12 | 11 | import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
13 | 13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
17 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
16 | 18 | import weka.classifiers.Classifier; |
17 | 19 | import weka.core.Instance; |
18 | 20 | import weka.core.Instances; |
... | ... | @@ -31,6 +33,9 @@ public class PrepareTrainingData { |
31 | 33 | private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; |
32 | 34 | private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; |
33 | 35 | |
36 | + private PrepareTrainingData() { | |
37 | + } | |
38 | + | |
34 | 39 | public static void main(String[] args) throws Exception { |
35 | 40 | |
36 | 41 | Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); |
... | ... | @@ -41,7 +46,7 @@ public class PrepareTrainingData { |
41 | 46 | |
42 | 47 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
43 | 48 | |
44 | - Classifier classifier = Utils.loadClassifier(Constants.MENTIONS_MODEL_PATH); | |
49 | + Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | |
45 | 50 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); |
46 | 51 | |
47 | 52 | int i = 1; |
... | ... | @@ -74,7 +79,7 @@ public class PrepareTrainingData { |
74 | 79 | private static void saveInstancesToFile(Instances instances) throws IOException { |
75 | 80 | ArffSaver saver = new ArffSaver(); |
76 | 81 | saver.setInstances(instances); |
77 | - saver.setFile(new File(Constants.SENTENCES_DATASET_PATH)); | |
82 | + saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH)); | |
78 | 83 | saver.writeBatch(); |
79 | 84 | } |
80 | 85 | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/SentenceScorer.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/TrainSentenceModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | |
6 | +import weka.classifiers.Classifier; | |
7 | + | |
8 | +public class TrainSentenceModel { | |
9 | + | |
10 | + private TrainSentenceModel() { | |
11 | + } | |
12 | + | |
13 | + public static void main(String[] args) throws Exception { | |
14 | + Classifier classifier = ModelConstants.getSentenceClassifier(); | |
15 | + String datasetPath = ModelConstants.SENTENCE_DATASET_PATH; | |
16 | + String targetPath = Constants.SENTENCE_MODEL_RESOURCE_PATH; | |
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | |
18 | + } | |
19 | + | |
20 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import com.google.common.collect.Sets; |
... | ... | @@ -6,11 +6,13 @@ import org.apache.commons.io.IOUtils; |
6 | 6 | import org.slf4j.Logger; |
7 | 7 | import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
10 | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
11 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
12 | -import weka.core.Attribute; | |
13 | -import weka.core.DenseInstance; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | |
14 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
15 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
14 | 16 | import weka.core.Instance; |
15 | 17 | import weka.core.Instances; |
16 | 18 | import weka.core.converters.ArffSaver; |
... | ... | @@ -54,7 +56,7 @@ public class PrepareTrainingData { |
54 | 56 | FeatureHelper featureHelper = new FeatureHelper(text); |
55 | 57 | |
56 | 58 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); |
57 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
59 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
58 | 60 | |
59 | 61 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { |
60 | 62 | boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); |
... | ... | @@ -68,24 +70,11 @@ public class PrepareTrainingData { |
68 | 70 | saveInstancesToFile(instances); |
69 | 71 | } |
70 | 72 | |
71 | - public static Map<ZeroSubjectCandidate, Instance> extractInstancesFromZeroCandidates(List<ZeroSubjectCandidate> candidates, TText text, ZeroFeatureExtractor featureExtractor) { | |
72 | - Map<ZeroSubjectCandidate, Map<Attribute, Double>> candidate2features = featureExtractor.calculateFeatures(candidates, text); | |
73 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = Maps.newHashMap(); | |
74 | - for (Map.Entry<ZeroSubjectCandidate, Map<Attribute, Double>> entry : candidate2features.entrySet()) { | |
75 | - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | |
76 | - Map<Attribute, Double> sentenceFeatures = entry.getValue(); | |
77 | - for (Attribute attribute : featureExtractor.getAttributesList()) { | |
78 | - instance.setValue(attribute, sentenceFeatures.get(attribute)); | |
79 | - } | |
80 | - candidate2instance.put(entry.getKey(), instance); | |
81 | - } | |
82 | - return candidate2instance; | |
83 | - } | |
84 | 73 | |
85 | 74 | private static void saveInstancesToFile(Instances instances) throws IOException { |
86 | 75 | ArffSaver saver = new ArffSaver(); |
87 | 76 | saver.setInstances(instances); |
88 | - saver.setFile(new File(Constants.ZERO_DATASET_PATH)); | |
77 | + saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH)); | |
89 | 78 | saver.writeBatch(); |
90 | 79 | } |
91 | 80 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/TrainZeroModel.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
4 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.TrainModelCommon; | |
6 | +import weka.classifiers.Classifier; | |
7 | + | |
8 | +public class TrainZeroModel { | |
9 | + | |
10 | + private TrainZeroModel() { | |
11 | + } | |
12 | + | |
13 | + public static void main(String[] args) throws Exception { | |
14 | + Classifier classifier = ModelConstants.getZeroClassifier(); | |
15 | + String datasetPath = ModelConstants.ZERO_DATASET_PATH; | |
16 | + String targetPath = Constants.ZERO_MODEL_RESOURCE_PATH; | |
17 | + TrainModelCommon.trainAndSaveModel(datasetPath, classifier, targetPath); | |
18 | + } | |
19 | + | |
20 | +} | |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroScorer.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import org.apache.commons.csv.CSVFormat; |
... | ... | @@ -7,6 +7,7 @@ import org.apache.commons.csv.CSVRecord; |
7 | 7 | import org.apache.commons.csv.QuoteMode; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
10 | 11 | |
11 | 12 | import java.io.IOException; |
12 | 13 | import java.io.InputStream; |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java
... | ... | @@ -24,6 +24,9 @@ public class NLPProcess { |
24 | 24 | |
25 | 25 | private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); |
26 | 26 | |
27 | + private static final String CORPUS_FILE_SUFFIX = ".xml"; | |
28 | + private static final String OUTPUT_FILE_SUFFIX = ".thrift"; | |
29 | + | |
27 | 30 | private NLPProcess() { |
28 | 31 | } |
29 | 32 | |
... | ... | @@ -34,23 +37,27 @@ public class NLPProcess { |
34 | 37 | } |
35 | 38 | File corpusDir = new File(args[0]); |
36 | 39 | if (!corpusDir.isDirectory()) { |
37 | - LOG.error("Corpus directory does not exist: " + corpusDir); | |
40 | + LOG.error("Corpus directory does not exist: {}", corpusDir); | |
38 | 41 | return; |
39 | 42 | } |
40 | 43 | File targetDir = new File(args[1]); |
41 | 44 | if (!targetDir.isDirectory()) { |
42 | - LOG.error("Target directory does not exist: " + targetDir); | |
45 | + LOG.error("Target directory does not exist: {}", targetDir); | |
43 | 46 | return; |
44 | 47 | } |
45 | 48 | |
46 | 49 | int ok = 0; |
47 | 50 | int err = 0; |
48 | - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(".xml")); | |
51 | + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX)); | |
52 | + if (files == null || files.length == 0) { | |
53 | + LOG.error("No corpus files found at: {}", corpusDir); | |
54 | + return; | |
55 | + } | |
49 | 56 | Arrays.sort(files); |
50 | 57 | for (File file : files) { |
51 | 58 | try { |
52 | 59 | Text text = PSC_IO.readText(file); |
53 | - File targetFile = new File(targetDir, file.getName().replaceFirst(".xml$", ".bin")); | |
60 | + File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); | |
54 | 61 | annotateNLP(text, targetFile); |
55 | 62 | ok++; |
56 | 63 | } catch (Exception e) { |
... | ... | @@ -58,8 +65,8 @@ public class NLPProcess { |
58 | 65 | LOG.error("Problem with text in " + file + ", " + e); |
59 | 66 | } |
60 | 67 | } |
61 | - LOG.info(ok + " texts processed successfully."); | |
62 | - LOG.info(err + " texts with errors."); | |
68 | + LOG.info("{} texts processed successfully.", ok); | |
69 | + LOG.info("{} texts with errors.", err); | |
63 | 70 | } |
64 | 71 | |
65 | 72 | private static void annotateNLP(Text text, File targetFile) throws Exception { |
... | ... | @@ -77,8 +84,8 @@ public class NLPProcess { |
77 | 84 | } |
78 | 85 | |
79 | 86 | public static void serialize(TText ttext, File targetFile) throws IOException { |
80 | - try (FileOutputStream fout = new FileOutputStream(targetFile); | |
81 | - ObjectOutputStream oos = new ObjectOutputStream(fout)) { | |
87 | + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); | |
88 | + ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { | |
82 | 89 | oos.writeObject(ttext); |
83 | 90 | } |
84 | 91 | } |
... | ... |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/EvalUtils.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.eval; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | 2 | |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.apache.commons.lang3.tuple.Pair; |
... | ... | @@ -14,6 +14,7 @@ import weka.classifiers.functions.SimpleLogistic; |
14 | 14 | import weka.classifiers.lazy.IBk; |
15 | 15 | import weka.classifiers.lazy.KStar; |
16 | 16 | import weka.classifiers.lazy.LWL; |
17 | +import weka.classifiers.meta.AttributeSelectedClassifier; | |
17 | 18 | import weka.classifiers.rules.DecisionTable; |
18 | 19 | import weka.classifiers.rules.JRip; |
19 | 20 | import weka.classifiers.rules.PART; |
... | ... | @@ -23,21 +24,49 @@ import weka.classifiers.trees.J48; |
23 | 24 | import weka.classifiers.trees.LMT; |
24 | 25 | import weka.classifiers.trees.RandomForest; |
25 | 26 | import weka.core.Instances; |
27 | +import weka.core.converters.ArffLoader; | |
26 | 28 | |
29 | +import java.io.File; | |
30 | +import java.io.IOException; | |
27 | 31 | import java.util.Arrays; |
28 | 32 | import java.util.Comparator; |
29 | 33 | import java.util.Optional; |
30 | 34 | import java.util.Random; |
35 | +import java.util.logging.LogManager; | |
31 | 36 | |
32 | -public class EvalUtils { | |
33 | 37 | |
34 | - private static final Logger LOG = LoggerFactory.getLogger(EvalUtils.class); | |
35 | - public static final int NUM_FOLDS = 10; | |
38 | +class CrossvalidateCommon { | |
36 | 39 | |
37 | - private EvalUtils() { | |
40 | + private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class); | |
41 | + | |
42 | + private static final int NUM_FOLDS = 10; | |
43 | + | |
44 | + private CrossvalidateCommon() { | |
45 | + } | |
46 | + | |
47 | + static void crossvalidateClassifiers(String datasetPath) throws IOException { | |
48 | + Instances instances = loadInstances(datasetPath); | |
49 | + crossvalidateClassification(instances); | |
50 | + } | |
51 | + | |
52 | + static void crossvalidateRegressors(String datasetPath) throws IOException { | |
53 | + Instances instances = loadInstances(datasetPath); | |
54 | + crossvalidateRegression(instances); | |
38 | 55 | } |
39 | 56 | |
40 | - public static void crossvalidateClassification(Instances instances) throws Exception { | |
57 | + private static Instances loadInstances(String datasetPath) throws IOException { | |
58 | + LogManager.getLogManager().reset(); // disable WEKA logging | |
59 | + | |
60 | + ArffLoader loader = new ArffLoader(); | |
61 | + loader.setFile(new File(datasetPath)); | |
62 | + Instances instances = loader.getDataSet(); | |
63 | + instances.setClassIndex(0); | |
64 | + LOG.info("{} instances loaded.", instances.size()); | |
65 | + LOG.info("{} attributes for each instance.", instances.numAttributes()); | |
66 | + return instances; | |
67 | + } | |
68 | + | |
69 | + private static void crossvalidateClassification(Instances instances) throws IOException { | |
41 | 70 | StopWatch watch = new StopWatch(); |
42 | 71 | watch.start(); |
43 | 72 | |
... | ... | @@ -45,52 +74,58 @@ public class EvalUtils { |
45 | 74 | new Logistic(), new ZeroR(), |
46 | 75 | new SimpleLogistic(), new BayesNet(), new NaiveBayes(), |
47 | 76 | new KStar(), new IBk(), new LWL(), |
48 | - new DecisionTable(), new JRip(), new PART()}).parallel().map(cls -> { | |
49 | - Evaluation eval = null; | |
77 | + new DecisionTable(), new JRip(), new PART(), | |
78 | + createAttributeSelectedClassifier()}).parallel().map(cls -> { | |
79 | + String name = cls.getClass().getSimpleName(); | |
80 | + double acc = 0; | |
81 | + Evaluation eval; | |
50 | 82 | try { |
51 | 83 | eval = new Evaluation(instances); |
52 | 84 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); |
53 | 85 | } catch (Exception e) { |
54 | - e.printStackTrace(); | |
86 | + LOG.error("Error evaluating model", e); | |
87 | + return Pair.of(0.0, name); | |
55 | 88 | } |
56 | - double acc = eval.correct() / eval.numInstances(); | |
57 | - String name = cls.getClass().getSimpleName(); | |
89 | + acc = eval.correct() / eval.numInstances(); | |
58 | 90 | LOG.info(name + " : " + acc); |
59 | - | |
60 | 91 | return Pair.of(acc, name); |
61 | 92 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
62 | 93 | LOG.info("#########"); |
63 | 94 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
64 | 95 | |
65 | 96 | watch.stop(); |
66 | - LOG.info("Elapsed time: " + watch); | |
97 | + LOG.info("Elapsed time: {}", watch); | |
98 | + } | |
99 | + | |
100 | + | |
101 | + private static Classifier createAttributeSelectedClassifier() { | |
102 | + AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier(); | |
103 | + attributeSelectedClassifier.setClassifier(new LMT()); | |
104 | + return attributeSelectedClassifier; | |
67 | 105 | } |
68 | 106 | |
69 | - public static void crossvalidateRegression(Instances instances) { | |
107 | + private static void crossvalidateRegression(Instances instances) { | |
70 | 108 | StopWatch watch = new StopWatch(); |
71 | 109 | watch.start(); |
72 | 110 | |
73 | 111 | Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ |
74 | 112 | new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { |
75 | - Evaluation eval = null; | |
76 | 113 | double acc = 0; |
114 | + String name = cls.getClass().getSimpleName(); | |
77 | 115 | try { |
78 | - eval = new Evaluation(instances); | |
116 | + Evaluation eval = new Evaluation(instances); | |
79 | 117 | eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); |
80 | 118 | acc = eval.correlationCoefficient(); |
81 | - | |
82 | 119 | } catch (Exception e) { |
83 | - e.printStackTrace(); | |
120 | + LOG.error("Error evaluating model", e); | |
84 | 121 | } |
85 | - String name = cls.getClass().getSimpleName(); | |
86 | 122 | LOG.info(name + " : " + acc); |
87 | - | |
88 | 123 | return Pair.of(acc, name); |
89 | 124 | }).max(Comparator.comparingDouble(Pair::getLeft)); |
90 | 125 | LOG.info("#########"); |
91 | 126 | LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); |
92 | 127 | |
93 | 128 | watch.stop(); |
94 | - LOG.info("Elapsed time: " + watch); | |
129 | + LOG.info("Elapsed time: {}", watch); | |
95 | 130 | } |
96 | -} | |
97 | 131 | \ No newline at end of file |
132 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | + | |
5 | + | |
6 | +public class CrossvalidateMention { | |
7 | + | |
8 | + private CrossvalidateMention() { | |
9 | + } | |
10 | + | |
11 | + public static void main(String[] args) throws Exception { | |
12 | + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH); | |
13 | + } | |
14 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | + | |
5 | + | |
6 | +public class CrossvalidateSentence { | |
7 | + | |
8 | + private CrossvalidateSentence() { | |
9 | + } | |
10 | + | |
11 | + public static void main(String[] args) throws Exception { | |
12 | + CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH); | |
13 | + } | |
14 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | + | |
5 | + | |
6 | +public class CrossvalidateZero { | |
7 | + | |
8 | + private CrossvalidateZero() { | |
9 | + } | |
10 | + | |
11 | + public static void main(String[] args) throws Exception { | |
12 | + CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH); | |
13 | + } | |
14 | +} | |
... | ... |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt
0 → 100644
1 | +199704210011 | |
2 | +199704210013 | |
3 | +199704250031 | |
4 | +199704260017 | |
5 | +199801030156 | |
6 | +199801100009 | |
7 | +199801150038 | |
8 | +199801150133 | |
9 | +199801170001 | |
10 | +199801170129 | |
11 | +199801170130 | |
12 | +199801200002 | |
13 | +199801200132 | |
14 | +199801210007 | |
15 | +199801220030 | |
16 | +199801220127 | |
17 | +199801230001 | |
18 | +199801230095 | |
19 | +199801240116 | |
20 | +199801240123 | |
21 | +199801260113 | |
22 | +199801270108 | |
23 | +199801280128 | |
24 | +199801290020 | |
25 | +199801310032 | |
26 | +199802040201 | |
27 | +199901180149 | |
28 | +199901190049 | |
29 | +199901230088 | |
30 | +199901250006 | |
31 | +199901250008 | |
32 | +199901250111 | |
33 | +199901250113 | |
34 | +199901300064 | |
35 | +199901300098 | |
36 | +199902240123 | |
37 | +199906220027 | |
38 | +199906220037 | |
39 | +199906220038 | |
40 | +199906220056 | |
41 | +199906220065 | |
42 | +199906230040 | |
43 | +199906230052 | |
44 | +199906240040 | |
45 | +199906240088 | |
46 | +199906250007 | |
47 | +199906250091 | |
48 | +199906260015 | |
49 | +199906260018 | |
50 | +199906260038 | |
51 | +199907030016 | |
52 | +199907030018 | |
53 | +199907030042 | |
54 | +199907030059 | |
55 | +199907050032 | |
56 | +199907050040 | |
57 | +199907050047 | |
58 | +199907050071 | |
59 | +199907270095 | |
60 | +199907270137 | |
61 | +199907270145 | |
62 | +199909210045 | |
63 | +199909250054 | |
64 | +199909300064 | |
65 | +199909300065 | |
66 | +199909300066 | |
67 | +199910020049 | |
68 | +199910020050 | |
69 | +199910090047 | |
70 | +199910090049 | |
71 | +199910090051 | |
72 | +199910110055 | |
73 | +199910110057 | |
74 | +199910210058 | |
75 | +199910210059 | |
76 | +199910270041 | |
77 | +199910280054 | |
78 | +199910280055 | |
79 | +199910280057 | |
80 | +199910300026 | |
81 | +199911030039 | |
82 | +199911030040 | |
83 | +199911030041 | |
84 | +199911060031 | |
85 | +199911060042 | |
86 | +199911060043 | |
87 | +199911080054 | |
88 | +199911080055 | |
89 | +199911080056 | |
90 | +199911100061 | |
91 | +199911100062 | |
92 | +199911100063 | |
93 | +199911130036 | |
94 | +199911130037 | |
95 | +199911130038 | |
96 | +199911180042 | |
97 | +199911180043 | |
98 | +199911180044 | |
99 | +199911220059 | |
100 | +199911220061 | |
101 | +199911220066 | |
102 | +199911230041 | |
103 | +199911240035 | |
104 | +199911240037 | |
105 | +199911240038 | |
106 | +199911250055 | |
107 | +199911250057 | |
108 | +199912020059 | |
109 | +199912090045 | |
110 | +199912090047 | |
111 | +199912090061 | |
112 | +199912110041 | |
113 | +199912110042 | |
114 | +199912130055 | |
115 | +199912130057 | |
116 | +199912170065 | |
117 | +199912180052 | |
118 | +199912210018 | |
119 | +199912210037 | |
120 | +199912210040 | |
121 | +199912220045 | |
122 | +199912220046 | |
123 | +199912220047 | |
124 | +199912230058 | |
125 | +199912230059 | |
126 | +199912230097 | |
127 | +199912280028 | |
128 | +199912280044 | |
129 | +199912280045 | |
130 | +199912310085 | |
131 | +199912310087 | |
132 | +200001030047 | |
133 | +200001030106 | |
134 | +200001040030 | |
135 | +200001040031 | |
136 | +200001060052 | |
137 | +200001060053 | |
138 | +200001060055 | |
139 | +200001070062 | |
140 | +200001070066 | |
141 | +200001080040 | |
142 | +200001080041 | |
143 | +200001140061 | |
144 | +200001140064 | |
145 | +200001170049 | |
146 | +200001170051 | |
147 | +200001170052 | |
148 | +200001170053 | |
149 | +200001180040 | |
150 | +200001200056 | |
151 | +200001220023 | |
152 | +200001220118 | |
153 | +200001240016 | |
154 | +200001290042 | |
155 | +200001310048 | |
156 | +200001310049 | |
157 | +200001310050 | |
158 | +200001310054 | |
159 | +200002090042 | |
160 | +200002090043 | |
161 | +200002120045 | |
162 | +200002120046 | |
163 | +200002160046 | |
164 | +200002160047 | |
165 | +200002250063 | |
166 | +200002250065 | |
167 | +200002250066 | |
168 | +200002290044 | |
169 | +200002290045 | |
170 | +200002290046 | |
171 | +200002290047 | |
172 | +200002290048 | |
173 | +200003010058 | |
174 | +200003010059 | |
175 | +200003060054 | |
176 | +200003060055 | |
177 | +200003060057 | |
178 | +200003110047 | |
179 | +200003110048 | |
180 | +200003110049 | |
181 | +200003210044 | |
182 | +200003210045 | |
183 | +200004120021 | |
184 | +200004120022 | |
185 | +200004120023 | |
186 | +200004150048 | |
187 | +200004150049 | |
188 | +200004150050 | |
189 | +200004170026 | |
190 | +200004170065 | |
191 | +200004220044 | |
192 | +200004220045 | |
193 | +200004220046 | |
194 | +200004220047 | |
195 | +200004220048 | |
196 | +200005060030 | |
197 | +200005150055 | |
198 | +200005150059 | |
199 | +200005300045 | |
200 | +200005300047 | |
201 | +200005300048 | |
202 | +200006010065 | |
203 | +200006010066 | |
204 | +200006010067 | |
205 | +200006050056 | |
206 | +200006050057 | |
207 | +200006050058 | |
208 | +200006050059 | |
209 | +200006050061 | |
210 | +200006050068 | |
211 | +200006070056 | |
212 | +200006080033 | |
213 | +200006120031 | |
214 | +200006130055 | |
215 | +200006130057 | |
216 | +200006130059 | |
217 | +200006260069 | |
218 | +200006260071 | |
219 | +200006270059 | |
220 | +200007120068 | |
221 | +200007120070 | |
222 | +200007120072 | |
223 | +200007170026 | |
224 | +200007180051 | |
225 | +200007240034 | |
226 | +200007270050 | |
227 | +200007280033 | |
228 | +200008040071 | |
229 | +200008040073 | |
230 | +200008250077 | |
231 | +200008250079 | |
232 | +200008260055 | |
233 | +200008310046 | |
234 | +200010120066 | |
235 | +200010120074 | |
236 | +200010130063 | |
237 | +200010140048 | |
238 | +200010140049 | |
239 | +200010160039 | |
240 | +200010160048 | |
241 | +200010160049 | |
242 | +200010180059 | |
243 | +200010180063 | |
244 | +200010190066 | |
245 | +200010190068 | |
246 | +200011210063 | |
247 | +200011210064 | |
248 | +200011210066 | |
249 | +200012050066 | |
250 | +200012050067 | |
251 | +200012050068 | |
252 | +200012050069 | |
253 | +200012050070 | |
254 | +200012050071 | |
255 | +200012080134 | |
256 | +200012080137 | |
257 | +200012110069 | |
258 | +200012110070 | |
259 | +200012110071 | |
260 | +200012110075 | |
261 | +200012120028 | |
262 | +200012120068 | |
263 | +200012120072 | |
264 | +200012130056 | |
265 | +200012130100 | |
266 | +200012130102 | |
267 | +200012130103 | |
268 | +200012140095 | |
269 | +200012140096 | |
270 | +200012140097 | |
271 | +200012140098 | |
272 | +200012140099 | |
273 | +200012140100 | |
274 | +200012150076 | |
275 | +200012160048 | |
276 | +200012160049 | |
277 | +200012180083 | |
278 | +200012180084 | |
279 | +200012180088 | |
280 | +200012230028 | |
281 | +200012230045 | |
282 | +200012230046 | |
283 | +200012230047 | |
284 | +200012230048 | |
285 | +200012230050 | |
286 | +200012270055 | |
287 | +200012270056 | |
288 | +200101020059 | |
289 | +200101020062 | |
290 | +200101020063 | |
291 | +200101020075 | |
292 | +200101130048 | |
293 | +200101130050 | |
294 | +200101130051 | |
295 | +200101130055 | |
296 | +200101150043 | |
297 | +200101150045 | |
298 | +200101180050 | |
299 | +200101180051 | |
300 | +200101180052 | |
301 | +200101200048 | |
302 | +200101220047 | |
303 | +200101220053 | |
304 | +200102070011 | |
305 | +200102070016 | |
306 | +200102120034 | |
307 | +200102120057 | |
308 | +200102130014 | |
309 | +200102150001 | |
310 | +200102150014 | |
311 | +200102160011 | |
312 | +200102190016 | |
313 | +200102220001 | |
314 | +200102220013 | |
315 | +200102270041 | |
316 | +200102270062 | |
317 | +200102280169 | |
318 | +200103010049 | |
319 | +200103060022 | |
320 | +200103060032 | |
321 | +200103060057 | |
322 | +200103080026 | |
323 | +200103080030 | |
324 | +200103080036 | |
325 | +200103100019 | |
326 | +200103100021 | |
327 | +200103100058 | |
328 | +200103100062 | |
329 | +200103130008 | |
330 | +200103130023 | |
331 | +200103130069 | |
332 | +200103200066 | |
333 | +200103200080 | |
334 | +200103270069 | |
335 | +200103310092 | |
336 | +200104020007 | |
337 | +200104050011 | |
338 | +200104100021 | |
339 | +200104100023 | |
340 | +200104170015 | |
341 | +200104170040 | |
342 | +200104170055 | |
343 | +200104170057 | |
344 | +200104190039 | |
345 | +200104190066 | |
346 | +200104230031 | |
347 | +200104230069 | |
348 | +200104260051 | |
349 | +200104260053 | |
350 | +200104300213 | |
351 | +200104300215 | |
352 | +200104300217 | |
353 | +200105020092 | |
354 | +200105050042 | |
355 | +200105050043 | |
356 | +200105050046 | |
357 | +200105050048 | |
358 | +200105070017 | |
359 | +200105140050 | |
360 | +200105140052 | |
361 | +200105220096 | |
362 | +200105290074 | |
363 | +200105290075 | |
364 | +200106120068 | |
365 | +200106120069 | |
366 | +200106180051 | |
367 | +200106180053 | |
368 | +200106200064 | |
369 | +200106220086 | |
370 | +200106220087 | |
371 | +200106220088 | |
372 | +200106220090 | |
373 | +200106250050 | |
374 | +200107120071 | |
375 | +200107120073 | |
376 | +200107210129 | |
377 | +200107240070 | |
378 | +200107250080 | |
379 | +200108060051 | |
380 | +200108060155 | |
381 | +200108060156 | |
382 | +200108060157 | |
383 | +200108070038 | |
384 | +200108160040 | |
385 | +200108180123 | |
386 | +200108200033 | |
387 | +200108210066 | |
388 | +200108210074 | |
389 | +200108270077 | |
390 | +200108280064 | |
391 | +200109060061 | |
392 | +200109130091 | |
393 | +200109250092 | |
394 | +200109260097 | |
395 | +200109270116 | |
396 | +200110020075 | |
397 | +200110150056 | |
398 | +200110150062 | |
399 | +200110200070 | |
400 | +200110200071 | |
401 | +200110220068 | |
402 | +200111080086 | |
403 | +200111140055 | |
404 | +200111210078 | |
405 | +200111240060 | |
406 | +200112040031 | |
407 | +200112040077 | |
408 | +200112050063 | |
409 | +200112100041 | |
410 | +200112190067 | |
411 | +200201280011 | |
412 | +200201290029 | |
413 | +200202280078 | |
414 | +200203280057 | |
415 | +200203290107 | |
... | ... |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt
0 → 100644
1 | +199704210012 | |
2 | +199704210042 | |
3 | +199704220007 | |
4 | +199704220018 | |
5 | +199704220021 | |
6 | +199704220044 | |
7 | +199704230006 | |
8 | +199704230014 | |
9 | +199704230029 | |
10 | +199704230043 | |
11 | +199704240008 | |
12 | +199704240019 | |
13 | +199704240020 | |
14 | +199704240021 | |
15 | +199704250018 | |
16 | +199704250022 | |
17 | +199704260014 | |
18 | +199704260015 | |
19 | +199704260016 | |
20 | +199704280023 | |
21 | +199704280025 | |
22 | +199704280027 | |
23 | +199704280031 | |
24 | +199704300031 | |
25 | +199704300042 | |
26 | +199704300046 | |
27 | +199801020010 | |
28 | +199801020031 | |
29 | +199801020035 | |
30 | +199801020070 | |
31 | +199801020076 | |
32 | +199801020079 | |
33 | +199801030068 | |
34 | +199801030090 | |
35 | +199801030091 | |
36 | +199801030129 | |
37 | +199801030148 | |
38 | +199801030158 | |
39 | +199801050023 | |
40 | +199801050059 | |
41 | +199801130087 | |
42 | +199801130129 | |
43 | +199801140182 | |
44 | +199801160119 | |
45 | +199801200106 | |
46 | +199801220140 | |
47 | +199801240061 | |
48 | +199801240096 | |
49 | +199801260047 | |
50 | +199801260070 | |
51 | +199801270055 | |
52 | +199801270110 | |
53 | +199801280123 | |
54 | +199801280158 | |
55 | +199801280159 | |
56 | +199801280241 | |
57 | +199801290022 | |
58 | +199801310003 | |
59 | +199801310037 | |
60 | +199802030127 | |
61 | +199802040159 | |
62 | +199802040182 | |
63 | +199802040202 | |
64 | +199805220133 | |
65 | +199808280158 | |
66 | +199901190073 | |
67 | +199901190115 | |
68 | +199901250112 | |
69 | +199901250117 | |
70 | +199901270103 | |
71 | +199901270120 | |
72 | +199901270122 | |
73 | +199901290095 | |
74 | +199901300101 | |
75 | +199902240095 | |
76 | +199906220029 | |
77 | +199906230024 | |
78 | +199906240084 | |
79 | +199906260027 | |
80 | +199907050045 | |
81 | +199907050076 | |
82 | +199907140166 | |
83 | +199907200002 | |
84 | +199907270004 | |
85 | +199908260001 | |
86 | +199909090036 | |
87 | +199909250018 | |
88 | +199909270029 | |
89 | +199910020027 | |
90 | +199910020029 | |
91 | +199910270011 | |
92 | +199911060044 | |
93 | +199911100038 | |
94 | +199911100064 | |
95 | +199911200030 | |
96 | +199911220063 | |
97 | +199912020060 | |
98 | +199912180026 | |
99 | +199912180034 | |
100 | +199912220030 | |
101 | +199912280024 | |
102 | +199912280046 | |
103 | +199912300021 | |
104 | +199912300029 | |
105 | +200001030029 | |
106 | +200001030053 | |
107 | +200001060034 | |
108 | +200001100035 | |
109 | +200001100046 | |
110 | +200001170029 | |
111 | +200001170033 | |
112 | +200001170060 | |
113 | +200001290045 | |
114 | +200002220027 | |
115 | +200002240034 | |
116 | +200002250031 | |
117 | +200003060062 | |
118 | +200003110050 | |
119 | +200004280047 | |
120 | +200004290022 | |
121 | +200006050119 | |
122 | +200006260079 | |
123 | +200006290045 | |
124 | +200007150033 | |
125 | +200008040076 | |
126 | +200008220042 | |
127 | +200008220046 | |
128 | +200010130049 | |
129 | +200010160054 | |
130 | +200012130034 | |
131 | +200012140084 | |
132 | +200012290046 | |
133 | +200104040019 | |
134 | +200106050035 | |
135 | +200108180109 | |
136 | +200108300032 | |
137 | +200111120045 | |
138 | +200111150042 | |
139 | +200111150047 | |
140 | +200111200036 | |
141 | +200111270049 | |
142 | +200112030055 | |
143 | +200112280057 | |
144 | +200201220038 | |
145 | +200201220050 | |
146 | +200202020036 | |
147 | +200202200032 | |
148 | +200202210054 | |
149 | +200202270044 | |
150 | +200203010070 | |
151 | +200203190026 | |
152 | +200203260050 | |
153 | +200203280017 | |
154 | +200203290078 | |
... | ... |
nicolas-core/src/main/resources/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessTest.java renamed to nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; |
2 | 2 | |
3 | +import com.google.common.collect.Lists; | |
4 | +import org.junit.ClassRule; | |
3 | 5 | import org.junit.Test; |
6 | +import org.junit.rules.TemporaryFolder; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
4 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
5 | 9 | |
6 | 10 | import java.io.File; |
11 | +import java.util.List; | |
12 | +import java.util.stream.Collectors; | |
13 | + | |
14 | +import static junit.framework.TestCase.assertEquals; | |
15 | + | |
16 | +public class NLPProcessIT { | |
17 | + | |
18 | + @ClassRule | |
19 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | |
7 | 20 | |
8 | -public class NLPProcessTest { | |
9 | 21 | @Test |
10 | 22 | public void shouldProcessSampleText() throws Exception { |
11 | 23 | String text = "Ala ma kota. Ala ma też psa."; |
12 | 24 | TText processed = NLPProcess.annotate(text); |
13 | - processed.getParagraphs().stream().flatMap(p->p.getSentences().stream()).forEach(s->System.out.println(s.getId())); | |
14 | - File targetFile = new File("sample_serialized_text.bin"); | |
25 | + List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList()); | |
26 | + assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids); | |
27 | + | |
28 | + File targetFile = TEMPORARY_FOLDER.newFile(); | |
15 | 29 | NLPProcess.serialize(processed, targetFile); |
16 | 30 | } |
17 | 31 | } |
18 | 32 | \ No newline at end of file |
... | ... |
pom.xml
... | ... | @@ -11,7 +11,7 @@ |
11 | 11 | <packaging>pom</packaging> |
12 | 12 | |
13 | 13 | <modules> |
14 | - <module>nicolas-core</module> | |
14 | + <module>nicolas-lib</module> | |
15 | 15 | <module>nicolas-cli</module> |
16 | 16 | <module>nicolas-model</module> |
17 | 17 | <module>nicolas-train</module> |
... | ... | @@ -26,12 +26,13 @@ |
26 | 26 | <utils.version>1.0</utils.version> |
27 | 27 | |
28 | 28 | <commons-csv.version>1.4</commons-csv.version> |
29 | - <guava.version>19.0</guava.version> | |
30 | - <weka-dev.version>3.9.0</weka-dev.version> | |
29 | + <guava.version>20.0</guava.version> | |
30 | + <weka-dev.version>3.9.1</weka-dev.version> | |
31 | 31 | <commons-lang3.version>3.5</commons-lang3.version> |
32 | 32 | <commons-io.version>2.5</commons-io.version> |
33 | - <slf4j-api.version>1.7.12</slf4j-api.version> | |
33 | + <slf4j-api.version>1.7.22</slf4j-api.version> | |
34 | 34 | <junit.version>4.12</junit.version> |
35 | + <zip4j.version>1.3.2</zip4j.version> | |
35 | 36 | </properties> |
36 | 37 | |
37 | 38 | <prerequisites> |
... | ... | @@ -65,6 +66,16 @@ |
65 | 66 | <artifactId>nicolas-zero</artifactId> |
66 | 67 | <version>${project.version}</version> |
67 | 68 | </dependency> |
69 | + <dependency> | |
70 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
71 | + <artifactId>nicolas-lib</artifactId> | |
72 | + <version>${project.version}</version> | |
73 | + </dependency> | |
74 | + <dependency> | |
75 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
76 | + <artifactId>nicolas-train</artifactId> | |
77 | + <version>${project.version}</version> | |
78 | + </dependency> | |
68 | 79 | |
69 | 80 | <!-- internal --> |
70 | 81 | <dependency> |
... | ... | @@ -93,6 +104,12 @@ |
93 | 104 | <groupId>nz.ac.waikato.cms.weka</groupId> |
94 | 105 | <artifactId>weka-dev</artifactId> |
95 | 106 | <version>${weka-dev.version}</version> |
107 | + <exclusions> | |
108 | + <exclusion> | |
109 | + <groupId>org.slf4j</groupId> | |
110 | + <artifactId>slf4j-simple</artifactId> | |
111 | + </exclusion> | |
112 | + </exclusions> | |
96 | 113 | </dependency> |
97 | 114 | <dependency> |
98 | 115 | <groupId>org.apache.commons</groupId> |
... | ... | @@ -104,6 +121,11 @@ |
104 | 121 | <artifactId>commons-io</artifactId> |
105 | 122 | <version>${commons-io.version}</version> |
106 | 123 | </dependency> |
124 | + <dependency> | |
125 | + <groupId>net.lingala.zip4j</groupId> | |
126 | + <artifactId>zip4j</artifactId> | |
127 | + <version>${zip4j.version}</version> | |
128 | + </dependency> | |
107 | 129 | |
108 | 130 | <!-- logging --> |
109 | 131 | <dependency> |
... | ... | @@ -111,6 +133,11 @@ |
111 | 133 | <artifactId>slf4j-api</artifactId> |
112 | 134 | <version>${slf4j-api.version}</version> |
113 | 135 | </dependency> |
136 | + <dependency> | |
137 | + <groupId>org.slf4j</groupId> | |
138 | + <artifactId>slf4j-simple</artifactId> | |
139 | + <version>${slf4j-api.version}</version> | |
140 | + </dependency> | |
114 | 141 | |
115 | 142 | <!-- test --> |
116 | 143 | <dependency> |
... | ... |