Commit 88415dbf2896c80d1c6362b9288378c637425ed0
1 parent
91b27b24
refactor, add zero features
Showing 25 changed files with 303 additions and 98 deletions
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java
1 | package pl.waw.ipipan.zil.summ.nicolas.common; | 1 | package pl.waw.ipipan.zil.summ.nicolas.common; |
2 | 2 | ||
3 | import com.google.common.base.Charsets; | 3 | import com.google.common.base.Charsets; |
4 | +import com.google.common.collect.ImmutableList; | ||
4 | import weka.classifiers.Classifier; | 5 | import weka.classifiers.Classifier; |
5 | -import weka.classifiers.functions.Logistic; | 6 | +import weka.classifiers.functions.SMO; |
7 | +import weka.classifiers.meta.AdaBoostM1; | ||
8 | +import weka.classifiers.meta.AttributeSelectedClassifier; | ||
9 | +import weka.classifiers.rules.JRip; | ||
10 | +import weka.classifiers.trees.J48; | ||
6 | import weka.classifiers.trees.RandomForest; | 11 | import weka.classifiers.trees.RandomForest; |
7 | 12 | ||
8 | import java.nio.charset.Charset; | 13 | import java.nio.charset.Charset; |
@@ -20,6 +25,8 @@ public class Constants { | @@ -20,6 +25,8 @@ public class Constants { | ||
20 | 25 | ||
21 | public static final Charset ENCODING = Charsets.UTF_8; | 26 | public static final Charset ENCODING = Charsets.UTF_8; |
22 | 27 | ||
28 | + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); | ||
29 | + | ||
23 | private Constants() { | 30 | private Constants() { |
24 | } | 31 | } |
25 | 32 | ||
@@ -33,14 +40,14 @@ public class Constants { | @@ -33,14 +40,14 @@ public class Constants { | ||
33 | 40 | ||
34 | public static Classifier getSentencesClassifier() { | 41 | public static Classifier getSentencesClassifier() { |
35 | RandomForest classifier = new RandomForest(); | 42 | RandomForest classifier = new RandomForest(); |
36 | - classifier.setNumIterations(250); | 43 | + classifier.setNumIterations(10); |
37 | classifier.setSeed(0); | 44 | classifier.setSeed(0); |
38 | classifier.setNumExecutionSlots(8); | 45 | classifier.setNumExecutionSlots(8); |
39 | return classifier; | 46 | return classifier; |
40 | } | 47 | } |
41 | 48 | ||
42 | public static Classifier getZerosClassifier() { | 49 | public static Classifier getZerosClassifier() { |
43 | - Logistic classifier = new Logistic(); | 50 | + Classifier classifier = new J48(); |
44 | return classifier; | 51 | return classifier; |
45 | } | 52 | } |
46 | } | 53 | } |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java
@@ -30,12 +30,13 @@ public class FeatureHelper { | @@ -30,12 +30,13 @@ public class FeatureHelper { | ||
30 | private final Map<TMention, TToken> mention2head = Maps.newHashMap(); | 30 | private final Map<TMention, TToken> mention2head = Maps.newHashMap(); |
31 | private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet(); | 31 | private final Set<TMention> mentionsInNamedEntities = Sets.newHashSet(); |
32 | 32 | ||
33 | - private final Map<TMention, Integer> mention2Index = Maps.newHashMap(); | 33 | + private final Map<TMention, Integer> mention2index = Maps.newHashMap(); |
34 | private final Map<TSentence, Integer> sent2Index = Maps.newHashMap(); | 34 | private final Map<TSentence, Integer> sent2Index = Maps.newHashMap(); |
35 | private final Map<TParagraph, Integer> par2Index = Maps.newHashMap(); | 35 | private final Map<TParagraph, Integer> par2Index = Maps.newHashMap(); |
36 | private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap(); | 36 | private final Map<TSentence, Integer> sent2IndexInPar = Maps.newHashMap(); |
37 | private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap(); | 37 | private final Map<TMention, Integer> mention2indexInPar = Maps.newHashMap(); |
38 | private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); | 38 | private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); |
39 | + private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); | ||
39 | 40 | ||
40 | 41 | ||
41 | public FeatureHelper(TText preprocessedText) { | 42 | public FeatureHelper(TText preprocessedText) { |
@@ -82,7 +83,8 @@ public class FeatureHelper { | @@ -82,7 +83,8 @@ public class FeatureHelper { | ||
82 | for (TMention mention : sent.getMentions()) { | 83 | for (TMention mention : sent.getMentions()) { |
83 | mention2sent.put(mention, sent); | 84 | mention2sent.put(mention, sent); |
84 | mention2par.put(mention, par); | 85 | mention2par.put(mention, par); |
85 | - mention2Index.put(mention, mentionIdx++); | 86 | + mention2index.put(mention, mentionIdx++); |
87 | + mention2firstTokenIndex.put(mention, sent.getTokens().indexOf(tokenId2token.get(mention.getChildIds().iterator().next()))); | ||
86 | mention2indexInSent.put(mention, mentionIdxInSent++); | 88 | mention2indexInSent.put(mention, mentionIdxInSent++); |
87 | mention2indexInPar.put(mention, mentionIdxInPar++); | 89 | mention2indexInPar.put(mention, mentionIdxInPar++); |
88 | 90 | ||
@@ -124,7 +126,11 @@ public class FeatureHelper { | @@ -124,7 +126,11 @@ public class FeatureHelper { | ||
124 | } | 126 | } |
125 | 127 | ||
126 | public int getMentionIndex(TMention mention) { | 128 | public int getMentionIndex(TMention mention) { |
127 | - return mention2Index.get(mention); | 129 | + return mention2index.get(mention); |
130 | + } | ||
131 | + | ||
132 | + public int getMentionFirstTokenIndex(TMention mention) { | ||
133 | + return mention2firstTokenIndex.get(mention); | ||
128 | } | 134 | } |
129 | 135 | ||
130 | public int getMentionIndexInSent(TMention mention) { | 136 | public int getMentionIndexInSent(TMention mention) { |
@@ -200,4 +206,19 @@ public class FeatureHelper { | @@ -200,4 +206,19 @@ public class FeatureHelper { | ||
200 | public TText getText() { | 206 | public TText getText() { |
201 | return text; | 207 | return text; |
202 | } | 208 | } |
209 | + | ||
210 | + public TToken getTokenAfterMention(TMention mention) { | ||
211 | + Integer idx = mention2firstTokenIndex.get(mention) + mention.getChildIds().size(); | ||
212 | + List<TToken> sentenceTokens = mention2sent.get(mention).getTokens(); | ||
213 | + if (idx >= sentenceTokens.size()) | ||
214 | + return null; | ||
215 | + return sentenceTokens.get(idx); | ||
216 | + } | ||
217 | + | ||
218 | + public TToken getTokenBeforeMention(TMention mention) { | ||
219 | + Integer idx = mention2firstTokenIndex.get(mention); | ||
220 | + if (idx == 0) | ||
221 | + return null; | ||
222 | + return mention2sent.get(mention).getTokens().get(idx - 1); | ||
223 | + } | ||
203 | } | 224 | } |
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | ||
2 | + | ||
3 | +import org.junit.Test; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
5 | + | ||
6 | +import java.io.InputStream; | ||
7 | + | ||
8 | +import static org.junit.Assert.assertEquals; | ||
9 | + | ||
10 | +public class UtilsTest { | ||
11 | + | ||
12 | + private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; | ||
13 | + | ||
14 | + @Test | ||
15 | + public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { | ||
16 | + try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | ||
17 | + TText text = Utils.loadThrifted(stream); | ||
18 | + assertEquals(26, text.getParagraphs().size()); | ||
19 | + assertEquals(2, text.getParagraphs().get(4).getSentences().size()); | ||
20 | + } | ||
21 | + } | ||
22 | +} | ||
0 | \ No newline at end of file | 23 | \ No newline at end of file |
nicolas-common/src/test/resources/199704210011.bin
0 → 100644
No preview for this file type
nicolas-core/pom.xml
@@ -21,11 +21,8 @@ | @@ -21,11 +21,8 @@ | ||
21 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
22 | <artifactId>nicolas-model</artifactId> | 22 | <artifactId>nicolas-model</artifactId> |
23 | </dependency> | 23 | </dependency> |
24 | - <dependency> | ||
25 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
26 | - <artifactId>nicolas-zero</artifactId> | ||
27 | - </dependency> | ||
28 | 24 | ||
25 | + <!-- internal --> | ||
29 | <dependency> | 26 | <dependency> |
30 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 27 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
31 | <artifactId>pscapi</artifactId> | 28 | <artifactId>pscapi</artifactId> |
@@ -35,6 +32,7 @@ | @@ -35,6 +32,7 @@ | ||
35 | <artifactId>utils</artifactId> | 32 | <artifactId>utils</artifactId> |
36 | </dependency> | 33 | </dependency> |
37 | 34 | ||
35 | + <!-- third party --> | ||
38 | <dependency> | 36 | <dependency> |
39 | <groupId>nz.ac.waikato.cms.weka</groupId> | 37 | <groupId>nz.ac.waikato.cms.weka</groupId> |
40 | <artifactId>weka-dev</artifactId> | 38 | <artifactId>weka-dev</artifactId> |
@@ -51,5 +49,17 @@ | @@ -51,5 +49,17 @@ | ||
51 | <groupId>org.apache.commons</groupId> | 49 | <groupId>org.apache.commons</groupId> |
52 | <artifactId>commons-lang3</artifactId> | 50 | <artifactId>commons-lang3</artifactId> |
53 | </dependency> | 51 | </dependency> |
52 | + | ||
53 | + <!-- logging --> | ||
54 | + <dependency> | ||
55 | + <groupId>org.slf4j</groupId> | ||
56 | + <artifactId>slf4j-api</artifactId> | ||
57 | + </dependency> | ||
58 | + | ||
59 | + <!-- test --> | ||
60 | + <dependency> | ||
61 | + <groupId>junit</groupId> | ||
62 | + <artifactId>junit</artifactId> | ||
63 | + </dependency> | ||
54 | </dependencies> | 64 | </dependencies> |
55 | </project> | 65 | </project> |
56 | \ No newline at end of file | 66 | \ No newline at end of file |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/EvalUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import org.apache.commons.lang3.time.StopWatch; | ||
4 | +import org.apache.commons.lang3.tuple.Pair; | ||
5 | +import org.slf4j.Logger; | ||
6 | +import org.slf4j.LoggerFactory; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | +import weka.classifiers.bayes.BayesNet; | ||
9 | +import weka.classifiers.bayes.NaiveBayes; | ||
10 | +import weka.classifiers.evaluation.Evaluation; | ||
11 | +import weka.classifiers.functions.LinearRegression; | ||
12 | +import weka.classifiers.functions.Logistic; | ||
13 | +import weka.classifiers.functions.SMOreg; | ||
14 | +import weka.classifiers.functions.SimpleLogistic; | ||
15 | +import weka.classifiers.lazy.IBk; | ||
16 | +import weka.classifiers.lazy.KStar; | ||
17 | +import weka.classifiers.lazy.LWL; | ||
18 | +import weka.classifiers.rules.DecisionTable; | ||
19 | +import weka.classifiers.rules.JRip; | ||
20 | +import weka.classifiers.rules.PART; | ||
21 | +import weka.classifiers.trees.HoeffdingTree; | ||
22 | +import weka.classifiers.trees.J48; | ||
23 | +import weka.classifiers.trees.LMT; | ||
24 | +import weka.classifiers.trees.RandomForest; | ||
25 | +import weka.core.Instances; | ||
26 | + | ||
27 | +import java.util.Arrays; | ||
28 | +import java.util.Comparator; | ||
29 | +import java.util.Optional; | ||
30 | +import java.util.Random; | ||
31 | + | ||
32 | +public class EvalUtils { | ||
33 | + | ||
34 | + private static final Logger LOG = LoggerFactory.getLogger(EvalUtils.class); | ||
35 | + public static final int NUM_FOLDS = 10; | ||
36 | + | ||
37 | + private EvalUtils() { | ||
38 | + } | ||
39 | + | ||
40 | + public static void crossvalidateClassification(Instances instances) throws Exception { | ||
41 | + StopWatch watch = new StopWatch(); | ||
42 | + watch.start(); | ||
43 | + | ||
44 | + Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{new J48(), new RandomForest(), new HoeffdingTree(), new LMT(), | ||
45 | + new Logistic(), | ||
46 | + new SimpleLogistic(), new BayesNet(), new NaiveBayes(), | ||
47 | + new KStar(), new IBk(), new LWL(), | ||
48 | + new DecisionTable(), new JRip(), new PART()}).parallel().map(cls -> { | ||
49 | + Evaluation eval = null; | ||
50 | + try { | ||
51 | + eval = new Evaluation(instances); | ||
52 | + eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); | ||
53 | + } catch (Exception e) { | ||
54 | + e.printStackTrace(); | ||
55 | + } | ||
56 | + double acc = eval.correct() / eval.numInstances(); | ||
57 | + String name = cls.getClass().getSimpleName(); | ||
58 | + LOG.info(name + " : " + acc); | ||
59 | + | ||
60 | + return Pair.of(acc, name); | ||
61 | + }).max(Comparator.comparingDouble(Pair::getLeft)); | ||
62 | + LOG.info("#########"); | ||
63 | + LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | ||
64 | + | ||
65 | + watch.stop(); | ||
66 | + LOG.info("Elapsed time: " + watch); | ||
67 | + } | ||
68 | + | ||
69 | + public static void crossvalidateRegression(Instances instances) { | ||
70 | + StopWatch watch = new StopWatch(); | ||
71 | + watch.start(); | ||
72 | + | ||
73 | + Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ | ||
74 | + new RandomForest(), new LinearRegression(), new SMOreg()}).parallel().map(cls -> { | ||
75 | + Evaluation eval = null; | ||
76 | + double acc = 0; | ||
77 | + try { | ||
78 | + eval = new Evaluation(instances); | ||
79 | + eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); | ||
80 | + acc = eval.correlationCoefficient(); | ||
81 | + | ||
82 | + } catch (Exception e) { | ||
83 | + e.printStackTrace(); | ||
84 | + } | ||
85 | + String name = cls.getClass().getSimpleName(); | ||
86 | + LOG.info(name + " : " + acc); | ||
87 | + | ||
88 | + return Pair.of(acc, name); | ||
89 | + }).max(Comparator.comparingDouble(Pair::getLeft)); | ||
90 | + LOG.info("#########"); | ||
91 | + LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); | ||
92 | + | ||
93 | + watch.stop(); | ||
94 | + LOG.info("Elapsed time: " + watch); | ||
95 | + } | ||
96 | +} | ||
0 | \ No newline at end of file | 97 | \ No newline at end of file |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | @@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
2 | 2 | ||
3 | import com.google.common.collect.*; | 3 | import com.google.common.collect.*; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
5 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
6 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 7 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
7 | import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; |
@@ -45,7 +46,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -45,7 +46,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
45 | addBinaryAttribute(prefix + "_is_zero"); | 46 | addBinaryAttribute(prefix + "_is_zero"); |
46 | addBinaryAttribute(prefix + "_is_named"); | 47 | addBinaryAttribute(prefix + "_is_named"); |
47 | addBinaryAttribute(prefix + "_is_pronoun"); | 48 | addBinaryAttribute(prefix + "_is_pronoun"); |
48 | - addNominalAttribute(prefix + "_ctag", Lists.newArrayList("other", "null", "impt", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact")); | 49 | + addNominalAttribute(prefix + "_ctag", Constants.POS_TAGS); |
49 | addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); | 50 | addNominalAttribute(prefix + "_person", Lists.newArrayList("other", "null", "pri", "sec", "ter")); |
50 | addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); | 51 | addNominalAttribute(prefix + "_case", Lists.newArrayList("other", "null", "nom", "acc", "dat", "gen", "loc", "inst", "voc")); |
51 | addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); | 52 | addNominalAttribute(prefix + "_number", Lists.newArrayList("other", "null", "sg", "pl")); |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Crossvalidate.java
@@ -17,6 +17,9 @@ public class Crossvalidate { | @@ -17,6 +17,9 @@ public class Crossvalidate { | ||
17 | 17 | ||
18 | private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | 18 | private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); |
19 | 19 | ||
20 | + private Crossvalidate() { | ||
21 | + } | ||
22 | + | ||
20 | public static void main(String[] args) throws Exception { | 23 | public static void main(String[] args) throws Exception { |
21 | 24 | ||
22 | ArffLoader loader = new ArffLoader(); | 25 | ArffLoader loader = new ArffLoader(); |
@@ -26,9 +29,6 @@ public class Crossvalidate { | @@ -26,9 +29,6 @@ public class Crossvalidate { | ||
26 | LOG.info(instances.size() + " instances loaded."); | 29 | LOG.info(instances.size() + " instances loaded."); |
27 | LOG.info(instances.numAttributes() + " attributes for each instance."); | 30 | LOG.info(instances.numAttributes() + " attributes for each instance."); |
28 | 31 | ||
29 | -// while (instances.size() > 10000) | ||
30 | -// instances.remove(instances.size() - 1); | ||
31 | - | ||
32 | StopWatch watch = new StopWatch(); | 32 | StopWatch watch = new StopWatch(); |
33 | watch.start(); | 33 | watch.start(); |
34 | 34 |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/test/Validate.java
@@ -14,9 +14,7 @@ import java.io.FileInputStream; | @@ -14,9 +14,7 @@ import java.io.FileInputStream; | ||
14 | import java.io.IOException; | 14 | import java.io.IOException; |
15 | import java.io.ObjectInputStream; | 15 | import java.io.ObjectInputStream; |
16 | 16 | ||
17 | -/** | ||
18 | - * Created by me2 on 05.04.16. | ||
19 | - */ | 17 | + |
20 | public class Validate { | 18 | public class Validate { |
21 | private static final Logger LOG = LoggerFactory.getLogger(Validate.class); | 19 | private static final Logger LOG = LoggerFactory.getLogger(Validate.class); |
22 | 20 |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/test/Crossvalidate.java
1 | package pl.waw.ipipan.zil.summ.nicolas.sentence.test; | 1 | package pl.waw.ipipan.zil.summ.nicolas.sentence.test; |
2 | 2 | ||
3 | -import org.apache.commons.lang3.time.StopWatch; | ||
4 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 5 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | -import weka.classifiers.Classifier; | ||
8 | -import weka.classifiers.evaluation.Evaluation; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; |
9 | import weka.core.Instances; | 7 | import weka.core.Instances; |
10 | import weka.core.converters.ArffLoader; | 8 | import weka.core.converters.ArffLoader; |
11 | 9 | ||
12 | import java.io.File; | 10 | import java.io.File; |
13 | -import java.util.Random; | ||
14 | 11 | ||
15 | 12 | ||
16 | public class Crossvalidate { | 13 | public class Crossvalidate { |
17 | 14 | ||
18 | private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | 15 | private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); |
19 | 16 | ||
17 | + private Crossvalidate() { | ||
18 | + } | ||
19 | + | ||
20 | public static void main(String[] args) throws Exception { | 20 | public static void main(String[] args) throws Exception { |
21 | 21 | ||
22 | ArffLoader loader = new ArffLoader(); | 22 | ArffLoader loader = new ArffLoader(); |
@@ -26,16 +26,6 @@ public class Crossvalidate { | @@ -26,16 +26,6 @@ public class Crossvalidate { | ||
26 | LOG.info(instances.size() + " instances loaded."); | 26 | LOG.info(instances.size() + " instances loaded."); |
27 | LOG.info(instances.numAttributes() + " attributes for each instance."); | 27 | LOG.info(instances.numAttributes() + " attributes for each instance."); |
28 | 28 | ||
29 | - StopWatch watch = new StopWatch(); | ||
30 | - watch.start(); | ||
31 | - | ||
32 | - Classifier tree = Constants.getSentencesClassifier(); | ||
33 | - | ||
34 | - Evaluation eval = new Evaluation(instances); | ||
35 | - eval.crossValidateModel(tree, instances, 10, new Random(1)); | ||
36 | - LOG.info(eval.toSummaryString()); | ||
37 | - | ||
38 | - watch.stop(); | ||
39 | - LOG.info("Elapsed time: " + watch); | 29 | + EvalUtils.crossvalidateRegression(instances); |
40 | } | 30 | } |
41 | } | 31 | } |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinder.java
@@ -61,6 +61,8 @@ public class CandidateFinder { | @@ -61,6 +61,8 @@ public class CandidateFinder { | ||
61 | } | 61 | } |
62 | 62 | ||
63 | private static boolean isInNominative(TInterpretation interp) { | 63 | private static boolean isInNominative(TInterpretation interp) { |
64 | - return interp.getCtag().equals("subst") && Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); | 64 | + boolean isNominative = Arrays.stream(interp.getMsd().split(":")).anyMatch(t -> t.equals("nom")); |
65 | + boolean isSubst = interp.getCtag().equals("subst"); | ||
66 | + return isSubst && isNominative; | ||
65 | } | 67 | } |
66 | } | 68 | } |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainingDataExtractor.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero.train; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
5 | import org.apache.commons.io.IOUtils; | 5 | import org.apache.commons.io.IOUtils; |
6 | +import org.slf4j.Logger; | ||
7 | +import org.slf4j.LoggerFactory; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
7 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 11 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
11 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
12 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
13 | import weka.core.Attribute; | 12 | import weka.core.Attribute; |
14 | import weka.core.DenseInstance; | 13 | import weka.core.DenseInstance; |
15 | import weka.core.Instance; | 14 | import weka.core.Instance; |
@@ -23,13 +22,15 @@ import java.util.List; | @@ -23,13 +22,15 @@ import java.util.List; | ||
23 | import java.util.Map; | 22 | import java.util.Map; |
24 | import java.util.Set; | 23 | import java.util.Set; |
25 | 24 | ||
26 | -public class TrainingDataExtractor { | 25 | +public class PrepareTrainingData { |
26 | + | ||
27 | + private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | ||
27 | 28 | ||
28 | private static final String IDS_PATH = "corpora/summaries_dev"; | 29 | private static final String IDS_PATH = "corpora/summaries_dev"; |
29 | private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | 30 | private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; |
30 | private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; | 31 | private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; |
31 | 32 | ||
32 | - private TrainingDataExtractor() { | 33 | + private PrepareTrainingData() { |
33 | } | 34 | } |
34 | 35 | ||
35 | public static void main(String[] args) throws IOException { | 36 | public static void main(String[] args) throws IOException { |
@@ -42,7 +43,10 @@ public class TrainingDataExtractor { | @@ -42,7 +43,10 @@ public class TrainingDataExtractor { | ||
42 | 43 | ||
43 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 44 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
44 | 45 | ||
46 | + int i = 1; | ||
45 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 47 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
48 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
49 | + | ||
46 | String textId = entry.getKey(); | 50 | String textId = entry.getKey(); |
47 | 51 | ||
48 | TText text = entry.getValue(); | 52 | TText text = entry.getValue(); |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/TrainModel.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/TrainModel.java
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -2,14 +2,18 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | @@ -2,14 +2,18 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
2 | 2 | ||
3 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | +import com.google.common.collect.Sets; | ||
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 11 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
9 | import weka.core.Attribute; | 12 | import weka.core.Attribute; |
10 | 13 | ||
11 | import java.util.List; | 14 | import java.util.List; |
12 | import java.util.Map; | 15 | import java.util.Map; |
16 | +import java.util.Set; | ||
13 | 17 | ||
14 | 18 | ||
15 | public class ZeroFeatureExtractor extends FeatureExtractor { | 19 | public class ZeroFeatureExtractor extends FeatureExtractor { |
@@ -18,13 +22,26 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | @@ -18,13 +22,26 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | ||
18 | 22 | ||
19 | for (String prefix : new String[]{"antecedent", "candidate"}) { | 23 | for (String prefix : new String[]{"antecedent", "candidate"}) { |
20 | addNumericAttribute(prefix + "_index_in_sent"); | 24 | addNumericAttribute(prefix + "_index_in_sent"); |
25 | + addNumericAttribute(prefix + "_first_token_index_in_sent"); | ||
21 | addNumericAttribute(prefix + "_token_count"); | 26 | addNumericAttribute(prefix + "_token_count"); |
22 | - addBinaryAttribute(prefix + "_is_zero"); | ||
23 | - addBinaryAttribute(prefix + "_is_pronoun"); | ||
24 | addBinaryAttribute(prefix + "_is_named"); | 27 | addBinaryAttribute(prefix + "_is_named"); |
28 | + addNumericAttribute(prefix + "_sentence_mention_count"); | ||
29 | + addNominalAttribute(prefix + "_next_token_pos", Constants.POS_TAGS); | ||
30 | + addNominalAttribute(prefix + "_prev_token_pos", Constants.POS_TAGS); | ||
31 | + addBinaryAttribute(prefix + "_is_nested"); | ||
32 | + addBinaryAttribute(prefix + "_is_nesting"); | ||
25 | } | 33 | } |
26 | 34 | ||
35 | + addNumericAttribute("chain_length"); | ||
36 | + | ||
27 | addBinaryAttribute("pair_equal_orth"); | 37 | addBinaryAttribute("pair_equal_orth"); |
38 | + addBinaryAttribute("pair_equal_ignore_case_orth"); | ||
39 | + addBinaryAttribute("pair_equal_base"); | ||
40 | + addBinaryAttribute("pair_equal_number"); | ||
41 | + addBinaryAttribute("pair_equal_head_base"); | ||
42 | + | ||
43 | + addNumericAttribute("pair_sent_distance"); | ||
44 | + addNumericAttribute("pair_par_distance"); | ||
28 | 45 | ||
29 | addNominalAttribute("score", Lists.newArrayList("bad", "good")); | 46 | addNominalAttribute("score", Lists.newArrayList("bad", "good")); |
30 | fillSortedAttributes("score"); | 47 | fillSortedAttributes("score"); |
@@ -53,17 +70,57 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | @@ -53,17 +70,57 @@ public class ZeroFeatureExtractor extends FeatureExtractor { | ||
53 | addMentionFeatures(helper, candidateFeatures, mention, "candidate"); | 70 | addMentionFeatures(helper, candidateFeatures, mention, "candidate"); |
54 | addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); | 71 | addMentionFeatures(helper, candidateFeatures, antecedent, "antecedent"); |
55 | 72 | ||
56 | - candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equalsIgnoreCase(helper.getMentionOrth(antecedent)))); | 73 | + candidateFeatures.put(getAttributeByName("pair_equal_orth"), toBinary(helper.getMentionOrth(mention).equals(helper.getMentionOrth(antecedent)))); |
74 | + candidateFeatures.put(getAttributeByName("pair_equal_base"), toBinary(helper.getMentionBase(mention).equalsIgnoreCase(helper.getMentionBase(antecedent)))); | ||
75 | + candidateFeatures.put(getAttributeByName("pair_equal_ignore_case_orth"), toBinary(helper.getMentionOrth(mention).equalsIgnoreCase(helper.getMentionOrth(antecedent)))); | ||
76 | + candidateFeatures.put(getAttributeByName("pair_equal_head_base"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getBase().equalsIgnoreCase(helper.getMentionHeadToken(antecedent).getChosenInterpretation().getBase()))); | ||
77 | + | ||
78 | + candidateFeatures.put(getAttributeByName("pair_sent_distance"), (double) Math.abs(helper.getSentIndex(helper.getMentionSentence(mention)) - helper.getSentIndex(helper.getMentionSentence(antecedent)))); | ||
79 | + candidateFeatures.put(getAttributeByName("pair_par_distance"), (double) Math.abs(helper.getParIndex(helper.getMentionParagraph(mention)) - helper.getParIndex(helper.getMentionParagraph(antecedent)))); | ||
80 | + | ||
81 | + String mentionNumber = getNumber(helper.getMentionHeadToken(mention)); | ||
82 | + String antecedentNumber = getNumber(helper.getMentionHeadToken(antecedent)); | ||
83 | + candidateFeatures.put(getAttributeByName("pair_equal_number"), toBinary(mentionNumber != null && mentionNumber.equals(antecedentNumber))); | ||
84 | + | ||
85 | + candidateFeatures.put(getAttributeByName("chain_length"), (double) helper.getChainLength(mention)); | ||
57 | 86 | ||
58 | return candidateFeatures; | 87 | return candidateFeatures; |
59 | } | 88 | } |
60 | 89 | ||
90 | + private String getNumber(TToken token) { | ||
91 | + Set<String> msd = Sets.newHashSet(token.getChosenInterpretation().getMsd().split(":")); | ||
92 | + if (msd.contains("sg")) | ||
93 | + return "sg"; | ||
94 | + else if (msd.contains("pl")) | ||
95 | + return "pl"; | ||
96 | + else | ||
97 | + return null; | ||
98 | + } | ||
99 | + | ||
61 | private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { | 100 | private void addMentionFeatures(FeatureHelper helper, Map<Attribute, Double> candidateFeatures, TMention mention, String attributePrefix) { |
62 | candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); | 101 | candidateFeatures.put(getAttributeByName(attributePrefix + "_index_in_sent"), (double) helper.getMentionIndexInSent(mention)); |
102 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_first_token_index_in_sent"), (double) helper.getMentionFirstTokenIndex(mention)); | ||
103 | + | ||
63 | candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); | 104 | candidateFeatures.put(getAttributeByName(attributePrefix + "_token_count"), (double) mention.getChildIdsSize()); |
64 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_zero"), toBinary(mention.isZeroSubject())); | ||
65 | - candidateFeatures.put(getAttributeByName(attributePrefix + "_is_pronoun"), toBinary(helper.getMentionHeadToken(mention).getChosenInterpretation().getCtag().matches("ppron.*"))); | ||
66 | candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); | 105 | candidateFeatures.put(getAttributeByName(attributePrefix + "_is_named"), toBinary(helper.isMentionNamedEntity(mention))); |
106 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_sentence_mention_count"), (double) helper.getMentionSentence(mention).getMentions().size()); | ||
107 | + | ||
108 | + TToken nextToken = helper.getTokenAfterMention(mention); | ||
109 | + addNominalAttributeValue(nextToken == null ? "end" : nextToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_next_token_pos"); | ||
110 | + TToken prevToken = helper.getTokenBeforeMention(mention); | ||
111 | + addNominalAttributeValue(prevToken == null ? "end" : prevToken.getChosenInterpretation().getCtag(), candidateFeatures, attributePrefix + "_prev_token_pos"); | ||
112 | + | ||
113 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nested"), toBinary(helper.isNested(mention))); | ||
114 | + candidateFeatures.put(getAttributeByName(attributePrefix + "_is_nesting"), toBinary(helper.isNesting(mention))); | ||
115 | + | ||
67 | } | 116 | } |
68 | 117 | ||
118 | + private void addNominalAttributeValue(String value, Map<Attribute, Double> attribute2value, String attributeName) { | ||
119 | + Attribute att = getAttributeByName(attributeName); | ||
120 | + int index = att.indexOfValue(value); | ||
121 | + if (index == -1) | ||
122 | + LOG.warn(value + " not found for attribute " + attributeName); | ||
123 | + attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | ||
124 | + } | ||
69 | } | 125 | } |
126 | + |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/train/ZeroScorer.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroScorer.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.zero.train; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.zero; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import org.apache.commons.csv.CSVFormat; | 4 | import org.apache.commons.csv.CSVFormat; |
@@ -7,7 +7,6 @@ import org.apache.commons.csv.CSVRecord; | @@ -7,7 +7,6 @@ import org.apache.commons.csv.CSVRecord; | ||
7 | import org.apache.commons.csv.QuoteMode; | 7 | import org.apache.commons.csv.QuoteMode; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
11 | 10 | ||
12 | import java.io.IOException; | 11 | import java.io.IOException; |
13 | import java.io.InputStream; | 12 | import java.io.InputStream; |
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectCandidate.java
nicolas-zero/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java renamed to nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
@@ -5,7 +5,6 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | @@ -5,7 +5,6 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 7 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.zero.train.TrainingDataExtractor; | ||
9 | import weka.classifiers.Classifier; | 8 | import weka.classifiers.Classifier; |
10 | import weka.core.Instance; | 9 | import weka.core.Instance; |
11 | import weka.core.Instances; | 10 | import weka.core.Instances; |
@@ -32,7 +31,7 @@ public class ZeroSubjectInjector { | @@ -32,7 +31,7 @@ public class ZeroSubjectInjector { | ||
32 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); | 31 | Set<String> summarySentenceIds = selectedSentences.stream().map(TSentence::getId).collect(Collectors.toSet()); |
33 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); | 32 | List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, summarySentenceIds); |
34 | Map<ZeroSubjectCandidate, Instance> candidate2instance = | 33 | Map<ZeroSubjectCandidate, Instance> candidate2instance = |
35 | - TrainingDataExtractor.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | 34 | + PrepareTrainingData.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); |
36 | 35 | ||
37 | Set<String> result = Sets.newHashSet(); | 36 | Set<String> result = Sets.newHashSet(); |
38 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { | 37 | for (Map.Entry<ZeroSubjectCandidate, Instance> entry : candidate2instance.entrySet()) { |
nicolas-core/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/test/Crossvalidate.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.zero.test; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.eval.EvalUtils; | ||
7 | +import weka.core.Instances; | ||
8 | +import weka.core.converters.ArffLoader; | ||
9 | + | ||
10 | +import java.io.File; | ||
11 | + | ||
12 | + | ||
13 | +public class Crossvalidate { | ||
14 | + | ||
15 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | ||
16 | + | ||
17 | + private Crossvalidate() { | ||
18 | + } | ||
19 | + | ||
20 | + public static void main(String[] args) throws Exception { | ||
21 | + | ||
22 | + ArffLoader loader = new ArffLoader(); | ||
23 | + loader.setFile(new File(Constants.ZERO_DATASET_PATH)); | ||
24 | + Instances instances = loader.getDataSet(); | ||
25 | + instances.setClassIndex(0); | ||
26 | + LOG.info(instances.size() + " instances loaded."); | ||
27 | + LOG.info(instances.numAttributes() + " attributes for each instance."); | ||
28 | + | ||
29 | + EvalUtils.crossvalidateClassification(instances); | ||
30 | + } | ||
31 | +} |
nicolas-zero/src/main/resources/zeros.tsv renamed to nicolas-core/src/main/resources/zeros.tsv
nicolas-zero/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java renamed to nicolas-core/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin renamed to nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_serialized_text.bin
No preview for this file type
nicolas-zero/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt renamed to nicolas-core/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/zero/sample_summary_sentence_ids.txt
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | ||
2 | + | ||
3 | +import org.junit.Test; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
5 | + | ||
6 | +import java.io.File; | ||
7 | + | ||
8 | +public class NLPProcessTest { | ||
9 | + @Test | ||
10 | + public void shouldProcessSampleText() throws Exception { | ||
11 | + String text = "Ala ma kota. Ala ma też psa."; | ||
12 | + TText processed = NLPProcess.annotate(text); | ||
13 | + processed.getParagraphs().stream().flatMap(p->p.getSentences().stream()).forEach(s->System.out.println(s.getId())); | ||
14 | + File targetFile = new File("sample_serialized_text.bin"); | ||
15 | + NLPProcess.serialize(processed, targetFile); | ||
16 | + } | ||
17 | +} | ||
0 | \ No newline at end of file | 18 | \ No newline at end of file |
nicolas-zero/pom.xml deleted
1 | -<?xml version="1.0" encoding="UTF-8"?> | ||
2 | -<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | - <modelVersion>4.0.0</modelVersion> | ||
6 | - <parent> | ||
7 | - <artifactId>nicolas-container</artifactId> | ||
8 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | - <version>1.0-SNAPSHOT</version> | ||
10 | - </parent> | ||
11 | - | ||
12 | - <artifactId>nicolas-zero</artifactId> | ||
13 | - | ||
14 | - <dependencies> | ||
15 | - <!-- project --> | ||
16 | - <dependency> | ||
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | - <artifactId>nicolas-common</artifactId> | ||
19 | - </dependency> | ||
20 | - | ||
21 | - <!-- third party --> | ||
22 | - <dependency> | ||
23 | - <groupId>org.apache.commons</groupId> | ||
24 | - <artifactId>commons-csv</artifactId> | ||
25 | - </dependency> | ||
26 | - <dependency> | ||
27 | - <groupId>commons-io</groupId> | ||
28 | - <artifactId>commons-io</artifactId> | ||
29 | - </dependency> | ||
30 | - <dependency> | ||
31 | - <groupId>org.apache.commons</groupId> | ||
32 | - <artifactId>commons-lang3</artifactId> | ||
33 | - </dependency> | ||
34 | - | ||
35 | - <!-- logging --> | ||
36 | - <dependency> | ||
37 | - <groupId>org.slf4j</groupId> | ||
38 | - <artifactId>slf4j-api</artifactId> | ||
39 | - </dependency> | ||
40 | - | ||
41 | - <!-- test --> | ||
42 | - <dependency> | ||
43 | - <groupId>junit</groupId> | ||
44 | - <artifactId>junit</artifactId> | ||
45 | - </dependency> | ||
46 | - </dependencies> | ||
47 | - | ||
48 | -</project> | ||
49 | \ No newline at end of file | 0 | \ No newline at end of file |
pom.xml
@@ -15,7 +15,6 @@ | @@ -15,7 +15,6 @@ | ||
15 | <module>nicolas-cli</module> | 15 | <module>nicolas-cli</module> |
16 | <module>nicolas-model</module> | 16 | <module>nicolas-model</module> |
17 | <module>nicolas-train</module> | 17 | <module>nicolas-train</module> |
18 | - <module>nicolas-zero</module> | ||
19 | <module>nicolas-common</module> | 18 | <module>nicolas-common</module> |
20 | </modules> | 19 | </modules> |
21 | 20 |