Commit 1a009dd0c4f78b9367ce117f0edd6e982cb4ebdf

Authored by Mateusz Kopeć
1 parent 7e387f1c

clean up modules

Showing 50 changed files with 311 additions and 685 deletions
README.md 0 → 100644
  1 +# Nicolas
  2 +
  3 +A summarization tool that uses coreference information as the main source of information for content selection.
  4 +
... ...
eval.sh 0 → 100755
  1 +#!/usr/bin/env bash
  2 +
... ...
nicolas-cli/pom.xml
... ... @@ -22,6 +22,11 @@
22 22 <groupId>pl.waw.ipipan.zil.summ</groupId>
23 23 <artifactId>nicolas-lib</artifactId>
24 24 </dependency>
  25 + <dependency>
  26 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  27 + <artifactId>nicolas-model</artifactId>
  28 + <scope>runtime</scope>
  29 + </dependency>
25 30  
26 31 <!-- third party -->
27 32 <dependency>
... ...
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java
... ... @@ -5,9 +5,9 @@ import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 10 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
11 11 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
12 12  
13 13 import java.io.*;
... ...
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java
... ... @@ -5,10 +5,10 @@ import org.junit.ClassRule;
5 5 import org.junit.Test;
6 6 import org.junit.rules.TemporaryFolder;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
11 10 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 12  
13 13 import java.io.File;
14 14 import java.io.FileInputStream;
... ... @@ -29,7 +29,7 @@ public class ClientTest {
29 29 @Test
30 30 public void processSampleText() throws Exception {
31 31 Preprocessor preprocessor = mock(Preprocessor.class);
32   - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
  32 + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
33 33 when(preprocessor.preprocess(any())).thenReturn(ttext);
34 34  
35 35 Nicolas nicolas = mock(Nicolas.class);
... ...
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java
... ... @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils;
4 4 import org.junit.ClassRule;
5 5 import org.junit.Test;
6 6 import org.junit.rules.TemporaryFolder;
7   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 8  
9 9 import java.io.File;
10 10 import java.io.FileInputStream;
... ...
nicolas-common/pom.xml deleted
1   -<?xml version="1.0" encoding="UTF-8"?>
2   -<project xmlns="http://maven.apache.org/POM/4.0.0"
3   - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4   - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5   - <modelVersion>4.0.0</modelVersion>
6   - <parent>
7   - <artifactId>nicolas-container</artifactId>
8   - <groupId>pl.waw.ipipan.zil.summ</groupId>
9   - <version>1.0-SNAPSHOT</version>
10   - </parent>
11   -
12   - <artifactId>nicolas-common</artifactId>
13   -
14   - <dependencies>
15   - <!-- internal -->
16   - <dependency>
17   - <groupId>pl.waw.ipipan.zil.summ</groupId>
18   - <artifactId>pscapi</artifactId>
19   - </dependency>
20   - <dependency>
21   - <groupId>pl.waw.ipipan.zil.multiservice</groupId>
22   - <artifactId>utils</artifactId>
23   - </dependency>
24   -
25   - <!-- third party -->
26   - <dependency>
27   - <groupId>nz.ac.waikato.cms.weka</groupId>
28   - <artifactId>weka-stable</artifactId>
29   - </dependency>
30   - <dependency>
31   - <groupId>commons-io</groupId>
32   - <artifactId>commons-io</artifactId>
33   - </dependency>
34   -
35   - <!-- logging -->
36   - <dependency>
37   - <groupId>org.slf4j</groupId>
38   - <artifactId>slf4j-api</artifactId>
39   - </dependency>
40   -
41   - </dependencies>
42   -
43   -</project>
44 0 \ No newline at end of file
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.common;
2   -
3   -import com.google.common.collect.Lists;
4   -import com.google.common.collect.Maps;
5   -import com.google.common.collect.Sets;
6   -import org.apache.commons.io.IOUtils;
7   -import org.slf4j.Logger;
8   -import org.slf4j.LoggerFactory;
9   -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
10   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
11   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
12   -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
13   -import weka.classifiers.Classifier;
14   -import weka.core.Attribute;
15   -import weka.core.Instances;
16   -
17   -import java.io.*;
18   -import java.util.*;
19   -import java.util.function.Function;
20   -import java.util.stream.Collectors;
21   -
22   -public class Utils {
23   -
24   - private static final Logger LOG = LoggerFactory.getLogger(Utils.class);
25   -
26   - private static final String DATASET_NAME = "Dataset";
27   -
28   - private Utils() {
29   - }
30   -
31   - public static void writeStringToFile(String string, File file) throws IOException {
32   - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {
33   - bw.append(string);
34   - }
35   - }
36   -
37   - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
38   - LOG.info("Loading classifier from path: {}...", modelResourcePath);
39   - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {
40   - if (stream == null) {
41   - throw new IOException("Model not found at: " + modelResourcePath);
42   - }
43   - try (ObjectInputStream ois = new ObjectInputStream(stream)) {
44   - Classifier classifier = (Classifier) ois.readObject();
45   - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());
46   - return classifier;
47   - } catch (ClassNotFoundException e) {
48   - LOG.error("Error loading serialized classifier, class not found.", e);
49   - throw new IOException(e);
50   - }
51   - }
52   - }
53   -
54   - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException {
55   - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) {
56   - return (TText) ois.readObject();
57   - } catch (ClassNotFoundException e) {
58   - LOG.error("Error reading serialized thrift text file, class not found.", e);
59   - throw new IOException(e);
60   - }
61   - }
62   -
63   - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException {
64   - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) {
65   - if (stream == null) {
66   - throw new IOException("Resource not found at: " + textResourcePath);
67   - }
68   - return loadThriftTextFromStream(stream);
69   - }
70   - }
71   -
72   - public static List<String> loadLinesFromResource(String resourcePath) throws IOException {
73   - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) {
74   - return IOUtils.readLines(stream, Constants.ENCODING);
75   - }
76   - }
77   -
78   - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
79   - public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
80   - Instances instances = new Instances(DATASET_NAME, attributesList, 0);
81   - instances.setClassIndex(0);
82   - return instances;
83   - }
84   -
85   - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException {
86   - LOG.info("Loading classifier...");
87   - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) {
88   - Classifier classifier = (Classifier) ois.readObject();
89   - LOG.info("Done. " + classifier.toString());
90   - return classifier;
91   - }
92   - }
93   -
94   - public static List<String> tokenize(String text) {
95   - return Arrays.asList(text.split("[^\\p{L}0-9]+"));
96   - }
97   -
98   - public static List<String> tokenizeOnWhitespace(String text) {
99   - return Arrays.asList(text.split(" +"));
100   - }
101   -
102   - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) {
103   - Map<TMention, String> mention2orth = Maps.newHashMap();
104   - for (TSentence s : sents) {
105   - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth));
106   - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace));
107   -
108   - for (TMention m : s.getMentions()) {
109   - StringBuffer mentionOrth = new StringBuffer();
110   - for (String tokId : m.getHeadIds()) {
111   - if (!tokId2nps.get(tokId))
112   - mentionOrth.append(" ");
113   - mentionOrth.append(tokId2orth.get(tokId));
114   - }
115   - mention2orth.put(m, mentionOrth.toString().trim());
116   - }
117   - }
118   - return mention2orth;
119   - }
120   -
121   - private static final Collection<String> STOPWORDS = Sets.newHashSet();
122   -
123   - static {
124   - STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co"));
125   - }
126   -
127   - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) {
128   - Map<TMention, String> mention2orth = Maps.newHashMap();
129   - for (TSentence s : sents) {
130   - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
131   -
132   - for (TMention m : s.getMentions()) {
133   - StringBuffer mentionOrth = new StringBuffer();
134   - for (String tokId : m.getChildIds()) {
135   - TToken token = tokId2tok.get(tokId);
136   - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
137   - continue;
138   - }
139   -
140   - if (!token.isNoPrecedingSpace())
141   - mentionOrth.append(" ");
142   - mentionOrth.append(token.getOrth());
143   - }
144   - mention2orth.put(m, mentionOrth.toString().trim());
145   - }
146   - }
147   - return mention2orth;
148   - }
149   -
150   - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
151   - Map<TMention, String> mention2base = Maps.newHashMap();
152   - for (TSentence s : sents) {
153   - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase()));
154   -
155   - for (TMention m : s.getMentions()) {
156   - StringBuilder mentionBase = new StringBuilder();
157   - for (String tokId : m.getChildIds()) {
158   - mentionBase.append(" ");
159   - mentionBase.append(tokId2base.get(tokId));
160   - }
161   - mention2base.put(m, mentionBase.toString().toLowerCase().trim());
162   - }
163   - }
164   - return mention2base;
165   - }
166   -
167   - public static String loadSentence2Orth(TSentence sentence) {
168   - return loadSentence2Orth(sentence, Sets.newHashSet());
169   - }
170   -
171   - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) {
172   - StringBuilder sb = new StringBuilder();
173   - for (TToken token : sentence.getTokens()) {
174   - if (tokenIdsToSkip.contains(token.getId())) {
175   - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence));
176   - continue;
177   - }
178   - if (!token.isNoPrecedingSpace())
179   - sb.append(" ");
180   - sb.append(token.getOrth());
181   - }
182   - return sb.toString().trim();
183   - }
184   -
185   -}
186 0 \ No newline at end of file
nicolas-eval/pom.xml deleted
1   -<?xml version="1.0" encoding="UTF-8"?>
2   -<project xmlns="http://maven.apache.org/POM/4.0.0"
3   - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
4   - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
5   - <parent>
6   - <artifactId>nicolas-container</artifactId>
7   - <groupId>pl.waw.ipipan.zil.summ</groupId>
8   - <version>1.0-SNAPSHOT</version>
9   - </parent>
10   - <modelVersion>4.0.0</modelVersion>
11   -
12   - <artifactId>nicolas-eval</artifactId>
13   -
14   - <dependencies>
15   - <!-- project -->
16   - <dependency>
17   - <groupId>pl.waw.ipipan.zil.summ</groupId>
18   - <artifactId>nicolas-lib</artifactId>
19   - </dependency>
20   - <dependency>
21   - <groupId>pl.waw.ipipan.zil.summ</groupId>
22   - <artifactId>nicolas-common</artifactId>
23   - </dependency>
24   -
25   - <!-- internal -->
26   - <dependency>
27   - <groupId>pl.waw.ipipan.zil.summ</groupId>
28   - <artifactId>eval</artifactId>
29   - </dependency>
30   -
31   - <!-- third party -->
32   - <dependency>
33   - <groupId>nz.ac.waikato.cms.weka</groupId>
34   - <artifactId>weka-stable</artifactId>
35   - </dependency>
36   - <dependency>
37   - <groupId>org.apache.commons</groupId>
38   - <artifactId>commons-lang3</artifactId>
39   - </dependency>
40   - <dependency>
41   - <groupId>com.google.guava</groupId>
42   - <artifactId>guava</artifactId>
43   - </dependency>
44   -
45   - <!-- logging -->
46   - <dependency>
47   - <groupId>org.slf4j</groupId>
48   - <artifactId>slf4j-api</artifactId>
49   - </dependency>
50   - <dependency>
51   - <groupId>org.slf4j</groupId>
52   - <artifactId>slf4j-simple</artifactId>
53   - </dependency>
54   -
55   - </dependencies>
56   -</project>
57 0 \ No newline at end of file
nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted
1   -199704210012
2   -199704210042
3   -199704220007
4   -199704220018
5   -199704220021
6   -199704220044
7   -199704230006
8   -199704230014
9   -199704230029
10   -199704230043
11   -199704240008
12   -199704240019
13   -199704240020
14   -199704240021
15   -199704250018
16   -199704250022
17   -199704260014
18   -199704260015
19   -199704260016
20   -199704280023
21   -199704280025
22   -199704280027
23   -199704280031
24   -199704300031
25   -199704300042
26   -199704300046
27   -199801020010
28   -199801020031
29   -199801020035
30   -199801020070
31   -199801020076
32   -199801020079
33   -199801030068
34   -199801030090
35   -199801030091
36   -199801030129
37   -199801030148
38   -199801030158
39   -199801050023
40   -199801050059
41   -199801130087
42   -199801130129
43   -199801140182
44   -199801160119
45   -199801200106
46   -199801220140
47   -199801240061
48   -199801240096
49   -199801260047
50   -199801260070
51   -199801270055
52   -199801270110
53   -199801280123
54   -199801280158
55   -199801280159
56   -199801280241
57   -199801290022
58   -199801310003
59   -199801310037
60   -199802030127
61   -199802040159
62   -199802040182
63   -199802040202
64   -199805220133
65   -199808280158
66   -199901190073
67   -199901190115
68   -199901250112
69   -199901250117
70   -199901270103
71   -199901270120
72   -199901270122
73   -199901290095
74   -199901300101
75   -199902240095
76   -199906220029
77   -199906230024
78   -199906240084
79   -199906260027
80   -199907050045
81   -199907050076
82   -199907140166
83   -199907200002
84   -199907270004
85   -199908260001
86   -199909090036
87   -199909250018
88   -199909270029
89   -199910020027
90   -199910020029
91   -199910270011
92   -199911060044
93   -199911100038
94   -199911100064
95   -199911200030
96   -199911220063
97   -199912020060
98   -199912180026
99   -199912180034
100   -199912220030
101   -199912280024
102   -199912280046
103   -199912300021
104   -199912300029
105   -200001030029
106   -200001030053
107   -200001060034
108   -200001100035
109   -200001100046
110   -200001170029
111   -200001170033
112   -200001170060
113   -200001290045
114   -200002220027
115   -200002240034
116   -200002250031
117   -200003060062
118   -200003110050
119   -200004280047
120   -200004290022
121   -200006050119
122   -200006260079
123   -200006290045
124   -200007150033
125   -200008040076
126   -200008220042
127   -200008220046
128   -200010130049
129   -200010160054
130   -200012130034
131   -200012140084
132   -200012290046
133   -200104040019
134   -200106050035
135   -200108180109
136   -200108300032
137   -200111120045
138   -200111150042
139   -200111150047
140   -200111200036
141   -200111270049
142   -200112030055
143   -200112280057
144   -200201220038
145   -200201220050
146   -200202020036
147   -200202200032
148   -200202210054
149   -200202270044
150   -200203010070
151   -200203190026
152   -200203260050
153   -200203280017
154   -200203290078
nicolas-lib/pom.xml
... ... @@ -12,15 +12,6 @@
12 12 <artifactId>nicolas-lib</artifactId>
13 13  
14 14 <dependencies>
15   - <!-- project -->
16   - <dependency>
17   - <groupId>pl.waw.ipipan.zil.summ</groupId>
18   - <artifactId>nicolas-common</artifactId>
19   - </dependency>
20   - <dependency>
21   - <groupId>pl.waw.ipipan.zil.summ</groupId>
22   - <artifactId>nicolas-model</artifactId>
23   - </dependency>
24 15  
25 16 <!-- internal -->
26 17 <dependency>
... ... @@ -61,5 +52,10 @@
61 52 <groupId>junit</groupId>
62 53 <artifactId>junit</artifactId>
63 54 </dependency>
  55 + <dependency>
  56 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  57 + <artifactId>nicolas-model</artifactId>
  58 + <scope>test</scope>
  59 + </dependency>
64 60 </dependencies>
65 61 </project>
66 62 \ No newline at end of file
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common;
  1 +package pl.waw.ipipan.zil.summ.nicolas;
2 2  
3   -import com.google.common.base.Charsets;
4 3 import com.google.common.collect.ImmutableList;
5 4  
6 5 import java.nio.charset.Charset;
7   -
  6 +import java.nio.charset.StandardCharsets;
8 7  
9 8 public class Constants {
10 9  
11   - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/";
  10 + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact");
  11 + public static final Charset ENCODING = StandardCharsets.UTF_8;
12 12  
  13 + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/";
13 14 private static final String MODELS_PATH = ROOT_PATH + "models/";
  15 +
14 16 public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin";
15 17 public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin";
16 18 public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin";
17 19  
18 20 private static final String RESOURCES_PATH = ROOT_PATH + "resources/";
19 21 public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt";
20   -
21   - public static final Charset ENCODING = Charsets.UTF_8;
22   -
23   - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact");
  22 + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt";
24 23  
25 24 private Constants() {
26 25 }
27   -
28 26 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... ... @@ -5,12 +5,12 @@ import com.google.common.collect.Sets;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
10 8 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
11 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
12 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
13 11 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
  12 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
14 14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
15 15 import weka.classifiers.Classifier;
16 16  
... ... @@ -31,9 +31,9 @@ public class Nicolas {
31 31  
32 32 public Nicolas() throws NicolasException {
33 33 try {
34   - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
35   - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
36   - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
  34 + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  35 + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  36 + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
37 37  
38 38 mentionFeatureExtractor = new MentionFeatureExtractor();
39 39 sentenceFeatureExtractor = new SentenceFeatureExtractor();
... ... @@ -57,7 +57,7 @@ public class Nicolas {
57 57  
58 58 StringBuilder sb = new StringBuilder();
59 59 for (TSentence sent : selectedSentences) {
60   - sb.append(" ").append(Utils.loadSentence2Orth(sent));
  60 + sb.append(" ").append(TextUtils.loadSentence2Orth(sent));
61 61 }
62 62 return sb.toString().trim();
63 63 }
... ... @@ -74,7 +74,7 @@ public class Nicolas {
74 74 Random r = new Random(1);
75 75 Set<TSentence> summary = Sets.newHashSet();
76 76 for (TSentence sent : sortedSentences) {
77   - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
  77 + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();
78 78 if (r.nextDouble() > 0.4 && size > targetSize)
79 79 break;
80 80 summary.add(sent);
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted
1   -package pl.waw.ipipan.zil.summ.nicolas.apply;
2   -
3   -import com.google.common.collect.Lists;
4   -import com.google.common.collect.Maps;
5   -import com.google.common.collect.Sets;
6   -import org.slf4j.Logger;
7   -import org.slf4j.LoggerFactory;
8   -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
9   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
10   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
11   -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
12   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
13   -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
14   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
15   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
16   -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
17   -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
18   -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector;
19   -import weka.classifiers.Classifier;
20   -import weka.core.Instance;
21   -import weka.core.Instances;
22   -
23   -import java.io.BufferedWriter;
24   -import java.io.File;
25   -import java.io.FileWriter;
26   -import java.util.*;
27   -
28   -import static java.util.stream.Collectors.toList;
29   -
30   -public class ApplyModel {
31   -
32   - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class);
33   -
34   - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test";
35   - private static final String TARGET_DIR = "corpora/summaries";
36   -
37   - public static void main(String[] args) throws Exception {
38   - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
39   - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
40   -
41   - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
42   - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();
43   -
44   - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();
45   -
46   - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH));
47   - int i = 1;
48   - double avgSize = 0;
49   - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
50   - TText text = entry.getValue();
51   -
52   - Set<TMention> goodMentions
53   - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);
54   -
55   - int targetSize = calculateTargetSize(text);
56   - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector);
57   - int size = Utils.tokenize(summary).size();
58   - avgSize += size;
59   - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) {
60   - bw.append(summary);
61   - }
62   -
63   - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey());
64   - }
65   -
66   - LOG.info("Avg size:" + avgSize / id2preprocessedText.size());
67   - }
68   -
69   - private static int calculateTargetSize(TText text) {
70   - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
71   - StringBuffer body = new StringBuffer();
72   - for (TSentence sent : sents)
73   - body.append(Utils.loadSentence2Orth(sent) + " ");
74   - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
75   - return (int) (0.2 * tokenCount);
76   - }
77   -
78   - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception {
79   - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);
80   -
81   - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences);
82   -
83   - StringBuilder sb = new StringBuilder();
84   - for (TSentence sent : selectedSentences) {
85   - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds));
86   - }
87   - return sb.toString().trim();
88   - }
89   -
90   - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
91   -
92   - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
93   -
94   - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
95   - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
96   -
97   - Map<TSentence, Double> sentence2score = Maps.newHashMap();
98   - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {
99   - Instance instance = entry.getValue();
100   - instance.setDataset(instances);
101   - double score = sentenceClassifier.classifyInstance(instance);
102   - sentence2score.put(entry.getKey(), score);
103   - }
104   -
105   - List<TSentence> sortedSents = Lists.newArrayList(sents);
106   - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());
107   -
108   - int size = 0;
109   - Random r = new Random(1);
110   - Set<TSentence> summary = Sets.newHashSet();
111   - for (TSentence sent : sortedSents) {
112   - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();
113   - if (r.nextDouble() > 0.4 && size > targetSize)
114   - break;
115   - summary.add(sent);
116   - if (size > targetSize)
117   - break;
118   - }
119   - List<TSentence> selectedSentences = Lists.newArrayList();
120   - for (TSentence sent : sents) {
121   - if (summary.contains(sent))
122   - selectedSentences.add(sent);
123   - }
124   - return selectedSentences;
125   - }
126   -
127   -}
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... ... @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features;
3 3 import com.google.common.collect.Maps;
4 4 import com.google.common.collect.Sets;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
7 6  
8 7 import java.util.List;
9 8 import java.util.Map;
... ... @@ -38,7 +37,6 @@ public class FeatureHelper {
38 37 private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap();
39 38 private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap();
40 39  
41   -
42 40 public FeatureHelper(TText preprocessedText) {
43 41 text = preprocessedText;
44 42  
... ... @@ -60,9 +58,9 @@ public class FeatureHelper {
60 58 int sentIdx = 0;
61 59 int mentionIdx = 0;
62 60 for (TParagraph par : preprocessedText.getParagraphs()) {
63   - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false);
  61 + Map<TMention, String> m2o = loadMention2Orth(par.getSentences());
64 62 mention2Orth.putAll(m2o);
65   - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences());
  63 + Map<TMention, String> m2b = loadMention2Base(par.getSentences());
66 64 mention2Base.putAll(m2b);
67 65  
68 66 int sentIdxInPar = 0;
... ... @@ -221,4 +219,40 @@ public class FeatureHelper {
221 219 return null;
222 220 return mention2sent.get(mention).getTokens().get(idx - 1);
223 221 }
  222 +
  223 + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
  224 + Map<TMention, String> mention2orth = Maps.newHashMap();
  225 + for (TSentence s : sents) {
  226 + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  227 +
  228 + for (TMention m : s.getMentions()) {
  229 + StringBuilder mentionOrth = new StringBuilder();
  230 + for (String tokId : m.getChildIds()) {
  231 + TToken token = tokId2tok.get(tokId);
  232 + if (!token.isNoPrecedingSpace())
  233 + mentionOrth.append(" ");
  234 + mentionOrth.append(token.getOrth());
  235 + }
  236 + mention2orth.put(m, mentionOrth.toString().trim());
  237 + }
  238 + }
  239 + return mention2orth;
  240 + }
  241 +
  242 + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
  243 + Map<TMention, String> mention2base = Maps.newHashMap();
  244 + for (TSentence s : sents) {
  245 + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
  246 +
  247 + for (TMention m : s.getMentions()) {
  248 + StringBuilder mentionBase = new StringBuilder();
  249 + for (String tokId : m.getChildIds()) {
  250 + mentionBase.append(" ");
  251 + mentionBase.append(tokId2base.get(tokId));
  252 + }
  253 + mention2base.put(m, mentionBase.toString().toLowerCase().trim());
  254 + }
  255 + }
  256 + return mention2base;
  257 + }
224 258 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... ... @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention;
3 3 import com.google.common.collect.Lists;
4 4 import com.google.common.collect.Maps;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 7 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
9 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 9 import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
11 11 import weka.core.Attribute;
12 12  
13 13 import java.io.IOException;
... ... @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor {
21 21 private final List<String> frequentBases;
22 22  
23 23 public MentionFeatureExtractor() throws IOException {
24   - frequentBases = loadFrequentBases();
  24 + frequentBases = ResourceUtils.loadFrequentBases();
25 25  
26 26 //coref
27 27 addNumericAttributeNormalized("chain_length");
... ... @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
80 80 fillSortedAttributes("score");
81 81 }
82 82  
83   - private List<String> loadFrequentBases() throws IOException {
84   - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList());
85   - }
86   -
87 83 private String encodeBase(String base) {
88 84 return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
89 85 }
... ... @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor {
177 173 Attribute att = getAttributeByName(attributeName);
178 174 int index = att.indexOfValue(value);
179 175 if (index == -1)
180   - LOG.warn(value + " not found for attribute " + attributeName);
  176 + LOG.warn("{} not found for attribute {}", value, attributeName);
181 177 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
182 178 }
183 179  
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... ... @@ -5,8 +5,7 @@ import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8   -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  8 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
10 9 import weka.classifiers.Classifier;
11 10 import weka.core.Instance;
12 11 import weka.core.Instances;
... ... @@ -24,7 +23,7 @@ public class MentionModel {
24 23 public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception {
25 24 Set<TMention> goodMentions = Sets.newHashSet();
26 25  
27   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  26 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
28 27 Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor);
29 28 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
30 29 Instance instance = entry.getValue();
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... ... @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
11 10 import weka.classifiers.Classifier;
12 11 import weka.core.Instance;
13 12 import weka.core.Instances;
... ... @@ -23,7 +22,7 @@ public class SentenceModel {
23 22 }
24 23  
25 24 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
26   - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
  25 + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
27 26 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
28 27  
29 28 Map<TSentence, Double> sentence2score = Maps.newHashMap();
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
1   -package pl.waw.ipipan.zil.summ.nicolas;
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
2 2  
3 3 import com.google.common.collect.Maps;
4 4 import org.slf4j.Logger;
... ... @@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
11 11 import weka.core.Attribute;
12 12 import weka.core.DenseInstance;
13 13 import weka.core.Instance;
  14 +import weka.core.Instances;
14 15  
  16 +import java.util.ArrayList;
15 17 import java.util.List;
16 18 import java.util.Map;
17 19 import java.util.Set;
... ... @@ -22,6 +24,8 @@ public class InstanceUtils {
22 24  
23 25 private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class);
24 26  
  27 + private static final String DATASET_NAME = "Dataset";
  28 +
25 29 private InstanceUtils() {
26 30 }
27 31  
... ... @@ -60,4 +64,11 @@ public class InstanceUtils {
60 64 LOG.info("Extracted features of {} sentences.", sentence2instance.size());
61 65 return sentence2instance;
62 66 }
  67 +
  68 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
  69 + public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
  70 + Instances instances = new Instances(DATASET_NAME, attributesList, 0);
  71 + instances.setClassIndex(0);
  72 + return instances;
  73 + }
63 74 }
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
  2 +
  3 +import org.apache.commons.io.IOUtils;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +
  9 +import java.io.IOException;
  10 +import java.io.InputStream;
  11 +import java.io.ObjectInputStream;
  12 +import java.util.List;
  13 +import java.util.function.Predicate;
  14 +import java.util.stream.Collectors;
  15 +
  16 +public class ResourceUtils {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class);
  19 +
  20 + private ResourceUtils() {
  21 + }
  22 +
  23 + public static List<String> loadFrequentBases() throws IOException {
  24 + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH);
  25 + }
  26 +
  27 + public static List<String> loadStopwords() throws IOException {
  28 + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH);
  29 + }
  30 +
  31 + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
  32 + LOG.info("Loading classifier from path: {}...", modelResourcePath);
  33 + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) {
  34 + if (stream == null) {
  35 + throw new IOException("Model not found at: " + modelResourcePath);
  36 + }
  37 + try (ObjectInputStream ois = new ObjectInputStream(stream)) {
  38 + Classifier classifier = (Classifier) ois.readObject();
  39 + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());
  40 + return classifier;
  41 + } catch (ClassNotFoundException e) {
  42 + LOG.error("Error loading serialized classifier, class not found.", e);
  43 + throw new IOException(e);
  44 + }
  45 + }
  46 + }
  47 +
  48 + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException {
  49 + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) {
  50 + return IOUtils.readLines(stream, Constants.ENCODING)
  51 + .stream()
  52 + .map(String::trim)
  53 + .map(String::toLowerCase)
  54 + .filter(((Predicate<String>) String::isEmpty).negate())
  55 + .sorted()
  56 + .distinct()
  57 + .collect(Collectors.toList());
  58 + }
  59 + }
  60 +
  61 +
  62 +}
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
  2 +
  3 +import com.google.common.collect.Sets;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  6 +
  7 +import java.util.Arrays;
  8 +import java.util.List;
  9 +import java.util.Set;
  10 +
  11 +public class TextUtils {
  12 +
  13 + private TextUtils() {
  14 + }
  15 +
  16 + public static List<String> tokenize(String text) {
  17 + return Arrays.asList(text.split("[^\\p{L}0-9]+"));
  18 + }
  19 +
  20 + public static List<String> tokenizeOnWhitespace(String text) {
  21 + return Arrays.asList(text.split(" +"));
  22 + }
  23 +
  24 + public static String loadSentence2Orth(TSentence sentence) {
  25 + return loadSentence2Orth(sentence, Sets.newHashSet());
  26 + }
  27 +
  28 + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) {
  29 + StringBuilder sb = new StringBuilder();
  30 + for (TToken token : sentence.getTokens()) {
  31 + if (tokenIdsToSkip.contains(token.getId())) {
  32 + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence));
  33 + continue;
  34 + }
  35 + if (!token.isNoPrecedingSpace())
  36 + sb.append(" ");
  37 + sb.append(token.getOrth());
  38 + }
  39 + return sb.toString().trim();
  40 + }
  41 +}
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common;
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2  
3 3 import com.google.common.base.Predicates;
4 4 import com.google.common.collect.Maps;
... ... @@ -58,4 +58,12 @@ public class ThriftUtils {
58 58 }
59 59 }
60 60  
  61 + public static TText loadThriftTextFromResource(String resourcePath) {
  62 + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) {
  63 + return loadThriftTextFromStream(stream);
  64 + } catch (IOException e) {
  65 + LOG.error("Error reading serialized Thrift text from resource", e);
  66 + return null;
  67 + }
  68 + }
61 69 }
... ...
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common;
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2  
3 3 import java.io.IOException;
4 4 import java.io.InputStream;
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... ... @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  10 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
11 11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
12 12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
13 13 import weka.core.Attribute;
... ...
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
... ... @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 3 import com.google.common.collect.Sets;
4 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
7   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
8 8 import weka.classifiers.Classifier;
9 9 import weka.core.Instance;
10 10 import weka.core.Instances;
... ... @@ -24,7 +24,7 @@ public class ZeroSubjectInjector {
24 24 public ZeroSubjectInjector() throws Exception {
25 25 classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH);
26 26 featureExtractor = new ZeroFeatureExtractor();
27   - instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  27 + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
28 28 }
29 29  
30 30 public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception {
... ...
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
... ... @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas;
3 3 import org.junit.BeforeClass;
4 4 import org.junit.Test;
5 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  6 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  7 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
7 8  
8 9 import static org.junit.Assert.assertTrue;
9 10  
... ... @@ -20,9 +21,9 @@ public class NicolasTest {
20 21  
21 22 @Test
22 23 public void shouldSummarizeThriftText() throws Exception {
23   - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
  24 + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
24 25 String summary = nicolas.summarizeThrift(thriftText, 5);
25   - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size();
  26 + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size();
26 27 assertTrue(summaryTokensCount > 0);
27 28 assertTrue(summaryTokensCount < 10);
28 29 }
... ...
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java
1   -package pl.waw.ipipan.zil.summ.nicolas.common;
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2  
3 3 import org.junit.Test;
4 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
... ... @@ -7,13 +7,13 @@ import java.io.InputStream;
7 7  
8 8 import static org.junit.Assert.assertEquals;
9 9  
10   -public class UtilsTest {
  10 +public class ThriftUtilsTest {
11 11  
12 12 private static final String SAMPLE_TEXT_PATH = "/199704210011.bin";
13 13  
14 14 @Test
15 15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception {
16   - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
  16 + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
17 17 TText text = ThriftUtils.loadThriftTextFromStream(stream);
18 18 assertEquals(26, text.getParagraphs().size());
19 19 assertEquals(2, text.getParagraphs().get(4).getSentences().size());
... ...
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... ... @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils;
5 5 import org.junit.Test;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8   -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
9 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
10 10  
11 11 import java.io.IOException;
12 12 import java.io.InputStream;
... ...
nicolas-common/src/test/resources/199704210011.bin renamed to nicolas-lib/src/test/resources/199704210011.bin
No preview for this file type
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore 0 → 100644
  1 +*.txt
0 2 \ No newline at end of file
... ...
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md 0 → 100644
  1 +To generate resources in this folder, use nicolas-trainer module.
0 2 \ No newline at end of file
... ...
nicolas-multiservice/pom.xml
... ... @@ -30,8 +30,12 @@
30 30  
31 31 <!-- test -->
32 32 <dependency>
  33 + <groupId>junit</groupId>
  34 + <artifactId>junit</artifactId>
  35 + </dependency>
  36 + <dependency>
33 37 <groupId>pl.waw.ipipan.zil.summ</groupId>
34   - <artifactId>nicolas-common</artifactId>
  38 + <artifactId>nicolas-lib</artifactId>
35 39 <scope>test</scope>
36 40 </dependency>
37 41  
... ...
nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java
... ... @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
9 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
11 11  
12 12 import java.io.File;
13 13 import java.io.FileInputStream;
... ... @@ -67,7 +67,7 @@ public class PreprocessorIT {
67 67 preprocessor.preprocessToFile(text, targetFile);
68 68  
69 69 try (FileInputStream inputStream = new FileInputStream(targetFile)) {
70   - TText processed = Utils.loadThriftTextFromStream(inputStream);
  70 + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream);
71 71 assertSampleProcessedText(processed);
72 72 }
73 73 }
... ...
nicolas-train/pom.xml
... ... @@ -15,10 +15,6 @@
15 15 <!-- project -->
16 16 <dependency>
17 17 <groupId>pl.waw.ipipan.zil.summ</groupId>
18   - <artifactId>nicolas-common</artifactId>
19   - </dependency>
20   - <dependency>
21   - <groupId>pl.waw.ipipan.zil.summ</groupId>
22 18 <artifactId>nicolas-lib</artifactId>
23 19 </dependency>
24 20 <dependency>
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
1   -package pl.waw.ipipan.zil.summ.nicolas.train;
  1 +package pl.waw.ipipan.zil.summ.nicolas;
2 2  
3 3 import net.lingala.zip4j.core.ZipFile;
4 4 import net.lingala.zip4j.exception.ZipException;
... ... @@ -34,7 +34,7 @@ public class PathConstants {
34 34 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal");
35 35 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv");
36 36  
37   - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff");
  37 + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff");
38 38 public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff");
39 39 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff");
40 40 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff");
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
... ... @@ -17,7 +17,7 @@ public class Constants {
17 17  
18 18 public static Set<String> loadTestTextIds() throws IOException {
19 19 try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {
20   - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING);
  20 + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING);
21 21 return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
22 22 }
23 23 }
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... ... @@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 9 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
10   -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
11   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 12  
13 13 import java.io.File;
  14 +import java.io.FileOutputStream;
14 15 import java.io.IOException;
  16 +import java.io.OutputStreamWriter;
15 17 import java.util.List;
16 18 import java.util.Map;
17 19 import java.util.Set;
... ... @@ -23,7 +25,6 @@ public class SummarizeTestCorpus {
23 25  
24 26 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
25 27  
26   -
27 28 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
28 29 private static final double SUMMARY_RATIO = 0.2;
29 30  
... ... @@ -31,8 +32,8 @@ public class SummarizeTestCorpus {
31 32 }
32 33  
33 34 public static void main(String[] args) throws IOException, NicolasException {
34   - File thriftedCorpusDir = new File("data/preprocessed");
35   - File targetDir = new File("data/summaries");
  35 + File thriftedCorpusDir = new File("data/all-preprocessed");
  36 + File targetDir = new File("data/test-system");
36 37 targetDir.mkdir();
37 38  
38 39 Set<String> testTextIds = loadTestTextIds();
... ... @@ -62,9 +63,9 @@ public class SummarizeTestCorpus {
62 63 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
63 64 StringBuilder body = new StringBuilder();
64 65 for (TSentence sentence : sentences)
65   - body.append(Utils.loadSentence2Orth(sentence)).append(" ");
  66 + body.append(TextUtils.loadSentence2Orth(sentence)).append(" ");
66 67  
67   - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();
  68 + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size();
68 69 return (int) (SUMMARY_RATIO * tokenCount);
69 70 }
70 71  
... ... @@ -73,7 +74,9 @@ public class SummarizeTestCorpus {
73 74 String textId = entry.getKey();
74 75 String summary = entry.getValue();
75 76 String targetFileName = textId + SUMMARY_FILE_SUFFIX;
76   - Utils.writeStringToFile(summary, new File(targetDir, targetFileName));
  77 + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) {
  78 + writer.write(summary);
  79 + }
77 80 }
78 81 }
79 82  
... ...
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
... ... @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 12  
  13 +import java.io.IOException;
11 14 import java.util.List;
12 15 import java.util.Map;
  16 +import java.util.Set;
  17 +import java.util.function.Function;
13 18 import java.util.stream.Collectors;
14 19  
15 20 public class MentionScorer {
16 21  
  22 + private final Set<String> STOPWORDS;
  23 +
  24 + public MentionScorer() throws IOException {
  25 + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet());
  26 + }
  27 +
17 28 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
18   - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
  29 + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase()));
19 30  
20 31 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
21   - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true);
  32 + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences);
22 33  
23 34 return booleanTokenIntersection(mention2Orth, tokenCounts);
24 35 }
25 36  
  37 + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) {
  38 + Map<TMention, String> mention2orth = Maps.newHashMap();
  39 + for (TSentence s : sents) {
  40 + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  41 +
  42 + for (TMention m : s.getMentions()) {
  43 + StringBuilder mentionOrth = new StringBuilder();
  44 + for (String tokId : m.getChildIds()) {
  45 + TToken token = tokId2tok.get(tokId);
  46 + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
  47 + continue;
  48 + }
  49 +
  50 + if (!token.isNoPrecedingSpace())
  51 + mentionOrth.append(" ");
  52 + mentionOrth.append(token.getOrth());
  53 + }
  54 + mention2orth.put(m, mentionOrth.toString().trim());
  55 + }
  56 + }
  57 + return mention2orth;
  58 + }
  59 +
26 60 private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
27 61 Map<TMention, Double> mention2score = Maps.newHashMap();
28 62 for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
29 63 TMention mention = entry.getKey();
30 64 String mentionOrth = mention2Orth.get(mention);
31   - for (String token : Utils.tokenize(mentionOrth)) {
  65 + for (String token : TextUtils.tokenize(mentionOrth)) {
32 66 if (tokenCounts.contains(token.toLowerCase())) {
33 67 mention2score.put(mention, 1.0);
34 68 break;
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java
... ... @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset;
6 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
7 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 10  
11 11 import java.util.List;
12 12 import java.util.Map;
13 13  
14 14 public class SentenceScorer {
  15 +
15 16 public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) {
16   - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase()));
  17 + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase()));
17 18  
18 19 Map<TSentence, Double> sentence2score = Maps.newHashMap();
19 20 for (TParagraph paragraph : preprocessedText.getParagraphs())
20 21 for (TSentence sentence : paragraph.getSentences()) {
21 22 double score = 0.0;
22 23  
23   - String orth = Utils.loadSentence2Orth(sentence);
24   - List<String> tokens = Utils.tokenize(orth);
  24 + String orth = TextUtils.loadSentence2Orth(sentence);
  25 + List<String> tokens = TextUtils.tokenize(orth);
25 26 for (String token : tokens) {
26 27 score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0;
27 28 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
... ... @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat;
5 5 import org.apache.commons.csv.CSVParser;
6 6 import org.apache.commons.csv.CSVRecord;
7 7 import org.apache.commons.csv.QuoteMode;
8   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
9 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
11 11  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java
... ... @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset;
8 8 import org.apache.commons.io.FileUtils;
9 9 import pl.waw.ipipan.zil.summ.eval.Main;
10 10 import pl.waw.ipipan.zil.summ.eval.rouge.RougeN;
11   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  11 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
12 12  
13 13 import java.io.File;
14 14 import java.io.IOException;
15 15 import java.util.*;
16 16 import java.util.stream.Collectors;
17 17  
18   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  18 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
19 19  
20 20 public class CreateOptimalSummaries {
21 21  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
4 4  
5 5 public class DownloadCorpus {
6 6  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
4 4  
5 5 public class DownloadTrainingResources {
6 6  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2  
3   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
  3 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
4 4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
5 5 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
6 6 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
7 7  
8 8 import javax.xml.bind.JAXBException;
9 9 import java.io.File;
  10 +import java.io.FileOutputStream;
10 11 import java.io.IOException;
  12 +import java.io.OutputStreamWriter;
11 13 import java.util.List;
12 14 import java.util.function.Predicate;
13 15 import java.util.stream.Collectors;
14 16 import java.util.stream.Stream;
15 17  
16   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  18 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
17 19  
18 20 public class ExtractGoldSummaries {
19 21  
... ... @@ -22,7 +24,6 @@ public class ExtractGoldSummaries {
22 24  
23 25 private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
24 26  
25   -
26 27 private ExtractGoldSummaries() {
27 28 }
28 29  
... ... @@ -47,7 +48,10 @@ public class ExtractGoldSummaries {
47 48 for (Summary summary : goldSummaries) {
48 49 File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR;
49 50 File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
50   - Utils.writeStringToFile(summary.getBody(), targetFile);
  51 +
  52 + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) {
  53 + writer.append(summary.getBody());
  54 + }
51 55 }
52 56 }
53 57 }
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... ... @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory;
10 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
13   -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;
14   -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;
15   -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;
16 13 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
17 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
18 15 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
19 16 import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer;
20 17 import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer;
21 18 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer;
  19 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
  20 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
22 21 import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
23 22 import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
24 23 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
... ... @@ -37,7 +36,7 @@ import java.util.Set;
37 36 import java.util.function.Predicate;
38 37 import java.util.stream.Collectors;
39 38  
40   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  39 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
41 40  
42 41 public class PrepareTrainingData {
43 42  
... ... @@ -61,7 +60,7 @@ public class PrepareTrainingData {
61 60 MentionScorer mentionScorer = new MentionScorer();
62 61 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
63 62  
64   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  63 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
65 64  
66 65 int i = 1;
67 66 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
... ... @@ -105,7 +104,7 @@ public class PrepareTrainingData {
105 104 SentenceScorer sentenceScorer = new SentenceScorer();
106 105 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
107 106  
108   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  107 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
109 108  
110 109 int i = 1;
111 110 for (String textId : id2preprocessedText.keySet()) {
... ... @@ -149,7 +148,7 @@ public class PrepareTrainingData {
149 148 ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS);
150 149 ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
151 150  
152   - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList());
  151 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
153 152  
154 153 int i = 1;
155 154 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java
... ... @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
9 9 import java.io.File;
10 10 import java.util.Arrays;
11 11  
12   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  12 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
13 13  
14 14 public class PreprocessCorpus {
15 15  
... ...
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
... ... @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
3 3 import org.apache.commons.lang3.time.StopWatch;
4 4 import org.slf4j.Logger;
5 5 import org.slf4j.LoggerFactory;
6   -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
7 7 import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings;
8 8 import weka.classifiers.Classifier;
9 9 import weka.core.Instances;
... ... @@ -14,7 +14,7 @@ import java.io.FileOutputStream;
14 14 import java.io.ObjectOutputStream;
15 15 import java.util.logging.LogManager;
16 16  
17   -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*;
  17 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
18 18  
19 19 public class TrainAllModels {
20 20  
... ...
... ... @@ -10,15 +10,12 @@
10 10  
11 11 <packaging>pom</packaging>
12 12  
13   -
14 13 <modules>
15 14 <module>nicolas-lib</module>
16 15 <module>nicolas-cli</module>
17 16 <module>nicolas-model</module>
18 17 <module>nicolas-train</module>
19   - <module>nicolas-common</module>
20 18 <module>nicolas-multiservice</module>
21   - <module>nicolas-eval</module>
22 19 </modules>
23 20  
24 21 <properties>
... ... @@ -59,23 +56,23 @@
59 56 <!-- project -->
60 57 <dependency>
61 58 <groupId>pl.waw.ipipan.zil.summ</groupId>
62   - <artifactId>nicolas-model</artifactId>
  59 + <artifactId>nicolas-cli</artifactId>
63 60 <version>${project.version}</version>
64   - <scope>runtime</scope>
65 61 </dependency>
66 62 <dependency>
67 63 <groupId>pl.waw.ipipan.zil.summ</groupId>
68   - <artifactId>nicolas-common</artifactId>
  64 + <artifactId>nicolas-lib</artifactId>
69 65 <version>${project.version}</version>
70 66 </dependency>
71 67 <dependency>
72 68 <groupId>pl.waw.ipipan.zil.summ</groupId>
73   - <artifactId>nicolas-zero</artifactId>
  69 + <artifactId>nicolas-model</artifactId>
74 70 <version>${project.version}</version>
  71 + <scope>runtime</scope>
75 72 </dependency>
76 73 <dependency>
77 74 <groupId>pl.waw.ipipan.zil.summ</groupId>
78   - <artifactId>nicolas-lib</artifactId>
  75 + <artifactId>nicolas-multiservice</artifactId>
79 76 <version>${project.version}</version>
80 77 </dependency>
81 78 <dependency>
... ... @@ -83,11 +80,6 @@
83 80 <artifactId>nicolas-train</artifactId>
84 81 <version>${project.version}</version>
85 82 </dependency>
86   - <dependency>
87   - <groupId>pl.waw.ipipan.zil.summ</groupId>
88   - <artifactId>nicolas-multiservice</artifactId>
89   - <version>${project.version}</version>
90   - </dependency>
91 83  
92 84 <!-- internal -->
93 85 <dependency>
... ...