Commit 1a009dd0c4f78b9367ce117f0edd6e982cb4ebdf

Authored by Mateusz Kopeć
1 parent 7e387f1c

clean up modules

Showing 50 changed files with 311 additions and 685 deletions
README.md 0 → 100644
  1 +# Nicolas
  2 +
  3 +Summarization tool that uses coreference information as the main source of information for content selection.
  4 +
eval.sh 0 → 100755
  1 +#!/usr/bin/env bash
  2 +
nicolas-cli/pom.xml
@@ -22,6 +22,11 @@ @@ -22,6 +22,11 @@
22 <groupId>pl.waw.ipipan.zil.summ</groupId> 22 <groupId>pl.waw.ipipan.zil.summ</groupId>
23 <artifactId>nicolas-lib</artifactId> 23 <artifactId>nicolas-lib</artifactId>
24 </dependency> 24 </dependency>
  25 + <dependency>
  26 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  27 + <artifactId>nicolas-model</artifactId>
  28 + <scope>runtime</scope>
  29 + </dependency>
25 30
26 <!-- third party --> 31 <!-- third party -->
27 <dependency> 32 <dependency>
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java
@@ -5,9 +5,9 @@ import org.slf4j.Logger; @@ -5,9 +5,9 @@ import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas; 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 import pl.waw.ipipan.zil.summ.nicolas.NicolasException; 10 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
11 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; 11 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
12 12
13 import java.io.*; 13 import java.io.*;
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java
@@ -5,10 +5,10 @@ import org.junit.ClassRule; @@ -5,10 +5,10 @@ import org.junit.ClassRule;
5 import org.junit.Test; 5 import org.junit.Test;
6 import org.junit.rules.TemporaryFolder; 6 import org.junit.rules.TemporaryFolder;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
  8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas; 9 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
11 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; 10 import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 12
13 import java.io.File; 13 import java.io.File;
14 import java.io.FileInputStream; 14 import java.io.FileInputStream;
@@ -29,7 +29,7 @@ public class ClientTest { @@ -29,7 +29,7 @@ public class ClientTest {
29 @Test 29 @Test
30 public void processSampleText() throws Exception { 30 public void processSampleText() throws Exception {
31 Preprocessor preprocessor = mock(Preprocessor.class); 31 Preprocessor preprocessor = mock(Preprocessor.class);
32 - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); 32 + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
33 when(preprocessor.preprocess(any())).thenReturn(ttext); 33 when(preprocessor.preprocess(any())).thenReturn(ttext);
34 34
35 Nicolas nicolas = mock(Nicolas.class); 35 Nicolas nicolas = mock(Nicolas.class);
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java
@@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils;
4 import org.junit.ClassRule; 4 import org.junit.ClassRule;
5 import org.junit.Test; 5 import org.junit.Test;
6 import org.junit.rules.TemporaryFolder; 6 import org.junit.rules.TemporaryFolder;
7 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 7 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 8
9 import java.io.File; 9 import java.io.File;
10 import java.io.FileInputStream; 10 import java.io.FileInputStream;
nicolas-common/pom.xml deleted
1 -<?xml version="1.0" encoding="UTF-8"?>  
2 -<project xmlns="http://maven.apache.org/POM/4.0.0"  
3 - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  
4 - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">  
5 - <modelVersion>4.0.0</modelVersion>  
6 - <parent>  
7 - <artifactId>nicolas-container</artifactId>  
8 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
9 - <version>1.0-SNAPSHOT</version>  
10 - </parent>  
11 -  
12 - <artifactId>nicolas-common</artifactId>  
13 -  
14 - <dependencies>  
15 - <!-- internal -->  
16 - <dependency>  
17 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
18 - <artifactId>pscapi</artifactId>  
19 - </dependency>  
20 - <dependency>  
21 - <groupId>pl.waw.ipipan.zil.multiservice</groupId>  
22 - <artifactId>utils</artifactId>  
23 - </dependency>  
24 -  
25 - <!-- third party -->  
26 - <dependency>  
27 - <groupId>nz.ac.waikato.cms.weka</groupId>  
28 - <artifactId>weka-stable</artifactId>  
29 - </dependency>  
30 - <dependency>  
31 - <groupId>commons-io</groupId>  
32 - <artifactId>commons-io</artifactId>  
33 - </dependency>  
34 -  
35 - <!-- logging -->  
36 - <dependency>  
37 - <groupId>org.slf4j</groupId>  
38 - <artifactId>slf4j-api</artifactId>  
39 - </dependency>  
40 -  
41 - </dependencies>  
42 -  
43 -</project>  
44 \ No newline at end of file 0 \ No newline at end of file
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.common;  
2 -  
3 -import com.google.common.collect.Lists;  
4 -import com.google.common.collect.Maps;  
5 -import com.google.common.collect.Sets;  
6 -import org.apache.commons.io.IOUtils;  
7 -import org.slf4j.Logger;  
8 -import org.slf4j.LoggerFactory;  
9 -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;  
10 -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;  
11 -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;  
12 -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;  
13 -import weka.classifiers.Classifier;  
14 -import weka.core.Attribute;  
15 -import weka.core.Instances;  
16 -  
17 -import java.io.*;  
18 -import java.util.*;  
19 -import java.util.function.Function;  
20 -import java.util.stream.Collectors;  
21 -  
22 -public class Utils {  
23 -  
24 - private static final Logger LOG = LoggerFactory.getLogger(Utils.class);  
25 -  
26 - private static final String DATASET_NAME = "Dataset";  
27 -  
28 - private Utils() {  
29 - }  
30 -  
31 - public static void writeStringToFile(String string, File file) throws IOException {  
32 - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) {  
33 - bw.append(string);  
34 - }  
35 - }  
36 -  
37 - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {  
38 - LOG.info("Loading classifier from path: {}...", modelResourcePath);  
39 - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) {  
40 - if (stream == null) {  
41 - throw new IOException("Model not found at: " + modelResourcePath);  
42 - }  
43 - try (ObjectInputStream ois = new ObjectInputStream(stream)) {  
44 - Classifier classifier = (Classifier) ois.readObject();  
45 - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());  
46 - return classifier;  
47 - } catch (ClassNotFoundException e) {  
48 - LOG.error("Error loading serialized classifier, class not found.", e);  
49 - throw new IOException(e);  
50 - }  
51 - }  
52 - }  
53 -  
54 - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException {  
55 - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) {  
56 - return (TText) ois.readObject();  
57 - } catch (ClassNotFoundException e) {  
58 - LOG.error("Error reading serialized thrift text file, class not found.", e);  
59 - throw new IOException(e);  
60 - }  
61 - }  
62 -  
63 - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException {  
64 - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) {  
65 - if (stream == null) {  
66 - throw new IOException("Resource not found at: " + textResourcePath);  
67 - }  
68 - return loadThriftTextFromStream(stream);  
69 - }  
70 - }  
71 -  
72 - public static List<String> loadLinesFromResource(String resourcePath) throws IOException {  
73 - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) {  
74 - return IOUtils.readLines(stream, Constants.ENCODING);  
75 - }  
76 - }  
77 -  
78 - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList  
79 - public static Instances createNewInstances(ArrayList<Attribute> attributesList) {  
80 - Instances instances = new Instances(DATASET_NAME, attributesList, 0);  
81 - instances.setClassIndex(0);  
82 - return instances;  
83 - }  
84 -  
85 - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException {  
86 - LOG.info("Loading classifier...");  
87 - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) {  
88 - Classifier classifier = (Classifier) ois.readObject();  
89 - LOG.info("Done. " + classifier.toString());  
90 - return classifier;  
91 - }  
92 - }  
93 -  
94 - public static List<String> tokenize(String text) {  
95 - return Arrays.asList(text.split("[^\\p{L}0-9]+"));  
96 - }  
97 -  
98 - public static List<String> tokenizeOnWhitespace(String text) {  
99 - return Arrays.asList(text.split(" +"));  
100 - }  
101 -  
102 - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) {  
103 - Map<TMention, String> mention2orth = Maps.newHashMap();  
104 - for (TSentence s : sents) {  
105 - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth));  
106 - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace));  
107 -  
108 - for (TMention m : s.getMentions()) {  
109 - StringBuffer mentionOrth = new StringBuffer();  
110 - for (String tokId : m.getHeadIds()) {  
111 - if (!tokId2nps.get(tokId))  
112 - mentionOrth.append(" ");  
113 - mentionOrth.append(tokId2orth.get(tokId));  
114 - }  
115 - mention2orth.put(m, mentionOrth.toString().trim());  
116 - }  
117 - }  
118 - return mention2orth;  
119 - }  
120 -  
121 - private static final Collection<String> STOPWORDS = Sets.newHashSet();  
122 -  
123 - static {  
124 - STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co"));  
125 - }  
126 -  
127 - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) {  
128 - Map<TMention, String> mention2orth = Maps.newHashMap();  
129 - for (TSentence s : sents) {  
130 - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));  
131 -  
132 - for (TMention m : s.getMentions()) {  
133 - StringBuffer mentionOrth = new StringBuffer();  
134 - for (String tokId : m.getChildIds()) {  
135 - TToken token = tokId2tok.get(tokId);  
136 - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {  
137 - continue;  
138 - }  
139 -  
140 - if (!token.isNoPrecedingSpace())  
141 - mentionOrth.append(" ");  
142 - mentionOrth.append(token.getOrth());  
143 - }  
144 - mention2orth.put(m, mentionOrth.toString().trim());  
145 - }  
146 - }  
147 - return mention2orth;  
148 - }  
149 -  
150 - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) {  
151 - Map<TMention, String> mention2base = Maps.newHashMap();  
152 - for (TSentence s : sents) {  
153 - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase()));  
154 -  
155 - for (TMention m : s.getMentions()) {  
156 - StringBuilder mentionBase = new StringBuilder();  
157 - for (String tokId : m.getChildIds()) {  
158 - mentionBase.append(" ");  
159 - mentionBase.append(tokId2base.get(tokId));  
160 - }  
161 - mention2base.put(m, mentionBase.toString().toLowerCase().trim());  
162 - }  
163 - }  
164 - return mention2base;  
165 - }  
166 -  
167 - public static String loadSentence2Orth(TSentence sentence) {  
168 - return loadSentence2Orth(sentence, Sets.newHashSet());  
169 - }  
170 -  
171 - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) {  
172 - StringBuilder sb = new StringBuilder();  
173 - for (TToken token : sentence.getTokens()) {  
174 - if (tokenIdsToSkip.contains(token.getId())) {  
175 - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence));  
176 - continue;  
177 - }  
178 - if (!token.isNoPrecedingSpace())  
179 - sb.append(" ");  
180 - sb.append(token.getOrth());  
181 - }  
182 - return sb.toString().trim();  
183 - }  
184 -  
185 -}  
186 \ No newline at end of file 0 \ No newline at end of file
nicolas-eval/pom.xml deleted
1 -<?xml version="1.0" encoding="UTF-8"?>  
2 -<project xmlns="http://maven.apache.org/POM/4.0.0"  
3 - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"  
4 - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">  
5 - <parent>  
6 - <artifactId>nicolas-container</artifactId>  
7 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
8 - <version>1.0-SNAPSHOT</version>  
9 - </parent>  
10 - <modelVersion>4.0.0</modelVersion>  
11 -  
12 - <artifactId>nicolas-eval</artifactId>  
13 -  
14 - <dependencies>  
15 - <!-- project -->  
16 - <dependency>  
17 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
18 - <artifactId>nicolas-lib</artifactId>  
19 - </dependency>  
20 - <dependency>  
21 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
22 - <artifactId>nicolas-common</artifactId>  
23 - </dependency>  
24 -  
25 - <!-- internal -->  
26 - <dependency>  
27 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
28 - <artifactId>eval</artifactId>  
29 - </dependency>  
30 -  
31 - <!-- third party -->  
32 - <dependency>  
33 - <groupId>nz.ac.waikato.cms.weka</groupId>  
34 - <artifactId>weka-stable</artifactId>  
35 - </dependency>  
36 - <dependency>  
37 - <groupId>org.apache.commons</groupId>  
38 - <artifactId>commons-lang3</artifactId>  
39 - </dependency>  
40 - <dependency>  
41 - <groupId>com.google.guava</groupId>  
42 - <artifactId>guava</artifactId>  
43 - </dependency>  
44 -  
45 - <!-- logging -->  
46 - <dependency>  
47 - <groupId>org.slf4j</groupId>  
48 - <artifactId>slf4j-api</artifactId>  
49 - </dependency>  
50 - <dependency>  
51 - <groupId>org.slf4j</groupId>  
52 - <artifactId>slf4j-simple</artifactId>  
53 - </dependency>  
54 -  
55 - </dependencies>  
56 -</project>  
57 \ No newline at end of file 0 \ No newline at end of file
nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted
1 -199704210012  
2 -199704210042  
3 -199704220007  
4 -199704220018  
5 -199704220021  
6 -199704220044  
7 -199704230006  
8 -199704230014  
9 -199704230029  
10 -199704230043  
11 -199704240008  
12 -199704240019  
13 -199704240020  
14 -199704240021  
15 -199704250018  
16 -199704250022  
17 -199704260014  
18 -199704260015  
19 -199704260016  
20 -199704280023  
21 -199704280025  
22 -199704280027  
23 -199704280031  
24 -199704300031  
25 -199704300042  
26 -199704300046  
27 -199801020010  
28 -199801020031  
29 -199801020035  
30 -199801020070  
31 -199801020076  
32 -199801020079  
33 -199801030068  
34 -199801030090  
35 -199801030091  
36 -199801030129  
37 -199801030148  
38 -199801030158  
39 -199801050023  
40 -199801050059  
41 -199801130087  
42 -199801130129  
43 -199801140182  
44 -199801160119  
45 -199801200106  
46 -199801220140  
47 -199801240061  
48 -199801240096  
49 -199801260047  
50 -199801260070  
51 -199801270055  
52 -199801270110  
53 -199801280123  
54 -199801280158  
55 -199801280159  
56 -199801280241  
57 -199801290022  
58 -199801310003  
59 -199801310037  
60 -199802030127  
61 -199802040159  
62 -199802040182  
63 -199802040202  
64 -199805220133  
65 -199808280158  
66 -199901190073  
67 -199901190115  
68 -199901250112  
69 -199901250117  
70 -199901270103  
71 -199901270120  
72 -199901270122  
73 -199901290095  
74 -199901300101  
75 -199902240095  
76 -199906220029  
77 -199906230024  
78 -199906240084  
79 -199906260027  
80 -199907050045  
81 -199907050076  
82 -199907140166  
83 -199907200002  
84 -199907270004  
85 -199908260001  
86 -199909090036  
87 -199909250018  
88 -199909270029  
89 -199910020027  
90 -199910020029  
91 -199910270011  
92 -199911060044  
93 -199911100038  
94 -199911100064  
95 -199911200030  
96 -199911220063  
97 -199912020060  
98 -199912180026  
99 -199912180034  
100 -199912220030  
101 -199912280024  
102 -199912280046  
103 -199912300021  
104 -199912300029  
105 -200001030029  
106 -200001030053  
107 -200001060034  
108 -200001100035  
109 -200001100046  
110 -200001170029  
111 -200001170033  
112 -200001170060  
113 -200001290045  
114 -200002220027  
115 -200002240034  
116 -200002250031  
117 -200003060062  
118 -200003110050  
119 -200004280047  
120 -200004290022  
121 -200006050119  
122 -200006260079  
123 -200006290045  
124 -200007150033  
125 -200008040076  
126 -200008220042  
127 -200008220046  
128 -200010130049  
129 -200010160054  
130 -200012130034  
131 -200012140084  
132 -200012290046  
133 -200104040019  
134 -200106050035  
135 -200108180109  
136 -200108300032  
137 -200111120045  
138 -200111150042  
139 -200111150047  
140 -200111200036  
141 -200111270049  
142 -200112030055  
143 -200112280057  
144 -200201220038  
145 -200201220050  
146 -200202020036  
147 -200202200032  
148 -200202210054  
149 -200202270044  
150 -200203010070  
151 -200203190026  
152 -200203260050  
153 -200203280017  
154 -200203290078  
nicolas-lib/pom.xml
@@ -12,15 +12,6 @@ @@ -12,15 +12,6 @@
12 <artifactId>nicolas-lib</artifactId> 12 <artifactId>nicolas-lib</artifactId>
13 13
14 <dependencies> 14 <dependencies>
15 - <!-- project -->  
16 - <dependency>  
17 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
18 - <artifactId>nicolas-common</artifactId>  
19 - </dependency>  
20 - <dependency>  
21 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
22 - <artifactId>nicolas-model</artifactId>  
23 - </dependency>  
24 15
25 <!-- internal --> 16 <!-- internal -->
26 <dependency> 17 <dependency>
@@ -61,5 +52,10 @@ @@ -61,5 +52,10 @@
61 <groupId>junit</groupId> 52 <groupId>junit</groupId>
62 <artifactId>junit</artifactId> 53 <artifactId>junit</artifactId>
63 </dependency> 54 </dependency>
  55 + <dependency>
  56 + <groupId>pl.waw.ipipan.zil.summ</groupId>
  57 + <artifactId>nicolas-model</artifactId>
  58 + <scope>test</scope>
  59 + </dependency>
64 </dependencies> 60 </dependencies>
65 </project> 61 </project>
66 \ No newline at end of file 62 \ No newline at end of file
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common; 1 +package pl.waw.ipipan.zil.summ.nicolas;
2 2
3 -import com.google.common.base.Charsets;  
4 import com.google.common.collect.ImmutableList; 3 import com.google.common.collect.ImmutableList;
5 4
6 import java.nio.charset.Charset; 5 import java.nio.charset.Charset;
7 - 6 +import java.nio.charset.StandardCharsets;
8 7
9 public class Constants { 8 public class Constants {
10 9
11 - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; 10 + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact");
  11 + public static final Charset ENCODING = StandardCharsets.UTF_8;
12 12
  13 + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/";
13 private static final String MODELS_PATH = ROOT_PATH + "models/"; 14 private static final String MODELS_PATH = ROOT_PATH + "models/";
  15 +
14 public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; 16 public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin";
15 public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; 17 public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin";
16 public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; 18 public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin";
17 19
18 private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; 20 private static final String RESOURCES_PATH = ROOT_PATH + "resources/";
19 public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; 21 public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt";
20 -  
21 - public static final Charset ENCODING = Charsets.UTF_8;  
22 -  
23 - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); 22 + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt";
24 23
25 private Constants() { 24 private Constants() {
26 } 25 }
27 -  
28 } 26 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -5,12 +5,12 @@ import com.google.common.collect.Sets; @@ -5,12 +5,12 @@ import com.google.common.collect.Sets;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
10 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 8 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
11 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; 9 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;
12 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 10 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
13 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; 11 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel;
  12 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
  13 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
15 import weka.classifiers.Classifier; 15 import weka.classifiers.Classifier;
16 16
@@ -31,9 +31,9 @@ public class Nicolas { @@ -31,9 +31,9 @@ public class Nicolas {
31 31
32 public Nicolas() throws NicolasException { 32 public Nicolas() throws NicolasException {
33 try { 33 try {
34 - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);  
35 - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);  
36 - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); 34 + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);
  35 + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);
  36 + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH);
37 37
38 mentionFeatureExtractor = new MentionFeatureExtractor(); 38 mentionFeatureExtractor = new MentionFeatureExtractor();
39 sentenceFeatureExtractor = new SentenceFeatureExtractor(); 39 sentenceFeatureExtractor = new SentenceFeatureExtractor();
@@ -57,7 +57,7 @@ public class Nicolas { @@ -57,7 +57,7 @@ public class Nicolas {
57 57
58 StringBuilder sb = new StringBuilder(); 58 StringBuilder sb = new StringBuilder();
59 for (TSentence sent : selectedSentences) { 59 for (TSentence sent : selectedSentences) {
60 - sb.append(" ").append(Utils.loadSentence2Orth(sent)); 60 + sb.append(" ").append(TextUtils.loadSentence2Orth(sent));
61 } 61 }
62 return sb.toString().trim(); 62 return sb.toString().trim();
63 } 63 }
@@ -74,7 +74,7 @@ public class Nicolas { @@ -74,7 +74,7 @@ public class Nicolas {
74 Random r = new Random(1); 74 Random r = new Random(1);
75 Set<TSentence> summary = Sets.newHashSet(); 75 Set<TSentence> summary = Sets.newHashSet();
76 for (TSentence sent : sortedSentences) { 76 for (TSentence sent : sortedSentences) {
77 - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); 77 + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size();
78 if (r.nextDouble() > 0.4 && size > targetSize) 78 if (r.nextDouble() > 0.4 && size > targetSize)
79 break; 79 break;
80 summary.add(sent); 80 summary.add(sent);
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted
1 -package pl.waw.ipipan.zil.summ.nicolas.apply;  
2 -  
3 -import com.google.common.collect.Lists;  
4 -import com.google.common.collect.Maps;  
5 -import com.google.common.collect.Sets;  
6 -import org.slf4j.Logger;  
7 -import org.slf4j.LoggerFactory;  
8 -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;  
9 -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;  
10 -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;  
11 -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;  
12 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
13 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;  
14 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
15 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;  
16 -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel;  
17 -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;  
18 -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector;  
19 -import weka.classifiers.Classifier;  
20 -import weka.core.Instance;  
21 -import weka.core.Instances;  
22 -  
23 -import java.io.BufferedWriter;  
24 -import java.io.File;  
25 -import java.io.FileWriter;  
26 -import java.util.*;  
27 -  
28 -import static java.util.stream.Collectors.toList;  
29 -  
30 -public class ApplyModel {  
31 -  
32 - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class);  
33 -  
34 - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test";  
35 - private static final String TARGET_DIR = "corpora/summaries";  
36 -  
37 - public static void main(String[] args) throws Exception {  
38 - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH);  
39 - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();  
40 -  
41 - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH);  
42 - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor();  
43 -  
44 - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector();  
45 -  
46 - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH));  
47 - int i = 1;  
48 - double avgSize = 0;  
49 - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {  
50 - TText text = entry.getValue();  
51 -  
52 - Set<TMention> goodMentions  
53 - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text);  
54 -  
55 - int targetSize = calculateTargetSize(text);  
56 - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector);  
57 - int size = Utils.tokenize(summary).size();  
58 - avgSize += size;  
59 - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) {  
60 - bw.append(summary);  
61 - }  
62 -  
63 - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey());  
64 - }  
65 -  
66 - LOG.info("Avg size:" + avgSize / id2preprocessedText.size());  
67 - }  
68 -  
69 - private static int calculateTargetSize(TText text) {  
70 - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());  
71 - StringBuffer body = new StringBuffer();  
72 - for (TSentence sent : sents)  
73 - body.append(Utils.loadSentence2Orth(sent) + " ");  
74 - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size();  
75 - return (int) (0.2 * tokenCount);  
76 - }  
77 -  
78 - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception {  
79 - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor);  
80 -  
81 - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences);  
82 -  
83 - StringBuilder sb = new StringBuilder();  
84 - for (TSentence sent : selectedSentences) {  
85 - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds));  
86 - }  
87 - return sb.toString().trim();  
88 - }  
89 -  
90 - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {  
91 -  
92 - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());  
93 -  
94 - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList());  
95 - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);  
96 -  
97 - Map<TSentence, Double> sentence2score = Maps.newHashMap();  
98 - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) {  
99 - Instance instance = entry.getValue();  
100 - instance.setDataset(instances);  
101 - double score = sentenceClassifier.classifyInstance(instance);  
102 - sentence2score.put(entry.getKey(), score);  
103 - }  
104 -  
105 - List<TSentence> sortedSents = Lists.newArrayList(sents);  
106 - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed());  
107 -  
108 - int size = 0;  
109 - Random r = new Random(1);  
110 - Set<TSentence> summary = Sets.newHashSet();  
111 - for (TSentence sent : sortedSents) {  
112 - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size();  
113 - if (r.nextDouble() > 0.4 && size > targetSize)  
114 - break;  
115 - summary.add(sent);  
116 - if (size > targetSize)  
117 - break;  
118 - }  
119 - List<TSentence> selectedSentences = Lists.newArrayList();  
120 - for (TSentence sent : sents) {  
121 - if (summary.contains(sent))  
122 - selectedSentences.add(sent);  
123 - }  
124 - return selectedSentences;  
125 - }  
126 -  
127 -}  
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
@@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features;
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import com.google.common.collect.Sets; 4 import com.google.common.collect.Sets;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
7 6
8 import java.util.List; 7 import java.util.List;
9 import java.util.Map; 8 import java.util.Map;
@@ -38,7 +37,6 @@ public class FeatureHelper { @@ -38,7 +37,6 @@ public class FeatureHelper {
38 private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); 37 private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap();
39 private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); 38 private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap();
40 39
41 -  
42 public FeatureHelper(TText preprocessedText) { 40 public FeatureHelper(TText preprocessedText) {
43 text = preprocessedText; 41 text = preprocessedText;
44 42
@@ -60,9 +58,9 @@ public class FeatureHelper { @@ -60,9 +58,9 @@ public class FeatureHelper {
60 int sentIdx = 0; 58 int sentIdx = 0;
61 int mentionIdx = 0; 59 int mentionIdx = 0;
62 for (TParagraph par : preprocessedText.getParagraphs()) { 60 for (TParagraph par : preprocessedText.getParagraphs()) {
63 - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); 61 + Map<TMention, String> m2o = loadMention2Orth(par.getSentences());
64 mention2Orth.putAll(m2o); 62 mention2Orth.putAll(m2o);
65 - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); 63 + Map<TMention, String> m2b = loadMention2Base(par.getSentences());
66 mention2Base.putAll(m2b); 64 mention2Base.putAll(m2b);
67 65
68 int sentIdxInPar = 0; 66 int sentIdxInPar = 0;
@@ -221,4 +219,40 @@ public class FeatureHelper { @@ -221,4 +219,40 @@ public class FeatureHelper {
221 return null; 219 return null;
222 return mention2sent.get(mention).getTokens().get(idx - 1); 220 return mention2sent.get(mention).getTokens().get(idx - 1);
223 } 221 }
  222 +
  223 + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) {
  224 + Map<TMention, String> mention2orth = Maps.newHashMap();
  225 + for (TSentence s : sents) {
  226 + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  227 +
  228 + for (TMention m : s.getMentions()) {
  229 + StringBuilder mentionOrth = new StringBuilder();
  230 + for (String tokId : m.getChildIds()) {
  231 + TToken token = tokId2tok.get(tokId);
  232 + if (!token.isNoPrecedingSpace())
  233 + mentionOrth.append(" ");
  234 + mentionOrth.append(token.getOrth());
  235 + }
  236 + mention2orth.put(m, mentionOrth.toString().trim());
  237 + }
  238 + }
  239 + return mention2orth;
  240 + }
  241 +
  242 + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) {
  243 + Map<TMention, String> mention2base = Maps.newHashMap();
  244 + for (TSentence s : sents) {
  245 + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase()));
  246 +
  247 + for (TMention m : s.getMentions()) {
  248 + StringBuilder mentionBase = new StringBuilder();
  249 + for (String tokId : m.getChildIds()) {
  250 + mentionBase.append(" ");
  251 + mentionBase.append(tokId2base.get(tokId));
  252 + }
  253 + mention2base.put(m, mentionBase.toString().toLowerCase().trim());
  254 + }
  255 + }
  256 + return mention2base;
  257 + }
224 } 258 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention;
3 import com.google.common.collect.Lists; 3 import com.google.common.collect.Lists;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.*; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.*;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
7 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; 7 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; 9 import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
11 import weka.core.Attribute; 11 import weka.core.Attribute;
12 12
13 import java.io.IOException; 13 import java.io.IOException;
@@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor {
21 private final List<String> frequentBases; 21 private final List<String> frequentBases;
22 22
23 public MentionFeatureExtractor() throws IOException { 23 public MentionFeatureExtractor() throws IOException {
24 - frequentBases = loadFrequentBases(); 24 + frequentBases = ResourceUtils.loadFrequentBases();
25 25
26 //coref 26 //coref
27 addNumericAttributeNormalized("chain_length"); 27 addNumericAttributeNormalized("chain_length");
@@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor {
80 fillSortedAttributes("score"); 80 fillSortedAttributes("score");
81 } 81 }
82 82
83 - private List<String> loadFrequentBases() throws IOException {  
84 - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList());  
85 - }  
86 -  
87 private String encodeBase(String base) { 83 private String encodeBase(String base) {
88 return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); 84 return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q");
89 } 85 }
@@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor {
177 Attribute att = getAttributeByName(attributeName); 173 Attribute att = getAttributeByName(attributeName);
178 int index = att.indexOfValue(value); 174 int index = att.indexOfValue(value);
179 if (index == -1) 175 if (index == -1)
180 - LOG.warn(value + " not found for attribute " + attributeName); 176 + LOG.warn("{} not found for attribute {}", value, attributeName);
181 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); 177 attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index));
182 } 178 }
183 179
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,8 +5,7 @@ import org.slf4j.Logger; @@ -5,8 +5,7 @@ import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;  
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 8 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
10 import weka.classifiers.Classifier; 9 import weka.classifiers.Classifier;
11 import weka.core.Instance; 10 import weka.core.Instance;
12 import weka.core.Instances; 11 import weka.core.Instances;
@@ -24,7 +23,7 @@ public class MentionModel { @@ -24,7 +23,7 @@ public class MentionModel {
24 public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { 23 public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception {
25 Set<TMention> goodMentions = Sets.newHashSet(); 24 Set<TMention> goodMentions = Sets.newHashSet();
26 25
27 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 26 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
28 Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); 27 Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor);
29 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { 28 for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) {
30 Instance instance = entry.getValue(); 29 Instance instance = entry.getValue();
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;  
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 9 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
11 import weka.classifiers.Classifier; 10 import weka.classifiers.Classifier;
12 import weka.core.Instance; 11 import weka.core.Instance;
13 import weka.core.Instances; 12 import weka.core.Instances;
@@ -23,7 +22,7 @@ public class SentenceModel { @@ -23,7 +22,7 @@ public class SentenceModel {
23 } 22 }
24 23
25 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { 24 public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception {
26 - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); 25 + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList());
27 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); 26 Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions);
28 27
29 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 28 Map<TSentence, Double> sentence2score = Maps.newHashMap();
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
1 -package pl.waw.ipipan.zil.summ.nicolas; 1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
2 2
3 import com.google.common.collect.Maps; 3 import com.google.common.collect.Maps;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
@@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; @@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
11 import weka.core.Attribute; 11 import weka.core.Attribute;
12 import weka.core.DenseInstance; 12 import weka.core.DenseInstance;
13 import weka.core.Instance; 13 import weka.core.Instance;
  14 +import weka.core.Instances;
14 15
  16 +import java.util.ArrayList;
15 import java.util.List; 17 import java.util.List;
16 import java.util.Map; 18 import java.util.Map;
17 import java.util.Set; 19 import java.util.Set;
@@ -22,6 +24,8 @@ public class InstanceUtils { @@ -22,6 +24,8 @@ public class InstanceUtils {
22 24
23 private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); 25 private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class);
24 26
  27 + private static final String DATASET_NAME = "Dataset";
  28 +
25 private InstanceUtils() { 29 private InstanceUtils() {
26 } 30 }
27 31
@@ -60,4 +64,11 @@ public class InstanceUtils { @@ -60,4 +64,11 @@ public class InstanceUtils {
60 LOG.info("Extracted features of {} sentences.", sentence2instance.size()); 64 LOG.info("Extracted features of {} sentences.", sentence2instance.size());
61 return sentence2instance; 65 return sentence2instance;
62 } 66 }
  67 +
  68 + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList
  69 + public static Instances createNewInstances(ArrayList<Attribute> attributesList) {
  70 + Instances instances = new Instances(DATASET_NAME, attributesList, 0);
  71 + instances.setClassIndex(0);
  72 + return instances;
  73 + }
63 } 74 }
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
  2 +
  3 +import org.apache.commons.io.IOUtils;
  4 +import org.slf4j.Logger;
  5 +import org.slf4j.LoggerFactory;
  6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import weka.classifiers.Classifier;
  8 +
  9 +import java.io.IOException;
  10 +import java.io.InputStream;
  11 +import java.io.ObjectInputStream;
  12 +import java.util.List;
  13 +import java.util.function.Predicate;
  14 +import java.util.stream.Collectors;
  15 +
  16 +public class ResourceUtils {
  17 +
  18 + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class);
  19 +
  20 + private ResourceUtils() {
  21 + }
  22 +
  23 + public static List<String> loadFrequentBases() throws IOException {
  24 + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH);
  25 + }
  26 +
  27 + public static List<String> loadStopwords() throws IOException {
  28 + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH);
  29 + }
  30 +
  31 + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException {
  32 + LOG.info("Loading classifier from path: {}...", modelResourcePath);
  33 + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) {
  34 + if (stream == null) {
  35 + throw new IOException("Model not found at: " + modelResourcePath);
  36 + }
  37 + try (ObjectInputStream ois = new ObjectInputStream(stream)) {
  38 + Classifier classifier = (Classifier) ois.readObject();
  39 + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName());
  40 + return classifier;
  41 + } catch (ClassNotFoundException e) {
  42 + LOG.error("Error loading serialized classifier, class not found.", e);
  43 + throw new IOException(e);
  44 + }
  45 + }
  46 + }
  47 +
  48 + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException {
  49 + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) {
  50 + return IOUtils.readLines(stream, Constants.ENCODING)
  51 + .stream()
  52 + .map(String::trim)
  53 + .map(String::toLowerCase)
  54 + .filter(((Predicate<String>) String::isEmpty).negate())
  55 + .sorted()
  56 + .distinct()
  57 + .collect(Collectors.toList());
  58 + }
  59 + }
  60 +
  61 +
  62 +}
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java 0 → 100644
  1 +package pl.waw.ipipan.zil.summ.nicolas.utils;
  2 +
  3 +import com.google.common.collect.Sets;
  4 +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
  5 +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  6 +
  7 +import java.util.Arrays;
  8 +import java.util.List;
  9 +import java.util.Set;
  10 +
  11 +public class TextUtils {
  12 +
  13 + private TextUtils() {
  14 + }
  15 +
  16 + public static List<String> tokenize(String text) {
  17 + return Arrays.asList(text.split("[^\\p{L}0-9]+"));
  18 + }
  19 +
  20 + public static List<String> tokenizeOnWhitespace(String text) {
  21 + return Arrays.asList(text.split(" +"));
  22 + }
  23 +
  24 + public static String loadSentence2Orth(TSentence sentence) {
  25 + return loadSentence2Orth(sentence, Sets.newHashSet());
  26 + }
  27 +
  28 + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) {
  29 + StringBuilder sb = new StringBuilder();
  30 + for (TToken token : sentence.getTokens()) {
  31 + if (tokenIdsToSkip.contains(token.getId())) {
  32 + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence));
  33 + continue;
  34 + }
  35 + if (!token.isNoPrecedingSpace())
  36 + sb.append(" ");
  37 + sb.append(token.getOrth());
  38 + }
  39 + return sb.toString().trim();
  40 + }
  41 +}
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common; 1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2
3 import com.google.common.base.Predicates; 3 import com.google.common.base.Predicates;
4 import com.google.common.collect.Maps; 4 import com.google.common.collect.Maps;
@@ -58,4 +58,12 @@ public class ThriftUtils { @@ -58,4 +58,12 @@ public class ThriftUtils {
58 } 58 }
59 } 59 }
60 60
  61 + public static TText loadThriftTextFromResource(String resourcePath) {
  62 + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) {
  63 + return loadThriftTextFromStream(stream);
  64 + } catch (IOException e) {
  65 + LOG.error("Error reading serialized Thrift text from resource", e);
  66 + return null;
  67 + }
  68 + }
61 } 69 }
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common; 1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2
3 import java.io.IOException; 3 import java.io.IOException;
4 import java.io.InputStream; 4 import java.io.InputStream;
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 10 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; 11 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor;
12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 12 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
13 import weka.core.Attribute; 13 import weka.core.Attribute;
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
@@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero;
3 import com.google.common.collect.Sets; 3 import com.google.common.collect.Sets;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants;  
7 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
  7 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
8 import weka.classifiers.Classifier; 8 import weka.classifiers.Classifier;
9 import weka.core.Instance; 9 import weka.core.Instance;
10 import weka.core.Instances; 10 import weka.core.Instances;
@@ -24,7 +24,7 @@ public class ZeroSubjectInjector { @@ -24,7 +24,7 @@ public class ZeroSubjectInjector {
24 public ZeroSubjectInjector() throws Exception { 24 public ZeroSubjectInjector() throws Exception {
25 classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); 25 classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH);
26 featureExtractor = new ZeroFeatureExtractor(); 26 featureExtractor = new ZeroFeatureExtractor();
27 - instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 27 + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
28 } 28 }
29 29
30 public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { 30 public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception {
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
@@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas;
3 import org.junit.BeforeClass; 3 import org.junit.BeforeClass;
4 import org.junit.Test; 4 import org.junit.Test;
5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 5 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 6 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  7 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
7 8
8 import static org.junit.Assert.assertTrue; 9 import static org.junit.Assert.assertTrue;
9 10
@@ -20,9 +21,9 @@ public class NicolasTest { @@ -20,9 +21,9 @@ public class NicolasTest {
20 21
21 @Test 22 @Test
22 public void shouldSummarizeThriftText() throws Exception { 23 public void shouldSummarizeThriftText() throws Exception {
23 - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); 24 + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH);
24 String summary = nicolas.summarizeThrift(thriftText, 5); 25 String summary = nicolas.summarizeThrift(thriftText, 5);
25 - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); 26 + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size();
26 assertTrue(summaryTokensCount > 0); 27 assertTrue(summaryTokensCount > 0);
27 assertTrue(summaryTokensCount < 10); 28 assertTrue(summaryTokensCount < 10);
28 } 29 }
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java
1 -package pl.waw.ipipan.zil.summ.nicolas.common; 1 +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift;
2 2
3 import org.junit.Test; 3 import org.junit.Test;
4 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 4 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
@@ -7,13 +7,13 @@ import java.io.InputStream; @@ -7,13 +7,13 @@ import java.io.InputStream;
7 7
8 import static org.junit.Assert.assertEquals; 8 import static org.junit.Assert.assertEquals;
9 9
10 -public class UtilsTest { 10 +public class ThriftUtilsTest {
11 11
12 private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; 12 private static final String SAMPLE_TEXT_PATH = "/199704210011.bin";
13 13
14 @Test 14 @Test
15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { 15 public void shouldDeserializeTextIgnoringClassVersionId() throws Exception {
16 - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { 16 + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) {
17 TText text = ThriftUtils.loadThriftTextFromStream(stream); 17 TText text = ThriftUtils.loadThriftTextFromStream(stream);
18 assertEquals(26, text.getParagraphs().size()); 18 assertEquals(26, text.getParagraphs().size());
19 assertEquals(2, text.getParagraphs().get(4).getSentences().size()); 19 assertEquals(2, text.getParagraphs().get(4).getSentences().size());
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils;
5 import org.junit.Test; 5 import org.junit.Test;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;  
9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 8 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
  9 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
10 10
11 import java.io.IOException; 11 import java.io.IOException;
12 import java.io.InputStream; 12 import java.io.InputStream;
nicolas-common/src/test/resources/199704210011.bin renamed to nicolas-lib/src/test/resources/199704210011.bin
No preview for this file type
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore 0 → 100644
  1 +*.txt
0 \ No newline at end of file 2 \ No newline at end of file
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md 0 → 100644
  1 +To generate resources in this folder, use nicolas-trainer module.
0 \ No newline at end of file 2 \ No newline at end of file
nicolas-multiservice/pom.xml
@@ -30,8 +30,12 @@ @@ -30,8 +30,12 @@
30 30
31 <!-- test --> 31 <!-- test -->
32 <dependency> 32 <dependency>
  33 + <groupId>junit</groupId>
  34 + <artifactId>junit</artifactId>
  35 + </dependency>
  36 + <dependency>
33 <groupId>pl.waw.ipipan.zil.summ</groupId> 37 <groupId>pl.waw.ipipan.zil.summ</groupId>
34 - <artifactId>nicolas-common</artifactId> 38 + <artifactId>nicolas-lib</artifactId>
35 <scope>test</scope> 39 <scope>test</scope>
36 </dependency> 40 </dependency>
37 41
nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java
@@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 9 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
10 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 10 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
11 11
12 import java.io.File; 12 import java.io.File;
13 import java.io.FileInputStream; 13 import java.io.FileInputStream;
@@ -67,7 +67,7 @@ public class PreprocessorIT { @@ -67,7 +67,7 @@ public class PreprocessorIT {
67 preprocessor.preprocessToFile(text, targetFile); 67 preprocessor.preprocessToFile(text, targetFile);
68 68
69 try (FileInputStream inputStream = new FileInputStream(targetFile)) { 69 try (FileInputStream inputStream = new FileInputStream(targetFile)) {
70 - TText processed = Utils.loadThriftTextFromStream(inputStream); 70 + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream);
71 assertSampleProcessedText(processed); 71 assertSampleProcessedText(processed);
72 } 72 }
73 } 73 }
nicolas-train/pom.xml
@@ -15,10 +15,6 @@ @@ -15,10 +15,6 @@
15 <!-- project --> 15 <!-- project -->
16 <dependency> 16 <dependency>
17 <groupId>pl.waw.ipipan.zil.summ</groupId> 17 <groupId>pl.waw.ipipan.zil.summ</groupId>
18 - <artifactId>nicolas-common</artifactId>  
19 - </dependency>  
20 - <dependency>  
21 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
22 <artifactId>nicolas-lib</artifactId> 18 <artifactId>nicolas-lib</artifactId>
23 </dependency> 19 </dependency>
24 <dependency> 20 <dependency>
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
1 -package pl.waw.ipipan.zil.summ.nicolas.train; 1 +package pl.waw.ipipan.zil.summ.nicolas;
2 2
3 import net.lingala.zip4j.core.ZipFile; 3 import net.lingala.zip4j.core.ZipFile;
4 import net.lingala.zip4j.exception.ZipException; 4 import net.lingala.zip4j.exception.ZipException;
@@ -34,7 +34,7 @@ public class PathConstants { @@ -34,7 +34,7 @@ public class PathConstants {
34 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); 34 public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal");
35 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); 35 public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv");
36 36
37 - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); 37 + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff");
38 public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); 38 public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff");
39 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); 39 public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff");
40 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); 40 public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff");
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
@@ -17,7 +17,7 @@ public class Constants { @@ -17,7 +17,7 @@ public class Constants {
17 17
18 public static Set<String> loadTestTextIds() throws IOException { 18 public static Set<String> loadTestTextIds() throws IOException {
19 try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { 19 try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) {
20 - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); 20 + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING);
21 return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); 21 return testTextIds.stream().map(String::trim).collect(Collectors.toSet());
22 } 22 }
23 } 23 }
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; @@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas; 8 import pl.waw.ipipan.zil.summ.nicolas.Nicolas;
9 import pl.waw.ipipan.zil.summ.nicolas.NicolasException; 9 import pl.waw.ipipan.zil.summ.nicolas.NicolasException;
10 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;  
11 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 10 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
12 12
13 import java.io.File; 13 import java.io.File;
  14 +import java.io.FileOutputStream;
14 import java.io.IOException; 15 import java.io.IOException;
  16 +import java.io.OutputStreamWriter;
15 import java.util.List; 17 import java.util.List;
16 import java.util.Map; 18 import java.util.Map;
17 import java.util.Set; 19 import java.util.Set;
@@ -23,7 +25,6 @@ public class SummarizeTestCorpus { @@ -23,7 +25,6 @@ public class SummarizeTestCorpus {
23 25
24 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); 26 private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class);
25 27
26 -  
27 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; 28 private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt";
28 private static final double SUMMARY_RATIO = 0.2; 29 private static final double SUMMARY_RATIO = 0.2;
29 30
@@ -31,8 +32,8 @@ public class SummarizeTestCorpus { @@ -31,8 +32,8 @@ public class SummarizeTestCorpus {
31 } 32 }
32 33
33 public static void main(String[] args) throws IOException, NicolasException { 34 public static void main(String[] args) throws IOException, NicolasException {
34 - File thriftedCorpusDir = new File("data/preprocessed");  
35 - File targetDir = new File("data/summaries"); 35 + File thriftedCorpusDir = new File("data/all-preprocessed");
  36 + File targetDir = new File("data/test-system");
36 targetDir.mkdir(); 37 targetDir.mkdir();
37 38
38 Set<String> testTextIds = loadTestTextIds(); 39 Set<String> testTextIds = loadTestTextIds();
@@ -62,9 +63,9 @@ public class SummarizeTestCorpus { @@ -62,9 +63,9 @@ public class SummarizeTestCorpus {
62 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); 63 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList());
63 StringBuilder body = new StringBuilder(); 64 StringBuilder body = new StringBuilder();
64 for (TSentence sentence : sentences) 65 for (TSentence sentence : sentences)
65 - body.append(Utils.loadSentence2Orth(sentence)).append(" "); 66 + body.append(TextUtils.loadSentence2Orth(sentence)).append(" ");
66 67
67 - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); 68 + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size();
68 return (int) (SUMMARY_RATIO * tokenCount); 69 return (int) (SUMMARY_RATIO * tokenCount);
69 } 70 }
70 71
@@ -73,7 +74,9 @@ public class SummarizeTestCorpus { @@ -73,7 +74,9 @@ public class SummarizeTestCorpus {
73 String textId = entry.getKey(); 74 String textId = entry.getKey();
74 String summary = entry.getValue(); 75 String summary = entry.getValue();
75 String targetFileName = textId + SUMMARY_FILE_SUFFIX; 76 String targetFileName = textId + SUMMARY_FILE_SUFFIX;
76 - Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); 77 + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) {
  78 + writer.write(summary);
  79 + }
77 } 80 }
78 } 81 }
79 82
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
@@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 9 +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  10 +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils;
  11 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 12
  13 +import java.io.IOException;
11 import java.util.List; 14 import java.util.List;
12 import java.util.Map; 15 import java.util.Map;
  16 +import java.util.Set;
  17 +import java.util.function.Function;
13 import java.util.stream.Collectors; 18 import java.util.stream.Collectors;
14 19
15 public class MentionScorer { 20 public class MentionScorer {
16 21
  22 + private final Set<String> STOPWORDS;
  23 +
  24 + public MentionScorer() throws IOException {
  25 + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet());
  26 + }
  27 +
17 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { 28 public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) {
18 - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); 29 + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase()));
19 30
20 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); 31 List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList());
21 - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); 32 + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences);
22 33
23 return booleanTokenIntersection(mention2Orth, tokenCounts); 34 return booleanTokenIntersection(mention2Orth, tokenCounts);
24 } 35 }
25 36
  37 + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) {
  38 + Map<TMention, String> mention2orth = Maps.newHashMap();
  39 + for (TSentence s : sents) {
  40 + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity()));
  41 +
  42 + for (TMention m : s.getMentions()) {
  43 + StringBuilder mentionOrth = new StringBuilder();
  44 + for (String tokId : m.getChildIds()) {
  45 + TToken token = tokId2tok.get(tokId);
  46 + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) {
  47 + continue;
  48 + }
  49 +
  50 + if (!token.isNoPrecedingSpace())
  51 + mentionOrth.append(" ");
  52 + mentionOrth.append(token.getOrth());
  53 + }
  54 + mention2orth.put(m, mentionOrth.toString().trim());
  55 + }
  56 + }
  57 + return mention2orth;
  58 + }
  59 +
26 private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { 60 private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) {
27 Map<TMention, Double> mention2score = Maps.newHashMap(); 61 Map<TMention, Double> mention2score = Maps.newHashMap();
28 for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { 62 for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) {
29 TMention mention = entry.getKey(); 63 TMention mention = entry.getKey();
30 String mentionOrth = mention2Orth.get(mention); 64 String mentionOrth = mention2Orth.get(mention);
31 - for (String token : Utils.tokenize(mentionOrth)) { 65 + for (String token : TextUtils.tokenize(mentionOrth)) {
32 if (tokenCounts.contains(token.toLowerCase())) { 66 if (tokenCounts.contains(token.toLowerCase())) {
33 mention2score.put(mention, 1.0); 67 mention2score.put(mention, 1.0);
34 break; 68 break;
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java
@@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset;
6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; 6 import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 7 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 8 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
9 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 9 +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils;
10 10
11 import java.util.List; 11 import java.util.List;
12 import java.util.Map; 12 import java.util.Map;
13 13
14 public class SentenceScorer { 14 public class SentenceScorer {
  15 +
15 public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { 16 public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) {
16 - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); 17 + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase()));
17 18
18 Map<TSentence, Double> sentence2score = Maps.newHashMap(); 19 Map<TSentence, Double> sentence2score = Maps.newHashMap();
19 for (TParagraph paragraph : preprocessedText.getParagraphs()) 20 for (TParagraph paragraph : preprocessedText.getParagraphs())
20 for (TSentence sentence : paragraph.getSentences()) { 21 for (TSentence sentence : paragraph.getSentences()) {
21 double score = 0.0; 22 double score = 0.0;
22 23
23 - String orth = Utils.loadSentence2Orth(sentence);  
24 - List<String> tokens = Utils.tokenize(orth); 24 + String orth = TextUtils.loadSentence2Orth(sentence);
  25 + List<String> tokens = TextUtils.tokenize(orth);
25 for (String token : tokens) { 26 for (String token : tokens) {
26 score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; 27 score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0;
27 } 28 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
@@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat;
5 import org.apache.commons.csv.CSVParser; 5 import org.apache.commons.csv.CSVParser;
6 import org.apache.commons.csv.CSVRecord; 6 import org.apache.commons.csv.CSVRecord;
7 import org.apache.commons.csv.QuoteMode; 7 import org.apache.commons.csv.QuoteMode;
8 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 8 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 9 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; 10 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate;
11 11
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java
@@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset;
8 import org.apache.commons.io.FileUtils; 8 import org.apache.commons.io.FileUtils;
9 import pl.waw.ipipan.zil.summ.eval.Main; 9 import pl.waw.ipipan.zil.summ.eval.Main;
10 import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; 10 import pl.waw.ipipan.zil.summ.eval.rouge.RougeN;
11 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 11 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
12 12
13 import java.io.File; 13 import java.io.File;
14 import java.io.IOException; 14 import java.io.IOException;
15 import java.util.*; 15 import java.util.*;
16 import java.util.stream.Collectors; 16 import java.util.stream.Collectors;
17 17
18 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 18 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
19 19
20 public class CreateOptimalSummaries { 20 public class CreateOptimalSummaries {
21 21
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
4 4
5 public class DownloadCorpus { 5 public class DownloadCorpus {
6 6
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 3 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
4 4
5 public class DownloadTrainingResources { 5 public class DownloadTrainingResources {
6 6
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; 1 package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
2 2
3 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; 3 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; 4 import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO;
5 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; 5 import pl.waw.ipipan.zil.summ.pscapi.xml.Summary;
6 import pl.waw.ipipan.zil.summ.pscapi.xml.Text; 6 import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
7 7
8 import javax.xml.bind.JAXBException; 8 import javax.xml.bind.JAXBException;
9 import java.io.File; 9 import java.io.File;
  10 +import java.io.FileOutputStream;
10 import java.io.IOException; 11 import java.io.IOException;
  12 +import java.io.OutputStreamWriter;
11 import java.util.List; 13 import java.util.List;
12 import java.util.function.Predicate; 14 import java.util.function.Predicate;
13 import java.util.stream.Collectors; 15 import java.util.stream.Collectors;
14 import java.util.stream.Stream; 16 import java.util.stream.Stream;
15 17
16 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 18 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
17 19
18 public class ExtractGoldSummaries { 20 public class ExtractGoldSummaries {
19 21
@@ -22,7 +24,6 @@ public class ExtractGoldSummaries { @@ -22,7 +24,6 @@ public class ExtractGoldSummaries {
22 24
23 private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); 25 private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE));
24 26
25 -  
26 private ExtractGoldSummaries() { 27 private ExtractGoldSummaries() {
27 } 28 }
28 29
@@ -47,7 +48,10 @@ public class ExtractGoldSummaries { @@ -47,7 +48,10 @@ public class ExtractGoldSummaries {
47 for (Summary summary : goldSummaries) { 48 for (Summary summary : goldSummaries) {
48 File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; 49 File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR;
49 File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); 50 File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt");
50 - Utils.writeStringToFile(summary.getBody(), targetFile); 51 +
  52 + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) {
  53 + writer.append(summary.getBody());
  54 + }
51 } 55 }
52 } 56 }
53 } 57 }
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory;
10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; 10 import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; 11 import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText; 12 import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
13 -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils;  
14 -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils;  
15 -import pl.waw.ipipan.zil.summ.nicolas.common.Utils;  
16 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; 13 import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper;
17 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; 14 import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor;
18 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; 15 import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor;
19 import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; 16 import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer;
20 import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; 17 import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer;
21 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; 18 import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer;
  19 +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils;
  20 +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils;
22 import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; 21 import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder;
23 import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; 22 import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator;
24 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; 23 import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor;
@@ -37,7 +36,7 @@ import java.util.Set; @@ -37,7 +36,7 @@ import java.util.Set;
37 import java.util.function.Predicate; 36 import java.util.function.Predicate;
38 import java.util.stream.Collectors; 37 import java.util.stream.Collectors;
39 38
40 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 39 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
41 40
42 public class PrepareTrainingData { 41 public class PrepareTrainingData {
43 42
@@ -61,7 +60,7 @@ public class PrepareTrainingData { @@ -61,7 +60,7 @@ public class PrepareTrainingData {
61 MentionScorer mentionScorer = new MentionScorer(); 60 MentionScorer mentionScorer = new MentionScorer();
62 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); 61 MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor();
63 62
64 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 63 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
65 64
66 int i = 1; 65 int i = 1;
67 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 66 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
@@ -105,7 +104,7 @@ public class PrepareTrainingData { @@ -105,7 +104,7 @@ public class PrepareTrainingData {
105 SentenceScorer sentenceScorer = new SentenceScorer(); 104 SentenceScorer sentenceScorer = new SentenceScorer();
106 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); 105 SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor();
107 106
108 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 107 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
109 108
110 int i = 1; 109 int i = 1;
111 for (String textId : id2preprocessedText.keySet()) { 110 for (String textId : id2preprocessedText.keySet()) {
@@ -149,7 +148,7 @@ public class PrepareTrainingData { @@ -149,7 +148,7 @@ public class PrepareTrainingData {
149 ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); 148 ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS);
150 ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); 149 ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor();
151 150
152 - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); 151 + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList());
153 152
154 int i = 1; 153 int i = 1;
155 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { 154 for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) {
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java
@@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text;
9 import java.io.File; 9 import java.io.File;
10 import java.util.Arrays; 10 import java.util.Arrays;
11 11
12 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 12 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
13 13
14 public class PreprocessCorpus { 14 public class PreprocessCorpus {
15 15
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline;
3 import org.apache.commons.lang3.time.StopWatch; 3 import org.apache.commons.lang3.time.StopWatch;
4 import org.slf4j.Logger; 4 import org.slf4j.Logger;
5 import org.slf4j.LoggerFactory; 5 import org.slf4j.LoggerFactory;
6 -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; 6 +import pl.waw.ipipan.zil.summ.nicolas.Constants;
7 import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; 7 import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings;
8 import weka.classifiers.Classifier; 8 import weka.classifiers.Classifier;
9 import weka.core.Instances; 9 import weka.core.Instances;
@@ -14,7 +14,7 @@ import java.io.FileOutputStream; @@ -14,7 +14,7 @@ import java.io.FileOutputStream;
14 import java.io.ObjectOutputStream; 14 import java.io.ObjectOutputStream;
15 import java.util.logging.LogManager; 15 import java.util.logging.LogManager;
16 16
17 -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; 17 +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*;
18 18
19 public class TrainAllModels { 19 public class TrainAllModels {
20 20
@@ -10,15 +10,12 @@ @@ -10,15 +10,12 @@
10 10
11 <packaging>pom</packaging> 11 <packaging>pom</packaging>
12 12
13 -  
14 <modules> 13 <modules>
15 <module>nicolas-lib</module> 14 <module>nicolas-lib</module>
16 <module>nicolas-cli</module> 15 <module>nicolas-cli</module>
17 <module>nicolas-model</module> 16 <module>nicolas-model</module>
18 <module>nicolas-train</module> 17 <module>nicolas-train</module>
19 - <module>nicolas-common</module>  
20 <module>nicolas-multiservice</module> 18 <module>nicolas-multiservice</module>
21 - <module>nicolas-eval</module>  
22 </modules> 19 </modules>
23 20
24 <properties> 21 <properties>
@@ -59,23 +56,23 @@ @@ -59,23 +56,23 @@
59 <!-- project --> 56 <!-- project -->
60 <dependency> 57 <dependency>
61 <groupId>pl.waw.ipipan.zil.summ</groupId> 58 <groupId>pl.waw.ipipan.zil.summ</groupId>
62 - <artifactId>nicolas-model</artifactId> 59 + <artifactId>nicolas-cli</artifactId>
63 <version>${project.version}</version> 60 <version>${project.version}</version>
64 - <scope>runtime</scope>  
65 </dependency> 61 </dependency>
66 <dependency> 62 <dependency>
67 <groupId>pl.waw.ipipan.zil.summ</groupId> 63 <groupId>pl.waw.ipipan.zil.summ</groupId>
68 - <artifactId>nicolas-common</artifactId> 64 + <artifactId>nicolas-lib</artifactId>
69 <version>${project.version}</version> 65 <version>${project.version}</version>
70 </dependency> 66 </dependency>
71 <dependency> 67 <dependency>
72 <groupId>pl.waw.ipipan.zil.summ</groupId> 68 <groupId>pl.waw.ipipan.zil.summ</groupId>
73 - <artifactId>nicolas-zero</artifactId> 69 + <artifactId>nicolas-model</artifactId>
74 <version>${project.version}</version> 70 <version>${project.version}</version>
  71 + <scope>runtime</scope>
75 </dependency> 72 </dependency>
76 <dependency> 73 <dependency>
77 <groupId>pl.waw.ipipan.zil.summ</groupId> 74 <groupId>pl.waw.ipipan.zil.summ</groupId>
78 - <artifactId>nicolas-lib</artifactId> 75 + <artifactId>nicolas-multiservice</artifactId>
79 <version>${project.version}</version> 76 <version>${project.version}</version>
80 </dependency> 77 </dependency>
81 <dependency> 78 <dependency>
@@ -83,11 +80,6 @@ @@ -83,11 +80,6 @@
83 <artifactId>nicolas-train</artifactId> 80 <artifactId>nicolas-train</artifactId>
84 <version>${project.version}</version> 81 <version>${project.version}</version>
85 </dependency> 82 </dependency>
86 - <dependency>  
87 - <groupId>pl.waw.ipipan.zil.summ</groupId>  
88 - <artifactId>nicolas-multiservice</artifactId>  
89 - <version>${project.version}</version>  
90 - </dependency>  
91 83
92 <!-- internal --> 84 <!-- internal -->
93 <dependency> 85 <dependency>