Commit 1a009dd0c4f78b9367ce117f0edd6e982cb4ebdf
1 parent
7e387f1c
clean up modules
Showing
50 changed files
with
311 additions
and
685 deletions
README.md
0 → 100644
eval.sh
0 → 100755
nicolas-cli/pom.xml
@@ -22,6 +22,11 @@ | @@ -22,6 +22,11 @@ | ||
22 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 22 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
23 | <artifactId>nicolas-lib</artifactId> | 23 | <artifactId>nicolas-lib</artifactId> |
24 | </dependency> | 24 | </dependency> |
25 | + <dependency> | ||
26 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
27 | + <artifactId>nicolas-model</artifactId> | ||
28 | + <scope>runtime</scope> | ||
29 | + </dependency> | ||
25 | 30 | ||
26 | <!-- third party --> | 31 | <!-- third party --> |
27 | <dependency> | 32 | <dependency> |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java
@@ -5,9 +5,9 @@ import org.slf4j.Logger; | @@ -5,9 +5,9 @@ import org.slf4j.Logger; | ||
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | 10 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
11 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | 11 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
12 | 12 | ||
13 | import java.io.*; | 13 | import java.io.*; |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java
@@ -5,10 +5,10 @@ import org.junit.ClassRule; | @@ -5,10 +5,10 @@ import org.junit.ClassRule; | ||
5 | import org.junit.Test; | 5 | import org.junit.Test; |
6 | import org.junit.rules.TemporaryFolder; | 6 | import org.junit.rules.TemporaryFolder; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
11 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
12 | 12 | ||
13 | import java.io.File; | 13 | import java.io.File; |
14 | import java.io.FileInputStream; | 14 | import java.io.FileInputStream; |
@@ -29,7 +29,7 @@ public class ClientTest { | @@ -29,7 +29,7 @@ public class ClientTest { | ||
29 | @Test | 29 | @Test |
30 | public void processSampleText() throws Exception { | 30 | public void processSampleText() throws Exception { |
31 | Preprocessor preprocessor = mock(Preprocessor.class); | 31 | Preprocessor preprocessor = mock(Preprocessor.class); |
32 | - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | 32 | + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); |
33 | when(preprocessor.preprocess(any())).thenReturn(ttext); | 33 | when(preprocessor.preprocess(any())).thenReturn(ttext); |
34 | 34 | ||
35 | Nicolas nicolas = mock(Nicolas.class); | 35 | Nicolas nicolas = mock(Nicolas.class); |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java
@@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; | @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; | ||
4 | import org.junit.ClassRule; | 4 | import org.junit.ClassRule; |
5 | import org.junit.Test; | 5 | import org.junit.Test; |
6 | import org.junit.rules.TemporaryFolder; | 6 | import org.junit.rules.TemporaryFolder; |
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 7 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
8 | 8 | ||
9 | import java.io.File; | 9 | import java.io.File; |
10 | import java.io.FileInputStream; | 10 | import java.io.FileInputStream; |
nicolas-common/pom.xml deleted
1 | -<?xml version="1.0" encoding="UTF-8"?> | ||
2 | -<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | - <modelVersion>4.0.0</modelVersion> | ||
6 | - <parent> | ||
7 | - <artifactId>nicolas-container</artifactId> | ||
8 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
9 | - <version>1.0-SNAPSHOT</version> | ||
10 | - </parent> | ||
11 | - | ||
12 | - <artifactId>nicolas-common</artifactId> | ||
13 | - | ||
14 | - <dependencies> | ||
15 | - <!-- internal --> | ||
16 | - <dependency> | ||
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | - <artifactId>pscapi</artifactId> | ||
19 | - </dependency> | ||
20 | - <dependency> | ||
21 | - <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
22 | - <artifactId>utils</artifactId> | ||
23 | - </dependency> | ||
24 | - | ||
25 | - <!-- third party --> | ||
26 | - <dependency> | ||
27 | - <groupId>nz.ac.waikato.cms.weka</groupId> | ||
28 | - <artifactId>weka-stable</artifactId> | ||
29 | - </dependency> | ||
30 | - <dependency> | ||
31 | - <groupId>commons-io</groupId> | ||
32 | - <artifactId>commons-io</artifactId> | ||
33 | - </dependency> | ||
34 | - | ||
35 | - <!-- logging --> | ||
36 | - <dependency> | ||
37 | - <groupId>org.slf4j</groupId> | ||
38 | - <artifactId>slf4j-api</artifactId> | ||
39 | - </dependency> | ||
40 | - | ||
41 | - </dependencies> | ||
42 | - | ||
43 | -</project> | ||
44 | \ No newline at end of file | 0 | \ No newline at end of file |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | ||
2 | - | ||
3 | -import com.google.common.collect.Lists; | ||
4 | -import com.google.common.collect.Maps; | ||
5 | -import com.google.common.collect.Sets; | ||
6 | -import org.apache.commons.io.IOUtils; | ||
7 | -import org.slf4j.Logger; | ||
8 | -import org.slf4j.LoggerFactory; | ||
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
10 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
11 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
12 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | ||
13 | -import weka.classifiers.Classifier; | ||
14 | -import weka.core.Attribute; | ||
15 | -import weka.core.Instances; | ||
16 | - | ||
17 | -import java.io.*; | ||
18 | -import java.util.*; | ||
19 | -import java.util.function.Function; | ||
20 | -import java.util.stream.Collectors; | ||
21 | - | ||
22 | -public class Utils { | ||
23 | - | ||
24 | - private static final Logger LOG = LoggerFactory.getLogger(Utils.class); | ||
25 | - | ||
26 | - private static final String DATASET_NAME = "Dataset"; | ||
27 | - | ||
28 | - private Utils() { | ||
29 | - } | ||
30 | - | ||
31 | - public static void writeStringToFile(String string, File file) throws IOException { | ||
32 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { | ||
33 | - bw.append(string); | ||
34 | - } | ||
35 | - } | ||
36 | - | ||
37 | - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | ||
38 | - LOG.info("Loading classifier from path: {}...", modelResourcePath); | ||
39 | - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { | ||
40 | - if (stream == null) { | ||
41 | - throw new IOException("Model not found at: " + modelResourcePath); | ||
42 | - } | ||
43 | - try (ObjectInputStream ois = new ObjectInputStream(stream)) { | ||
44 | - Classifier classifier = (Classifier) ois.readObject(); | ||
45 | - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | ||
46 | - return classifier; | ||
47 | - } catch (ClassNotFoundException e) { | ||
48 | - LOG.error("Error loading serialized classifier, class not found.", e); | ||
49 | - throw new IOException(e); | ||
50 | - } | ||
51 | - } | ||
52 | - } | ||
53 | - | ||
54 | - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { | ||
55 | - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { | ||
56 | - return (TText) ois.readObject(); | ||
57 | - } catch (ClassNotFoundException e) { | ||
58 | - LOG.error("Error reading serialized thrift text file, class not found.", e); | ||
59 | - throw new IOException(e); | ||
60 | - } | ||
61 | - } | ||
62 | - | ||
63 | - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { | ||
64 | - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { | ||
65 | - if (stream == null) { | ||
66 | - throw new IOException("Resource not found at: " + textResourcePath); | ||
67 | - } | ||
68 | - return loadThriftTextFromStream(stream); | ||
69 | - } | ||
70 | - } | ||
71 | - | ||
72 | - public static List<String> loadLinesFromResource(String resourcePath) throws IOException { | ||
73 | - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { | ||
74 | - return IOUtils.readLines(stream, Constants.ENCODING); | ||
75 | - } | ||
76 | - } | ||
77 | - | ||
78 | - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | ||
79 | - public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | ||
80 | - Instances instances = new Instances(DATASET_NAME, attributesList, 0); | ||
81 | - instances.setClassIndex(0); | ||
82 | - return instances; | ||
83 | - } | ||
84 | - | ||
85 | - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { | ||
86 | - LOG.info("Loading classifier..."); | ||
87 | - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { | ||
88 | - Classifier classifier = (Classifier) ois.readObject(); | ||
89 | - LOG.info("Done. " + classifier.toString()); | ||
90 | - return classifier; | ||
91 | - } | ||
92 | - } | ||
93 | - | ||
94 | - public static List<String> tokenize(String text) { | ||
95 | - return Arrays.asList(text.split("[^\\p{L}0-9]+")); | ||
96 | - } | ||
97 | - | ||
98 | - public static List<String> tokenizeOnWhitespace(String text) { | ||
99 | - return Arrays.asList(text.split(" +")); | ||
100 | - } | ||
101 | - | ||
102 | - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { | ||
103 | - Map<TMention, String> mention2orth = Maps.newHashMap(); | ||
104 | - for (TSentence s : sents) { | ||
105 | - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); | ||
106 | - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); | ||
107 | - | ||
108 | - for (TMention m : s.getMentions()) { | ||
109 | - StringBuffer mentionOrth = new StringBuffer(); | ||
110 | - for (String tokId : m.getHeadIds()) { | ||
111 | - if (!tokId2nps.get(tokId)) | ||
112 | - mentionOrth.append(" "); | ||
113 | - mentionOrth.append(tokId2orth.get(tokId)); | ||
114 | - } | ||
115 | - mention2orth.put(m, mentionOrth.toString().trim()); | ||
116 | - } | ||
117 | - } | ||
118 | - return mention2orth; | ||
119 | - } | ||
120 | - | ||
121 | - private static final Collection<String> STOPWORDS = Sets.newHashSet(); | ||
122 | - | ||
123 | - static { | ||
124 | - STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); | ||
125 | - } | ||
126 | - | ||
127 | - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { | ||
128 | - Map<TMention, String> mention2orth = Maps.newHashMap(); | ||
129 | - for (TSentence s : sents) { | ||
130 | - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | ||
131 | - | ||
132 | - for (TMention m : s.getMentions()) { | ||
133 | - StringBuffer mentionOrth = new StringBuffer(); | ||
134 | - for (String tokId : m.getChildIds()) { | ||
135 | - TToken token = tokId2tok.get(tokId); | ||
136 | - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | ||
137 | - continue; | ||
138 | - } | ||
139 | - | ||
140 | - if (!token.isNoPrecedingSpace()) | ||
141 | - mentionOrth.append(" "); | ||
142 | - mentionOrth.append(token.getOrth()); | ||
143 | - } | ||
144 | - mention2orth.put(m, mentionOrth.toString().trim()); | ||
145 | - } | ||
146 | - } | ||
147 | - return mention2orth; | ||
148 | - } | ||
149 | - | ||
150 | - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | ||
151 | - Map<TMention, String> mention2base = Maps.newHashMap(); | ||
152 | - for (TSentence s : sents) { | ||
153 | - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); | ||
154 | - | ||
155 | - for (TMention m : s.getMentions()) { | ||
156 | - StringBuilder mentionBase = new StringBuilder(); | ||
157 | - for (String tokId : m.getChildIds()) { | ||
158 | - mentionBase.append(" "); | ||
159 | - mentionBase.append(tokId2base.get(tokId)); | ||
160 | - } | ||
161 | - mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | ||
162 | - } | ||
163 | - } | ||
164 | - return mention2base; | ||
165 | - } | ||
166 | - | ||
167 | - public static String loadSentence2Orth(TSentence sentence) { | ||
168 | - return loadSentence2Orth(sentence, Sets.newHashSet()); | ||
169 | - } | ||
170 | - | ||
171 | - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | ||
172 | - StringBuilder sb = new StringBuilder(); | ||
173 | - for (TToken token : sentence.getTokens()) { | ||
174 | - if (tokenIdsToSkip.contains(token.getId())) { | ||
175 | - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | ||
176 | - continue; | ||
177 | - } | ||
178 | - if (!token.isNoPrecedingSpace()) | ||
179 | - sb.append(" "); | ||
180 | - sb.append(token.getOrth()); | ||
181 | - } | ||
182 | - return sb.toString().trim(); | ||
183 | - } | ||
184 | - | ||
185 | -} | ||
186 | \ No newline at end of file | 0 | \ No newline at end of file |
nicolas-eval/pom.xml deleted
1 | -<?xml version="1.0" encoding="UTF-8"?> | ||
2 | -<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | - <parent> | ||
6 | - <artifactId>nicolas-container</artifactId> | ||
7 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
8 | - <version>1.0-SNAPSHOT</version> | ||
9 | - </parent> | ||
10 | - <modelVersion>4.0.0</modelVersion> | ||
11 | - | ||
12 | - <artifactId>nicolas-eval</artifactId> | ||
13 | - | ||
14 | - <dependencies> | ||
15 | - <!-- project --> | ||
16 | - <dependency> | ||
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | - <artifactId>nicolas-lib</artifactId> | ||
19 | - </dependency> | ||
20 | - <dependency> | ||
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | - <artifactId>nicolas-common</artifactId> | ||
23 | - </dependency> | ||
24 | - | ||
25 | - <!-- internal --> | ||
26 | - <dependency> | ||
27 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
28 | - <artifactId>eval</artifactId> | ||
29 | - </dependency> | ||
30 | - | ||
31 | - <!-- third party --> | ||
32 | - <dependency> | ||
33 | - <groupId>nz.ac.waikato.cms.weka</groupId> | ||
34 | - <artifactId>weka-stable</artifactId> | ||
35 | - </dependency> | ||
36 | - <dependency> | ||
37 | - <groupId>org.apache.commons</groupId> | ||
38 | - <artifactId>commons-lang3</artifactId> | ||
39 | - </dependency> | ||
40 | - <dependency> | ||
41 | - <groupId>com.google.guava</groupId> | ||
42 | - <artifactId>guava</artifactId> | ||
43 | - </dependency> | ||
44 | - | ||
45 | - <!-- logging --> | ||
46 | - <dependency> | ||
47 | - <groupId>org.slf4j</groupId> | ||
48 | - <artifactId>slf4j-api</artifactId> | ||
49 | - </dependency> | ||
50 | - <dependency> | ||
51 | - <groupId>org.slf4j</groupId> | ||
52 | - <artifactId>slf4j-simple</artifactId> | ||
53 | - </dependency> | ||
54 | - | ||
55 | - </dependencies> | ||
56 | -</project> | ||
57 | \ No newline at end of file | 0 | \ No newline at end of file |
nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted
1 | -199704210012 | ||
2 | -199704210042 | ||
3 | -199704220007 | ||
4 | -199704220018 | ||
5 | -199704220021 | ||
6 | -199704220044 | ||
7 | -199704230006 | ||
8 | -199704230014 | ||
9 | -199704230029 | ||
10 | -199704230043 | ||
11 | -199704240008 | ||
12 | -199704240019 | ||
13 | -199704240020 | ||
14 | -199704240021 | ||
15 | -199704250018 | ||
16 | -199704250022 | ||
17 | -199704260014 | ||
18 | -199704260015 | ||
19 | -199704260016 | ||
20 | -199704280023 | ||
21 | -199704280025 | ||
22 | -199704280027 | ||
23 | -199704280031 | ||
24 | -199704300031 | ||
25 | -199704300042 | ||
26 | -199704300046 | ||
27 | -199801020010 | ||
28 | -199801020031 | ||
29 | -199801020035 | ||
30 | -199801020070 | ||
31 | -199801020076 | ||
32 | -199801020079 | ||
33 | -199801030068 | ||
34 | -199801030090 | ||
35 | -199801030091 | ||
36 | -199801030129 | ||
37 | -199801030148 | ||
38 | -199801030158 | ||
39 | -199801050023 | ||
40 | -199801050059 | ||
41 | -199801130087 | ||
42 | -199801130129 | ||
43 | -199801140182 | ||
44 | -199801160119 | ||
45 | -199801200106 | ||
46 | -199801220140 | ||
47 | -199801240061 | ||
48 | -199801240096 | ||
49 | -199801260047 | ||
50 | -199801260070 | ||
51 | -199801270055 | ||
52 | -199801270110 | ||
53 | -199801280123 | ||
54 | -199801280158 | ||
55 | -199801280159 | ||
56 | -199801280241 | ||
57 | -199801290022 | ||
58 | -199801310003 | ||
59 | -199801310037 | ||
60 | -199802030127 | ||
61 | -199802040159 | ||
62 | -199802040182 | ||
63 | -199802040202 | ||
64 | -199805220133 | ||
65 | -199808280158 | ||
66 | -199901190073 | ||
67 | -199901190115 | ||
68 | -199901250112 | ||
69 | -199901250117 | ||
70 | -199901270103 | ||
71 | -199901270120 | ||
72 | -199901270122 | ||
73 | -199901290095 | ||
74 | -199901300101 | ||
75 | -199902240095 | ||
76 | -199906220029 | ||
77 | -199906230024 | ||
78 | -199906240084 | ||
79 | -199906260027 | ||
80 | -199907050045 | ||
81 | -199907050076 | ||
82 | -199907140166 | ||
83 | -199907200002 | ||
84 | -199907270004 | ||
85 | -199908260001 | ||
86 | -199909090036 | ||
87 | -199909250018 | ||
88 | -199909270029 | ||
89 | -199910020027 | ||
90 | -199910020029 | ||
91 | -199910270011 | ||
92 | -199911060044 | ||
93 | -199911100038 | ||
94 | -199911100064 | ||
95 | -199911200030 | ||
96 | -199911220063 | ||
97 | -199912020060 | ||
98 | -199912180026 | ||
99 | -199912180034 | ||
100 | -199912220030 | ||
101 | -199912280024 | ||
102 | -199912280046 | ||
103 | -199912300021 | ||
104 | -199912300029 | ||
105 | -200001030029 | ||
106 | -200001030053 | ||
107 | -200001060034 | ||
108 | -200001100035 | ||
109 | -200001100046 | ||
110 | -200001170029 | ||
111 | -200001170033 | ||
112 | -200001170060 | ||
113 | -200001290045 | ||
114 | -200002220027 | ||
115 | -200002240034 | ||
116 | -200002250031 | ||
117 | -200003060062 | ||
118 | -200003110050 | ||
119 | -200004280047 | ||
120 | -200004290022 | ||
121 | -200006050119 | ||
122 | -200006260079 | ||
123 | -200006290045 | ||
124 | -200007150033 | ||
125 | -200008040076 | ||
126 | -200008220042 | ||
127 | -200008220046 | ||
128 | -200010130049 | ||
129 | -200010160054 | ||
130 | -200012130034 | ||
131 | -200012140084 | ||
132 | -200012290046 | ||
133 | -200104040019 | ||
134 | -200106050035 | ||
135 | -200108180109 | ||
136 | -200108300032 | ||
137 | -200111120045 | ||
138 | -200111150042 | ||
139 | -200111150047 | ||
140 | -200111200036 | ||
141 | -200111270049 | ||
142 | -200112030055 | ||
143 | -200112280057 | ||
144 | -200201220038 | ||
145 | -200201220050 | ||
146 | -200202020036 | ||
147 | -200202200032 | ||
148 | -200202210054 | ||
149 | -200202270044 | ||
150 | -200203010070 | ||
151 | -200203190026 | ||
152 | -200203260050 | ||
153 | -200203280017 | ||
154 | -200203290078 |
nicolas-lib/pom.xml
@@ -12,15 +12,6 @@ | @@ -12,15 +12,6 @@ | ||
12 | <artifactId>nicolas-lib</artifactId> | 12 | <artifactId>nicolas-lib</artifactId> |
13 | 13 | ||
14 | <dependencies> | 14 | <dependencies> |
15 | - <!-- project --> | ||
16 | - <dependency> | ||
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | - <artifactId>nicolas-common</artifactId> | ||
19 | - </dependency> | ||
20 | - <dependency> | ||
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | - <artifactId>nicolas-model</artifactId> | ||
23 | - </dependency> | ||
24 | 15 | ||
25 | <!-- internal --> | 16 | <!-- internal --> |
26 | <dependency> | 17 | <dependency> |
@@ -61,5 +52,10 @@ | @@ -61,5 +52,10 @@ | ||
61 | <groupId>junit</groupId> | 52 | <groupId>junit</groupId> |
62 | <artifactId>junit</artifactId> | 53 | <artifactId>junit</artifactId> |
63 | </dependency> | 54 | </dependency> |
55 | + <dependency> | ||
56 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
57 | + <artifactId>nicolas-model</artifactId> | ||
58 | + <scope>test</scope> | ||
59 | + </dependency> | ||
64 | </dependencies> | 60 | </dependencies> |
65 | </project> | 61 | </project> |
66 | \ No newline at end of file | 62 | \ No newline at end of file |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | 1 | +package pl.waw.ipipan.zil.summ.nicolas; |
2 | 2 | ||
3 | -import com.google.common.base.Charsets; | ||
4 | import com.google.common.collect.ImmutableList; | 3 | import com.google.common.collect.ImmutableList; |
5 | 4 | ||
6 | import java.nio.charset.Charset; | 5 | import java.nio.charset.Charset; |
7 | - | 6 | +import java.nio.charset.StandardCharsets; |
8 | 7 | ||
9 | public class Constants { | 8 | public class Constants { |
10 | 9 | ||
11 | - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; | 10 | + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); |
11 | + public static final Charset ENCODING = StandardCharsets.UTF_8; | ||
12 | 12 | ||
13 | + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; | ||
13 | private static final String MODELS_PATH = ROOT_PATH + "models/"; | 14 | private static final String MODELS_PATH = ROOT_PATH + "models/"; |
15 | + | ||
14 | public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; | 16 | public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; |
15 | public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; | 17 | public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; |
16 | public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; | 18 | public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; |
17 | 19 | ||
18 | private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; | 20 | private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; |
19 | public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; | 21 | public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; |
20 | - | ||
21 | - public static final Charset ENCODING = Charsets.UTF_8; | ||
22 | - | ||
23 | - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); | 22 | + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt"; |
24 | 23 | ||
25 | private Constants() { | 24 | private Constants() { |
26 | } | 25 | } |
27 | - | ||
28 | } | 26 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -5,12 +5,12 @@ import com.google.common.collect.Sets; | @@ -5,12 +5,12 @@ import com.google.common.collect.Sets; | ||
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
10 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 8 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
11 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
12 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
13 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; | 11 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | ||
13 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | ||
14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
15 | import weka.classifiers.Classifier; | 15 | import weka.classifiers.Classifier; |
16 | 16 | ||
@@ -31,9 +31,9 @@ public class Nicolas { | @@ -31,9 +31,9 @@ public class Nicolas { | ||
31 | 31 | ||
32 | public Nicolas() throws NicolasException { | 32 | public Nicolas() throws NicolasException { |
33 | try { | 33 | try { |
34 | - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | ||
35 | - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
36 | - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | 34 | + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
35 | + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
36 | + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
37 | 37 | ||
38 | mentionFeatureExtractor = new MentionFeatureExtractor(); | 38 | mentionFeatureExtractor = new MentionFeatureExtractor(); |
39 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 39 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
@@ -57,7 +57,7 @@ public class Nicolas { | @@ -57,7 +57,7 @@ public class Nicolas { | ||
57 | 57 | ||
58 | StringBuilder sb = new StringBuilder(); | 58 | StringBuilder sb = new StringBuilder(); |
59 | for (TSentence sent : selectedSentences) { | 59 | for (TSentence sent : selectedSentences) { |
60 | - sb.append(" ").append(Utils.loadSentence2Orth(sent)); | 60 | + sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); |
61 | } | 61 | } |
62 | return sb.toString().trim(); | 62 | return sb.toString().trim(); |
63 | } | 63 | } |
@@ -74,7 +74,7 @@ public class Nicolas { | @@ -74,7 +74,7 @@ public class Nicolas { | ||
74 | Random r = new Random(1); | 74 | Random r = new Random(1); |
75 | Set<TSentence> summary = Sets.newHashSet(); | 75 | Set<TSentence> summary = Sets.newHashSet(); |
76 | for (TSentence sent : sortedSentences) { | 76 | for (TSentence sent : sortedSentences) { |
77 | - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | 77 | + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); |
78 | if (r.nextDouble() > 0.4 && size > targetSize) | 78 | if (r.nextDouble() > 0.4 && size > targetSize) |
79 | break; | 79 | break; |
80 | summary.add(sent); | 80 | summary.add(sent); |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.apply; | ||
2 | - | ||
3 | -import com.google.common.collect.Lists; | ||
4 | -import com.google.common.collect.Maps; | ||
5 | -import com.google.common.collect.Sets; | ||
6 | -import org.slf4j.Logger; | ||
7 | -import org.slf4j.LoggerFactory; | ||
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
10 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
11 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | ||
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
13 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
14 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
15 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
16 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | ||
17 | -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | ||
18 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; | ||
19 | -import weka.classifiers.Classifier; | ||
20 | -import weka.core.Instance; | ||
21 | -import weka.core.Instances; | ||
22 | - | ||
23 | -import java.io.BufferedWriter; | ||
24 | -import java.io.File; | ||
25 | -import java.io.FileWriter; | ||
26 | -import java.util.*; | ||
27 | - | ||
28 | -import static java.util.stream.Collectors.toList; | ||
29 | - | ||
30 | -public class ApplyModel { | ||
31 | - | ||
32 | - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); | ||
33 | - | ||
34 | - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; | ||
35 | - private static final String TARGET_DIR = "corpora/summaries"; | ||
36 | - | ||
37 | - public static void main(String[] args) throws Exception { | ||
38 | - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | ||
39 | - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | ||
40 | - | ||
41 | - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
42 | - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | ||
43 | - | ||
44 | - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | ||
45 | - | ||
46 | - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); | ||
47 | - int i = 1; | ||
48 | - double avgSize = 0; | ||
49 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
50 | - TText text = entry.getValue(); | ||
51 | - | ||
52 | - Set<TMention> goodMentions | ||
53 | - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | ||
54 | - | ||
55 | - int targetSize = calculateTargetSize(text); | ||
56 | - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); | ||
57 | - int size = Utils.tokenize(summary).size(); | ||
58 | - avgSize += size; | ||
59 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { | ||
60 | - bw.append(summary); | ||
61 | - } | ||
62 | - | ||
63 | - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); | ||
64 | - } | ||
65 | - | ||
66 | - LOG.info("Avg size:" + avgSize / id2preprocessedText.size()); | ||
67 | - } | ||
68 | - | ||
69 | - private static int calculateTargetSize(TText text) { | ||
70 | - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
71 | - StringBuffer body = new StringBuffer(); | ||
72 | - for (TSentence sent : sents) | ||
73 | - body.append(Utils.loadSentence2Orth(sent) + " "); | ||
74 | - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | ||
75 | - return (int) (0.2 * tokenCount); | ||
76 | - } | ||
77 | - | ||
78 | - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { | ||
79 | - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | ||
80 | - | ||
81 | - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); | ||
82 | - | ||
83 | - StringBuilder sb = new StringBuilder(); | ||
84 | - for (TSentence sent : selectedSentences) { | ||
85 | - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); | ||
86 | - } | ||
87 | - return sb.toString().trim(); | ||
88 | - } | ||
89 | - | ||
90 | - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | ||
91 | - | ||
92 | - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
93 | - | ||
94 | - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | ||
95 | - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | ||
96 | - | ||
97 | - Map<TSentence, Double> sentence2score = Maps.newHashMap(); | ||
98 | - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | ||
99 | - Instance instance = entry.getValue(); | ||
100 | - instance.setDataset(instances); | ||
101 | - double score = sentenceClassifier.classifyInstance(instance); | ||
102 | - sentence2score.put(entry.getKey(), score); | ||
103 | - } | ||
104 | - | ||
105 | - List<TSentence> sortedSents = Lists.newArrayList(sents); | ||
106 | - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | ||
107 | - | ||
108 | - int size = 0; | ||
109 | - Random r = new Random(1); | ||
110 | - Set<TSentence> summary = Sets.newHashSet(); | ||
111 | - for (TSentence sent : sortedSents) { | ||
112 | - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | ||
113 | - if (r.nextDouble() > 0.4 && size > targetSize) | ||
114 | - break; | ||
115 | - summary.add(sent); | ||
116 | - if (size > targetSize) | ||
117 | - break; | ||
118 | - } | ||
119 | - List<TSentence> selectedSentences = Lists.newArrayList(); | ||
120 | - for (TSentence sent : sents) { | ||
121 | - if (summary.contains(sent)) | ||
122 | - selectedSentences.add(sent); | ||
123 | - } | ||
124 | - return selectedSentences; | ||
125 | - } | ||
126 | - | ||
127 | -} |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
@@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; | @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import com.google.common.collect.Sets; | 4 | import com.google.common.collect.Sets; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
7 | 6 | ||
8 | import java.util.List; | 7 | import java.util.List; |
9 | import java.util.Map; | 8 | import java.util.Map; |
@@ -38,7 +37,6 @@ public class FeatureHelper { | @@ -38,7 +37,6 @@ public class FeatureHelper { | ||
38 | private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); | 37 | private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); |
39 | private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); | 38 | private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); |
40 | 39 | ||
41 | - | ||
42 | public FeatureHelper(TText preprocessedText) { | 40 | public FeatureHelper(TText preprocessedText) { |
43 | text = preprocessedText; | 41 | text = preprocessedText; |
44 | 42 | ||
@@ -60,9 +58,9 @@ public class FeatureHelper { | @@ -60,9 +58,9 @@ public class FeatureHelper { | ||
60 | int sentIdx = 0; | 58 | int sentIdx = 0; |
61 | int mentionIdx = 0; | 59 | int mentionIdx = 0; |
62 | for (TParagraph par : preprocessedText.getParagraphs()) { | 60 | for (TParagraph par : preprocessedText.getParagraphs()) { |
63 | - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); | 61 | + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); |
64 | mention2Orth.putAll(m2o); | 62 | mention2Orth.putAll(m2o); |
65 | - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); | 63 | + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); |
66 | mention2Base.putAll(m2b); | 64 | mention2Base.putAll(m2b); |
67 | 65 | ||
68 | int sentIdxInPar = 0; | 66 | int sentIdxInPar = 0; |
@@ -221,4 +219,40 @@ public class FeatureHelper { | @@ -221,4 +219,40 @@ public class FeatureHelper { | ||
221 | return null; | 219 | return null; |
222 | return mention2sent.get(mention).getTokens().get(idx - 1); | 220 | return mention2sent.get(mention).getTokens().get(idx - 1); |
223 | } | 221 | } |
222 | + | ||
223 | + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | ||
224 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | ||
225 | + for (TSentence s : sents) { | ||
226 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | ||
227 | + | ||
228 | + for (TMention m : s.getMentions()) { | ||
229 | + StringBuilder mentionOrth = new StringBuilder(); | ||
230 | + for (String tokId : m.getChildIds()) { | ||
231 | + TToken token = tokId2tok.get(tokId); | ||
232 | + if (!token.isNoPrecedingSpace()) | ||
233 | + mentionOrth.append(" "); | ||
234 | + mentionOrth.append(token.getOrth()); | ||
235 | + } | ||
236 | + mention2orth.put(m, mentionOrth.toString().trim()); | ||
237 | + } | ||
238 | + } | ||
239 | + return mention2orth; | ||
240 | + } | ||
241 | + | ||
242 | + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | ||
243 | + Map<TMention, String> mention2base = Maps.newHashMap(); | ||
244 | + for (TSentence s : sents) { | ||
245 | + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | ||
246 | + | ||
247 | + for (TMention m : s.getMentions()) { | ||
248 | + StringBuilder mentionBase = new StringBuilder(); | ||
249 | + for (String tokId : m.getChildIds()) { | ||
250 | + mentionBase.append(" "); | ||
251 | + mentionBase.append(tokId2base.get(tokId)); | ||
252 | + } | ||
253 | + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | ||
254 | + } | ||
255 | + } | ||
256 | + return mention2base; | ||
257 | + } | ||
224 | } | 258 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; | ||
3 | import com.google.common.collect.Lists; | 3 | import com.google.common.collect.Lists; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | 7 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | ||
11 | import weka.core.Attribute; | 11 | import weka.core.Attribute; |
12 | 12 | ||
13 | import java.io.IOException; | 13 | import java.io.IOException; |
@@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
21 | private final List<String> frequentBases; | 21 | private final List<String> frequentBases; |
22 | 22 | ||
23 | public MentionFeatureExtractor() throws IOException { | 23 | public MentionFeatureExtractor() throws IOException { |
24 | - frequentBases = loadFrequentBases(); | 24 | + frequentBases = ResourceUtils.loadFrequentBases(); |
25 | 25 | ||
26 | //coref | 26 | //coref |
27 | addNumericAttributeNormalized("chain_length"); | 27 | addNumericAttributeNormalized("chain_length"); |
@@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
80 | fillSortedAttributes("score"); | 80 | fillSortedAttributes("score"); |
81 | } | 81 | } |
82 | 82 | ||
83 | - private List<String> loadFrequentBases() throws IOException { | ||
84 | - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); | ||
85 | - } | ||
86 | - | ||
87 | private String encodeBase(String base) { | 83 | private String encodeBase(String base) { |
88 | return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); | 84 | return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); |
89 | } | 85 | } |
@@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { | ||
177 | Attribute att = getAttributeByName(attributeName); | 173 | Attribute att = getAttributeByName(attributeName); |
178 | int index = att.indexOfValue(value); | 174 | int index = att.indexOfValue(value); |
179 | if (index == -1) | 175 | if (index == -1) |
180 | - LOG.warn(value + " not found for attribute " + attributeName); | 176 | + LOG.warn("{} not found for attribute {}", value, attributeName); |
181 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); | 177 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
182 | } | 178 | } |
183 | 179 |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,8 +5,7 @@ import org.slf4j.Logger; | @@ -5,8 +5,7 @@ import org.slf4j.Logger; | ||
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | ||
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
10 | import weka.classifiers.Classifier; | 9 | import weka.classifiers.Classifier; |
11 | import weka.core.Instance; | 10 | import weka.core.Instance; |
12 | import weka.core.Instances; | 11 | import weka.core.Instances; |
@@ -24,7 +23,7 @@ public class MentionModel { | @@ -24,7 +23,7 @@ public class MentionModel { | ||
24 | public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { | 23 | public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { |
25 | Set<TMention> goodMentions = Sets.newHashSet(); | 24 | Set<TMention> goodMentions = Sets.newHashSet(); |
26 | 25 | ||
27 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 26 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
28 | Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); | 27 | Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); |
29 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | 28 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
30 | Instance instance = entry.getValue(); | 29 | Instance instance = entry.getValue(); |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; | @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | ||
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; |
11 | import weka.classifiers.Classifier; | 10 | import weka.classifiers.Classifier; |
12 | import weka.core.Instance; | 11 | import weka.core.Instance; |
13 | import weka.core.Instances; | 12 | import weka.core.Instances; |
@@ -23,7 +22,7 @@ public class SentenceModel { | @@ -23,7 +22,7 @@ public class SentenceModel { | ||
23 | } | 22 | } |
24 | 23 | ||
25 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | 24 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { |
26 | - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | 25 | + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
27 | Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | 26 | Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
28 | 27 | ||
29 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 28 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; |
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
@@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | @@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | ||
11 | import weka.core.Attribute; | 11 | import weka.core.Attribute; |
12 | import weka.core.DenseInstance; | 12 | import weka.core.DenseInstance; |
13 | import weka.core.Instance; | 13 | import weka.core.Instance; |
14 | +import weka.core.Instances; | ||
14 | 15 | ||
16 | +import java.util.ArrayList; | ||
15 | import java.util.List; | 17 | import java.util.List; |
16 | import java.util.Map; | 18 | import java.util.Map; |
17 | import java.util.Set; | 19 | import java.util.Set; |
@@ -22,6 +24,8 @@ public class InstanceUtils { | @@ -22,6 +24,8 @@ public class InstanceUtils { | ||
22 | 24 | ||
23 | private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); | 25 | private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); |
24 | 26 | ||
27 | + private static final String DATASET_NAME = "Dataset"; | ||
28 | + | ||
25 | private InstanceUtils() { | 29 | private InstanceUtils() { |
26 | } | 30 | } |
27 | 31 | ||
@@ -60,4 +64,11 @@ public class InstanceUtils { | @@ -60,4 +64,11 @@ public class InstanceUtils { | ||
60 | LOG.info("Extracted features of {} sentences.", sentence2instance.size()); | 64 | LOG.info("Extracted features of {} sentences.", sentence2instance.size()); |
61 | return sentence2instance; | 65 | return sentence2instance; |
62 | } | 66 | } |
67 | + | ||
68 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | ||
69 | + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | ||
70 | + Instances instances = new Instances(DATASET_NAME, attributesList, 0); | ||
71 | + instances.setClassIndex(0); | ||
72 | + return instances; | ||
73 | + } | ||
63 | } | 74 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | ||
7 | +import weka.classifiers.Classifier; | ||
8 | + | ||
9 | +import java.io.IOException; | ||
10 | +import java.io.InputStream; | ||
11 | +import java.io.ObjectInputStream; | ||
12 | +import java.util.List; | ||
13 | +import java.util.function.Predicate; | ||
14 | +import java.util.stream.Collectors; | ||
15 | + | ||
16 | +public class ResourceUtils { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class); | ||
19 | + | ||
20 | + private ResourceUtils() { | ||
21 | + } | ||
22 | + | ||
23 | + public static List<String> loadFrequentBases() throws IOException { | ||
24 | + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH); | ||
25 | + } | ||
26 | + | ||
27 | + public static List<String> loadStopwords() throws IOException { | ||
28 | + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH); | ||
29 | + } | ||
30 | + | ||
31 | + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | ||
32 | + LOG.info("Loading classifier from path: {}...", modelResourcePath); | ||
33 | + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) { | ||
34 | + if (stream == null) { | ||
35 | + throw new IOException("Model not found at: " + modelResourcePath); | ||
36 | + } | ||
37 | + try (ObjectInputStream ois = new ObjectInputStream(stream)) { | ||
38 | + Classifier classifier = (Classifier) ois.readObject(); | ||
39 | + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | ||
40 | + return classifier; | ||
41 | + } catch (ClassNotFoundException e) { | ||
42 | + LOG.error("Error loading serialized classifier, class not found.", e); | ||
43 | + throw new IOException(e); | ||
44 | + } | ||
45 | + } | ||
46 | + } | ||
47 | + | ||
48 | + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { | ||
49 | + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { | ||
50 | + return IOUtils.readLines(stream, Constants.ENCODING) | ||
51 | + .stream() | ||
52 | + .map(String::trim) | ||
53 | + .map(String::toLowerCase) | ||
54 | + .filter(((Predicate<String>) String::isEmpty).negate()) | ||
55 | + .sorted() | ||
56 | + .distinct() | ||
57 | + .collect(Collectors.toList()); | ||
58 | + } | ||
59 | + } | ||
60 | + | ||
61 | + | ||
62 | +} |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; | ||
2 | + | ||
3 | +import com.google.common.collect.Sets; | ||
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | ||
6 | + | ||
7 | +import java.util.Arrays; | ||
8 | +import java.util.List; | ||
9 | +import java.util.Set; | ||
10 | + | ||
11 | +public class TextUtils { | ||
12 | + | ||
13 | + private TextUtils() { | ||
14 | + } | ||
15 | + | ||
16 | + public static List<String> tokenize(String text) { | ||
17 | + return Arrays.asList(text.split("[^\\p{L}0-9]+")); | ||
18 | + } | ||
19 | + | ||
20 | + public static List<String> tokenizeOnWhitespace(String text) { | ||
21 | + return Arrays.asList(text.split(" +")); | ||
22 | + } | ||
23 | + | ||
24 | + public static String loadSentence2Orth(TSentence sentence) { | ||
25 | + return loadSentence2Orth(sentence, Sets.newHashSet()); | ||
26 | + } | ||
27 | + | ||
28 | + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | ||
29 | + StringBuilder sb = new StringBuilder(); | ||
30 | + for (TToken token : sentence.getTokens()) { | ||
31 | + if (tokenIdsToSkip.contains(token.getId())) { | ||
32 | + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | ||
33 | + continue; | ||
34 | + } | ||
35 | + if (!token.isNoPrecedingSpace()) | ||
36 | + sb.append(" "); | ||
37 | + sb.append(token.getOrth()); | ||
38 | + } | ||
39 | + return sb.toString().trim(); | ||
40 | + } | ||
41 | +} |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; |
2 | 2 | ||
3 | import com.google.common.base.Predicates; | 3 | import com.google.common.base.Predicates; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
@@ -58,4 +58,12 @@ public class ThriftUtils { | @@ -58,4 +58,12 @@ public class ThriftUtils { | ||
58 | } | 58 | } |
59 | } | 59 | } |
60 | 60 | ||
61 | + public static TText loadThriftTextFromResource(String resourcePath) { | ||
62 | + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) { | ||
63 | + return loadThriftTextFromStream(stream); | ||
64 | + } catch (IOException e) { | ||
65 | + LOG.error("Error reading serialized Thrift text from resource", e); | ||
66 | + return null; | ||
67 | + } | ||
68 | + } | ||
61 | } | 69 | } |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | 11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
13 | import weka.core.Attribute; | 13 | import weka.core.Attribute; |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
@@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; | ||
3 | import com.google.common.collect.Sets; | 3 | import com.google.common.collect.Sets; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | ||
8 | import weka.classifiers.Classifier; | 8 | import weka.classifiers.Classifier; |
9 | import weka.core.Instance; | 9 | import weka.core.Instance; |
10 | import weka.core.Instances; | 10 | import weka.core.Instances; |
@@ -24,7 +24,7 @@ public class ZeroSubjectInjector { | @@ -24,7 +24,7 @@ public class ZeroSubjectInjector { | ||
24 | public ZeroSubjectInjector() throws Exception { | 24 | public ZeroSubjectInjector() throws Exception { |
25 | classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); | 25 | classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); |
26 | featureExtractor = new ZeroFeatureExtractor(); | 26 | featureExtractor = new ZeroFeatureExtractor(); |
27 | - instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 27 | + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
28 | } | 28 | } |
29 | 29 | ||
30 | public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { | 30 | public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
@@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; | @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; | ||
3 | import org.junit.BeforeClass; | 3 | import org.junit.BeforeClass; |
4 | import org.junit.Test; | 4 | import org.junit.Test; |
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
7 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
7 | 8 | ||
8 | import static org.junit.Assert.assertTrue; | 9 | import static org.junit.Assert.assertTrue; |
9 | 10 | ||
@@ -20,9 +21,9 @@ public class NicolasTest { | @@ -20,9 +21,9 @@ public class NicolasTest { | ||
20 | 21 | ||
21 | @Test | 22 | @Test |
22 | public void shouldSummarizeThriftText() throws Exception { | 23 | public void shouldSummarizeThriftText() throws Exception { |
23 | - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | 24 | + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); |
24 | String summary = nicolas.summarizeThrift(thriftText, 5); | 25 | String summary = nicolas.summarizeThrift(thriftText, 5); |
25 | - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); | 26 | + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size(); |
26 | assertTrue(summaryTokensCount > 0); | 27 | assertTrue(summaryTokensCount > 0); |
27 | assertTrue(summaryTokensCount < 10); | 28 | assertTrue(summaryTokensCount < 10); |
28 | } | 29 | } |
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; |
2 | 2 | ||
3 | import org.junit.Test; | 3 | import org.junit.Test; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
@@ -7,13 +7,13 @@ import java.io.InputStream; | @@ -7,13 +7,13 @@ import java.io.InputStream; | ||
7 | 7 | ||
8 | import static org.junit.Assert.assertEquals; | 8 | import static org.junit.Assert.assertEquals; |
9 | 9 | ||
10 | -public class UtilsTest { | 10 | +public class ThriftUtilsTest { |
11 | 11 | ||
12 | private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; | 12 | private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; |
13 | 13 | ||
14 | @Test | 14 | @Test |
15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { | 15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { |
16 | - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | 16 | + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
17 | TText text = ThriftUtils.loadThriftTextFromStream(stream); | 17 | TText text = ThriftUtils.loadThriftTextFromStream(stream); |
18 | assertEquals(26, text.getParagraphs().size()); | 18 | assertEquals(26, text.getParagraphs().size()); |
19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); | 19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; | @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; | ||
5 | import org.junit.Test; | 5 | import org.junit.Test; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
10 | 10 | ||
11 | import java.io.IOException; | 11 | import java.io.IOException; |
12 | import java.io.InputStream; | 12 | import java.io.InputStream; |
nicolas-common/src/test/resources/199704210011.bin renamed to nicolas-lib/src/test/resources/199704210011.bin
No preview for this file type
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md
0 → 100644
nicolas-multiservice/pom.xml
@@ -30,8 +30,12 @@ | @@ -30,8 +30,12 @@ | ||
30 | 30 | ||
31 | <!-- test --> | 31 | <!-- test --> |
32 | <dependency> | 32 | <dependency> |
33 | + <groupId>junit</groupId> | ||
34 | + <artifactId>junit</artifactId> | ||
35 | + </dependency> | ||
36 | + <dependency> | ||
33 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 37 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
34 | - <artifactId>nicolas-common</artifactId> | 38 | + <artifactId>nicolas-lib</artifactId> |
35 | <scope>test</scope> | 39 | <scope>test</scope> |
36 | </dependency> | 40 | </dependency> |
37 | 41 |
nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java
@@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; | @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; | ||
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; |
11 | 11 | ||
12 | import java.io.File; | 12 | import java.io.File; |
13 | import java.io.FileInputStream; | 13 | import java.io.FileInputStream; |
@@ -67,7 +67,7 @@ public class PreprocessorIT { | @@ -67,7 +67,7 @@ public class PreprocessorIT { | ||
67 | preprocessor.preprocessToFile(text, targetFile); | 67 | preprocessor.preprocessToFile(text, targetFile); |
68 | 68 | ||
69 | try (FileInputStream inputStream = new FileInputStream(targetFile)) { | 69 | try (FileInputStream inputStream = new FileInputStream(targetFile)) { |
70 | - TText processed = Utils.loadThriftTextFromStream(inputStream); | 70 | + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream); |
71 | assertSampleProcessedText(processed); | 71 | assertSampleProcessedText(processed); |
72 | } | 72 | } |
73 | } | 73 | } |
nicolas-train/pom.xml
@@ -15,10 +15,6 @@ | @@ -15,10 +15,6 @@ | ||
15 | <!-- project --> | 15 | <!-- project --> |
16 | <dependency> | 16 | <dependency> |
17 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 17 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
18 | - <artifactId>nicolas-common</artifactId> | ||
19 | - </dependency> | ||
20 | - <dependency> | ||
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | <artifactId>nicolas-lib</artifactId> | 18 | <artifactId>nicolas-lib</artifactId> |
23 | </dependency> | 19 | </dependency> |
24 | <dependency> | 20 | <dependency> |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train; | 1 | +package pl.waw.ipipan.zil.summ.nicolas; |
2 | 2 | ||
3 | import net.lingala.zip4j.core.ZipFile; | 3 | import net.lingala.zip4j.core.ZipFile; |
4 | import net.lingala.zip4j.exception.ZipException; | 4 | import net.lingala.zip4j.exception.ZipException; |
@@ -34,7 +34,7 @@ public class PathConstants { | @@ -34,7 +34,7 @@ public class PathConstants { | ||
34 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); | 34 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); |
35 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); | 35 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); |
36 | 36 | ||
37 | - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); | 37 | + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); |
38 | public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); | 38 | public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); |
39 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); | 39 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); |
40 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); | 40 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
@@ -17,7 +17,7 @@ public class Constants { | @@ -17,7 +17,7 @@ public class Constants { | ||
17 | 17 | ||
18 | public static Set<String> loadTestTextIds() throws IOException { | 18 | public static Set<String> loadTestTextIds() throws IOException { |
19 | try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { | 19 | try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { |
20 | - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); | 20 | + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); |
21 | return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | 21 | return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); |
22 | } | 22 | } |
23 | } | 23 | } |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
@@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | @@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | 8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | 9 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
12 | 12 | ||
13 | import java.io.File; | 13 | import java.io.File; |
14 | +import java.io.FileOutputStream; | ||
14 | import java.io.IOException; | 15 | import java.io.IOException; |
16 | +import java.io.OutputStreamWriter; | ||
15 | import java.util.List; | 17 | import java.util.List; |
16 | import java.util.Map; | 18 | import java.util.Map; |
17 | import java.util.Set; | 19 | import java.util.Set; |
@@ -23,7 +25,6 @@ public class SummarizeTestCorpus { | @@ -23,7 +25,6 @@ public class SummarizeTestCorpus { | ||
23 | 25 | ||
24 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); | 26 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); |
25 | 27 | ||
26 | - | ||
27 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; | 28 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; |
28 | private static final double SUMMARY_RATIO = 0.2; | 29 | private static final double SUMMARY_RATIO = 0.2; |
29 | 30 | ||
@@ -31,8 +32,8 @@ public class SummarizeTestCorpus { | @@ -31,8 +32,8 @@ public class SummarizeTestCorpus { | ||
31 | } | 32 | } |
32 | 33 | ||
33 | public static void main(String[] args) throws IOException, NicolasException { | 34 | public static void main(String[] args) throws IOException, NicolasException { |
34 | - File thriftedCorpusDir = new File("data/preprocessed"); | ||
35 | - File targetDir = new File("data/summaries"); | 35 | + File thriftedCorpusDir = new File("data/all-preprocessed"); |
36 | + File targetDir = new File("data/test-system"); | ||
36 | targetDir.mkdir(); | 37 | targetDir.mkdir(); |
37 | 38 | ||
38 | Set<String> testTextIds = loadTestTextIds(); | 39 | Set<String> testTextIds = loadTestTextIds(); |
@@ -62,9 +63,9 @@ public class SummarizeTestCorpus { | @@ -62,9 +63,9 @@ public class SummarizeTestCorpus { | ||
62 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 63 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
63 | StringBuilder body = new StringBuilder(); | 64 | StringBuilder body = new StringBuilder(); |
64 | for (TSentence sentence : sentences) | 65 | for (TSentence sentence : sentences) |
65 | - body.append(Utils.loadSentence2Orth(sentence)).append(" "); | 66 | + body.append(TextUtils.loadSentence2Orth(sentence)).append(" "); |
66 | 67 | ||
67 | - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | 68 | + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size(); |
68 | return (int) (SUMMARY_RATIO * tokenCount); | 69 | return (int) (SUMMARY_RATIO * tokenCount); |
69 | } | 70 | } |
70 | 71 | ||
@@ -73,7 +74,9 @@ public class SummarizeTestCorpus { | @@ -73,7 +74,9 @@ public class SummarizeTestCorpus { | ||
73 | String textId = entry.getKey(); | 74 | String textId = entry.getKey(); |
74 | String summary = entry.getValue(); | 75 | String summary = entry.getValue(); |
75 | String targetFileName = textId + SUMMARY_FILE_SUFFIX; | 76 | String targetFileName = textId + SUMMARY_FILE_SUFFIX; |
76 | - Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); | 77 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { |
78 | + writer.write(summary); | ||
79 | + } | ||
77 | } | 80 | } |
78 | } | 81 | } |
79 | 82 |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
@@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; | @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | ||
10 | 12 | ||
13 | +import java.io.IOException; | ||
11 | import java.util.List; | 14 | import java.util.List; |
12 | import java.util.Map; | 15 | import java.util.Map; |
16 | +import java.util.Set; | ||
17 | +import java.util.function.Function; | ||
13 | import java.util.stream.Collectors; | 18 | import java.util.stream.Collectors; |
14 | 19 | ||
15 | public class MentionScorer { | 20 | public class MentionScorer { |
16 | 21 | ||
22 | + private final Set<String> STOPWORDS; | ||
23 | + | ||
24 | + public MentionScorer() throws IOException { | ||
25 | + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet()); | ||
26 | + } | ||
27 | + | ||
17 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { | 28 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { |
18 | - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | 29 | + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); |
19 | 30 | ||
20 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); | 31 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
21 | - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); | 32 | + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); |
22 | 33 | ||
23 | return booleanTokenIntersection(mention2Orth, tokenCounts); | 34 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
24 | } | 35 | } |
25 | 36 | ||
37 | + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) { | ||
38 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | ||
39 | + for (TSentence s : sents) { | ||
40 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | ||
41 | + | ||
42 | + for (TMention m : s.getMentions()) { | ||
43 | + StringBuilder mentionOrth = new StringBuilder(); | ||
44 | + for (String tokId : m.getChildIds()) { | ||
45 | + TToken token = tokId2tok.get(tokId); | ||
46 | + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | ||
47 | + continue; | ||
48 | + } | ||
49 | + | ||
50 | + if (!token.isNoPrecedingSpace()) | ||
51 | + mentionOrth.append(" "); | ||
52 | + mentionOrth.append(token.getOrth()); | ||
53 | + } | ||
54 | + mention2orth.put(m, mentionOrth.toString().trim()); | ||
55 | + } | ||
56 | + } | ||
57 | + return mention2orth; | ||
58 | + } | ||
59 | + | ||
26 | private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { | 60 | private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { |
27 | Map<TMention, Double> mention2score = Maps.newHashMap(); | 61 | Map<TMention, Double> mention2score = Maps.newHashMap(); |
28 | for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { | 62 | for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { |
29 | TMention mention = entry.getKey(); | 63 | TMention mention = entry.getKey(); |
30 | String mentionOrth = mention2Orth.get(mention); | 64 | String mentionOrth = mention2Orth.get(mention); |
31 | - for (String token : Utils.tokenize(mentionOrth)) { | 65 | + for (String token : TextUtils.tokenize(mentionOrth)) { |
32 | if (tokenCounts.contains(token.toLowerCase())) { | 66 | if (tokenCounts.contains(token.toLowerCase())) { |
33 | mention2score.put(mention, 1.0); | 67 | mention2score.put(mention, 1.0); |
34 | break; | 68 | break; |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java
@@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; | @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; |
10 | 10 | ||
11 | import java.util.List; | 11 | import java.util.List; |
12 | import java.util.Map; | 12 | import java.util.Map; |
13 | 13 | ||
14 | public class SentenceScorer { | 14 | public class SentenceScorer { |
15 | + | ||
15 | public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { | 16 | public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { |
16 | - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | 17 | + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); |
17 | 18 | ||
18 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 19 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
19 | for (TParagraph paragraph : preprocessedText.getParagraphs()) | 20 | for (TParagraph paragraph : preprocessedText.getParagraphs()) |
20 | for (TSentence sentence : paragraph.getSentences()) { | 21 | for (TSentence sentence : paragraph.getSentences()) { |
21 | double score = 0.0; | 22 | double score = 0.0; |
22 | 23 | ||
23 | - String orth = Utils.loadSentence2Orth(sentence); | ||
24 | - List<String> tokens = Utils.tokenize(orth); | 24 | + String orth = TextUtils.loadSentence2Orth(sentence); |
25 | + List<String> tokens = TextUtils.tokenize(orth); | ||
25 | for (String token : tokens) { | 26 | for (String token : tokens) { |
26 | score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; | 27 | score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; |
27 | } | 28 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
@@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; | @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; | ||
5 | import org.apache.commons.csv.CSVParser; | 5 | import org.apache.commons.csv.CSVParser; |
6 | import org.apache.commons.csv.CSVRecord; | 6 | import org.apache.commons.csv.CSVRecord; |
7 | import org.apache.commons.csv.QuoteMode; | 7 | import org.apache.commons.csv.QuoteMode; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | 10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; |
11 | 11 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java
@@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; | @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; | ||
8 | import org.apache.commons.io.FileUtils; | 8 | import org.apache.commons.io.FileUtils; |
9 | import pl.waw.ipipan.zil.summ.eval.Main; | 9 | import pl.waw.ipipan.zil.summ.eval.Main; |
10 | import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; | 10 | import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
12 | 12 | ||
13 | import java.io.File; | 13 | import java.io.File; |
14 | import java.io.IOException; | 14 | import java.io.IOException; |
15 | import java.util.*; | 15 | import java.util.*; |
16 | import java.util.stream.Collectors; | 16 | import java.util.stream.Collectors; |
17 | 17 | ||
18 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 18 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
19 | 19 | ||
20 | public class CreateOptimalSummaries { | 20 | public class CreateOptimalSummaries { |
21 | 21 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 3 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
4 | 4 | ||
5 | public class DownloadCorpus { | 5 | public class DownloadCorpus { |
6 | 6 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 3 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
4 | 4 | ||
5 | public class DownloadTrainingResources { | 5 | public class DownloadTrainingResources { |
6 | 6 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 3 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
4 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | 4 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
5 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | 5 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; |
6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | 6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
7 | 7 | ||
8 | import javax.xml.bind.JAXBException; | 8 | import javax.xml.bind.JAXBException; |
9 | import java.io.File; | 9 | import java.io.File; |
10 | +import java.io.FileOutputStream; | ||
10 | import java.io.IOException; | 11 | import java.io.IOException; |
12 | +import java.io.OutputStreamWriter; | ||
11 | import java.util.List; | 13 | import java.util.List; |
12 | import java.util.function.Predicate; | 14 | import java.util.function.Predicate; |
13 | import java.util.stream.Collectors; | 15 | import java.util.stream.Collectors; |
14 | import java.util.stream.Stream; | 16 | import java.util.stream.Stream; |
15 | 17 | ||
16 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 18 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
17 | 19 | ||
18 | public class ExtractGoldSummaries { | 20 | public class ExtractGoldSummaries { |
19 | 21 | ||
@@ -22,7 +24,6 @@ public class ExtractGoldSummaries { | @@ -22,7 +24,6 @@ public class ExtractGoldSummaries { | ||
22 | 24 | ||
23 | private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); | 25 | private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); |
24 | 26 | ||
25 | - | ||
26 | private ExtractGoldSummaries() { | 27 | private ExtractGoldSummaries() { |
27 | } | 28 | } |
28 | 29 | ||
@@ -47,7 +48,10 @@ public class ExtractGoldSummaries { | @@ -47,7 +48,10 @@ public class ExtractGoldSummaries { | ||
47 | for (Summary summary : goldSummaries) { | 48 | for (Summary summary : goldSummaries) { |
48 | File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; | 49 | File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; |
49 | File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | 50 | File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); |
50 | - Utils.writeStringToFile(summary.getBody(), targetFile); | 51 | + |
52 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | ||
53 | + writer.append(summary.getBody()); | ||
54 | + } | ||
51 | } | 55 | } |
52 | } | 56 | } |
53 | } | 57 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
@@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; | @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; | ||
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | ||
14 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
15 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
16 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | 13 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
17 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
18 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 15 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
19 | import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; | 16 | import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; |
20 | import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; | 17 | import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; |
21 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; | 18 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; |
19 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | ||
20 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | ||
22 | import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | 21 | import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; |
23 | import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | 22 | import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; |
24 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | 23 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
@@ -37,7 +36,7 @@ import java.util.Set; | @@ -37,7 +36,7 @@ import java.util.Set; | ||
37 | import java.util.function.Predicate; | 36 | import java.util.function.Predicate; |
38 | import java.util.stream.Collectors; | 37 | import java.util.stream.Collectors; |
39 | 38 | ||
40 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 39 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
41 | 40 | ||
42 | public class PrepareTrainingData { | 41 | public class PrepareTrainingData { |
43 | 42 | ||
@@ -61,7 +60,7 @@ public class PrepareTrainingData { | @@ -61,7 +60,7 @@ public class PrepareTrainingData { | ||
61 | MentionScorer mentionScorer = new MentionScorer(); | 60 | MentionScorer mentionScorer = new MentionScorer(); |
62 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | 61 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
63 | 62 | ||
64 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 63 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
65 | 64 | ||
66 | int i = 1; | 65 | int i = 1; |
67 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 66 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
@@ -105,7 +104,7 @@ public class PrepareTrainingData { | @@ -105,7 +104,7 @@ public class PrepareTrainingData { | ||
105 | SentenceScorer sentenceScorer = new SentenceScorer(); | 104 | SentenceScorer sentenceScorer = new SentenceScorer(); |
106 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | 105 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
107 | 106 | ||
108 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 107 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
109 | 108 | ||
110 | int i = 1; | 109 | int i = 1; |
111 | for (String textId : id2preprocessedText.keySet()) { | 110 | for (String textId : id2preprocessedText.keySet()) { |
@@ -149,7 +148,7 @@ public class PrepareTrainingData { | @@ -149,7 +148,7 @@ public class PrepareTrainingData { | ||
149 | ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); | 148 | ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); |
150 | ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | 149 | ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); |
151 | 150 | ||
152 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 151 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); |
153 | 152 | ||
154 | int i = 1; | 153 | int i = 1; |
155 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 154 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java
@@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
9 | import java.io.File; | 9 | import java.io.File; |
10 | import java.util.Arrays; | 10 | import java.util.Arrays; |
11 | 11 | ||
12 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 12 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
13 | 13 | ||
14 | public class PreprocessCorpus { | 14 | public class PreprocessCorpus { |
15 | 15 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
@@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; |
7 | import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; | 7 | import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; |
8 | import weka.classifiers.Classifier; | 8 | import weka.classifiers.Classifier; |
9 | import weka.core.Instances; | 9 | import weka.core.Instances; |
@@ -14,7 +14,7 @@ import java.io.FileOutputStream; | @@ -14,7 +14,7 @@ import java.io.FileOutputStream; | ||
14 | import java.io.ObjectOutputStream; | 14 | import java.io.ObjectOutputStream; |
15 | import java.util.logging.LogManager; | 15 | import java.util.logging.LogManager; |
16 | 16 | ||
17 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | 17 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; |
18 | 18 | ||
19 | public class TrainAllModels { | 19 | public class TrainAllModels { |
20 | 20 |
pom.xml
@@ -10,15 +10,12 @@ | @@ -10,15 +10,12 @@ | ||
10 | 10 | ||
11 | <packaging>pom</packaging> | 11 | <packaging>pom</packaging> |
12 | 12 | ||
13 | - | ||
14 | <modules> | 13 | <modules> |
15 | <module>nicolas-lib</module> | 14 | <module>nicolas-lib</module> |
16 | <module>nicolas-cli</module> | 15 | <module>nicolas-cli</module> |
17 | <module>nicolas-model</module> | 16 | <module>nicolas-model</module> |
18 | <module>nicolas-train</module> | 17 | <module>nicolas-train</module> |
19 | - <module>nicolas-common</module> | ||
20 | <module>nicolas-multiservice</module> | 18 | <module>nicolas-multiservice</module> |
21 | - <module>nicolas-eval</module> | ||
22 | </modules> | 19 | </modules> |
23 | 20 | ||
24 | <properties> | 21 | <properties> |
@@ -59,23 +56,23 @@ | @@ -59,23 +56,23 @@ | ||
59 | <!-- project --> | 56 | <!-- project --> |
60 | <dependency> | 57 | <dependency> |
61 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 58 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
62 | - <artifactId>nicolas-model</artifactId> | 59 | + <artifactId>nicolas-cli</artifactId> |
63 | <version>${project.version}</version> | 60 | <version>${project.version}</version> |
64 | - <scope>runtime</scope> | ||
65 | </dependency> | 61 | </dependency> |
66 | <dependency> | 62 | <dependency> |
67 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 63 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
68 | - <artifactId>nicolas-common</artifactId> | 64 | + <artifactId>nicolas-lib</artifactId> |
69 | <version>${project.version}</version> | 65 | <version>${project.version}</version> |
70 | </dependency> | 66 | </dependency> |
71 | <dependency> | 67 | <dependency> |
72 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 68 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
73 | - <artifactId>nicolas-zero</artifactId> | 69 | + <artifactId>nicolas-model</artifactId> |
74 | <version>${project.version}</version> | 70 | <version>${project.version}</version> |
71 | + <scope>runtime</scope> | ||
75 | </dependency> | 72 | </dependency> |
76 | <dependency> | 73 | <dependency> |
77 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 74 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
78 | - <artifactId>nicolas-lib</artifactId> | 75 | + <artifactId>nicolas-multiservice</artifactId> |
79 | <version>${project.version}</version> | 76 | <version>${project.version}</version> |
80 | </dependency> | 77 | </dependency> |
81 | <dependency> | 78 | <dependency> |
@@ -83,11 +80,6 @@ | @@ -83,11 +80,6 @@ | ||
83 | <artifactId>nicolas-train</artifactId> | 80 | <artifactId>nicolas-train</artifactId> |
84 | <version>${project.version}</version> | 81 | <version>${project.version}</version> |
85 | </dependency> | 82 | </dependency> |
86 | - <dependency> | ||
87 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
88 | - <artifactId>nicolas-multiservice</artifactId> | ||
89 | - <version>${project.version}</version> | ||
90 | - </dependency> | ||
91 | 83 | ||
92 | <!-- internal --> | 84 | <!-- internal --> |
93 | <dependency> | 85 | <dependency> |