Commit 1a009dd0c4f78b9367ce117f0edd6e982cb4ebdf
1 parent
7e387f1c
clean up modules
Showing
50 changed files
with
311 additions
and
685 deletions
README.md
0 → 100644
eval.sh
0 → 100755
nicolas-cli/pom.xml
... | ... | @@ -22,6 +22,11 @@ |
22 | 22 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
23 | 23 | <artifactId>nicolas-lib</artifactId> |
24 | 24 | </dependency> |
25 | + <dependency> | |
26 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
27 | + <artifactId>nicolas-model</artifactId> | |
28 | + <scope>runtime</scope> | |
29 | + </dependency> | |
25 | 30 | |
26 | 31 | <!-- third party --> |
27 | 32 | <dependency> |
... | ... |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java
... | ... | @@ -5,9 +5,9 @@ import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
8 | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | 10 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
12 | 12 | |
13 | 13 | import java.io.*; |
... | ... |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java
... | ... | @@ -5,10 +5,10 @@ import org.junit.ClassRule; |
5 | 5 | import org.junit.Test; |
6 | 6 | import org.junit.rules.TemporaryFolder; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
8 | 9 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
11 | 10 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
12 | 12 | |
13 | 13 | import java.io.File; |
14 | 14 | import java.io.FileInputStream; |
... | ... | @@ -29,7 +29,7 @@ public class ClientTest { |
29 | 29 | @Test |
30 | 30 | public void processSampleText() throws Exception { |
31 | 31 | Preprocessor preprocessor = mock(Preprocessor.class); |
32 | - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | |
32 | + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | |
33 | 33 | when(preprocessor.preprocess(any())).thenReturn(ttext); |
34 | 34 | |
35 | 35 | Nicolas nicolas = mock(Nicolas.class); |
... | ... |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java
... | ... | @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; |
4 | 4 | import org.junit.ClassRule; |
5 | 5 | import org.junit.Test; |
6 | 6 | import org.junit.rules.TemporaryFolder; |
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
8 | 8 | |
9 | 9 | import java.io.File; |
10 | 10 | import java.io.FileInputStream; |
... | ... |
nicolas-common/pom.xml deleted
1 | -<?xml version="1.0" encoding="UTF-8"?> | |
2 | -<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | - <modelVersion>4.0.0</modelVersion> | |
6 | - <parent> | |
7 | - <artifactId>nicolas-container</artifactId> | |
8 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
9 | - <version>1.0-SNAPSHOT</version> | |
10 | - </parent> | |
11 | - | |
12 | - <artifactId>nicolas-common</artifactId> | |
13 | - | |
14 | - <dependencies> | |
15 | - <!-- internal --> | |
16 | - <dependency> | |
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | - <artifactId>pscapi</artifactId> | |
19 | - </dependency> | |
20 | - <dependency> | |
21 | - <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
22 | - <artifactId>utils</artifactId> | |
23 | - </dependency> | |
24 | - | |
25 | - <!-- third party --> | |
26 | - <dependency> | |
27 | - <groupId>nz.ac.waikato.cms.weka</groupId> | |
28 | - <artifactId>weka-stable</artifactId> | |
29 | - </dependency> | |
30 | - <dependency> | |
31 | - <groupId>commons-io</groupId> | |
32 | - <artifactId>commons-io</artifactId> | |
33 | - </dependency> | |
34 | - | |
35 | - <!-- logging --> | |
36 | - <dependency> | |
37 | - <groupId>org.slf4j</groupId> | |
38 | - <artifactId>slf4j-api</artifactId> | |
39 | - </dependency> | |
40 | - | |
41 | - </dependencies> | |
42 | - | |
43 | -</project> | |
44 | 0 | \ No newline at end of file |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | |
2 | - | |
3 | -import com.google.common.collect.Lists; | |
4 | -import com.google.common.collect.Maps; | |
5 | -import com.google.common.collect.Sets; | |
6 | -import org.apache.commons.io.IOUtils; | |
7 | -import org.slf4j.Logger; | |
8 | -import org.slf4j.LoggerFactory; | |
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
10 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
11 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
12 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
13 | -import weka.classifiers.Classifier; | |
14 | -import weka.core.Attribute; | |
15 | -import weka.core.Instances; | |
16 | - | |
17 | -import java.io.*; | |
18 | -import java.util.*; | |
19 | -import java.util.function.Function; | |
20 | -import java.util.stream.Collectors; | |
21 | - | |
22 | -public class Utils { | |
23 | - | |
24 | - private static final Logger LOG = LoggerFactory.getLogger(Utils.class); | |
25 | - | |
26 | - private static final String DATASET_NAME = "Dataset"; | |
27 | - | |
28 | - private Utils() { | |
29 | - } | |
30 | - | |
31 | - public static void writeStringToFile(String string, File file) throws IOException { | |
32 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { | |
33 | - bw.append(string); | |
34 | - } | |
35 | - } | |
36 | - | |
37 | - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | |
38 | - LOG.info("Loading classifier from path: {}...", modelResourcePath); | |
39 | - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { | |
40 | - if (stream == null) { | |
41 | - throw new IOException("Model not found at: " + modelResourcePath); | |
42 | - } | |
43 | - try (ObjectInputStream ois = new ObjectInputStream(stream)) { | |
44 | - Classifier classifier = (Classifier) ois.readObject(); | |
45 | - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | |
46 | - return classifier; | |
47 | - } catch (ClassNotFoundException e) { | |
48 | - LOG.error("Error loading serialized classifier, class not found.", e); | |
49 | - throw new IOException(e); | |
50 | - } | |
51 | - } | |
52 | - } | |
53 | - | |
54 | - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { | |
55 | - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { | |
56 | - return (TText) ois.readObject(); | |
57 | - } catch (ClassNotFoundException e) { | |
58 | - LOG.error("Error reading serialized thrift text file, class not found.", e); | |
59 | - throw new IOException(e); | |
60 | - } | |
61 | - } | |
62 | - | |
63 | - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { | |
64 | - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { | |
65 | - if (stream == null) { | |
66 | - throw new IOException("Resource not found at: " + textResourcePath); | |
67 | - } | |
68 | - return loadThriftTextFromStream(stream); | |
69 | - } | |
70 | - } | |
71 | - | |
72 | - public static List<String> loadLinesFromResource(String resourcePath) throws IOException { | |
73 | - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { | |
74 | - return IOUtils.readLines(stream, Constants.ENCODING); | |
75 | - } | |
76 | - } | |
77 | - | |
78 | - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | |
79 | - public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | |
80 | - Instances instances = new Instances(DATASET_NAME, attributesList, 0); | |
81 | - instances.setClassIndex(0); | |
82 | - return instances; | |
83 | - } | |
84 | - | |
85 | - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { | |
86 | - LOG.info("Loading classifier..."); | |
87 | - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { | |
88 | - Classifier classifier = (Classifier) ois.readObject(); | |
89 | - LOG.info("Done. " + classifier.toString()); | |
90 | - return classifier; | |
91 | - } | |
92 | - } | |
93 | - | |
94 | - public static List<String> tokenize(String text) { | |
95 | - return Arrays.asList(text.split("[^\\p{L}0-9]+")); | |
96 | - } | |
97 | - | |
98 | - public static List<String> tokenizeOnWhitespace(String text) { | |
99 | - return Arrays.asList(text.split(" +")); | |
100 | - } | |
101 | - | |
102 | - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { | |
103 | - Map<TMention, String> mention2orth = Maps.newHashMap(); | |
104 | - for (TSentence s : sents) { | |
105 | - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); | |
106 | - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); | |
107 | - | |
108 | - for (TMention m : s.getMentions()) { | |
109 | - StringBuffer mentionOrth = new StringBuffer(); | |
110 | - for (String tokId : m.getHeadIds()) { | |
111 | - if (!tokId2nps.get(tokId)) | |
112 | - mentionOrth.append(" "); | |
113 | - mentionOrth.append(tokId2orth.get(tokId)); | |
114 | - } | |
115 | - mention2orth.put(m, mentionOrth.toString().trim()); | |
116 | - } | |
117 | - } | |
118 | - return mention2orth; | |
119 | - } | |
120 | - | |
121 | - private static final Collection<String> STOPWORDS = Sets.newHashSet(); | |
122 | - | |
123 | - static { | |
124 | - STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); | |
125 | - } | |
126 | - | |
127 | - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { | |
128 | - Map<TMention, String> mention2orth = Maps.newHashMap(); | |
129 | - for (TSentence s : sents) { | |
130 | - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
131 | - | |
132 | - for (TMention m : s.getMentions()) { | |
133 | - StringBuffer mentionOrth = new StringBuffer(); | |
134 | - for (String tokId : m.getChildIds()) { | |
135 | - TToken token = tokId2tok.get(tokId); | |
136 | - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
137 | - continue; | |
138 | - } | |
139 | - | |
140 | - if (!token.isNoPrecedingSpace()) | |
141 | - mentionOrth.append(" "); | |
142 | - mentionOrth.append(token.getOrth()); | |
143 | - } | |
144 | - mention2orth.put(m, mentionOrth.toString().trim()); | |
145 | - } | |
146 | - } | |
147 | - return mention2orth; | |
148 | - } | |
149 | - | |
150 | - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
151 | - Map<TMention, String> mention2base = Maps.newHashMap(); | |
152 | - for (TSentence s : sents) { | |
153 | - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); | |
154 | - | |
155 | - for (TMention m : s.getMentions()) { | |
156 | - StringBuilder mentionBase = new StringBuilder(); | |
157 | - for (String tokId : m.getChildIds()) { | |
158 | - mentionBase.append(" "); | |
159 | - mentionBase.append(tokId2base.get(tokId)); | |
160 | - } | |
161 | - mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | |
162 | - } | |
163 | - } | |
164 | - return mention2base; | |
165 | - } | |
166 | - | |
167 | - public static String loadSentence2Orth(TSentence sentence) { | |
168 | - return loadSentence2Orth(sentence, Sets.newHashSet()); | |
169 | - } | |
170 | - | |
171 | - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | |
172 | - StringBuilder sb = new StringBuilder(); | |
173 | - for (TToken token : sentence.getTokens()) { | |
174 | - if (tokenIdsToSkip.contains(token.getId())) { | |
175 | - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | |
176 | - continue; | |
177 | - } | |
178 | - if (!token.isNoPrecedingSpace()) | |
179 | - sb.append(" "); | |
180 | - sb.append(token.getOrth()); | |
181 | - } | |
182 | - return sb.toString().trim(); | |
183 | - } | |
184 | - | |
185 | -} | |
186 | 0 | \ No newline at end of file |
nicolas-eval/pom.xml deleted
1 | -<?xml version="1.0" encoding="UTF-8"?> | |
2 | -<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | - <parent> | |
6 | - <artifactId>nicolas-container</artifactId> | |
7 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
8 | - <version>1.0-SNAPSHOT</version> | |
9 | - </parent> | |
10 | - <modelVersion>4.0.0</modelVersion> | |
11 | - | |
12 | - <artifactId>nicolas-eval</artifactId> | |
13 | - | |
14 | - <dependencies> | |
15 | - <!-- project --> | |
16 | - <dependency> | |
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | - <artifactId>nicolas-lib</artifactId> | |
19 | - </dependency> | |
20 | - <dependency> | |
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
22 | - <artifactId>nicolas-common</artifactId> | |
23 | - </dependency> | |
24 | - | |
25 | - <!-- internal --> | |
26 | - <dependency> | |
27 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
28 | - <artifactId>eval</artifactId> | |
29 | - </dependency> | |
30 | - | |
31 | - <!-- third party --> | |
32 | - <dependency> | |
33 | - <groupId>nz.ac.waikato.cms.weka</groupId> | |
34 | - <artifactId>weka-stable</artifactId> | |
35 | - </dependency> | |
36 | - <dependency> | |
37 | - <groupId>org.apache.commons</groupId> | |
38 | - <artifactId>commons-lang3</artifactId> | |
39 | - </dependency> | |
40 | - <dependency> | |
41 | - <groupId>com.google.guava</groupId> | |
42 | - <artifactId>guava</artifactId> | |
43 | - </dependency> | |
44 | - | |
45 | - <!-- logging --> | |
46 | - <dependency> | |
47 | - <groupId>org.slf4j</groupId> | |
48 | - <artifactId>slf4j-api</artifactId> | |
49 | - </dependency> | |
50 | - <dependency> | |
51 | - <groupId>org.slf4j</groupId> | |
52 | - <artifactId>slf4j-simple</artifactId> | |
53 | - </dependency> | |
54 | - | |
55 | - </dependencies> | |
56 | -</project> | |
57 | 0 | \ No newline at end of file |
nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted
1 | -199704210012 | |
2 | -199704210042 | |
3 | -199704220007 | |
4 | -199704220018 | |
5 | -199704220021 | |
6 | -199704220044 | |
7 | -199704230006 | |
8 | -199704230014 | |
9 | -199704230029 | |
10 | -199704230043 | |
11 | -199704240008 | |
12 | -199704240019 | |
13 | -199704240020 | |
14 | -199704240021 | |
15 | -199704250018 | |
16 | -199704250022 | |
17 | -199704260014 | |
18 | -199704260015 | |
19 | -199704260016 | |
20 | -199704280023 | |
21 | -199704280025 | |
22 | -199704280027 | |
23 | -199704280031 | |
24 | -199704300031 | |
25 | -199704300042 | |
26 | -199704300046 | |
27 | -199801020010 | |
28 | -199801020031 | |
29 | -199801020035 | |
30 | -199801020070 | |
31 | -199801020076 | |
32 | -199801020079 | |
33 | -199801030068 | |
34 | -199801030090 | |
35 | -199801030091 | |
36 | -199801030129 | |
37 | -199801030148 | |
38 | -199801030158 | |
39 | -199801050023 | |
40 | -199801050059 | |
41 | -199801130087 | |
42 | -199801130129 | |
43 | -199801140182 | |
44 | -199801160119 | |
45 | -199801200106 | |
46 | -199801220140 | |
47 | -199801240061 | |
48 | -199801240096 | |
49 | -199801260047 | |
50 | -199801260070 | |
51 | -199801270055 | |
52 | -199801270110 | |
53 | -199801280123 | |
54 | -199801280158 | |
55 | -199801280159 | |
56 | -199801280241 | |
57 | -199801290022 | |
58 | -199801310003 | |
59 | -199801310037 | |
60 | -199802030127 | |
61 | -199802040159 | |
62 | -199802040182 | |
63 | -199802040202 | |
64 | -199805220133 | |
65 | -199808280158 | |
66 | -199901190073 | |
67 | -199901190115 | |
68 | -199901250112 | |
69 | -199901250117 | |
70 | -199901270103 | |
71 | -199901270120 | |
72 | -199901270122 | |
73 | -199901290095 | |
74 | -199901300101 | |
75 | -199902240095 | |
76 | -199906220029 | |
77 | -199906230024 | |
78 | -199906240084 | |
79 | -199906260027 | |
80 | -199907050045 | |
81 | -199907050076 | |
82 | -199907140166 | |
83 | -199907200002 | |
84 | -199907270004 | |
85 | -199908260001 | |
86 | -199909090036 | |
87 | -199909250018 | |
88 | -199909270029 | |
89 | -199910020027 | |
90 | -199910020029 | |
91 | -199910270011 | |
92 | -199911060044 | |
93 | -199911100038 | |
94 | -199911100064 | |
95 | -199911200030 | |
96 | -199911220063 | |
97 | -199912020060 | |
98 | -199912180026 | |
99 | -199912180034 | |
100 | -199912220030 | |
101 | -199912280024 | |
102 | -199912280046 | |
103 | -199912300021 | |
104 | -199912300029 | |
105 | -200001030029 | |
106 | -200001030053 | |
107 | -200001060034 | |
108 | -200001100035 | |
109 | -200001100046 | |
110 | -200001170029 | |
111 | -200001170033 | |
112 | -200001170060 | |
113 | -200001290045 | |
114 | -200002220027 | |
115 | -200002240034 | |
116 | -200002250031 | |
117 | -200003060062 | |
118 | -200003110050 | |
119 | -200004280047 | |
120 | -200004290022 | |
121 | -200006050119 | |
122 | -200006260079 | |
123 | -200006290045 | |
124 | -200007150033 | |
125 | -200008040076 | |
126 | -200008220042 | |
127 | -200008220046 | |
128 | -200010130049 | |
129 | -200010160054 | |
130 | -200012130034 | |
131 | -200012140084 | |
132 | -200012290046 | |
133 | -200104040019 | |
134 | -200106050035 | |
135 | -200108180109 | |
136 | -200108300032 | |
137 | -200111120045 | |
138 | -200111150042 | |
139 | -200111150047 | |
140 | -200111200036 | |
141 | -200111270049 | |
142 | -200112030055 | |
143 | -200112280057 | |
144 | -200201220038 | |
145 | -200201220050 | |
146 | -200202020036 | |
147 | -200202200032 | |
148 | -200202210054 | |
149 | -200202270044 | |
150 | -200203010070 | |
151 | -200203190026 | |
152 | -200203260050 | |
153 | -200203280017 | |
154 | -200203290078 |
nicolas-lib/pom.xml
... | ... | @@ -12,15 +12,6 @@ |
12 | 12 | <artifactId>nicolas-lib</artifactId> |
13 | 13 | |
14 | 14 | <dependencies> |
15 | - <!-- project --> | |
16 | - <dependency> | |
17 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | - <artifactId>nicolas-common</artifactId> | |
19 | - </dependency> | |
20 | - <dependency> | |
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
22 | - <artifactId>nicolas-model</artifactId> | |
23 | - </dependency> | |
24 | 15 | |
25 | 16 | <!-- internal --> |
26 | 17 | <dependency> |
... | ... | @@ -61,5 +52,10 @@ |
61 | 52 | <groupId>junit</groupId> |
62 | 53 | <artifactId>junit</artifactId> |
63 | 54 | </dependency> |
55 | + <dependency> | |
56 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
57 | + <artifactId>nicolas-model</artifactId> | |
58 | + <scope>test</scope> | |
59 | + </dependency> | |
64 | 60 | </dependencies> |
65 | 61 | </project> |
66 | 62 | \ No newline at end of file |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | 2 | |
3 | -import com.google.common.base.Charsets; | |
4 | 3 | import com.google.common.collect.ImmutableList; |
5 | 4 | |
6 | 5 | import java.nio.charset.Charset; |
7 | - | |
6 | +import java.nio.charset.StandardCharsets; | |
8 | 7 | |
9 | 8 | public class Constants { |
10 | 9 | |
11 | - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; | |
10 | + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); | |
11 | + public static final Charset ENCODING = StandardCharsets.UTF_8; | |
12 | 12 | |
13 | + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; | |
13 | 14 | private static final String MODELS_PATH = ROOT_PATH + "models/"; |
15 | + | |
14 | 16 | public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; |
15 | 17 | public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; |
16 | 18 | public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; |
17 | 19 | |
18 | 20 | private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; |
19 | 21 | public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; |
20 | - | |
21 | - public static final Charset ENCODING = Charsets.UTF_8; | |
22 | - | |
23 | - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); | |
22 | + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt"; | |
24 | 23 | |
25 | 24 | private Constants() { |
26 | 25 | } |
27 | - | |
28 | 26 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -5,12 +5,12 @@ import com.google.common.collect.Sets; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | 8 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
11 | 9 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
12 | 10 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
13 | 11 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
14 | 14 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
15 | 15 | import weka.classifiers.Classifier; |
16 | 16 | |
... | ... | @@ -31,9 +31,9 @@ public class Nicolas { |
31 | 31 | |
32 | 32 | public Nicolas() throws NicolasException { |
33 | 33 | try { |
34 | - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
35 | - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
36 | - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
34 | + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
35 | + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
36 | + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
37 | 37 | |
38 | 38 | mentionFeatureExtractor = new MentionFeatureExtractor(); |
39 | 39 | sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
... | ... | @@ -57,7 +57,7 @@ public class Nicolas { |
57 | 57 | |
58 | 58 | StringBuilder sb = new StringBuilder(); |
59 | 59 | for (TSentence sent : selectedSentences) { |
60 | - sb.append(" ").append(Utils.loadSentence2Orth(sent)); | |
60 | + sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); | |
61 | 61 | } |
62 | 62 | return sb.toString().trim(); |
63 | 63 | } |
... | ... | @@ -74,7 +74,7 @@ public class Nicolas { |
74 | 74 | Random r = new Random(1); |
75 | 75 | Set<TSentence> summary = Sets.newHashSet(); |
76 | 76 | for (TSentence sent : sortedSentences) { |
77 | - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | |
77 | + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); | |
78 | 78 | if (r.nextDouble() > 0.4 && size > targetSize) |
79 | 79 | break; |
80 | 80 | summary.add(sent); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.apply; | |
2 | - | |
3 | -import com.google.common.collect.Lists; | |
4 | -import com.google.common.collect.Maps; | |
5 | -import com.google.common.collect.Sets; | |
6 | -import org.slf4j.Logger; | |
7 | -import org.slf4j.LoggerFactory; | |
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
10 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
11 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
13 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
14 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
15 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
16 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | |
17 | -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | |
18 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; | |
19 | -import weka.classifiers.Classifier; | |
20 | -import weka.core.Instance; | |
21 | -import weka.core.Instances; | |
22 | - | |
23 | -import java.io.BufferedWriter; | |
24 | -import java.io.File; | |
25 | -import java.io.FileWriter; | |
26 | -import java.util.*; | |
27 | - | |
28 | -import static java.util.stream.Collectors.toList; | |
29 | - | |
30 | -public class ApplyModel { | |
31 | - | |
32 | - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); | |
33 | - | |
34 | - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; | |
35 | - private static final String TARGET_DIR = "corpora/summaries"; | |
36 | - | |
37 | - public static void main(String[] args) throws Exception { | |
38 | - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
39 | - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
40 | - | |
41 | - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
42 | - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | |
43 | - | |
44 | - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | |
45 | - | |
46 | - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); | |
47 | - int i = 1; | |
48 | - double avgSize = 0; | |
49 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
50 | - TText text = entry.getValue(); | |
51 | - | |
52 | - Set<TMention> goodMentions | |
53 | - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); | |
54 | - | |
55 | - int targetSize = calculateTargetSize(text); | |
56 | - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); | |
57 | - int size = Utils.tokenize(summary).size(); | |
58 | - avgSize += size; | |
59 | - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { | |
60 | - bw.append(summary); | |
61 | - } | |
62 | - | |
63 | - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); | |
64 | - } | |
65 | - | |
66 | - LOG.info("Avg size:" + avgSize / id2preprocessedText.size()); | |
67 | - } | |
68 | - | |
69 | - private static int calculateTargetSize(TText text) { | |
70 | - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
71 | - StringBuffer body = new StringBuffer(); | |
72 | - for (TSentence sent : sents) | |
73 | - body.append(Utils.loadSentence2Orth(sent) + " "); | |
74 | - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
75 | - return (int) (0.2 * tokenCount); | |
76 | - } | |
77 | - | |
78 | - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { | |
79 | - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); | |
80 | - | |
81 | - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); | |
82 | - | |
83 | - StringBuilder sb = new StringBuilder(); | |
84 | - for (TSentence sent : selectedSentences) { | |
85 | - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); | |
86 | - } | |
87 | - return sb.toString().trim(); | |
88 | - } | |
89 | - | |
90 | - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | |
91 | - | |
92 | - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
93 | - | |
94 | - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | |
95 | - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
96 | - | |
97 | - Map<TSentence, Double> sentence2score = Maps.newHashMap(); | |
98 | - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | |
99 | - Instance instance = entry.getValue(); | |
100 | - instance.setDataset(instances); | |
101 | - double score = sentenceClassifier.classifyInstance(instance); | |
102 | - sentence2score.put(entry.getKey(), score); | |
103 | - } | |
104 | - | |
105 | - List<TSentence> sortedSents = Lists.newArrayList(sents); | |
106 | - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | |
107 | - | |
108 | - int size = 0; | |
109 | - Random r = new Random(1); | |
110 | - Set<TSentence> summary = Sets.newHashSet(); | |
111 | - for (TSentence sent : sortedSents) { | |
112 | - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | |
113 | - if (r.nextDouble() > 0.4 && size > targetSize) | |
114 | - break; | |
115 | - summary.add(sent); | |
116 | - if (size > targetSize) | |
117 | - break; | |
118 | - } | |
119 | - List<TSentence> selectedSentences = Lists.newArrayList(); | |
120 | - for (TSentence sent : sents) { | |
121 | - if (summary.contains(sent)) | |
122 | - selectedSentences.add(sent); | |
123 | - } | |
124 | - return selectedSentences; | |
125 | - } | |
126 | - | |
127 | -} |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
... | ... | @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import com.google.common.collect.Sets; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
7 | 6 | |
8 | 7 | import java.util.List; |
9 | 8 | import java.util.Map; |
... | ... | @@ -38,7 +37,6 @@ public class FeatureHelper { |
38 | 37 | private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); |
39 | 38 | private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); |
40 | 39 | |
41 | - | |
42 | 40 | public FeatureHelper(TText preprocessedText) { |
43 | 41 | text = preprocessedText; |
44 | 42 | |
... | ... | @@ -60,9 +58,9 @@ public class FeatureHelper { |
60 | 58 | int sentIdx = 0; |
61 | 59 | int mentionIdx = 0; |
62 | 60 | for (TParagraph par : preprocessedText.getParagraphs()) { |
63 | - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); | |
61 | + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); | |
64 | 62 | mention2Orth.putAll(m2o); |
65 | - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); | |
63 | + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); | |
66 | 64 | mention2Base.putAll(m2b); |
67 | 65 | |
68 | 66 | int sentIdxInPar = 0; |
... | ... | @@ -221,4 +219,40 @@ public class FeatureHelper { |
221 | 219 | return null; |
222 | 220 | return mention2sent.get(mention).getTokens().get(idx - 1); |
223 | 221 | } |
222 | + | |
223 | + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { | |
224 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
225 | + for (TSentence s : sents) { | |
226 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
227 | + | |
228 | + for (TMention m : s.getMentions()) { | |
229 | + StringBuilder mentionOrth = new StringBuilder(); | |
230 | + for (String tokId : m.getChildIds()) { | |
231 | + TToken token = tokId2tok.get(tokId); | |
232 | + if (!token.isNoPrecedingSpace()) | |
233 | + mentionOrth.append(" "); | |
234 | + mentionOrth.append(token.getOrth()); | |
235 | + } | |
236 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
237 | + } | |
238 | + } | |
239 | + return mention2orth; | |
240 | + } | |
241 | + | |
242 | + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { | |
243 | + Map<TMention, String> mention2base = Maps.newHashMap(); | |
244 | + for (TSentence s : sents) { | |
245 | + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); | |
246 | + | |
247 | + for (TMention m : s.getMentions()) { | |
248 | + StringBuilder mentionBase = new StringBuilder(); | |
249 | + for (String tokId : m.getChildIds()) { | |
250 | + mentionBase.append(" "); | |
251 | + mentionBase.append(tokId2base.get(tokId)); | |
252 | + } | |
253 | + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); | |
254 | + } | |
255 | + } | |
256 | + return mention2base; | |
257 | + } | |
224 | 258 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... | ... | @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; |
3 | 3 | import com.google.common.collect.Lists; |
4 | 4 | import com.google.common.collect.Maps; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
8 | 7 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
9 | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | |
11 | 11 | import weka.core.Attribute; |
12 | 12 | |
13 | 13 | import java.io.IOException; |
... | ... | @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
21 | 21 | private final List<String> frequentBases; |
22 | 22 | |
23 | 23 | public MentionFeatureExtractor() throws IOException { |
24 | - frequentBases = loadFrequentBases(); | |
24 | + frequentBases = ResourceUtils.loadFrequentBases(); | |
25 | 25 | |
26 | 26 | //coref |
27 | 27 | addNumericAttributeNormalized("chain_length"); |
... | ... | @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
80 | 80 | fillSortedAttributes("score"); |
81 | 81 | } |
82 | 82 | |
83 | - private List<String> loadFrequentBases() throws IOException { | |
84 | - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); | |
85 | - } | |
86 | - | |
87 | 83 | private String encodeBase(String base) { |
88 | 84 | return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); |
89 | 85 | } |
... | ... | @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { |
177 | 173 | Attribute att = getAttributeByName(attributeName); |
178 | 174 | int index = att.indexOfValue(value); |
179 | 175 | if (index == -1) |
180 | - LOG.warn(value + " not found for attribute " + attributeName); | |
176 | + LOG.warn("{} not found for attribute {}", value, attributeName); | |
181 | 177 | attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); |
182 | 178 | } |
183 | 179 | |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... | ... | @@ -5,8 +5,7 @@ import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | |
10 | 9 | import weka.classifiers.Classifier; |
11 | 10 | import weka.core.Instance; |
12 | 11 | import weka.core.Instances; |
... | ... | @@ -24,7 +23,7 @@ public class MentionModel { |
24 | 23 | public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { |
25 | 24 | Set<TMention> goodMentions = Sets.newHashSet(); |
26 | 25 | |
27 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
26 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
28 | 27 | Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); |
29 | 28 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
30 | 29 | Instance instance = entry.getValue(); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... | ... | @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | |
11 | 10 | import weka.classifiers.Classifier; |
12 | 11 | import weka.core.Instance; |
13 | 12 | import weka.core.Instances; |
... | ... | @@ -23,7 +22,7 @@ public class SentenceModel { |
23 | 22 | } |
24 | 23 | |
25 | 24 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { |
26 | - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | |
25 | + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | |
27 | 26 | Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
28 | 27 | |
29 | 28 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; | |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import org.slf4j.Logger; |
... | ... | @@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
11 | 11 | import weka.core.Attribute; |
12 | 12 | import weka.core.DenseInstance; |
13 | 13 | import weka.core.Instance; |
14 | +import weka.core.Instances; | |
14 | 15 | |
16 | +import java.util.ArrayList; | |
15 | 17 | import java.util.List; |
16 | 18 | import java.util.Map; |
17 | 19 | import java.util.Set; |
... | ... | @@ -22,6 +24,8 @@ public class InstanceUtils { |
22 | 24 | |
23 | 25 | private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); |
24 | 26 | |
27 | + private static final String DATASET_NAME = "Dataset"; | |
28 | + | |
25 | 29 | private InstanceUtils() { |
26 | 30 | } |
27 | 31 | |
... | ... | @@ -60,4 +64,11 @@ public class InstanceUtils { |
60 | 64 | LOG.info("Extracted features of {} sentences.", sentence2instance.size()); |
61 | 65 | return sentence2instance; |
62 | 66 | } |
67 | + | |
68 | + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList | |
69 | + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { | |
70 | + Instances instances = new Instances(DATASET_NAME, attributesList, 0); | |
71 | + instances.setClassIndex(0); | |
72 | + return instances; | |
73 | + } | |
63 | 74 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; | |
2 | + | |
3 | +import org.apache.commons.io.IOUtils; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import weka.classifiers.Classifier; | |
8 | + | |
9 | +import java.io.IOException; | |
10 | +import java.io.InputStream; | |
11 | +import java.io.ObjectInputStream; | |
12 | +import java.util.List; | |
13 | +import java.util.function.Predicate; | |
14 | +import java.util.stream.Collectors; | |
15 | + | |
16 | +public class ResourceUtils { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class); | |
19 | + | |
20 | + private ResourceUtils() { | |
21 | + } | |
22 | + | |
23 | + public static List<String> loadFrequentBases() throws IOException { | |
24 | + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH); | |
25 | + } | |
26 | + | |
27 | + public static List<String> loadStopwords() throws IOException { | |
28 | + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH); | |
29 | + } | |
30 | + | |
31 | + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | |
32 | + LOG.info("Loading classifier from path: {}...", modelResourcePath); | |
33 | + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) { | |
34 | + if (stream == null) { | |
35 | + throw new IOException("Model not found at: " + modelResourcePath); | |
36 | + } | |
37 | + try (ObjectInputStream ois = new ObjectInputStream(stream)) { | |
38 | + Classifier classifier = (Classifier) ois.readObject(); | |
39 | + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); | |
40 | + return classifier; | |
41 | + } catch (ClassNotFoundException e) { | |
42 | + LOG.error("Error loading serialized classifier, class not found.", e); | |
43 | + throw new IOException(e); | |
44 | + } | |
45 | + } | |
46 | + } | |
47 | + | |
48 | + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { | |
49 | + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { | |
50 | + return IOUtils.readLines(stream, Constants.ENCODING) | |
51 | + .stream() | |
52 | + .map(String::trim) | |
53 | + .map(String::toLowerCase) | |
54 | + .filter(((Predicate<String>) String::isEmpty).negate()) | |
55 | + .sorted() | |
56 | + .distinct() | |
57 | + .collect(Collectors.toList()); | |
58 | + } | |
59 | + } | |
60 | + | |
61 | + | |
62 | +} | |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils; | |
2 | + | |
3 | +import com.google.common.collect.Sets; | |
4 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
6 | + | |
7 | +import java.util.Arrays; | |
8 | +import java.util.List; | |
9 | +import java.util.Set; | |
10 | + | |
11 | +public class TextUtils { | |
12 | + | |
13 | + private TextUtils() { | |
14 | + } | |
15 | + | |
16 | + public static List<String> tokenize(String text) { | |
17 | + return Arrays.asList(text.split("[^\\p{L}0-9]+")); | |
18 | + } | |
19 | + | |
20 | + public static List<String> tokenizeOnWhitespace(String text) { | |
21 | + return Arrays.asList(text.split(" +")); | |
22 | + } | |
23 | + | |
24 | + public static String loadSentence2Orth(TSentence sentence) { | |
25 | + return loadSentence2Orth(sentence, Sets.newHashSet()); | |
26 | + } | |
27 | + | |
28 | + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { | |
29 | + StringBuilder sb = new StringBuilder(); | |
30 | + for (TToken token : sentence.getTokens()) { | |
31 | + if (tokenIdsToSkip.contains(token.getId())) { | |
32 | + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); | |
33 | + continue; | |
34 | + } | |
35 | + if (!token.isNoPrecedingSpace()) | |
36 | + sb.append(" "); | |
37 | + sb.append(token.getOrth()); | |
38 | + } | |
39 | + return sb.toString().trim(); | |
40 | + } | |
41 | +} | |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; | |
2 | 2 | |
3 | 3 | import com.google.common.base.Predicates; |
4 | 4 | import com.google.common.collect.Maps; |
... | ... | @@ -58,4 +58,12 @@ public class ThriftUtils { |
58 | 58 | } |
59 | 59 | } |
60 | 60 | |
61 | + public static TText loadThriftTextFromResource(String resourcePath) { | |
62 | + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) { | |
63 | + return loadThriftTextFromStream(stream); | |
64 | + } catch (IOException e) { | |
65 | + LOG.error("Error reading serialized Thrift text from resource", e); | |
66 | + return null; | |
67 | + } | |
68 | + } | |
61 | 69 | } |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... | ... | @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
11 | 11 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
13 | 13 | import weka.core.Attribute; |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java
... | ... | @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; |
3 | 3 | import com.google.common.collect.Sets; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
7 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | |
8 | 8 | import weka.classifiers.Classifier; |
9 | 9 | import weka.core.Instance; |
10 | 10 | import weka.core.Instances; |
... | ... | @@ -24,7 +24,7 @@ public class ZeroSubjectInjector { |
24 | 24 | public ZeroSubjectInjector() throws Exception { |
25 | 25 | classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); |
26 | 26 | featureExtractor = new ZeroFeatureExtractor(); |
27 | - instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
27 | + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
28 | 28 | } |
29 | 29 | |
30 | 30 | public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { |
... | ... |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java
... | ... | @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; |
3 | 3 | import org.junit.BeforeClass; |
4 | 4 | import org.junit.Test; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
7 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
7 | 8 | |
8 | 9 | import static org.junit.Assert.assertTrue; |
9 | 10 | |
... | ... | @@ -20,9 +21,9 @@ public class NicolasTest { |
20 | 21 | |
21 | 22 | @Test |
22 | 23 | public void shouldSummarizeThriftText() throws Exception { |
23 | - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | |
24 | + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | |
24 | 25 | String summary = nicolas.summarizeThrift(thriftText, 5); |
25 | - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); | |
26 | + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size(); | |
26 | 27 | assertTrue(summaryTokensCount > 0); |
27 | 28 | assertTrue(summaryTokensCount < 10); |
28 | 29 | } |
... | ... |
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java renamed to nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.common; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; | |
2 | 2 | |
3 | 3 | import org.junit.Test; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
... | ... | @@ -7,13 +7,13 @@ import java.io.InputStream; |
7 | 7 | |
8 | 8 | import static org.junit.Assert.assertEquals; |
9 | 9 | |
10 | -public class UtilsTest { | |
10 | +public class ThriftUtilsTest { | |
11 | 11 | |
12 | 12 | private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; |
13 | 13 | |
14 | 14 | @Test |
15 | 15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { |
16 | - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | |
16 | + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | |
17 | 17 | TText text = ThriftUtils.loadThriftTextFromStream(stream); |
18 | 18 | assertEquals(26, text.getParagraphs().size()); |
19 | 19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); |
... | ... |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... | ... | @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; |
5 | 5 | import org.junit.Test; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
9 | 8 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
10 | 10 | |
11 | 11 | import java.io.IOException; |
12 | 12 | import java.io.InputStream; |
... | ... |
nicolas-common/src/test/resources/199704210011.bin renamed to nicolas-lib/src/test/resources/199704210011.bin
No preview for this file type
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore
0 → 100644
nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md
0 → 100644
nicolas-multiservice/pom.xml
... | ... | @@ -30,8 +30,12 @@ |
30 | 30 | |
31 | 31 | <!-- test --> |
32 | 32 | <dependency> |
33 | + <groupId>junit</groupId> | |
34 | + <artifactId>junit</artifactId> | |
35 | + </dependency> | |
36 | + <dependency> | |
33 | 37 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
34 | - <artifactId>nicolas-common</artifactId> | |
38 | + <artifactId>nicolas-lib</artifactId> | |
35 | 39 | <scope>test</scope> |
36 | 40 | </dependency> |
37 | 41 | |
... | ... |
nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java
... | ... | @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
11 | 11 | |
12 | 12 | import java.io.File; |
13 | 13 | import java.io.FileInputStream; |
... | ... | @@ -67,7 +67,7 @@ public class PreprocessorIT { |
67 | 67 | preprocessor.preprocessToFile(text, targetFile); |
68 | 68 | |
69 | 69 | try (FileInputStream inputStream = new FileInputStream(targetFile)) { |
70 | - TText processed = Utils.loadThriftTextFromStream(inputStream); | |
70 | + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream); | |
71 | 71 | assertSampleProcessedText(processed); |
72 | 72 | } |
73 | 73 | } |
... | ... |
nicolas-train/pom.xml
... | ... | @@ -15,10 +15,6 @@ |
15 | 15 | <!-- project --> |
16 | 16 | <dependency> |
17 | 17 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
18 | - <artifactId>nicolas-common</artifactId> | |
19 | - </dependency> | |
20 | - <dependency> | |
21 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
22 | 18 | <artifactId>nicolas-lib</artifactId> |
23 | 19 | </dependency> |
24 | 20 | <dependency> |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas; | |
2 | 2 | |
3 | 3 | import net.lingala.zip4j.core.ZipFile; |
4 | 4 | import net.lingala.zip4j.exception.ZipException; |
... | ... | @@ -34,7 +34,7 @@ public class PathConstants { |
34 | 34 | public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); |
35 | 35 | public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); |
36 | 36 | |
37 | - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); | |
37 | + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); | |
38 | 38 | public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); |
39 | 39 | public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); |
40 | 40 | public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
... | ... | @@ -17,7 +17,7 @@ public class Constants { |
17 | 17 | |
18 | 18 | public static Set<String> loadTestTextIds() throws IOException { |
19 | 19 | try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { |
20 | - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); | |
20 | + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); | |
21 | 21 | return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); |
22 | 22 | } |
23 | 23 | } |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
... | ... | @@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.NicolasException; |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
12 | 12 | |
13 | 13 | import java.io.File; |
14 | +import java.io.FileOutputStream; | |
14 | 15 | import java.io.IOException; |
16 | +import java.io.OutputStreamWriter; | |
15 | 17 | import java.util.List; |
16 | 18 | import java.util.Map; |
17 | 19 | import java.util.Set; |
... | ... | @@ -23,7 +25,6 @@ public class SummarizeTestCorpus { |
23 | 25 | |
24 | 26 | private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); |
25 | 27 | |
26 | - | |
27 | 28 | private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; |
28 | 29 | private static final double SUMMARY_RATIO = 0.2; |
29 | 30 | |
... | ... | @@ -31,8 +32,8 @@ public class SummarizeTestCorpus { |
31 | 32 | } |
32 | 33 | |
33 | 34 | public static void main(String[] args) throws IOException, NicolasException { |
34 | - File thriftedCorpusDir = new File("data/preprocessed"); | |
35 | - File targetDir = new File("data/summaries"); | |
35 | + File thriftedCorpusDir = new File("data/all-preprocessed"); | |
36 | + File targetDir = new File("data/test-system"); | |
36 | 37 | targetDir.mkdir(); |
37 | 38 | |
38 | 39 | Set<String> testTextIds = loadTestTextIds(); |
... | ... | @@ -62,9 +63,9 @@ public class SummarizeTestCorpus { |
62 | 63 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
63 | 64 | StringBuilder body = new StringBuilder(); |
64 | 65 | for (TSentence sentence : sentences) |
65 | - body.append(Utils.loadSentence2Orth(sentence)).append(" "); | |
66 | + body.append(TextUtils.loadSentence2Orth(sentence)).append(" "); | |
66 | 67 | |
67 | - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
68 | + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
68 | 69 | return (int) (SUMMARY_RATIO * tokenCount); |
69 | 70 | } |
70 | 71 | |
... | ... | @@ -73,7 +74,9 @@ public class SummarizeTestCorpus { |
73 | 74 | String textId = entry.getKey(); |
74 | 75 | String summary = entry.getValue(); |
75 | 76 | String targetFileName = textId + SUMMARY_FILE_SUFFIX; |
76 | - Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); | |
77 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { | |
78 | + writer.write(summary); | |
79 | + } | |
77 | 80 | } |
78 | 81 | } |
79 | 82 | |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java
... | ... | @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
10 | 12 | |
13 | +import java.io.IOException; | |
11 | 14 | import java.util.List; |
12 | 15 | import java.util.Map; |
16 | +import java.util.Set; | |
17 | +import java.util.function.Function; | |
13 | 18 | import java.util.stream.Collectors; |
14 | 19 | |
15 | 20 | public class MentionScorer { |
16 | 21 | |
22 | + private final Set<String> STOPWORDS; | |
23 | + | |
24 | + public MentionScorer() throws IOException { | |
25 | + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet()); | |
26 | + } | |
27 | + | |
17 | 28 | public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { |
18 | - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
29 | + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); | |
19 | 30 | |
20 | 31 | List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); |
21 | - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); | |
32 | + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); | |
22 | 33 | |
23 | 34 | return booleanTokenIntersection(mention2Orth, tokenCounts); |
24 | 35 | } |
25 | 36 | |
37 | + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) { | |
38 | + Map<TMention, String> mention2orth = Maps.newHashMap(); | |
39 | + for (TSentence s : sents) { | |
40 | + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); | |
41 | + | |
42 | + for (TMention m : s.getMentions()) { | |
43 | + StringBuilder mentionOrth = new StringBuilder(); | |
44 | + for (String tokId : m.getChildIds()) { | |
45 | + TToken token = tokId2tok.get(tokId); | |
46 | + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { | |
47 | + continue; | |
48 | + } | |
49 | + | |
50 | + if (!token.isNoPrecedingSpace()) | |
51 | + mentionOrth.append(" "); | |
52 | + mentionOrth.append(token.getOrth()); | |
53 | + } | |
54 | + mention2orth.put(m, mentionOrth.toString().trim()); | |
55 | + } | |
56 | + } | |
57 | + return mention2orth; | |
58 | + } | |
59 | + | |
26 | 60 | private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { |
27 | 61 | Map<TMention, Double> mention2score = Maps.newHashMap(); |
28 | 62 | for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { |
29 | 63 | TMention mention = entry.getKey(); |
30 | 64 | String mentionOrth = mention2Orth.get(mention); |
31 | - for (String token : Utils.tokenize(mentionOrth)) { | |
65 | + for (String token : TextUtils.tokenize(mentionOrth)) { | |
32 | 66 | if (tokenCounts.contains(token.toLowerCase())) { |
33 | 67 | mention2score.put(mention, 1.0); |
34 | 68 | break; |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java
... | ... | @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; | |
10 | 10 | |
11 | 11 | import java.util.List; |
12 | 12 | import java.util.Map; |
13 | 13 | |
14 | 14 | public class SentenceScorer { |
15 | + | |
15 | 16 | public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { |
16 | - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); | |
17 | + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); | |
17 | 18 | |
18 | 19 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
19 | 20 | for (TParagraph paragraph : preprocessedText.getParagraphs()) |
20 | 21 | for (TSentence sentence : paragraph.getSentences()) { |
21 | 22 | double score = 0.0; |
22 | 23 | |
23 | - String orth = Utils.loadSentence2Orth(sentence); | |
24 | - List<String> tokens = Utils.tokenize(orth); | |
24 | + String orth = TextUtils.loadSentence2Orth(sentence); | |
25 | + List<String> tokens = TextUtils.tokenize(orth); | |
25 | 26 | for (String token : tokens) { |
26 | 27 | score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; |
27 | 28 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java
... | ... | @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; |
5 | 5 | import org.apache.commons.csv.CSVParser; |
6 | 6 | import org.apache.commons.csv.CSVRecord; |
7 | 7 | import org.apache.commons.csv.QuoteMode; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; |
11 | 11 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java
... | ... | @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; |
8 | 8 | import org.apache.commons.io.FileUtils; |
9 | 9 | import pl.waw.ipipan.zil.summ.eval.Main; |
10 | 10 | import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
12 | 12 | |
13 | 13 | import java.io.File; |
14 | 14 | import java.io.IOException; |
15 | 15 | import java.util.*; |
16 | 16 | import java.util.stream.Collectors; |
17 | 17 | |
18 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | |
18 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
19 | 19 | |
20 | 20 | public class CreateOptimalSummaries { |
21 | 21 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java
1 | 1 | package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | |
3 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
4 | 4 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
5 | 5 | import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; |
6 | 6 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
7 | 7 | |
8 | 8 | import javax.xml.bind.JAXBException; |
9 | 9 | import java.io.File; |
10 | +import java.io.FileOutputStream; | |
10 | 11 | import java.io.IOException; |
12 | +import java.io.OutputStreamWriter; | |
11 | 13 | import java.util.List; |
12 | 14 | import java.util.function.Predicate; |
13 | 15 | import java.util.stream.Collectors; |
14 | 16 | import java.util.stream.Stream; |
15 | 17 | |
16 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | |
18 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
17 | 19 | |
18 | 20 | public class ExtractGoldSummaries { |
19 | 21 | |
... | ... | @@ -22,7 +24,6 @@ public class ExtractGoldSummaries { |
22 | 24 | |
23 | 25 | private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); |
24 | 26 | |
25 | - | |
26 | 27 | private ExtractGoldSummaries() { |
27 | 28 | } |
28 | 29 | |
... | ... | @@ -47,7 +48,10 @@ public class ExtractGoldSummaries { |
47 | 48 | for (Summary summary : goldSummaries) { |
48 | 49 | File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; |
49 | 50 | File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); |
50 | - Utils.writeStringToFile(summary.getBody(), targetFile); | |
51 | + | |
52 | + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { | |
53 | + writer.append(summary.getBody()); | |
54 | + } | |
51 | 55 | } |
52 | 56 | } |
53 | 57 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
... | ... | @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
11 | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
12 | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
13 | -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
14 | -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
15 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
16 | 13 | import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
17 | 14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
18 | 15 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
19 | 16 | import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; |
20 | 17 | import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; |
21 | 18 | import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; |
19 | +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; | |
20 | +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; | |
22 | 21 | import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; |
23 | 22 | import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; |
24 | 23 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; |
... | ... | @@ -37,7 +36,7 @@ import java.util.Set; |
37 | 36 | import java.util.function.Predicate; |
38 | 37 | import java.util.stream.Collectors; |
39 | 38 | |
40 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | |
39 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
41 | 40 | |
42 | 41 | public class PrepareTrainingData { |
43 | 42 | |
... | ... | @@ -61,7 +60,7 @@ public class PrepareTrainingData { |
61 | 60 | MentionScorer mentionScorer = new MentionScorer(); |
62 | 61 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
63 | 62 | |
64 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
63 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
65 | 64 | |
66 | 65 | int i = 1; |
67 | 66 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
... | ... | @@ -105,7 +104,7 @@ public class PrepareTrainingData { |
105 | 104 | SentenceScorer sentenceScorer = new SentenceScorer(); |
106 | 105 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
107 | 106 | |
108 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
107 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
109 | 108 | |
110 | 109 | int i = 1; |
111 | 110 | for (String textId : id2preprocessedText.keySet()) { |
... | ... | @@ -149,7 +148,7 @@ public class PrepareTrainingData { |
149 | 148 | ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); |
150 | 149 | ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); |
151 | 150 | |
152 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
151 | + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); | |
153 | 152 | |
154 | 153 | int i = 1; |
155 | 154 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java
... | ... | @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
9 | 9 | import java.io.File; |
10 | 10 | import java.util.Arrays; |
11 | 11 | |
12 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | |
12 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
13 | 13 | |
14 | 14 | public class PreprocessCorpus { |
15 | 15 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
... | ... | @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.Constants; | |
7 | 7 | import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; |
8 | 8 | import weka.classifiers.Classifier; |
9 | 9 | import weka.core.Instances; |
... | ... | @@ -14,7 +14,7 @@ import java.io.FileOutputStream; |
14 | 14 | import java.io.ObjectOutputStream; |
15 | 15 | import java.util.logging.LogManager; |
16 | 16 | |
17 | -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; | |
17 | +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; | |
18 | 18 | |
19 | 19 | public class TrainAllModels { |
20 | 20 | |
... | ... |
pom.xml
... | ... | @@ -10,15 +10,12 @@ |
10 | 10 | |
11 | 11 | <packaging>pom</packaging> |
12 | 12 | |
13 | - | |
14 | 13 | <modules> |
15 | 14 | <module>nicolas-lib</module> |
16 | 15 | <module>nicolas-cli</module> |
17 | 16 | <module>nicolas-model</module> |
18 | 17 | <module>nicolas-train</module> |
19 | - <module>nicolas-common</module> | |
20 | 18 | <module>nicolas-multiservice</module> |
21 | - <module>nicolas-eval</module> | |
22 | 19 | </modules> |
23 | 20 | |
24 | 21 | <properties> |
... | ... | @@ -59,23 +56,23 @@ |
59 | 56 | <!-- project --> |
60 | 57 | <dependency> |
61 | 58 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
62 | - <artifactId>nicolas-model</artifactId> | |
59 | + <artifactId>nicolas-cli</artifactId> | |
63 | 60 | <version>${project.version}</version> |
64 | - <scope>runtime</scope> | |
65 | 61 | </dependency> |
66 | 62 | <dependency> |
67 | 63 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
68 | - <artifactId>nicolas-common</artifactId> | |
64 | + <artifactId>nicolas-lib</artifactId> | |
69 | 65 | <version>${project.version}</version> |
70 | 66 | </dependency> |
71 | 67 | <dependency> |
72 | 68 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
73 | - <artifactId>nicolas-zero</artifactId> | |
69 | + <artifactId>nicolas-model</artifactId> | |
74 | 70 | <version>${project.version}</version> |
71 | + <scope>runtime</scope> | |
75 | 72 | </dependency> |
76 | 73 | <dependency> |
77 | 74 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
78 | - <artifactId>nicolas-lib</artifactId> | |
75 | + <artifactId>nicolas-multiservice</artifactId> | |
79 | 76 | <version>${project.version}</version> |
80 | 77 | </dependency> |
81 | 78 | <dependency> |
... | ... | @@ -83,11 +80,6 @@ |
83 | 80 | <artifactId>nicolas-train</artifactId> |
84 | 81 | <version>${project.version}</version> |
85 | 82 | </dependency> |
86 | - <dependency> | |
87 | - <groupId>pl.waw.ipipan.zil.summ</groupId> | |
88 | - <artifactId>nicolas-multiservice</artifactId> | |
89 | - <version>${project.version}</version> | |
90 | - </dependency> | |
91 | 83 | |
92 | 84 | <!-- internal --> |
93 | 85 | <dependency> |
... | ... |