Commit 2169abf847196b768600339e9e18d7ab3fe70f3d
1 parent
cb490cab
WIP
Showing
44 changed files
with
529 additions
and
351 deletions
.gitignore
nicolas-cli/README.md
@@ -3,6 +3,8 @@ | @@ -3,6 +3,8 @@ | ||
3 | This module contains a sample command-line application, which uses the Nicolas library to summarize a chosen input text file. | 3 | This module contains a sample command-line application, which uses the Nicolas library to summarize a chosen input text file. |
4 | The summary is written to a target output file. Additionally, the user needs to specify the desired number of tokens in the summary. | 4 | The summary is written to a target output file. Additionally, the user needs to specify the desired number of tokens in the summary. |
5 | 5 | ||
6 | +Be aware that the summarizer requires internet access and a working Multiservice (multiservice.nlp.ipipan.waw.pl). | ||
7 | + | ||
6 | ## Installation | 8 | ## Installation |
7 | 9 | ||
8 | mvn clean install | 10 | mvn clean install |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java
@@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli; | @@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
3 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
4 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
5 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | 5 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
6 | +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | ||
6 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | 7 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
7 | 8 | ||
8 | -import java.io.IOException; | ||
9 | - | ||
10 | public class Main { | 9 | public class Main { |
11 | 10 | ||
12 | private static final Logger LOG = LoggerFactory.getLogger(Main.class); | 11 | private static final Logger LOG = LoggerFactory.getLogger(Main.class); |
@@ -26,7 +25,7 @@ public class Main { | @@ -26,7 +25,7 @@ public class Main { | ||
26 | try { | 25 | try { |
27 | nicolas = new Nicolas(); | 26 | nicolas = new Nicolas(); |
28 | preprocessor = new Preprocessor(); | 27 | preprocessor = new Preprocessor(); |
29 | - } catch (IOException | ClassNotFoundException e) { | 28 | + } catch (NicolasException e) { |
30 | LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); | 29 | LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); |
31 | return; | 30 | return; |
32 | } | 31 | } |
nicolas-common/pom.xml
@@ -25,7 +25,7 @@ | @@ -25,7 +25,7 @@ | ||
25 | <!-- third party --> | 25 | <!-- third party --> |
26 | <dependency> | 26 | <dependency> |
27 | <groupId>nz.ac.waikato.cms.weka</groupId> | 27 | <groupId>nz.ac.waikato.cms.weka</groupId> |
28 | - <artifactId>weka-dev</artifactId> | 28 | + <artifactId>weka-stable</artifactId> |
29 | </dependency> | 29 | </dependency> |
30 | <dependency> | 30 | <dependency> |
31 | <groupId>commons-io</groupId> | 31 | <groupId>commons-io</groupId> |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | ||
2 | + | ||
3 | +import com.google.common.base.Predicates; | ||
4 | +import com.google.common.collect.Maps; | ||
5 | +import org.slf4j.Logger; | ||
6 | +import org.slf4j.LoggerFactory; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.io.FileInputStream; | ||
11 | +import java.io.IOException; | ||
12 | +import java.io.InputStream; | ||
13 | +import java.util.Map; | ||
14 | +import java.util.function.Predicate; | ||
15 | + | ||
16 | +public class ThriftUtils { | ||
17 | + | ||
18 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | ||
19 | + | ||
20 | + private ThriftUtils() { | ||
21 | + } | ||
22 | + | ||
23 | + public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) { | ||
24 | + Map<String, TText> id2text = Maps.newHashMap(); | ||
25 | + File[] files = folder.listFiles(); | ||
26 | + if (files != null) { | ||
27 | + for (File processedFullTextFile : files) { | ||
28 | + String textId = processedFullTextFile.getName().split("\\.")[0]; | ||
29 | + if (!idFilter.test(textId)) | ||
30 | + continue; | ||
31 | + TText processedFullText = loadThriftTextFromFile(processedFullTextFile); | ||
32 | + id2text.put(textId, processedFullText); | ||
33 | + } | ||
34 | + } | ||
35 | + LOG.info("{} preprocessed texts found.", id2text.size()); | ||
36 | + return id2text; | ||
37 | + } | ||
38 | + | ||
39 | + public static Map<String, TText> loadThriftTextsFromFolder(File folder) { | ||
40 | + return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue()); | ||
41 | + } | ||
42 | + | ||
43 | + public static TText loadThriftTextFromFile(File originalFile) { | ||
44 | + try (FileInputStream inputStream = new FileInputStream(originalFile)) { | ||
45 | + return loadThriftTextFromStream(inputStream); | ||
46 | + } catch (IOException e) { | ||
47 | + LOG.error("Error reading serialized Thrift file", e); | ||
48 | + return null; | ||
49 | + } | ||
50 | + } | ||
51 | + | ||
52 | + public static TText loadThriftTextFromStream(InputStream stream) { | ||
53 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | ||
54 | + return (TText) ois.readObject(); | ||
55 | + } catch (ClassNotFoundException | IOException e) { | ||
56 | + LOG.error("Error reading serialized Thrift stream", e); | ||
57 | + return null; | ||
58 | + } | ||
59 | + } | ||
60 | + | ||
61 | +} |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -28,6 +28,12 @@ public class Utils { | @@ -28,6 +28,12 @@ public class Utils { | ||
28 | private Utils() { | 28 | private Utils() { |
29 | } | 29 | } |
30 | 30 | ||
31 | + public static void writeStringToFile(String string, File file) throws IOException { | ||
32 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { | ||
33 | + bw.append(string); | ||
34 | + } | ||
35 | + } | ||
36 | + | ||
31 | public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { | 37 | public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { |
32 | LOG.info("Loading classifier from path: {}...", modelResourcePath); | 38 | LOG.info("Loading classifier from path: {}...", modelResourcePath); |
33 | try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { | 39 | try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { |
@@ -76,44 +82,15 @@ public class Utils { | @@ -76,44 +82,15 @@ public class Utils { | ||
76 | return instances; | 82 | return instances; |
77 | } | 83 | } |
78 | 84 | ||
79 | - public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException { | 85 | + public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { |
80 | LOG.info("Loading classifier..."); | 86 | LOG.info("Loading classifier..."); |
81 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) { | 87 | + try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { |
82 | Classifier classifier = (Classifier) ois.readObject(); | 88 | Classifier classifier = (Classifier) ois.readObject(); |
83 | LOG.info("Done. " + classifier.toString()); | 89 | LOG.info("Done. " + classifier.toString()); |
84 | return classifier; | 90 | return classifier; |
85 | } | 91 | } |
86 | } | 92 | } |
87 | 93 | ||
88 | - public static Map<String, TText> loadPreprocessedTexts(String path) { | ||
89 | - Map<String, TText> id2text = Maps.newHashMap(); | ||
90 | - for (File processedFullTextFile : new File(path).listFiles()) { | ||
91 | - TText processedFullText = loadThrifted(processedFullTextFile); | ||
92 | - id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText); | ||
93 | - } | ||
94 | - LOG.info(id2text.size() + " preprocessed texts found."); | ||
95 | - return id2text; | ||
96 | - } | ||
97 | - | ||
98 | - | ||
99 | - public static TText loadThrifted(File originalFile) { | ||
100 | - try (FileInputStream inputStream = new FileInputStream(originalFile)) { | ||
101 | - return loadThrifted(inputStream); | ||
102 | - } catch (IOException e) { | ||
103 | - LOG.error("Error reading serialized file: " + e); | ||
104 | - return null; | ||
105 | - } | ||
106 | - } | ||
107 | - | ||
108 | - public static TText loadThrifted(InputStream stream) { | ||
109 | - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | ||
110 | - return (TText) ois.readObject(); | ||
111 | - } catch (ClassNotFoundException | IOException e) { | ||
112 | - LOG.error("Error reading serialized file: " + e); | ||
113 | - return null; | ||
114 | - } | ||
115 | - } | ||
116 | - | ||
117 | public static List<String> tokenize(String text) { | 94 | public static List<String> tokenize(String text) { |
118 | return Arrays.asList(text.split("[^\\p{L}0-9]+")); | 95 | return Arrays.asList(text.split("[^\\p{L}0-9]+")); |
119 | } | 96 | } |
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java
@@ -14,7 +14,7 @@ public class UtilsTest { | @@ -14,7 +14,7 @@ public class UtilsTest { | ||
14 | @Test | 14 | @Test |
15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { | 15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { |
16 | try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | 16 | try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
17 | - TText text = Utils.loadThrifted(stream); | 17 | + TText text = ThriftUtils.loadThriftTextFromStream(stream); |
18 | assertEquals(26, text.getParagraphs().size()); | 18 | assertEquals(26, text.getParagraphs().size()); |
19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); | 19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); |
20 | } | 20 | } |
nicolas-eval/pom.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <parent> | ||
6 | + <artifactId>nicolas-container</artifactId> | ||
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
8 | + <version>1.0-SNAPSHOT</version> | ||
9 | + </parent> | ||
10 | + <modelVersion>4.0.0</modelVersion> | ||
11 | + | ||
12 | + <artifactId>nicolas-eval</artifactId> | ||
13 | + | ||
14 | + <dependencies> | ||
15 | + <!-- project --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
18 | + <artifactId>nicolas-lib</artifactId> | ||
19 | + </dependency> | ||
20 | + <dependency> | ||
21 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | + <artifactId>nicolas-common</artifactId> | ||
23 | + </dependency> | ||
24 | + | ||
25 | + <!-- internal --> | ||
26 | + <dependency> | ||
27 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
28 | + <artifactId>eval</artifactId> | ||
29 | + </dependency> | ||
30 | + | ||
31 | + <!-- third party --> | ||
32 | + <dependency> | ||
33 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
34 | + <artifactId>weka-stable</artifactId> | ||
35 | + </dependency> | ||
36 | + <dependency> | ||
37 | + <groupId>org.apache.commons</groupId> | ||
38 | + <artifactId>commons-lang3</artifactId> | ||
39 | + </dependency> | ||
40 | + <dependency> | ||
41 | + <groupId>com.google.guava</groupId> | ||
42 | + <artifactId>guava</artifactId> | ||
43 | + </dependency> | ||
44 | + | ||
45 | + <!-- logging --> | ||
46 | + <dependency> | ||
47 | + <groupId>org.slf4j</groupId> | ||
48 | + <artifactId>slf4j-api</artifactId> | ||
49 | + </dependency> | ||
50 | + <dependency> | ||
51 | + <groupId>org.slf4j</groupId> | ||
52 | + <artifactId>slf4j-simple</artifactId> | ||
53 | + </dependency> | ||
54 | + | ||
55 | + </dependencies> | ||
56 | +</project> | ||
0 | \ No newline at end of file | 57 | \ No newline at end of file |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | + | ||
5 | +import java.io.IOException; | ||
6 | +import java.io.InputStream; | ||
7 | +import java.util.List; | ||
8 | +import java.util.Set; | ||
9 | +import java.util.stream.Collectors; | ||
10 | + | ||
11 | +public class Constants { | ||
12 | + | ||
13 | + private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; | ||
14 | + | ||
15 | + private Constants() { | ||
16 | + } | ||
17 | + | ||
18 | + public static Set<String> loadTestTextIds() throws IOException { | ||
19 | + try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { | ||
20 | + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); | ||
21 | + return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | ||
22 | + } | ||
23 | + } | ||
24 | +} |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.eval.Main; | ||
4 | + | ||
5 | +public class Evaluate { | ||
6 | + | ||
7 | + private Evaluate() { | ||
8 | + } | ||
9 | + | ||
10 | + public static void main(String[] args) { | ||
11 | + String goldDirPath = "data/summaries-gold"; | ||
12 | + String systemDirPath = "data/summaries"; | ||
13 | + Main.main(new String[]{goldDirPath, systemDirPath}); | ||
14 | + } | ||
15 | +} | ||
0 | \ No newline at end of file | 16 | \ No newline at end of file |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/ExtractGoldSummaries.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
4 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | ||
5 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | ||
6 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
7 | + | ||
8 | +import javax.xml.bind.JAXBException; | ||
9 | +import java.io.File; | ||
10 | +import java.io.IOException; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.stream.Collectors; | ||
14 | + | ||
15 | +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | ||
16 | + | ||
17 | +public class ExtractGoldSummaries { | ||
18 | + | ||
19 | + private ExtractGoldSummaries() { | ||
20 | + } | ||
21 | + | ||
22 | + public static void main(String[] args) throws IOException, JAXBException { | ||
23 | + File corpusDir = new File("data/corpus/PSC_1.0/data"); | ||
24 | + File targetDir = new File("data/summaries-gold"); | ||
25 | + targetDir.mkdir(); | ||
26 | + | ||
27 | + Set<String> testTextIds = loadTestTextIds(); | ||
28 | + File[] files = corpusDir.listFiles(); | ||
29 | + if (files != null) { | ||
30 | + for (File file : files) { | ||
31 | + Text text = PSC_IO.readText(file); | ||
32 | + if (!testTextIds.contains(text.getId())) | ||
33 | + continue; | ||
34 | + | ||
35 | + List<Summary> goldSummaries = text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals("abstract") && summary.getRatio().equals(20)).collect(Collectors.toList()); | ||
36 | + | ||
37 | + for (Summary summary : goldSummaries) { | ||
38 | + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | ||
39 | + Utils.writeStringToFile(summary.getBody(), targetFile); | ||
40 | + } | ||
41 | + } | ||
42 | + } | ||
43 | + } | ||
44 | + | ||
45 | +} |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | ||
2 | + | ||
3 | +import com.google.common.collect.Maps; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
12 | + | ||
13 | +import java.io.File; | ||
14 | +import java.io.IOException; | ||
15 | +import java.util.List; | ||
16 | +import java.util.Map; | ||
17 | +import java.util.Set; | ||
18 | + | ||
19 | +import static java.util.stream.Collectors.toList; | ||
20 | +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | ||
21 | + | ||
22 | +public class SummarizeTestCorpus { | ||
23 | + | ||
24 | + private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); | ||
25 | + | ||
26 | + | ||
27 | + private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; | ||
28 | + private static final double SUMMARY_RATIO = 0.2; | ||
29 | + | ||
30 | + private SummarizeTestCorpus() { | ||
31 | + } | ||
32 | + | ||
33 | + public static void main(String[] args) throws IOException, NicolasException { | ||
34 | + File thriftedCorpusDir = new File("data/preprocessed"); | ||
35 | + File targetDir = new File("data/summaries"); | ||
36 | + targetDir.mkdir(); | ||
37 | + | ||
38 | + Set<String> testTextIds = loadTestTextIds(); | ||
39 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); | ||
40 | + LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); | ||
41 | + | ||
42 | + Map<String, String> id2summary = summarizeTexts(id2preprocessedText); | ||
43 | + LOG.info("Texts summarized."); | ||
44 | + | ||
45 | + saveSummariesToFolder(id2summary, targetDir); | ||
46 | + LOG.info("Texts saved to {} folder.", targetDir); | ||
47 | + } | ||
48 | + | ||
49 | + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { | ||
50 | + Map<String, String> id2summary = Maps.newHashMap(); | ||
51 | + Nicolas nicolas = new Nicolas(); | ||
52 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
53 | + TText text = entry.getValue(); | ||
54 | + int targetSize = calculateTargetSize(text); | ||
55 | + String summary = nicolas.summarizeThrift(text, targetSize); | ||
56 | + id2summary.put(entry.getKey(), summary); | ||
57 | + } | ||
58 | + return id2summary; | ||
59 | + } | ||
60 | + | ||
61 | + private static int calculateTargetSize(TText text) { | ||
62 | + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | ||
63 | + StringBuilder body = new StringBuilder(); | ||
64 | + for (TSentence sentence : sentences) | ||
65 | + body.append(Utils.loadSentence2Orth(sentence)).append(" "); | ||
66 | + | ||
67 | + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | ||
68 | + return (int) (SUMMARY_RATIO * tokenCount); | ||
69 | + } | ||
70 | + | ||
71 | + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { | ||
72 | + for (Map.Entry<String, String> entry : id2summary.entrySet()) { | ||
73 | + String textId = entry.getKey(); | ||
74 | + String summary = entry.getValue(); | ||
75 | + String targetFileName = textId + SUMMARY_FILE_SUFFIX; | ||
76 | + Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); | ||
77 | + } | ||
78 | + } | ||
79 | + | ||
80 | +} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java renamed to nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.eval.search; |
2 | 2 | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.apache.commons.lang3.tuple.Pair; | 4 | import org.apache.commons.lang3.tuple.Pair; |
@@ -35,13 +35,13 @@ import java.util.Random; | @@ -35,13 +35,13 @@ import java.util.Random; | ||
35 | import java.util.logging.LogManager; | 35 | import java.util.logging.LogManager; |
36 | 36 | ||
37 | 37 | ||
38 | -class CrossvalidateCommon { | 38 | +class Crossvalidate { |
39 | 39 | ||
40 | - private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class); | 40 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); |
41 | 41 | ||
42 | private static final int NUM_FOLDS = 10; | 42 | private static final int NUM_FOLDS = 10; |
43 | 43 | ||
44 | - private CrossvalidateCommon() { | 44 | + private Crossvalidate() { |
45 | } | 45 | } |
46 | 46 | ||
47 | static void crossvalidateClassifiers(String datasetPath) throws IOException { | 47 | static void crossvalidateClassifiers(String datasetPath) throws IOException { |
@@ -77,7 +77,7 @@ class CrossvalidateCommon { | @@ -77,7 +77,7 @@ class CrossvalidateCommon { | ||
77 | new DecisionTable(), new JRip(), new PART(), | 77 | new DecisionTable(), new JRip(), new PART(), |
78 | createAttributeSelectedClassifier()}).parallel().map(cls -> { | 78 | createAttributeSelectedClassifier()}).parallel().map(cls -> { |
79 | String name = cls.getClass().getSimpleName(); | 79 | String name = cls.getClass().getSimpleName(); |
80 | - double acc = 0; | 80 | + double acc; |
81 | Evaluation eval; | 81 | Evaluation eval; |
82 | try { | 82 | try { |
83 | eval = new Evaluation(instances); | 83 | eval = new Evaluation(instances); |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt renamed to nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt
nicolas-lib/pom.xml
@@ -35,7 +35,7 @@ | @@ -35,7 +35,7 @@ | ||
35 | <!-- third party --> | 35 | <!-- third party --> |
36 | <dependency> | 36 | <dependency> |
37 | <groupId>nz.ac.waikato.cms.weka</groupId> | 37 | <groupId>nz.ac.waikato.cms.weka</groupId> |
38 | - <artifactId>weka-dev</artifactId> | 38 | + <artifactId>weka-stable</artifactId> |
39 | </dependency> | 39 | </dependency> |
40 | <dependency> | 40 | <dependency> |
41 | <groupId>org.apache.commons</groupId> | 41 | <groupId>org.apache.commons</groupId> |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java
@@ -18,18 +18,18 @@ import java.util.Set; | @@ -18,18 +18,18 @@ import java.util.Set; | ||
18 | 18 | ||
19 | import static java.util.stream.Collectors.toList; | 19 | import static java.util.stream.Collectors.toList; |
20 | 20 | ||
21 | -public class ThriftUtils { | 21 | +public class InstanceUtils { |
22 | 22 | ||
23 | - private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | 23 | + private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); |
24 | 24 | ||
25 | - private ThriftUtils() { | 25 | + private InstanceUtils() { |
26 | } | 26 | } |
27 | 27 | ||
28 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { | 28 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { |
29 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 29 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
30 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); | 30 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); |
31 | 31 | ||
32 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | 32 | + LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size()); |
33 | Map<TMention, Instance> mention2instance = Maps.newHashMap(); | 33 | Map<TMention, Instance> mention2instance = Maps.newHashMap(); |
34 | for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { | 34 | for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { |
35 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | 35 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); |
@@ -39,7 +39,7 @@ public class ThriftUtils { | @@ -39,7 +39,7 @@ public class ThriftUtils { | ||
39 | } | 39 | } |
40 | mention2instance.put(tMention, instance); | 40 | mention2instance.put(tMention, instance); |
41 | } | 41 | } |
42 | - LOG.info("Extracted features of " + mention2instance.size() + " mentions."); | 42 | + LOG.info("Extracted features of {} mentions.", mention2instance.size()); |
43 | return mention2instance; | 43 | return mention2instance; |
44 | } | 44 | } |
45 | 45 | ||
@@ -47,7 +47,7 @@ public class ThriftUtils { | @@ -47,7 +47,7 @@ public class ThriftUtils { | ||
47 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 47 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
48 | Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); | 48 | Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); |
49 | 49 | ||
50 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | 50 | + LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size()); |
51 | Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); | 51 | Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); |
52 | for (TSentence sentence : sentences) { | 52 | for (TSentence sentence : sentences) { |
53 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); | 53 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); |
@@ -57,7 +57,7 @@ public class ThriftUtils { | @@ -57,7 +57,7 @@ public class ThriftUtils { | ||
57 | } | 57 | } |
58 | sentence2instance.put(sentence, instance); | 58 | sentence2instance.put(sentence, instance); |
59 | } | 59 | } |
60 | - LOG.info("Extracted features of " + sentence2instance.size() + " sentences."); | 60 | + LOG.info("Extracted features of {} sentences.", sentence2instance.size()); |
61 | return sentence2instance; | 61 | return sentence2instance; |
62 | } | 62 | } |
63 | } | 63 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -29,14 +29,18 @@ public class Nicolas { | @@ -29,14 +29,18 @@ public class Nicolas { | ||
29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; | 29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | private final ZeroFeatureExtractor zeroFeatureExtractor; | 30 | private final ZeroFeatureExtractor zeroFeatureExtractor; |
31 | 31 | ||
32 | - public Nicolas() throws IOException, ClassNotFoundException { | ||
33 | - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | ||
34 | - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
35 | - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
36 | - | ||
37 | - mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
38 | - sentenceFeatureExtractor = new SentenceFeatureExtractor(); | ||
39 | - zeroFeatureExtractor = new ZeroFeatureExtractor(); | 32 | + public Nicolas() throws NicolasException { |
33 | + try { | ||
34 | + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | ||
35 | + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | ||
36 | + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | ||
37 | + | ||
38 | + mentionFeatureExtractor = new MentionFeatureExtractor(); | ||
39 | + sentenceFeatureExtractor = new SentenceFeatureExtractor(); | ||
40 | + zeroFeatureExtractor = new ZeroFeatureExtractor(); | ||
41 | + } catch (IOException e) { | ||
42 | + throw new NicolasException(e); | ||
43 | + } | ||
40 | } | 44 | } |
41 | 45 | ||
42 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { | 46 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { |
@@ -59,17 +63,17 @@ public class Nicolas { | @@ -59,17 +63,17 @@ public class Nicolas { | ||
59 | } | 63 | } |
60 | 64 | ||
61 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { | 65 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { |
62 | - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 66 | + List<TSentence> sentences = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
63 | 67 | ||
64 | Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); | 68 | Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); |
65 | 69 | ||
66 | - List<TSentence> sortedSents = Lists.newArrayList(sents); | ||
67 | - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | 70 | + List<TSentence> sortedSentences = Lists.newArrayList(sentences); |
71 | + sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); | ||
68 | 72 | ||
69 | int size = 0; | 73 | int size = 0; |
70 | Random r = new Random(1); | 74 | Random r = new Random(1); |
71 | Set<TSentence> summary = Sets.newHashSet(); | 75 | Set<TSentence> summary = Sets.newHashSet(); |
72 | - for (TSentence sent : sortedSents) { | 76 | + for (TSentence sent : sortedSentences) { |
73 | size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); | 77 | size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); |
74 | if (r.nextDouble() > 0.4 && size > targetSize) | 78 | if (r.nextDouble() > 0.4 && size > targetSize) |
75 | break; | 79 | break; |
@@ -78,7 +82,7 @@ public class Nicolas { | @@ -78,7 +82,7 @@ public class Nicolas { | ||
78 | break; | 82 | break; |
79 | } | 83 | } |
80 | List<TSentence> selectedSentences = Lists.newArrayList(); | 84 | List<TSentence> selectedSentences = Lists.newArrayList(); |
81 | - for (TSentence sent : sents) { | 85 | + for (TSentence sent : sentences) { |
82 | if (summary.contains(sent)) | 86 | if (summary.contains(sent)) |
83 | selectedSentences.add(sent); | 87 | selectedSentences.add(sent); |
84 | } | 88 | } |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
0 → 100644
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
@@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory; | @@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; |
12 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 14 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 16 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
@@ -34,15 +35,15 @@ public class ApplyModel { | @@ -34,15 +35,15 @@ public class ApplyModel { | ||
34 | private static final String TARGET_DIR = "corpora/summaries"; | 35 | private static final String TARGET_DIR = "corpora/summaries"; |
35 | 36 | ||
36 | public static void main(String[] args) throws Exception { | 37 | public static void main(String[] args) throws Exception { |
37 | - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | 38 | + Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
38 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | 39 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
39 | 40 | ||
40 | - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH); | 41 | + Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); |
41 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); | 42 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
42 | 43 | ||
43 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); | 44 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); |
44 | 45 | ||
45 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); | 46 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); |
46 | int i = 1; | 47 | int i = 1; |
47 | double avgSize = 0; | 48 | double avgSize = 0; |
48 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | 49 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
@@ -91,7 +92,7 @@ public class ApplyModel { | @@ -91,7 +92,7 @@ public class ApplyModel { | ||
91 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | 92 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
92 | 93 | ||
93 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | 94 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
94 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | 95 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
95 | 96 | ||
96 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 97 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
97 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 98 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
@@ -5,9 +5,9 @@ import com.google.common.collect.Maps; | @@ -5,9 +5,9 @@ import com.google.common.collect.Maps; | ||
5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 7 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | ||
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | ||
11 | import weka.core.Attribute; | 11 | import weka.core.Attribute; |
12 | 12 | ||
13 | import java.io.IOException; | 13 | import java.io.IOException; |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
@@ -5,7 +5,7 @@ import org.slf4j.Logger; | @@ -5,7 +5,7 @@ import org.slf4j.Logger; | ||
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; |
9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | import weka.classifiers.Classifier; | 10 | import weka.classifiers.Classifier; |
11 | import weka.core.Instance; | 11 | import weka.core.Instance; |
@@ -25,7 +25,7 @@ public class MentionModel { | @@ -25,7 +25,7 @@ public class MentionModel { | ||
25 | Set<TMention> goodMentions = Sets.newHashSet(); | 25 | Set<TMention> goodMentions = Sets.newHashSet(); |
26 | 26 | ||
27 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 27 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
28 | - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor); | 28 | + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); |
29 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { | 29 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
30 | Instance instance = entry.getValue(); | 30 | Instance instance = entry.getValue(); |
31 | instance.setDataset(instances); | 31 | instance.setDataset(instances); |
@@ -34,7 +34,7 @@ public class MentionModel { | @@ -34,7 +34,7 @@ public class MentionModel { | ||
34 | if (good) | 34 | if (good) |
35 | goodMentions.add(entry.getKey()); | 35 | goodMentions.add(entry.getKey()); |
36 | } | 36 | } |
37 | - LOG.info("Classified " + goodMentions.size() + " mentions as good."); | 37 | + LOG.info("Classified {} mentions as good.", goodMentions.size()); |
38 | return goodMentions; | 38 | return goodMentions; |
39 | } | 39 | } |
40 | 40 |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
@@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; | ||
2 | 2 | ||
3 | import com.google.common.collect.Maps; | 3 | import com.google.common.collect.Maps; |
4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | ||
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
7 | import weka.core.Attribute; | 7 | import weka.core.Attribute; |
8 | 8 | ||
9 | import java.util.List; | 9 | import java.util.List; |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
@@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory; | @@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory; | ||
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; |
10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
11 | import weka.classifiers.Classifier; | 11 | import weka.classifiers.Classifier; |
12 | import weka.core.Instance; | 12 | import weka.core.Instance; |
@@ -24,7 +24,7 @@ public class SentenceModel { | @@ -24,7 +24,7 @@ public class SentenceModel { | ||
24 | 24 | ||
25 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { | 25 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { |
26 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); | 26 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
27 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | 27 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); |
28 | 28 | ||
29 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); | 29 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
30 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 30 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
@@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | ||
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 11 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; |
12 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
13 | import weka.core.Attribute; | 13 | import weka.core.Attribute; |
14 | 14 | ||
15 | import java.util.List; | 15 | import java.util.List; |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
@@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; | @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; | ||
5 | import org.junit.Test; | 5 | import org.junit.Test; |
6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 8 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; |
9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
10 | 10 | ||
11 | import java.io.IOException; | 11 | import java.io.IOException; |
12 | import java.io.InputStream; | 12 | import java.io.InputStream; |
@@ -47,7 +47,7 @@ public class CandidateFinderTest { | @@ -47,7 +47,7 @@ public class CandidateFinderTest { | ||
47 | 47 | ||
48 | private FeatureHelper loadSampleTextHelper() throws IOException { | 48 | private FeatureHelper loadSampleTextHelper() throws IOException { |
49 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { | 49 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
50 | - return new FeatureHelper(Utils.loadThrifted(stream)); | 50 | + return new FeatureHelper(ThriftUtils.loadThriftTextFromStream(stream)); |
51 | } | 51 | } |
52 | } | 52 | } |
53 | } | 53 | } |
54 | \ No newline at end of file | 54 | \ No newline at end of file |
nicolas-train/pom.xml
@@ -25,6 +25,11 @@ | @@ -25,6 +25,11 @@ | ||
25 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 25 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
26 | <artifactId>nicolas-multiservice</artifactId> | 26 | <artifactId>nicolas-multiservice</artifactId> |
27 | </dependency> | 27 | </dependency> |
28 | + <dependency> | ||
29 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
30 | + <artifactId>nicolas-model</artifactId> | ||
31 | + <scope>runtime</scope> | ||
32 | + </dependency> | ||
28 | 33 | ||
29 | <!-- internal --> | 34 | <!-- internal --> |
30 | <dependency> | 35 | <dependency> |
@@ -39,7 +44,7 @@ | @@ -39,7 +44,7 @@ | ||
39 | <!-- third party --> | 44 | <!-- third party --> |
40 | <dependency> | 45 | <dependency> |
41 | <groupId>nz.ac.waikato.cms.weka</groupId> | 46 | <groupId>nz.ac.waikato.cms.weka</groupId> |
42 | - <artifactId>weka-dev</artifactId> | 47 | + <artifactId>weka-stable</artifactId> |
43 | </dependency> | 48 | </dependency> |
44 | <dependency> | 49 | <dependency> |
45 | <groupId>org.apache.commons</groupId> | 50 | <groupId>org.apache.commons</groupId> |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
@@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest; | @@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest; | ||
5 | 5 | ||
6 | public class ModelConstants { | 6 | public class ModelConstants { |
7 | 7 | ||
8 | - public static final String MENTION_DATASET_PATH = "mentions_train.arff"; | ||
9 | - public static final String SENTENCE_DATASET_PATH = "sentences_train.arff"; | ||
10 | - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | 8 | + public static final String MENTION_DATASET_PATH = "data/arff/mentions_train.arff"; |
9 | + public static final String SENTENCE_DATASET_PATH = "data/arff/sentences_train.arff"; | ||
10 | + public static final String ZERO_DATASET_PATH = "data/arff/zeros_train.arff"; | ||
11 | 11 | ||
12 | - private static final int NUM_ITERATIONS = 16; | 12 | + private static final int NUM_ITERATIONS = 250; |
13 | private static final int NUM_EXECUTION_SLOTS = 8; | 13 | private static final int NUM_EXECUTION_SLOTS = 8; |
14 | private static final int SEED = 0; | 14 | private static final int SEED = 0; |
15 | 15 | ||
@@ -26,17 +26,17 @@ public class ModelConstants { | @@ -26,17 +26,17 @@ public class ModelConstants { | ||
26 | 26 | ||
27 | public static Classifier getSentenceClassifier() { | 27 | public static Classifier getSentenceClassifier() { |
28 | RandomForest classifier = new RandomForest(); | 28 | RandomForest classifier = new RandomForest(); |
29 | - classifier.setNumIterations(16); | ||
30 | - classifier.setSeed(0); | ||
31 | - classifier.setNumExecutionSlots(8); | 29 | + classifier.setNumIterations(NUM_ITERATIONS); |
30 | + classifier.setSeed(SEED); | ||
31 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | ||
32 | return classifier; | 32 | return classifier; |
33 | } | 33 | } |
34 | 34 | ||
35 | public static Classifier getZeroClassifier() { | 35 | public static Classifier getZeroClassifier() { |
36 | RandomForest classifier = new RandomForest(); | 36 | RandomForest classifier = new RandomForest(); |
37 | - classifier.setNumIterations(16); | ||
38 | - classifier.setSeed(0); | ||
39 | - classifier.setNumExecutionSlots(8); | 37 | + classifier.setNumIterations(NUM_ITERATIONS); |
38 | + classifier.setSeed(SEED); | ||
39 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | ||
40 | return classifier; | 40 | return classifier; |
41 | } | 41 | } |
42 | 42 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
@@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common; | @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common; | ||
3 | import org.apache.commons.lang3.time.StopWatch; | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | import org.slf4j.Logger; | 4 | import org.slf4j.Logger; |
5 | import org.slf4j.LoggerFactory; | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; | ||
7 | import weka.classifiers.Classifier; | 6 | import weka.classifiers.Classifier; |
8 | import weka.core.Instances; | 7 | import weka.core.Instances; |
9 | import weka.core.converters.ArffLoader; | 8 | import weka.core.converters.ArffLoader; |
@@ -16,7 +15,7 @@ import java.util.logging.LogManager; | @@ -16,7 +15,7 @@ import java.util.logging.LogManager; | ||
16 | @SuppressWarnings("squid:S2118") | 15 | @SuppressWarnings("squid:S2118") |
17 | public class TrainModelCommon { | 16 | public class TrainModelCommon { |
18 | 17 | ||
19 | - private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class); | 18 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class); |
20 | 19 | ||
21 | private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; | 20 | private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; |
22 | 21 |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | ||
2 | - | ||
3 | -import com.google.common.base.Charsets; | ||
4 | -import com.google.common.collect.Maps; | ||
5 | -import com.google.common.io.Files; | ||
6 | -import org.slf4j.Logger; | ||
7 | -import org.slf4j.LoggerFactory; | ||
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
10 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | ||
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
13 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | ||
14 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
15 | -import weka.core.Instance; | ||
16 | -import weka.core.Instances; | ||
17 | -import weka.core.converters.ArffSaver; | ||
18 | - | ||
19 | -import java.io.File; | ||
20 | -import java.io.IOException; | ||
21 | -import java.util.Map; | ||
22 | - | ||
23 | - | ||
24 | -public class PrepareTrainingData { | ||
25 | - | ||
26 | - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | ||
27 | - | ||
28 | - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
29 | - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | ||
30 | - | ||
31 | - private PrepareTrainingData() { | ||
32 | - } | ||
33 | - | ||
34 | - public static void main(String[] args) throws IOException { | ||
35 | - | ||
36 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
37 | - Map<String, String> id2optimalSummary = loadOptimalSummaries(); | ||
38 | - | ||
39 | - MentionScorer mentionScorer = new MentionScorer(); | ||
40 | - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | ||
41 | - | ||
42 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
43 | - | ||
44 | - int i = 1; | ||
45 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
46 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
47 | - | ||
48 | - String id = entry.getKey(); | ||
49 | - TText preprocessedText = entry.getValue(); | ||
50 | - String optimalSummary = id2optimalSummary.get(id); | ||
51 | - if (optimalSummary == null) | ||
52 | - continue; | ||
53 | - Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | ||
54 | - | ||
55 | - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | ||
56 | - for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { | ||
57 | - TMention mention = entry2.getKey(); | ||
58 | - Instance instance = entry2.getValue(); | ||
59 | - instance.setDataset(instances); | ||
60 | - instance.setClassValue(mention2score.get(mention)); | ||
61 | - instances.add(instance); | ||
62 | - } | ||
63 | - } | ||
64 | - saveInstancesToFile(instances); | ||
65 | - } | ||
66 | - | ||
67 | - private static void saveInstancesToFile(Instances instances) throws IOException { | ||
68 | - ArffSaver saver = new ArffSaver(); | ||
69 | - saver.setInstances(instances); | ||
70 | - saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH)); | ||
71 | - saver.writeBatch(); | ||
72 | - } | ||
73 | - | ||
74 | - private static Map<String, String> loadOptimalSummaries() throws IOException { | ||
75 | - Map<String, String> id2optimalSummary = Maps.newHashMap(); | ||
76 | - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
77 | - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
78 | - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
79 | - } | ||
80 | - LOG.info(id2optimalSummary.size() + " optimal summaries found."); | ||
81 | - return id2optimalSummary; | ||
82 | - } | ||
83 | - | ||
84 | - | ||
85 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | ||
2 | - | ||
3 | -import com.google.common.collect.Maps; | ||
4 | -import com.google.common.collect.Sets; | ||
5 | -import org.apache.commons.io.IOUtils; | ||
6 | -import org.slf4j.Logger; | ||
7 | -import org.slf4j.LoggerFactory; | ||
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | ||
11 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
12 | -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
13 | -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | ||
14 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
15 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
16 | -import weka.core.Instance; | ||
17 | -import weka.core.Instances; | ||
18 | -import weka.core.converters.ArffSaver; | ||
19 | - | ||
20 | -import java.io.File; | ||
21 | -import java.io.FileReader; | ||
22 | -import java.io.IOException; | ||
23 | -import java.util.List; | ||
24 | -import java.util.Map; | ||
25 | -import java.util.Set; | ||
26 | - | ||
27 | -public class PrepareTrainingData { | ||
28 | - | ||
29 | - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | ||
30 | - | ||
31 | - private static final String IDS_PATH = "corpora/summaries_dev"; | ||
32 | - private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | ||
33 | - private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; | ||
34 | - | ||
35 | - private PrepareTrainingData() { | ||
36 | - } | ||
37 | - | ||
38 | - public static void main(String[] args) throws IOException { | ||
39 | - | ||
40 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | ||
41 | - Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); | ||
42 | - | ||
43 | - ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH); | ||
44 | - ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | ||
45 | - | ||
46 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
47 | - | ||
48 | - int i = 1; | ||
49 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
50 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
51 | - | ||
52 | - String textId = entry.getKey(); | ||
53 | - | ||
54 | - TText text = entry.getValue(); | ||
55 | - Set<String> sentenceIds = id2sentIds.get(textId); | ||
56 | - FeatureHelper featureHelper = new FeatureHelper(text); | ||
57 | - | ||
58 | - List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | ||
59 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | ||
60 | - | ||
61 | - for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | ||
62 | - boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | ||
63 | - Instance instance = entry2.getValue(); | ||
64 | - instance.setDataset(instances); | ||
65 | - instance.setClassValue(good ? 1 : 0); | ||
66 | - instances.add(instance); | ||
67 | - } | ||
68 | - } | ||
69 | - | ||
70 | - saveInstancesToFile(instances); | ||
71 | - } | ||
72 | - | ||
73 | - | ||
74 | - private static void saveInstancesToFile(Instances instances) throws IOException { | ||
75 | - ArffSaver saver = new ArffSaver(); | ||
76 | - saver.setInstances(instances); | ||
77 | - saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH)); | ||
78 | - saver.writeBatch(); | ||
79 | - } | ||
80 | - | ||
81 | - private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | ||
82 | - Map<String, Set<String>> result = Maps.newHashMap(); | ||
83 | - for (File f : new File(idsPath).listFiles()) { | ||
84 | - String id = f.getName().split("_")[0]; | ||
85 | - List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | ||
86 | - result.put(id, Sets.newHashSet(sentenceIds)); | ||
87 | - } | ||
88 | - return result; | ||
89 | - } | ||
90 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
@@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser; | @@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser; | ||
6 | import org.apache.commons.csv.CSVRecord; | 6 | import org.apache.commons.csv.CSVRecord; |
7 | import org.apache.commons.csv.QuoteMode; | 7 | import org.apache.commons.csv.QuoteMode; |
8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | 9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; |
10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | 10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; |
11 | 11 | ||
12 | import java.io.IOException; | 12 | import java.io.IOException; |
@@ -21,8 +21,8 @@ public class ZeroScorer { | @@ -21,8 +21,8 @@ public class ZeroScorer { | ||
21 | 21 | ||
22 | private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); | 22 | private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); |
23 | 23 | ||
24 | - public ZeroScorer(String goldZerosPath) throws IOException { | ||
25 | - try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath); | 24 | + public ZeroScorer(String goldZerosResourcePath) throws IOException { |
25 | + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosResourcePath); | ||
26 | InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); | 26 | InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); |
27 | CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { | 27 | CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { |
28 | List<CSVRecord> records = parser.getRecords(); | 28 | List<CSVRecord> records = parser.getRecords(); |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadAndPreprocessCorpus.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | import net.lingala.zip4j.core.ZipFile; | 3 | import net.lingala.zip4j.core.ZipFile; |
4 | import org.apache.commons.io.FileUtils; | 4 | import org.apache.commons.io.FileUtils; |
5 | import org.slf4j.Logger; | 5 | import org.slf4j.Logger; |
6 | import org.slf4j.LoggerFactory; | 6 | import org.slf4j.LoggerFactory; |
7 | -import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main; | ||
8 | 7 | ||
9 | import java.io.File; | 8 | import java.io.File; |
10 | import java.net.URL; | 9 | import java.net.URL; |
@@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus { | @@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus { | ||
45 | 44 | ||
46 | File preprocessed = new File(WORKING_DIR, "preprocessed"); | 45 | File preprocessed = new File(WORKING_DIR, "preprocessed"); |
47 | createFolder(preprocessed.getPath()); | 46 | createFolder(preprocessed.getPath()); |
48 | - Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | 47 | + Preprocess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); |
49 | } | 48 | } |
50 | 49 | ||
51 | private static File createFolder(String path) { | 50 | private static File createFolder(String path) { |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | import com.google.common.base.Charsets; | 3 | import com.google.common.base.Charsets; |
4 | import com.google.common.collect.Maps; | 4 | import com.google.common.collect.Maps; |
5 | +import com.google.common.collect.Sets; | ||
5 | import com.google.common.io.Files; | 6 | import com.google.common.io.Files; |
7 | +import org.apache.commons.io.IOUtils; | ||
6 | import org.slf4j.Logger; | 8 | import org.slf4j.Logger; |
7 | import org.slf4j.LoggerFactory; | 9 | import org.slf4j.LoggerFactory; |
8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | 13 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; |
12 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | 14 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
15 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | ||
13 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | 16 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
17 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | ||
14 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | 18 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; | 19 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; | 20 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
17 | import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | 21 | import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; |
22 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.MentionScorer; | ||
23 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.SentenceScorer; | ||
24 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.ZeroScorer; | ||
25 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | ||
26 | +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | ||
27 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | ||
28 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | ||
18 | import weka.classifiers.Classifier; | 29 | import weka.classifiers.Classifier; |
19 | import weka.core.Instance; | 30 | import weka.core.Instance; |
20 | import weka.core.Instances; | 31 | import weka.core.Instances; |
21 | import weka.core.converters.ArffSaver; | 32 | import weka.core.converters.ArffSaver; |
22 | 33 | ||
23 | import java.io.File; | 34 | import java.io.File; |
35 | +import java.io.FileReader; | ||
24 | import java.io.IOException; | 36 | import java.io.IOException; |
37 | +import java.io.InputStream; | ||
38 | +import java.util.List; | ||
25 | import java.util.Map; | 39 | import java.util.Map; |
26 | import java.util.Set; | 40 | import java.util.Set; |
27 | - | 41 | +import java.util.function.Predicate; |
42 | +import java.util.stream.Collectors; | ||
28 | 43 | ||
29 | public class PrepareTrainingData { | 44 | public class PrepareTrainingData { |
30 | 45 | ||
31 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | 46 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); |
32 | 47 | ||
33 | - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | ||
34 | - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | 48 | + private static final String THRIFT_TEXTS_PATH = "data/preprocessed"; |
49 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "data/summaries-optimal"; | ||
50 | + private static final String SUMMARY_SENTENCE_IDS = "data/summaries-sentence-ids"; | ||
51 | + | ||
52 | + private static final String ZERO_TRAINING_DATA_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv"; | ||
53 | + private static final String TRAIN_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt"; | ||
35 | 54 | ||
36 | private PrepareTrainingData() { | 55 | private PrepareTrainingData() { |
37 | } | 56 | } |
38 | 57 | ||
39 | public static void main(String[] args) throws Exception { | 58 | public static void main(String[] args) throws Exception { |
59 | + Set<String> trainTextIds = loadTrainTextIds(); | ||
60 | + | ||
61 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(THRIFT_TEXTS_PATH), trainTextIds::contains); | ||
62 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); | ||
63 | + | ||
64 | + prepareMentionsDataset(id2preprocessedText, id2optimalSummary); | ||
65 | + prepareSentencesDataset(id2preprocessedText, id2optimalSummary); | ||
66 | + prepareZerosDataset(id2preprocessedText); | ||
67 | + } | ||
68 | + | ||
69 | + public static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { | ||
70 | + MentionScorer mentionScorer = new MentionScorer(); | ||
71 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | ||
72 | + | ||
73 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
74 | + | ||
75 | + int i = 1; | ||
76 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
77 | + LOG.info("{}/{}", i++, id2preprocessedText.size()); | ||
78 | + | ||
79 | + String id = entry.getKey(); | ||
80 | + TText preprocessedText = entry.getValue(); | ||
81 | + String optimalSummary = id2optimalSummary.get(id); | ||
82 | + if (optimalSummary == null) | ||
83 | + continue; | ||
84 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | ||
85 | + | ||
86 | + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | ||
87 | + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { | ||
88 | + TMention mention = entry2.getKey(); | ||
89 | + Instance instance = entry2.getValue(); | ||
90 | + instance.setDataset(instances); | ||
91 | + instance.setClassValue(mention2score.get(mention)); | ||
92 | + instances.add(instance); | ||
93 | + } | ||
94 | + } | ||
95 | + saveInstancesToFile(instances, new File(ModelConstants.MENTION_DATASET_PATH)); | ||
96 | + } | ||
97 | + | ||
98 | + private static Set<String> loadTrainTextIds() throws IOException { | ||
99 | + try (InputStream inputStream = PrepareTrainingData.class.getResourceAsStream(TRAIN_TEXT_IDS_RESOURCE_PATH)) { | ||
100 | + List<String> testTextIds = IOUtils.readLines(inputStream, Constants.ENCODING); | ||
101 | + return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | ||
102 | + } | ||
103 | + } | ||
40 | 104 | ||
41 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | ||
42 | - Map<String, String> id2optimalSummary = loadOptimalSummaries(); | 105 | + public static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { |
43 | 106 | ||
44 | SentenceScorer sentenceScorer = new SentenceScorer(); | 107 | SentenceScorer sentenceScorer = new SentenceScorer(); |
45 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); | 108 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
46 | 109 | ||
47 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | 110 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
48 | 111 | ||
49 | - Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | 112 | + Classifier classifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); |
50 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); | 113 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); |
51 | 114 | ||
52 | int i = 1; | 115 | int i = 1; |
53 | for (String textId : id2preprocessedText.keySet()) { | 116 | for (String textId : id2preprocessedText.keySet()) { |
54 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | 117 | + LOG.info("{}/{}", i++, id2preprocessedText.size()); |
55 | 118 | ||
56 | TText preprocessedText = id2preprocessedText.get(textId); | 119 | TText preprocessedText = id2preprocessedText.get(textId); |
57 | String optimalSummary = id2optimalSummary.get(textId); | 120 | String optimalSummary = id2optimalSummary.get(textId); |
@@ -64,7 +127,7 @@ public class PrepareTrainingData { | @@ -64,7 +127,7 @@ public class PrepareTrainingData { | ||
64 | // Set<TMention> goodMentions | 127 | // Set<TMention> goodMentions |
65 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); | 128 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); |
66 | 129 | ||
67 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | 130 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); |
68 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { | 131 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
69 | TSentence sentence = entry.getKey(); | 132 | TSentence sentence = entry.getKey(); |
70 | Instance instance = entry.getValue(); | 133 | Instance instance = entry.getValue(); |
@@ -73,25 +136,74 @@ public class PrepareTrainingData { | @@ -73,25 +136,74 @@ public class PrepareTrainingData { | ||
73 | instances.add(instance); | 136 | instances.add(instance); |
74 | } | 137 | } |
75 | } | 138 | } |
76 | - saveInstancesToFile(instances); | 139 | + saveInstancesToFile(instances, new File(ModelConstants.SENTENCE_DATASET_PATH)); |
140 | + } | ||
141 | + | ||
142 | + public static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException { | ||
143 | + | ||
144 | + Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS); | ||
145 | + | ||
146 | + ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_DATA_RESOURCE_PATH); | ||
147 | + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | ||
148 | + | ||
149 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | ||
150 | + | ||
151 | + int i = 1; | ||
152 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | ||
153 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | ||
154 | + | ||
155 | + String textId = entry.getKey(); | ||
156 | + | ||
157 | + TText text = entry.getValue(); | ||
158 | + Set<String> sentenceIds = id2sentIds.get(textId); | ||
159 | + FeatureHelper featureHelper = new FeatureHelper(text); | ||
160 | + | ||
161 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | ||
162 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | ||
163 | + | ||
164 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | ||
165 | + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | ||
166 | + Instance instance = entry2.getValue(); | ||
167 | + instance.setDataset(instances); | ||
168 | + instance.setClassValue(good ? 1 : 0); | ||
169 | + instances.add(instance); | ||
170 | + } | ||
171 | + } | ||
172 | + | ||
173 | + saveInstancesToFile(instances, new File(ModelConstants.ZERO_DATASET_PATH)); | ||
174 | + } | ||
175 | + | ||
176 | + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | ||
177 | + Map<String, Set<String>> result = Maps.newHashMap(); | ||
178 | + File[] files = new File(idsPath).listFiles(); | ||
179 | + if (files != null) | ||
180 | + for (File f : files) { | ||
181 | + String id = f.getName().split("_")[0]; | ||
182 | + List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | ||
183 | + result.put(id, Sets.newHashSet(sentenceIds)); | ||
184 | + } | ||
185 | + return result; | ||
77 | } | 186 | } |
78 | 187 | ||
79 | - private static void saveInstancesToFile(Instances instances) throws IOException { | 188 | + private static void saveInstancesToFile(Instances instances, File targetFile) throws IOException { |
80 | ArffSaver saver = new ArffSaver(); | 189 | ArffSaver saver = new ArffSaver(); |
81 | saver.setInstances(instances); | 190 | saver.setInstances(instances); |
82 | - saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH)); | 191 | + saver.setFile(targetFile); |
83 | saver.writeBatch(); | 192 | saver.writeBatch(); |
84 | } | 193 | } |
85 | 194 | ||
86 | - private static Map<String, String> loadOptimalSummaries() throws IOException { | 195 | + private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException { |
87 | Map<String, String> id2optimalSummary = Maps.newHashMap(); | 196 | Map<String, String> id2optimalSummary = Maps.newHashMap(); |
88 | - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | ||
89 | - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
90 | - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | ||
91 | - } | ||
92 | - LOG.info(id2optimalSummary.size() + " optimal summaries found."); | 197 | + File[] files = new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles(); |
198 | + if (files != null) | ||
199 | + for (File optimalSummaryFile : files) { | ||
200 | + String textId = optimalSummaryFile.getName().split("_")[0]; | ||
201 | + if (!idFilter.test(textId)) | ||
202 | + continue; | ||
203 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | ||
204 | + id2optimalSummary.put(textId, optimalSummary); | ||
205 | + } | ||
206 | + LOG.info("{} optimal summaries found.", id2optimalSummary.size()); | ||
93 | return id2optimalSummary; | 207 | return id2optimalSummary; |
94 | } | 208 | } |
95 | - | ||
96 | - | ||
97 | } | 209 | } |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/Preprocess.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.preprocess; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
4 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
@@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | @@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | ||
9 | import java.io.File; | 9 | import java.io.File; |
10 | import java.util.Arrays; | 10 | import java.util.Arrays; |
11 | 11 | ||
12 | -public class Main { | 12 | +public class Preprocess { |
13 | 13 | ||
14 | - private static final Logger LOG = LoggerFactory.getLogger(Main.class); | 14 | + private static final Logger LOG = LoggerFactory.getLogger(Preprocess.class); |
15 | 15 | ||
16 | private static final String CORPUS_FILE_SUFFIX = ".xml"; | 16 | private static final String CORPUS_FILE_SUFFIX = ".xml"; |
17 | private static final String OUTPUT_FILE_SUFFIX = ".thrift"; | 17 | private static final String OUTPUT_FILE_SUFFIX = ".thrift"; |
18 | 18 | ||
19 | - private Main() { | 19 | + private Preprocess() { |
20 | } | 20 | } |
21 | 21 | ||
22 | public static void main(String[] args) { | 22 | public static void main(String[] args) { |
23 | if (args.length != 2) { | 23 | if (args.length != 2) { |
24 | - LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | 24 | + LOG.error("Wrong usage! Try " + Preprocess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); |
25 | return; | 25 | return; |
26 | } | 26 | } |
27 | File corpusDir = new File(args[0]); | 27 | File corpusDir = new File(args[0]); |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; |
2 | 2 | ||
3 | import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel; | 3 | import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.TrainMentionModel; |
4 | import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel; | 4 | import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.TrainSentenceModel; |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | - | ||
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | - | ||
5 | - | ||
6 | -public class CrossvalidateMention { | ||
7 | - | ||
8 | - private CrossvalidateMention() { | ||
9 | - } | ||
10 | - | ||
11 | - public static void main(String[] args) throws Exception { | ||
12 | - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH); | ||
13 | - } | ||
14 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | - | ||
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | - | ||
5 | - | ||
6 | -public class CrossvalidateSentence { | ||
7 | - | ||
8 | - private CrossvalidateSentence() { | ||
9 | - } | ||
10 | - | ||
11 | - public static void main(String[] args) throws Exception { | ||
12 | - CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH); | ||
13 | - } | ||
14 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | ||
2 | - | ||
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | ||
4 | - | ||
5 | - | ||
6 | -public class CrossvalidateZero { | ||
7 | - | ||
8 | - private CrossvalidateZero() { | ||
9 | - } | ||
10 | - | ||
11 | - public static void main(String[] args) throws Exception { | ||
12 | - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH); | ||
13 | - } | ||
14 | -} |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv
pom.xml
@@ -18,6 +18,7 @@ | @@ -18,6 +18,7 @@ | ||
18 | <module>nicolas-train</module> | 18 | <module>nicolas-train</module> |
19 | <module>nicolas-common</module> | 19 | <module>nicolas-common</module> |
20 | <module>nicolas-multiservice</module> | 20 | <module>nicolas-multiservice</module> |
21 | + <module>nicolas-eval</module> | ||
21 | </modules> | 22 | </modules> |
22 | 23 | ||
23 | <properties> | 24 | <properties> |
@@ -27,10 +28,11 @@ | @@ -27,10 +28,11 @@ | ||
27 | 28 | ||
28 | <pscapi.version>1.0</pscapi.version> | 29 | <pscapi.version>1.0</pscapi.version> |
29 | <utils.version>1.0</utils.version> | 30 | <utils.version>1.0</utils.version> |
31 | + <eval.version>1.0</eval.version> | ||
30 | 32 | ||
31 | <commons-csv.version>1.4</commons-csv.version> | 33 | <commons-csv.version>1.4</commons-csv.version> |
32 | <guava.version>21.0</guava.version> | 34 | <guava.version>21.0</guava.version> |
33 | - <weka-dev.version>3.9.1</weka-dev.version> | 35 | + <weka-stable.version>3.8.1</weka-stable.version> |
34 | <commons-lang3.version>3.5</commons-lang3.version> | 36 | <commons-lang3.version>3.5</commons-lang3.version> |
35 | <commons-io.version>2.5</commons-io.version> | 37 | <commons-io.version>2.5</commons-io.version> |
36 | <slf4j-api.version>1.7.22</slf4j-api.version> | 38 | <slf4j-api.version>1.7.22</slf4j-api.version> |
@@ -98,6 +100,11 @@ | @@ -98,6 +100,11 @@ | ||
98 | <artifactId>utils</artifactId> | 100 | <artifactId>utils</artifactId> |
99 | <version>${utils.version}</version> | 101 | <version>${utils.version}</version> |
100 | </dependency> | 102 | </dependency> |
103 | + <dependency> | ||
104 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
105 | + <artifactId>eval</artifactId> | ||
106 | + <version>${eval.version}</version> | ||
107 | + </dependency> | ||
101 | 108 | ||
102 | <!-- third party --> | 109 | <!-- third party --> |
103 | <dependency> | 110 | <dependency> |
@@ -112,8 +119,8 @@ | @@ -112,8 +119,8 @@ | ||
112 | </dependency> | 119 | </dependency> |
113 | <dependency> | 120 | <dependency> |
114 | <groupId>nz.ac.waikato.cms.weka</groupId> | 121 | <groupId>nz.ac.waikato.cms.weka</groupId> |
115 | - <artifactId>weka-dev</artifactId> | ||
116 | - <version>${weka-dev.version}</version> | 122 | + <artifactId>weka-stable</artifactId> |
123 | + <version>${weka-stable.version}</version> | ||
117 | <exclusions> | 124 | <exclusions> |
118 | <exclusion> | 125 | <exclusion> |
119 | <groupId>org.slf4j</groupId> | 126 | <groupId>org.slf4j</groupId> |