Commit 2169abf847196b768600339e9e18d7ab3fe70f3d
1 parent
cb490cab
WIP
Showing
44 changed files
with
529 additions
and
351 deletions
.gitignore
nicolas-cli/README.md
... | ... | @@ -3,6 +3,8 @@ |
3 | 3 | This module contains a sample command-line application, which uses the Nicolas library to summarize a chosen input text file. |
4 | 4 | The summary is written to the target output file. Additionally, the user needs to specify the desired number of tokens in the summary. |
5 | 5 | |
6 | +Be aware that the summarizer requires internet access and a working Multiservice instance (multiservice.nlp.ipipan.waw.pl). | |
7 | + | |
6 | 8 | ## Installation |
7 | 9 | |
8 | 10 | mvn clean install |
... | ... |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java
... | ... | @@ -3,10 +3,9 @@ package pl.waw.ipipan.zil.summ.nicolas.cli; |
3 | 3 | import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
5 | 5 | import pl.waw.ipipan.zil.summ.nicolas.Nicolas; |
6 | +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | |
6 | 7 | import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
7 | 8 | |
8 | -import java.io.IOException; | |
9 | - | |
10 | 9 | public class Main { |
11 | 10 | |
12 | 11 | private static final Logger LOG = LoggerFactory.getLogger(Main.class); |
... | ... | @@ -26,7 +25,7 @@ public class Main { |
26 | 25 | try { |
27 | 26 | nicolas = new Nicolas(); |
28 | 27 | preprocessor = new Preprocessor(); |
29 | - } catch (IOException | ClassNotFoundException e) { | |
28 | + } catch (NicolasException e) { | |
30 | 29 | LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); |
31 | 30 | return; |
32 | 31 | } |
... | ... |
nicolas-common/pom.xml
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.common; | |
2 | + | |
3 | +import com.google.common.base.Predicates; | |
4 | +import com.google.common.collect.Maps; | |
5 | +import org.slf4j.Logger; | |
6 | +import org.slf4j.LoggerFactory; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
8 | + | |
9 | +import java.io.File; | |
10 | +import java.io.FileInputStream; | |
11 | +import java.io.IOException; | |
12 | +import java.io.InputStream; | |
13 | +import java.util.Map; | |
14 | +import java.util.function.Predicate; | |
15 | + | |
16 | +public class ThriftUtils { | |
17 | + | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | |
19 | + | |
20 | + private ThriftUtils() { | |
21 | + } | |
22 | + | |
23 | + public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) { | |
24 | + Map<String, TText> id2text = Maps.newHashMap(); | |
25 | + File[] files = folder.listFiles(); | |
26 | + if (files != null) { | |
27 | + for (File processedFullTextFile : files) { | |
28 | + String textId = processedFullTextFile.getName().split("\\.")[0]; | |
29 | + if (!idFilter.test(textId)) | |
30 | + continue; | |
31 | + TText processedFullText = loadThriftTextFromFile(processedFullTextFile); | |
32 | + id2text.put(textId, processedFullText); | |
33 | + } | |
34 | + } | |
35 | + LOG.info("{} preprocessed texts found.", id2text.size()); | |
36 | + return id2text; | |
37 | + } | |
38 | + | |
39 | + public static Map<String, TText> loadThriftTextsFromFolder(File folder) { | |
40 | + return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue()); | |
41 | + } | |
42 | + | |
43 | + public static TText loadThriftTextFromFile(File originalFile) { | |
44 | + try (FileInputStream inputStream = new FileInputStream(originalFile)) { | |
45 | + return loadThriftTextFromStream(inputStream); | |
46 | + } catch (IOException e) { | |
47 | + LOG.error("Error reading serialized Thrift file", e); | |
48 | + return null; | |
49 | + } | |
50 | + } | |
51 | + | |
52 | + public static TText loadThriftTextFromStream(InputStream stream) { | |
53 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | |
54 | + return (TText) ois.readObject(); | |
55 | + } catch (ClassNotFoundException | IOException e) { | |
56 | + LOG.error("Error reading serialized Thrift stream", e); | |
57 | + return null; | |
58 | + } | |
59 | + } | |
60 | + | |
61 | +} | |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
... | ... | @@ -28,6 +28,12 @@ public class Utils { |
28 | 28 | private Utils() { |
29 | 29 | } |
30 | 30 | |
31 | + public static void writeStringToFile(String string, File file) throws IOException { | |
32 | + try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { | |
33 | + bw.append(string); | |
34 | + } | |
35 | + } | |
36 | + | |
31 | 37 | public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { |
32 | 38 | LOG.info("Loading classifier from path: {}...", modelResourcePath); |
33 | 39 | try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { |
... | ... | @@ -76,44 +82,15 @@ public class Utils { |
76 | 82 | return instances; |
77 | 83 | } |
78 | 84 | |
79 | - public static Classifier loadClassifier(String path) throws IOException, ClassNotFoundException { | |
85 | + public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { | |
80 | 86 | LOG.info("Loading classifier..."); |
81 | - try (ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path))) { | |
87 | + try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { | |
82 | 88 | Classifier classifier = (Classifier) ois.readObject(); |
83 | 89 | LOG.info("Done. " + classifier.toString()); |
84 | 90 | return classifier; |
85 | 91 | } |
86 | 92 | } |
87 | 93 | |
88 | - public static Map<String, TText> loadPreprocessedTexts(String path) { | |
89 | - Map<String, TText> id2text = Maps.newHashMap(); | |
90 | - for (File processedFullTextFile : new File(path).listFiles()) { | |
91 | - TText processedFullText = loadThrifted(processedFullTextFile); | |
92 | - id2text.put(processedFullTextFile.getName().split("\\.")[0], processedFullText); | |
93 | - } | |
94 | - LOG.info(id2text.size() + " preprocessed texts found."); | |
95 | - return id2text; | |
96 | - } | |
97 | - | |
98 | - | |
99 | - public static TText loadThrifted(File originalFile) { | |
100 | - try (FileInputStream inputStream = new FileInputStream(originalFile)) { | |
101 | - return loadThrifted(inputStream); | |
102 | - } catch (IOException e) { | |
103 | - LOG.error("Error reading serialized file: " + e); | |
104 | - return null; | |
105 | - } | |
106 | - } | |
107 | - | |
108 | - public static TText loadThrifted(InputStream stream) { | |
109 | - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | |
110 | - return (TText) ois.readObject(); | |
111 | - } catch (ClassNotFoundException | IOException e) { | |
112 | - LOG.error("Error reading serialized file: " + e); | |
113 | - return null; | |
114 | - } | |
115 | - } | |
116 | - | |
117 | 94 | public static List<String> tokenize(String text) { |
118 | 95 | return Arrays.asList(text.split("[^\\p{L}0-9]+")); |
119 | 96 | } |
... | ... |
nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java
... | ... | @@ -14,7 +14,7 @@ public class UtilsTest { |
14 | 14 | @Test |
15 | 15 | public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { |
16 | 16 | try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
17 | - TText text = Utils.loadThrifted(stream); | |
17 | + TText text = ThriftUtils.loadThriftTextFromStream(stream); | |
18 | 18 | assertEquals(26, text.getParagraphs().size()); |
19 | 19 | assertEquals(2, text.getParagraphs().get(4).getSentences().size()); |
20 | 20 | } |
... | ... |
nicolas-eval/pom.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | |
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | |
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | |
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
5 | + <parent> | |
6 | + <artifactId>nicolas-container</artifactId> | |
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
8 | + <version>1.0-SNAPSHOT</version> | |
9 | + </parent> | |
10 | + <modelVersion>4.0.0</modelVersion> | |
11 | + | |
12 | + <artifactId>nicolas-eval</artifactId> | |
13 | + | |
14 | + <dependencies> | |
15 | + <!-- project --> | |
16 | + <dependency> | |
17 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
18 | + <artifactId>nicolas-lib</artifactId> | |
19 | + </dependency> | |
20 | + <dependency> | |
21 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
22 | + <artifactId>nicolas-common</artifactId> | |
23 | + </dependency> | |
24 | + | |
25 | + <!-- internal --> | |
26 | + <dependency> | |
27 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
28 | + <artifactId>eval</artifactId> | |
29 | + </dependency> | |
30 | + | |
31 | + <!-- third party --> | |
32 | + <dependency> | |
33 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
34 | + <artifactId>weka-stable</artifactId> | |
35 | + </dependency> | |
36 | + <dependency> | |
37 | + <groupId>org.apache.commons</groupId> | |
38 | + <artifactId>commons-lang3</artifactId> | |
39 | + </dependency> | |
40 | + <dependency> | |
41 | + <groupId>com.google.guava</groupId> | |
42 | + <artifactId>guava</artifactId> | |
43 | + </dependency> | |
44 | + | |
45 | + <!-- logging --> | |
46 | + <dependency> | |
47 | + <groupId>org.slf4j</groupId> | |
48 | + <artifactId>slf4j-api</artifactId> | |
49 | + </dependency> | |
50 | + <dependency> | |
51 | + <groupId>org.slf4j</groupId> | |
52 | + <artifactId>slf4j-simple</artifactId> | |
53 | + </dependency> | |
54 | + | |
55 | + </dependencies> | |
56 | +</project> | |
0 | 57 | \ No newline at end of file |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import org.apache.commons.io.IOUtils; | |
4 | + | |
5 | +import java.io.IOException; | |
6 | +import java.io.InputStream; | |
7 | +import java.util.List; | |
8 | +import java.util.Set; | |
9 | +import java.util.stream.Collectors; | |
10 | + | |
11 | +public class Constants { | |
12 | + | |
13 | + private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; | |
14 | + | |
15 | + private Constants() { | |
16 | + } | |
17 | + | |
18 | + public static Set<String> loadTestTextIds() throws IOException { | |
19 | + try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { | |
20 | + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); | |
21 | + return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | |
22 | + } | |
23 | + } | |
24 | +} | |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.eval.Main; | |
4 | + | |
5 | +public class Evaluate { | |
6 | + | |
7 | + private Evaluate() { | |
8 | + } | |
9 | + | |
10 | + public static void main(String[] args) { | |
11 | + String goldDirPath = "data/summaries-gold"; | |
12 | + String systemDirPath = "data/summaries"; | |
13 | + Main.main(new String[]{goldDirPath, systemDirPath}); | |
14 | + } | |
15 | +} | |
0 | 16 | \ No newline at end of file |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/ExtractGoldSummaries.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
4 | +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | |
5 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; | |
6 | +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | |
7 | + | |
8 | +import javax.xml.bind.JAXBException; | |
9 | +import java.io.File; | |
10 | +import java.io.IOException; | |
11 | +import java.util.List; | |
12 | +import java.util.Set; | |
13 | +import java.util.stream.Collectors; | |
14 | + | |
15 | +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | |
16 | + | |
17 | +public class ExtractGoldSummaries { | |
18 | + | |
19 | + private ExtractGoldSummaries() { | |
20 | + } | |
21 | + | |
22 | + public static void main(String[] args) throws IOException, JAXBException { | |
23 | + File corpusDir = new File("data/corpus/PSC_1.0/data"); | |
24 | + File targetDir = new File("data/summaries-gold"); | |
25 | + targetDir.mkdir(); | |
26 | + | |
27 | + Set<String> testTextIds = loadTestTextIds(); | |
28 | + File[] files = corpusDir.listFiles(); | |
29 | + if (files != null) { | |
30 | + for (File file : files) { | |
31 | + Text text = PSC_IO.readText(file); | |
32 | + if (!testTextIds.contains(text.getId())) | |
33 | + continue; | |
34 | + | |
35 | + List<Summary> goldSummaries = text.getSummaries().getSummary().stream().filter(summary -> summary.getType().equals("abstract") && summary.getRatio().equals(20)).collect(Collectors.toList()); | |
36 | + | |
37 | + for (Summary summary : goldSummaries) { | |
38 | + File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); | |
39 | + Utils.writeStringToFile(summary.getBody(), targetFile); | |
40 | + } | |
41 | + } | |
42 | + } | |
43 | + } | |
44 | + | |
45 | +} | |
... | ... |
nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval; | |
2 | + | |
3 | +import com.google.common.collect.Maps; | |
4 | +import org.slf4j.Logger; | |
5 | +import org.slf4j.LoggerFactory; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
12 | + | |
13 | +import java.io.File; | |
14 | +import java.io.IOException; | |
15 | +import java.util.List; | |
16 | +import java.util.Map; | |
17 | +import java.util.Set; | |
18 | + | |
19 | +import static java.util.stream.Collectors.toList; | |
20 | +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; | |
21 | + | |
22 | +public class SummarizeTestCorpus { | |
23 | + | |
24 | + private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); | |
25 | + | |
26 | + | |
27 | + private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; | |
28 | + private static final double SUMMARY_RATIO = 0.2; | |
29 | + | |
30 | + private SummarizeTestCorpus() { | |
31 | + } | |
32 | + | |
33 | + public static void main(String[] args) throws IOException, NicolasException { | |
34 | + File thriftedCorpusDir = new File("data/preprocessed"); | |
35 | + File targetDir = new File("data/summaries"); | |
36 | + targetDir.mkdir(); | |
37 | + | |
38 | + Set<String> testTextIds = loadTestTextIds(); | |
39 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); | |
40 | + LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); | |
41 | + | |
42 | + Map<String, String> id2summary = summarizeTexts(id2preprocessedText); | |
43 | + LOG.info("Texts summarized."); | |
44 | + | |
45 | + saveSummariesToFolder(id2summary, targetDir); | |
46 | + LOG.info("Texts saved to {} folder.", targetDir); | |
47 | + } | |
48 | + | |
49 | + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { | |
50 | + Map<String, String> id2summary = Maps.newHashMap(); | |
51 | + Nicolas nicolas = new Nicolas(); | |
52 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
53 | + TText text = entry.getValue(); | |
54 | + int targetSize = calculateTargetSize(text); | |
55 | + String summary = nicolas.summarizeThrift(text, targetSize); | |
56 | + id2summary.put(entry.getKey(), summary); | |
57 | + } | |
58 | + return id2summary; | |
59 | + } | |
60 | + | |
61 | + private static int calculateTargetSize(TText text) { | |
62 | + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
63 | + StringBuilder body = new StringBuilder(); | |
64 | + for (TSentence sentence : sentences) | |
65 | + body.append(Utils.loadSentence2Orth(sentence)).append(" "); | |
66 | + | |
67 | + int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); | |
68 | + return (int) (SUMMARY_RATIO * tokenCount); | |
69 | + } | |
70 | + | |
71 | + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { | |
72 | + for (Map.Entry<String, String> entry : id2summary.entrySet()) { | |
73 | + String textId = entry.getKey(); | |
74 | + String summary = entry.getValue(); | |
75 | + String targetFileName = textId + SUMMARY_FILE_SUFFIX; | |
76 | + Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); | |
77 | + } | |
78 | + } | |
79 | + | |
80 | +} | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateCommon.java renamed to nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.eval.search; | |
2 | 2 | |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.apache.commons.lang3.tuple.Pair; |
... | ... | @@ -35,13 +35,13 @@ import java.util.Random; |
35 | 35 | import java.util.logging.LogManager; |
36 | 36 | |
37 | 37 | |
38 | -class CrossvalidateCommon { | |
38 | +class Crossvalidate { | |
39 | 39 | |
40 | - private static final Logger LOG = LoggerFactory.getLogger(CrossvalidateCommon.class); | |
40 | + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); | |
41 | 41 | |
42 | 42 | private static final int NUM_FOLDS = 10; |
43 | 43 | |
44 | - private CrossvalidateCommon() { | |
44 | + private Crossvalidate() { | |
45 | 45 | } |
46 | 46 | |
47 | 47 | static void crossvalidateClassifiers(String datasetPath) throws IOException { |
... | ... | @@ -77,7 +77,7 @@ class CrossvalidateCommon { |
77 | 77 | new DecisionTable(), new JRip(), new PART(), |
78 | 78 | createAttributeSelectedClassifier()}).parallel().map(cls -> { |
79 | 79 | String name = cls.getClass().getSimpleName(); |
80 | - double acc = 0; | |
80 | + double acc; | |
81 | 81 | Evaluation eval; |
82 | 82 | try { |
83 | 83 | eval = new Evaluation(instances); |
... | ... |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/test_ids.txt renamed to nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt
nicolas-lib/pom.xml
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/ThriftUtils.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java
... | ... | @@ -18,18 +18,18 @@ import java.util.Set; |
18 | 18 | |
19 | 19 | import static java.util.stream.Collectors.toList; |
20 | 20 | |
21 | -public class ThriftUtils { | |
21 | +public class InstanceUtils { | |
22 | 22 | |
23 | - private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); | |
23 | + private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); | |
24 | 24 | |
25 | - private ThriftUtils() { | |
25 | + private InstanceUtils() { | |
26 | 26 | } |
27 | 27 | |
28 | 28 | public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { |
29 | 29 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
30 | 30 | Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); |
31 | 31 | |
32 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each mention."); | |
32 | + LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size()); | |
33 | 33 | Map<TMention, Instance> mention2instance = Maps.newHashMap(); |
34 | 34 | for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { |
35 | 35 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); |
... | ... | @@ -39,7 +39,7 @@ public class ThriftUtils { |
39 | 39 | } |
40 | 40 | mention2instance.put(tMention, instance); |
41 | 41 | } |
42 | - LOG.info("Extracted features of " + mention2instance.size() + " mentions."); | |
42 | + LOG.info("Extracted features of {} mentions.", mention2instance.size()); | |
43 | 43 | return mention2instance; |
44 | 44 | } |
45 | 45 | |
... | ... | @@ -47,7 +47,7 @@ public class ThriftUtils { |
47 | 47 | List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
48 | 48 | Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); |
49 | 49 | |
50 | - LOG.info("Extracting " + featureExtractor.getAttributesList().size() + " features of each sentence."); | |
50 | + LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size()); | |
51 | 51 | Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); |
52 | 52 | for (TSentence sentence : sentences) { |
53 | 53 | Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); |
... | ... | @@ -57,7 +57,7 @@ public class ThriftUtils { |
57 | 57 | } |
58 | 58 | sentence2instance.put(sentence, instance); |
59 | 59 | } |
60 | - LOG.info("Extracted features of " + sentence2instance.size() + " sentences."); | |
60 | + LOG.info("Extracted features of {} sentences.", sentence2instance.size()); | |
61 | 61 | return sentence2instance; |
62 | 62 | } |
63 | 63 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
... | ... | @@ -29,14 +29,18 @@ public class Nicolas { |
29 | 29 | private final SentenceFeatureExtractor sentenceFeatureExtractor; |
30 | 30 | private final ZeroFeatureExtractor zeroFeatureExtractor; |
31 | 31 | |
32 | - public Nicolas() throws IOException, ClassNotFoundException { | |
33 | - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
34 | - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
35 | - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
36 | - | |
37 | - mentionFeatureExtractor = new MentionFeatureExtractor(); | |
38 | - sentenceFeatureExtractor = new SentenceFeatureExtractor(); | |
39 | - zeroFeatureExtractor = new ZeroFeatureExtractor(); | |
32 | + public Nicolas() throws NicolasException { | |
33 | + try { | |
34 | + mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
35 | + sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
36 | + zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); | |
37 | + | |
38 | + mentionFeatureExtractor = new MentionFeatureExtractor(); | |
39 | + sentenceFeatureExtractor = new SentenceFeatureExtractor(); | |
40 | + zeroFeatureExtractor = new ZeroFeatureExtractor(); | |
41 | + } catch (IOException e) { | |
42 | + throw new NicolasException(e); | |
43 | + } | |
40 | 44 | } |
41 | 45 | |
42 | 46 | public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { |
... | ... | @@ -59,17 +63,17 @@ public class Nicolas { |
59 | 63 | } |
60 | 64 | |
61 | 65 | private List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { |
62 | - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
66 | + List<TSentence> sentences = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); | |
63 | 67 | |
64 | 68 | Map<TSentence, Double> sentence2score = SentenceModel.scoreSentences(thrifted, goodMentions, sentenceModel, sentenceFeatureExtractor); |
65 | 69 | |
66 | - List<TSentence> sortedSents = Lists.newArrayList(sents); | |
67 | - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); | |
70 | + List<TSentence> sortedSentences = Lists.newArrayList(sentences); | |
71 | + sortedSentences.sort(Comparator.comparing(sentence2score::get).reversed()); | |
68 | 72 | |
69 | 73 | int size = 0; |
70 | 74 | Random r = new Random(1); |
71 | 75 | Set<TSentence> summary = Sets.newHashSet(); |
72 | - for (TSentence sent : sortedSents) { | |
76 | + for (TSentence sent : sortedSentences) { | |
73 | 77 | size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); |
74 | 78 | if (r.nextDouble() > 0.4 && size > targetSize) |
75 | 79 | break; |
... | ... | @@ -78,7 +82,7 @@ public class Nicolas { |
78 | 82 | break; |
79 | 83 | } |
80 | 84 | List<TSentence> selectedSentences = Lists.newArrayList(); |
81 | - for (TSentence sent : sents) { | |
85 | + for (TSentence sent : sentences) { | |
82 | 86 | if (summary.contains(sent)) |
83 | 87 | selectedSentences.add(sent); |
84 | 88 | } |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/NicolasException.java
0 → 100644
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java
... | ... | @@ -8,8 +8,9 @@ import org.slf4j.LoggerFactory; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
12 | 12 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
13 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
13 | 14 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
14 | 15 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | 16 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
... | ... | @@ -34,15 +35,15 @@ public class ApplyModel { |
34 | 35 | private static final String TARGET_DIR = "corpora/summaries"; |
35 | 36 | |
36 | 37 | public static void main(String[] args) throws Exception { |
37 | - Classifier mentionClassifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | |
38 | + Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
38 | 39 | MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); |
39 | 40 | |
40 | - Classifier sentenceClassifier = Utils.loadClassifier(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
41 | + Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); | |
41 | 42 | SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); |
42 | 43 | |
43 | 44 | ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); |
44 | 45 | |
45 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(TEST_PREPROCESSED_DATA_PATH); | |
46 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); | |
46 | 47 | int i = 1; |
47 | 48 | double avgSize = 0; |
48 | 49 | for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { |
... | ... | @@ -91,7 +92,7 @@ public class ApplyModel { |
91 | 92 | List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); |
92 | 93 | |
93 | 94 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
94 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
95 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
95 | 96 | |
96 | 97 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
97 | 98 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
... | ... |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureExtractor.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureExtractor.java
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/FeatureHelper.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/features/Interpretation.java renamed to nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/Interpretation.java
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java
... | ... | @@ -5,9 +5,9 @@ import com.google.common.collect.Maps; |
5 | 5 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
6 | 6 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
7 | 7 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.Interpretation; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
10 | +import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; | |
11 | 11 | import weka.core.Attribute; |
12 | 12 | |
13 | 13 | import java.io.IOException; |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java
... | ... | @@ -5,7 +5,7 @@ import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
9 | 9 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
10 | 10 | import weka.classifiers.Classifier; |
11 | 11 | import weka.core.Instance; |
... | ... | @@ -25,7 +25,7 @@ public class MentionModel { |
25 | 25 | Set<TMention> goodMentions = Sets.newHashSet(); |
26 | 26 | |
27 | 27 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
28 | - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(text, featureExtractor); | |
28 | + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); | |
29 | 29 | for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { |
30 | 30 | Instance instance = entry.getValue(); |
31 | 31 | instance.setDataset(instances); |
... | ... | @@ -34,7 +34,7 @@ public class MentionModel { |
34 | 34 | if (good) |
35 | 35 | goodMentions.add(entry.getKey()); |
36 | 36 | } |
37 | - LOG.info("Classified " + goodMentions.size() + " mentions as good."); | |
37 | + LOG.info("Classified {} mentions as good.", goodMentions.size()); | |
38 | 38 | return goodMentions; |
39 | 39 | } |
40 | 40 | |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceFeatureExtractor.java
... | ... | @@ -2,8 +2,8 @@ package pl.waw.ipipan.zil.summ.nicolas.sentence; |
2 | 2 | |
3 | 3 | import com.google.common.collect.Maps; |
4 | 4 | import pl.waw.ipipan.zil.multiservice.thrift.types.*; |
5 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
6 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
5 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
6 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
7 | 7 | import weka.core.Attribute; |
8 | 8 | |
9 | 9 | import java.util.List; |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java
... | ... | @@ -6,7 +6,7 @@ import org.slf4j.LoggerFactory; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
11 | 11 | import weka.classifiers.Classifier; |
12 | 12 | import weka.core.Instance; |
... | ... | @@ -24,7 +24,7 @@ public class SentenceModel { |
24 | 24 | |
25 | 25 | public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { |
26 | 26 | Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); |
27 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
27 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); | |
28 | 28 | |
29 | 29 | Map<TSentence, Double> sentence2score = Maps.newHashMap(); |
30 | 30 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
... | ... |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java
... | ... | @@ -8,8 +8,8 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | 8 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
9 | 9 | import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureExtractor; | |
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
11 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; | |
12 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
13 | 13 | import weka.core.Attribute; |
14 | 14 | |
15 | 15 | import java.util.List; |
... | ... |
nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java
... | ... | @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; |
5 | 5 | import org.junit.Test; |
6 | 6 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
7 | 7 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
8 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
8 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
10 | 10 | |
11 | 11 | import java.io.IOException; |
12 | 12 | import java.io.InputStream; |
... | ... | @@ -47,7 +47,7 @@ public class CandidateFinderTest { |
47 | 47 | |
48 | 48 | private FeatureHelper loadSampleTextHelper() throws IOException { |
49 | 49 | try (InputStream stream = CandidateFinderTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { |
50 | - return new FeatureHelper(Utils.loadThrifted(stream)); | |
50 | + return new FeatureHelper(ThriftUtils.loadThriftTextFromStream(stream)); | |
51 | 51 | } |
52 | 52 | } |
53 | 53 | } |
54 | 54 | \ No newline at end of file |
... | ... |
nicolas-train/pom.xml
... | ... | @@ -25,6 +25,11 @@ |
25 | 25 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
26 | 26 | <artifactId>nicolas-multiservice</artifactId> |
27 | 27 | </dependency> |
28 | + <dependency> | |
29 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
30 | + <artifactId>nicolas-model</artifactId> | |
31 | + <scope>runtime</scope> | |
32 | + </dependency> | |
28 | 33 | |
29 | 34 | <!-- internal --> |
30 | 35 | <dependency> |
... | ... | @@ -39,7 +44,7 @@ |
39 | 44 | <!-- third party --> |
40 | 45 | <dependency> |
41 | 46 | <groupId>nz.ac.waikato.cms.weka</groupId> |
42 | - <artifactId>weka-dev</artifactId> | |
47 | + <artifactId>weka-stable</artifactId> | |
43 | 48 | </dependency> |
44 | 49 | <dependency> |
45 | 50 | <groupId>org.apache.commons</groupId> |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/ModelConstants.java
... | ... | @@ -5,11 +5,11 @@ import weka.classifiers.trees.RandomForest; |
5 | 5 | |
6 | 6 | public class ModelConstants { |
7 | 7 | |
8 | - public static final String MENTION_DATASET_PATH = "mentions_train.arff"; | |
9 | - public static final String SENTENCE_DATASET_PATH = "sentences_train.arff"; | |
10 | - public static final String ZERO_DATASET_PATH = "zeros_train.arff"; | |
8 | + public static final String MENTION_DATASET_PATH = "data/arff/mentions_train.arff"; | |
9 | + public static final String SENTENCE_DATASET_PATH = "data/arff/sentences_train.arff"; | |
10 | + public static final String ZERO_DATASET_PATH = "data/arff/zeros_train.arff"; | |
11 | 11 | |
12 | - private static final int NUM_ITERATIONS = 16; | |
12 | + private static final int NUM_ITERATIONS = 250; | |
13 | 13 | private static final int NUM_EXECUTION_SLOTS = 8; |
14 | 14 | private static final int SEED = 0; |
15 | 15 | |
... | ... | @@ -26,17 +26,17 @@ public class ModelConstants { |
26 | 26 | |
27 | 27 | public static Classifier getSentenceClassifier() { |
28 | 28 | RandomForest classifier = new RandomForest(); |
29 | - classifier.setNumIterations(16); | |
30 | - classifier.setSeed(0); | |
31 | - classifier.setNumExecutionSlots(8); | |
29 | + classifier.setNumIterations(NUM_ITERATIONS); | |
30 | + classifier.setSeed(SEED); | |
31 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | |
32 | 32 | return classifier; |
33 | 33 | } |
34 | 34 | |
35 | 35 | public static Classifier getZeroClassifier() { |
36 | 36 | RandomForest classifier = new RandomForest(); |
37 | - classifier.setNumIterations(16); | |
38 | - classifier.setSeed(0); | |
39 | - classifier.setNumExecutionSlots(8); | |
37 | + classifier.setNumIterations(NUM_ITERATIONS); | |
38 | + classifier.setSeed(SEED); | |
39 | + classifier.setNumExecutionSlots(NUM_EXECUTION_SLOTS); | |
40 | 40 | return classifier; |
41 | 41 | } |
42 | 42 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/common/TrainModelCommon.java
... | ... | @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.model.common; |
3 | 3 | import org.apache.commons.lang3.time.StopWatch; |
4 | 4 | import org.slf4j.Logger; |
5 | 5 | import org.slf4j.LoggerFactory; |
6 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.TrainZeroModel; | |
7 | 6 | import weka.classifiers.Classifier; |
8 | 7 | import weka.core.Instances; |
9 | 8 | import weka.core.converters.ArffLoader; |
... | ... | @@ -16,7 +15,7 @@ import java.util.logging.LogManager; |
16 | 15 | @SuppressWarnings("squid:S2118") |
17 | 16 | public class TrainModelCommon { |
18 | 17 | |
19 | - private static final Logger LOG = LoggerFactory.getLogger(TrainZeroModel.class); | |
18 | + private static final Logger LOG = LoggerFactory.getLogger(TrainModelCommon.class); | |
20 | 19 | |
21 | 20 | private static final String TARGET_MODEL_DIR = "nicolas-model/src/main/resources"; |
22 | 21 | |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/mention/PrepareTrainingData.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.mention; | |
2 | - | |
3 | -import com.google.common.base.Charsets; | |
4 | -import com.google.common.collect.Maps; | |
5 | -import com.google.common.io.Files; | |
6 | -import org.slf4j.Logger; | |
7 | -import org.slf4j.LoggerFactory; | |
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
9 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
10 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
11 | -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | |
12 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
13 | -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; | |
14 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
15 | -import weka.core.Instance; | |
16 | -import weka.core.Instances; | |
17 | -import weka.core.converters.ArffSaver; | |
18 | - | |
19 | -import java.io.File; | |
20 | -import java.io.IOException; | |
21 | -import java.util.Map; | |
22 | - | |
23 | - | |
24 | -public class PrepareTrainingData { | |
25 | - | |
26 | - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | |
27 | - | |
28 | - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
29 | - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
30 | - | |
31 | - private PrepareTrainingData() { | |
32 | - } | |
33 | - | |
34 | - public static void main(String[] args) throws IOException { | |
35 | - | |
36 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
37 | - Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
38 | - | |
39 | - MentionScorer mentionScorer = new MentionScorer(); | |
40 | - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
41 | - | |
42 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
43 | - | |
44 | - int i = 1; | |
45 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
46 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | |
47 | - | |
48 | - String id = entry.getKey(); | |
49 | - TText preprocessedText = entry.getValue(); | |
50 | - String optimalSummary = id2optimalSummary.get(id); | |
51 | - if (optimalSummary == null) | |
52 | - continue; | |
53 | - Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | |
54 | - | |
55 | - Map<TMention, Instance> mention2instance = ThriftUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
56 | - for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { | |
57 | - TMention mention = entry2.getKey(); | |
58 | - Instance instance = entry2.getValue(); | |
59 | - instance.setDataset(instances); | |
60 | - instance.setClassValue(mention2score.get(mention)); | |
61 | - instances.add(instance); | |
62 | - } | |
63 | - } | |
64 | - saveInstancesToFile(instances); | |
65 | - } | |
66 | - | |
67 | - private static void saveInstancesToFile(Instances instances) throws IOException { | |
68 | - ArffSaver saver = new ArffSaver(); | |
69 | - saver.setInstances(instances); | |
70 | - saver.setFile(new File(ModelConstants.MENTION_DATASET_PATH)); | |
71 | - saver.writeBatch(); | |
72 | - } | |
73 | - | |
74 | - private static Map<String, String> loadOptimalSummaries() throws IOException { | |
75 | - Map<String, String> id2optimalSummary = Maps.newHashMap(); | |
76 | - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
77 | - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
78 | - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
79 | - } | |
80 | - LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
81 | - return id2optimalSummary; | |
82 | - } | |
83 | - | |
84 | - | |
85 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/PrepareTrainingData.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.zero; | |
2 | - | |
3 | -import com.google.common.collect.Maps; | |
4 | -import com.google.common.collect.Sets; | |
5 | -import org.apache.commons.io.IOUtils; | |
6 | -import org.slf4j.Logger; | |
7 | -import org.slf4j.LoggerFactory; | |
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | |
10 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
11 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
12 | -import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | |
13 | -import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | |
14 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
15 | -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
16 | -import weka.core.Instance; | |
17 | -import weka.core.Instances; | |
18 | -import weka.core.converters.ArffSaver; | |
19 | - | |
20 | -import java.io.File; | |
21 | -import java.io.FileReader; | |
22 | -import java.io.IOException; | |
23 | -import java.util.List; | |
24 | -import java.util.Map; | |
25 | -import java.util.Set; | |
26 | - | |
27 | -public class PrepareTrainingData { | |
28 | - | |
29 | - private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); | |
30 | - | |
31 | - private static final String IDS_PATH = "corpora/summaries_dev"; | |
32 | - private static final String THRIFTED_PATH = "corpora/preprocessed_full_texts/dev/"; | |
33 | - private static final String GOLD_ZEROS_PATH = "/zeros.tsv"; | |
34 | - | |
35 | - private PrepareTrainingData() { | |
36 | - } | |
37 | - | |
38 | - public static void main(String[] args) throws IOException { | |
39 | - | |
40 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(THRIFTED_PATH); | |
41 | - Map<String, Set<String>> id2sentIds = loadSentenceIds(IDS_PATH); | |
42 | - | |
43 | - ZeroScorer zeroScorer = new ZeroScorer(GOLD_ZEROS_PATH); | |
44 | - ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | |
45 | - | |
46 | - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
47 | - | |
48 | - int i = 1; | |
49 | - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
50 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | |
51 | - | |
52 | - String textId = entry.getKey(); | |
53 | - | |
54 | - TText text = entry.getValue(); | |
55 | - Set<String> sentenceIds = id2sentIds.get(textId); | |
56 | - FeatureHelper featureHelper = new FeatureHelper(text); | |
57 | - | |
58 | - List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | |
59 | - Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
60 | - | |
61 | - for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | |
62 | - boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | |
63 | - Instance instance = entry2.getValue(); | |
64 | - instance.setDataset(instances); | |
65 | - instance.setClassValue(good ? 1 : 0); | |
66 | - instances.add(instance); | |
67 | - } | |
68 | - } | |
69 | - | |
70 | - saveInstancesToFile(instances); | |
71 | - } | |
72 | - | |
73 | - | |
74 | - private static void saveInstancesToFile(Instances instances) throws IOException { | |
75 | - ArffSaver saver = new ArffSaver(); | |
76 | - saver.setInstances(instances); | |
77 | - saver.setFile(new File(ModelConstants.ZERO_DATASET_PATH)); | |
78 | - saver.writeBatch(); | |
79 | - } | |
80 | - | |
81 | - private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | |
82 | - Map<String, Set<String>> result = Maps.newHashMap(); | |
83 | - for (File f : new File(idsPath).listFiles()) { | |
84 | - String id = f.getName().split("_")[0]; | |
85 | - List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | |
86 | - result.put(id, Sets.newHashSet(sentenceIds)); | |
87 | - } | |
88 | - return result; | |
89 | - } | |
90 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/zero/ZeroScorer.java
... | ... | @@ -6,7 +6,7 @@ import org.apache.commons.csv.CSVParser; |
6 | 6 | import org.apache.commons.csv.CSVRecord; |
7 | 7 | import org.apache.commons.csv.QuoteMode; |
8 | 8 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
9 | -import pl.waw.ipipan.zil.summ.nicolas.common.features.FeatureHelper; | |
9 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
10 | 10 | import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; |
11 | 11 | |
12 | 12 | import java.io.IOException; |
... | ... | @@ -21,8 +21,8 @@ public class ZeroScorer { |
21 | 21 | |
22 | 22 | private final Map<String, Boolean> candidateEncoding2Decision = Maps.newHashMap(); |
23 | 23 | |
24 | - public ZeroScorer(String goldZerosPath) throws IOException { | |
25 | - try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosPath); | |
24 | + public ZeroScorer(String goldZerosResourcePath) throws IOException { | |
25 | + try (InputStream stream = ZeroScorer.class.getResourceAsStream(goldZerosResourcePath); | |
26 | 26 | InputStreamReader reader = new InputStreamReader(stream, Constants.ENCODING); |
27 | 27 | CSVParser parser = new CSVParser(reader, CSVFormat.DEFAULT.withDelimiter(DELIMITER).withEscape('|').withQuoteMode(QuoteMode.NONE).withQuote('~'))) { |
28 | 28 | List<CSVRecord> records = parser.getRecords(); |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadAndPreprocessCorpus.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | |
2 | 2 | |
3 | 3 | import net.lingala.zip4j.core.ZipFile; |
4 | 4 | import org.apache.commons.io.FileUtils; |
5 | 5 | import org.slf4j.Logger; |
6 | 6 | import org.slf4j.LoggerFactory; |
7 | -import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main; | |
8 | 7 | |
9 | 8 | import java.io.File; |
10 | 9 | import java.net.URL; |
... | ... | @@ -45,7 +44,7 @@ public class DownloadAndPreprocessCorpus { |
45 | 44 | |
46 | 45 | File preprocessed = new File(WORKING_DIR, "preprocessed"); |
47 | 46 | createFolder(preprocessed.getPath()); |
48 | - Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | |
47 | + Preprocess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | |
49 | 48 | } |
50 | 49 | |
51 | 50 | private static File createFolder(String path) { |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/sentence/PrepareTrainingData.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.model.sentence; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | |
2 | 2 | |
3 | 3 | import com.google.common.base.Charsets; |
4 | 4 | import com.google.common.collect.Maps; |
5 | +import com.google.common.collect.Sets; | |
5 | 6 | import com.google.common.io.Files; |
7 | +import org.apache.commons.io.IOUtils; | |
6 | 8 | import org.slf4j.Logger; |
7 | 9 | import org.slf4j.LoggerFactory; |
8 | 10 | import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; |
9 | 11 | import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; |
10 | 12 | import pl.waw.ipipan.zil.multiservice.thrift.types.TText; |
11 | -import pl.waw.ipipan.zil.summ.nicolas.ThriftUtils; | |
13 | +import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; | |
12 | 14 | import pl.waw.ipipan.zil.summ.nicolas.common.Constants; |
15 | +import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; | |
13 | 16 | import pl.waw.ipipan.zil.summ.nicolas.common.Utils; |
17 | +import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; | |
14 | 18 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; |
15 | 19 | import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; |
16 | 20 | import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; |
17 | 21 | import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; |
22 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.mention.MentionScorer; | |
23 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.sentence.SentenceScorer; | |
24 | +import pl.waw.ipipan.zil.summ.nicolas.train.model.zero.ZeroScorer; | |
25 | +import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; | |
26 | +import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; | |
27 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; | |
28 | +import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; | |
18 | 29 | import weka.classifiers.Classifier; |
19 | 30 | import weka.core.Instance; |
20 | 31 | import weka.core.Instances; |
21 | 32 | import weka.core.converters.ArffSaver; |
22 | 33 | |
23 | 34 | import java.io.File; |
35 | +import java.io.FileReader; | |
24 | 36 | import java.io.IOException; |
37 | +import java.io.InputStream; | |
38 | +import java.util.List; | |
25 | 39 | import java.util.Map; |
26 | 40 | import java.util.Set; |
27 | - | |
41 | +import java.util.function.Predicate; | |
42 | +import java.util.stream.Collectors; | |
28 | 43 | |
29 | 44 | public class PrepareTrainingData { |
30 | 45 | |
31 | 46 | private static final Logger LOG = LoggerFactory.getLogger(PrepareTrainingData.class); |
32 | 47 | |
33 | - private static final String PREPROCESSED_FULL_TEXTS_DIR_PATH = "src/main/resources/preprocessed_full_texts/dev"; | |
34 | - private static final String OPTIMAL_SUMMARIES_DIR_PATH = "src/main/resources/optimal_summaries/dev"; | |
48 | + private static final String THRIFT_TEXTS_PATH = "data/preprocessed"; | |
49 | + private static final String OPTIMAL_SUMMARIES_DIR_PATH = "data/summaries-optimal"; | |
50 | + private static final String SUMMARY_SENTENCE_IDS = "data/summaries-sentence-ids"; | |
51 | + | |
52 | + private static final String ZERO_TRAINING_DATA_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv"; | |
53 | + private static final String TRAIN_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt"; | |
35 | 54 | |
36 | 55 | private PrepareTrainingData() { |
37 | 56 | } |
38 | 57 | |
39 | 58 | public static void main(String[] args) throws Exception { |
59 | + Set<String> trainTextIds = loadTrainTextIds(); | |
60 | + | |
61 | + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(THRIFT_TEXTS_PATH), trainTextIds::contains); | |
62 | + Map<String, String> id2optimalSummary = loadOptimalSummaries(trainTextIds::contains); | |
63 | + | |
64 | + prepareMentionsDataset(id2preprocessedText, id2optimalSummary); | |
65 | + prepareSentencesDataset(id2preprocessedText, id2optimalSummary); | |
66 | + prepareZerosDataset(id2preprocessedText); | |
67 | + } | |
68 | + | |
69 | + public static void prepareMentionsDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws IOException { | |
70 | + MentionScorer mentionScorer = new MentionScorer(); | |
71 | + MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); | |
72 | + | |
73 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
74 | + | |
75 | + int i = 1; | |
76 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
77 | + LOG.info("{}/{}", i++, id2preprocessedText.size()); | |
78 | + | |
79 | + String id = entry.getKey(); | |
80 | + TText preprocessedText = entry.getValue(); | |
81 | + String optimalSummary = id2optimalSummary.get(id); | |
82 | + if (optimalSummary == null) | |
83 | + continue; | |
84 | + Map<TMention, Double> mention2score = mentionScorer.calculateMentionScores(optimalSummary, preprocessedText); | |
85 | + | |
86 | + Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(preprocessedText, featureExtractor); | |
87 | + for (Map.Entry<TMention, Instance> entry2 : mention2instance.entrySet()) { | |
88 | + TMention mention = entry2.getKey(); | |
89 | + Instance instance = entry2.getValue(); | |
90 | + instance.setDataset(instances); | |
91 | + instance.setClassValue(mention2score.get(mention)); | |
92 | + instances.add(instance); | |
93 | + } | |
94 | + } | |
95 | + saveInstancesToFile(instances, new File(ModelConstants.MENTION_DATASET_PATH)); | |
96 | + } | |
97 | + | |
98 | + private static Set<String> loadTrainTextIds() throws IOException { | |
99 | + try (InputStream inputStream = PrepareTrainingData.class.getResourceAsStream(TRAIN_TEXT_IDS_RESOURCE_PATH)) { | |
100 | + List<String> testTextIds = IOUtils.readLines(inputStream, Constants.ENCODING); | |
101 | + return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); | |
102 | + } | |
103 | + } | |
40 | 104 | |
41 | - Map<String, TText> id2preprocessedText = Utils.loadPreprocessedTexts(PREPROCESSED_FULL_TEXTS_DIR_PATH); | |
42 | - Map<String, String> id2optimalSummary = loadOptimalSummaries(); | |
105 | + public static void prepareSentencesDataset(Map<String, TText> id2preprocessedText, Map<String, String> id2optimalSummary) throws Exception { | |
43 | 106 | |
44 | 107 | SentenceScorer sentenceScorer = new SentenceScorer(); |
45 | 108 | SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); |
46 | 109 | |
47 | 110 | Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); |
48 | 111 | |
49 | - Classifier classifier = Utils.loadClassifier(Constants.MENTION_MODEL_RESOURCE_PATH); | |
112 | + Classifier classifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); | |
50 | 113 | MentionFeatureExtractor mentionFeatureExtractor = new MentionFeatureExtractor(); |
51 | 114 | |
52 | 115 | int i = 1; |
53 | 116 | for (String textId : id2preprocessedText.keySet()) { |
54 | - LOG.info(i++ + "/" + id2preprocessedText.size()); | |
117 | + LOG.info("{}/{}", i++, id2preprocessedText.size()); | |
55 | 118 | |
56 | 119 | TText preprocessedText = id2preprocessedText.get(textId); |
57 | 120 | String optimalSummary = id2optimalSummary.get(textId); |
... | ... | @@ -64,7 +127,7 @@ public class PrepareTrainingData { |
64 | 127 | // Set<TMention> goodMentions |
65 | 128 | // = Utils.loadGoldGoodMentions(textId, preprocessedText, true); |
66 | 129 | |
67 | - Map<TSentence, Instance> sentence2instance = ThriftUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
130 | + Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(preprocessedText, featureExtractor, goodMentions); | |
68 | 131 | for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { |
69 | 132 | TSentence sentence = entry.getKey(); |
70 | 133 | Instance instance = entry.getValue(); |
... | ... | @@ -73,25 +136,74 @@ public class PrepareTrainingData { |
73 | 136 | instances.add(instance); |
74 | 137 | } |
75 | 138 | } |
76 | - saveInstancesToFile(instances); | |
139 | + saveInstancesToFile(instances, new File(ModelConstants.SENTENCE_DATASET_PATH)); | |
140 | + } | |
141 | + | |
142 | + public static void prepareZerosDataset(Map<String, TText> id2preprocessedText) throws IOException { | |
143 | + | |
144 | + Map<String, Set<String>> id2sentIds = loadSentenceIds(SUMMARY_SENTENCE_IDS); | |
145 | + | |
146 | + ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_DATA_RESOURCE_PATH); | |
147 | + ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); | |
148 | + | |
149 | + Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); | |
150 | + | |
151 | + int i = 1; | |
152 | + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { | |
153 | + LOG.info(i++ + "/" + id2preprocessedText.size()); | |
154 | + | |
155 | + String textId = entry.getKey(); | |
156 | + | |
157 | + TText text = entry.getValue(); | |
158 | + Set<String> sentenceIds = id2sentIds.get(textId); | |
159 | + FeatureHelper featureHelper = new FeatureHelper(text); | |
160 | + | |
161 | + List<ZeroSubjectCandidate> zeroSubjectCandidates = CandidateFinder.findZeroSubjectCandidates(text, sentenceIds); | |
162 | + Map<ZeroSubjectCandidate, Instance> candidate2instance = InstanceCreator.extractInstancesFromZeroCandidates(zeroSubjectCandidates, text, featureExtractor); | |
163 | + | |
164 | + for (Map.Entry<ZeroSubjectCandidate, Instance> entry2 : candidate2instance.entrySet()) { | |
165 | + boolean good = zeroScorer.isValidCandidate(entry2.getKey(), featureHelper); | |
166 | + Instance instance = entry2.getValue(); | |
167 | + instance.setDataset(instances); | |
168 | + instance.setClassValue(good ? 1 : 0); | |
169 | + instances.add(instance); | |
170 | + } | |
171 | + } | |
172 | + | |
173 | + saveInstancesToFile(instances, new File(ModelConstants.ZERO_DATASET_PATH)); | |
174 | + } | |
175 | + | |
176 | + private static Map<String, Set<String>> loadSentenceIds(String idsPath) throws IOException { | |
177 | + Map<String, Set<String>> result = Maps.newHashMap(); | |
178 | + File[] files = new File(idsPath).listFiles(); | |
179 | + if (files != null) | |
180 | + for (File f : files) { | |
181 | + String id = f.getName().split("_")[0]; | |
182 | + List<String> sentenceIds = IOUtils.readLines(new FileReader(f)); | |
183 | + result.put(id, Sets.newHashSet(sentenceIds)); | |
184 | + } | |
185 | + return result; | |
77 | 186 | } |
78 | 187 | |
79 | - private static void saveInstancesToFile(Instances instances) throws IOException { | |
188 | + private static void saveInstancesToFile(Instances instances, File targetFile) throws IOException { | |
80 | 189 | ArffSaver saver = new ArffSaver(); |
81 | 190 | saver.setInstances(instances); |
82 | - saver.setFile(new File(ModelConstants.SENTENCE_DATASET_PATH)); | |
191 | + saver.setFile(targetFile); | |
83 | 192 | saver.writeBatch(); |
84 | 193 | } |
85 | 194 | |
86 | - private static Map<String, String> loadOptimalSummaries() throws IOException { | |
195 | + private static Map<String, String> loadOptimalSummaries(Predicate<String> idFilter) throws IOException { | |
87 | 196 | Map<String, String> id2optimalSummary = Maps.newHashMap(); |
88 | - for (File optimalSummaryFile : new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles()) { | |
89 | - String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
90 | - id2optimalSummary.put(optimalSummaryFile.getName().split("_")[0], optimalSummary); | |
91 | - } | |
92 | - LOG.info(id2optimalSummary.size() + " optimal summaries found."); | |
197 | + File[] files = new File(OPTIMAL_SUMMARIES_DIR_PATH).listFiles(); | |
198 | + if (files != null) | |
199 | + for (File optimalSummaryFile : files) { | |
200 | + String textId = optimalSummaryFile.getName().split("_")[0]; | |
201 | + if (!idFilter.test(textId)) | |
202 | + continue; | |
203 | + String optimalSummary = Files.toString(optimalSummaryFile, Charsets.UTF_8); | |
204 | + id2optimalSummary.put(textId, optimalSummary); | |
205 | + } | |
206 | + LOG.info("{} optimal summaries found.", id2optimalSummary.size()); | |
93 | 207 | return id2optimalSummary; |
94 | 208 | } |
95 | - | |
96 | - | |
97 | 209 | } |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/Preprocess.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.preprocess; | |
1 | +package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; | |
2 | 2 | |
3 | 3 | import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
... | ... | @@ -9,19 +9,19 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
9 | 9 | import java.io.File; |
10 | 10 | import java.util.Arrays; |
11 | 11 | |
12 | -public class Main { | |
12 | +public class Preprocess { | |
13 | 13 | |
14 | - private static final Logger LOG = LoggerFactory.getLogger(Main.class); | |
14 | + private static final Logger LOG = LoggerFactory.getLogger(Preprocess.class); | |
15 | 15 | |
16 | 16 | private static final String CORPUS_FILE_SUFFIX = ".xml"; |
17 | 17 | private static final String OUTPUT_FILE_SUFFIX = ".thrift"; |
18 | 18 | |
19 | - private Main() { | |
19 | + private Preprocess() { | |
20 | 20 | } |
21 | 21 | |
22 | 22 | public static void main(String[] args) { |
23 | 23 | if (args.length != 2) { |
24 | - LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | |
24 | + LOG.error("Wrong usage! Try " + Preprocess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | |
25 | 25 | return; |
26 | 26 | } |
27 | 27 | File corpusDir = new File(args[0]); |
... | ... |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/TrainAllModels.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateMention.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | - | |
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | - | |
5 | - | |
6 | -public class CrossvalidateMention { | |
7 | - | |
8 | - private CrossvalidateMention() { | |
9 | - } | |
10 | - | |
11 | - public static void main(String[] args) throws Exception { | |
12 | - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.MENTION_DATASET_PATH); | |
13 | - } | |
14 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateSentence.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | - | |
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | - | |
5 | - | |
6 | -public class CrossvalidateSentence { | |
7 | - | |
8 | - private CrossvalidateSentence() { | |
9 | - } | |
10 | - | |
11 | - public static void main(String[] args) throws Exception { | |
12 | - CrossvalidateCommon.crossvalidateRegressors(ModelConstants.SENTENCE_DATASET_PATH); | |
13 | - } | |
14 | -} |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/search/CrossvalidateZero.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.search; | |
2 | - | |
3 | -import pl.waw.ipipan.zil.summ.nicolas.train.model.common.ModelConstants; | |
4 | - | |
5 | - | |
6 | -public class CrossvalidateZero { | |
7 | - | |
8 | - private CrossvalidateZero() { | |
9 | - } | |
10 | - | |
11 | - public static void main(String[] args) throws Exception { | |
12 | - CrossvalidateCommon.crossvalidateClassifiers(ModelConstants.ZERO_DATASET_PATH); | |
13 | - } | |
14 | -} |
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/dev_ids.txt renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_text_ids.txt
nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/zero/zeros.tsv renamed to nicolas-train/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/train/train_zero.tsv
pom.xml
... | ... | @@ -18,6 +18,7 @@ |
18 | 18 | <module>nicolas-train</module> |
19 | 19 | <module>nicolas-common</module> |
20 | 20 | <module>nicolas-multiservice</module> |
21 | + <module>nicolas-eval</module> | |
21 | 22 | </modules> |
22 | 23 | |
23 | 24 | <properties> |
... | ... | @@ -27,10 +28,11 @@ |
27 | 28 | |
28 | 29 | <pscapi.version>1.0</pscapi.version> |
29 | 30 | <utils.version>1.0</utils.version> |
31 | + <eval.version>1.0</eval.version> | |
30 | 32 | |
31 | 33 | <commons-csv.version>1.4</commons-csv.version> |
32 | 34 | <guava.version>21.0</guava.version> |
33 | - <weka-dev.version>3.9.1</weka-dev.version> | |
35 | + <weka-stable.version>3.8.1</weka-stable.version> | |
34 | 36 | <commons-lang3.version>3.5</commons-lang3.version> |
35 | 37 | <commons-io.version>2.5</commons-io.version> |
36 | 38 | <slf4j-api.version>1.7.22</slf4j-api.version> |
... | ... | @@ -98,6 +100,11 @@ |
98 | 100 | <artifactId>utils</artifactId> |
99 | 101 | <version>${utils.version}</version> |
100 | 102 | </dependency> |
103 | + <dependency> | |
104 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | |
105 | + <artifactId>eval</artifactId> | |
106 | + <version>${eval.version}</version> | |
107 | + </dependency> | |
101 | 108 | |
102 | 109 | <!-- third party --> |
103 | 110 | <dependency> |
... | ... | @@ -112,8 +119,8 @@ |
112 | 119 | </dependency> |
113 | 120 | <dependency> |
114 | 121 | <groupId>nz.ac.waikato.cms.weka</groupId> |
115 | - <artifactId>weka-dev</artifactId> | |
116 | - <version>${weka-dev.version}</version> | |
122 | + <artifactId>weka-stable</artifactId> | |
123 | + <version>${weka-stable.version}</version> | |
117 | 124 | <exclusions> |
118 | 125 | <exclusion> |
119 | 126 | <groupId>org.slf4j</groupId> |
... | ... |