diff --git a/README.md b/README.md new file mode 100644 index 0000000..bd6e339 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# Nicolas + +Summarization tool, using coreference information as main source of information for content selection. + diff --git a/eval.sh b/eval.sh new file mode 100755 index 0000000..20d602b --- /dev/null +++ b/eval.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash + diff --git a/nicolas-cli/pom.xml b/nicolas-cli/pom.xml index 5062880..21a84ab 100644 --- a/nicolas-cli/pom.xml +++ b/nicolas-cli/pom.xml @@ -22,6 +22,11 @@ <groupId>pl.waw.ipipan.zil.summ</groupId> <artifactId>nicolas-lib</artifactId> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-model</artifactId> + <scope>runtime</scope> + </dependency> <!-- third party --> <dependency> diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java index 4adaa48..ec8d684 100644 --- a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java +++ b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java @@ -5,9 +5,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; import pl.waw.ipipan.zil.summ.nicolas.NicolasException; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; import java.io.*; diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java index 2509618..3ad99ae 100644 --- a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java @@ -5,10 +5,10 @@ import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.File; import java.io.FileInputStream; @@ -29,7 +29,7 @@ public class ClientTest { @Test public void processSampleText() throws Exception { Preprocessor preprocessor = mock(Preprocessor.class); - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); when(preprocessor.preprocess(any())).thenReturn(ttext); Nicolas nicolas = mock(Nicolas.class); diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java index 4067383..51cd8a9 100644 --- a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import java.io.File; import java.io.FileInputStream; diff --git a/nicolas-common/pom.xml b/nicolas-common/pom.xml deleted file mode 100644 index e4678c3..0000000 --- a/nicolas-common/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>nicolas-container</artifactId> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <version>1.0-SNAPSHOT</version> - </parent> - - <artifactId>nicolas-common</artifactId> - - <dependencies> - <!-- internal --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>pscapi</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.multiservice</groupId> - <artifactId>utils</artifactId> - </dependency> - - <!-- third party --> - <dependency> - <groupId>nz.ac.waikato.cms.weka</groupId> - <artifactId>weka-stable</artifactId> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - - <!-- logging --> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - - </dependencies> - -</project> \ No newline at end of file diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted file mode 100644 index ad7cbb0..0000000 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java +++ /dev/null @@ -1,185 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; -import weka.classifiers.Classifier; -import weka.core.Attribute; -import weka.core.Instances; - -import java.io.*; -import java.util.*; -import java.util.function.Function; -import java.util.stream.Collectors; - -public class Utils { - - private static final Logger LOG = LoggerFactory.getLogger(Utils.class); - - private static final String DATASET_NAME = "Dataset"; - - private Utils() { - } - - public static void writeStringToFile(String string, File file) throws IOException { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { - bw.append(string); - } - } - - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { - LOG.info("Loading classifier from path: {}...", modelResourcePath); - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { - if (stream == null) { - throw new IOException("Model not found at: " + modelResourcePath); - } - try (ObjectInputStream ois = new ObjectInputStream(stream)) { - Classifier classifier = (Classifier) ois.readObject(); - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); - return classifier; - } catch (ClassNotFoundException e) { - LOG.error("Error loading serialized classifier, class not found.", e); - throw new IOException(e); - } - } - } - - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { - return (TText) ois.readObject(); - } catch (ClassNotFoundException e) { - LOG.error("Error reading serialized thrift text file, class not found.", e); - throw new IOException(e); - } - } - - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { - if (stream == null) { - throw new IOException("Resource not found at: " + textResourcePath); - } - return loadThriftTextFromStream(stream); - } - } - - public static List<String> loadLinesFromResource(String resourcePath) throws IOException { - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { - return IOUtils.readLines(stream, Constants.ENCODING); - } - } - - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList - public static Instances createNewInstances(ArrayList<Attribute> attributesList) { - Instances instances = new Instances(DATASET_NAME, attributesList, 0); - instances.setClassIndex(0); - return instances; - } - - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { - LOG.info("Loading classifier..."); - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { - Classifier classifier = (Classifier) ois.readObject(); - LOG.info("Done. " + classifier.toString()); - return classifier; - } - } - - public static List<String> tokenize(String text) { - return Arrays.asList(text.split("[^\\p{L}0-9]+")); - } - - public static List<String> tokenizeOnWhitespace(String text) { - return Arrays.asList(text.split(" +")); - } - - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { - Map<TMention, String> mention2orth = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); - - for (TMention m : s.getMentions()) { - StringBuffer mentionOrth = new StringBuffer(); - for (String tokId : m.getHeadIds()) { - if (!tokId2nps.get(tokId)) - mentionOrth.append(" "); - mentionOrth.append(tokId2orth.get(tokId)); - } - mention2orth.put(m, mentionOrth.toString().trim()); - } - } - return mention2orth; - } - - private static final Collection<String> STOPWORDS = Sets.newHashSet(); - - static { - STOPWORDS.addAll(Lists.newArrayList("i", "siÄ™", "to", "co")); - } - - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { - Map<TMention, String> mention2orth = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); - - for (TMention m : s.getMentions()) { - StringBuffer mentionOrth = new StringBuffer(); - for (String tokId : m.getChildIds()) { - TToken token = tokId2tok.get(tokId); - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { - continue; - } - - if (!token.isNoPrecedingSpace()) - mentionOrth.append(" "); - mentionOrth.append(token.getOrth()); - } - mention2orth.put(m, mentionOrth.toString().trim()); - } - } - return mention2orth; - } - - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { - Map<TMention, String> mention2base = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); - - for (TMention m : s.getMentions()) { - StringBuilder mentionBase = new StringBuilder(); - for (String tokId : m.getChildIds()) { - mentionBase.append(" "); - mentionBase.append(tokId2base.get(tokId)); - } - mention2base.put(m, mentionBase.toString().toLowerCase().trim()); - } - } - return mention2base; - } - - public static String loadSentence2Orth(TSentence sentence) { - return loadSentence2Orth(sentence, Sets.newHashSet()); - } - - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { - StringBuilder sb = new StringBuilder(); - for (TToken token : sentence.getTokens()) { - if (tokenIdsToSkip.contains(token.getId())) { - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); - continue; - } - if (!token.isNoPrecedingSpace()) - sb.append(" "); - sb.append(token.getOrth()); - } - return sb.toString().trim(); - } - -} \ No newline at end of file diff --git a/nicolas-eval/pom.xml b/nicolas-eval/pom.xml deleted file mode 100644 index 31a9981..0000000 --- a/nicolas-eval/pom.xml +++ /dev/null @@ -1,56 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <parent> - <artifactId>nicolas-container</artifactId> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <version>1.0-SNAPSHOT</version> - </parent> - <modelVersion>4.0.0</modelVersion> - - <artifactId>nicolas-eval</artifactId> - - <dependencies> - <!-- project --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-lib</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - - <!-- internal --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>eval</artifactId> - </dependency> - - <!-- third party --> - <dependency> - <groupId>nz.ac.waikato.cms.weka</groupId> - <artifactId>weka-stable</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - </dependency> - - <!-- logging --> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-simple</artifactId> - </dependency> - - </dependencies> -</project> \ No newline at end of file diff --git a/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt b/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted file mode 100644 index d0c556d..0000000 --- a/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt +++ /dev/null @@ -1,154 +0,0 @@ -199704210012 -199704210042 -199704220007 -199704220018 -199704220021 -199704220044 -199704230006 -199704230014 -199704230029 -199704230043 -199704240008 -199704240019 -199704240020 -199704240021 -199704250018 -199704250022 -199704260014 -199704260015 -199704260016 -199704280023 -199704280025 -199704280027 -199704280031 -199704300031 -199704300042 -199704300046 -199801020010 -199801020031 -199801020035 -199801020070 -199801020076 -199801020079 -199801030068 -199801030090 -199801030091 -199801030129 -199801030148 -199801030158 -199801050023 -199801050059 -199801130087 -199801130129 -199801140182 -199801160119 -199801200106 -199801220140 -199801240061 -199801240096 -199801260047 -199801260070 -199801270055 -199801270110 -199801280123 -199801280158 -199801280159 -199801280241 -199801290022 -199801310003 -199801310037 -199802030127 -199802040159 -199802040182 -199802040202 -199805220133 -199808280158 -199901190073 -199901190115 -199901250112 -199901250117 -199901270103 -199901270120 -199901270122 -199901290095 -199901300101 -199902240095 -199906220029 -199906230024 -199906240084 -199906260027 -199907050045 -199907050076 -199907140166 -199907200002 -199907270004 -199908260001 -199909090036 -199909250018 -199909270029 -199910020027 -199910020029 -199910270011 -199911060044 -199911100038 -199911100064 -199911200030 -199911220063 -199912020060 -199912180026 -199912180034 -199912220030 -199912280024 -199912280046 -199912300021 -199912300029 -200001030029 -200001030053 -200001060034 -200001100035 -200001100046 -200001170029 -200001170033 -200001170060 -200001290045 -200002220027 -200002240034 -200002250031 -200003060062 -200003110050 -200004280047 -200004290022 -200006050119 -200006260079 -200006290045 -200007150033 -200008040076 -200008220042 -200008220046 -200010130049 -200010160054 -200012130034 -200012140084 -200012290046 -200104040019 -200106050035 -200108180109 -200108300032 -200111120045 -200111150042 -200111150047 -200111200036 -200111270049 -200112030055 -200112280057 -200201220038 -200201220050 -200202020036 -200202200032 -200202210054 -200202270044 -200203010070 -200203190026 -200203260050 -200203280017 -200203290078 diff --git a/nicolas-lib/pom.xml b/nicolas-lib/pom.xml index 6cb91d6..f7ea7e6 100644 --- a/nicolas-lib/pom.xml +++ b/nicolas-lib/pom.xml @@ -12,15 +12,6 @@ <artifactId>nicolas-lib</artifactId> <dependencies> - <!-- project --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-model</artifactId> - </dependency> <!-- internal --> <dependency> @@ -61,5 +52,10 @@ <groupId>junit</groupId> <artifactId>junit</artifactId> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-model</artifactId> + <scope>test</scope> + </dependency> </dependencies> </project> \ No newline at end of file diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java index 4d2ab97..401e396 100644 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java @@ -1,28 +1,26 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; +package pl.waw.ipipan.zil.summ.nicolas; -import com.google.common.base.Charsets; import com.google.common.collect.ImmutableList; import java.nio.charset.Charset; - +import java.nio.charset.StandardCharsets; public class Constants { - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); + public static final Charset ENCODING = StandardCharsets.UTF_8; + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; private static final String MODELS_PATH = ROOT_PATH + "models/"; + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; - - public static final Charset ENCODING = Charsets.UTF_8; - - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt"; private Constants() { } - } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java index f432020..79d3e34 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java @@ -5,12 +5,12 @@ import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; import weka.classifiers.Classifier; @@ -31,9 +31,9 @@ public class Nicolas { public Nicolas() throws NicolasException { try { - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); mentionFeatureExtractor = new MentionFeatureExtractor(); sentenceFeatureExtractor = new SentenceFeatureExtractor(); @@ -57,7 +57,7 @@ public class Nicolas { StringBuilder sb = new StringBuilder(); for (TSentence sent : selectedSentences) { - sb.append(" ").append(Utils.loadSentence2Orth(sent)); + sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); } return sb.toString().trim(); } @@ -74,7 +74,7 @@ public class Nicolas { Random r = new Random(1); Set<TSentence> summary = Sets.newHashSet(); for (TSentence sent : sortedSentences) { - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); if (r.nextDouble() > 0.4 && size > targetSize) break; summary.add(sent); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted file mode 100644 index 47b20ea..0000000 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java +++ /dev/null @@ -1,127 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.apply; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; -import weka.classifiers.Classifier; -import weka.core.Instance; -import weka.core.Instances; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.util.*; - -import static java.util.stream.Collectors.toList; - -public class ApplyModel { - - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); - - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; - private static final String TARGET_DIR = "corpora/summaries"; - - public static void main(String[] args) throws Exception { - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); - - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); - - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); - - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); - int i = 1; - double avgSize = 0; - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { - TText text = entry.getValue(); - - Set<TMention> goodMentions - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); - - int targetSize = calculateTargetSize(text); - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); - int size = Utils.tokenize(summary).size(); - avgSize += size; - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { - bw.append(summary); - } - - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); - } - - LOG.info("Avg size:" + avgSize / id2preprocessedText.size()); - } - - private static int calculateTargetSize(TText text) { - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - StringBuffer body = new StringBuffer(); - for (TSentence sent : sents) - body.append(Utils.loadSentence2Orth(sent) + " "); - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); - return (int) (0.2 * tokenCount); - } - - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); - - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); - - StringBuilder sb = new StringBuilder(); - for (TSentence sent : selectedSentences) { - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); - } - return sb.toString().trim(); - } - - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { - - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); - - Map<TSentence, Double> sentence2score = Maps.newHashMap(); - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { - Instance instance = entry.getValue(); - instance.setDataset(instances); - double score = sentenceClassifier.classifyInstance(instance); - sentence2score.put(entry.getKey(), score); - } - - List<TSentence> sortedSents = Lists.newArrayList(sents); - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); - - int size = 0; - Random r = new Random(1); - Set<TSentence> summary = Sets.newHashSet(); - for (TSentence sent : sortedSents) { - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); - if (r.nextDouble() > 0.4 && size > targetSize) - break; - summary.add(sent); - if (size > targetSize) - break; - } - List<TSentence> selectedSentences = Lists.newArrayList(); - for (TSentence sent : sents) { - if (summary.contains(sent)) - selectedSentences.add(sent); - } - return selectedSentences; - } - -} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java index 9ab26a8..0bd02ff 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.*; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import java.util.List; import java.util.Map; @@ -38,7 +37,6 @@ public class FeatureHelper { private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); - public FeatureHelper(TText preprocessedText) { text = preprocessedText; @@ -60,9 +58,9 @@ public class FeatureHelper { int sentIdx = 0; int mentionIdx = 0; for (TParagraph par : preprocessedText.getParagraphs()) { - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); mention2Orth.putAll(m2o); - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); mention2Base.putAll(m2b); int sentIdxInPar = 0; @@ -221,4 +219,40 @@ public class FeatureHelper { return null; return mention2sent.get(mention).getTokens().get(idx - 1); } + + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { + Map<TMention, String> mention2orth = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : m.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + mention2orth.put(m, mentionOrth.toString().trim()); + } + } + return mention2orth; + } + + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { + Map<TMention, String> mention2base = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionBase = new StringBuilder(); + for (String tokId : m.getChildIds()) { + mentionBase.append(" "); + mentionBase.append(tokId2base.get(tokId)); + } + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); + } + } + return mention2base; + } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java index d624f41..9b2b8b5 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import pl.waw.ipipan.zil.multiservice.thrift.types.*; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; import weka.core.Attribute; import java.io.IOException; @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { private final List<String> frequentBases; public MentionFeatureExtractor() throws IOException { - frequentBases = loadFrequentBases(); + frequentBases = ResourceUtils.loadFrequentBases(); //coref addNumericAttributeNormalized("chain_length"); @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { fillSortedAttributes("score"); } - private List<String> loadFrequentBases() throws IOException { - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); - } - private String encodeBase(String base) { return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); } @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { Attribute att = getAttributeByName(attributeName); int index = att.indexOfValue(value); if (index == -1) - LOG.warn(value + " not found for attribute " + attributeName); + LOG.warn("{} not found for attribute {}", value, attributeName); attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java index 46a296b..83468e3 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java @@ -5,8 +5,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -24,7 +23,7 @@ public class MentionModel { public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { Set<TMention> goodMentions = Sets.newHashSet(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { Instance instance = entry.getValue(); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java index 21117da..dc9cc6f 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -23,7 +22,7 @@ public class SentenceModel { } public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); Map<TSentence, Double> sentence2score = Maps.newHashMap(); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java index 8459d82..7fdf82b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas; +package pl.waw.ipipan.zil.summ.nicolas.utils; import com.google.common.collect.Maps; import org.slf4j.Logger; @@ -11,7 +11,9 @@ import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; import weka.core.Attribute; import weka.core.DenseInstance; import weka.core.Instance; +import weka.core.Instances; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; @@ -22,6 +24,8 @@ public class InstanceUtils { private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); + private static final String DATASET_NAME = "Dataset"; + private InstanceUtils() { } @@ -60,4 +64,11 @@ public class InstanceUtils { LOG.info("Extracted features of {} sentences.", sentence2instance.size()); return sentence2instance; } + + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { + Instances instances = new Instances(DATASET_NAME, attributesList, 0); + instances.setClassIndex(0); + return instances; + } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java new file mode 100644 index 0000000..acdf7d2 --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java @@ -0,0 +1,62 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.summ.nicolas.Constants; +import weka.classifiers.Classifier; + +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +public class ResourceUtils { + + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class); + + private ResourceUtils() { + } + + public static List<String> loadFrequentBases() throws IOException { + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH); + } + + public static List<String> loadStopwords() throws IOException { + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH); + } + + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { + LOG.info("Loading classifier from path: {}...", modelResourcePath); + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) { + if (stream == null) { + throw new IOException("Model not found at: " + modelResourcePath); + } + try (ObjectInputStream ois = new ObjectInputStream(stream)) { + Classifier classifier = (Classifier) ois.readObject(); + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); + return classifier; + } catch (ClassNotFoundException e) { + LOG.error("Error loading serialized classifier, class not found.", e); + throw new IOException(e); + } + } + } + + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { + return IOUtils.readLines(stream, Constants.ENCODING) + .stream() + .map(String::trim) + .map(String::toLowerCase) + .filter(((Predicate<String>) String::isEmpty).negate()) + .sorted() + .distinct() + .collect(Collectors.toList()); + } + } + + +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java new file mode 100644 index 0000000..d561a70 --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java @@ -0,0 +1,41 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils; + +import com.google.common.collect.Sets; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; + +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +public class TextUtils { + + private TextUtils() { + } + + public static List<String> tokenize(String text) { + return Arrays.asList(text.split("[^\\p{L}0-9]+")); + } + + public static List<String> tokenizeOnWhitespace(String text) { + return Arrays.asList(text.split(" +")); + } + + public static String loadSentence2Orth(TSentence sentence) { + return loadSentence2Orth(sentence, Sets.newHashSet()); + } + + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { + StringBuilder sb = new StringBuilder(); + for (TToken token : sentence.getTokens()) { + if (tokenIdsToSkip.contains(token.getId())) { + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); + continue; + } + if (!token.isNoPrecedingSpace()) + sb.append(" "); + sb.append(token.getOrth()); + } + return sb.toString().trim(); + } +} diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java index 0efb18c..9835fae 100644 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; import com.google.common.base.Predicates; import com.google.common.collect.Maps; @@ -58,4 +58,12 @@ public class ThriftUtils { } } + public static TText loadThriftTextFromResource(String resourcePath) { + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) { + return loadThriftTextFromStream(stream); + } catch (IOException e) { + LOG.error("Error reading serialized Thrift text from resource", e); + return null; + } + } } diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java index fbbb2a9..bf2ce9a 100644 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; import java.io.IOException; import java.io.InputStream; diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java index c26b629..dfa853b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import weka.core.Attribute; diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java index 239aff9..11280f6 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -24,7 +24,7 @@ public class ZeroSubjectInjector { public ZeroSubjectInjector() throws Exception { classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); featureExtractor = new ZeroFeatureExtractor(); - instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); } public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java index 9dae6f4..2385bb1 100644 --- a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; import org.junit.BeforeClass; import org.junit.Test; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import static org.junit.Assert.assertTrue; @@ -20,9 +21,9 @@ public class NicolasTest { @Test public void shouldSummarizeThriftText() throws Exception { - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); String summary = nicolas.summarizeThrift(thriftText, 5); - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size(); assertTrue(summaryTokensCount > 0); assertTrue(summaryTokensCount < 10); } diff --git a/nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java index ce09aab..464ae07 100644 --- a/nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; import org.junit.Test; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; @@ -7,13 +7,13 @@ import java.io.InputStream; import static org.junit.Assert.assertEquals; -public class UtilsTest { +public class ThriftUtilsTest { private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; @Test public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { TText text = ThriftUtils.loadThriftTextFromStream(stream); assertEquals(26, text.getParagraphs().size()); assertEquals(2, text.getParagraphs().get(4).getSentences().size()); diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java index e8e7a47..274356b 100644 --- a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; import org.junit.Test; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.IOException; import java.io.InputStream; diff --git a/nicolas-common/src/test/resources/199704210011.bin b/nicolas-lib/src/test/resources/199704210011.bin index cf072c2..cf072c2 100644 --- a/nicolas-common/src/test/resources/199704210011.bin +++ b/nicolas-lib/src/test/resources/199704210011.bin diff --git a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore new file mode 100644 index 0000000..314f02b --- /dev/null +++ b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore @@ -0,0 +1 @@ +*.txt \ No newline at end of file diff --git a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md new file mode 100644 index 0000000..511f97c --- /dev/null +++ b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md @@ -0,0 +1 @@ +To generate resources in this folder, use nicolas-trainer module. \ No newline at end of file diff --git a/nicolas-multiservice/pom.xml b/nicolas-multiservice/pom.xml index cc051be..a49a932 100644 --- a/nicolas-multiservice/pom.xml +++ b/nicolas-multiservice/pom.xml @@ -30,8 +30,12 @@ <!-- test --> <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> + <artifactId>nicolas-lib</artifactId> <scope>test</scope> </dependency> diff --git a/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java index e3ce61d..6d209ff 100644 --- a/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java +++ b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.File; import java.io.FileInputStream; @@ -67,7 +67,7 @@ public class PreprocessorIT { preprocessor.preprocessToFile(text, targetFile); try (FileInputStream inputStream = new FileInputStream(targetFile)) { - TText processed = Utils.loadThriftTextFromStream(inputStream); + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream); assertSampleProcessedText(processed); } } diff --git a/nicolas-train/pom.xml b/nicolas-train/pom.xml index 0124f0f..57e2072 100644 --- a/nicolas-train/pom.xml +++ b/nicolas-train/pom.xml @@ -15,10 +15,6 @@ <!-- project --> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> <artifactId>nicolas-lib</artifactId> </dependency> <dependency> diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java index 44e67c3..7af55cf 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java @@ -1,4 +1,4 @@ -package pl.waw.ipipan.zil.summ.nicolas.train; +package pl.waw.ipipan.zil.summ.nicolas; import net.lingala.zip4j.core.ZipFile; import net.lingala.zip4j.exception.ZipException; @@ -34,7 +34,7 @@ public class PathConstants { public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java index 80ac0a8..17981e5 100644 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java @@ -17,7 +17,7 @@ public class Constants { public static Set<String> loadTestTextIds() throws IOException { try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); } } diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java index de33cae..de33cae 100644 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java index df1ccb8..c5dee6c 100644 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java @@ -7,11 +7,13 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; import pl.waw.ipipan.zil.summ.nicolas.NicolasException; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.util.List; import java.util.Map; import java.util.Set; @@ -23,7 +25,6 @@ public class SummarizeTestCorpus { private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); - private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; private static final double SUMMARY_RATIO = 0.2; @@ -31,8 +32,8 @@ public class SummarizeTestCorpus { } public static void main(String[] args) throws IOException, NicolasException { - File thriftedCorpusDir = new File("data/preprocessed"); - File targetDir = new File("data/summaries"); + File thriftedCorpusDir = new File("data/all-preprocessed"); + File targetDir = new File("data/test-system"); targetDir.mkdir(); Set<String> testTextIds = loadTestTextIds(); @@ -62,9 +63,9 @@ public class SummarizeTestCorpus { List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); StringBuilder body = new StringBuilder(); for (TSentence sentence : sentences) - body.append(Utils.loadSentence2Orth(sentence)).append(" "); + body.append(TextUtils.loadSentence2Orth(sentence)).append(" "); - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size(); return (int) (SUMMARY_RATIO * tokenCount); } @@ -73,7 +74,9 @@ public class SummarizeTestCorpus { String textId = entry.getKey(); String summary = entry.getValue(); String targetFileName = textId + SUMMARY_FILE_SUFFIX; - Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { + writer.write(summary); + } } } diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java index 5cba028..5cba028 100644 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java index 60e679a..aec39ae 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; public class MentionScorer { + private final Set<String> STOPWORDS; + + public MentionScorer() throws IOException { + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet()); + } + public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); return booleanTokenIntersection(mention2Orth, tokenCounts); } + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) { + Map<TMention, String> mention2orth = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : m.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { + continue; + } + + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + mention2orth.put(m, mentionOrth.toString().trim()); + } + } + return mention2orth; + } + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { Map<TMention, Double> mention2score = Maps.newHashMap(); for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { TMention mention = entry.getKey(); String mentionOrth = mention2Orth.get(mention); - for (String token : Utils.tokenize(mentionOrth)) { + for (String token : TextUtils.tokenize(mentionOrth)) { if (tokenCounts.contains(token.toLowerCase())) { mention2score.put(mention, 1.0); break; diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java index 61d01f0..dcdc297 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import java.util.List; import java.util.Map; public class SentenceScorer { + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); Map<TSentence, Double> sentence2score = Maps.newHashMap(); for (TParagraph paragraph : preprocessedText.getParagraphs()) for (TSentence sentence : paragraph.getSentences()) { double score = 0.0; - String orth = Utils.loadSentence2Orth(sentence); - List<String> tokens = Utils.tokenize(orth); + String orth = TextUtils.loadSentence2Orth(sentence); + List<String> tokens = TextUtils.tokenize(orth); for (String token : tokens) { score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java index 241874e..98d0d67 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.commons.csv.QuoteMode; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java index aeba701..8dde011 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; import org.apache.commons.io.FileUtils; import pl.waw.ipipan.zil.summ.eval.Main; import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import java.io.File; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class CreateOptimalSummaries { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java index 7e4b548..3783f8b 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java @@ -1,6 +1,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class DownloadCorpus { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java index 980fa5c..79374e1 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java @@ -1,6 +1,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class DownloadTrainingResources { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java index 19d5171..05fbe5f 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java @@ -1,19 +1,21 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; import pl.waw.ipipan.zil.summ.pscapi.xml.Text; import javax.xml.bind.JAXBException; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.util.List; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class ExtractGoldSummaries { @@ -22,7 +24,6 @@ public class ExtractGoldSummaries { private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); - private ExtractGoldSummaries() { } @@ -47,7 +48,10 @@ public class ExtractGoldSummaries { for (Summary summary : goldSummaries) { File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); - Utils.writeStringToFile(summary.getBody(), targetFile); + + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { + writer.append(summary.getBody()); + } } } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java index b15a291..b5b0c09 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; @@ -37,7 +36,7 @@ import java.util.Set; import java.util.function.Predicate; import java.util.stream.Collectors; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class PrepareTrainingData { @@ -61,7 +60,7 @@ public class PrepareTrainingData { MentionScorer mentionScorer = new MentionScorer(); MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { @@ -105,7 +104,7 @@ public class PrepareTrainingData { SentenceScorer sentenceScorer = new SentenceScorer(); SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (String textId : id2preprocessedText.keySet()) { @@ -149,7 +148,7 @@ public class PrepareTrainingData { ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java index 449454b..fd7e862 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; import java.io.File; import java.util.Arrays; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class PreprocessCorpus { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java index 10dfa40..d186dcc 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; import weka.classifiers.Classifier; import weka.core.Instances; @@ -14,7 +14,7 @@ import java.io.FileOutputStream; import java.io.ObjectOutputStream; import java.util.logging.LogManager; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class TrainAllModels { diff --git a/pom.xml b/pom.xml index f1fad43..f21f5f3 100644 --- a/pom.xml +++ b/pom.xml @@ -10,15 +10,12 @@ <packaging>pom</packaging> - <modules> <module>nicolas-lib</module> <module>nicolas-cli</module> <module>nicolas-model</module> <module>nicolas-train</module> - <module>nicolas-common</module> <module>nicolas-multiservice</module> - <module>nicolas-eval</module> </modules> <properties> @@ -59,23 +56,23 @@ <!-- project --> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-model</artifactId> + <artifactId>nicolas-cli</artifactId> <version>${project.version}</version> - <scope>runtime</scope> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> + <artifactId>nicolas-lib</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-zero</artifactId> + <artifactId>nicolas-model</artifactId> <version>${project.version}</version> + <scope>runtime</scope> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-lib</artifactId> + <artifactId>nicolas-multiservice</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -83,11 +80,6 @@ <artifactId>nicolas-train</artifactId> <version>${project.version}</version> </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-multiservice</artifactId> - <version>${project.version}</version> - </dependency> <!-- internal --> <dependency>