From 1a009dd0c4f78b9367ce117f0edd6e982cb4ebdf Mon Sep 17 00:00:00 2001 From: Mateusz Kopeć <m.kopec@ipipan.waw.pl> Date: Wed, 15 Mar 2017 23:08:04 +0100 Subject: [PATCH] clean up modules --- README.md | 4 ++++ eval.sh | 2 ++ nicolas-cli/pom.xml | 5 +++++ nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java | 2 +- nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java | 6 +++--- nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java | 2 +- nicolas-common/pom.xml | 43 ------------------------------------------- nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java | 28 ---------------------------- nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java | 61 ------------------------------------------------------------- nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java | 185 ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java | 35 ----------------------------------- nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java | 22 ---------------------- nicolas-common/src/test/resources/199704210011.bin | Bin 497720 -> 0 bytes nicolas-eval/pom.xml | 56 -------------------------------------------------------- nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java | 24 ------------------------ nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java | 15 --------------- nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java | 80 -------------------------------------------------------------------------------- nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java | 131 ----------------------------------------------------------------------------------------------------------------------------------- nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt | 154 ---------------------------------------------------------------------------------------------------------------------------------------------------------- nicolas-lib/pom.xml | 14 +++++--------- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java | 26 ++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java | 63 --------------------------------------------------------------- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java | 14 +++++++------- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java | 127 ------------------------------------------------------------------------------------------------------------------------------- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java | 42 ++++++++++++++++++++++++++++++++++++++---- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java | 12 ++++-------- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java | 5 ++--- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java | 5 ++--- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java | 41 +++++++++++++++++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java | 35 +++++++++++++++++++++++++++++++++++ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java | 2 +- nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java | 6 +++--- nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java | 7 ++++--- nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java | 22 ++++++++++++++++++++++ nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java | 2 +- nicolas-lib/src/test/resources/199704210011.bin | Bin 0 -> 497720 bytes nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore | 1 + nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md | 1 + nicolas-multiservice/pom.xml | 6 +++++- nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java | 4 ++-- nicolas-train/pom.xml | 4 ---- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java | 24 ++++++++++++++++++++++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java | 15 +++++++++++++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java | 79 ------------------------------------------------------------------------------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java | 42 ++++++++++++++++++++++++++++++++++++++---- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java | 9 +++++---- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java | 2 +- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java | 4 ++-- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java | 2 +- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java | 2 +- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java | 12 ++++++++---- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java | 13 ++++++------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java | 2 +- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java | 4 ++-- pom.xml | 18 +++++------------- 61 files changed, 823 insertions(+), 1197 deletions(-) create mode 100644 README.md create mode 100755 eval.sh delete mode 100644 nicolas-common/pom.xml delete mode 100644 nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java delete mode 100644 nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java delete mode 100644 nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java delete mode 100644 nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java delete mode 100644 nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java delete mode 100644 nicolas-common/src/test/resources/199704210011.bin delete mode 100644 nicolas-eval/pom.xml delete mode 100644 nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java delete mode 100644 nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java delete mode 100644 nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java delete mode 100644 nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java delete mode 100644 nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java delete mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java delete mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java create mode 100644 nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java create mode 100644 nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java create mode 100644 nicolas-lib/src/test/resources/199704210011.bin create mode 100644 nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore create mode 100644 nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java delete mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java diff --git a/README.md b/README.md new file mode 100644 index 0000000..bd6e339 --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# Nicolas + +Summarization tool, using coreference information as main source of information for content selection. + diff --git a/eval.sh b/eval.sh new file mode 100755 index 0000000..20d602b --- /dev/null +++ b/eval.sh @@ -0,0 +1,2 @@ +#!/usr/bin/env bash + diff --git a/nicolas-cli/pom.xml b/nicolas-cli/pom.xml index 5062880..21a84ab 100644 --- a/nicolas-cli/pom.xml +++ b/nicolas-cli/pom.xml @@ -22,6 +22,11 @@ <groupId>pl.waw.ipipan.zil.summ</groupId> <artifactId>nicolas-lib</artifactId> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-model</artifactId> + <scope>runtime</scope> + </dependency> <!-- third party --> <dependency> diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java index 4adaa48..ec8d684 100644 --- a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java +++ b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java @@ -5,9 +5,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; import pl.waw.ipipan.zil.summ.nicolas.NicolasException; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; import java.io.*; diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java index 2509618..3ad99ae 100644 --- a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java @@ -5,10 +5,10 @@ import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.Nicolas; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.File; import java.io.FileInputStream; @@ -29,7 +29,7 @@ public class ClientTest { @Test public void processSampleText() throws Exception { Preprocessor preprocessor = mock(Preprocessor.class); - TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); + TText ttext = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); when(preprocessor.preprocess(any())).thenReturn(ttext); Nicolas nicolas = mock(Nicolas.class); diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java index 4067383..51cd8a9 100644 --- a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java @@ -4,7 +4,7 @@ import org.apache.commons.io.IOUtils; import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import java.io.File; import java.io.FileInputStream; diff --git a/nicolas-common/pom.xml b/nicolas-common/pom.xml deleted file mode 100644 index e4678c3..0000000 --- a/nicolas-common/pom.xml +++ /dev/null @@ -1,43 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <artifactId>nicolas-container</artifactId> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <version>1.0-SNAPSHOT</version> - </parent> - - <artifactId>nicolas-common</artifactId> - - <dependencies> - <!-- internal --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>pscapi</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.multiservice</groupId> - <artifactId>utils</artifactId> - </dependency> - - <!-- third party --> - <dependency> - <groupId>nz.ac.waikato.cms.weka</groupId> - <artifactId>weka-stable</artifactId> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - </dependency> - - <!-- logging --> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - - </dependencies> - -</project> \ No newline at end of file diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java deleted file mode 100644 index 4d2ab97..0000000 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Constants.java +++ /dev/null @@ -1,28 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import com.google.common.base.Charsets; -import com.google.common.collect.ImmutableList; - -import java.nio.charset.Charset; - - -public class Constants { - - private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; - - private static final String MODELS_PATH = ROOT_PATH + "models/"; - public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; - public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; - public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; - - private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; - public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; - - public static final Charset ENCODING = Charsets.UTF_8; - - public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); - - private Constants() { - } - -} diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java deleted file mode 100644 index 0efb18c..0000000 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/ThriftUtils.java +++ /dev/null @@ -1,61 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import com.google.common.base.Predicates; -import com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; - -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.Map; -import java.util.function.Predicate; - -public class ThriftUtils { - - private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); - - private ThriftUtils() { - } - - public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) { - Map<String, TText> id2text = Maps.newHashMap(); - File[] files = folder.listFiles(); - if (files != null) { - for (File processedFullTextFile : files) { - String textId = processedFullTextFile.getName().split("\\.")[0]; - if (!idFilter.test(textId)) - continue; - TText processedFullText = loadThriftTextFromFile(processedFullTextFile); - id2text.put(textId, processedFullText); - } - } - LOG.info("{} preprocessed texts found.", id2text.size()); - return id2text; - } - - public static Map<String, TText> loadThriftTextsFromFolder(File folder) { - return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue()); - } - - public static TText loadThriftTextFromFile(File originalFile) { - try (FileInputStream inputStream = new FileInputStream(originalFile)) { - return loadThriftTextFromStream(inputStream); - } catch (IOException e) { - LOG.error("Error reading serialized Thrift file", e); - return null; - } - } - - public static TText loadThriftTextFromStream(InputStream stream) { - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { - return (TText) ois.readObject(); - } catch (ClassNotFoundException | IOException e) { - LOG.error("Error reading serialized Thrift stream", e); - return null; - } - } - -} diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java deleted file mode 100644 index ad7cbb0..0000000 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java +++ /dev/null @@ -1,185 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import org.apache.commons.io.IOUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; -import weka.classifiers.Classifier; -import weka.core.Attribute; -import weka.core.Instances; - -import java.io.*; -import java.util.*; -import java.util.function.Function; -import java.util.stream.Collectors; - -public class Utils { - - private static final Logger LOG = LoggerFactory.getLogger(Utils.class); - - private static final String DATASET_NAME = "Dataset"; - - private Utils() { - } - - public static void writeStringToFile(String string, File file) throws IOException { - try (BufferedWriter bw = new BufferedWriter(new FileWriter(file))) { - bw.append(string); - } - } - - public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { - LOG.info("Loading classifier from path: {}...", modelResourcePath); - try (InputStream stream = Utils.class.getResourceAsStream(modelResourcePath)) { - if (stream == null) { - throw new IOException("Model not found at: " + modelResourcePath); - } - try (ObjectInputStream ois = new ObjectInputStream(stream)) { - Classifier classifier = (Classifier) ois.readObject(); - LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); - return classifier; - } catch (ClassNotFoundException e) { - LOG.error("Error loading serialized classifier, class not found.", e); - throw new IOException(e); - } - } - } - - public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { - return (TText) ois.readObject(); - } catch (ClassNotFoundException e) { - LOG.error("Error reading serialized thrift text file, class not found.", e); - throw new IOException(e); - } - } - - public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { - try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { - if (stream == null) { - throw new IOException("Resource not found at: " + textResourcePath); - } - return loadThriftTextFromStream(stream); - } - } - - public static List<String> loadLinesFromResource(String resourcePath) throws IOException { - try (InputStream stream = Utils.class.getResourceAsStream(resourcePath)) { - return IOUtils.readLines(stream, Constants.ENCODING); - } - } - - @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList - public static Instances createNewInstances(ArrayList<Attribute> attributesList) { - Instances instances = new Instances(DATASET_NAME, attributesList, 0); - instances.setClassIndex(0); - return instances; - } - - public static Classifier loadClassifierFromResource(String resourcePath) throws IOException, ClassNotFoundException { - LOG.info("Loading classifier..."); - try (ObjectInputStream ois = new ObjectInputStream(Utils.class.getResourceAsStream(resourcePath))) { - Classifier classifier = (Classifier) ois.readObject(); - LOG.info("Done. " + classifier.toString()); - return classifier; - } - } - - public static List<String> tokenize(String text) { - return Arrays.asList(text.split("[^\\p{L}0-9]+")); - } - - public static List<String> tokenizeOnWhitespace(String text) { - return Arrays.asList(text.split(" +")); - } - - public static Map<TMention, String> loadMention2HeadOrth(List<TSentence> sents) { - Map<TMention, String> mention2orth = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, String> tokId2orth = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::getOrth)); - Map<String, Boolean> tokId2nps = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, TToken::isNoPrecedingSpace)); - - for (TMention m : s.getMentions()) { - StringBuffer mentionOrth = new StringBuffer(); - for (String tokId : m.getHeadIds()) { - if (!tokId2nps.get(tokId)) - mentionOrth.append(" "); - mentionOrth.append(tokId2orth.get(tokId)); - } - mention2orth.put(m, mentionOrth.toString().trim()); - } - } - return mention2orth; - } - - private static final Collection<String> STOPWORDS = Sets.newHashSet(); - - static { - STOPWORDS.addAll(Lists.newArrayList("i", "się", "to", "co")); - } - - public static Map<TMention, String> loadMention2Orth(List<TSentence> sents, boolean discardStopwords) { - Map<TMention, String> mention2orth = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); - - for (TMention m : s.getMentions()) { - StringBuffer mentionOrth = new StringBuffer(); - for (String tokId : m.getChildIds()) { - TToken token = tokId2tok.get(tokId); - if (discardStopwords && STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { - continue; - } - - if (!token.isNoPrecedingSpace()) - mentionOrth.append(" "); - mentionOrth.append(token.getOrth()); - } - mention2orth.put(m, mentionOrth.toString().trim()); - } - } - return mention2orth; - } - - public static Map<TMention, String> loadMention2Base(List<TSentence> sents) { - Map<TMention, String> mention2base = Maps.newHashMap(); - for (TSentence s : sents) { - Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(tok -> tok.getId(), tok -> tok.getChosenInterpretation().getBase())); - - for (TMention m : s.getMentions()) { - StringBuilder mentionBase = new StringBuilder(); - for (String tokId : m.getChildIds()) { - mentionBase.append(" "); - mentionBase.append(tokId2base.get(tokId)); - } - mention2base.put(m, mentionBase.toString().toLowerCase().trim()); - } - } - return mention2base; - } - - public static String loadSentence2Orth(TSentence sentence) { - return loadSentence2Orth(sentence, Sets.newHashSet()); - } - - public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { - StringBuilder sb = new StringBuilder(); - for (TToken token : sentence.getTokens()) { - if (tokenIdsToSkip.contains(token.getId())) { - System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); - continue; - } - if (!token.isNoPrecedingSpace()) - sb.append(" "); - sb.append(token.getOrth()); - } - return sb.toString().trim(); - } - -} \ No newline at end of file diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java deleted file mode 100644 index fbbb2a9..0000000 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/VersionIgnoringObjectInputStream.java +++ /dev/null @@ -1,35 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import java.io.IOException; -import java.io.InputStream; -import java.io.ObjectInputStream; -import java.io.ObjectStreamClass; - - -public class VersionIgnoringObjectInputStream extends ObjectInputStream { - - VersionIgnoringObjectInputStream(InputStream in) throws IOException { - super(in); - } - - @Override - @SuppressWarnings("squid:S1166") - protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { - ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor - Class localClass; // the class in the local JVM that this descriptor represents. - try { - localClass = Class.forName(resultClassDescriptor.getName()); - } catch (ClassNotFoundException e) { - return resultClassDescriptor; - } - ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass); - if (localClassDescriptor != null) { // only if class implements serializable - final long localSUID = localClassDescriptor.getSerialVersionUID(); - final long streamSUID = resultClassDescriptor.getSerialVersionUID(); - if (streamSUID != localSUID) { // check for serialVersionUID mismatch. - resultClassDescriptor = localClassDescriptor; // Use local class descriptor for deserialization - } - } - return resultClassDescriptor; - } -} diff --git a/nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java b/nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java deleted file mode 100644 index ce09aab..0000000 --- a/nicolas-common/src/test/java/pl/waw/ipipan/zil/summ/nicolas/common/UtilsTest.java +++ /dev/null @@ -1,22 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.common; - -import org.junit.Test; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; - -import java.io.InputStream; - -import static org.junit.Assert.assertEquals; - -public class UtilsTest { - - private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; - - @Test - public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { - try (InputStream stream = UtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { - TText text = ThriftUtils.loadThriftTextFromStream(stream); - assertEquals(26, text.getParagraphs().size()); - assertEquals(2, text.getParagraphs().get(4).getSentences().size()); - } - } -} \ No newline at end of file diff --git a/nicolas-common/src/test/resources/199704210011.bin b/nicolas-common/src/test/resources/199704210011.bin deleted file mode 100644 index cf072c2..0000000 Binary files a/nicolas-common/src/test/resources/199704210011.bin and /dev/null differ diff --git a/nicolas-eval/pom.xml b/nicolas-eval/pom.xml deleted file mode 100644 index 31a9981..0000000 --- a/nicolas-eval/pom.xml +++ /dev/null @@ -1,56 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<project xmlns="http://maven.apache.org/POM/4.0.0" - xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <parent> - <artifactId>nicolas-container</artifactId> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <version>1.0-SNAPSHOT</version> - </parent> - <modelVersion>4.0.0</modelVersion> - - <artifactId>nicolas-eval</artifactId> - - <dependencies> - <!-- project --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-lib</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - - <!-- internal --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>eval</artifactId> - </dependency> - - <!-- third party --> - <dependency> - <groupId>nz.ac.waikato.cms.weka</groupId> - <artifactId>weka-stable</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-lang3</artifactId> - </dependency> - <dependency> - <groupId>com.google.guava</groupId> - <artifactId>guava</artifactId> - </dependency> - - <!-- logging --> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-simple</artifactId> - </dependency> - - </dependencies> -</project> \ No newline at end of file diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java b/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java deleted file mode 100644 index 80ac0a8..0000000 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java +++ /dev/null @@ -1,24 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.eval; - -import org.apache.commons.io.IOUtils; - -import java.io.IOException; -import java.io.InputStream; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -public class Constants { - - private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; - - private Constants() { - } - - public static Set<String> loadTestTextIds() throws IOException { - try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { - List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.common.Constants.ENCODING); - return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); - } - } -} diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java b/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java deleted file mode 100644 index de33cae..0000000 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java +++ /dev/null @@ -1,15 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.eval; - -import pl.waw.ipipan.zil.summ.eval.Main; - -public class Evaluate { - - private Evaluate() { - } - - public static void main(String[] args) { - String goldDirPath = "data/summaries-gold"; - String systemDirPath = "data/summaries"; - Main.main(new String[]{goldDirPath, systemDirPath}); - } -} \ No newline at end of file diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java b/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java deleted file mode 100644 index df1ccb8..0000000 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java +++ /dev/null @@ -1,80 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.eval; - -import com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.Nicolas; -import pl.waw.ipipan.zil.summ.nicolas.NicolasException; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; - -import java.io.File; -import java.io.IOException; -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static java.util.stream.Collectors.toList; -import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; - -public class SummarizeTestCorpus { - - private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); - - - private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; - private static final double SUMMARY_RATIO = 0.2; - - private SummarizeTestCorpus() { - } - - public static void main(String[] args) throws IOException, NicolasException { - File thriftedCorpusDir = new File("data/preprocessed"); - File targetDir = new File("data/summaries"); - targetDir.mkdir(); - - Set<String> testTextIds = loadTestTextIds(); - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); - LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); - - Map<String, String> id2summary = summarizeTexts(id2preprocessedText); - LOG.info("Texts summarized."); - - saveSummariesToFolder(id2summary, targetDir); - LOG.info("Texts saved to {} folder.", targetDir); - } - - private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { - Map<String, String> id2summary = Maps.newHashMap(); - Nicolas nicolas = new Nicolas(); - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { - TText text = entry.getValue(); - int targetSize = calculateTargetSize(text); - String summary = nicolas.summarizeThrift(text, targetSize); - id2summary.put(entry.getKey(), summary); - } - return id2summary; - } - - private static int calculateTargetSize(TText text) { - List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - StringBuilder body = new StringBuilder(); - for (TSentence sentence : sentences) - body.append(Utils.loadSentence2Orth(sentence)).append(" "); - - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); - return (int) (SUMMARY_RATIO * tokenCount); - } - - private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { - for (Map.Entry<String, String> entry : id2summary.entrySet()) { - String textId = entry.getKey(); - String summary = entry.getValue(); - String targetFileName = textId + SUMMARY_FILE_SUFFIX; - Utils.writeStringToFile(summary, new File(targetDir, targetFileName)); - } - } - -} diff --git a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java b/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java deleted file mode 100644 index 5cba028..0000000 --- a/nicolas-eval/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java +++ /dev/null @@ -1,131 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.eval.search; - -import org.apache.commons.lang3.time.StopWatch; -import org.apache.commons.lang3.tuple.Pair; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import weka.classifiers.Classifier; -import weka.classifiers.bayes.BayesNet; -import weka.classifiers.bayes.NaiveBayes; -import weka.classifiers.evaluation.Evaluation; -import weka.classifiers.functions.LinearRegression; -import weka.classifiers.functions.Logistic; -import weka.classifiers.functions.SimpleLogistic; -import weka.classifiers.lazy.IBk; -import weka.classifiers.lazy.KStar; -import weka.classifiers.lazy.LWL; -import weka.classifiers.meta.AttributeSelectedClassifier; -import weka.classifiers.rules.DecisionTable; -import weka.classifiers.rules.JRip; -import weka.classifiers.rules.PART; -import weka.classifiers.rules.ZeroR; -import weka.classifiers.trees.HoeffdingTree; -import weka.classifiers.trees.J48; -import weka.classifiers.trees.LMT; -import weka.classifiers.trees.RandomForest; -import weka.core.Instances; -import weka.core.converters.ArffLoader; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.Comparator; -import java.util.Optional; -import java.util.Random; -import java.util.logging.LogManager; - - -class Crossvalidate { - - private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); - - private static final int NUM_FOLDS = 10; - - private Crossvalidate() { - } - - static void crossvalidateClassifiers(String datasetPath) throws IOException { - Instances instances = loadInstances(datasetPath); - crossvalidateClassification(instances); - } - - static void crossvalidateRegressors(String datasetPath) throws IOException { - Instances instances = loadInstances(datasetPath); - crossvalidateRegression(instances); - } - - private static Instances loadInstances(String datasetPath) throws IOException { - LogManager.getLogManager().reset(); // disable WEKA logging - - ArffLoader loader = new ArffLoader(); - loader.setFile(new File(datasetPath)); - Instances instances = loader.getDataSet(); - instances.setClassIndex(0); - LOG.info("{} instances loaded.", instances.size()); - LOG.info("{} attributes for each instance.", instances.numAttributes()); - return instances; - } - - private static void crossvalidateClassification(Instances instances) throws IOException { - StopWatch watch = new StopWatch(); - watch.start(); - - Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{new J48(), new RandomForest(), new HoeffdingTree(), new LMT(), - new Logistic(), new ZeroR(), - new SimpleLogistic(), new BayesNet(), new NaiveBayes(), - new KStar(), new IBk(), new LWL(), - new DecisionTable(), new JRip(), new PART(), - createAttributeSelectedClassifier()}).parallel().map(cls -> { - String name = cls.getClass().getSimpleName(); - double acc; - Evaluation eval; - try { - eval = new Evaluation(instances); - eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); - } catch (Exception e) { - LOG.error("Error evaluating model", e); - return Pair.of(0.0, name); - } - acc = eval.correct() / eval.numInstances(); - LOG.info(name + " : " + acc); - return Pair.of(acc, name); - }).max(Comparator.comparingDouble(Pair::getLeft)); - LOG.info("#########"); - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); - - watch.stop(); - LOG.info("Elapsed time: {}", watch); - } - - - private static Classifier createAttributeSelectedClassifier() { - AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier(); - attributeSelectedClassifier.setClassifier(new LMT()); - return attributeSelectedClassifier; - } - - private static void crossvalidateRegression(Instances instances) { - StopWatch watch = new StopWatch(); - watch.start(); - - Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ - new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { - double acc = 0; - String name = cls.getClass().getSimpleName(); - try { - Evaluation eval = new Evaluation(instances); - eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); - acc = eval.correlationCoefficient(); - } catch (Exception e) { - LOG.error("Error evaluating model", e); - } - LOG.info(name + " : " + acc); - return Pair.of(acc, name); - }).max(Comparator.comparingDouble(Pair::getLeft)); - LOG.info("#########"); - LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); - - watch.stop(); - LOG.info("Elapsed time: {}", watch); - } -} diff --git a/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt b/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt deleted file mode 100644 index d0c556d..0000000 --- a/nicolas-eval/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt +++ /dev/null @@ -1,154 +0,0 @@ -199704210012 -199704210042 -199704220007 -199704220018 -199704220021 -199704220044 -199704230006 -199704230014 -199704230029 -199704230043 -199704240008 -199704240019 -199704240020 -199704240021 -199704250018 -199704250022 -199704260014 -199704260015 -199704260016 -199704280023 -199704280025 -199704280027 -199704280031 -199704300031 -199704300042 -199704300046 -199801020010 -199801020031 -199801020035 -199801020070 -199801020076 -199801020079 -199801030068 -199801030090 -199801030091 -199801030129 -199801030148 -199801030158 -199801050023 -199801050059 -199801130087 -199801130129 -199801140182 -199801160119 -199801200106 -199801220140 -199801240061 -199801240096 -199801260047 -199801260070 -199801270055 -199801270110 -199801280123 -199801280158 -199801280159 -199801280241 -199801290022 -199801310003 -199801310037 -199802030127 -199802040159 -199802040182 -199802040202 -199805220133 -199808280158 -199901190073 -199901190115 -199901250112 -199901250117 -199901270103 -199901270120 -199901270122 -199901290095 -199901300101 -199902240095 -199906220029 -199906230024 -199906240084 -199906260027 -199907050045 -199907050076 -199907140166 -199907200002 -199907270004 -199908260001 -199909090036 -199909250018 -199909270029 -199910020027 -199910020029 -199910270011 -199911060044 -199911100038 -199911100064 -199911200030 -199911220063 -199912020060 -199912180026 -199912180034 -199912220030 -199912280024 -199912280046 -199912300021 -199912300029 -200001030029 -200001030053 -200001060034 -200001100035 -200001100046 -200001170029 -200001170033 -200001170060 -200001290045 -200002220027 -200002240034 -200002250031 -200003060062 -200003110050 -200004280047 -200004290022 -200006050119 -200006260079 -200006290045 -200007150033 -200008040076 -200008220042 -200008220046 -200010130049 -200010160054 -200012130034 -200012140084 -200012290046 -200104040019 -200106050035 -200108180109 -200108300032 -200111120045 -200111150042 -200111150047 -200111200036 -200111270049 -200112030055 -200112280057 -200201220038 -200201220050 -200202020036 -200202200032 -200202210054 -200202270044 -200203010070 -200203190026 -200203260050 -200203280017 -200203290078 diff --git a/nicolas-lib/pom.xml b/nicolas-lib/pom.xml index 6cb91d6..f7ea7e6 100644 --- a/nicolas-lib/pom.xml +++ b/nicolas-lib/pom.xml @@ -12,15 +12,6 @@ <artifactId>nicolas-lib</artifactId> <dependencies> - <!-- project --> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-model</artifactId> - </dependency> <!-- internal --> <dependency> @@ -61,5 +52,10 @@ <groupId>junit</groupId> <artifactId>junit</artifactId> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-model</artifactId> + <scope>test</scope> + </dependency> </dependencies> </project> \ No newline at end of file diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java new file mode 100644 index 0000000..401e396 --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Constants.java @@ -0,0 +1,26 @@ +package pl.waw.ipipan.zil.summ.nicolas; + +import com.google.common.collect.ImmutableList; + +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; + +public class Constants { + + public static final ImmutableList<String> POS_TAGS = ImmutableList.of("end", "other", "null", "impt", "imps", "inf", "pred", "subst", "aglt", "ppron3", "ger", "praet", "fin", "num", "interp", "siebie", "brev", "interj", "ppron12", "adj", "burk", "pcon", "bedzie", "adv", "prep", "depr", "xxx", "winien", "conj", "qub", "adja", "ppas", "comp", "pact"); + public static final Charset ENCODING = StandardCharsets.UTF_8; + + private static final String ROOT_PATH = "/pl/waw/ipipan/zil/summ/nicolas/"; + private static final String MODELS_PATH = ROOT_PATH + "models/"; + + public static final String MENTION_MODEL_RESOURCE_PATH = MODELS_PATH + "mention_model.bin"; + public static final String SENTENCE_MODEL_RESOURCE_PATH = MODELS_PATH + "sentence_model.bin"; + public static final String ZERO_MODEL_RESOURCE_PATH = MODELS_PATH + "zero_model.bin"; + + private static final String RESOURCES_PATH = ROOT_PATH + "resources/"; + public static final String FREQUENT_BASES_RESOURCE_PATH = RESOURCES_PATH + "frequent_bases.txt"; + public static final String STOPWORDS_PATH = RESOURCES_PATH + "stopwords.txt"; + + private Constants() { + } +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java deleted file mode 100644 index 8459d82..0000000 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/InstanceUtils.java +++ /dev/null @@ -1,63 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas; - -import com.google.common.collect.Maps; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; -import weka.core.Attribute; -import weka.core.DenseInstance; -import weka.core.Instance; - -import java.util.List; -import java.util.Map; -import java.util.Set; - -import static java.util.stream.Collectors.toList; - -public class InstanceUtils { - - private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); - - private InstanceUtils() { - } - - public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); - - LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size()); - Map<TMention, Instance> mention2instance = Maps.newHashMap(); - for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); - Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); - for (Attribute attribute : featureExtractor.getAttributesList()) { - instance.setValue(attribute, mentionFeatures.get(attribute)); - } - mention2instance.put(tMention, instance); - } - LOG.info("Extracted features of {} mentions.", mention2instance.size()); - return mention2instance; - } - - public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { - List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); - - LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size()); - Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); - for (TSentence sentence : sentences) { - Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); - Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); - for (Attribute attribute : featureExtractor.getAttributesList()) { - instance.setValue(attribute, sentenceFeatures.get(attribute)); - } - sentence2instance.put(sentence, instance); - } - LOG.info("Extracted features of {} sentences.", sentence2instance.size()); - return sentence2instance; - } -} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java index f432020..79d3e34 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java @@ -5,12 +5,12 @@ import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceModel; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; import weka.classifiers.Classifier; @@ -31,9 +31,9 @@ public class Nicolas { public Nicolas() throws NicolasException { try { - mentionModel = Utils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); - sentenceModel = Utils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); - zeroModel = Utils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); + mentionModel = ResourceUtils.loadModelFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); + sentenceModel = ResourceUtils.loadModelFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); + zeroModel = ResourceUtils.loadModelFromResource(Constants.ZERO_MODEL_RESOURCE_PATH); mentionFeatureExtractor = new MentionFeatureExtractor(); sentenceFeatureExtractor = new SentenceFeatureExtractor(); @@ -57,7 +57,7 @@ public class Nicolas { StringBuilder sb = new StringBuilder(); for (TSentence sent : selectedSentences) { - sb.append(" ").append(Utils.loadSentence2Orth(sent)); + sb.append(" ").append(TextUtils.loadSentence2Orth(sent)); } return sb.toString().trim(); } @@ -74,7 +74,7 @@ public class Nicolas { Random r = new Random(1); Set<TSentence> summary = Sets.newHashSet(); for (TSentence sent : sortedSentences) { - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); + size += TextUtils.tokenizeOnWhitespace(TextUtils.loadSentence2Orth(sent)).size(); if (r.nextDouble() > 0.4 && size > targetSize) break; summary.add(sent); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java deleted file mode 100644 index 47b20ea..0000000 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/apply/ApplyModel.java +++ /dev/null @@ -1,127 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.apply; - -import com.google.common.collect.Lists; -import com.google.common.collect.Maps; -import com.google.common.collect.Sets; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; -import pl.waw.ipipan.zil.summ.nicolas.mention.MentionModel; -import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; -import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectInjector; -import weka.classifiers.Classifier; -import weka.core.Instance; -import weka.core.Instances; - -import java.io.BufferedWriter; -import java.io.File; -import java.io.FileWriter; -import java.util.*; - -import static java.util.stream.Collectors.toList; - -public class ApplyModel { - - private static final Logger LOG = LoggerFactory.getLogger(ApplyModel.class); - - private static final String TEST_PREPROCESSED_DATA_PATH = "corpora/preprocessed_full_texts/test"; - private static final String TARGET_DIR = "corpora/summaries"; - - public static void main(String[] args) throws Exception { - Classifier mentionClassifier = Utils.loadClassifierFromResource(Constants.MENTION_MODEL_RESOURCE_PATH); - MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); - - Classifier sentenceClassifier = Utils.loadClassifierFromResource(Constants.SENTENCE_MODEL_RESOURCE_PATH); - SentenceFeatureExtractor sentenceFeatureExtractor = new SentenceFeatureExtractor(); - - ZeroSubjectInjector zeroSubjectInjector = new ZeroSubjectInjector(); - - Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(new File(TEST_PREPROCESSED_DATA_PATH)); - int i = 1; - double avgSize = 0; - for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { - TText text = entry.getValue(); - - Set<TMention> goodMentions - = MentionModel.detectGoodMentions(mentionClassifier, featureExtractor, text); - - int targetSize = calculateTargetSize(text); - String summary = calculateSummary(text, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor, zeroSubjectInjector); - int size = Utils.tokenize(summary).size(); - avgSize += size; - try (BufferedWriter bw = new BufferedWriter(new FileWriter(new File(TARGET_DIR, entry.getKey() + "_emily4.txt")))) { - bw.append(summary); - } - - LOG.info(i++ + "/" + id2preprocessedText.size() + " id: " + entry.getKey()); - } - - LOG.info("Avg size:" + avgSize / id2preprocessedText.size()); - } - - private static int calculateTargetSize(TText text) { - List<TSentence> sents = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - StringBuffer body = new StringBuffer(); - for (TSentence sent : sents) - body.append(Utils.loadSentence2Orth(sent) + " "); - int tokenCount = Utils.tokenizeOnWhitespace(body.toString().trim()).size(); - return (int) (0.2 * tokenCount); - } - - private static String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor, ZeroSubjectInjector zeroSubjectInjector) throws Exception { - List<TSentence> selectedSentences = selectSummarySentences(thrifted, goodMentions, targetSize, sentenceClassifier, sentenceFeatureExtractor); - - Set<String> zeroSubjectTokenIds = zeroSubjectInjector.findZeroSubjectTokenIds(thrifted, selectedSentences); - - StringBuilder sb = new StringBuilder(); - for (TSentence sent : selectedSentences) { - sb.append(" " + Utils.loadSentence2Orth(sent, zeroSubjectTokenIds)); - } - return sb.toString().trim(); - } - - private static List<TSentence> selectSummarySentences(TText thrifted, Set<TMention> goodMentions, int targetSize, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { - - List<TSentence> sents = thrifted.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); - - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); - Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); - - Map<TSentence, Double> sentence2score = Maps.newHashMap(); - for (Map.Entry<TSentence, Instance> entry : sentence2instance.entrySet()) { - Instance instance = entry.getValue(); - instance.setDataset(instances); - double score = sentenceClassifier.classifyInstance(instance); - sentence2score.put(entry.getKey(), score); - } - - List<TSentence> sortedSents = Lists.newArrayList(sents); - sortedSents.sort(Comparator.comparing(sentence2score::get).reversed()); - - int size = 0; - Random r = new Random(1); - Set<TSentence> summary = Sets.newHashSet(); - for (TSentence sent : sortedSents) { - size += Utils.tokenizeOnWhitespace(Utils.loadSentence2Orth(sent)).size(); - if (r.nextDouble() > 0.4 && size > targetSize) - break; - summary.add(sent); - if (size > targetSize) - break; - } - List<TSentence> selectedSentences = Lists.newArrayList(); - for (TSentence sent : sents) { - if (summary.contains(sent)) - selectedSentences.add(sent); - } - return selectedSentences; - } - -} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java index 9ab26a8..0bd02ff 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/features/FeatureHelper.java @@ -3,7 +3,6 @@ package pl.waw.ipipan.zil.summ.nicolas.features; import com.google.common.collect.Maps; import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.*; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import java.util.List; import java.util.Map; @@ -38,7 +37,6 @@ public class FeatureHelper { private final Map<TMention, Integer> mention2indexInSent = Maps.newHashMap(); private final Map<TMention, Integer> mention2firstTokenIndex = Maps.newHashMap(); - public FeatureHelper(TText preprocessedText) { text = preprocessedText; @@ -60,9 +58,9 @@ public class FeatureHelper { int sentIdx = 0; int mentionIdx = 0; for (TParagraph par : preprocessedText.getParagraphs()) { - Map<TMention, String> m2o = Utils.loadMention2Orth(par.getSentences(), false); + Map<TMention, String> m2o = loadMention2Orth(par.getSentences()); mention2Orth.putAll(m2o); - Map<TMention, String> m2b = Utils.loadMention2Base(par.getSentences()); + Map<TMention, String> m2b = loadMention2Base(par.getSentences()); mention2Base.putAll(m2b); int sentIdxInPar = 0; @@ -221,4 +219,40 @@ public class FeatureHelper { return null; return mention2sent.get(mention).getTokens().get(idx - 1); } + + private static Map<TMention, String> loadMention2Orth(List<TSentence> sents) { + Map<TMention, String> mention2orth = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : m.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + mention2orth.put(m, mentionOrth.toString().trim()); + } + } + return mention2orth; + } + + private static Map<TMention, String> loadMention2Base(List<TSentence> sents) { + Map<TMention, String> mention2base = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, String> tokId2base = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, tok -> tok.getChosenInterpretation().getBase())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionBase = new StringBuilder(); + for (String tokId : m.getChildIds()) { + mentionBase.append(" "); + mentionBase.append(tokId2base.get(tokId)); + } + mention2base.put(m, mentionBase.toString().toLowerCase().trim()); + } + } + return mention2base; + } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java index d624f41..9b2b8b5 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionFeatureExtractor.java @@ -3,11 +3,11 @@ package pl.waw.ipipan.zil.summ.nicolas.mention; import com.google.common.collect.Lists; import com.google.common.collect.Maps; import pl.waw.ipipan.zil.multiservice.thrift.types.*; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.features.Interpretation; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; import weka.core.Attribute; import java.io.IOException; @@ -21,7 +21,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { private final List<String> frequentBases; public MentionFeatureExtractor() throws IOException { - frequentBases = loadFrequentBases(); + frequentBases = ResourceUtils.loadFrequentBases(); //coref addNumericAttributeNormalized("chain_length"); @@ -80,10 +80,6 @@ public class MentionFeatureExtractor extends FeatureExtractor { fillSortedAttributes("score"); } - private List<String> loadFrequentBases() throws IOException { - return Utils.loadLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH).stream().map(String::trim).sorted().distinct().collect(Collectors.toList()); - } - private String encodeBase(String base) { return "base_equal_" + base.replaceAll(" ", "_").replaceAll("\"", "Q"); } @@ -177,7 +173,7 @@ public class MentionFeatureExtractor extends FeatureExtractor { Attribute att = getAttributeByName(attributeName); int index = att.indexOfValue(value); if (index == -1) - LOG.warn(value + " not found for attribute " + attributeName); + LOG.warn("{} not found for attribute {}", value, attributeName); attribute2value.put(att, (double) (index == -1 ? att.indexOfValue("other") : index)); } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java index 46a296b..83468e3 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/mention/MentionModel.java @@ -5,8 +5,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -24,7 +23,7 @@ public class MentionModel { public static Set<TMention> detectGoodMentions(Classifier classifier, MentionFeatureExtractor featureExtractor, TText text) throws Exception { Set<TMention> goodMentions = Sets.newHashSet(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); Map<TMention, Instance> mention2instance = InstanceUtils.extractInstancesFromMentions(text, featureExtractor); for (Map.Entry<TMention, Instance> entry : mention2instance.entrySet()) { Instance instance = entry.getValue(); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java index 21117da..dc9cc6f 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/sentence/SentenceModel.java @@ -6,8 +6,7 @@ import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -23,7 +22,7 @@ public class SentenceModel { } public static Map<TSentence, Double> scoreSentences(TText thrifted, Set<TMention> goodMentions, Classifier sentenceClassifier, SentenceFeatureExtractor sentenceFeatureExtractor) throws Exception { - Instances instances = Utils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(sentenceFeatureExtractor.getAttributesList()); Map<TSentence, Instance> sentence2instance = InstanceUtils.extractInstancesFromSentences(thrifted, sentenceFeatureExtractor, goodMentions); Map<TSentence, Double> sentence2score = Maps.newHashMap(); diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java new file mode 100644 index 0000000..7fdf82b --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/InstanceUtils.java @@ -0,0 +1,74 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils; + +import com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; +import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; +import weka.core.Attribute; +import weka.core.DenseInstance; +import weka.core.Instance; +import weka.core.Instances; + +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static java.util.stream.Collectors.toList; + +public class InstanceUtils { + + private static final Logger LOG = LoggerFactory.getLogger(InstanceUtils.class); + + private static final String DATASET_NAME = "Dataset"; + + private InstanceUtils() { + } + + public static Map<TMention, Instance> extractInstancesFromMentions(TText preprocessedText, MentionFeatureExtractor featureExtractor) { + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); + Map<TMention, Map<Attribute, Double>> mention2features = featureExtractor.calculateFeatures(preprocessedText); + + LOG.info("Extracting {} features of each mention.", featureExtractor.getAttributesList().size()); + Map<TMention, Instance> mention2instance = Maps.newHashMap(); + for (TMention tMention : sentences.stream().flatMap(s -> s.getMentions().stream()).collect(toList())) { + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); + Map<Attribute, Double> mentionFeatures = mention2features.get(tMention); + for (Attribute attribute : featureExtractor.getAttributesList()) { + instance.setValue(attribute, mentionFeatures.get(attribute)); + } + mention2instance.put(tMention, instance); + } + LOG.info("Extracted features of {} mentions.", mention2instance.size()); + return mention2instance; + } + + public static Map<TSentence, Instance> extractInstancesFromSentences(TText preprocessedText, SentenceFeatureExtractor featureExtractor, Set<TMention> goodMentions) { + List<TSentence> sentences = preprocessedText.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); + Map<TSentence, Map<Attribute, Double>> sentence2features = featureExtractor.calculateFeatures(preprocessedText, goodMentions); + + LOG.info("Extracting {} features of each sentence.", featureExtractor.getAttributesList().size()); + Map<TSentence, Instance> sentence2instance = Maps.newHashMap(); + for (TSentence sentence : sentences) { + Instance instance = new DenseInstance(featureExtractor.getAttributesList().size()); + Map<Attribute, Double> sentenceFeatures = sentence2features.get(sentence); + for (Attribute attribute : featureExtractor.getAttributesList()) { + instance.setValue(attribute, sentenceFeatures.get(attribute)); + } + sentence2instance.put(sentence, instance); + } + LOG.info("Extracted features of {} sentences.", sentence2instance.size()); + return sentence2instance; + } + + @SuppressWarnings("squid:S1319") //weka requires explicit ArrayList + public static Instances createNewInstances(ArrayList<Attribute> attributesList) { + Instances instances = new Instances(DATASET_NAME, attributesList, 0); + instances.setClassIndex(0); + return instances; + } +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java new file mode 100644 index 0000000..acdf7d2 --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/ResourceUtils.java @@ -0,0 +1,62 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.summ.nicolas.Constants; +import weka.classifiers.Classifier; + +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.util.List; +import java.util.function.Predicate; +import java.util.stream.Collectors; + +public class ResourceUtils { + + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtils.class); + + private ResourceUtils() { + } + + public static List<String> loadFrequentBases() throws IOException { + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.FREQUENT_BASES_RESOURCE_PATH); + } + + public static List<String> loadStopwords() throws IOException { + return loadUniqueLowercaseSortedNonemptyLinesFromResource(Constants.STOPWORDS_PATH); + } + + public static Classifier loadModelFromResource(String modelResourcePath) throws IOException { + LOG.info("Loading classifier from path: {}...", modelResourcePath); + try (InputStream stream = ResourceUtils.class.getResourceAsStream(modelResourcePath)) { + if (stream == null) { + throw new IOException("Model not found at: " + modelResourcePath); + } + try (ObjectInputStream ois = new ObjectInputStream(stream)) { + Classifier classifier = (Classifier) ois.readObject(); + LOG.info("Done. Loaded classifier: {}", classifier.getClass().getSimpleName()); + return classifier; + } catch (ClassNotFoundException e) { + LOG.error("Error loading serialized classifier, class not found.", e); + throw new IOException(e); + } + } + } + + private static List<String> loadUniqueLowercaseSortedNonemptyLinesFromResource(String resourcePath) throws IOException { + try (InputStream stream = ResourceUtils.class.getResourceAsStream(resourcePath)) { + return IOUtils.readLines(stream, Constants.ENCODING) + .stream() + .map(String::trim) + .map(String::toLowerCase) + .filter(((Predicate<String>) String::isEmpty).negate()) + .sorted() + .distinct() + .collect(Collectors.toList()); + } + } + + +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java new file mode 100644 index 0000000..d561a70 --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/TextUtils.java @@ -0,0 +1,41 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils; + +import com.google.common.collect.Sets; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; + +import java.util.Arrays; +import java.util.List; +import java.util.Set; + +public class TextUtils { + + private TextUtils() { + } + + public static List<String> tokenize(String text) { + return Arrays.asList(text.split("[^\\p{L}0-9]+")); + } + + public static List<String> tokenizeOnWhitespace(String text) { + return Arrays.asList(text.split(" +")); + } + + public static String loadSentence2Orth(TSentence sentence) { + return loadSentence2Orth(sentence, Sets.newHashSet()); + } + + public static String loadSentence2Orth(TSentence sentence, Set<String> tokenIdsToSkip) { + StringBuilder sb = new StringBuilder(); + for (TToken token : sentence.getTokens()) { + if (tokenIdsToSkip.contains(token.getId())) { + System.out.println("Skipping " + token.getOrth() + " in sentence: " + loadSentence2Orth(sentence)); + continue; + } + if (!token.isNoPrecedingSpace()) + sb.append(" "); + sb.append(token.getOrth()); + } + return sb.toString().trim(); + } +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java new file mode 100644 index 0000000..9835fae --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtils.java @@ -0,0 +1,69 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; + +import com.google.common.base.Predicates; +import com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Map; +import java.util.function.Predicate; + +public class ThriftUtils { + + private static final Logger LOG = LoggerFactory.getLogger(ThriftUtils.class); + + private ThriftUtils() { + } + + public static Map<String, TText> loadThriftTextsFromFolder(File folder, Predicate<String> idFilter) { + Map<String, TText> id2text = Maps.newHashMap(); + File[] files = folder.listFiles(); + if (files != null) { + for (File processedFullTextFile : files) { + String textId = processedFullTextFile.getName().split("\\.")[0]; + if (!idFilter.test(textId)) + continue; + TText processedFullText = loadThriftTextFromFile(processedFullTextFile); + id2text.put(textId, processedFullText); + } + } + LOG.info("{} preprocessed texts found.", id2text.size()); + return id2text; + } + + public static Map<String, TText> loadThriftTextsFromFolder(File folder) { + return loadThriftTextsFromFolder(folder, Predicates.alwaysTrue()); + } + + public static TText loadThriftTextFromFile(File originalFile) { + try (FileInputStream inputStream = new FileInputStream(originalFile)) { + return loadThriftTextFromStream(inputStream); + } catch (IOException e) { + LOG.error("Error reading serialized Thrift file", e); + return null; + } + } + + public static TText loadThriftTextFromStream(InputStream stream) { + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { + return (TText) ois.readObject(); + } catch (ClassNotFoundException | IOException e) { + LOG.error("Error reading serialized Thrift stream", e); + return null; + } + } + + public static TText loadThriftTextFromResource(String resourcePath) { + try (InputStream stream = ThriftUtils.class.getResourceAsStream(resourcePath)) { + return loadThriftTextFromStream(stream); + } catch (IOException e) { + LOG.error("Error reading serialized Thrift text from resource", e); + return null; + } + } +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java new file mode 100644 index 0000000..bf2ce9a --- /dev/null +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/VersionIgnoringObjectInputStream.java @@ -0,0 +1,35 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; + +import java.io.IOException; +import java.io.InputStream; +import java.io.ObjectInputStream; +import java.io.ObjectStreamClass; + + +public class VersionIgnoringObjectInputStream extends ObjectInputStream { + + VersionIgnoringObjectInputStream(InputStream in) throws IOException { + super(in); + } + + @Override + @SuppressWarnings("squid:S1166") + protected ObjectStreamClass readClassDescriptor() throws IOException, ClassNotFoundException { + ObjectStreamClass resultClassDescriptor = super.readClassDescriptor(); // initially streams descriptor + Class localClass; // the class in the local JVM that this descriptor represents. + try { + localClass = Class.forName(resultClassDescriptor.getName()); + } catch (ClassNotFoundException e) { + return resultClassDescriptor; + } + ObjectStreamClass localClassDescriptor = ObjectStreamClass.lookup(localClass); + if (localClassDescriptor != null) { // only if class implements serializable + final long localSUID = localClassDescriptor.getSerialVersionUID(); + final long streamSUID = resultClassDescriptor.getSerialVersionUID(); + if (streamSUID != localSUID) { // check for serialVersionUID mismatch. + resultClassDescriptor = localClassDescriptor; // Use local class descriptor for deserialization + } + } + return resultClassDescriptor; + } +} diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java index c26b629..dfa853b 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroFeatureExtractor.java @@ -7,7 +7,7 @@ import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import weka.core.Attribute; diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java index 239aff9..11280f6 100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/zero/ZeroSubjectInjector.java @@ -3,8 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas.zero; import com.google.common.collect.Sets; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; import weka.classifiers.Classifier; import weka.core.Instance; import weka.core.Instances; @@ -24,7 +24,7 @@ public class ZeroSubjectInjector { public ZeroSubjectInjector() throws Exception { classifier = (Classifier) SerializationHelper.read(Constants.ZERO_MODEL_RESOURCE_PATH); featureExtractor = new ZeroFeatureExtractor(); - instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); } public Set<String> findZeroSubjectTokenIds(TText text, List<TSentence> selectedSentences) throws Exception { diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java index 9dae6f4..2385bb1 100644 --- a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/NicolasTest.java @@ -3,7 +3,8 @@ package pl.waw.ipipan.zil.summ.nicolas; import org.junit.BeforeClass; import org.junit.Test; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import static org.junit.Assert.assertTrue; @@ -20,9 +21,9 @@ public class NicolasTest { @Test public void shouldSummarizeThriftText() throws Exception { - TText thriftText = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); + TText thriftText = ThriftUtils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); String summary = nicolas.summarizeThrift(thriftText, 5); - int summaryTokensCount = Utils.tokenizeOnWhitespace(summary).size(); + int summaryTokensCount = TextUtils.tokenizeOnWhitespace(summary).size(); assertTrue(summaryTokensCount > 0); assertTrue(summaryTokensCount < 10); } diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java new file mode 100644 index 0000000..464ae07 --- /dev/null +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/utils/thrift/ThriftUtilsTest.java @@ -0,0 +1,22 @@ +package pl.waw.ipipan.zil.summ.nicolas.utils.thrift; + +import org.junit.Test; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; + +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; + +public class ThriftUtilsTest { + + private static final String SAMPLE_TEXT_PATH = "/199704210011.bin"; + + @Test + public void shouldDeserializeTextIgnoringClassVersionId() throws Exception { + try (InputStream stream = ThriftUtilsTest.class.getResourceAsStream(SAMPLE_TEXT_PATH)) { + TText text = ThriftUtils.loadThriftTextFromStream(stream); + assertEquals(26, text.getParagraphs().size()); + assertEquals(2, text.getParagraphs().get(4).getSentences().size()); + } + } +} \ No newline at end of file diff --git a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java index e8e7a47..274356b 100644 --- a/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java +++ b/nicolas-lib/src/test/java/pl/waw/ipipan/zil/summ/nicolas/zero/CandidateFinderTest.java @@ -5,8 +5,8 @@ import org.apache.commons.io.IOUtils; import org.junit.Test; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.IOException; import java.io.InputStream; diff --git a/nicolas-lib/src/test/resources/199704210011.bin b/nicolas-lib/src/test/resources/199704210011.bin new file mode 100644 index 0000000..cf072c2 Binary files /dev/null and b/nicolas-lib/src/test/resources/199704210011.bin differ diff --git a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore new file mode 100644 index 0000000..314f02b --- /dev/null +++ b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/.gitignore @@ -0,0 +1 @@ +*.txt \ No newline at end of file diff --git a/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md new file mode 100644 index 0000000..511f97c --- /dev/null +++ b/nicolas-model/src/main/resources/pl/waw/ipipan/zil/summ/nicolas/resources/README.md @@ -0,0 +1 @@ +To generate resources in this folder, use nicolas-trainer module. \ No newline at end of file diff --git a/nicolas-multiservice/pom.xml b/nicolas-multiservice/pom.xml index cc051be..a49a932 100644 --- a/nicolas-multiservice/pom.xml +++ b/nicolas-multiservice/pom.xml @@ -30,8 +30,12 @@ <!-- test --> <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> + <artifactId>nicolas-lib</artifactId> <scope>test</scope> </dependency> diff --git a/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java index e3ce61d..6d209ff 100644 --- a/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java +++ b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java @@ -7,7 +7,7 @@ import org.junit.rules.TemporaryFolder; import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import java.io.File; import java.io.FileInputStream; @@ -67,7 +67,7 @@ public class PreprocessorIT { preprocessor.preprocessToFile(text, targetFile); try (FileInputStream inputStream = new FileInputStream(targetFile)) { - TText processed = Utils.loadThriftTextFromStream(inputStream); + TText processed = ThriftUtils.loadThriftTextFromStream(inputStream); assertSampleProcessedText(processed); } } diff --git a/nicolas-train/pom.xml b/nicolas-train/pom.xml index 0124f0f..57e2072 100644 --- a/nicolas-train/pom.xml +++ b/nicolas-train/pom.xml @@ -15,10 +15,6 @@ <!-- project --> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> - </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> <artifactId>nicolas-lib</artifactId> </dependency> <dependency> diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java new file mode 100644 index 0000000..7af55cf --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/PathConstants.java @@ -0,0 +1,79 @@ +package pl.waw.ipipan.zil.summ.nicolas; + +import net.lingala.zip4j.core.ZipFile; +import net.lingala.zip4j.exception.ZipException; +import org.apache.commons.io.FileUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.net.URL; + +public class PathConstants { + + private static final Logger LOG = LoggerFactory.getLogger(PathConstants.class); + + public static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip"; + public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; + public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; + public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; + + public static final File WORKING_DIR = new File("data"); + + public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); + public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); + public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); + + public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); + public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); + public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); + public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); + public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); + public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); + public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); + public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); + + private static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); + public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); + public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); + public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); + + private PathConstants() { + } + + public static File createFolder(File folder) { + if (folder.mkdir()) { + LOG.info("Created directory at: {}.", folder.getPath()); + } else { + LOG.info("Directory already present at: {}.", folder.getPath()); + } + return folder; + } + + public static void downloadFile(String fileUrl, File targetFile) throws IOException { + if (!targetFile.exists()) { + LOG.info("Downloading file from url {} to file {} ...", fileUrl, targetFile); + FileUtils.copyURLToFile(new URL(fileUrl), targetFile); + LOG.info("done."); + } else { + LOG.info("File {} already downloaded.", targetFile); + } + } + + public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException { + downloadFile(url, targetZipFile); + extractZipFile(targetZipFile, targetDir); + } + + private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { + if (targetDir.exists()) { + LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir); + } else { + createFolder(targetDir); + ZipFile zipFile = new ZipFile(targetZipFile); + zipFile.extractAll(targetDir.getPath()); + LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); + } + } +} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java new file mode 100644 index 0000000..17981e5 --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Constants.java @@ -0,0 +1,24 @@ +package pl.waw.ipipan.zil.summ.nicolas.eval; + +import org.apache.commons.io.IOUtils; + +import java.io.IOException; +import java.io.InputStream; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class Constants { + + private static final String TEST_TEXT_IDS_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/eval/test_text_ids.txt"; + + private Constants() { + } + + public static Set<String> loadTestTextIds() throws IOException { + try (InputStream inputStream = SummarizeTestCorpus.class.getResourceAsStream(TEST_TEXT_IDS_RESOURCE_PATH)) { + List<String> testTextIds = IOUtils.readLines(inputStream, pl.waw.ipipan.zil.summ.nicolas.Constants.ENCODING); + return testTextIds.stream().map(String::trim).collect(Collectors.toSet()); + } + } +} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java new file mode 100644 index 0000000..de33cae --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/Evaluate.java @@ -0,0 +1,15 @@ +package pl.waw.ipipan.zil.summ.nicolas.eval; + +import pl.waw.ipipan.zil.summ.eval.Main; + +public class Evaluate { + + private Evaluate() { + } + + public static void main(String[] args) { + String goldDirPath = "data/summaries-gold"; + String systemDirPath = "data/summaries"; + Main.main(new String[]{goldDirPath, systemDirPath}); + } +} \ No newline at end of file diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java new file mode 100644 index 0000000..c5dee6c --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/SummarizeTestCorpus.java @@ -0,0 +1,83 @@ +package pl.waw.ipipan.zil.summ.nicolas.eval; + +import com.google.common.collect.Maps; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import static java.util.stream.Collectors.toList; +import static pl.waw.ipipan.zil.summ.nicolas.eval.Constants.loadTestTextIds; + +public class SummarizeTestCorpus { + + private static final Logger LOG = LoggerFactory.getLogger(SummarizeTestCorpus.class); + + private static final String SUMMARY_FILE_SUFFIX = "_nicolas.txt"; + private static final double SUMMARY_RATIO = 0.2; + + private SummarizeTestCorpus() { + } + + public static void main(String[] args) throws IOException, NicolasException { + File thriftedCorpusDir = new File("data/all-preprocessed"); + File targetDir = new File("data/test-system"); + targetDir.mkdir(); + + Set<String> testTextIds = loadTestTextIds(); + Map<String, TText> id2preprocessedText = ThriftUtils.loadThriftTextsFromFolder(thriftedCorpusDir, testTextIds::contains); + LOG.info("{} test corpus texts in Thrift format loaded to be summarized.", id2preprocessedText.keySet().size()); + + Map<String, String> id2summary = summarizeTexts(id2preprocessedText); + LOG.info("Texts summarized."); + + saveSummariesToFolder(id2summary, targetDir); + LOG.info("Texts saved to {} folder.", targetDir); + } + + private static Map<String, String> summarizeTexts(Map<String, TText> id2preprocessedText) throws NicolasException { + Map<String, String> id2summary = Maps.newHashMap(); + Nicolas nicolas = new Nicolas(); + for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { + TText text = entry.getValue(); + int targetSize = calculateTargetSize(text); + String summary = nicolas.summarizeThrift(text, targetSize); + id2summary.put(entry.getKey(), summary); + } + return id2summary; + } + + private static int calculateTargetSize(TText text) { + List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(toList()); + StringBuilder body = new StringBuilder(); + for (TSentence sentence : sentences) + body.append(TextUtils.loadSentence2Orth(sentence)).append(" "); + + int tokenCount = TextUtils.tokenizeOnWhitespace(body.toString().trim()).size(); + return (int) (SUMMARY_RATIO * tokenCount); + } + + private static void saveSummariesToFolder(Map<String, String> id2summary, File targetDir) throws IOException { + for (Map.Entry<String, String> entry : id2summary.entrySet()) { + String textId = entry.getKey(); + String summary = entry.getValue(); + String targetFileName = textId + SUMMARY_FILE_SUFFIX; + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(new File(targetDir, targetFileName)))) { + writer.write(summary); + } + } + } + +} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java new file mode 100644 index 0000000..5cba028 --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/eval/search/Crossvalidate.java @@ -0,0 +1,131 @@ +package pl.waw.ipipan.zil.summ.nicolas.eval.search; + +import org.apache.commons.lang3.time.StopWatch; +import org.apache.commons.lang3.tuple.Pair; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import weka.classifiers.Classifier; +import weka.classifiers.bayes.BayesNet; +import weka.classifiers.bayes.NaiveBayes; +import weka.classifiers.evaluation.Evaluation; +import weka.classifiers.functions.LinearRegression; +import weka.classifiers.functions.Logistic; +import weka.classifiers.functions.SimpleLogistic; +import weka.classifiers.lazy.IBk; +import weka.classifiers.lazy.KStar; +import weka.classifiers.lazy.LWL; +import weka.classifiers.meta.AttributeSelectedClassifier; +import weka.classifiers.rules.DecisionTable; +import weka.classifiers.rules.JRip; +import weka.classifiers.rules.PART; +import weka.classifiers.rules.ZeroR; +import weka.classifiers.trees.HoeffdingTree; +import weka.classifiers.trees.J48; +import weka.classifiers.trees.LMT; +import weka.classifiers.trees.RandomForest; +import weka.core.Instances; +import weka.core.converters.ArffLoader; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.Comparator; +import java.util.Optional; +import java.util.Random; +import java.util.logging.LogManager; + + +class Crossvalidate { + + private static final Logger LOG = LoggerFactory.getLogger(Crossvalidate.class); + + private static final int NUM_FOLDS = 10; + + private Crossvalidate() { + } + + static void crossvalidateClassifiers(String datasetPath) throws IOException { + Instances instances = loadInstances(datasetPath); + crossvalidateClassification(instances); + } + + static void crossvalidateRegressors(String datasetPath) throws IOException { + Instances instances = loadInstances(datasetPath); + crossvalidateRegression(instances); + } + + private static Instances loadInstances(String datasetPath) throws IOException { + LogManager.getLogManager().reset(); // disable WEKA logging + + ArffLoader loader = new ArffLoader(); + loader.setFile(new File(datasetPath)); + Instances instances = loader.getDataSet(); + instances.setClassIndex(0); + LOG.info("{} instances loaded.", instances.size()); + LOG.info("{} attributes for each instance.", instances.numAttributes()); + return instances; + } + + private static void crossvalidateClassification(Instances instances) throws IOException { + StopWatch watch = new StopWatch(); + watch.start(); + + Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{new J48(), new RandomForest(), new HoeffdingTree(), new LMT(), + new Logistic(), new ZeroR(), + new SimpleLogistic(), new BayesNet(), new NaiveBayes(), + new KStar(), new IBk(), new LWL(), + new DecisionTable(), new JRip(), new PART(), + createAttributeSelectedClassifier()}).parallel().map(cls -> { + String name = cls.getClass().getSimpleName(); + double acc; + Evaluation eval; + try { + eval = new Evaluation(instances); + eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); + } catch (Exception e) { + LOG.error("Error evaluating model", e); + return Pair.of(0.0, name); + } + acc = eval.correct() / eval.numInstances(); + LOG.info(name + " : " + acc); + return Pair.of(acc, name); + }).max(Comparator.comparingDouble(Pair::getLeft)); + LOG.info("#########"); + LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); + + watch.stop(); + LOG.info("Elapsed time: {}", watch); + } + + + private static Classifier createAttributeSelectedClassifier() { + AttributeSelectedClassifier attributeSelectedClassifier = new AttributeSelectedClassifier(); + attributeSelectedClassifier.setClassifier(new LMT()); + return attributeSelectedClassifier; + } + + private static void crossvalidateRegression(Instances instances) { + StopWatch watch = new StopWatch(); + watch.start(); + + Optional<Pair<Double, String>> max = Arrays.stream(new Classifier[]{ + new RandomForest(), new LinearRegression(), new KStar()}).parallel().map(cls -> { + double acc = 0; + String name = cls.getClass().getSimpleName(); + try { + Evaluation eval = new Evaluation(instances); + eval.crossValidateModel(cls, instances, NUM_FOLDS, new Random(1)); + acc = eval.correlationCoefficient(); + } catch (Exception e) { + LOG.error("Error evaluating model", e); + } + LOG.info(name + " : " + acc); + return Pair.of(acc, name); + }).max(Comparator.comparingDouble(Pair::getLeft)); + LOG.info("#########"); + LOG.info("Best: " + max.get().getRight() + " : " + max.get().getLeft()); + + watch.stop(); + LOG.info("Elapsed time: {}", watch); + } +} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java deleted file mode 100644 index 44e67c3..0000000 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/PathConstants.java +++ /dev/null @@ -1,79 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.train; - -import net.lingala.zip4j.core.ZipFile; -import net.lingala.zip4j.exception.ZipException; -import org.apache.commons.io.FileUtils; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.IOException; -import java.net.URL; - -public class PathConstants { - - private static final Logger LOG = LoggerFactory.getLogger(PathConstants.class); - - public static final String CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/PolishSummariesCorpus?action=AttachFile&do=get&target=PSC_1.0.zip"; - public static final String PREPROCESSED_CORPUS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=all-preprocessed.zip"; - public static final String SUMMARY_SENTENCE_IDS_DOWNLOAD_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero-sentence-ids.zip"; - public static final String ZERO_TRAINING_CORPUS_URL = "http://zil.ipipan.waw.pl/Nicolas?action=AttachFile&do=get&target=train-zero.tsv"; - - public static final File WORKING_DIR = new File("data"); - - public static final File ZIPPED_CORPUS_FILE = new File(WORKING_DIR, "PSC_1.0.zip"); - public static final File ZIPPED_PREPROCESSED_CORPUS_FILE = new File(WORKING_DIR, "all-preprocessed.zip"); - public static final File ZIPPED_SUMMARY_SENTENCE_IDS_FILE = new File(WORKING_DIR, "train-zero-sentence-ids.zip"); - - public static final File EXTRACTED_CORPUS_DIR = new File(WORKING_DIR, "corpus"); - public static final File EXTRACTED_CORPUS_DATA_DIR = new File(new File(EXTRACTED_CORPUS_DIR, "PSC_1.0"), "data"); - public static final File SUMMARY_SENTENCE_IDS_DIR = new File(WORKING_DIR, "train-zero-sentence-ids"); - public static final File PREPROCESSED_CORPUS_DIR = new File(WORKING_DIR, "all-preprocessed"); - public static final File GOLD_TEST_SUMMARIES_DIR = new File(WORKING_DIR, "test-gold"); - public static final File GOLD_TRAIN_SUMMARIES_DIR = new File(WORKING_DIR, "train-gold"); - public static final File OPTIMAL_SUMMARIES_DIR = new File(WORKING_DIR, "train-optimal"); - public static final File ZERO_TRAINING_CORPUS = new File(WORKING_DIR, "train-zero.tsv"); - - public static final File ARFF_DIR = new File(WORKING_DIR, "train-arff"); - public static final File MENTION_ARFF = new File(ARFF_DIR, "mentions.arff"); - public static final File SENTENCE_ARFF = new File(ARFF_DIR, "sentences.arff"); - public static final File ZERO_ARFF = new File(ARFF_DIR, "zeros.arff"); - - private PathConstants() { - } - - public static File createFolder(File folder) { - if (folder.mkdir()) { - LOG.info("Created directory at: {}.", folder.getPath()); - } else { - LOG.info("Directory already present at: {}.", folder.getPath()); - } - return folder; - } - - public static void downloadFile(String fileUrl, File targetFile) throws IOException { - if (!targetFile.exists()) { - LOG.info("Downloading file from url {} to file {} ...", fileUrl, targetFile); - FileUtils.copyURLToFile(new URL(fileUrl), targetFile); - LOG.info("done."); - } else { - LOG.info("File {} already downloaded.", targetFile); - } - } - - public static void downloadFileAndExtract(String url, File targetZipFile, File targetDir) throws IOException, ZipException { - downloadFile(url, targetZipFile); - extractZipFile(targetZipFile, targetDir); - } - - private static void extractZipFile(File targetZipFile, File targetDir) throws ZipException { - if (targetDir.exists()) { - LOG.info("Zip file {} already extracted to dir {}.", targetZipFile, targetDir); - } else { - createFolder(targetDir); - ZipFile zipFile = new ZipFile(targetZipFile); - zipFile.extractAll(targetDir.getPath()); - LOG.info("Extracted zip file: {} to dir: {}.", targetZipFile, targetDir); - } - } -} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java index 60e679a..aec39ae 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/MentionScorer.java @@ -6,29 +6,63 @@ import com.google.common.collect.Multiset; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; +import pl.waw.ipipan.zil.summ.nicolas.utils.ResourceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; +import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; public class MentionScorer { + private final Set<String> STOPWORDS; + + public MentionScorer() throws IOException { + STOPWORDS = ResourceUtils.loadStopwords().stream().collect(Collectors.toSet()); + } + public Map<TMention, Double> calculateMentionScores(String optimalSummary, TText text) { - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); List<TSentence> sentences = text.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).collect(Collectors.toList()); - Map<TMention, String> mention2Orth = Utils.loadMention2Orth(sentences, true); + Map<TMention, String> mention2Orth = loadMention2OrthExcludingStopwords(sentences); return booleanTokenIntersection(mention2Orth, tokenCounts); } + private Map<TMention, String> loadMention2OrthExcludingStopwords(List<TSentence> sents) { + Map<TMention, String> mention2orth = Maps.newHashMap(); + for (TSentence s : sents) { + Map<String, TToken> tokId2tok = s.getTokens().stream().collect(Collectors.toMap(TToken::getId, Function.identity())); + + for (TMention m : s.getMentions()) { + StringBuilder mentionOrth = new StringBuilder(); + for (String tokId : m.getChildIds()) { + TToken token = tokId2tok.get(tokId); + if (STOPWORDS.contains(token.getChosenInterpretation().getBase().toLowerCase())) { + continue; + } + + if (!token.isNoPrecedingSpace()) + mentionOrth.append(" "); + mentionOrth.append(token.getOrth()); + } + mention2orth.put(m, mentionOrth.toString().trim()); + } + } + return mention2orth; + } + private static Map<TMention, Double> booleanTokenIntersection(Map<TMention, String> mention2Orth, Multiset<String> tokenCounts) { Map<TMention, Double> mention2score = Maps.newHashMap(); for (Map.Entry<TMention, String> entry : mention2Orth.entrySet()) { TMention mention = entry.getKey(); String mentionOrth = mention2Orth.get(mention); - for (String token : Utils.tokenize(mentionOrth)) { + for (String token : TextUtils.tokenize(mentionOrth)) { if (tokenCounts.contains(token.toLowerCase())) { mention2score.put(mention, 1.0); break; diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java index 61d01f0..dcdc297 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/SentenceScorer.java @@ -6,22 +6,23 @@ import com.google.common.collect.Multiset; import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.utils.TextUtils; import java.util.List; import java.util.Map; public class SentenceScorer { + public Map<TSentence, Double> calculateSentenceScores(String optimalSummary, TText preprocessedText) { - Multiset<String> tokenCounts = HashMultiset.create(Utils.tokenize(optimalSummary.toLowerCase())); + Multiset<String> tokenCounts = HashMultiset.create(TextUtils.tokenize(optimalSummary.toLowerCase())); Map<TSentence, Double> sentence2score = Maps.newHashMap(); for (TParagraph paragraph : preprocessedText.getParagraphs()) for (TSentence sentence : paragraph.getSentences()) { double score = 0.0; - String orth = Utils.loadSentence2Orth(sentence); - List<String> tokens = Utils.tokenize(orth); + String orth = TextUtils.loadSentence2Orth(sentence); + List<String> tokens = TextUtils.tokenize(orth); for (String token : tokens) { score += tokenCounts.contains(token.toLowerCase()) ? 1.0 : 0.0; } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java index 241874e..98d0d67 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/model/ZeroScorer.java @@ -5,7 +5,7 @@ import org.apache.commons.csv.CSVFormat; import org.apache.commons.csv.CSVParser; import org.apache.commons.csv.CSVRecord; import org.apache.commons.csv.QuoteMode; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroSubjectCandidate; diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java index aeba701..8dde011 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/CreateOptimalSummaries.java @@ -8,14 +8,14 @@ import com.google.common.collect.Multiset; import org.apache.commons.io.FileUtils; import pl.waw.ipipan.zil.summ.eval.Main; import pl.waw.ipipan.zil.summ.eval.rouge.RougeN; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import java.io.File; import java.io.IOException; import java.util.*; import java.util.stream.Collectors; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class CreateOptimalSummaries { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java index 7e4b548..3783f8b 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadCorpus.java @@ -1,6 +1,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class DownloadCorpus { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java index 980fa5c..79374e1 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/DownloadTrainingResources.java @@ -1,6 +1,6 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class DownloadTrainingResources { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java index 19d5171..05fbe5f 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/ExtractGoldSummaries.java @@ -1,19 +1,21 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; import pl.waw.ipipan.zil.summ.pscapi.xml.Summary; import pl.waw.ipipan.zil.summ.pscapi.xml.Text; import javax.xml.bind.JAXBException; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; +import java.io.OutputStreamWriter; import java.util.List; import java.util.function.Predicate; import java.util.stream.Collectors; import java.util.stream.Stream; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class ExtractGoldSummaries { @@ -22,7 +24,6 @@ public class ExtractGoldSummaries { private static final Predicate<Text> IS_TEST = text -> text.getSummaries().getSummary().stream().anyMatch(summary -> summary.getType().equals(ABSTRACT_SUMMARY_TYPE)); - private ExtractGoldSummaries() { } @@ -47,7 +48,10 @@ public class ExtractGoldSummaries { for (Summary summary : goldSummaries) { File targetDir = isTest ? GOLD_TEST_SUMMARIES_DIR : GOLD_TRAIN_SUMMARIES_DIR; File targetFile = new File(targetDir, text.getId() + "_" + summary.getAuthor() + ".txt"); - Utils.writeStringToFile(summary.getBody(), targetFile); + + try (OutputStreamWriter writer = new OutputStreamWriter(new FileOutputStream(targetFile), Constants.ENCODING)) { + writer.append(summary.getBody()); + } } } } diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java index b15a291..b5b0c09 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PrepareTrainingData.java @@ -10,15 +10,14 @@ import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.nicolas.InstanceUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.ThriftUtils; -import pl.waw.ipipan.zil.summ.nicolas.common.Utils; import pl.waw.ipipan.zil.summ.nicolas.features.FeatureHelper; import pl.waw.ipipan.zil.summ.nicolas.mention.MentionFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.sentence.SentenceFeatureExtractor; import pl.waw.ipipan.zil.summ.nicolas.train.model.MentionScorer; import pl.waw.ipipan.zil.summ.nicolas.train.model.SentenceScorer; import pl.waw.ipipan.zil.summ.nicolas.train.model.ZeroScorer; +import pl.waw.ipipan.zil.summ.nicolas.utils.InstanceUtils; +import pl.waw.ipipan.zil.summ.nicolas.utils.thrift.ThriftUtils; import pl.waw.ipipan.zil.summ.nicolas.zero.CandidateFinder; import pl.waw.ipipan.zil.summ.nicolas.zero.InstanceCreator; import pl.waw.ipipan.zil.summ.nicolas.zero.ZeroFeatureExtractor; @@ -37,7 +36,7 @@ import java.util.Set; import java.util.function.Predicate; import java.util.stream.Collectors; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class PrepareTrainingData { @@ -61,7 +60,7 @@ public class PrepareTrainingData { MentionScorer mentionScorer = new MentionScorer(); MentionFeatureExtractor featureExtractor = new MentionFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { @@ -105,7 +104,7 @@ public class PrepareTrainingData { SentenceScorer sentenceScorer = new SentenceScorer(); SentenceFeatureExtractor featureExtractor = new SentenceFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (String textId : id2preprocessedText.keySet()) { @@ -149,7 +148,7 @@ public class PrepareTrainingData { ZeroScorer zeroScorer = new ZeroScorer(ZERO_TRAINING_CORPUS); ZeroFeatureExtractor featureExtractor = new ZeroFeatureExtractor(); - Instances instances = Utils.createNewInstances(featureExtractor.getAttributesList()); + Instances instances = InstanceUtils.createNewInstances(featureExtractor.getAttributesList()); int i = 1; for (Map.Entry<String, TText> entry : id2preprocessedText.entrySet()) { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java index 449454b..fd7e862 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/PreprocessCorpus.java @@ -9,7 +9,7 @@ import pl.waw.ipipan.zil.summ.pscapi.xml.Text; import java.io.File; import java.util.Arrays; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class PreprocessCorpus { diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java index 10dfa40..d186dcc 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/pipeline/TrainAllModels.java @@ -3,7 +3,7 @@ package pl.waw.ipipan.zil.summ.nicolas.train.pipeline; import org.apache.commons.lang3.time.StopWatch; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.Constants; import pl.waw.ipipan.zil.summ.nicolas.train.model.Settings; import weka.classifiers.Classifier; import weka.core.Instances; @@ -14,7 +14,7 @@ import java.io.FileOutputStream; import java.io.ObjectOutputStream; import java.util.logging.LogManager; -import static pl.waw.ipipan.zil.summ.nicolas.train.PathConstants.*; +import static pl.waw.ipipan.zil.summ.nicolas.PathConstants.*; public class TrainAllModels { diff --git a/pom.xml b/pom.xml index f1fad43..f21f5f3 100644 --- a/pom.xml +++ b/pom.xml @@ -10,15 +10,12 @@ <packaging>pom</packaging> - <modules> <module>nicolas-lib</module> <module>nicolas-cli</module> <module>nicolas-model</module> <module>nicolas-train</module> - <module>nicolas-common</module> <module>nicolas-multiservice</module> - <module>nicolas-eval</module> </modules> <properties> @@ -59,23 +56,23 @@ <!-- project --> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-model</artifactId> + <artifactId>nicolas-cli</artifactId> <version>${project.version}</version> - <scope>runtime</scope> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-common</artifactId> + <artifactId>nicolas-lib</artifactId> <version>${project.version}</version> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-zero</artifactId> + <artifactId>nicolas-model</artifactId> <version>${project.version}</version> + <scope>runtime</scope> </dependency> <dependency> <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-lib</artifactId> + <artifactId>nicolas-multiservice</artifactId> <version>${project.version}</version> </dependency> <dependency> @@ -83,11 +80,6 @@ <artifactId>nicolas-train</artifactId> <version>${project.version}</version> </dependency> - <dependency> - <groupId>pl.waw.ipipan.zil.summ</groupId> - <artifactId>nicolas-multiservice</artifactId> - <version>${project.version}</version> - </dependency> <!-- internal --> <dependency> -- libgit2 0.22.2