From cb490cab2a2bfe90e2c4ad0661117321fb2f2e1f Mon Sep 17 00:00:00 2001 From: Mateusz Kopeć <m.kopec@ipipan.waw.pl> Date: Wed, 1 Feb 2017 22:11:55 +0100 Subject: [PATCH] create sample cli client --- nicolas-cli/README.md | 12 ++++++++++++ nicolas-cli/pom.xml | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java | 87 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java | 77 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java | 36 ++++++++++++++++++++++++++++++++++++ nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java | 38 ++++++++++++++++++++++++++++++++++++++ nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java | 24 ++++++++++++++++++++++++ nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift | Bin 0 -> 497720 bytes nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt | 9 +++++++++ nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java | 16 ++++++++++------ nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java | 11 +++++++---- nicolas-multiservice/pom.xml | 39 +++++++++++++++++++++++++++++++++++++++ nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java | 104 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 
nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-train/pom.xml | 4 ++++ nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java | 4 ++-- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java | 110 -------------------------------------------------------------------------------------------------------------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java | 97 ------------------------------------------------------------------------------------------------- nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java | 31 ------------------------------- pom.xml | 213 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------- 24 files changed, 1039 insertions(+), 257 deletions(-) create mode 100644 nicolas-cli/README.md create mode 100644 nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java create mode 100644 nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java create mode 100644 nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java create mode 100644 nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java create mode 100644 nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java create mode 100644 
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java create mode 100644 nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java create mode 100644 nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift create mode 100644 nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt create mode 100644 nicolas-multiservice/pom.xml create mode 100644 nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java create mode 100644 nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java create mode 100644 nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java delete mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java delete mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java create mode 100644 nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java delete mode 100644 nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java diff --git a/nicolas-cli/README.md b/nicolas-cli/README.md new file mode 100644 index 0000000..b0c409f --- /dev/null +++ b/nicolas-cli/README.md @@ -0,0 +1,12 @@ +# nicolas-cli + +This module contains a sample command-line application, which uses Nicolas library to summarize chosen input text file. +Summary is written to target output file. Additionally, user needs to specify desired number of tokens in the summary. 
+ +## Installation + + mvn clean install + +## Usage + + java -jar target/nicolas-cli.jar -help \ No newline at end of file diff --git a/nicolas-cli/pom.xml b/nicolas-cli/pom.xml index e65a5b6..422e8f1 100644 --- a/nicolas-cli/pom.xml +++ b/nicolas-cli/pom.xml @@ -11,4 +11,70 @@ <artifactId>nicolas-cli</artifactId> + <dependencies> + + <!-- project --> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-multiservice</artifactId> + </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-lib</artifactId> + </dependency> + + <!-- third party --> + <dependency> + <groupId>com.beust</groupId> + <artifactId>jcommander</artifactId> + </dependency> + + <!-- logging --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-simple</artifactId> + </dependency> + + <!-- test --> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <configuration> + <appendAssemblyId>false</appendAssemblyId> + <archive> + <manifest> + <mainClass>pl.waw.ipipan.zil.summ.nicolas.cli.Main</mainClass> + </manifest> + </archive> + <descriptorRefs> + <descriptorRef>jar-with-dependencies</descriptorRef> + </descriptorRefs> + </configuration> + <executions> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> </project> \ No newline at end of file diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java new file mode 100644 index 0000000..ace95d1 --- /dev/null +++ 
b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java @@ -0,0 +1,87 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import com.beust.jcommander.IParameterValidator; +import com.beust.jcommander.JCommander; +import com.beust.jcommander.Parameter; +import com.beust.jcommander.ParameterException; +import com.beust.jcommander.converters.FileConverter; +import com.beust.jcommander.validators.PositiveInteger; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; + +class Cli { + + private static final Logger LOG = LoggerFactory.getLogger(Cli.class); + + @Parameter(names = {"-help", "-h"}, description = "Print help") + private boolean help = false; + + @Parameter(names = {"-input", "-i"}, description = "Input text file to summarize", required = true, validateWith = FileValidator.class, converter = FileConverter.class) + private File inputFile; + + @Parameter(names = {"-output", "-o"}, description = "Output file path for summary", required = true, validateWith = FileValidator.class, converter = FileConverter.class) + private File outputFile; + + @Parameter(names = {"-target", "-t"}, description = "Target summary token count", required = true, validateWith = PositiveInteger.class) + private int targetTokenCount; + + private boolean invalid = false; + + boolean isHelp() { + return help; + } + + File getInputFile() { + return inputFile; + } + + File getOutputFile() { + return outputFile; + } + + int getTargetTokenCount() { + return targetTokenCount; + } + + @SuppressWarnings("squid:S1166") + static Cli parse(String[] args) { + Cli cli = new Cli(); + JCommander jCommander; + try { + jCommander = new JCommander(cli, args); + } catch (ParameterException ex) { + LOG.error("Error parsing parameters: {}", ex.getLocalizedMessage()); + cli.setInvalid(); + return cli; + } + if (cli.isHelp()) { + StringBuilder stringBuilder = new StringBuilder(); + jCommander.usage(stringBuilder); + LOG.info("{}", stringBuilder); + } + return cli; + } + 
+ private void setInvalid() { + invalid = true; + } + + boolean isInvalid() { + return invalid; + } + + public static class FileValidator implements IParameterValidator { + + @Override + public void validate(String name, String value) { + File file = new File(value); + if (!file.isFile()) { + throw new ParameterException("Parameter " + name + + " should be a valid file path (found " + value + ")"); + } + } + + } +} diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java new file mode 100644 index 0000000..4adaa48 --- /dev/null +++ b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java @@ -0,0 +1,77 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.apache.commons.io.IOUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; + +import java.io.*; + +class Client { + + private static final Logger LOG = LoggerFactory.getLogger(Client.class); + + private final Preprocessor preprocessor; + private final Nicolas nicolas; + + Client(Preprocessor preprocessor, Nicolas nicolas) { + this.preprocessor = preprocessor; + this.nicolas = nicolas; + } + + @SuppressWarnings("squid:S1166") + void summarize(File inputFile, File outputFile, int targetTokenCount) { + try { + String inputText = loadInputText(inputFile); + TText preprocessed = preprocess(inputText); + String summary = summarize(preprocessed, targetTokenCount); + saveSummaryToFile(summary, outputFile); + } catch (IOException | MultiserviceException | NicolasException e) { + LOG.error("Exiting because of an error."); + } + } + 
+ private String loadInputText(File inputFile) throws IOException { + String inputText; + try (FileInputStream inputStream = new FileInputStream(inputFile)) { + inputText = IOUtils.toString(inputStream, Constants.ENCODING); + } catch (IOException e) { + LOG.error("Error reading input text."); + throw e; + } + return inputText; + } + + private TText preprocess(String inputText) throws MultiserviceException { + try { + return preprocessor.preprocess(inputText); + } catch (MultiserviceException e) { + LOG.error("Error preprocessing input text."); + throw e; + } + } + + private String summarize(TText preprocessed, int targetTokenCount) throws NicolasException { + try { + return nicolas.summarizeThrift(preprocessed, targetTokenCount); + } catch (NicolasException e) { + LOG.error("Error preprocessing input text."); + throw e; + } + } + + private void saveSummaryToFile(String summary, File outputFile) throws IOException { + try (OutputStream outputStream = new FileOutputStream(outputFile)) { + IOUtils.write(summary, outputStream, Constants.ENCODING); + } catch (IOException e) { + LOG.error("Error writing file with summary."); + throw e; + } + } + +} diff --git a/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java new file mode 100644 index 0000000..4a49f65 --- /dev/null +++ b/nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java @@ -0,0 +1,36 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; + +import java.io.IOException; + +public class Main { + + private static final Logger LOG = LoggerFactory.getLogger(Main.class); + + private Main() { + } + + @SuppressWarnings("squid:S1166") + public static void main(String[] args) { + Cli cli = Cli.parse(args); + if (cli.isHelp() || cli.isInvalid()) { + 
return; + } + + Nicolas nicolas; + Preprocessor preprocessor; + try { + nicolas = new Nicolas(); + preprocessor = new Preprocessor(); + } catch (IOException | ClassNotFoundException e) { + LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); + return; + } + Client client = new Client(preprocessor, nicolas); + client.summarize(cli.getInputFile(), cli.getOutputFile(), cli.getTargetTokenCount()); + } +} diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java new file mode 100644 index 0000000..8b09280 --- /dev/null +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java @@ -0,0 +1,75 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; + +import java.io.File; +import java.io.IOException; + +import static org.junit.Assert.*; + +public class CliTest { + + @ClassRule + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + private static File sampleInputFile; + private static File sampleOutputFile; + private static final int TARGET_TOKEN_COUNT = 50; + private static final String INVALID_FILE_PATH = "nonexistent_dir/nonexistent_file"; + + @BeforeClass + public static void initSampleFiles() throws IOException { + sampleInputFile = TEMPORARY_FOLDER.newFile(); + sampleOutputFile = TEMPORARY_FOLDER.newFile(); + } + + @Test + public void failNoArguments() throws Exception { + String[] args = new String[]{}; + Cli cli = Cli.parse(args); + assertTrue(cli.isInvalid()); + } + + @Test + public void failInvalidArgument() throws Exception { + String[] args = new String[]{"-xxxx", "xxx", "-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; + Cli cli = Cli.parse(args); + assertTrue(cli.isInvalid()); + } + + @Test + public void 
failInvalidInputFile() throws Exception { + String[] args = new String[]{"-i", INVALID_FILE_PATH, "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; + Cli cli = Cli.parse(args); + assertTrue(cli.isInvalid()); + } + + @Test + public void failInvalidOutputFile() throws Exception { + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", INVALID_FILE_PATH, "-t", Integer.toString(TARGET_TOKEN_COUNT)}; + Cli cli = Cli.parse(args); + assertTrue(cli.isInvalid()); + } + + @Test + public void failInvalidTargetTokenCount() throws Exception { + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(-1)}; + Cli cli = Cli.parse(args); + assertTrue(cli.isInvalid()); + } + + @Test + public void validArguments() throws Exception { + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; + Cli cli = Cli.parse(args); + assertFalse(cli.isInvalid()); + assertEquals(sampleInputFile, cli.getInputFile()); + assertEquals(sampleOutputFile, cli.getOutputFile()); + assertEquals(TARGET_TOKEN_COUNT, cli.getTargetTokenCount()); + } + + +} diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java new file mode 100644 index 0000000..2509618 --- /dev/null +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java @@ -0,0 +1,54 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.apache.commons.io.IOUtils; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; +import 
pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; + +import static org.junit.Assert.assertEquals; +import static org.mockito.ArgumentMatchers.*; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; +import static pl.waw.ipipan.zil.summ.nicolas.cli.TestUtils.SAMPLE_INPUT_RESOURCE_PATH; +import static pl.waw.ipipan.zil.summ.nicolas.cli.TestUtils.SAMPLE_THRIFT_TEXT_RESOURCE_PATH; + +public class ClientTest { + + @ClassRule + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + @Test + public void processSampleText() throws Exception { + Preprocessor preprocessor = mock(Preprocessor.class); + TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); + when(preprocessor.preprocess(any())).thenReturn(ttext); + + Nicolas nicolas = mock(Nicolas.class); + String targetSummary = "This is a summary"; + when(nicolas.summarizeThrift(eq(ttext), anyInt())).thenReturn(targetSummary); + + Client client = new Client(preprocessor, nicolas); + + File inputFile = TestUtils.copyResourceToFile(SAMPLE_INPUT_RESOURCE_PATH, TEMPORARY_FOLDER.newFile()); + File outputFile = TEMPORARY_FOLDER.newFile(); + int targetTokenCount = 50; + + String[] args = new String[]{"-i", inputFile.getPath(), "-o", outputFile.getPath(), "-t", Integer.toString(targetTokenCount)}; + Cli cli = Cli.parse(args); + client.summarize(cli.getInputFile(), cli.getOutputFile(), cli.getTargetTokenCount()); + + try (InputStream inputStream = new FileInputStream(outputFile)) { + String summary = IOUtils.toString(inputStream, Constants.ENCODING); + assertEquals(targetSummary, summary); + } + } +} \ No newline at end of file diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java new file mode 100644 index 0000000..4067383 --- /dev/null +++ 
b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java @@ -0,0 +1,38 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.apache.commons.io.IOUtils; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; + +import java.io.File; +import java.io.FileInputStream; +import java.io.InputStream; + +import static junit.framework.TestCase.assertTrue; + +public class MainIT { + + private final static String SAMPLE_INPUT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt"; + + @ClassRule + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + @Test + public void processSampleText() throws Exception { + File inputFile = TestUtils.copyResourceToFile(SAMPLE_INPUT_RESOURCE_PATH, TEMPORARY_FOLDER.newFile()); + File outputFile = TEMPORARY_FOLDER.newFile(); + int targetTokenCount = 50; + + String[] args = new String[]{"-i", inputFile.getPath(), "-o", outputFile.getPath(), "-t", Integer.toString(targetTokenCount)}; + Main.main(args); + + try (InputStream inputStream = new FileInputStream(outputFile)) { + String summary = IOUtils.toString(inputStream, Constants.ENCODING); + assertTrue(summary.length() > 0); + assertTrue(summary.length() < targetTokenCount * 10); + } + } + +} \ No newline at end of file diff --git a/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java new file mode 100644 index 0000000..47412af --- /dev/null +++ b/nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java @@ -0,0 +1,24 @@ +package pl.waw.ipipan.zil.summ.nicolas.cli; + +import org.apache.commons.io.IOUtils; + +import java.io.*; + +class TestUtils { + + private static final String PACKAGE = "/pl/waw/ipipan/zil/summ/nicolas/cli/"; + + static final String SAMPLE_INPUT_RESOURCE_PATH = PACKAGE + "sample_input.txt"; + static final 
String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = PACKAGE + "sample_input.thrift"; + + private TestUtils() { + } + + static File copyResourceToFile(String resourcePath, File file) throws IOException { + try (InputStream inputStream = MainIT.class.getResourceAsStream(resourcePath); + OutputStream outputStream = new FileOutputStream(file)) { + IOUtils.copy(inputStream, outputStream); + } + return file; + } +} diff --git a/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift b/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift new file mode 100644 index 0000000..cf072c2 Binary files /dev/null and b/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift differ diff --git a/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt b/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt new file mode 100644 index 0000000..3026aea --- /dev/null +++ b/nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt @@ -0,0 +1,9 @@ +To będzie już druga próba licytacji nieruchomości na pl. Słonecznym, którą urzędnicy wytropili po latach poszukiwań majątku Adama Gesslera. + +Jego dług wobec miasta szacują dziś na ok. 27 mln zł. Już w 1992 r., wkrótce po podpisaniu umowy najmu lokalu na Rynku Staromiejskim, zaczęły się problemy z czynszem. Sąd orzekł eksmisję. Dotąd miastu udało się odzyskać ledwie kilkadziesiąt tysięcy złotych długu. + +Sprawa budzi wielkie emocje, bo choć Adam Gessler jest słynnym restauratorem, oficjalnie nie ma nic. Nawet wynajęta przez Zakład Gospodarowania Nieruchomościami w Śródmieściu firma detektywistyczna nie znalazła majątku. + +Pozostają dwa mieszkania na Żoliborzu, wyceniane przed rokiem na blisko 4,3 mln zł. Będą licytowane za dwie trzecie ceny. W ZGN wymyślili, żeby miasto przystąpiło do licytacji. 
Jeśli uda się kupić nieruchomość, komornik pospłaca wierzycieli Adama i Piotra Gesslerów. A miasto będzie mogło w przyszłości sprzedać korzystnie atrakcyjny dom. + +Licytacje odbędą się w środę. - Korzyści z wylicytowania domu będą niewielkie w stosunku do ogromnego długu pana Gesslera. Chodzi jednak o to, żeby wiedział, że miasto nie zrezygnuje z upominania się o swoje - tłumaczyła "Gazecie" Małgorzata Mazur, dyrektorka ZGN. diff --git a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java index 5524abc..ecba84f 100644 --- a/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java +++ b/nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java @@ -45,17 +45,21 @@ public class Utils { } } + public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { + return (TText) ois.readObject(); + } catch (ClassNotFoundException e) { + LOG.error("Error reading serialized thrift text file, class not found.", e); + throw new IOException(e); + } + } + public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { if (stream == null) { throw new IOException("Resource not found at: " + textResourcePath); } - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { - return (TText) ois.readObject(); - } catch (ClassNotFoundException e) { - LOG.error("Error reading serialized thrift text file, class not found.", e); - throw new IOException(e); - } + return loadThriftTextFromStream(stream); } } diff --git a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java index 3b5b55a..415de45 
100644 --- a/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java +++ b/nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java @@ -39,10 +39,13 @@ public class Nicolas { zeroFeatureExtractor = new ZeroFeatureExtractor(); } - public String summarizeThrift(TText text, int targetTokenCount) throws Exception { - Set<TMention> goodMentions - = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); - return calculateSummary(text, goodMentions, targetTokenCount); + public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { + try { + Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); + return calculateSummary(text, goodMentions, targetTokenCount); + } catch (Exception e) { + throw new NicolasException(e); + } } private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { diff --git a/nicolas-multiservice/pom.xml b/nicolas-multiservice/pom.xml new file mode 100644 index 0000000..cc051be --- /dev/null +++ b/nicolas-multiservice/pom.xml @@ -0,0 +1,39 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <parent> + <artifactId>nicolas-container</artifactId> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <version>1.0-SNAPSHOT</version> + </parent> + <modelVersion>4.0.0</modelVersion> + + <artifactId>nicolas-multiservice</artifactId> + + <dependencies> + <!-- internal --> + <dependency> + <groupId>pl.waw.ipipan.zil.multiservice</groupId> + <artifactId>utils</artifactId> + </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>pscapi</artifactId> + </dependency> + + <!-- logging --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> 
+ </dependency> + + <!-- test --> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-common</artifactId> + <scope>test</scope> + </dependency> + + </dependencies> +</project> \ No newline at end of file diff --git a/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java b/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java new file mode 100644 index 0000000..4cdaf84 --- /dev/null +++ b/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java @@ -0,0 +1,104 @@ +package pl.waw.ipipan.zil.summ.nicolas.multiservice; + +import org.apache.thrift.TException; +import org.apache.thrift.protocol.TBinaryProtocol; +import org.apache.thrift.protocol.TProtocol; +import org.apache.thrift.transport.TSocket; +import org.apache.thrift.transport.TTransport; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.Multiservice; +import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest; +import pl.waw.ipipan.zil.multiservice.thrift.RequestPart; +import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus; +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class MultiserviceProxy { + + private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class); + + private int port; + private String host; + + public MultiserviceProxy(String host, int port) { + this.host = host; + this.port = port; + LOG.info("Multiservice at {}:{}", host, port); + } + + public TText process(String text, List<String> services) throws MultiserviceException { + List<Map<String, String>> options = new ArrayList<>(); + for (int 
i = 0; i < services.size(); i++) + options.add(new HashMap<>()); + return process(text, "", services, options); + } + + public TText process(String text, String title, List<String> services, List<Map<String, String>> options) + throws MultiserviceException { + TTransport transport = new TSocket(host, port); + ObjectRequest objectRequest = createRequest(text, title, services, options); + + try { + transport.open(); + + TProtocol protocol = new TBinaryProtocol(transport); + Multiservice.Client client = new Multiservice.Client(protocol); + + LOG.debug("Sending Multiservice request..."); + TText responseText = request(objectRequest, client); + LOG.debug("...done"); + + return responseText; + + } catch (TException e) { + LOG.error("Error processing request:" + e); + throw new MultiserviceException(e.getMessage()); + + } finally { + transport.close(); + } + } + + private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException { + + String requestToken = client.putObjectRequest(objectRequest); + while (true) { + RequestStatus status = client.getRequestStatus(requestToken); + if (RequestStatus.DONE.equals(status)) { + return client.getResultObject(requestToken); + } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { + throw client.getException(requestToken); + } + } + } + + private ObjectRequest createRequest(String textBody, String textTitle, List<String> services, + List<Map<String, String>> options) { + TText text = new TText(); + + TParagraph par = new TParagraph(); + par.setText(textTitle); + text.addToParagraphs(par); + + for (String p : textBody.split("\n\n")) { + par = new TParagraph(); + par.setText(p); + text.addToParagraphs(par); + } + + List<RequestPart> processingChain = new ArrayList<>(); + int i = 0; + for (String serviceName : services) + processingChain.add(new RequestPart(serviceName, options.get(i++))); + + return new ObjectRequest(text, processingChain); + } + +} diff --git 
a/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java b/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java new file mode 100644 index 0000000..4199c53 --- /dev/null +++ b/nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java @@ -0,0 +1,52 @@ +package pl.waw.ipipan.zil.summ.nicolas.multiservice; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.ObjectOutputStream; +import java.util.Arrays; +import java.util.List; + +public class Preprocessor { + + private static final Logger LOG = LoggerFactory.getLogger(Preprocessor.class); + + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", + "Bartek"); + private static final int PORT = 20000; + private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; + + private static final MultiserviceProxy MS_PROXY = new MultiserviceProxy(HOST, PORT); + + public TText preprocess(String body) throws MultiserviceException { + return MS_PROXY.process(body, SERVICES); + } + + public void preprocessToFile(String body, File targetFile) throws MultiserviceException { + if (targetFile.exists()) { + LOG.debug("Skipping existing file.."); + return; + } + LOG.info("Processing text into " + targetFile.getPath()); + TText ttext = preprocess(body); + try { + serialize(ttext, targetFile); + } catch (IOException e) { + LOG.error("Error serializing preprocessed text", e); + throw new MultiserviceException(e.getLocalizedMessage()); + } + } + + private static void serialize(TText ttext, File targetFile) throws IOException { + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); + 
ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { + oos.writeObject(ttext); + } + } + +} diff --git a/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java new file mode 100644 index 0000000..e3ce61d --- /dev/null +++ b/nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java @@ -0,0 +1,74 @@ +package pl.waw.ipipan.zil.summ.nicolas.multiservice; + +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.rules.TemporaryFolder; +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; + +import java.io.File; +import java.io.FileInputStream; +import java.util.List; + +import static junit.framework.TestCase.assertEquals; +import static junit.framework.TestCase.assertTrue; + +public class PreprocessorIT { + + @ClassRule + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); + + private static Preprocessor preprocessor; + + @BeforeClass + public static void initPreprocessor() { + preprocessor = new Preprocessor(); + } + + @Test + public void shouldProcessSampleText() throws Exception { + String text = "Ala ma kota. 
Ala ma też psa."; + TText processed = preprocessor.preprocess(text); + + assertSampleProcessedText(processed); + } + + private void assertSampleProcessedText(TText processed) { + assertEquals(2, processed.getParagraphsSize()); + + // first paragraph is empty (placeholder for text title) + TParagraph firstParagraph = processed.getParagraphs().get(0); + assertEquals(0, firstParagraph.getSentencesSize()); + + TParagraph secondParagraph = processed.getParagraphs().get(1); + assertEquals(2, secondParagraph.getSentencesSize()); + List<TSentence> sentences = secondParagraph.getSentences(); + + TSentence firstSentence = sentences.get(0); + assertEquals(4, firstSentence.getTokensSize()); + assertEquals("Ala", firstSentence.getTokens().get(0).getOrth()); + + TSentence secondSentence = sentences.get(1); + assertEquals(5, secondSentence.getTokensSize()); + assertEquals("Ala", secondSentence.getTokens().get(0).getOrth()); + + assertEquals(3, processed.getCoreferencesSize()); //Ala, pies, kot + } + + + @Test + public void shouldProcessSampleTextToFile() throws Exception { + String text = "Ala ma kota. 
Ala ma też psa."; + File targetFile = TEMPORARY_FOLDER.newFile(); + assertTrue(targetFile.delete()); //delete file, because preprocessor skips existing files + preprocessor.preprocessToFile(text, targetFile); + + try (FileInputStream inputStream = new FileInputStream(targetFile)) { + TText processed = Utils.loadThriftTextFromStream(inputStream); + assertSampleProcessedText(processed); + } + } +} \ No newline at end of file diff --git a/nicolas-train/pom.xml b/nicolas-train/pom.xml index 6d71d47..4401bfb 100644 --- a/nicolas-train/pom.xml +++ b/nicolas-train/pom.xml @@ -21,6 +21,10 @@ <groupId>pl.waw.ipipan.zil.summ</groupId> <artifactId>nicolas-lib</artifactId> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-multiservice</artifactId> + </dependency> <!-- internal --> <dependency> diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java index 439a33b..b62061e 100644 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java @@ -4,7 +4,7 @@ import net.lingala.zip4j.core.ZipFile; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess; +import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main; import java.io.File; import java.net.URL; @@ -45,7 +45,7 @@ public class DownloadAndPreprocessCorpus { File preprocessed = new File(WORKING_DIR, "preprocessed"); createFolder(preprocessed.getPath()); - NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); + Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); } private static File createFolder(String path) { diff --git 
a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java deleted file mode 100644 index 2c4455a..0000000 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java +++ /dev/null @@ -1,110 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; - -import org.apache.thrift.TException; -import org.apache.thrift.protocol.TBinaryProtocol; -import org.apache.thrift.protocol.TProtocol; -import org.apache.thrift.transport.TSocket; -import org.apache.thrift.transport.TTransport; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.Multiservice; -import pl.waw.ipipan.zil.multiservice.thrift.ObjectRequest; -import pl.waw.ipipan.zil.multiservice.thrift.RequestPart; -import pl.waw.ipipan.zil.multiservice.thrift.RequestStatus; -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; - -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class MultiserviceProxy { - - private static final Logger LOG = LoggerFactory.getLogger(MultiserviceProxy.class); - - private int port; - private String host; - - public MultiserviceProxy(String host, int port) { - this.host = host; - this.port = port; - LOG.info("Multiservice at " + host + ":" + port); - } - - public TText process(String text, List<String> services) throws Exception { - List<Map<String, String>> options = new ArrayList<>(); - for (int i = 0; i < services.size(); i++) - options.add(new HashMap<>()); - return process(text, "", services, options); - } - - public TText process(String text, String title, List<String> services, List<Map<String, String>> options) - throws Exception 
{ - TTransport transport = new TSocket(host, port); - ObjectRequest objectRequest = createRequest(text, title, services, options); - - try { - transport.open(); - - TProtocol protocol = new TBinaryProtocol(transport); - Multiservice.Client client = new Multiservice.Client(protocol); - - LOG.debug("Sending Multservice request..."); - TText responseText = request(objectRequest, client); - LOG.debug("...done"); - - return responseText; - - } catch (TException e) { - LOG.error("Error processing request:" + e); - throw new Exception(e); - - } finally { - transport.close(); - } - } - - private TText request(ObjectRequest objectRequest, Multiservice.Client client) throws TException { - - String requestToken = client.putObjectRequest(objectRequest); - while (true) { - RequestStatus status = client.getRequestStatus(requestToken); - if (RequestStatus.DONE.equals(status)) { - TText result = client.getResultObject(requestToken); - return result; - } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { - try { - MultiserviceException exception = client.getException(requestToken); - throw exception; - } catch (TException e) { - throw e; - } - } - } - } - - private ObjectRequest createRequest(String textBody, String textTitle, List<String> services, - List<Map<String, String>> options) { - TText text = new TText(); - - TParagraph par = new TParagraph(); - par.setText(textTitle); - text.addToParagraphs(par); - - for (String p : textBody.split("\n\n")) { - par = new TParagraph(); - par.setText(p); - text.addToParagraphs(par); - } - - List<RequestPart> processingChain = new ArrayList<>(); - int i = 0; - for (String serviceName : services) - processingChain.add(new RequestPart(serviceName, options.get(i++))); - - return new ObjectRequest(text, processingChain); - } - -} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java 
b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java deleted file mode 100644 index 2922942..0000000 --- a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java +++ /dev/null @@ -1,97 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; -import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; -import pl.waw.ipipan.zil.summ.pscapi.xml.Text; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.ObjectOutputStream; -import java.util.Arrays; -import java.util.List; - -public class NLPProcess { - - private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class); - - private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", - "Bartek"); - private static final int PORT = 20000; - private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; - - private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); - - private static final String CORPUS_FILE_SUFFIX = ".xml"; - private static final String OUTPUT_FILE_SUFFIX = ".thrift"; - - private NLPProcess() { - } - - public static void main(String[] args) { - if (args.length != 2) { - LOG.error("Wrong usage! 
Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); - return; - } - File corpusDir = new File(args[0]); - if (!corpusDir.isDirectory()) { - LOG.error("Corpus directory does not exist: {}", corpusDir); - return; - } - File targetDir = new File(args[1]); - if (!targetDir.isDirectory()) { - LOG.error("Target directory does not exist: {}", targetDir); - return; - } - - int ok = 0; - int err = 0; - File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX)); - if (files == null || files.length == 0) { - LOG.error("No corpus files found at: {}", corpusDir); - return; - } - Arrays.sort(files); - for (File file : files) { - try { - Text text = PSC_IO.readText(file); - File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); - annotateNLP(text, targetFile); - ok++; - } catch (Exception e) { - err++; - LOG.error("Problem with text in " + file + ", " + e); - } - } - LOG.info("{} texts processed successfully.", ok); - LOG.info("{} texts with errors.", err); - } - - private static void annotateNLP(Text text, File targetFile) throws Exception { - annotate(text.getBody(), targetFile); - } - - private static void annotate(String body, File targetFile) throws Exception { - if (targetFile.exists()) { - LOG.debug("Skipping existing file.."); - return; - } - LOG.info("Processing text into " + targetFile.getPath()); - TText ttext = MSPROXY.process(body, SERVICES); - serialize(ttext, targetFile); - } - - public static void serialize(TText ttext, File targetFile) throws IOException { - try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); - ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { - oos.writeObject(ttext); - } - } - - public static TText annotate(String body) throws Exception { - return MSPROXY.process(body, SERVICES); - } - -} diff --git a/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java 
b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java new file mode 100644 index 0000000..738591d --- /dev/null +++ b/nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java @@ -0,0 +1,63 @@ +package pl.waw.ipipan.zil.summ.nicolas.train.preprocess; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; +import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; +import pl.waw.ipipan.zil.summ.pscapi.xml.Text; + +import java.io.File; +import java.util.Arrays; + +public class Main { + + private static final Logger LOG = LoggerFactory.getLogger(Main.class); + + private static final String CORPUS_FILE_SUFFIX = ".xml"; + private static final String OUTPUT_FILE_SUFFIX = ".thrift"; + + private Main() { + } + + public static void main(String[] args) { + if (args.length != 2) { + LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir"); + return; + } + File corpusDir = new File(args[0]); + if (!corpusDir.isDirectory()) { + LOG.error("Corpus directory does not exist: {}", corpusDir); + return; + } + File targetDir = new File(args[1]); + if (!targetDir.isDirectory()) { + LOG.error("Target directory does not exist: {}", targetDir); + return; + } + + int ok = 0; + int err = 0; + File[] files = corpusDir.listFiles(f -> f.getName().endsWith(CORPUS_FILE_SUFFIX)); + if (files == null || files.length == 0) { + LOG.error("No corpus files found at: {}", corpusDir); + return; + } + Arrays.sort(files); + + Preprocessor processor = new Preprocessor(); + + for (File file : files) { + try { + Text text = PSC_IO.readText(file); + File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); + processor.preprocessToFile(text.getBody(), targetFile); + ok++; + } catch (Exception e) { + err++; + LOG.error("Problem with text in " + file + ", " + e); + } + } + LOG.info("{} texts 
processed successfully.", ok); + LOG.info("{} texts with errors.", err); + } +} diff --git a/nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java b/nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java deleted file mode 100644 index d66b72a..0000000 --- a/nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java +++ /dev/null @@ -1,31 +0,0 @@ -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; - -import com.google.common.collect.Lists; -import org.junit.ClassRule; -import org.junit.Test; -import org.junit.rules.TemporaryFolder; -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; - -import java.io.File; -import java.util.List; -import java.util.stream.Collectors; - -import static junit.framework.TestCase.assertEquals; - -public class NLPProcessIT { - - @ClassRule - public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); - - @Test - public void shouldProcessSampleText() throws Exception { - String text = "Ala ma kota. 
Ala ma też psa."; - TText processed = NLPProcess.annotate(text); - List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList()); - assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids); - - File targetFile = TEMPORARY_FOLDER.newFile(); - NLPProcess.serialize(processed, targetFile); - } -} \ No newline at end of file diff --git a/pom.xml b/pom.xml index 115e398..a67adaf 100644 --- a/pom.xml +++ b/pom.xml @@ -10,29 +10,34 @@ <packaging>pom</packaging> + <modules> <module>nicolas-lib</module> <module>nicolas-cli</module> <module>nicolas-model</module> <module>nicolas-train</module> <module>nicolas-common</module> + <module>nicolas-multiservice</module> </modules> <properties> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> <java.version.build>1.8</java.version.build> <pscapi.version>1.0</pscapi.version> <utils.version>1.0</utils.version> <commons-csv.version>1.4</commons-csv.version> - <guava.version>20.0</guava.version> + <guava.version>21.0</guava.version> <weka-dev.version>3.9.1</weka-dev.version> <commons-lang3.version>3.5</commons-lang3.version> <commons-io.version>2.5</commons-io.version> <slf4j-api.version>1.7.22</slf4j-api.version> <junit.version>4.12</junit.version> <zip4j.version>1.3.2</zip4j.version> + <mockito-core.version>2.7.1</mockito-core.version> + <jcommander.version>1.60</jcommander.version> </properties> <prerequisites> @@ -76,6 +81,11 @@ <artifactId>nicolas-train</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>pl.waw.ipipan.zil.summ</groupId> + <artifactId>nicolas-multiservice</artifactId> + <version>${project.version}</version> + </dependency> <!-- internal --> <dependency> @@ -126,6 +136,11 @@ <artifactId>zip4j</artifactId> <version>${zip4j.version}</version> </dependency> + <dependency> + <groupId>com.beust</groupId> + 
<artifactId>jcommander</artifactId> + <version>${jcommander.version}</version> + </dependency> <!-- logging --> <dependency> @@ -144,20 +159,204 @@ <groupId>junit</groupId> <artifactId>junit</artifactId> <version>${junit.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.mockito</groupId> + <artifactId>mockito-core</artifactId> + <version>${mockito-core.version}</version> + <scope>test</scope> </dependency> </dependencies> </dependencyManagement> + <build> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-jar-plugin</artifactId> + <version>3.0.2</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-resources-plugin</artifactId> + <version>3.0.1</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-clean-plugin</artifactId> + <version>3.0.0</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-site-plugin</artifactId> + <version>3.5.1</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-install-plugin</artifactId> + <version>2.5.2</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-deploy-plugin</artifactId> + <version>2.8.2</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <version>2.6</version> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.5.1</version> + <configuration> + <source>${java.version.build}</source> + <target>${java.version.build}</target> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <version>2.19.1</version> + <configuration> + <!-- Sets the VM argument line used when unit tests 
are run. --> + <argLine>${surefireArgLine}</argLine> + <!-- Skips unit tests if the value of skip.unit.tests property is true --> + <skipTests>${skip.unit.tests}</skipTests> + <!-- Excludes integration tests when unit tests are run. --> + <excludes> + <exclude>**/IT*.java</exclude> + </excludes> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-failsafe-plugin</artifactId> + <version>2.19.1</version> + <executions> + <execution> + <id>integration-test</id> + <goals> + <goal>integration-test</goal> + <goal>verify</goal> + </goals> + <configuration> + <!-- Sets the VM argument line used when integration tests are run. --> + <argLine>${failsafeArgLine}</argLine> + <!-- + Skips integration tests if the value of skip.integration.tests property + is true + --> + <skipTests>${skip.integration.tests}</skipTests> + </configuration> + </execution> + <execution> + <id>verify</id> + <goals> + <goal>verify</goal> + </goals> + </execution> + </executions> + </plugin> + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> + <version>0.7.8</version> + <executions> + <!-- + Prepares the property pointing to the JaCoCo runtime agent which + is passed as a VM argument when the Maven Surefire plugin is executed. + --> + <execution> + <id>pre-unit-test</id> + <goals> + <goal>prepare-agent</goal> + </goals> + <configuration> + <!-- Sets the path to the file which contains the execution data. --> + <destFile>${project.build.directory}/jacoco.exec</destFile> + <!-- + Sets the name of the property containing the settings + for the JaCoCo runtime agent. + --> + <propertyName>surefireArgLine</propertyName> + </configuration> + </execution> + <!-- + Ensures that the code coverage report for unit tests is created after + unit tests have been run. 
+ --> + <execution> + <id>post-unit-test</id> + <phase>test</phase> + <goals> + <goal>report</goal> + </goals> + <configuration> + <!-- Sets the path to the file which contains the execution data. --> + <dataFile>${project.build.directory}/jacoco.exec</dataFile> + <!-- Sets the output directory for the code coverage report. --> + <outputDirectory>${project.reporting.outputDirectory}/jacoco-ut</outputDirectory> + </configuration> + </execution> + + <!-- + Prepares the property pointing to the JaCoCo runtime agent which + is passed as a VM argument when the Maven Failsafe plugin is executed. + --> + <execution> + <id>pre-integration-test</id> + <phase>pre-integration-test</phase> + <goals> + <goal>prepare-agent</goal> + </goals> + <configuration> + <!-- Sets the path to the file which contains the execution data. --> + <destFile>${project.build.directory}/jacoco-it.exec</destFile> + <!-- + Sets the name of the property containing the settings + for the JaCoCo runtime agent. + --> + <propertyName>failsafeArgLine</propertyName> + </configuration> + </execution> + <!-- + Ensures that the code coverage report for integration tests is created after + integration tests have been run. + --> + <execution> + <id>post-integration-test</id> + <phase>post-integration-test</phase> + <goals> + <goal>report</goal> + </goals> + <configuration> + <!-- Sets the path to the file which contains the execution data. --> + <dataFile>${project.build.directory}/jacoco-it.exec</dataFile> + <!-- Sets the output directory for the code coverage report. 
--> + <outputDirectory>${project.reporting.outputDirectory}/jacoco-it</outputDirectory> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </pluginManagement> <plugins> <plugin> <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <version>3.5.1</version> - <configuration> - <source>${java.version.build}</source> - <target>${java.version.build}</target> - </configuration> + <artifactId>maven-failsafe-plugin</artifactId> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + </plugin> + <plugin> + <groupId>org.jacoco</groupId> + <artifactId>jacoco-maven-plugin</artifactId> </plugin> </plugins> </build> -- libgit2 0.22.2