Commit cb490cab2a2bfe90e2c4ad0661117321fb2f2e1f
1 parent
76eeceb7
create sample cli client
Showing
22 changed files
with
890 additions
and
108 deletions
nicolas-cli/README.md
0 → 100644
1 | +# nicolas-cli | ||
2 | + | ||
3 | +This module contains a sample command-line application, which uses the Nicolas library to summarize a chosen input text file. | ||
4 | +The summary is written to the target output file. Additionally, the user needs to specify the desired number of tokens in the summary. | ||
5 | + | ||
6 | +## Installation | ||
7 | + | ||
8 | + mvn clean install | ||
9 | + | ||
10 | +## Usage | ||
11 | + | ||
12 | + java -jar target/nicolas-cli.jar -help | ||
0 | \ No newline at end of file | 13 | \ No newline at end of file |
nicolas-cli/pom.xml
@@ -11,4 +11,70 @@ | @@ -11,4 +11,70 @@ | ||
11 | 11 | ||
12 | <artifactId>nicolas-cli</artifactId> | 12 | <artifactId>nicolas-cli</artifactId> |
13 | 13 | ||
14 | + <dependencies> | ||
15 | + | ||
16 | + <!-- project --> | ||
17 | + <dependency> | ||
18 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
19 | + <artifactId>nicolas-multiservice</artifactId> | ||
20 | + </dependency> | ||
21 | + <dependency> | ||
22 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
23 | + <artifactId>nicolas-lib</artifactId> | ||
24 | + </dependency> | ||
25 | + | ||
26 | + <!-- third party --> | ||
27 | + <dependency> | ||
28 | + <groupId>com.beust</groupId> | ||
29 | + <artifactId>jcommander</artifactId> | ||
30 | + </dependency> | ||
31 | + | ||
32 | + <!-- logging --> | ||
33 | + <dependency> | ||
34 | + <groupId>org.slf4j</groupId> | ||
35 | + <artifactId>slf4j-api</artifactId> | ||
36 | + </dependency> | ||
37 | + <dependency> | ||
38 | + <groupId>org.slf4j</groupId> | ||
39 | + <artifactId>slf4j-simple</artifactId> | ||
40 | + </dependency> | ||
41 | + | ||
42 | + <!-- test --> | ||
43 | + <dependency> | ||
44 | + <groupId>junit</groupId> | ||
45 | + <artifactId>junit</artifactId> | ||
46 | + </dependency> | ||
47 | + <dependency> | ||
48 | + <groupId>org.mockito</groupId> | ||
49 | + <artifactId>mockito-core</artifactId> | ||
50 | + </dependency> | ||
51 | + </dependencies> | ||
52 | + | ||
53 | + <build> | ||
54 | + <plugins> | ||
55 | + <plugin> | ||
56 | + <artifactId>maven-assembly-plugin</artifactId> | ||
57 | + <configuration> | ||
58 | + <appendAssemblyId>false</appendAssemblyId> | ||
59 | + <archive> | ||
60 | + <manifest> | ||
61 | + <mainClass>pl.waw.ipipan.zil.summ.nicolas.cli.Main</mainClass> | ||
62 | + </manifest> | ||
63 | + </archive> | ||
64 | + <descriptorRefs> | ||
65 | + <descriptorRef>jar-with-dependencies</descriptorRef> | ||
66 | + </descriptorRefs> | ||
67 | + </configuration> | ||
68 | + <executions> | ||
69 | + <execution> | ||
70 | + <id>make-assembly</id> | ||
71 | + <phase>package</phase> | ||
72 | + <goals> | ||
73 | + <goal>single</goal> | ||
74 | + </goals> | ||
75 | + </execution> | ||
76 | + </executions> | ||
77 | + </plugin> | ||
78 | + </plugins> | ||
79 | + </build> | ||
14 | </project> | 80 | </project> |
15 | \ No newline at end of file | 81 | \ No newline at end of file |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Cli.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import com.beust.jcommander.IParameterValidator; | ||
4 | +import com.beust.jcommander.JCommander; | ||
5 | +import com.beust.jcommander.Parameter; | ||
6 | +import com.beust.jcommander.ParameterException; | ||
7 | +import com.beust.jcommander.converters.FileConverter; | ||
8 | +import com.beust.jcommander.validators.PositiveInteger; | ||
9 | +import org.slf4j.Logger; | ||
10 | +import org.slf4j.LoggerFactory; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | + | ||
14 | +class Cli { | ||
15 | + | ||
16 | + private static final Logger LOG = LoggerFactory.getLogger(Cli.class); | ||
17 | + | ||
18 | + @Parameter(names = {"-help", "-h"}, description = "Print help") | ||
19 | + private boolean help = false; | ||
20 | + | ||
21 | + @Parameter(names = {"-input", "-i"}, description = "Input text file to summarize", required = true, validateWith = FileValidator.class, converter = FileConverter.class) | ||
22 | + private File inputFile; | ||
23 | + | ||
24 | + @Parameter(names = {"-output", "-o"}, description = "Output file path for summary", required = true, validateWith = FileValidator.class, converter = FileConverter.class) | ||
25 | + private File outputFile; | ||
26 | + | ||
27 | + @Parameter(names = {"-target", "-t"}, description = "Target summary token count", required = true, validateWith = PositiveInteger.class) | ||
28 | + private int targetTokenCount; | ||
29 | + | ||
30 | + private boolean invalid = false; | ||
31 | + | ||
32 | + boolean isHelp() { | ||
33 | + return help; | ||
34 | + } | ||
35 | + | ||
36 | + File getInputFile() { | ||
37 | + return inputFile; | ||
38 | + } | ||
39 | + | ||
40 | + File getOutputFile() { | ||
41 | + return outputFile; | ||
42 | + } | ||
43 | + | ||
44 | + int getTargetTokenCount() { | ||
45 | + return targetTokenCount; | ||
46 | + } | ||
47 | + | ||
48 | + @SuppressWarnings("squid:S1166") | ||
49 | + static Cli parse(String[] args) { | ||
50 | + Cli cli = new Cli(); | ||
51 | + JCommander jCommander; | ||
52 | + try { | ||
53 | + jCommander = new JCommander(cli, args); | ||
54 | + } catch (ParameterException ex) { | ||
55 | + LOG.error("Error parsing parameters: {}", ex.getLocalizedMessage()); | ||
56 | + cli.setInvalid(); | ||
57 | + return cli; | ||
58 | + } | ||
59 | + if (cli.isHelp()) { | ||
60 | + StringBuilder stringBuilder = new StringBuilder(); | ||
61 | + jCommander.usage(stringBuilder); | ||
62 | + LOG.info("{}", stringBuilder); | ||
63 | + } | ||
64 | + return cli; | ||
65 | + } | ||
66 | + | ||
67 | + private void setInvalid() { | ||
68 | + invalid = true; | ||
69 | + } | ||
70 | + | ||
71 | + boolean isInvalid() { | ||
72 | + return invalid; | ||
73 | + } | ||
74 | + | ||
75 | + public static class FileValidator implements IParameterValidator { | ||
76 | + | ||
77 | + @Override | ||
78 | + public void validate(String name, String value) { | ||
79 | + File file = new File(value); | ||
80 | + if (!file.isFile()) { | ||
81 | + throw new ParameterException("Parameter " + name | ||
82 | + + " should be a valid file path (found " + value + ")"); | ||
83 | + } | ||
84 | + } | ||
85 | + | ||
86 | + } | ||
87 | +} |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Client.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | +import org.slf4j.Logger; | ||
5 | +import org.slf4j.LoggerFactory; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.NicolasException; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | ||
12 | + | ||
13 | +import java.io.*; | ||
14 | + | ||
15 | +class Client { | ||
16 | + | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(Client.class); | ||
18 | + | ||
19 | + private final Preprocessor preprocessor; | ||
20 | + private final Nicolas nicolas; | ||
21 | + | ||
22 | + Client(Preprocessor preprocessor, Nicolas nicolas) { | ||
23 | + this.preprocessor = preprocessor; | ||
24 | + this.nicolas = nicolas; | ||
25 | + } | ||
26 | + | ||
27 | + @SuppressWarnings("squid:S1166") | ||
28 | + void summarize(File inputFile, File outputFile, int targetTokenCount) { | ||
29 | + try { | ||
30 | + String inputText = loadInputText(inputFile); | ||
31 | + TText preprocessed = preprocess(inputText); | ||
32 | + String summary = summarize(preprocessed, targetTokenCount); | ||
33 | + saveSummaryToFile(summary, outputFile); | ||
34 | + } catch (IOException | MultiserviceException | NicolasException e) { | ||
35 | + LOG.error("Exiting because of an error."); | ||
36 | + } | ||
37 | + } | ||
38 | + | ||
39 | + private String loadInputText(File inputFile) throws IOException { | ||
40 | + String inputText; | ||
41 | + try (FileInputStream inputStream = new FileInputStream(inputFile)) { | ||
42 | + inputText = IOUtils.toString(inputStream, Constants.ENCODING); | ||
43 | + } catch (IOException e) { | ||
44 | + LOG.error("Error reading input text."); | ||
45 | + throw e; | ||
46 | + } | ||
47 | + return inputText; | ||
48 | + } | ||
49 | + | ||
50 | + private TText preprocess(String inputText) throws MultiserviceException { | ||
51 | + try { | ||
52 | + return preprocessor.preprocess(inputText); | ||
53 | + } catch (MultiserviceException e) { | ||
54 | + LOG.error("Error preprocessing input text."); | ||
55 | + throw e; | ||
56 | + } | ||
57 | + } | ||
58 | + | ||
59 | + private String summarize(TText preprocessed, int targetTokenCount) throws NicolasException { | ||
60 | + try { | ||
61 | + return nicolas.summarizeThrift(preprocessed, targetTokenCount); | ||
62 | + } catch (NicolasException e) { | ||
63 | + LOG.error("Error preprocessing input text."); | ||
64 | + throw e; | ||
65 | + } | ||
66 | + } | ||
67 | + | ||
68 | + private void saveSummaryToFile(String summary, File outputFile) throws IOException { | ||
69 | + try (OutputStream outputStream = new FileOutputStream(outputFile)) { | ||
70 | + IOUtils.write(summary, outputStream, Constants.ENCODING); | ||
71 | + } catch (IOException e) { | ||
72 | + LOG.error("Error writing file with summary."); | ||
73 | + throw e; | ||
74 | + } | ||
75 | + } | ||
76 | + | ||
77 | +} |
nicolas-cli/src/main/java/pl/waw/ipipan/zil/summ/nicolas/cli/Main.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | ||
6 | +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | ||
7 | + | ||
8 | +import java.io.IOException; | ||
9 | + | ||
10 | +public class Main { | ||
11 | + | ||
12 | + private static final Logger LOG = LoggerFactory.getLogger(Main.class); | ||
13 | + | ||
14 | + private Main() { | ||
15 | + } | ||
16 | + | ||
17 | + @SuppressWarnings("squid:S1166") | ||
18 | + public static void main(String[] args) { | ||
19 | + Cli cli = Cli.parse(args); | ||
20 | + if (cli.isHelp() || cli.isInvalid()) { | ||
21 | + return; | ||
22 | + } | ||
23 | + | ||
24 | + Nicolas nicolas; | ||
25 | + Preprocessor preprocessor; | ||
26 | + try { | ||
27 | + nicolas = new Nicolas(); | ||
28 | + preprocessor = new Preprocessor(); | ||
29 | + } catch (IOException | ClassNotFoundException e) { | ||
30 | + LOG.error("Error loading Nicolas or Multiservice preprocessor! Will exit."); | ||
31 | + return; | ||
32 | + } | ||
33 | + Client client = new Client(preprocessor, nicolas); | ||
34 | + client.summarize(cli.getInputFile(), cli.getOutputFile(), cli.getTargetTokenCount()); | ||
35 | + } | ||
36 | +} |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/CliTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.junit.BeforeClass; | ||
4 | +import org.junit.ClassRule; | ||
5 | +import org.junit.Test; | ||
6 | +import org.junit.rules.TemporaryFolder; | ||
7 | + | ||
8 | +import java.io.File; | ||
9 | +import java.io.IOException; | ||
10 | + | ||
11 | +import static org.junit.Assert.*; | ||
12 | + | ||
13 | +public class CliTest { | ||
14 | + | ||
15 | + @ClassRule | ||
16 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
17 | + | ||
18 | + private static File sampleInputFile; | ||
19 | + private static File sampleOutputFile; | ||
20 | + private static final int TARGET_TOKEN_COUNT = 50; | ||
21 | + private static final String INVALID_FILE_PATH = "nonexistent_dir/nonexistent_file"; | ||
22 | + | ||
23 | + @BeforeClass | ||
24 | + public static void initSampleFiles() throws IOException { | ||
25 | + sampleInputFile = TEMPORARY_FOLDER.newFile(); | ||
26 | + sampleOutputFile = TEMPORARY_FOLDER.newFile(); | ||
27 | + } | ||
28 | + | ||
29 | + @Test | ||
30 | + public void failNoArguments() throws Exception { | ||
31 | + String[] args = new String[]{}; | ||
32 | + Cli cli = Cli.parse(args); | ||
33 | + assertTrue(cli.isInvalid()); | ||
34 | + } | ||
35 | + | ||
36 | + @Test | ||
37 | + public void failInvalidArgument() throws Exception { | ||
38 | + String[] args = new String[]{"-xxxx", "xxx", "-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; | ||
39 | + Cli cli = Cli.parse(args); | ||
40 | + assertTrue(cli.isInvalid()); | ||
41 | + } | ||
42 | + | ||
43 | + @Test | ||
44 | + public void failInvalidInputFile() throws Exception { | ||
45 | + String[] args = new String[]{"-i", INVALID_FILE_PATH, "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; | ||
46 | + Cli cli = Cli.parse(args); | ||
47 | + assertTrue(cli.isInvalid()); | ||
48 | + } | ||
49 | + | ||
50 | + @Test | ||
51 | + public void failInvalidOutputFile() throws Exception { | ||
52 | + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", INVALID_FILE_PATH, "-t", Integer.toString(TARGET_TOKEN_COUNT)}; | ||
53 | + Cli cli = Cli.parse(args); | ||
54 | + assertTrue(cli.isInvalid()); | ||
55 | + } | ||
56 | + | ||
57 | + @Test | ||
58 | + public void failInvalidTargetTokenCount() throws Exception { | ||
59 | + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(-1)}; | ||
60 | + Cli cli = Cli.parse(args); | ||
61 | + assertTrue(cli.isInvalid()); | ||
62 | + } | ||
63 | + | ||
64 | + @Test | ||
65 | + public void validArguments() throws Exception { | ||
66 | + String[] args = new String[]{"-i", sampleInputFile.getPath(), "-o", sampleOutputFile.getPath(), "-t", Integer.toString(TARGET_TOKEN_COUNT)}; | ||
67 | + Cli cli = Cli.parse(args); | ||
68 | + assertFalse(cli.isInvalid()); | ||
69 | + assertEquals(sampleInputFile, cli.getInputFile()); | ||
70 | + assertEquals(sampleOutputFile, cli.getOutputFile()); | ||
71 | + assertEquals(TARGET_TOKEN_COUNT, cli.getTargetTokenCount()); | ||
72 | + } | ||
73 | + | ||
74 | + | ||
75 | +} |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/ClientTest.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | +import org.junit.ClassRule; | ||
5 | +import org.junit.Test; | ||
6 | +import org.junit.rules.TemporaryFolder; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
8 | +import pl.waw.ipipan.zil.summ.nicolas.Nicolas; | ||
9 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
11 | +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; | ||
12 | + | ||
13 | +import java.io.File; | ||
14 | +import java.io.FileInputStream; | ||
15 | +import java.io.InputStream; | ||
16 | + | ||
17 | +import static org.junit.Assert.assertEquals; | ||
18 | +import static org.mockito.ArgumentMatchers.*; | ||
19 | +import static org.mockito.Mockito.mock; | ||
20 | +import static org.mockito.Mockito.when; | ||
21 | +import static pl.waw.ipipan.zil.summ.nicolas.cli.TestUtils.SAMPLE_INPUT_RESOURCE_PATH; | ||
22 | +import static pl.waw.ipipan.zil.summ.nicolas.cli.TestUtils.SAMPLE_THRIFT_TEXT_RESOURCE_PATH; | ||
23 | + | ||
24 | +public class ClientTest { | ||
25 | + | ||
26 | + @ClassRule | ||
27 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
28 | + | ||
29 | + @Test | ||
30 | + public void processSampleText() throws Exception { | ||
31 | + Preprocessor preprocessor = mock(Preprocessor.class); | ||
32 | + TText ttext = Utils.loadThriftTextFromResource(SAMPLE_THRIFT_TEXT_RESOURCE_PATH); | ||
33 | + when(preprocessor.preprocess(any())).thenReturn(ttext); | ||
34 | + | ||
35 | + Nicolas nicolas = mock(Nicolas.class); | ||
36 | + String targetSummary = "This is a summary"; | ||
37 | + when(nicolas.summarizeThrift(eq(ttext), anyInt())).thenReturn(targetSummary); | ||
38 | + | ||
39 | + Client client = new Client(preprocessor, nicolas); | ||
40 | + | ||
41 | + File inputFile = TestUtils.copyResourceToFile(SAMPLE_INPUT_RESOURCE_PATH, TEMPORARY_FOLDER.newFile()); | ||
42 | + File outputFile = TEMPORARY_FOLDER.newFile(); | ||
43 | + int targetTokenCount = 50; | ||
44 | + | ||
45 | + String[] args = new String[]{"-i", inputFile.getPath(), "-o", outputFile.getPath(), "-t", Integer.toString(targetTokenCount)}; | ||
46 | + Cli cli = Cli.parse(args); | ||
47 | + client.summarize(cli.getInputFile(), cli.getOutputFile(), cli.getTargetTokenCount()); | ||
48 | + | ||
49 | + try (InputStream inputStream = new FileInputStream(outputFile)) { | ||
50 | + String summary = IOUtils.toString(inputStream, Constants.ENCODING); | ||
51 | + assertEquals(targetSummary, summary); | ||
52 | + } | ||
53 | + } | ||
54 | +} | ||
0 | \ No newline at end of file | 55 | \ No newline at end of file |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/MainIT.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | +import org.junit.ClassRule; | ||
5 | +import org.junit.Test; | ||
6 | +import org.junit.rules.TemporaryFolder; | ||
7 | +import pl.waw.ipipan.zil.summ.nicolas.common.Constants; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.io.FileInputStream; | ||
11 | +import java.io.InputStream; | ||
12 | + | ||
13 | +import static junit.framework.TestCase.assertTrue; | ||
14 | + | ||
15 | +public class MainIT { | ||
16 | + | ||
17 | + private final static String SAMPLE_INPUT_RESOURCE_PATH = "/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt"; | ||
18 | + | ||
19 | + @ClassRule | ||
20 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
21 | + | ||
22 | + @Test | ||
23 | + public void processSampleText() throws Exception { | ||
24 | + File inputFile = TestUtils.copyResourceToFile(SAMPLE_INPUT_RESOURCE_PATH, TEMPORARY_FOLDER.newFile()); | ||
25 | + File outputFile = TEMPORARY_FOLDER.newFile(); | ||
26 | + int targetTokenCount = 50; | ||
27 | + | ||
28 | + String[] args = new String[]{"-i", inputFile.getPath(), "-o", outputFile.getPath(), "-t", Integer.toString(targetTokenCount)}; | ||
29 | + Main.main(args); | ||
30 | + | ||
31 | + try (InputStream inputStream = new FileInputStream(outputFile)) { | ||
32 | + String summary = IOUtils.toString(inputStream, Constants.ENCODING); | ||
33 | + assertTrue(summary.length() > 0); | ||
34 | + assertTrue(summary.length() < targetTokenCount * 10); | ||
35 | + } | ||
36 | + } | ||
37 | + | ||
38 | +} | ||
0 | \ No newline at end of file | 39 | \ No newline at end of file |
nicolas-cli/src/test/java/pl/waw/ipipan/zil/summ/nicolas/cli/TestUtils.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.cli; | ||
2 | + | ||
3 | +import org.apache.commons.io.IOUtils; | ||
4 | + | ||
5 | +import java.io.*; | ||
6 | + | ||
7 | +class TestUtils { | ||
8 | + | ||
9 | + private static final String PACKAGE = "/pl/waw/ipipan/zil/summ/nicolas/cli/"; | ||
10 | + | ||
11 | + static final String SAMPLE_INPUT_RESOURCE_PATH = PACKAGE + "sample_input.txt"; | ||
12 | + static final String SAMPLE_THRIFT_TEXT_RESOURCE_PATH = PACKAGE + "sample_input.thrift"; | ||
13 | + | ||
14 | + private TestUtils() { | ||
15 | + } | ||
16 | + | ||
17 | + static File copyResourceToFile(String resourcePath, File file) throws IOException { | ||
18 | + try (InputStream inputStream = MainIT.class.getResourceAsStream(resourcePath); | ||
19 | + OutputStream outputStream = new FileOutputStream(file)) { | ||
20 | + IOUtils.copy(inputStream, outputStream); | ||
21 | + } | ||
22 | + return file; | ||
23 | + } | ||
24 | +} |
nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.thrift
0 → 100644
No preview for this file type
nicolas-cli/src/test/resources/pl/waw/ipipan/zil/summ/nicolas/cli/sample_input.txt
0 → 100644
1 | +To będzie już druga próba licytacji nieruchomości na pl. Słonecznym, którą urzędnicy wytropili po latach poszukiwań majątku Adama Gesslera. | ||
2 | + | ||
3 | +Jego dług wobec miasta szacują dziś na ok. 27 mln zł. Już w 1992 r., wkrótce po podpisaniu umowy najmu lokalu na Rynku Staromiejskim, zaczęły się problemy z czynszem. Sąd orzekł eksmisję. Dotąd miastu udało się odzyskać ledwie kilkadziesiąt tysięcy złotych długu. | ||
4 | + | ||
5 | +Sprawa budzi wielkie emocje, bo choć Adam Gessler jest słynnym restauratorem, oficjalnie nie ma nic. Nawet wynajęta przez Zakład Gospodarowania Nieruchomościami w Śródmieściu firma detektywistyczna nie znalazła majątku. | ||
6 | + | ||
7 | +Pozostają dwa mieszkania na Żoliborzu, wyceniane przed rokiem na blisko 4,3 mln zł. Będą licytowane za dwie trzecie ceny. W ZGN wymyślili, żeby miasto przystąpiło do licytacji. Jeśli uda się kupić nieruchomość, komornik pospłaca wierzycieli Adama i Piotra Gesslerów. A miasto będzie mogło w przyszłości sprzedać korzystnie atrakcyjny dom. | ||
8 | + | ||
9 | +Licytacje odbędą się w środę. - Korzyści z wylicytowania domu będą niewielkie w stosunku do ogromnego długu pana Gesslera. Chodzi jednak o to, żeby wiedział, że miasto nie zrezygnuje z upominania się o swoje - tłumaczyła "Gazecie" Małgorzata Mazur, dyrektorka ZGN. |
nicolas-common/src/main/java/pl/waw/ipipan/zil/summ/nicolas/common/Utils.java
@@ -45,17 +45,21 @@ public class Utils { | @@ -45,17 +45,21 @@ public class Utils { | ||
45 | } | 45 | } |
46 | } | 46 | } |
47 | 47 | ||
48 | + public static TText loadThriftTextFromStream(InputStream inputStream) throws IOException { | ||
49 | + try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(inputStream)) { | ||
50 | + return (TText) ois.readObject(); | ||
51 | + } catch (ClassNotFoundException e) { | ||
52 | + LOG.error("Error reading serialized thrift text file, class not found.", e); | ||
53 | + throw new IOException(e); | ||
54 | + } | ||
55 | + } | ||
56 | + | ||
48 | public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { | 57 | public static TText loadThriftTextFromResource(String textResourcePath) throws IOException { |
49 | try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { | 58 | try (InputStream stream = Utils.class.getResourceAsStream(textResourcePath)) { |
50 | if (stream == null) { | 59 | if (stream == null) { |
51 | throw new IOException("Resource not found at: " + textResourcePath); | 60 | throw new IOException("Resource not found at: " + textResourcePath); |
52 | } | 61 | } |
53 | - try (VersionIgnoringObjectInputStream ois = new VersionIgnoringObjectInputStream(stream)) { | ||
54 | - return (TText) ois.readObject(); | ||
55 | - } catch (ClassNotFoundException e) { | ||
56 | - LOG.error("Error reading serialized thrift text file, class not found.", e); | ||
57 | - throw new IOException(e); | ||
58 | - } | 62 | + return loadThriftTextFromStream(stream); |
59 | } | 63 | } |
60 | } | 64 | } |
61 | 65 |
nicolas-lib/src/main/java/pl/waw/ipipan/zil/summ/nicolas/Nicolas.java
@@ -39,10 +39,13 @@ public class Nicolas { | @@ -39,10 +39,13 @@ public class Nicolas { | ||
39 | zeroFeatureExtractor = new ZeroFeatureExtractor(); | 39 | zeroFeatureExtractor = new ZeroFeatureExtractor(); |
40 | } | 40 | } |
41 | 41 | ||
42 | - public String summarizeThrift(TText text, int targetTokenCount) throws Exception { | ||
43 | - Set<TMention> goodMentions | ||
44 | - = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); | ||
45 | - return calculateSummary(text, goodMentions, targetTokenCount); | 42 | + public String summarizeThrift(TText text, int targetTokenCount) throws NicolasException { |
43 | + try { | ||
44 | + Set<TMention> goodMentions = MentionModel.detectGoodMentions(mentionModel, mentionFeatureExtractor, text); | ||
45 | + return calculateSummary(text, goodMentions, targetTokenCount); | ||
46 | + } catch (Exception e) { | ||
47 | + throw new NicolasException(e); | ||
48 | + } | ||
46 | } | 49 | } |
47 | 50 | ||
48 | private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { | 51 | private String calculateSummary(TText thrifted, Set<TMention> goodMentions, int targetSize) throws Exception { |
nicolas-multiservice/pom.xml
0 → 100644
1 | +<?xml version="1.0" encoding="UTF-8"?> | ||
2 | +<project xmlns="http://maven.apache.org/POM/4.0.0" | ||
3 | + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | ||
4 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
5 | + <parent> | ||
6 | + <artifactId>nicolas-container</artifactId> | ||
7 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
8 | + <version>1.0-SNAPSHOT</version> | ||
9 | + </parent> | ||
10 | + <modelVersion>4.0.0</modelVersion> | ||
11 | + | ||
12 | + <artifactId>nicolas-multiservice</artifactId> | ||
13 | + | ||
14 | + <dependencies> | ||
15 | + <!-- internal --> | ||
16 | + <dependency> | ||
17 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
18 | + <artifactId>utils</artifactId> | ||
19 | + </dependency> | ||
20 | + <dependency> | ||
21 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
22 | + <artifactId>pscapi</artifactId> | ||
23 | + </dependency> | ||
24 | + | ||
25 | + <!-- logging --> | ||
26 | + <dependency> | ||
27 | + <groupId>org.slf4j</groupId> | ||
28 | + <artifactId>slf4j-api</artifactId> | ||
29 | + </dependency> | ||
30 | + | ||
31 | + <!-- test --> | ||
32 | + <dependency> | ||
33 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
34 | + <artifactId>nicolas-common</artifactId> | ||
35 | + <scope>test</scope> | ||
36 | + </dependency> | ||
37 | + | ||
38 | + </dependencies> | ||
39 | +</project> | ||
0 | \ No newline at end of file | 40 | \ No newline at end of file |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/MultiserviceProxy.java renamed to nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/MultiserviceProxy.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.multiservice; |
2 | 2 | ||
3 | import org.apache.thrift.TException; | 3 | import org.apache.thrift.TException; |
4 | import org.apache.thrift.protocol.TBinaryProtocol; | 4 | import org.apache.thrift.protocol.TBinaryProtocol; |
@@ -30,10 +30,10 @@ public class MultiserviceProxy { | @@ -30,10 +30,10 @@ public class MultiserviceProxy { | ||
30 | public MultiserviceProxy(String host, int port) { | 30 | public MultiserviceProxy(String host, int port) { |
31 | this.host = host; | 31 | this.host = host; |
32 | this.port = port; | 32 | this.port = port; |
33 | - LOG.info("Multiservice at " + host + ":" + port); | 33 | + LOG.info("Multiservice at {}:{}", host, port); |
34 | } | 34 | } |
35 | 35 | ||
36 | - public TText process(String text, List<String> services) throws Exception { | 36 | + public TText process(String text, List<String> services) throws MultiserviceException { |
37 | List<Map<String, String>> options = new ArrayList<>(); | 37 | List<Map<String, String>> options = new ArrayList<>(); |
38 | for (int i = 0; i < services.size(); i++) | 38 | for (int i = 0; i < services.size(); i++) |
39 | options.add(new HashMap<>()); | 39 | options.add(new HashMap<>()); |
@@ -41,7 +41,7 @@ public class MultiserviceProxy { | @@ -41,7 +41,7 @@ public class MultiserviceProxy { | ||
41 | } | 41 | } |
42 | 42 | ||
43 | public TText process(String text, String title, List<String> services, List<Map<String, String>> options) | 43 | public TText process(String text, String title, List<String> services, List<Map<String, String>> options) |
44 | - throws Exception { | 44 | + throws MultiserviceException { |
45 | TTransport transport = new TSocket(host, port); | 45 | TTransport transport = new TSocket(host, port); |
46 | ObjectRequest objectRequest = createRequest(text, title, services, options); | 46 | ObjectRequest objectRequest = createRequest(text, title, services, options); |
47 | 47 | ||
@@ -51,7 +51,7 @@ public class MultiserviceProxy { | @@ -51,7 +51,7 @@ public class MultiserviceProxy { | ||
51 | TProtocol protocol = new TBinaryProtocol(transport); | 51 | TProtocol protocol = new TBinaryProtocol(transport); |
52 | Multiservice.Client client = new Multiservice.Client(protocol); | 52 | Multiservice.Client client = new Multiservice.Client(protocol); |
53 | 53 | ||
54 | - LOG.debug("Sending Multservice request..."); | 54 | + LOG.debug("Sending Multiservice request..."); |
55 | TText responseText = request(objectRequest, client); | 55 | TText responseText = request(objectRequest, client); |
56 | LOG.debug("...done"); | 56 | LOG.debug("...done"); |
57 | 57 | ||
@@ -59,7 +59,7 @@ public class MultiserviceProxy { | @@ -59,7 +59,7 @@ public class MultiserviceProxy { | ||
59 | 59 | ||
60 | } catch (TException e) { | 60 | } catch (TException e) { |
61 | LOG.error("Error processing request:" + e); | 61 | LOG.error("Error processing request:" + e); |
62 | - throw new Exception(e); | 62 | + throw new MultiserviceException(e.getMessage()); |
63 | 63 | ||
64 | } finally { | 64 | } finally { |
65 | transport.close(); | 65 | transport.close(); |
@@ -72,15 +72,9 @@ public class MultiserviceProxy { | @@ -72,15 +72,9 @@ public class MultiserviceProxy { | ||
72 | while (true) { | 72 | while (true) { |
73 | RequestStatus status = client.getRequestStatus(requestToken); | 73 | RequestStatus status = client.getRequestStatus(requestToken); |
74 | if (RequestStatus.DONE.equals(status)) { | 74 | if (RequestStatus.DONE.equals(status)) { |
75 | - TText result = client.getResultObject(requestToken); | ||
76 | - return result; | 75 | + return client.getResultObject(requestToken); |
77 | } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { | 76 | } else if (RequestStatus.FAILED.equals(status) || RequestStatus.DUMPED.equals(status)) { |
78 | - try { | ||
79 | - MultiserviceException exception = client.getException(requestToken); | ||
80 | - throw exception; | ||
81 | - } catch (TException e) { | ||
82 | - throw e; | ||
83 | - } | 77 | + throw client.getException(requestToken); |
84 | } | 78 | } |
85 | } | 79 | } |
86 | } | 80 | } |
nicolas-multiservice/src/main/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/Preprocessor.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.multiservice; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
7 | + | ||
8 | +import java.io.File; | ||
9 | +import java.io.FileOutputStream; | ||
10 | +import java.io.IOException; | ||
11 | +import java.io.ObjectOutputStream; | ||
12 | +import java.util.Arrays; | ||
13 | +import java.util.List; | ||
14 | + | ||
15 | +public class Preprocessor { | ||
16 | + | ||
17 | + private static final Logger LOG = LoggerFactory.getLogger(Preprocessor.class); | ||
18 | + | ||
19 | + private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", | ||
20 | + "Bartek"); | ||
21 | + private static final int PORT = 20000; | ||
22 | + private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; | ||
23 | + | ||
24 | + private static final MultiserviceProxy MS_PROXY = new MultiserviceProxy(HOST, PORT); | ||
25 | + | ||
26 | + public TText preprocess(String body) throws MultiserviceException { | ||
27 | + return MS_PROXY.process(body, SERVICES); | ||
28 | + } | ||
29 | + | ||
30 | + public void preprocessToFile(String body, File targetFile) throws MultiserviceException { | ||
31 | + if (targetFile.exists()) { | ||
32 | + LOG.debug("Skipping existing file.."); | ||
33 | + return; | ||
34 | + } | ||
35 | + LOG.info("Processing text into " + targetFile.getPath()); | ||
36 | + TText ttext = preprocess(body); | ||
37 | + try { | ||
38 | + serialize(ttext, targetFile); | ||
39 | + } catch (IOException e) { | ||
40 | + LOG.error("Error serializing preprocessed text", e); | ||
41 | + throw new MultiserviceException(e.getLocalizedMessage()); | ||
42 | + } | ||
43 | + } | ||
44 | + | ||
45 | + private static void serialize(TText ttext, File targetFile) throws IOException { | ||
46 | + try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); | ||
47 | + ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { | ||
48 | + oos.writeObject(ttext); | ||
49 | + } | ||
50 | + } | ||
51 | + | ||
52 | +} |
nicolas-multiservice/src/test/java/pl/waw/ipipan/zil/summ/nicolas/multiservice/PreprocessorIT.java
0 → 100644
1 | +package pl.waw.ipipan.zil.summ.nicolas.multiservice; | ||
2 | + | ||
3 | +import org.junit.BeforeClass; | ||
4 | +import org.junit.ClassRule; | ||
5 | +import org.junit.Test; | ||
6 | +import org.junit.rules.TemporaryFolder; | ||
7 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
8 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
9 | +import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
10 | +import pl.waw.ipipan.zil.summ.nicolas.common.Utils; | ||
11 | + | ||
12 | +import java.io.File; | ||
13 | +import java.io.FileInputStream; | ||
14 | +import java.util.List; | ||
15 | + | ||
16 | +import static junit.framework.TestCase.assertEquals; | ||
17 | +import static junit.framework.TestCase.assertTrue; | ||
18 | + | ||
19 | +public class PreprocessorIT { | ||
20 | + | ||
21 | + @ClassRule | ||
22 | + public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
23 | + | ||
24 | + private static Preprocessor preprocessor; | ||
25 | + | ||
26 | + @BeforeClass | ||
27 | + public static void initPreprocessor() { | ||
28 | + preprocessor = new Preprocessor(); | ||
29 | + } | ||
30 | + | ||
31 | + @Test | ||
32 | + public void shouldProcessSampleText() throws Exception { | ||
33 | + String text = "Ala ma kota. Ala ma też psa."; | ||
34 | + TText processed = preprocessor.preprocess(text); | ||
35 | + | ||
36 | + assertSampleProcessedText(processed); | ||
37 | + } | ||
38 | + | ||
39 | + private void assertSampleProcessedText(TText processed) { | ||
40 | + assertEquals(2, processed.getParagraphsSize()); | ||
41 | + | ||
42 | + // first paragraph is empty (placeholder for text title) | ||
43 | + TParagraph firstParagraph = processed.getParagraphs().get(0); | ||
44 | + assertEquals(0, firstParagraph.getSentencesSize()); | ||
45 | + | ||
46 | + TParagraph secondParagraph = processed.getParagraphs().get(1); | ||
47 | + assertEquals(2, secondParagraph.getSentencesSize()); | ||
48 | + List<TSentence> sentences = secondParagraph.getSentences(); | ||
49 | + | ||
50 | + TSentence firstSentence = sentences.get(0); | ||
51 | + assertEquals(4, firstSentence.getTokensSize()); | ||
52 | + assertEquals("Ala", firstSentence.getTokens().get(0).getOrth()); | ||
53 | + | ||
54 | + TSentence secondSentence = sentences.get(1); | ||
55 | + assertEquals(5, secondSentence.getTokensSize()); | ||
56 | + assertEquals("Ala", secondSentence.getTokens().get(0).getOrth()); | ||
57 | + | ||
58 | + assertEquals(3, processed.getCoreferencesSize()); //Ala, pies, kot | ||
59 | + } | ||
60 | + | ||
61 | + | ||
62 | + @Test | ||
63 | + public void shouldProcessSampleTextToFile() throws Exception { | ||
64 | + String text = "Ala ma kota. Ala ma też psa."; | ||
65 | + File targetFile = TEMPORARY_FOLDER.newFile(); | ||
66 | + assertTrue(targetFile.delete()); //delete file, because preprocessor skips existing files | ||
67 | + preprocessor.preprocessToFile(text, targetFile); | ||
68 | + | ||
69 | + try (FileInputStream inputStream = new FileInputStream(targetFile)) { | ||
70 | + TText processed = Utils.loadThriftTextFromStream(inputStream); | ||
71 | + assertSampleProcessedText(processed); | ||
72 | + } | ||
73 | + } | ||
74 | +} | ||
0 | \ No newline at end of file | 75 | \ No newline at end of file |
nicolas-train/pom.xml
@@ -21,6 +21,10 @@ | @@ -21,6 +21,10 @@ | ||
21 | <groupId>pl.waw.ipipan.zil.summ</groupId> | 21 | <groupId>pl.waw.ipipan.zil.summ</groupId> |
22 | <artifactId>nicolas-lib</artifactId> | 22 | <artifactId>nicolas-lib</artifactId> |
23 | </dependency> | 23 | </dependency> |
24 | + <dependency> | ||
25 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
26 | + <artifactId>nicolas-multiservice</artifactId> | ||
27 | + </dependency> | ||
24 | 28 | ||
25 | <!-- internal --> | 29 | <!-- internal --> |
26 | <dependency> | 30 | <dependency> |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/DownloadAndPreprocessCorpus.java
@@ -4,7 +4,7 @@ import net.lingala.zip4j.core.ZipFile; | @@ -4,7 +4,7 @@ import net.lingala.zip4j.core.ZipFile; | ||
4 | import org.apache.commons.io.FileUtils; | 4 | import org.apache.commons.io.FileUtils; |
5 | import org.slf4j.Logger; | 5 | import org.slf4j.Logger; |
6 | import org.slf4j.LoggerFactory; | 6 | import org.slf4j.LoggerFactory; |
7 | -import pl.waw.ipipan.zil.summ.nicolas.train.multiservice.NLPProcess; | 7 | +import pl.waw.ipipan.zil.summ.nicolas.train.preprocess.Main; |
8 | 8 | ||
9 | import java.io.File; | 9 | import java.io.File; |
10 | import java.net.URL; | 10 | import java.net.URL; |
@@ -45,7 +45,7 @@ public class DownloadAndPreprocessCorpus { | @@ -45,7 +45,7 @@ public class DownloadAndPreprocessCorpus { | ||
45 | 45 | ||
46 | File preprocessed = new File(WORKING_DIR, "preprocessed"); | 46 | File preprocessed = new File(WORKING_DIR, "preprocessed"); |
47 | createFolder(preprocessed.getPath()); | 47 | createFolder(preprocessed.getPath()); |
48 | - NLPProcess.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); | 48 | + Main.main(new String[]{dataDir.getPath(), preprocessed.getPath()}); |
49 | } | 49 | } |
50 | 50 | ||
51 | private static File createFolder(String path) { | 51 | private static File createFolder(String path) { |
nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcess.java renamed to nicolas-train/src/main/java/pl/waw/ipipan/zil/summ/nicolas/train/preprocess/Main.java
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | 1 | +package pl.waw.ipipan.zil.summ.nicolas.train.preprocess; |
2 | 2 | ||
3 | import org.slf4j.Logger; | 3 | import org.slf4j.Logger; |
4 | import org.slf4j.LoggerFactory; | 4 | import org.slf4j.LoggerFactory; |
5 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | 5 | +import pl.waw.ipipan.zil.summ.nicolas.multiservice.Preprocessor; |
6 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; | 6 | import pl.waw.ipipan.zil.summ.pscapi.io.PSC_IO; |
7 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; | 7 | import pl.waw.ipipan.zil.summ.pscapi.xml.Text; |
8 | 8 | ||
9 | import java.io.File; | 9 | import java.io.File; |
10 | -import java.io.FileOutputStream; | ||
11 | -import java.io.IOException; | ||
12 | -import java.io.ObjectOutputStream; | ||
13 | import java.util.Arrays; | 10 | import java.util.Arrays; |
14 | -import java.util.List; | ||
15 | 11 | ||
16 | -public class NLPProcess { | 12 | +public class Main { |
17 | 13 | ||
18 | - private static final Logger LOG = LoggerFactory.getLogger(NLPProcess.class); | ||
19 | - | ||
20 | - private static final List<String> SERVICES = Arrays.asList("Concraft", "Spejd", "Nerf", "MentionDetector", | ||
21 | - "Bartek"); | ||
22 | - private static final int PORT = 20000; | ||
23 | - private static final String HOST = "multiservice.nlp.ipipan.waw.pl"; | ||
24 | - | ||
25 | - private static final MultiserviceProxy MSPROXY = new MultiserviceProxy(HOST, PORT); | 14 | + private static final Logger LOG = LoggerFactory.getLogger(Main.class); |
26 | 15 | ||
27 | private static final String CORPUS_FILE_SUFFIX = ".xml"; | 16 | private static final String CORPUS_FILE_SUFFIX = ".xml"; |
28 | private static final String OUTPUT_FILE_SUFFIX = ".thrift"; | 17 | private static final String OUTPUT_FILE_SUFFIX = ".thrift"; |
29 | 18 | ||
30 | - private NLPProcess() { | 19 | + private Main() { |
31 | } | 20 | } |
32 | 21 | ||
33 | public static void main(String[] args) { | 22 | public static void main(String[] args) { |
34 | if (args.length != 2) { | 23 | if (args.length != 2) { |
35 | - LOG.error("Wrong usage! Try " + NLPProcess.class.getSimpleName() + " dirWithCorpusFiles targetDir"); | 24 | + LOG.error("Wrong usage! Try " + Main.class.getSimpleName() + " dirWithCorpusFiles targetDir"); |
36 | return; | 25 | return; |
37 | } | 26 | } |
38 | File corpusDir = new File(args[0]); | 27 | File corpusDir = new File(args[0]); |
@@ -54,11 +43,14 @@ public class NLPProcess { | @@ -54,11 +43,14 @@ public class NLPProcess { | ||
54 | return; | 43 | return; |
55 | } | 44 | } |
56 | Arrays.sort(files); | 45 | Arrays.sort(files); |
46 | + | ||
47 | + Preprocessor processor = new Preprocessor(); | ||
48 | + | ||
57 | for (File file : files) { | 49 | for (File file : files) { |
58 | try { | 50 | try { |
59 | Text text = PSC_IO.readText(file); | 51 | Text text = PSC_IO.readText(file); |
60 | File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); | 52 | File targetFile = new File(targetDir, file.getName().replaceFirst(CORPUS_FILE_SUFFIX + "$", OUTPUT_FILE_SUFFIX)); |
61 | - annotateNLP(text, targetFile); | 53 | + processor.preprocessToFile(text.getBody(), targetFile); |
62 | ok++; | 54 | ok++; |
63 | } catch (Exception e) { | 55 | } catch (Exception e) { |
64 | err++; | 56 | err++; |
@@ -68,30 +60,4 @@ public class NLPProcess { | @@ -68,30 +60,4 @@ public class NLPProcess { | ||
68 | LOG.info("{} texts processed successfully.", ok); | 60 | LOG.info("{} texts processed successfully.", ok); |
69 | LOG.info("{} texts with errors.", err); | 61 | LOG.info("{} texts with errors.", err); |
70 | } | 62 | } |
71 | - | ||
72 | - private static void annotateNLP(Text text, File targetFile) throws Exception { | ||
73 | - annotate(text.getBody(), targetFile); | ||
74 | - } | ||
75 | - | ||
76 | - private static void annotate(String body, File targetFile) throws Exception { | ||
77 | - if (targetFile.exists()) { | ||
78 | - LOG.debug("Skipping existing file.."); | ||
79 | - return; | ||
80 | - } | ||
81 | - LOG.info("Processing text into " + targetFile.getPath()); | ||
82 | - TText ttext = MSPROXY.process(body, SERVICES); | ||
83 | - serialize(ttext, targetFile); | ||
84 | - } | ||
85 | - | ||
86 | - public static void serialize(TText ttext, File targetFile) throws IOException { | ||
87 | - try (FileOutputStream fileOutputStream = new FileOutputStream(targetFile); | ||
88 | - ObjectOutputStream oos = new ObjectOutputStream(fileOutputStream)) { | ||
89 | - oos.writeObject(ttext); | ||
90 | - } | ||
91 | - } | ||
92 | - | ||
93 | - public static TText annotate(String body) throws Exception { | ||
94 | - return MSPROXY.process(body, SERVICES); | ||
95 | - } | ||
96 | - | ||
97 | } | 63 | } |
nicolas-train/src/test/java/pl/waw/ipipan/zil/summ/nicolas/train/multiservice/NLPProcessIT.java deleted
1 | -package pl.waw.ipipan.zil.summ.nicolas.train.multiservice; | ||
2 | - | ||
3 | -import com.google.common.collect.Lists; | ||
4 | -import org.junit.ClassRule; | ||
5 | -import org.junit.Test; | ||
6 | -import org.junit.rules.TemporaryFolder; | ||
7 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
8 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
9 | - | ||
10 | -import java.io.File; | ||
11 | -import java.util.List; | ||
12 | -import java.util.stream.Collectors; | ||
13 | - | ||
14 | -import static junit.framework.TestCase.assertEquals; | ||
15 | - | ||
16 | -public class NLPProcessIT { | ||
17 | - | ||
18 | - @ClassRule | ||
19 | - public static TemporaryFolder TEMPORARY_FOLDER = new TemporaryFolder(); | ||
20 | - | ||
21 | - @Test | ||
22 | - public void shouldProcessSampleText() throws Exception { | ||
23 | - String text = "Ala ma kota. Ala ma też psa."; | ||
24 | - TText processed = NLPProcess.annotate(text); | ||
25 | - List<String> ids = processed.getParagraphs().stream().flatMap(p -> p.getSentences().stream()).map(TSentence::getId).collect(Collectors.toList()); | ||
26 | - assertEquals(Lists.newArrayList("s-2.1", "s-2.2"), ids); | ||
27 | - | ||
28 | - File targetFile = TEMPORARY_FOLDER.newFile(); | ||
29 | - NLPProcess.serialize(processed, targetFile); | ||
30 | - } | ||
31 | -} | ||
32 | \ No newline at end of file | 0 | \ No newline at end of file |
pom.xml
@@ -10,29 +10,34 @@ | @@ -10,29 +10,34 @@ | ||
10 | 10 | ||
11 | <packaging>pom</packaging> | 11 | <packaging>pom</packaging> |
12 | 12 | ||
13 | + | ||
13 | <modules> | 14 | <modules> |
14 | <module>nicolas-lib</module> | 15 | <module>nicolas-lib</module> |
15 | <module>nicolas-cli</module> | 16 | <module>nicolas-cli</module> |
16 | <module>nicolas-model</module> | 17 | <module>nicolas-model</module> |
17 | <module>nicolas-train</module> | 18 | <module>nicolas-train</module> |
18 | <module>nicolas-common</module> | 19 | <module>nicolas-common</module> |
20 | + <module>nicolas-multiservice</module> | ||
19 | </modules> | 21 | </modules> |
20 | 22 | ||
21 | <properties> | 23 | <properties> |
22 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | 24 | <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> |
25 | + <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding> | ||
23 | <java.version.build>1.8</java.version.build> | 26 | <java.version.build>1.8</java.version.build> |
24 | 27 | ||
25 | <pscapi.version>1.0</pscapi.version> | 28 | <pscapi.version>1.0</pscapi.version> |
26 | <utils.version>1.0</utils.version> | 29 | <utils.version>1.0</utils.version> |
27 | 30 | ||
28 | <commons-csv.version>1.4</commons-csv.version> | 31 | <commons-csv.version>1.4</commons-csv.version> |
29 | - <guava.version>20.0</guava.version> | 32 | + <guava.version>21.0</guava.version> |
30 | <weka-dev.version>3.9.1</weka-dev.version> | 33 | <weka-dev.version>3.9.1</weka-dev.version> |
31 | <commons-lang3.version>3.5</commons-lang3.version> | 34 | <commons-lang3.version>3.5</commons-lang3.version> |
32 | <commons-io.version>2.5</commons-io.version> | 35 | <commons-io.version>2.5</commons-io.version> |
33 | <slf4j-api.version>1.7.22</slf4j-api.version> | 36 | <slf4j-api.version>1.7.22</slf4j-api.version> |
34 | <junit.version>4.12</junit.version> | 37 | <junit.version>4.12</junit.version> |
35 | <zip4j.version>1.3.2</zip4j.version> | 38 | <zip4j.version>1.3.2</zip4j.version> |
39 | + <mockito-core.version>2.7.1</mockito-core.version> | ||
40 | + <jcommander.version>1.60</jcommander.version> | ||
36 | </properties> | 41 | </properties> |
37 | 42 | ||
38 | <prerequisites> | 43 | <prerequisites> |
@@ -76,6 +81,11 @@ | @@ -76,6 +81,11 @@ | ||
76 | <artifactId>nicolas-train</artifactId> | 81 | <artifactId>nicolas-train</artifactId> |
77 | <version>${project.version}</version> | 82 | <version>${project.version}</version> |
78 | </dependency> | 83 | </dependency> |
84 | + <dependency> | ||
85 | + <groupId>pl.waw.ipipan.zil.summ</groupId> | ||
86 | + <artifactId>nicolas-multiservice</artifactId> | ||
87 | + <version>${project.version}</version> | ||
88 | + </dependency> | ||
79 | 89 | ||
80 | <!-- internal --> | 90 | <!-- internal --> |
81 | <dependency> | 91 | <dependency> |
@@ -126,6 +136,11 @@ | @@ -126,6 +136,11 @@ | ||
126 | <artifactId>zip4j</artifactId> | 136 | <artifactId>zip4j</artifactId> |
127 | <version>${zip4j.version}</version> | 137 | <version>${zip4j.version}</version> |
128 | </dependency> | 138 | </dependency> |
139 | + <dependency> | ||
140 | + <groupId>com.beust</groupId> | ||
141 | + <artifactId>jcommander</artifactId> | ||
142 | + <version>${jcommander.version}</version> | ||
143 | + </dependency> | ||
129 | 144 | ||
130 | <!-- logging --> | 145 | <!-- logging --> |
131 | <dependency> | 146 | <dependency> |
@@ -144,20 +159,204 @@ | @@ -144,20 +159,204 @@ | ||
144 | <groupId>junit</groupId> | 159 | <groupId>junit</groupId> |
145 | <artifactId>junit</artifactId> | 160 | <artifactId>junit</artifactId> |
146 | <version>${junit.version}</version> | 161 | <version>${junit.version}</version> |
162 | + <scope>test</scope> | ||
163 | + </dependency> | ||
164 | + <dependency> | ||
165 | + <groupId>org.mockito</groupId> | ||
166 | + <artifactId>mockito-core</artifactId> | ||
167 | + <version>${mockito-core.version}</version> | ||
168 | + <scope>test</scope> | ||
147 | </dependency> | 169 | </dependency> |
148 | </dependencies> | 170 | </dependencies> |
149 | </dependencyManagement> | 171 | </dependencyManagement> |
150 | 172 | ||
173 | + | ||
151 | <build> | 174 | <build> |
175 | + <pluginManagement> | ||
176 | + <plugins> | ||
177 | + <plugin> | ||
178 | + <groupId>org.apache.maven.plugins</groupId> | ||
179 | + <artifactId>maven-jar-plugin</artifactId> | ||
180 | + <version>3.0.2</version> | ||
181 | + </plugin> | ||
182 | + <plugin> | ||
183 | + <groupId>org.apache.maven.plugins</groupId> | ||
184 | + <artifactId>maven-resources-plugin</artifactId> | ||
185 | + <version>3.0.1</version> | ||
186 | + </plugin> | ||
187 | + <plugin> | ||
188 | + <groupId>org.apache.maven.plugins</groupId> | ||
189 | + <artifactId>maven-clean-plugin</artifactId> | ||
190 | + <version>3.0.0</version> | ||
191 | + </plugin> | ||
192 | + <plugin> | ||
193 | + <groupId>org.apache.maven.plugins</groupId> | ||
194 | + <artifactId>maven-site-plugin</artifactId> | ||
195 | + <version>3.5.1</version> | ||
196 | + </plugin> | ||
197 | + <plugin> | ||
198 | + <groupId>org.apache.maven.plugins</groupId> | ||
199 | + <artifactId>maven-install-plugin</artifactId> | ||
200 | + <version>2.5.2</version> | ||
201 | + </plugin> | ||
202 | + <plugin> | ||
203 | + <groupId>org.apache.maven.plugins</groupId> | ||
204 | + <artifactId>maven-deploy-plugin</artifactId> | ||
205 | + <version>2.8.2</version> | ||
206 | + </plugin> | ||
207 | + <plugin> | ||
208 | + <groupId>org.apache.maven.plugins</groupId> | ||
209 | + <artifactId>maven-assembly-plugin</artifactId> | ||
210 | + <version>2.6</version> | ||
211 | + </plugin> | ||
212 | + <plugin> | ||
213 | + <groupId>org.apache.maven.plugins</groupId> | ||
214 | + <artifactId>maven-compiler-plugin</artifactId> | ||
215 | + <version>3.5.1</version> | ||
216 | + <configuration> | ||
217 | + <source>${java.version.build}</source> | ||
218 | + <target>${java.version.build}</target> | ||
219 | + </configuration> | ||
220 | + </plugin> | ||
221 | + <plugin> | ||
222 | + <groupId>org.apache.maven.plugins</groupId> | ||
223 | + <artifactId>maven-surefire-plugin</artifactId> | ||
224 | + <version>2.19.1</version> | ||
225 | + <configuration> | ||
226 | + <!-- Sets the VM argument line used when unit tests are run. --> | ||
227 | + <argLine>${surefireArgLine}</argLine> | ||
228 | + <!-- Skips unit tests if the value of skip.unit.tests property is true --> | ||
229 | + <skipTests>${skip.unit.tests}</skipTests> | ||
230 | + <!-- Excludes integration tests when unit tests are run. --> | ||
231 | + <excludes> | ||
232 | + <exclude>**/IT*.java</exclude> | ||
233 | + </excludes> | ||
234 | + </configuration> | ||
235 | + </plugin> | ||
236 | + <plugin> | ||
237 | + <groupId>org.apache.maven.plugins</groupId> | ||
238 | + <artifactId>maven-failsafe-plugin</artifactId> | ||
239 | + <version>2.19.1</version> | ||
240 | + <executions> | ||
241 | + <execution> | ||
242 | + <id>integration-test</id> | ||
243 | + <goals> | ||
244 | + <goal>integration-test</goal> | ||
245 | + <goal>verify</goal> | ||
246 | + </goals> | ||
247 | + <configuration> | ||
248 | + <!-- Sets the VM argument line used when integration tests are run. --> | ||
249 | + <argLine>${failsafeArgLine}</argLine> | ||
250 | + <!-- | ||
251 | + Skips integration tests if the value of skip.integration.tests property | ||
252 | + is true | ||
253 | + --> | ||
254 | + <skipTests>${skip.integration.tests}</skipTests> | ||
255 | + </configuration> | ||
256 | + </execution> | ||
257 | + <execution> | ||
258 | + <id>verify</id> | ||
259 | + <goals> | ||
260 | + <goal>verify</goal> | ||
261 | + </goals> | ||
262 | + </execution> | ||
263 | + </executions> | ||
264 | + </plugin> | ||
265 | + <plugin> | ||
266 | + <groupId>org.jacoco</groupId> | ||
267 | + <artifactId>jacoco-maven-plugin</artifactId> | ||
268 | + <version>0.7.8</version> | ||
269 | + <executions> | ||
270 | + <!-- | ||
271 | + Prepares the property pointing to the JaCoCo runtime agent which | ||
272 | + is passed as a VM argument when the Maven Surefire plugin is executed. | |
273 | + --> | ||
274 | + <execution> | ||
275 | + <id>pre-unit-test</id> | ||
276 | + <goals> | ||
277 | + <goal>prepare-agent</goal> | ||
278 | + </goals> | ||
279 | + <configuration> | ||
280 | + <!-- Sets the path to the file which contains the execution data. --> | ||
281 | + <destFile>${project.build.directory}/jacoco.exec</destFile> | ||
282 | + <!-- | ||
283 | + Sets the name of the property containing the settings | ||
284 | + for JaCoCo runtime agent. | ||
285 | + --> | ||
286 | + <propertyName>surefireArgLine</propertyName> | ||
287 | + </configuration> | ||
288 | + </execution> | ||
289 | + <!-- | ||
290 | + Ensures that the code coverage report for unit tests is created after | ||
291 | + unit tests have been run. | ||
292 | + --> | ||
293 | + <execution> | ||
294 | + <id>post-unit-test</id> | ||
295 | + <phase>test</phase> | ||
296 | + <goals> | ||
297 | + <goal>report</goal> | ||
298 | + </goals> | ||
299 | + <configuration> | ||
300 | + <!-- Sets the path to the file which contains the execution data. --> | ||
301 | + <dataFile>${project.build.directory}/jacoco.exec</dataFile> | ||
302 | + <!-- Sets the output directory for the code coverage report. --> | ||
303 | + <outputDirectory>${project.reporting.outputDirectory}/jacoco-ut</outputDirectory> | ||
304 | + </configuration> | ||
305 | + </execution> | ||
306 | + | ||
307 | + <!-- | ||
308 | + Prepares the property pointing to the JaCoCo runtime agent which | ||
309 | + is passed as a VM argument when the Maven Failsafe plugin is executed. | |
310 | + --> | ||
311 | + <execution> | ||
312 | + <id>pre-integration-test</id> | ||
313 | + <phase>pre-integration-test</phase> | ||
314 | + <goals> | ||
315 | + <goal>prepare-agent</goal> | ||
316 | + </goals> | ||
317 | + <configuration> | ||
318 | + <!-- Sets the path to the file which contains the execution data. --> | ||
319 | + <destFile>${project.build.directory}/jacoco-it.exec</destFile> | ||
320 | + <!-- | ||
321 | + Sets the name of the property containing the settings | ||
322 | + for JaCoCo runtime agent. | ||
323 | + --> | ||
324 | + <propertyName>failsafeArgLine</propertyName> | ||
325 | + </configuration> | ||
326 | + </execution> | ||
327 | + <!-- | ||
328 | + Ensures that the code coverage report for integration tests is created after | |
329 | + integration tests have been run. | ||
330 | + --> | ||
331 | + <execution> | ||
332 | + <id>post-integration-test</id> | ||
333 | + <phase>post-integration-test</phase> | ||
334 | + <goals> | ||
335 | + <goal>report</goal> | ||
336 | + </goals> | ||
337 | + <configuration> | ||
338 | + <!-- Sets the path to the file which contains the execution data. --> | ||
339 | + <dataFile>${project.build.directory}/jacoco-it.exec</dataFile> | ||
340 | + <!-- Sets the output directory for the code coverage report. --> | ||
341 | + <outputDirectory>${project.reporting.outputDirectory}/jacoco-it</outputDirectory> | ||
342 | + </configuration> | ||
343 | + </execution> | ||
344 | + </executions> | ||
345 | + </plugin> | ||
346 | + </plugins> | ||
347 | + </pluginManagement> | ||
152 | <plugins> | 348 | <plugins> |
153 | <plugin> | 349 | <plugin> |
154 | <groupId>org.apache.maven.plugins</groupId> | 350 | <groupId>org.apache.maven.plugins</groupId> |
155 | - <artifactId>maven-compiler-plugin</artifactId> | ||
156 | - <version>3.5.1</version> | ||
157 | - <configuration> | ||
158 | - <source>${java.version.build}</source> | ||
159 | - <target>${java.version.build}</target> | ||
160 | - </configuration> | 351 | + <artifactId>maven-failsafe-plugin</artifactId> |
352 | + </plugin> | ||
353 | + <plugin> | ||
354 | + <groupId>org.apache.maven.plugins</groupId> | ||
355 | + <artifactId>maven-surefire-plugin</artifactId> | ||
356 | + </plugin> | ||
357 | + <plugin> | ||
358 | + <groupId>org.jacoco</groupId> | ||
359 | + <artifactId>jacoco-maven-plugin</artifactId> | ||
161 | </plugin> | 360 | </plugin> |
162 | </plugins> | 361 | </plugins> |
163 | </build> | 362 | </build> |