Commit bd7f5abb07ff32954b95699545ac6194c0a44c7f

Authored by Mateusz Kopeć
1 parent 62ccdfdc

1.3 release

doc/compile.sh 0 → 100755
  1 +#!/bin/bash
  2 +
  3 +pdflatex manual.tex
  4 +bibtex manual.aux
  5 +pdflatex manual.tex
  6 +pdflatex manual.tex
  7 +
  8 +rm manual.aux
  9 +rm manual.bbl
  10 +rm manual.blg
  11 +rm manual.log
0 12 \ No newline at end of file
... ...
doc/manual.pdf
No preview for this file type
doc/manual.tex
... ... @@ -38,10 +38,10 @@ The current version of the program facilitates the automatic mention detection,
38 38 MentionDetector uses information provided in its input to produce mentions for coreference resolution. It merges entities provided by named entity recognition tools, shallow parsers and taggers.
39 39  
40 40 It also finds zero subjects in clauses and marks the verbs using zero subjects as mentions, using the algorithm presented in \cite{kop:14:eacl:short}, for which a model was trained using the full Polish Coreference Corpus, version 0.92 (corpus description in \cite{ogro:etal:13:ltc}). Training data had 15875 positive and 37798 negative examples; 10-fold cross validation yielded an accuracy of 86.14\% for the task of finding zero subjects. Precision of 79.8\% and recall of 71.2\% were obtained for the zero subject class of verbs.
41   -
  41 +
42 42 \textbf{Homepage:} \url{http://zil.ipipan.waw.pl/MentionDetector} \\
43 43 \textbf{Contact person:} Mateusz Kopeć [mateusz.kopec@ipipan.waw.pl] \\
44   -\textbf{Author:} Mateusz Kopeć \\
  44 +\textbf{Author:} Mateusz Kopeć \\
45 45 \textbf{License:} CC BY v.3
46 46  
47 47  
... ... @@ -49,7 +49,7 @@ It also finds zero subjects in clauses and marks the verbs using zero subjects a
49 49 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
50 50  
51 51 \section{Requirements}
52   -Java Runtime Environment (JRE) 1.7 or newer.
  52 +Java Runtime Environment (JRE) 1.8 or newer.
53 53  
54 54 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
55 55 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
... ... @@ -143,7 +143,7 @@ Zero subjects are distinguished from other mentions by having an additional feat
143 143  
144 144 The standalone jar doesn't need any installation. To run it, simply execute:\\
145 145  
146   -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts>}\\
  146 +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts>}\\
147 147  
148 148 All texts recursively found in \texttt{<dir with input texts>} are going to be annotated with mentions layer and saved in \texttt{<dir for output texts>}.\\
149 149  
... ... @@ -153,7 +153,7 @@ All texts recursively found in \texttt{<dir with input texts>} are going to be a
153 153 \section{Custom zero subject detection model}
154 154 If you want to use a custom zero subject detection model, you may try:\\
155 155  
156   -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts> <model\_path>}
  156 +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts> <model\_path>}
157 157  
158 158 To create such a model, use the \texttt{pl.waw.ipipan.zil.core.md.detection.zero.Trainer} class.
159 159  
... ...
1 1 <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
2   - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
3   - <modelVersion>4.0.0</modelVersion>
4   - <groupId>pl.waw.ipipan.zil.core</groupId>
5   - <artifactId>md</artifactId>
6   - <version>1.2-SNAPSHOT</version>
7   - <properties>
8   - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
9   - </properties>
10   - <build>
11   - <plugins>
12   - <plugin>
13   - <artifactId>maven-compiler-plugin</artifactId>
14   - <version>2.3.2</version>
15   - <configuration>
16   - <source>1.7</source>
17   - <target>1.7</target>
18   - </configuration>
19   - </plugin>
20   - <plugin>
21   - <artifactId>maven-source-plugin</artifactId>
22   - <version>2.4</version>
23   - <executions>
24   - <execution>
25   - <id>attach-sources</id>
26   - <phase>deploy</phase>
27   - <goals>
28   - <goal>jar-no-fork</goal>
29   - </goals>
30   - </execution>
31   - </executions>
32   - </plugin>
33   - <plugin>
34   - <artifactId>maven-javadoc-plugin</artifactId>
35   - <version>2.10.3</version>
36   - <executions>
37   - <execution>
38   - <id>attach-javadocs</id>
39   - <phase>deploy</phase>
40   - <goals>
41   - <goal>jar</goal>
42   - </goals>
43   - </execution>
44   - </executions>
45   - </plugin>
46   - <plugin>
47   - <!-- explicitly define maven-deploy-plugin after other to force exec
48   - order -->
49   - <artifactId>maven-deploy-plugin</artifactId>
50   - <version>2.7</version>
51   - <executions>
52   - <execution>
53   - <id>deploy</id>
54   - <phase>deploy</phase>
55   - <goals>
56   - <goal>deploy</goal>
57   - </goals>
58   - </execution>
59   - </executions>
60   - </plugin>
61   - <plugin>
62   - <groupId>org.dstovall</groupId>
63   - <artifactId>onejar-maven-plugin</artifactId>
64   - <version>1.4.4</version>
65   - <executions>
66   - <execution>
67   - <configuration>
68   - <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass>
69   - </configuration>
70   - <goals>
71   - <goal>one-jar</goal>
72   - </goals>
73   - </execution>
74   - </executions>
75   - </plugin>
76   - </plugins>
77   - </build>
78   - <dependencies>
79   - <dependency>
80   - <groupId>log4j</groupId>
81   - <artifactId>log4j</artifactId>
82   - <version>1.2.17</version>
83   - </dependency>
84   - <dependency>
85   - <groupId>pl.waw.ipipan.zil.multiservice</groupId>
86   - <artifactId>utils</artifactId>
87   - <version>1.0-SNAPSHOT</version>
88   - </dependency>
89   - <dependency>
90   - <groupId>pl.waw.ipipan.zil.nkjp</groupId>
91   - <artifactId>teiapi</artifactId>
92   - <version>1.0-SNAPSHOT</version>
93   - </dependency>
94   - <dependency>
95   - <groupId>junit</groupId>
96   - <artifactId>junit</artifactId>
97   - <version>4.11</version>
98   - </dependency>
99   - <dependency>
100   - <groupId>nz.ac.waikato.cms.weka</groupId>
101   - <artifactId>weka-stable</artifactId>
102   - <version>3.6.10</version>
103   - </dependency>
104   - </dependencies>
  2 + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  3 + <modelVersion>4.0.0</modelVersion>
105 4  
106   - <repositories>
107   - <repository>
108   - <id>zil-maven-repo</id>
109   - <name>ZIL maven repository</name>
110   - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url>
111   - </repository>
112   - </repositories>
  5 + <groupId>pl.waw.ipipan.zil.core</groupId>
  6 + <artifactId>md</artifactId>
  7 + <version>1.3</version>
113 8  
114   - <pluginRepositories>
115   - <pluginRepository>
116   - <id>onejar-maven-plugin.googlecode.com</id>
117   - <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url>
118   - </pluginRepository>
119   - </pluginRepositories>
  9 + <developers>
  10 + <developer>
  11 + <name>Mateusz Kopeć</name>
  12 + <organization>ICS PAS</organization>
  13 + <email>m.kopec@ipipan.waw.pl</email>
  14 + </developer>
  15 + </developers>
120 16  
121   - <distributionManagement>
122   - <repository>
123   - <id>deployment</id>
124   - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url>
125   - </repository>
126   - <snapshotRepository>
127   - <id>deployment</id>
128   - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url>
129   - </snapshotRepository>
130   - </distributionManagement>
  17 + <properties>
  18 + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  19 + <java.version>1.8</java.version>
  20 +
  21 + <junit.version>4.12</junit.version>
  22 + <slf4j.version>1.7.21</slf4j.version>
  23 + </properties>
  24 +
  25 + <prerequisites>
  26 + <maven>3.0.5</maven>
  27 + </prerequisites>
  28 +
  29 + <build>
  30 + <pluginManagement>
  31 + <plugins>
  32 + <plugin>
  33 + <artifactId>maven-compiler-plugin</artifactId>
  34 + <version>3.5.1</version>
  35 + <configuration>
  36 + <source>${java.version}</source>
  37 + <target>${java.version}</target>
  38 + </configuration>
  39 + </plugin>
  40 + <plugin>
  41 + <artifactId>maven-clean-plugin</artifactId>
  42 + <version>3.0.0</version>
  43 + </plugin>
  44 + <plugin>
  45 + <artifactId>maven-install-plugin</artifactId>
  46 + <version>2.5.2</version>
  47 + </plugin>
  48 + <plugin>
  49 + <artifactId>maven-jar-plugin</artifactId>
  50 + <version>3.0.2</version>
  51 + </plugin>
  52 + <plugin>
  53 + <artifactId>maven-resources-plugin</artifactId>
  54 + <version>3.0.1</version>
  55 + </plugin>
  56 + <plugin>
  57 + <artifactId>maven-site-plugin</artifactId>
  58 + <version>3.5.1</version>
  59 + </plugin>
  60 + <plugin>
  61 + <artifactId>maven-surefire-plugin</artifactId>
  62 + <version>2.19.1</version>
  63 + </plugin>
  64 +
  65 + <plugin>
  66 + <artifactId>maven-source-plugin</artifactId>
  67 + <version>3.0.1</version>
  68 + <executions>
  69 + <execution>
  70 + <id>attach-sources</id>
  71 + <phase>deploy</phase>
  72 + <goals>
  73 + <goal>jar-no-fork</goal>
  74 + </goals>
  75 + </execution>
  76 + </executions>
  77 + </plugin>
  78 + <plugin>
  79 + <artifactId>maven-javadoc-plugin</artifactId>
  80 + <version>2.10.4</version>
  81 + <executions>
  82 + <execution>
  83 + <id>attach-javadocs</id>
  84 + <phase>deploy</phase>
  85 + <goals>
  86 + <goal>jar</goal>
  87 + </goals>
  88 + </execution>
  89 + </executions>
  90 + </plugin>
  91 + <plugin>
  92 + <!-- explicitly define maven-deploy-plugin after other to force exec order -->
  93 + <artifactId>maven-deploy-plugin</artifactId>
  94 + <version>2.8.2</version>
  95 + <executions>
  96 + <execution>
  97 + <id>deploy</id>
  98 + <phase>deploy</phase>
  99 + <goals>
  100 + <goal>deploy</goal>
  101 + </goals>
  102 + </execution>
  103 + </executions>
  104 + </plugin>
  105 + <plugin>
  106 + <artifactId>maven-assembly-plugin</artifactId>
  107 + <version>2.6</version>
  108 + </plugin>
  109 + </plugins>
  110 + </pluginManagement>
  111 +
  112 + <plugins>
  113 + <plugin>
  114 + <artifactId>maven-assembly-plugin</artifactId>
  115 + <configuration>
  116 + <descriptorRefs>
  117 + <descriptorRef>jar-with-dependencies</descriptorRef>
  118 + </descriptorRefs>
  119 + <archive>
  120 + <manifest>
  121 + <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass>
  122 + </manifest>
  123 + </archive>
  124 + </configuration>
  125 + <executions>
  126 + <execution>
  127 + <id>make-assembly</id>
  128 + <phase>package</phase>
  129 + <goals>
  130 + <goal>single</goal>
  131 + </goals>
  132 + </execution>
  133 + </executions>
  134 + </plugin>
  135 + </plugins>
  136 + </build>
  137 +
  138 + <dependencies>
  139 + <!-- internal -->
  140 + <dependency>
  141 + <groupId>pl.waw.ipipan.zil.multiservice</groupId>
  142 + <artifactId>utils</artifactId>
  143 + <version>1.0</version>
  144 + </dependency>
  145 + <dependency>
  146 + <groupId>pl.waw.ipipan.zil.nkjp</groupId>
  147 + <artifactId>teiapi</artifactId>
  148 + <version>1.0</version>
  149 + </dependency>
  150 +
  151 + <!-- third party -->
  152 + <dependency>
  153 + <groupId>nz.ac.waikato.cms.weka</groupId>
  154 + <artifactId>weka-stable</artifactId>
  155 + <version>3.6.10</version>
  156 + </dependency>
  157 +
  158 + <!-- logging -->
  159 + <dependency>
  160 + <groupId>org.slf4j</groupId>
  161 + <artifactId>slf4j-api</artifactId>
  162 + <version>1.7.21</version>
  163 + </dependency>
  164 + <dependency>
  165 + <groupId>org.slf4j</groupId>
  166 + <artifactId>slf4j-simple</artifactId>
  167 + <version>1.7.21</version>
  168 + <scope>runtime</scope>
  169 + </dependency>
  170 +
  171 + <!-- test -->
  172 + <dependency>
  173 + <groupId>junit</groupId>
  174 + <artifactId>junit</artifactId>
  175 + <version>4.12</version>
  176 + <scope>test</scope>
  177 + </dependency>
  178 +
  179 + </dependencies>
  180 +
  181 + <repositories>
  182 + <repository>
  183 + <id>zil-maven-snapshot-repo</id>
  184 + <name>ZIL maven snapshot repository</name>
  185 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url>
  186 + </repository>
  187 + <repository>
  188 + <id>zil-maven-release-repo</id>
  189 + <name>ZIL maven release repository</name>
  190 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url>
  191 + </repository>
  192 + <repository>
  193 + <id>zil-maven-repo-3rdparty</id>
  194 + <name>ZIL maven repository 3rdparty</name>
  195 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/thirdparty/</url>
  196 + </repository>
  197 + </repositories>
  198 +
  199 + <distributionManagement>
  200 + <repository>
  201 + <id>deployment</id>
  202 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url>
  203 + </repository>
  204 + <snapshotRepository>
  205 + <id>deployment</id>
  206 + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url>
  207 + </snapshotRepository>
  208 + </distributionManagement>
131 209 </project>
... ...
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
1 1 package pl.waw.ipipan.zil.core.md;
2 2  
3   -import java.io.File;
4   -import java.io.FileInputStream;
5   -import java.io.IOException;
6   -import java.io.InputStream;
7   -
8   -import org.apache.log4j.Logger;
9   -
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
10 5 import pl.waw.ipipan.zil.core.md.detection.Detector;
11 6 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
12 7 import pl.waw.ipipan.zil.core.md.entities.Text;
... ... @@ -20,134 +15,128 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
20 15 import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
21 16 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
22 17  
23   -/**
24   - * @author Mateusz Kopeć
25   - *
26   - */
  18 +import java.io.File;
  19 +import java.io.FileInputStream;
  20 +import java.io.IOException;
  21 +import java.io.InputStream;
  22 +
27 23 public class Main {
28 24  
29   - private final static Logger logger = Logger.getLogger(Main.class);
30   - private final static boolean GZIP_OUTPUT = true;
31   -
32   - private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
33   -
34   - private static ZeroSubjectDetector zeroSubjectModel;
35   - static {
36   - InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
37   - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
38   - }
39   -
40   - /**
41   - * Main method for detecting mentions in corpus encoded in Tei format.
42   - *
43   - * @param args
44   - * arguments
45   - */
46   - public static void main(String[] args) {
47   -
48   - if (args.length != 2 && args.length != 3) {
49   - logger.error("Wrong usage! should be: " + Main.class.getSimpleName()
50   - + " input_dir result_dir [zero_subject_model]");
51   - return;
52   - }
53   -
54   - File inputDir = new File(args[0]);
55   - File outputDir = new File(args[1]);
56   -
57   - if (!inputDir.isDirectory()) {
58   - logger.error(inputDir + " is not a directory!");
59   - return;
60   - }
61   - if (!outputDir.isDirectory()) {
62   - logger.error(outputDir + " is not a directory!");
63   - return;
64   - }
65   - if (args.length == 3) {
66   - try {
67   - InputStream zeroSubjectDetectionModelStream;
68   - zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2]));
69   - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
70   - if (zeroSubjectModel == null)
71   - throw new IOException();
72   - } catch (IOException e) {
73   - logger.error("Unable to load model from file: " + args[2] + ": " + e);
74   - return;
75   - }
76   - }
77   -
78   - int all = 0;
79   - int errors = 0;
80   - for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
81   - all++;
82   - try {
83   - File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
84   - TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
85   - annotateTeiText(teiText);
86   - TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
87   - } catch (IOException e) {
88   - logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage());
89   - errors++;
90   - }
91   - }
92   -
93   - logger.info(all + " texts processed succesfully.");
94   - if (errors > 0)
95   - logger.info(errors + " texts not processed.");
96   - logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected.");
97   - logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected.");
98   - }
99   -
100   - /**
101   - * Find relative path of text directory in the corpus directory and create
102   - * similar directory structure in the output corpus directory.
103   - *
104   - * @param inputCorpusDir
105   - * input corpus directory
106   - * @param outputCorpusDir
107   - * output corpus directory
108   - * @param textDir
109   - * input text dir
110   - * @return target text dir
111   - * @throws IOException
112   - * when an error occurs
113   - */
114   - private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException {
115   - String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
116   - File targetDir = new File(outputCorpusDir, relativeDirPath);
117   - targetDir.mkdirs();
118   - if (!targetDir.exists() || !targetDir.isDirectory())
119   - throw new IOException("Failed to create output directory at: " + targetDir);
120   - return targetDir;
121   - }
122   -
123   - /**
124   - * Find mentions in Thrift text and update this Thrift text with mention
125   - * annotation.
126   - *
127   - * @param thriftText
128   - * text to annotate with mentions
129   - * @throws MultiserviceException
130   - * when an error occures
131   - */
132   - public static void annotateThriftText(TText thriftText) throws MultiserviceException {
133   - Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
134   - Detector.findMentionsInText(responseText, zeroSubjectModel);
135   - ThriftSaver.updateThriftText(responseText, thriftText);
136   - }
137   -
138   - /**
139   - * Find mentions in Tei text and update this Tei text with mention
140   - * annotation. This method does not save this Tei text on disk.
141   - *
142   - * @param teiText
143   - * text to annotate with mentions
144   - * @throws TEIException
145   - * when an error occurs
146   - */
147   - public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
148   - Text responseText = TeiLoader.loadTextFromTei(teiText);
149   - Detector.findMentionsInText(responseText, zeroSubjectModel);
150   - TeiSaver.updateTeiText(responseText, teiText);
151   - }
  25 + private static final Logger logger = LoggerFactory.getLogger(Main.class);
  26 +
  27 + private static final boolean GZIP_OUTPUT = true;
  28 + private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
  29 +
  30 + private static ZeroSubjectDetector zeroSubjectModel;
  31 +
  32 + static {
  33 + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
  34 + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
  35 + }
  36 +
  37 + private Main() {
  38 + }
  39 +
  40 + /**
  41 + * Main method for detecting mentions in corpus encoded in Tei format.
  42 + *
  43 + * @param args arguments
  44 + */
  45 + public static void main(String[] args) {
  46 +
  47 + if (args.length != 2 && args.length != 3) {
  48 + logger.error("Wrong usage! should be: " + Main.class.getSimpleName()
  49 + + " input_dir result_dir [zero_subject_model]");
  50 + return;
  51 + }
  52 +
  53 + File inputDir = new File(args[0]);
  54 + File outputDir = new File(args[1]);
  55 +
  56 + if (!inputDir.isDirectory()) {
  57 + logger.error(inputDir + " is not a directory!");
  58 + return;
  59 + }
  60 + if (!outputDir.isDirectory()) {
  61 + logger.error(outputDir + " is not a directory!");
  62 + return;
  63 + }
  64 + if (args.length == 3) {
  65 + try {
  66 + InputStream zeroSubjectDetectionModelStream;
  67 + zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2]));
  68 + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
  69 + } catch (IOException e) {
  70 + logger.error("Unable to load model from file: " + args[2] + ": " + e, e);
  71 + return;
  72 + }
  73 + }
  74 +
  75 + int all = 0;
  76 + int errors = 0;
  77 + for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
  78 + all++;
  79 + try {
  80 + File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
  81 + TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
  82 + annotateTeiText(teiText);
  83 + TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
  84 + } catch (IOException e) {
  85 + logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e);
  86 + errors++;
  87 + }
  88 + }
  89 +
  90 + logger.info(all + " texts processed successfully.");
  91 + if (errors > 0)
  92 + logger.info(errors + " texts not processed.");
  93 + logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected.");
  94 + logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected.");
  95 + }
  96 +
  97 + /**
  98 + * Find relative path of text directory in the corpus directory and create
  99 + * similar directory structure in the output corpus directory.
  100 + *
  101 + * @param inputCorpusDir input corpus directory
  102 + * @param outputCorpusDir output corpus directory
  103 + * @param textDir input text dir
  104 + * @return target text dir
  105 + * @throws IOException when an error occurs
  106 + */
  107 + private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException {
  108 + String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
  109 + File targetDir = new File(outputCorpusDir, relativeDirPath);
  110 + targetDir.mkdirs();
  111 + if (!targetDir.exists() || !targetDir.isDirectory())
  112 + throw new IOException("Failed to create output directory at: " + targetDir);
  113 + return targetDir;
  114 + }
  115 +
  116 + /**
  117 + * Find mentions in Thrift text and update this Thrift text with mention
  118 + * annotation.
  119 + *
  120 + * @param thriftText text to annotate with mentions
  121 + * @throws MultiserviceException when an error occurs
  122 + */
  123 + public static void annotateThriftText(TText thriftText) throws MultiserviceException {
  124 + Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
  125 + Detector.findMentionsInText(responseText, zeroSubjectModel);
  126 + ThriftSaver.updateThriftText(responseText, thriftText);
  127 + }
  128 +
  129 + /**
  130 + * Find mentions in Tei text and update this Tei text with mention
  131 + * annotation. This method does not save this Tei text on disk.
  132 + *
  133 + * @param teiText text to annotate with mentions
  134 + * @throws TEIException when an error occurs
  135 + */
  136 + public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
  137 + Text responseText = TeiLoader.loadTextFromTei(teiText);
  138 + Detector.findMentionsInText(responseText, zeroSubjectModel);
  139 + TeiSaver.updateTeiText(responseText, teiText);
  140 + }
152 141  
153 142 }
... ...
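
Note on the refactored Main: besides the command-line entry point it keeps two public helpers, annotateTeiText and annotateThriftText, so the detector can be embedded in other code. A minimal sketch of the TEI variant, mirroring the loop in main() above (the paths and class name are hypothetical, and the TeiSaver import is assumed to sit next to TeiLoader in pl.waw.ipipan.zil.core.md.io.tei):

// Sketch only, not part of this commit; exceptions are simply propagated.
import pl.waw.ipipan.zil.core.md.Main;
import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;

import java.io.File;

public class AnnotateSingleText {
    public static void main(String[] args) throws Exception {
        File teiDir = new File("input/text001");         // one NKJP-style text directory (hypothetical path)
        File targetDir = new File("output/text001");     // must already exist
        TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
        Main.annotateTeiText(teiText);                   // adds the mentions layer in place
        TeiSaver.saveTeiText(teiText, targetDir, true);  // true = gzip output, as in Main
    }
}
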
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
  3 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  4 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  5 +import pl.waw.ipipan.zil.core.md.entities.Token;
  6 +
3 7 import java.util.Collection;
4 8 import java.util.HashSet;
5 9 import java.util.List;
6 10 import java.util.Set;
7 11  
8   -import pl.waw.ipipan.zil.core.md.entities.Mention;
9   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
10   -import pl.waw.ipipan.zil.core.md.entities.Token;
11   -
12 12 public class Cleaner {
13   - public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
14   - List<Mention> mentions = sentence.getMentions();
15   - Collection<Mention> unnecessaryMentions = new HashSet<Mention>();
16   -
17   - for (int i = 0; i < mentions.size(); i++) {
18   - Mention m1 = mentions.get(i);
19   - for (int j = i + 1; j < mentions.size(); j++) {
20   - Mention m2 = mentions.get(j);
21   -
22   - Mention lessImportantMention = getLessImportantMention(m1, m2);
23   - Mention moreImportantMention = m1 == lessImportantMention ? m2
24   - : m1;
25   -
26   - // same mention borders
27   - if (m1.getSegments().equals(m2.getSegments())) {
28   - unnecessaryMentions.add(lessImportantMention);
29   - // System.out.println("Same borders: "+ m1 +", "+
30   - // m2+": "+getLessImportantMention(m1, m2)+" removed");
31   - continue;
32   - }
33   - // same mention heads
34   - if (!m1.getHeadSegments().isEmpty()
35   - && !m2.getHeadSegments().isEmpty()) {
36   - if (m1.getHeadSegments().equals(m2.getHeadSegments())) {
37   -
38   - List<Token> segments = moreImportantMention
39   - .getSegments();
40   -
41   - boolean isConj = false;
42   - for (Token seg : segments) {
43   - if (seg.getChosenInterpretation().getCtag()
44   - .equals("conj")) {
45   - isConj = true;
46   - break;
47   - }
48   - }
49   -
50   - if (!isConj) {
51   - unnecessaryMentions.add(lessImportantMention);
52   - // System.out.println("Same heads: " + m1 + ", " +
53   - // m2 + ": " + lessImportantMention
54   - // + " removed");
55   -
56   - continue;
57   - }
58   - }
59   - }
60   -
61   - // mention head equals whole other mention
62   - if (m1.getHeadSegments().isEmpty()
63   - && !m2.getHeadSegments().isEmpty()) {
64   - if (m2.getHeadSegments().equals(m1.getSegments())) {
65   - unnecessaryMentions.add(lessImportantMention);
66   - continue;
67   - // System.out.println("head is other mention: " + m1 +
68   - // ", " + m2 + ": "
69   - // + getLessImportantMention(m1, m2) + " removed");
70   - }
71   - }
72   -
73   - // the same, but other way round
74   - if (m2.getHeadSegments().isEmpty()
75   - && !m1.getHeadSegments().isEmpty()) {
76   -
77   - if (m1.getHeadSegments().equals(m2.getSegments())) {
78   - unnecessaryMentions.add(lessImportantMention);
79   - continue;
80   - // System.out.println("head is other mention: " + m1 +
81   - // ", " + m2 + ": "
82   - // + getLessImportantMention(m1, m2) + " removed");
83   - }
84   - }
85   -
86   - // nie zawieraja sie w sobie, lecz maja czesc wspolna
87   - boolean intersect = false;
88   -
89   - Set<Token> notInM1 = new HashSet<Token>(m2.getSegments());
90   - notInM1.removeAll(m1.getSegments());
91   - if (notInM1.size() < m2.getSegments().size())
92   - intersect = true;
93   -
94   - Set<Token> notInM2 = new HashSet<Token>(m1.getSegments());
95   - notInM2.removeAll(m2.getSegments());
96   - if (notInM2.size() < m1.getSegments().size())
97   - intersect = true;
98   -
99   - // if (intersect)
100   - // System.out.println(m1+","+m2);
101   -
102   - if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) {
103   - unnecessaryMentions.add(lessImportantMention);
104   - continue;
105   - // System.out.println("intersection!" + m1 + ", " + m2 +
106   - // ": "
107   - // + getLessImportantMention(m1, m2) + " removed");
108   - }
109   -
110   - }
111   - }
112   -
113   - for (Mention m : unnecessaryMentions)
114   - sentence.removeMention(m);
115   -
116   - // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]]
117   - unnecessaryMentions.clear();
118   -
119   - OUTER: for (Mention m : sentence.getMentions()) {
120   - for (Token seg : m.getSegments())
121   - if (seg.getOrth().toLowerCase().equals(seg.getOrth()))
122   - continue OUTER;
123   -
124   - //only for children of fully capitalized mentions
125   - Set<Mention> allMentions = new HashSet<Mention>();
126   - for (Token seg : m.getSegments())
127   - for (Mention m2 : seg.getMentions())
128   - if (m.getSegments().containsAll(m2.getSegments()))
129   - allMentions.add(m2);
130   -
131   - allMentions.remove(m);
132   -
133   - unnecessaryMentions.addAll(allMentions);
134   - }
135   - for (Mention m : unnecessaryMentions)
136   - sentence.removeMention(m);
137   - }
138   -
139   - private static Mention getLessImportantMention(Mention m1, Mention m2) {
140   - if (m1.getSegments().size() > m2.getSegments().size())
141   - return m2;
142   - else
143   - return m1;
144   - }
  13 + public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
  14 + List<Mention> mentions = sentence.getMentions();
  15 + Collection<Mention> unnecessaryMentions = new HashSet<>();
  16 +
  17 + for (int i = 0; i < mentions.size(); i++) {
  18 + Mention m1 = mentions.get(i);
  19 + for (int j = i + 1; j < mentions.size(); j++) {
  20 + Mention m2 = mentions.get(j);
  21 +
  22 + Mention lessImportantMention = getLessImportantMention(m1, m2);
  23 + Mention moreImportantMention = m1 == lessImportantMention ? m2
  24 + : m1;
  25 +
  26 + // same mention borders
  27 + if (m1.getSegments().equals(m2.getSegments())) {
  28 + unnecessaryMentions.add(lessImportantMention);
  29 + continue;
  30 + }
  31 + // same mention heads
  32 + if (!m1.getHeadSegments().isEmpty()
  33 + && !m2.getHeadSegments().isEmpty()) {
  34 + if (m1.getHeadSegments().equals(m2.getHeadSegments())) {
  35 +
  36 + List<Token> segments = moreImportantMention
  37 + .getSegments();
  38 +
  39 + boolean isConj = false;
  40 + for (Token seg : segments) {
  41 + if (seg.getChosenInterpretation().getCtag()
  42 + .equals("conj")) {
  43 + isConj = true;
  44 + break;
  45 + }
  46 + }
  47 +
  48 + if (!isConj) {
  49 + unnecessaryMentions.add(lessImportantMention);
  50 + continue;
  51 + }
  52 + }
  53 + }
  54 +
  55 + // mention head equals whole other mention
  56 + if (m1.getHeadSegments().isEmpty()
  57 + && !m2.getHeadSegments().isEmpty()) {
  58 + if (m2.getHeadSegments().equals(m1.getSegments())) {
  59 + unnecessaryMentions.add(lessImportantMention);
  60 + continue;
  61 + }
  62 + }
  63 +
  64 + // the same, but other way round
  65 + if (m2.getHeadSegments().isEmpty()
  66 + && !m1.getHeadSegments().isEmpty()) {
  67 +
  68 + if (m1.getHeadSegments().equals(m2.getSegments())) {
  69 + unnecessaryMentions.add(lessImportantMention);
  70 + continue;
  71 + }
  72 + }
  73 +
  74 + // they do not contain one another, but they overlap
  75 + boolean intersect = false;
  76 +
  77 + Set<Token> notInM1 = new HashSet<>(m2.getSegments());
  78 + notInM1.removeAll(m1.getSegments());
  79 + if (notInM1.size() < m2.getSegments().size())
  80 + intersect = true;
  81 +
  82 + Set<Token> notInM2 = new HashSet<>(m1.getSegments());
  83 + notInM2.removeAll(m2.getSegments());
  84 + if (notInM2.size() < m1.getSegments().size())
  85 + intersect = true;
  86 +
  87 + if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) {
  88 + unnecessaryMentions.add(lessImportantMention);
  89 + continue;
  90 + }
  91 +
  92 + }
  93 + }
  94 +
  95 + for (Mention m : unnecessaryMentions)
  96 + sentence.removeMention(m);
  97 +
  98 + // heuristic for removing things like: [[Ernest][Kwiecien]]
  99 + unnecessaryMentions.clear();
  100 +
  101 + OUTER:
  102 + for (Mention m : sentence.getMentions()) {
  103 + for (Token seg : m.getSegments())
  104 + if (seg.getOrth().toLowerCase().equals(seg.getOrth()))
  105 + continue OUTER;
  106 +
  107 + //only for children of fully capitalized mentions
  108 + Set<Mention> allMentions = new HashSet<>();
  109 + for (Token seg : m.getSegments())
  110 + for (Mention m2 : seg.getMentions())
  111 + if (m.getSegments().containsAll(m2.getSegments()))
  112 + allMentions.add(m2);
  113 +
  114 + allMentions.remove(m);
  115 +
  116 + unnecessaryMentions.addAll(allMentions);
  117 + }
  118 + for (Mention m : unnecessaryMentions)
  119 + sentence.removeMention(m);
  120 + }
  121 +
  122 + private static Mention getLessImportantMention(Mention m1, Mention m2) {
  123 + if (m1.getSegments().size() > m2.getSegments().size())
  124 + return m2;
  125 + else
  126 + return m1;
  127 + }
145 128 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
3 3 public class Constants {
4   - public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";
5   - public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";
6   - public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12";
7   - public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|"
8   - + MORPHO_PRONOUN_CTAGS;
9   - public static final String WORDS_CTAGS = "Noun|Ppron.*";
  4 + public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";
  5 + public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";
  6 + public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12";
  7 + public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|"
  8 + + MORPHO_PRONOUN_CTAGS;
  9 + public static final String WORDS_CTAGS = "Noun|Ppron.*";
  10 +
  11 + private Constants() {
  12 + }
10 13 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
1 1 package pl.waw.ipipan.zil.core.md.detection;
2 2  
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
  6 +import pl.waw.ipipan.zil.core.md.entities.*;
  7 +
3 8 import java.util.ArrayList;
4 9 import java.util.HashSet;
5 10 import java.util.List;
6 11 import java.util.Set;
7 12  
8   -import org.apache.log4j.Logger;
9   -
10   -import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
11   -import pl.waw.ipipan.zil.core.md.entities.Mention;
12   -import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
13   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
14   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
15   -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
16   -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
17   -import pl.waw.ipipan.zil.core.md.entities.Text;
18   -import pl.waw.ipipan.zil.core.md.entities.Token;
19   -
20 13 public class Detector {
21   - private static Logger logger = Logger.getLogger(Detector.class);
22   -
23   - public static void findMentionsInText(Text text,
24   - ZeroSubjectDetector zeroSubjectModel) {
25   - text.clearMentions();
26   - logger.debug("Detecting mentions in text " + text.getId());
27   - for (Paragraph p : text)
28   - for (Sentence s : p)
29   - detectMentionsInSentence(s, zeroSubjectModel);
30   - }
31   -
32   - private static void detectMentionsInSentence(Sentence sentence,
33   - ZeroSubjectDetector zeroSubjectModel) {
34   - // adding mentions
35   - addMentionsByTokenCtag(sentence);
36   - addMentionsBySyntacticWordsCtag(sentence);
37   - addMentionsByNamedEntities(sentence);
38   - addMentionsByGroups(sentence);
39   - addSpeakerMentionsInSpoken(sentence);
40   -
41   - // zero subject detection
42   - zeroSubjectModel.addZeroSubjectMentions(sentence);
43   -
44   - // removing mentions
45   - removeTo(sentence);
46   - Cleaner.cleanUnnecessarySentenceMentions(sentence);
47   -
48   - // updating mention heads
49   - updateMentionHeads(sentence);
50   - }
51   -
52   - /**
53   - * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak
54   - *
55   - * @param sentence
56   - */
57   - private static void updateMentionHeads(Sentence sentence) {
58   - for (Mention m : sentence.getMentions())
59   - if (m.getHeadSegments().isEmpty())
60   - m.addHeadSegment(m.getFirstSegment());
61   - }
62   -
63   - /**
64   - * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro""
65   - *
66   - * @param sentence
67   - */
68   - private static void removeTo(Sentence sentence) {
69   - Set<String> orths = new HashSet<String>();
70   - for (Token morph : sentence)
71   - orths.add(morph.getOrth());
72   -
73   - if (orths.contains("jeśli") || orths.contains("jeżeli")
74   - || orths.contains("skoro")) {
75   - for (Mention mention : sentence.getMentions()) {
76   - List<Token> mentSegs = mention.getSegments();
77   - if (mentSegs.size() == 1
78   - && mentSegs.get(0).getBase().equals("to")) {
79   - sentence.removeMention(mention);
80   - }
81   - }
82   - }
83   - }
84   -
85   - private static void addSpeakerMentionsInSpoken(Sentence sentence) {
86   - // heurystyka dla sp1:, sp2:, MarszałekJAkistam:
87   - if (sentence.size() > 2) {
88   - Token first = sentence.get(0);
89   - Token second = sentence.get(1);
90   - if (second.getOrth().equals(":")) {
91   - sentence.addMention(new Mention(first));
92   - }
93   - }
94   - }
95   -
96   - /**
97   - * Wyszukuję i oznaczam wszystkie NG*
98   - *
99   - * @param sentence
100   - */
101   - private static void addMentionsByGroups(Sentence sentence) {
102   - for (SyntacticGroup group : sentence.getGroups()) {
103   - if (group.getType().startsWith("NG")) {
104   - List<Token> segments = group.getTokens();
105   - List<Token> heads = group.getSemanticHeadTokens();
106   -
107   - sentence.addMention(new Mention(segments, heads));
108   - }
109   - }
110   - }
111   -
112   - /**
113   - * Wyszukuję i oznaczam wszystkie NER
114   - *
115   - * @param sentence
116   - */
117   - private static void addMentionsByNamedEntities(Sentence sentence) {
118   - for (NamedEntity ne : sentence.getNamedEntities()) {
119   -
120   - List<Token> headTokens = new ArrayList<Token>();
121   - List<Token> tokens = ne.getTokens();
122   -
123   - boolean containsNoun = false;
124   - for (Token seg : tokens) {
125   - if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) {
126   - containsNoun = true;
127   - break;
128   - }
129   - }
130   - if (!containsNoun)
131   - continue;
132   -
133   - sentence.addMention(new Mention(tokens, headTokens));
134   - }
135   - }
136   -
137   - /**
138   - * @param sentence
139   - */
140   - private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
141   - for (SyntacticWord w : sentence.getSyntacticWords())
142   - if (w.getCtag().matches(Constants.WORDS_CTAGS)) {
143   - List<Token> tokens = w.getTokens();
144   - if (tokens.size() == 1) {
145   - sentence.addMention(new Mention(tokens.get(0)));
146   - } else {
147   - List<Token> heads = new ArrayList<Token>();
148   - sentence.addMention(new Mention(tokens, heads));
149   - }
150   - }
151   - }
152   -
153   - /**
154   - * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow
155   - * skladniowych, to korzystam z niego zamiast morfoskladni
156   - *
157   - * @param sentence
158   - */
159   - private static void addMentionsByTokenCtag(Sentence sentence) {
160   - for (Token token : sentence)
161   - if (token.getCtag().matches(Constants.MORPHO_CTAGS))
162   - sentence.addMention(new Mention(token));
163   - }
  14 +
  15 + private static final Logger logger = LoggerFactory.getLogger(Detector.class);
  16 +
  17 + private Detector() {
  18 + }
  19 +
  20 + public static void findMentionsInText(Text text,
  21 + ZeroSubjectDetector zeroSubjectModel) {
  22 + text.clearMentions();
  23 + logger.debug("Detecting mentions in text " + text.getId());
  24 + for (Paragraph p : text)
  25 + for (Sentence s : p)
  26 + detectMentionsInSentence(s, zeroSubjectModel);
  27 + }
  28 +
  29 + private static void detectMentionsInSentence(Sentence sentence,
  30 + ZeroSubjectDetector zeroSubjectModel) {
  31 + // adding mentions
  32 + addMentionsByTokenCtag(sentence);
  33 + addMentionsBySyntacticWordsCtag(sentence);
  34 + addMentionsByNamedEntities(sentence);
  35 + addMentionsByGroups(sentence);
  36 + addSpeakerMentionsInSpoken(sentence);
  37 +
  38 + // zero subject detection
  39 + zeroSubjectModel.addZeroSubjectMentions(sentence);
  40 +
  41 + // removing mentions
  42 + removeTo(sentence);
  43 + Cleaner.cleanUnnecessarySentenceMentions(sentence);
  44 +
  45 + // updating mention heads
  46 + updateMentionHeads(sentence);
  47 + }
  48 +
  49 + /**
  50 + * heuristic: set the first segment as the head when the mention has no head
  51 + *
  52 + * @param sentence
  53 + */
  54 + private static void updateMentionHeads(Sentence sentence) {
  55 + for (Mention m : sentence.getMentions())
  56 + if (m.getHeadSegments().isEmpty())
  57 + m.addHeadSegment(m.getFirstSegment());
  58 + }
  59 +
  60 + /**
  61 + * heuristic for "to" in a sentence containing "jeśli"/"jeżeli"/"skoro"
  62 + *
  63 + * @param sentence
  64 + */
  65 + private static void removeTo(Sentence sentence) {
  66 + Set<String> orths = new HashSet<>();
  67 + for (Token morph : sentence)
  68 + orths.add(morph.getOrth());
  69 +
  70 + if (orths.contains("jeśli") || orths.contains("jeżeli")
  71 + || orths.contains("skoro")) {
  72 + for (Mention mention : sentence.getMentions()) {
  73 + List<Token> mentSegs = mention.getSegments();
  74 + if (mentSegs.size() == 1
  75 + && "to".equals(mentSegs.get(0).getBase())) {
  76 + sentence.removeMention(mention);
  77 + }
  78 + }
  79 + }
  80 + }
  81 +
  82 + private static void addSpeakerMentionsInSpoken(Sentence sentence) {
  83 + // heuristic for speaker labels such as sp1:, sp2:, MarszałekJAkistam:
  84 + if (sentence.size() > 2) {
  85 + Token first = sentence.get(0);
  86 + Token second = sentence.get(1);
  87 + if (":".equals(second.getOrth())) {
  88 + sentence.addMention(new Mention(first));
  89 + }
  90 + }
  91 + }
  92 +
  93 + /**
  94 + * Finds and marks all NG* syntactic groups
  95 + *
  96 + * @param sentence
  97 + */
  98 + private static void addMentionsByGroups(Sentence sentence) {
  99 + for (SyntacticGroup group : sentence.getGroups()) {
  100 + if (group.getType().startsWith("NG")) {
  101 + List<Token> segments = group.getTokens();
  102 + List<Token> heads = group.getSemanticHeadTokens();
  103 +
  104 + sentence.addMention(new Mention(segments, heads));
  105 + }
  106 + }
  107 + }
  108 +
  109 + /**
  110 + * Finds and marks all named entities (NER)
  111 + *
  112 + * @param sentence
  113 + */
  114 + private static void addMentionsByNamedEntities(Sentence sentence) {
  115 + for (NamedEntity ne : sentence.getNamedEntities()) {
  116 +
  117 + List<Token> headTokens = new ArrayList<>();
  118 + List<Token> tokens = ne.getTokens();
  119 +
  120 + boolean containsNoun = false;
  121 + for (Token seg : tokens) {
  122 + if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) {
  123 + containsNoun = true;
  124 + break;
  125 + }
  126 + }
  127 + if (!containsNoun)
  128 + continue;
  129 +
  130 + sentence.addMention(new Mention(tokens, headTokens));
  131 + }
  132 + }
  133 +
  134 + private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
  135 + for (SyntacticWord w : sentence.getSyntacticWords())
  136 + if (w.getCtag().matches(Constants.WORDS_CTAGS)) {
  137 + List<Token> tokens = w.getTokens();
  138 + if (tokens.size() == 1) {
  139 + sentence.addMention(new Mention(tokens.get(0)));
  140 + } else {
  141 + List<Token> heads = new ArrayList<>();
  142 + sentence.addMention(new Mention(tokens, heads));
  143 + }
  144 + }
  145 + }
  146 +
  147 + /**
  148 + * Finds all parts of speech of interest; if the syntactic-word level is
  149 + * available, it is used instead of the morphosyntactic one
  150 + *
  151 + * @param sentence
  152 + */
  153 + private static void addMentionsByTokenCtag(Sentence sentence) {
  154 + for (Token token : sentence)
  155 + if (token.getCtag().matches(Constants.MORPHO_CTAGS))
  156 + sentence.addMention(new Mention(token));
  157 + }
164 158 }
... ...
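
The rewritten Detector is still driven entirely through the static findMentionsInText entry point. A short hedged sketch of the wiring, assuming the default model resource bundled with the jar (how the Text instance is obtained, e.g. via TeiLoader.loadTextFromTei, is left out; the class name is a placeholder):

// Sketch only, not part of this commit.
import pl.waw.ipipan.zil.core.md.detection.Detector;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.Text;

import java.io.InputStream;

public class DetectMentions {
    public static void annotate(Text text) {
        // default model shipped on the classpath; a FileInputStream over a custom model works as well
        InputStream modelStream = DetectMentions.class.getResourceAsStream("/zero_subject_model.bin");
        ZeroSubjectDetector zeroSubjects = new ZeroSubjectDetector(modelStream);
        Detector.findMentionsInText(text, zeroSubjects); // clears existing mentions, then re-detects
    }
}
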
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.util.ArrayList;
4   -import java.util.Arrays;
5   -import java.util.HashMap;
6   -import java.util.HashSet;
7   -import java.util.Iterator;
8   -import java.util.LinkedList;
9   -import java.util.List;
10   -import java.util.Map;
11   -import java.util.Set;
12   -
13   -import pl.waw.ipipan.zil.core.md.entities.Mention;
14   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
15   -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
16   -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
17   -import pl.waw.ipipan.zil.core.md.entities.Token;
  3 +import pl.waw.ipipan.zil.core.md.entities.*;
18 4 import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
19 5 import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;
20 6  
  7 +import java.util.*;
  8 +
21 9 public class FeatureGeneration {
22 10 final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
23 11 "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.io.File;
4   -import java.util.ArrayList;
5   -import java.util.HashSet;
6   -import java.util.List;
7   -import java.util.Map.Entry;
8   -import java.util.Set;
9   -import java.util.TreeMap;
10   -import java.util.TreeSet;
11   -
12   -import org.apache.log4j.Logger;
13   -
14   -import pl.waw.ipipan.zil.core.md.entities.Mention;
15   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
16   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
17   -import pl.waw.ipipan.zil.core.md.entities.Text;
18   -import pl.waw.ipipan.zil.core.md.entities.Token;
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
19 6 import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
20 7 import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
21 8 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
... ... @@ -25,154 +12,161 @@ import weka.core.FastVector;
25 12 import weka.core.Instance;
26 13 import weka.core.Instances;
27 14  
  15 +import java.io.File;
  16 +import java.util.*;
  17 +import java.util.Map.Entry;
  18 +
28 19 public class InstanceCreator {
29 20  
30   - final private static Logger logger = Logger.getLogger(InstanceCreator.class);
31   - final private static TEI_IO teiIO = TEI_IO.getInstance();
32   -
33   - public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
34   - int allTexts = 0;
35   - int exceptions = 0;
36   - int allSentences = 0;
37   -
38   - List<TreeMap<String, Object>> examples = new ArrayList<>();
39   - for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
40   - try {
41   - allTexts++;
42   - logger.info("Processing text " + textDir);
43   - TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
44   - Text text = TeiLoader.loadTextFromTei(ct);
45   -
46   - for (Paragraph p : text)
47   - for (Sentence s : p) {
48   - allSentences++;
49   - loadExamplesFromSentence(quasiVerbs, examples, s);
50   - }
51   -
52   - } catch (Exception e) {
53   - logger.error(e.getLocalizedMessage());
54   - exceptions++;
55   - }
56   - }
57   -
58   - logger.info(allTexts + " texts found.");
59   - if (exceptions != 0)
60   - logger.error(exceptions + " texts with exceptions.");
61   - logger.info(allSentences + " sentences found.");
62   -
63   - return examples;
64   - }
65   -
66   - public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
67   - Sentence s) {
68   -
69   - // collect positive examples
70   - Set<Token> positive = new HashSet<>();
71   - for (Mention m : s.getMentions()) {
72   - if (FeatureGeneration.isVerb(m)) {
73   - positive.addAll(m.getSegments());
74   - }
75   - }
76   -
77   - for (Token m : s) {
78   - if (!FeatureGeneration.isVerb(m))
79   - continue;
80   -
81   - TreeMap<String, Object> features = new TreeMap<>();
82   - if (positive.contains(m)) {
83   - features.put("class", Boolean.valueOf(true));
84   - } else {
85   - features.put("class", Boolean.valueOf(false));
86   - }
87   -
88   - FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
89   - examples.add(features);
90   - }
91   - }
92   -
93   - public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
94   -
95   - TreeSet<String> booleanAttsOccurred = new TreeSet<>();
96   - TreeSet<String> doubleAttsOccurred = new TreeSet<>();
97   - TreeMap<String, Set<String>> att2values = new TreeMap<>();
98   - for (TreeMap<String, Object> example : examples) {
99   - for (Entry<String, Object> e : example.entrySet()) {
100   - String key = e.getKey();
101   - Object val = e.getValue();
102   - if (val instanceof Integer || val instanceof Double) {
103   - doubleAttsOccurred.add(key);
104   - continue;
105   - }
106   - if (val instanceof Boolean) {
107   - booleanAttsOccurred.add(key);
108   - continue;
109   - }
110   - if (!att2values.containsKey(key))
111   - att2values.put(key, new HashSet<String>());
112   - att2values.get(key).add(val.toString());
113   - }
114   - }
115   -
116   - List<Attribute> atts = new ArrayList<>();
117   -
118   - // double attributes
119   - for (String attName : doubleAttsOccurred) {
120   - Attribute att = new Attribute(attName);
121   - atts.add(att);
122   - }
123   -
124   - // boolean attributes (treated as nominal)
125   - FastVector values = new FastVector(2);
126   - values.addElement("false");
127   - values.addElement("true");
128   - for (String attName : booleanAttsOccurred) {
129   - Attribute att = new Attribute(attName, values);
130   - atts.add(att);
131   - }
132   -
133   - // nominal attributes
134   - for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
135   - FastVector vals = new FastVector(attVals.getValue().size());
136   - for (String val : attVals.getValue())
137   - vals.addElement(val);
138   - Attribute att = new Attribute(attVals.getKey(), vals);
139   - atts.add(att);
140   - }
141   -
142   - FastVector fvWekaAttributes = new FastVector(atts.size());
143   - for (Attribute attr : atts) {
144   - fvWekaAttributes.addElement(attr);
145   - }
146   -
147   - Instances data = new Instances("Zero", fvWekaAttributes, 10);
148   - data.setClass(data.attribute(classFeatureName));
149   - return data;
150   - }
151   -
152   - public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
153   - for (TreeMap<String, Object> example : examples) {
154   - Instance instance = new Instance(instances.numAttributes());
155   -
156   - for (Entry<String, Object> e : example.entrySet()) {
157   - Object val = e.getValue();
158   - String name = e.getKey();
159   - if (val instanceof Integer) {
160   - instance.setValue(instances.attribute(name), (int) val);
161   - } else if (val instanceof Boolean) {
162   - instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
163   - } else {
164   - int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
165   - if (indexOfValue == -1) {
166   - logger.debug("Unkown value: " + val.toString() + " of feature: " + name
167   - + ". Marking as missing value.");
168   - instance.setMissing(instances.attribute(name));
169   - } else
170   - instance.setValue(instances.attribute(name), indexOfValue);
171   - }
172   - }
173   -
174   - instance.setDataset(instances);
175   - instances.add(instance);
176   - }
177   - }
  21 + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
  22 + private static final TEI_IO teiIO = TEI_IO.getInstance();
  23 +
  24 + private InstanceCreator() {
  25 + }
  26 +
  27 + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
  28 + int allTexts = 0;
  29 + int exceptions = 0;
  30 + int allSentences = 0;
  31 +
  32 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  33 + for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
  34 + try {
  35 + allTexts++;
  36 + logger.info("Processing text " + textDir);
  37 + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
  38 + Text text = TeiLoader.loadTextFromTei(ct);
  39 +
  40 + for (Paragraph p : text)
  41 + for (Sentence s : p) {
  42 + allSentences++;
  43 + loadExamplesFromSentence(quasiVerbs, examples, s);
  44 + }
  45 +
  46 + } catch (Exception e) {
  47 + logger.error(e.getLocalizedMessage());
  48 + exceptions++;
  49 + }
  50 + }
  51 +
  52 + logger.info(allTexts + " texts found.");
  53 + if (exceptions != 0)
  54 + logger.error(exceptions + " texts with exceptions.");
  55 + logger.info(allSentences + " sentences found.");
  56 +
  57 + return examples;
  58 + }
  59 +
  60 + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
  61 + Sentence s) {
  62 +
  63 + // collect positive examples
  64 + Set<Token> positive = new HashSet<>();
  65 + for (Mention m : s.getMentions()) {
  66 + if (FeatureGeneration.isVerb(m)) {
  67 + positive.addAll(m.getSegments());
  68 + }
  69 + }
  70 +
  71 + for (Token m : s) {
  72 + if (!FeatureGeneration.isVerb(m))
  73 + continue;
  74 +
  75 + TreeMap<String, Object> features = new TreeMap<>();
  76 + if (positive.contains(m)) {
  77 + features.put("class", Boolean.valueOf(true));
  78 + } else {
  79 + features.put("class", Boolean.valueOf(false));
  80 + }
  81 +
  82 + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
  83 + examples.add(features);
  84 + }
  85 + }
  86 +
  87 + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
  88 +
  89 + TreeSet<String> booleanAttsOccurred = new TreeSet<>();
  90 + TreeSet<String> doubleAttsOccurred = new TreeSet<>();
  91 + TreeMap<String, Set<String>> att2values = new TreeMap<>();
  92 + for (TreeMap<String, Object> example : examples) {
  93 + for (Entry<String, Object> e : example.entrySet()) {
  94 + String key = e.getKey();
  95 + Object val = e.getValue();
  96 + if (val instanceof Integer || val instanceof Double) {
  97 + doubleAttsOccurred.add(key);
  98 + continue;
  99 + }
  100 + if (val instanceof Boolean) {
  101 + booleanAttsOccurred.add(key);
  102 + continue;
  103 + }
  104 + if (!att2values.containsKey(key))
  105 + att2values.put(key, new HashSet<>());
  106 + att2values.get(key).add(val.toString());
  107 + }
  108 + }
  109 +
  110 + List<Attribute> atts = new ArrayList<>();
  111 +
  112 + // double attributes
  113 + for (String attName : doubleAttsOccurred) {
  114 + Attribute att = new Attribute(attName);
  115 + atts.add(att);
  116 + }
  117 +
  118 + // boolean attributes (treated as nominal)
  119 + FastVector values = new FastVector(2);
  120 + values.addElement("false");
  121 + values.addElement("true");
  122 + for (String attName : booleanAttsOccurred) {
  123 + Attribute att = new Attribute(attName, values);
  124 + atts.add(att);
  125 + }
  126 +
  127 + // nominal attributes
  128 + for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
  129 + FastVector vals = new FastVector(attVals.getValue().size());
  130 + for (String val : attVals.getValue())
  131 + vals.addElement(val);
  132 + Attribute att = new Attribute(attVals.getKey(), vals);
  133 + atts.add(att);
  134 + }
  135 +
  136 + FastVector fvWekaAttributes = new FastVector(atts.size());
  137 + for (Attribute attr : atts) {
  138 + fvWekaAttributes.addElement(attr);
  139 + }
  140 +
  141 + Instances data = new Instances("Zero", fvWekaAttributes, 10);
  142 + data.setClass(data.attribute(classFeatureName));
  143 + return data;
  144 + }
  145 +
  146 + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
  147 + for (TreeMap<String, Object> example : examples) {
  148 + Instance instance = new Instance(instances.numAttributes());
  149 +
  150 + for (Entry<String, Object> e : example.entrySet()) {
  151 + Object val = e.getValue();
  152 + String name = e.getKey();
  153 + if (val instanceof Integer) {
  154 + instance.setValue(instances.attribute(name), (int) val);
  155 + } else if (val instanceof Boolean) {
  156 + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
  157 + } else {
  158 + int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
  159 + if (indexOfValue == -1) {
  160 + logger.debug("Unkown value: " + val.toString() + " of feature: " + name
  161 + + ". Marking as missing value.");
  162 + instance.setMissing(instances.attribute(name));
  163 + } else
  164 + instance.setValue(instances.attribute(name), indexOfValue);
  165 + }
  166 + }
  167 +
  168 + instance.setDataset(instances);
  169 + instances.add(instance);
  170 + }
  171 + }
178 172 }
... ...
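
A minimal usage sketch of the InstanceCreator API above (illustrative only, not part of the commit): the hand-built feature map stands in for what FeatureGeneration.generateFeatures would produce; only the "class" label and the createInstances/fillInstances calls reflect the actual API.

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

import pl.waw.ipipan.zil.core.md.detection.zero.InstanceCreator;
import weka.core.Instances;

public class InstanceCreatorSketch {
    public static void main(String[] args) {
        // One labelled example: a verb marked as having a zero subject,
        // plus one hypothetical nominal feature.
        TreeMap<String, Object> example = new TreeMap<>();
        example.put("class", Boolean.TRUE);
        example.put("someNominalFeature", "value");

        List<TreeMap<String, Object>> examples = new ArrayList<>();
        examples.add(example);

        // Build the attribute layout from the examples, then fill in the data rows.
        Instances instances = InstanceCreator.createInstances(examples, "class");
        InstanceCreator.fillInstances(examples, instances);
        System.out.println(instances);
    }
}
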
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.io.Serializable;
4   -import java.util.List;
5   -import java.util.Set;
6   -import java.util.TreeMap;
7   -
8   -import org.apache.log4j.Logger;
9   -
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
10 5 import pl.waw.ipipan.zil.core.md.entities.Sentence;
11 6 import weka.classifiers.Classifier;
12 7 import weka.core.Instance;
13 8 import weka.core.Instances;
14 9  
  10 +import java.io.Serializable;
  11 +import java.util.List;
  12 +import java.util.Set;
  13 +import java.util.TreeMap;
  14 +
15 15 public class Model implements Serializable {
16 16  
17   - private static final long serialVersionUID = 3351727361273283076L;
18   - private static final Logger logger = Logger.getLogger(Model.class);
19   -
20   - private Classifier classifier;
21   - private Set<String> quasiVerbs;
22   - private Instances instances;
23   -
24   - public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
25   - this.classifier = classifier;
26   - this.instances = instances;
27   - this.quasiVerbs = quasiVerbs;
28   - }
29   -
30   - public boolean isZeroSubject(Instance instance, Sentence sentence) {
31   - try {
32   - double response = this.classifier.classifyInstance(instance);
33   - return response > 0;
34   - } catch (Exception e) {
35   - logger.error("Error classyfing verb in sentence: " + sentence);
36   - return false;
37   - }
38   - }
39   -
40   - public Instances getInstances(List<TreeMap<String, Object>> examples) {
41   - Instances instances = new Instances(this.instances);
42   - InstanceCreator.fillInstances(examples, instances);
43   - return instances;
44   - }
45   -
46   - public Set<String> getQuasiVerbs() {
47   - return quasiVerbs;
48   - }
  17 + private static final long serialVersionUID = 3351727361273283076L;
  18 + private static final Logger logger = LoggerFactory.getLogger(Model.class);
  19 +
  20 + private Classifier classifier;
  21 + private Set<String> quasiVerbs;
  22 + private Instances instances;
  23 +
  24 + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
  25 + this.classifier = classifier;
  26 + this.instances = instances;
  27 + this.quasiVerbs = quasiVerbs;
  28 + }
  29 +
  30 + public boolean isZeroSubject(Instance instance, Sentence sentence) {
  31 + try {
  32 + double response = this.classifier.classifyInstance(instance);
  33 + return response > 0;
  34 + } catch (Exception e) {
  35 + logger.error("Error classyfing verb in sentence: " + sentence, e);
  36 + return false;
  37 + }
  38 + }
  39 +
  40 + public Instances getInstances(List<TreeMap<String, Object>> examples) {
  41 + Instances instances = new Instances(this.instances);
  42 + InstanceCreator.fillInstances(examples, instances);
  43 + return instances;
  44 + }
  45 +
  46 + public Set<String> getQuasiVerbs() {
  47 + return quasiVerbs;
  48 + }
49 49 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.io.InputStream;
4   -
5 3 import weka.core.SerializationHelper;
6 4  
  5 +import java.io.InputStream;
  6 +
7 7 public class Serializer {
8 8  
9 9 public static void saveModel(Model m, String targetModelFilePath) throws Exception {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.io.BufferedReader;
4   -import java.io.File;
5   -import java.io.IOException;
6   -import java.io.InputStream;
7   -import java.io.InputStreamReader;
8   -import java.util.HashSet;
9   -import java.util.List;
10   -import java.util.Random;
11   -import java.util.Set;
12   -import java.util.TreeMap;
13   -
14   -import org.apache.log4j.Logger;
15   -
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
16 5 import weka.classifiers.Evaluation;
17 6 import weka.classifiers.rules.JRip;
18 7 import weka.classifiers.rules.JRip.RipperRule;
... ... @@ -20,104 +9,111 @@ import weka.core.Attribute;
20 9 import weka.core.Instance;
21 10 import weka.core.Instances;
22 11  
  12 +import java.io.*;
  13 +import java.util.*;
  14 +
23 15 public class Trainer {
24 16  
25   - final private static Logger logger = Logger.getLogger(Trainer.class);
26   -
27   - private static final boolean DO_CV = false;
28   - private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
29   -
30   - public static void main(String[] args) {
31   -
32   - if (args.length != 2) {
33   - logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
34   - + " trainDir targetModelFile");
35   - return;
36   - }
37   -
38   - File dataDir = new File(args[0]);
39   - String targetModelFilePath = args[1];
40   -
41   - if (!dataDir.isDirectory()) {
42   - logger.error(dataDir + " is not a directory!");
43   - return;
44   - }
45   -
46   - Set<String> quasiVerbs = loadQuasiVerbs();
47   -
48   - List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs);
49   - Instances instances = InstanceCreator.createInstances(examples, "class");
50   - InstanceCreator.fillInstances(examples, instances);
51   -
52   - printStats(instances);
53   -
54   - try {
55   - JRip model = new JRip();
56   -
57   - if (DO_CV) {
58   - logger.info("Crossvalidation...");
59   - Evaluation eval = new Evaluation(instances);
60   - eval.crossValidateModel(model, instances, 10, new Random(1));
61   - logger.info(eval.toSummaryString());
62   - logger.info(eval.toMatrixString());
63   - logger.info(eval.toClassDetailsString());
64   - }
65   -
66   - logger.info("Building final classifier...");
67   - model = new JRip();
68   - model.buildClassifier(instances);
69   - logger.info(model.getRuleset().size() + " rules generated.");
70   - for (int i = 0; i < model.getRuleset().size(); i++) {
71   - RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
72   - logger.info("\t" + v.toString(instances.classAttribute()));
73   - }
74   -
75   - instances.delete();
76   - logger.info("Features stats:");
77   - for (int i = 0; i < instances.numAttributes(); i++) {
78   - Attribute att = instances.attribute(i);
79   - logger.info(i + ".\t" + att.toString());
80   - }
81   -
82   - logger.info("Saving classifier...");
83   - Model m = new Model(model, instances, quasiVerbs);
84   - Serializer.saveModel(m, targetModelFilePath);
85   - logger.info("Done.");
86   -
87   - } catch (Exception e) {
88   - logger.error("Error: " + e);
89   - }
90   - }
91   -
92   - private static Set<String> loadQuasiVerbs() {
93   - Set<String> quasiVerbs = new HashSet<>();
94   - InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
95   - try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
96   - String line = null;
97   - while ((line = br.readLine()) != null) {
98   - quasiVerbs.add(line.trim());
99   - }
100   - } catch (IOException e) {
101   - logger.error(e.getLocalizedMessage());
102   - }
103   - return quasiVerbs;
104   - }
105   -
106   - private static void printStats(Instances instances) {
107   - int positive = 0;
108   - int negative = 0;
109   - for (int i = 0; i < instances.numInstances(); i++) {
110   - Instance inst = instances.instance(i);
111   - if (inst.classValue() > 0)
112   - negative++;
113   - else
114   - positive++;
115   - }
116   - logger.info(positive + " positive examples");
117   - logger.info(negative + " negative examples");
118   - logger.info((positive + negative) + " examples total");
119   - logger.info((instances.numAttributes() - 1) + " attributes");
120   - logger.info(instances.toSummaryString());
121   - }
  17 + private static final Logger logger = LoggerFactory.getLogger(Trainer.class);
  18 +
  19 + private static final boolean DO_CV = false;
  20 + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
  21 +
  22 + private Trainer() {
  23 + }
  24 +
  25 + public static void main(String[] args) {
  26 +
  27 + if (args.length != 2) {
  28 + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
  29 + + " trainDir targetModelFile");
  30 + return;
  31 + }
  32 +
  33 + File dataDir = new File(args[0]);
  34 + String targetModelFilePath = args[1];
  35 +
  36 + if (!dataDir.isDirectory()) {
  37 + logger.error(dataDir + " is not a directory!");
  38 + return;
  39 + }
  40 +
  41 + Set<String> quasiVerbs = loadQuasiVerbs();
  42 +
  43 + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs);
  44 + Instances instances = InstanceCreator.createInstances(examples, "class");
  45 + InstanceCreator.fillInstances(examples, instances);
  46 +
  47 + printStats(instances);
  48 +
  49 + try {
  50 + JRip model;
  51 +
  52 + if (DO_CV) {
  53 + logger.info("Crossvalidation...");
  54 + model = new JRip();
  55 + Evaluation eval = new Evaluation(instances);
  56 + eval.crossValidateModel(model, instances, 10, new Random(1));
  57 + logger.info(eval.toSummaryString());
  58 + logger.info(eval.toMatrixString());
  59 + logger.info(eval.toClassDetailsString());
  60 + }
  61 +
  62 + logger.info("Building final classifier...");
  63 + model = new JRip();
  64 + model.buildClassifier(instances);
  65 + logger.info(model.getRuleset().size() + " rules generated.");
  66 + for (int i = 0; i < model.getRuleset().size(); i++) {
  67 + RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
  68 + logger.info("\t" + v.toString(instances.classAttribute()));
  69 + }
  70 +
  71 + instances.delete();
  72 + logger.info("Features stats:");
  73 + for (int i = 0; i < instances.numAttributes(); i++) {
  74 + Attribute att = instances.attribute(i);
  75 + logger.info(i + ".\t" + att.toString());
  76 + }
  77 +
  78 + logger.info("Saving classifier...");
  79 + Model m = new Model(model, instances, quasiVerbs);
  80 + Serializer.saveModel(m, targetModelFilePath);
  81 + logger.info("Done.");
  82 +
  83 + } catch (Exception e) {
  84 + logger.error("Error: " + e);
  85 + }
  86 + }
  87 +
  88 + private static Set<String> loadQuasiVerbs() {
  89 + Set<String> quasiVerbs = new HashSet<>();
  90 + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
  91 + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
  92 + String line;
  93 + while ((line = br.readLine()) != null) {
  94 + quasiVerbs.add(line.trim());
  95 + }
  96 + } catch (IOException e) {
  97 + logger.error(e.getLocalizedMessage(), e);
  98 + }
  99 + return quasiVerbs;
  100 + }
  101 +
  102 + private static void printStats(Instances instances) {
  103 + int positive = 0;
  104 + int negative = 0;
  105 + for (int i = 0; i < instances.numInstances(); i++) {
  106 + Instance inst = instances.instance(i);
  107 + if (inst.classValue() > 0)
  108 + positive++;
  109 + else
  110 + negative++;
  111 + }
  112 + logger.info(positive + " positive examples");
  113 + logger.info(negative + " negative examples");
  114 + logger.info((positive + negative) + " examples total");
  115 + logger.info((instances.numAttributes() - 1) + " attributes");
  116 + logger.info(instances.toSummaryString());
  117 + }
122 118  
123 119 }
... ...
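
A minimal sketch of invoking the trainer programmatically instead of from the command line (illustrative only, not part of the commit): the paths are placeholders and must point to a directory of training texts readable by InstanceCreator.loadExamples and to the desired output model file.

import pl.waw.ipipan.zil.core.md.detection.zero.Trainer;

public class TrainZeroSubjectModel {
    public static void main(String[] args) {
        // Trainer.main expects exactly two arguments:
        // a training data directory and a target model file path.
        Trainer.main(new String[] { "data/train", "target/zero-subject.model" });
    }
}
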
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
1 1 package pl.waw.ipipan.zil.core.md.detection.zero;
2 2  
3   -import java.io.File;
4   -import java.io.InputStream;
5   -import java.util.ArrayList;
6   -import java.util.HashSet;
7   -import java.util.List;
8   -import java.util.Set;
9   -import java.util.TreeMap;
10   -
11   -import org.apache.log4j.Logger;
12   -
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
13 5 import pl.waw.ipipan.zil.core.md.entities.Mention;
14 6 import pl.waw.ipipan.zil.core.md.entities.Sentence;
15 7 import pl.waw.ipipan.zil.core.md.entities.Token;
16 8 import weka.core.Instances;
17 9  
  10 +import java.io.File;
  11 +import java.io.InputStream;
  12 +import java.util.*;
  13 +
18 14 public class ZeroSubjectDetector {
19   - final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class);
20 15  
21   - private Model model;
22   - private Set<String> quasiVerbs = new HashSet<>();
  16 + final private static Logger logger = LoggerFactory.getLogger(ZeroSubjectDetector.class);
  17 +
  18 + private Model model;
  19 + private Set<String> quasiVerbs = new HashSet<>();
23 20  
24   - public static int verbsWithoutSubject = 0;
25   - public static int verbsWithSubject = 0;
  21 + public static int verbsWithoutSubject = 0;
  22 + public static int verbsWithSubject = 0;
26 23  
27   - public void addZeroSubjectMentions(Sentence sentence) {
28   - List<TreeMap<String, Object>> examples = new ArrayList<>();
29   - InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
30   - if (examples.isEmpty())
31   - return;
  24 + public void addZeroSubjectMentions(Sentence sentence) {
  25 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  26 + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
  27 + if (examples.isEmpty())
  28 + return;
32 29  
33   - Instances instances = model.getInstances(examples);
  30 + Instances instances = model.getInstances(examples);
34 31  
35   - // label instances
36   - List<Boolean> areZeros = new ArrayList<>();
37   - for (int i = 0; i < instances.numInstances(); i++) {
38   - boolean isZero = model.isZeroSubject(instances.instance(i), sentence);
39   - areZeros.add(isZero);
40   - if (isZero)
41   - verbsWithoutSubject++;
42   - else
43   - verbsWithSubject++;
44   - }
  32 + // label instances
  33 + List<Boolean> areZeros = new ArrayList<>();
  34 + for (int i = 0; i < instances.numInstances(); i++) {
  35 + boolean isZero = model.isZeroSubject(instances.instance(i), sentence);
  36 + areZeros.add(isZero);
  37 + if (isZero)
  38 + verbsWithoutSubject++;
  39 + else
  40 + verbsWithSubject++;
  41 + }
45 42  
46   - int i = 0;
47   - for (Token m : sentence) {
48   - if (!FeatureGeneration.isVerb(m))
49   - continue;
50   - if (areZeros.get(i))
51   - sentence.addMention(new Mention(m, true));
52   - i++;
53   - }
54   - }
  43 + int i = 0;
  44 + for (Token m : sentence) {
  45 + if (!FeatureGeneration.isVerb(m))
  46 + continue;
  47 + if (areZeros.get(i))
  48 + sentence.addMention(new Mention(m, true));
  49 + i++;
  50 + }
  51 + }
55 52  
56   - public ZeroSubjectDetector(File zeroSubjectDetectionModel) {
57   - try {
58   - this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
59   - this.quasiVerbs = this.model.getQuasiVerbs();
60   - } catch (Exception e) {
61   - logger.error("Error loading model:" + e);
62   - }
63   - }
  53 + public ZeroSubjectDetector(File zeroSubjectDetectionModel) {
  54 + try {
  55 + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
  56 + this.quasiVerbs = this.model.getQuasiVerbs();
  57 + } catch (Exception e) {
  58 + logger.error("Error loading model:" + e);
  59 + }
  60 + }
64 61  
65   - public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) {
66   - try {
67   - this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
68   - this.quasiVerbs = this.model.getQuasiVerbs();
69   - } catch (Exception e) {
70   - logger.error("Error loading model:" + e);
71   - }
72   - }
  62 + public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) {
  63 + try {
  64 + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
  65 + this.quasiVerbs = this.model.getQuasiVerbs();
  66 + } catch (Exception e) {
  67 + logger.error("Error loading model:" + e);
  68 + }
  69 + }
73 70 }
... ...
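
A minimal sketch of applying a trained model with the ZeroSubjectDetector shown above (illustrative only, not part of the commit): the model path is a placeholder for any file produced by Trainer.

import java.io.File;

import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.Sentence;

public class ZeroSubjectSketch {
    public static void annotate(Sentence sentence) {
        // Loading the model per call is wasteful; a real caller would reuse the detector.
        ZeroSubjectDetector detector =
                new ZeroSubjectDetector(new File("target/zero-subject.model"));
        detector.addZeroSubjectMentions(sentence);
    }
}
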
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
3   -import java.util.ArrayList;
4   -import java.util.Collection;
5   -import java.util.Collections;
6   -import java.util.HashSet;
7   -import java.util.List;
8   -import java.util.Set;
  3 +import java.util.*;
9 4  
10 5 public class Token implements Comparable<Token> {
11 6 private Sentence sentence;
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
1 1 package pl.waw.ipipan.zil.core.md.io.tei;
2 2  
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
  6 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*;
  7 +import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
  8 +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
  9 +
3 10 import java.io.File;
4 11 import java.util.ArrayList;
5 12 import java.util.HashMap;
6 13 import java.util.List;
7 14 import java.util.Map;
8 15  
9   -import org.apache.log4j.Logger;
10   -
11   -import pl.waw.ipipan.zil.core.md.entities.Interpretation;
12   -import pl.waw.ipipan.zil.core.md.entities.Mention;
13   -import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
14   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
15   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
16   -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
17   -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
18   -import pl.waw.ipipan.zil.core.md.entities.Text;
19   -import pl.waw.ipipan.zil.core.md.entities.Token;
20   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
21   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIGroup;
22   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIInterpretation;
23   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
24   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;
25   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEINamedEntity;
26   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph;
27   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence;
28   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISyntacticEntity;
29   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIWord;
30   -import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
31   -import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
32   -
33 16 public class TeiLoader {
34 17  
35   - private static Logger logger = Logger.getLogger(TeiLoader.class);
36   - private static TEI_IO teiAPI = TEI_IO.getInstance();
37   -
38   - public static TEICorpusText readTeiText(File teiDir) throws TEIException {
39   - return teiAPI.readFromNKJPDirectory(teiDir);
40   - }
41   -
42   - public static Text loadTextFromTei(TEICorpusText teiText) {
43   - Text text = new Text(teiText.getCorpusHeader().getId());
44   -
45   - logger.debug("Loading tei text " + text.getId() + "...");
46   - for (TEIParagraph teiP : teiText.getParagraphs())
47   - loadParagraph(text, teiP);
48   - logger.debug("Tei text loaded.");
49   -
50   - return text;
51   - }
52   -
53   - private static void loadParagraph(Text text, TEIParagraph teiP) {
54   - Paragraph p = new Paragraph();
55   - text.add(p);
56   - for (TEISentence teiS : teiP.getSentences())
57   - loadSentence(p, teiS);
58   - }
59   -
60   - private static void loadSentence(Paragraph p, TEISentence teiS) {
61   - Sentence s = new Sentence();
62   - p.add(s);
63   - Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
64   - for (TEIMorph teiM : teiS.getMorphs()) {
65   - Token token = loadToken(s, teiM);
66   - teiMorph2Segment.put(teiM, token);
67   - }
68   - for (TEINamedEntity ne : teiS.getAllNamedEntities())
69   - loadNE(s, ne, teiMorph2Segment);
70   - for (TEIWord w : teiS.getAllWords())
71   - loadSyntacticWord(s, w, teiMorph2Segment);
72   - for (TEIGroup g : teiS.getAllGroups())
73   - loadSyntacticGroup(s, g, teiMorph2Segment);
74   - for (TEIMention m : teiS.getAllMentions())
75   - loadMentions(s, m, teiMorph2Segment);
76   - }
77   -
78   - private static void loadMentions(Sentence s, TEIMention m,
79   - Map<TEIMorph, Token> teiMorph2Segment) {
80   - List<Token> tokens = new ArrayList<>();
81   - for (TEIMorph mo : m.getMorphs())
82   - tokens.add(teiMorph2Segment.get(mo));
83   - List<Token> headTokens = new ArrayList<>();
84   - for (TEIMorph mo : m.getHeadMorphs())
85   - headTokens.add(teiMorph2Segment.get(mo));
86   - s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
87   - }
88   -
89   - private static void loadSyntacticGroup(Sentence s, TEIGroup g,
90   - Map<TEIMorph, Token> teiMorph2Segment) {
91   - String type = g.getType();
92   -
93   - List<Token> tokens = new ArrayList<>();
94   - for (TEIMorph m : g.getLeaves())
95   - tokens.add(teiMorph2Segment.get(m));
96   -
97   - List<Token> headTokens = new ArrayList<>();
98   - TEISyntacticEntity semanticHead = g;
99   - while (semanticHead.isGroup()
100   - && semanticHead.asGroup().getSemanticHead() != null)
101   - semanticHead = semanticHead.asGroup().getSemanticHead();
102   - for (TEIMorph m : semanticHead.getLeaves())
103   - headTokens.add(teiMorph2Segment.get(m));
104   -
105   - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
106   - }
107   -
108   - private static void loadSyntacticWord(Sentence s, TEIWord w,
109   - Map<TEIMorph, Token> teiMorph2Segment) {
110   - String ctag = w.getInterpretation().getCtag();
111   - List<Token> tokens = new ArrayList<>();
112   - for (TEIMorph m : w.getAllMorphs())
113   - tokens.add(teiMorph2Segment.get(m));
114   - s.addSyntacticWord(new SyntacticWord(ctag, tokens));
115   - }
116   -
117   - private static void loadNE(Sentence s, TEINamedEntity ne,
118   - Map<TEIMorph, Token> teiMorph2Segment) {
119   - List<Token> tokens = new ArrayList<>();
120   - for (TEIMorph m : ne.getLeaves())
121   - tokens.add(teiMorph2Segment.get(m));
122   - s.addNamedEntity(new NamedEntity(tokens));
123   - }
124   -
125   - private static Token loadToken(Sentence s, TEIMorph teiM) {
126   - Token seg = new Token();
127   - s.add(seg);
128   -
129   - seg.setOrth(teiM.getOrth());
130   - TEIInterpretation interp = teiM.getChosenInterpretation();
131   - Interpretation chosenIterpretation = new Interpretation(
132   - interp.getCtag(), interp.getMorph(), interp.getBase());
133   - seg.addChosenInterpretation(chosenIterpretation);
134   -
135   - for (TEIInterpretation interp2 : teiM.getAllInterpretations()) {
136   - Interpretation inter = new Interpretation(interp2.getCtag(),
137   - interp2.getMorph(), interp.getBase());
138   - seg.addInterpretation(inter);
139   - }
140   -
141   - return seg;
142   - }
  18 + private static Logger logger = LoggerFactory.getLogger(TeiLoader.class);
  19 + private static TEI_IO teiAPI = TEI_IO.getInstance();
  20 +
  21 + private TeiLoader() {
  22 + }
  23 +
  24 + public static TEICorpusText readTeiText(File teiDir) throws TEIException {
  25 + return teiAPI.readFromNKJPDirectory(teiDir);
  26 + }
  27 +
  28 + public static Text loadTextFromTei(TEICorpusText teiText) {
  29 + Text text = new Text(teiText.getCorpusHeader().getId());
  30 +
  31 + logger.debug("Loading tei text " + text.getId() + "...");
  32 + for (TEIParagraph teiP : teiText.getParagraphs())
  33 + loadParagraph(text, teiP);
  34 + logger.debug("Tei text loaded.");
  35 +
  36 + return text;
  37 + }
  38 +
  39 + private static void loadParagraph(Text text, TEIParagraph teiP) {
  40 + Paragraph p = new Paragraph();
  41 + text.add(p);
  42 + for (TEISentence teiS : teiP.getSentences())
  43 + loadSentence(p, teiS);
  44 + }
  45 +
  46 + private static void loadSentence(Paragraph p, TEISentence teiS) {
  47 + Sentence s = new Sentence();
  48 + p.add(s);
  49 + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
  50 + for (TEIMorph teiM : teiS.getMorphs()) {
  51 + Token token = loadToken(s, teiM);
  52 + teiMorph2Segment.put(teiM, token);
  53 + }
  54 + for (TEINamedEntity ne : teiS.getAllNamedEntities())
  55 + loadNE(s, ne, teiMorph2Segment);
  56 + for (TEIWord w : teiS.getAllWords())
  57 + loadSyntacticWord(s, w, teiMorph2Segment);
  58 + for (TEIGroup g : teiS.getAllGroups())
  59 + loadSyntacticGroup(s, g, teiMorph2Segment);
  60 + for (TEIMention m : teiS.getAllMentions())
  61 + loadMentions(s, m, teiMorph2Segment);
  62 + }
  63 +
  64 + private static void loadMentions(Sentence s, TEIMention m,
  65 + Map<TEIMorph, Token> teiMorph2Segment) {
  66 + List<Token> tokens = new ArrayList<>();
  67 + for (TEIMorph mo : m.getMorphs())
  68 + tokens.add(teiMorph2Segment.get(mo));
  69 + List<Token> headTokens = new ArrayList<>();
  70 + for (TEIMorph mo : m.getHeadMorphs())
  71 + headTokens.add(teiMorph2Segment.get(mo));
  72 + s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
  73 + }
  74 +
  75 + private static void loadSyntacticGroup(Sentence s, TEIGroup g,
  76 + Map<TEIMorph, Token> teiMorph2Segment) {
  77 + String type = g.getType();
  78 +
  79 + List<Token> tokens = new ArrayList<>();
  80 + for (TEIMorph m : g.getLeaves())
  81 + tokens.add(teiMorph2Segment.get(m));
  82 +
  83 + List<Token> headTokens = new ArrayList<>();
  84 + TEISyntacticEntity semanticHead = g;
  85 + while (semanticHead.isGroup()
  86 + && semanticHead.asGroup().getSemanticHead() != null)
  87 + semanticHead = semanticHead.asGroup().getSemanticHead();
  88 + for (TEIMorph m : semanticHead.getLeaves())
  89 + headTokens.add(teiMorph2Segment.get(m));
  90 +
  91 + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
  92 + }
  93 +
  94 + private static void loadSyntacticWord(Sentence s, TEIWord w,
  95 + Map<TEIMorph, Token> teiMorph2Segment) {
  96 + String ctag = w.getInterpretation().getCtag();
  97 + List<Token> tokens = new ArrayList<>();
  98 + for (TEIMorph m : w.getAllMorphs())
  99 + tokens.add(teiMorph2Segment.get(m));
  100 + s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  101 + }
  102 +
  103 + private static void loadNE(Sentence s, TEINamedEntity ne,
  104 + Map<TEIMorph, Token> teiMorph2Segment) {
  105 + List<Token> tokens = new ArrayList<>();
  106 + for (TEIMorph m : ne.getLeaves())
  107 + tokens.add(teiMorph2Segment.get(m));
  108 + s.addNamedEntity(new NamedEntity(tokens));
  109 + }
  110 +
  111 + private static Token loadToken(Sentence s, TEIMorph teiM) {
  112 + Token seg = new Token();
  113 + s.add(seg);
  114 +
  115 + seg.setOrth(teiM.getOrth());
  116 + TEIInterpretation interp = teiM.getChosenInterpretation();
  117 + Interpretation chosenIterpretation = new Interpretation(
  118 + interp.getCtag(), interp.getMorph(), interp.getBase());
  119 + seg.addChosenInterpretation(chosenIterpretation);
  120 +
  121 + for (TEIInterpretation interp2 : teiM.getAllInterpretations()) {
  122 + Interpretation inter = new Interpretation(interp2.getCtag(),
  123 + interp2.getMorph(), interp.getBase());
  124 + seg.addInterpretation(inter);
  125 + }
  126 +
  127 + return seg;
  128 + }
143 129  
144 130 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
1 1 package pl.waw.ipipan.zil.core.md.io.tei;
2 2  
3   -import java.io.File;
4   -import java.util.ArrayList;
5   -import java.util.HashMap;
6   -import java.util.Iterator;
7   -import java.util.List;
8   -import java.util.Map;
9   -
10   -import org.apache.log4j.Logger;
11   -
12   -import pl.waw.ipipan.zil.core.md.entities.Mention;
13   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
14   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
15   -import pl.waw.ipipan.zil.core.md.entities.Text;
16   -import pl.waw.ipipan.zil.core.md.entities.Token;
17   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.AnnotationLayer;
18   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.EntitiesFactory;
19   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICoreference;
20   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
21   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
22   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;
23   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph;
24   -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence;
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
  6 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*;
25 7 import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
26 8 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
27 9 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod;
28 10  
  11 +import java.io.File;
  12 +import java.util.*;
  13 +
29 14 public class TeiSaver {
30 15  
31   - private static Logger logger = Logger.getLogger(TeiSaver.class);
32   - private static TEI_IO teiAPI = TEI_IO.getInstance();
33   - final private static EntitiesFactory ef = EntitiesFactory.getInstance();
34   -
35   - public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
36   - logger.debug("Saving text in " + targetDir);
37   - CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
38   - teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
39   - }
40   -
41   - public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
42   - Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>();
43   -
44   - Iterator<Paragraph> pIt = t.iterator();
45   - Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();
46   - int mentionId = 0;
47   - while (pIt.hasNext() && pItTei.hasNext()) {
48   - Paragraph p = pIt.next();
49   - TEIParagraph pTei = pItTei.next();
50   -
51   - mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei);
52   - }
53   - checkIterators(pIt, pItTei, "paragraph");
54   -
55   - teiText.addAnnotationLayer(AnnotationLayer.MENTIONS,
56   - EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS));
57   -
58   - // clear coreference as we have new mentions it became invalid
59   - teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
60   - teiText.setCoreferences(new ArrayList<TEICoreference>());
61   -
62   - logger.debug(mentionId + " mentions added");
63   - }
64   -
65   - private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p,
66   - TEIParagraph pTei) throws TEIException {
67   - Iterator<Sentence> sIt = p.iterator();
68   - Iterator<TEISentence> sItTei = pTei.getSentences().iterator();
69   -
70   - while (sIt.hasNext() && sItTei.hasNext()) {
71   - Sentence s = sIt.next();
72   - TEISentence sTei = sItTei.next();
73   - mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei);
74   - }
75   - checkIterators(sIt, sItTei, "sentence");
76   - return mentionId;
77   - }
78   -
79   - private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s,
80   - TEISentence sTei) throws TEIException {
81   - sTei.getAllMentions().clear();
82   -
83   - Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>();
84   -
85   - Iterator<Token> segIt = s.iterator();
86   - Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
87   -
88   - while (segIt.hasNext() && segItTei.hasNext()) {
89   - seg2morph.put(segIt.next(), segItTei.next());
90   - }
91   - checkIterators(segIt, segItTei, "token");
92   -
93   - List<TEIMention> mentions = new ArrayList<TEIMention>();
94   -
95   - for (Mention m : s.getMentions()) {
96   - List<TEIMorph> morphs = new ArrayList<TEIMorph>();
97   - List<TEIMorph> heads = new ArrayList<TEIMorph>();
98   -
99   - for (Token seg : m.getSegments())
100   - morphs.add(seg2morph.get(seg));
101   -
102   - for (Token seg : m.getHeadSegments())
103   - heads.add(seg2morph.get(seg));
104   -
105   - TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject());
106   - mentions.add(mention);
107   - mention2mention.put(m, mention);
108   - }
109   - sTei.setMentions(mentions);
110   - return mentionId;
111   - }
112   -
113   - private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level)
114   - throws TEIException {
115   - if (one.hasNext() || other.hasNext())
116   - throw new TEIException("Problem mapping tei to thrift for level " + level);
117   - }
  16 + private static final Logger logger = LoggerFactory.getLogger(TeiSaver.class);
  17 + private static final TEI_IO teiAPI = TEI_IO.getInstance();
  18 + private static final EntitiesFactory ef = EntitiesFactory.getInstance();
  19 +
  20 + private TeiSaver() {
  21 + }
  22 +
  23 + public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
  24 + logger.debug("Saving text in " + targetDir);
  25 + CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
  26 + teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
  27 + }
  28 +
  29 + public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
  30 + Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>();
  31 +
  32 + Iterator<Paragraph> pIt = t.iterator();
  33 + Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();
  34 + int mentionId = 0;
  35 + while (pIt.hasNext() && pItTei.hasNext()) {
  36 + Paragraph p = pIt.next();
  37 + TEIParagraph pTei = pItTei.next();
  38 +
  39 + mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei);
  40 + }
  41 + checkIterators(pIt, pItTei, "paragraph");
  42 +
  43 + teiText.addAnnotationLayer(AnnotationLayer.MENTIONS,
  44 + EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS));
  45 +
  46 + // clear the coreference layer: the existing coreference became invalid once new mentions were added
  47 + teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
  48 + teiText.setCoreferences(new ArrayList<TEICoreference>());
  49 +
  50 + logger.debug(mentionId + " mentions added");
  51 + }
  52 +
  53 + private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p,
  54 + TEIParagraph pTei) throws TEIException {
  55 + Iterator<Sentence> sIt = p.iterator();
  56 + Iterator<TEISentence> sItTei = pTei.getSentences().iterator();
  57 +
  58 + while (sIt.hasNext() && sItTei.hasNext()) {
  59 + Sentence s = sIt.next();
  60 + TEISentence sTei = sItTei.next();
  61 + mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei);
  62 + }
  63 + checkIterators(sIt, sItTei, "sentence");
  64 + return mentionId;
  65 + }
  66 +
  67 + private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s,
  68 + TEISentence sTei) throws TEIException {
  69 + sTei.getAllMentions().clear();
  70 +
  71 + Map<Token, TEIMorph> seg2morph = new HashMap<>();
  72 +
  73 + Iterator<Token> segIt = s.iterator();
  74 + Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
  75 +
  76 + while (segIt.hasNext() && segItTei.hasNext()) {
  77 + seg2morph.put(segIt.next(), segItTei.next());
  78 + }
  79 + checkIterators(segIt, segItTei, "token");
  80 +
  81 + List<TEIMention> mentions = new ArrayList<>();
  82 +
  83 + for (Mention m : s.getMentions()) {
  84 + List<TEIMorph> morphs = new ArrayList<>();
  85 + List<TEIMorph> heads = new ArrayList<>();
  86 +
  87 + for (Token seg : m.getSegments())
  88 + morphs.add(seg2morph.get(seg));
  89 +
  90 + for (Token seg : m.getHeadSegments())
  91 + heads.add(seg2morph.get(seg));
  92 +
  93 + TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject());
  94 + mentions.add(mention);
  95 + mention2mention.put(m, mention);
  96 + }
  97 + sTei.setMentions(mentions);
  98 + return mentionId;
  99 + }
  100 +
  101 + private static void checkIterators(Iterator<?> one, Iterator<?> other, String level)
  102 + throws TEIException {
  103 + if (one.hasNext() || other.hasNext())
  104 + throw new TEIException("Problem mapping tei to thrift for level " + level);
  105 + }
118 106  
119 107 }
... ...
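
A minimal sketch of the TEI round trip implemented by TeiLoader and TeiSaver above (illustrative only, not part of the commit): the directory names are placeholders, the input must be an NKJP-style TEI directory, and the mention detection step itself is elided.

import java.io.File;

import pl.waw.ipipan.zil.core.md.entities.Text;
import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;

public class TeiRoundTripSketch {
    public static void main(String[] args) throws Exception {
        TEICorpusText teiText = TeiLoader.readTeiText(new File("input/text1"));
        Text text = TeiLoader.loadTextFromTei(teiText);
        // ... mention detection would add mentions to 'text' here ...
        TeiSaver.updateTeiText(text, teiText);                          // copy mentions back onto the TEI structure
        TeiSaver.saveTeiText(teiText, new File("output/text1"), true);  // write gzipped TEI files
    }
}
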
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
1 1 package pl.waw.ipipan.zil.core.md.io.thrift;
2 2  
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  7 +
3 8 import java.util.ArrayList;
4 9 import java.util.HashMap;
5 10 import java.util.List;
6 11 import java.util.Map;
7 12  
8   -import org.apache.log4j.Logger;
9   -
10   -import pl.waw.ipipan.zil.core.md.entities.Interpretation;
11   -import pl.waw.ipipan.zil.core.md.entities.NamedEntity;
12   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
13   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
14   -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
15   -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
16   -import pl.waw.ipipan.zil.core.md.entities.Text;
17   -import pl.waw.ipipan.zil.core.md.entities.Token;
18   -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
19   -import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation;
20   -import pl.waw.ipipan.zil.multiservice.thrift.types.TNamedEntity;
21   -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
22   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
23   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticGroup;
24   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticWord;
25   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
26   -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
27   -
28 13 public class ThriftLoader {
29 14  
30   - private static Logger logger = Logger.getLogger(ThriftLoader.class);
31   -
32   - public static Text loadTextFromThrift(TText thriftText)
33   - throws MultiserviceException {
34   - Text text = new Text(thriftText.getTextHeader() == null ? "null"
35   - : thriftText.getTextHeader().getId());
36   -
37   - logger.debug("Loading text " + text.getId() + " from thrift format...");
38   - for (TParagraph teiP : thriftText.getParagraphs())
39   - loadParagraph(text, teiP);
40   - logger.debug("Thrift text loaded.");
41   -
42   - return text;
43   - }
44   -
45   - private static void loadParagraph(Text text, TParagraph teiP)
46   - throws MultiserviceException {
47   - Paragraph p = new Paragraph();
48   - text.add(p);
49   -
50   - for (TSentence teiS : teiP.getSentences())
51   - loadSentence(p, teiS);
52   - }
53   -
54   - private static void loadSentence(Paragraph p, TSentence thriftSent)
55   - throws MultiserviceException {
56   - Sentence s = new Sentence();
57   - p.add(s);
58   -
59   - Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent);
60   -
61   - Map<String, Token> thiftTokenId2Token = new HashMap<>();
62   - for (TToken teiM : thriftSent.getTokens()) {
63   - Token token = loadToken(s, teiM);
64   - thiftTokenId2Token.put(teiM.getId(), token);
65   - }
66   - if (thriftSent.isSetNames())
67   - for (TNamedEntity ne : thriftSent.getNames())
68   - loadNE(s, ne, thirftId2Entity, thiftTokenId2Token);
69   - if (thriftSent.isSetWords())
70   - for (TSyntacticWord w : thriftSent.getWords())
71   - loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token);
72   - if (thriftSent.isSetGroups())
73   - for (TSyntacticGroup g : thriftSent.getGroups())
74   - loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token);
75   - }
76   -
77   - private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
78   - Map<String, Object> thirftId2Entity,
79   - Map<String, Token> thiftTokenId2Token) {
80   - String type = g.getType();
81   - List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity,
82   - thiftTokenId2Token, false);
83   - List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity,
84   - thiftTokenId2Token, true);
85   - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
86   - }
87   -
88   - private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
89   - Map<String, Object> thirftId2Entity,
90   - Map<String, Token> thiftTokenId2Token) {
91   - String ctag = w.getChosenInterpretation().getCtag();
92   - List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
93   - thiftTokenId2Token, false);
94   - s.addSyntacticWord(new SyntacticWord(ctag, tokens));
95   - }
96   -
97   - private static void loadNE(Sentence s, TNamedEntity ne,
98   - Map<String, Object> thirftId2Entity,
99   - Map<String, Token> thiftTokenId2Token) {
100   - List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
101   - thiftTokenId2Token, false);
102   - s.addNamedEntity(new NamedEntity(tokens));
103   - }
104   -
105   - private static Map<String, Object> getThriftId2EntityMap(
106   - TSentence thriftSent) {
107   - Map<String, Object> idToEntity = new HashMap<>();
108   - for (TToken tok : thriftSent.getTokens())
109   - idToEntity.put(tok.getId(), tok);
110   - if (thriftSent.isSetWords())
111   - for (TSyntacticWord w : thriftSent.getWords())
112   - idToEntity.put(w.getId(), w);
113   - if (thriftSent.isSetNames())
114   - for (TNamedEntity ne : thriftSent.getNames())
115   - idToEntity.put(ne.getId(), ne);
116   - if (thriftSent.isSetGroups())
117   - for (TSyntacticGroup group : thriftSent.getGroups())
118   - idToEntity.put(group.getId(), group);
119   - return idToEntity;
120   - }
121   -
122   - private static Token loadToken(Sentence s, TToken teiM)
123   - throws MultiserviceException {
124   - Token seg = new Token();
125   - s.add(seg);
126   -
127   - seg.setOrth(teiM.getOrth());
128   - TInterpretation interp = getTokenChosenInt(teiM);
129   - Interpretation chosenIterpretation = new Interpretation(
130   - interp.getCtag(), interp.getMsd(), interp.getBase());
131   - seg.addChosenInterpretation(chosenIterpretation);
132   -
133   - for (TInterpretation interp2 : teiM.getInterpretations()) {
134   - Interpretation inter = new Interpretation(interp2.getCtag(),
135   - interp2.getMsd(), interp.getBase());
136   - seg.addInterpretation(inter);
137   - }
138   - return seg;
139   - }
140   -
141   - private static TInterpretation getTokenChosenInt(TToken token)
142   - throws MultiserviceException {
143   - TInterpretation interp = token.getChosenInterpretation();
144   - if (interp == null || interp.getBase() == null
145   - || interp.getBase().equals("")) {
146   - if (token.getCandidateInterpretations() == null
147   - || token.getCandidateInterpretations().size() == 0
148   - || token.getCandidateInterpretations().get(0).getBase() == null
149   - || token.getCandidateInterpretations().get(0).getBase()
150   - .equals(""))
151   - throw new MultiserviceException(
152   - "No proper chosen or candidate interpretation for segment: "
153   - + token.id);
154   - interp = token.getCandidateInterpretations().get(0);
155   - }
156   - return interp;
157   - }
158   -
159   - private static List<Token> getUnderlyingSegments(Object entity,
160   - Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
161   - boolean headsOnly) {
162   - List<Token> result = new ArrayList<>();
163   -
164   - if (entity instanceof TToken) {
165   - result.add(tokenId2Segment.get(((TToken) entity).getId()));
166   - return result;
167   - }
168   -
169   - List<String> childIds = new ArrayList<>();
170   - if (entity instanceof TSyntacticWord)
171   - childIds = ((TSyntacticWord) entity).getChildIds();
172   - else if (entity instanceof TNamedEntity)
173   - childIds = ((TNamedEntity) entity).getChildIds();
174   - else if (entity instanceof TSyntacticGroup)
175   - if (headsOnly) {
176   - childIds = new ArrayList<String>();
177   - childIds.add(((TSyntacticGroup) entity).getSemanticHeadId());
178   - } else
179   - childIds = ((TSyntacticGroup) entity).getChildIds();
180   -
181   - for (String id : childIds)
182   - result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity,
183   - tokenId2Segment, headsOnly));
184   -
185   - return result;
186   - }
  15 + private static Logger logger = LoggerFactory.getLogger(ThriftLoader.class);
  16 +
  17 + public static Text loadTextFromThrift(TText thriftText)
  18 + throws MultiserviceException {
  19 + Text text = new Text(thriftText.getTextHeader() == null ? "null"
  20 + : thriftText.getTextHeader().getId());
  21 +
  22 + logger.debug("Loading text " + text.getId() + " from thrift format...");
  23 + for (TParagraph teiP : thriftText.getParagraphs())
  24 + loadParagraph(text, teiP);
  25 + logger.debug("Thrift text loaded.");
  26 +
  27 + return text;
  28 + }
  29 +
  30 + private static void loadParagraph(Text text, TParagraph teiP)
  31 + throws MultiserviceException {
  32 + Paragraph p = new Paragraph();
  33 + text.add(p);
  34 +
  35 + for (TSentence teiS : teiP.getSentences())
  36 + loadSentence(p, teiS);
  37 + }
  38 +
  39 + private static void loadSentence(Paragraph p, TSentence thriftSent)
  40 + throws MultiserviceException {
  41 + Sentence s = new Sentence();
  42 + p.add(s);
  43 +
  44 + Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent);
  45 +
  46 + Map<String, Token> thiftTokenId2Token = new HashMap<>();
  47 + for (TToken teiM : thriftSent.getTokens()) {
  48 + Token token = loadToken(s, teiM);
  49 + thiftTokenId2Token.put(teiM.getId(), token);
  50 + }
  51 + if (thriftSent.isSetNames())
  52 + for (TNamedEntity ne : thriftSent.getNames())
  53 + loadNE(s, ne, thirftId2Entity, thiftTokenId2Token);
  54 + if (thriftSent.isSetWords())
  55 + for (TSyntacticWord w : thriftSent.getWords())
  56 + loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token);
  57 + if (thriftSent.isSetGroups())
  58 + for (TSyntacticGroup g : thriftSent.getGroups())
  59 + loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token);
  60 + }
  61 +
  62 + private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
  63 + Map<String, Object> thirftId2Entity,
  64 + Map<String, Token> thiftTokenId2Token) {
  65 + String type = g.getType();
  66 + List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity,
  67 + thiftTokenId2Token, false);
  68 + List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity,
  69 + thiftTokenId2Token, true);
  70 + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
  71 + }
  72 +
  73 + private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
  74 + Map<String, Object> thirftId2Entity,
  75 + Map<String, Token> thiftTokenId2Token) {
  76 + String ctag = w.getChosenInterpretation().getCtag();
  77 + List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
  78 + thiftTokenId2Token, false);
  79 + s.addSyntacticWord(new SyntacticWord(ctag, tokens));
  80 + }
  81 +
  82 + private static void loadNE(Sentence s, TNamedEntity ne,
  83 + Map<String, Object> thirftId2Entity,
  84 + Map<String, Token> thiftTokenId2Token) {
  85 + List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
  86 + thiftTokenId2Token, false);
  87 + s.addNamedEntity(new NamedEntity(tokens));
  88 + }
  89 +
  90 + private static Map<String, Object> getThriftId2EntityMap(
  91 + TSentence thriftSent) {
  92 + Map<String, Object> idToEntity = new HashMap<>();
  93 + for (TToken tok : thriftSent.getTokens())
  94 + idToEntity.put(tok.getId(), tok);
  95 + if (thriftSent.isSetWords())
  96 + for (TSyntacticWord w : thriftSent.getWords())
  97 + idToEntity.put(w.getId(), w);
  98 + if (thriftSent.isSetNames())
  99 + for (TNamedEntity ne : thriftSent.getNames())
  100 + idToEntity.put(ne.getId(), ne);
  101 + if (thriftSent.isSetGroups())
  102 + for (TSyntacticGroup group : thriftSent.getGroups())
  103 + idToEntity.put(group.getId(), group);
  104 + return idToEntity;
  105 + }
  106 +
  107 + private static Token loadToken(Sentence s, TToken teiM)
  108 + throws MultiserviceException {
  109 + Token seg = new Token();
  110 + s.add(seg);
  111 +
  112 + seg.setOrth(teiM.getOrth());
  113 + TInterpretation interp = getTokenChosenInt(teiM);
  114 + Interpretation chosenIterpretation = new Interpretation(
  115 + interp.getCtag(), interp.getMsd(), interp.getBase());
  116 + seg.addChosenInterpretation(chosenIterpretation);
  117 +
  118 + for (TInterpretation interp2 : teiM.getInterpretations()) {
  119 + Interpretation inter = new Interpretation(interp2.getCtag(),
  120 + interp2.getMsd(), interp.getBase());
  121 + seg.addInterpretation(inter);
  122 + }
  123 + return seg;
  124 + }
  125 +
  126 + private static TInterpretation getTokenChosenInt(TToken token)
  127 + throws MultiserviceException {
  128 + TInterpretation interp = token.getChosenInterpretation();
  129 + if (interp == null || interp.getBase() == null
  130 + || "".equals(interp.getBase())) {
  131 + if (token.getCandidateInterpretations() == null
  132 + || token.getCandidateInterpretations().isEmpty()
  133 + || token.getCandidateInterpretations().get(0).getBase() == null
  134 + || "".equals(token.getCandidateInterpretations().get(0).getBase()))
  135 + throw new MultiserviceException(
  136 + "No proper chosen or candidate interpretation for segment: "
  137 + + token.id);
  138 + interp = token.getCandidateInterpretations().get(0);
  139 + }
  140 + return interp;
  141 + }
  142 +
  143 + private static List<Token> getUnderlyingSegments(Object entity,
  144 + Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
  145 + boolean headsOnly) {
  146 + List<Token> result = new ArrayList<>();
  147 +
  148 + if (entity instanceof TToken) {
  149 + result.add(tokenId2Segment.get(((TToken) entity).getId()));
  150 + return result;
  151 + }
  152 +
  153 + List<String> childIds = new ArrayList<>();
  154 + if (entity instanceof TSyntacticWord)
  155 + childIds = ((TSyntacticWord) entity).getChildIds();
  156 + else if (entity instanceof TNamedEntity)
  157 + childIds = ((TNamedEntity) entity).getChildIds();
  158 + else if (entity instanceof TSyntacticGroup)
  159 + if (headsOnly) {
  160 + childIds = new ArrayList<>();
  161 + childIds.add(((TSyntacticGroup) entity).getSemanticHeadId());
  162 + } else
  163 + childIds = ((TSyntacticGroup) entity).getChildIds();
  164 +
  165 + for (String id : childIds)
  166 + result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity,
  167 + tokenId2Segment, headsOnly));
  168 +
  169 + return result;
  170 + }
187 171 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
1 1 package pl.waw.ipipan.zil.core.md.io.thrift;
2 2  
3   -import java.util.ArrayList;
4   -import java.util.HashMap;
5   -import java.util.Iterator;
6   -import java.util.List;
7   -import java.util.Map;
8   -
9   -import org.apache.log4j.Logger;
10   -
11   -import pl.waw.ipipan.zil.core.md.entities.Mention;
12   -import pl.waw.ipipan.zil.core.md.entities.Paragraph;
13   -import pl.waw.ipipan.zil.core.md.entities.Sentence;
14   -import pl.waw.ipipan.zil.core.md.entities.Text;
15   -import pl.waw.ipipan.zil.core.md.entities.Token;
16   -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
17   -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention;
18   -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph;
19   -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence;
20   -import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
21   -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken;
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
  6 +import pl.waw.ipipan.zil.multiservice.thrift.types.*;
  7 +
  8 +import java.util.*;
22 9  
23 10 public class ThriftSaver {
24 11  
25   - private static Logger logger = Logger.getLogger(ThriftSaver.class);
26   -
27   - public static void updateThriftText(Text responseText, TText text)
28   - throws MultiserviceException {
29   -
30   - logger.debug("Updating thrift text...");
31   - Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>();
32   -
33   - Iterator<TParagraph> thrPI = text.getParagraphsIterator();
34   - Iterator<Paragraph> teiPI = responseText.iterator();
35   - int freeMentionId = 0;
36   - while (thrPI.hasNext() && teiPI.hasNext()) {
37   - TParagraph thrP = thrPI.next();
38   - Paragraph teiP = teiPI.next();
39   -
40   - freeMentionId = updateThriftParagraph(teiMention2ThriftMention,
41   - freeMentionId, thrP, teiP);
42   - }
43   - checkIterators(thrPI, teiPI, "paragraph");
44   - }
45   -
46   - private static int updateThriftParagraph(
47   - Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId,
48   - TParagraph thrP, Paragraph teiP) throws MultiserviceException {
49   - Iterator<TSentence> thrSI = thrP.getSentencesIterator();
50   - Iterator<Sentence> teiSI = teiP.iterator();
51   - while (thrSI.hasNext() && teiSI.hasNext()) {
52   - TSentence thrS = thrSI.next();
53   - Sentence teiS = teiSI.next();
54   - freeMentionId = updateThriftSentence(teiMention2ThriftMention,
55   - freeMentionId, thrS, teiS);
56   - }
57   - checkIterators(thrSI, teiSI, "sentence");
58   - return freeMentionId;
59   - }
60   -
61   - private static int updateThriftSentence(
62   - Map<Mention, TMention> teiMention2ThriftMention, int id,
63   - TSentence thrS, Sentence teiS) throws MultiserviceException {
64   - thrS.unsetMentions();
65   - thrS.setMentions(new ArrayList<TMention>());
66   -
67   - Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>();
68   - Iterator<TToken> thrMI = thrS.getTokensIterator();
69   - Iterator<Token> teiMI = teiS.iterator();
70   - while (thrMI.hasNext() && teiMI.hasNext()) {
71   - teiMorph2ThriftToken.put(teiMI.next(), thrMI.next());
72   - }
73   - checkIterators(thrMI, teiMI, "morph");
74   -
75   - for (Mention m : teiS.getMentions()) {
76   - List<String> childIds = new ArrayList<>();
77   - List<String> headIds = new ArrayList<>();
78   - for (Token ch : m.getSegments())
79   - childIds.add(teiMorph2ThriftToken.get(ch).getId());
80   - for (Token h : m.getHeadSegments())
81   - headIds.add(teiMorph2ThriftToken.get(h).getId());
82   -
83   - TMention tm = new TMention("m-" + (id++), headIds, childIds,
84   - m.isZeroSubject());
85   - teiMention2ThriftMention.put(m, tm);
86   - thrS.addToMentions(tm);
87   - }
88   - return id;
89   - }
90   -
91   - private static void checkIterators(Iterator<? extends Object> one,
92   - Iterator<? extends Object> other, String level)
93   - throws MultiserviceException {
94   - if (one.hasNext() || other.hasNext())
95   - throw new MultiserviceException(
96   - "Problem mapping interal text representation to thrift for level "
97   - + level);
98   - }
  12 + private static final Logger LOG = LoggerFactory.getLogger(ThriftSaver.class);
  13 +
  14 + private ThriftSaver() {
  15 + }
  16 +
  17 + public static void updateThriftText(Text responseText, TText text)
  18 + throws MultiserviceException {
  19 +
  20 + LOG.debug("Updating thrift text...");
  21 + Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>();
  22 +
  23 + Iterator<TParagraph> thrPI = text.getParagraphsIterator();
  24 + Iterator<Paragraph> teiPI = responseText.iterator();
  25 + int freeMentionId = 0;
  26 + while (thrPI.hasNext() && teiPI.hasNext()) {
  27 + TParagraph thrP = thrPI.next();
  28 + Paragraph teiP = teiPI.next();
  29 +
  30 + freeMentionId = updateThriftParagraph(teiMention2ThriftMention,
  31 + freeMentionId, thrP, teiP);
  32 + }
  33 + checkIterators(thrPI, teiPI, "paragraph");
  34 + }
  35 +
  36 + private static int updateThriftParagraph(
  37 + Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId,
  38 + TParagraph thrP, Paragraph teiP) throws MultiserviceException {
  39 + Iterator<TSentence> thrSI = thrP.getSentencesIterator();
  40 + Iterator<Sentence> teiSI = teiP.iterator();
  41 + while (thrSI.hasNext() && teiSI.hasNext()) {
  42 + TSentence thrS = thrSI.next();
  43 + Sentence teiS = teiSI.next();
  44 + freeMentionId = updateThriftSentence(teiMention2ThriftMention,
  45 + freeMentionId, thrS, teiS);
  46 + }
  47 + checkIterators(thrSI, teiSI, "sentence");
  48 + return freeMentionId;
  49 + }
  50 +
  51 + private static int updateThriftSentence(
  52 + Map<Mention, TMention> teiMention2ThriftMention, int id,
  53 + TSentence thrS, Sentence teiS) throws MultiserviceException {
  54 + thrS.unsetMentions();
  55 + thrS.setMentions(new ArrayList<>());
  56 +
  57 + Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>();
  58 + Iterator<TToken> thrMI = thrS.getTokensIterator();
  59 + Iterator<Token> teiMI = teiS.iterator();
  60 + while (thrMI.hasNext() && teiMI.hasNext()) {
  61 + teiMorph2ThriftToken.put(teiMI.next(), thrMI.next());
  62 + }
  63 + checkIterators(thrMI, teiMI, "morph");
  64 +
  65 + for (Mention m : teiS.getMentions()) {
  66 + List<String> childIds = new ArrayList<>();
  67 + List<String> headIds = new ArrayList<>();
  68 + for (Token ch : m.getSegments())
  69 + childIds.add(teiMorph2ThriftToken.get(ch).getId());
  70 + for (Token h : m.getHeadSegments())
  71 + headIds.add(teiMorph2ThriftToken.get(h).getId());
  72 +
  73 + TMention tm = new TMention("m-" + (id++), headIds, childIds,
  74 + m.isZeroSubject());
  75 + teiMention2ThriftMention.put(m, tm);
  76 + thrS.addToMentions(tm);
  77 + }
  78 + return id;
  79 + }
  80 +
  81 + private static void checkIterators(Iterator<?> one,
  82 + Iterator<?> other, String level)
  83 + throws MultiserviceException {
  84 + if (one.hasNext() || other.hasNext())
  85 + throw new MultiserviceException(
  86 + "Problem mapping interal text representation to thrift for level "
  87 + + level);
  88 + }
99 89  
100 90 }
... ...