Commit bd7f5abb07ff32954b95699545ac6194c0a44c7f
1 parent
62ccdfdc
1.3 release
Showing
19 changed files
with
1441 additions
and
1465 deletions
doc/compile.sh
0 → 100755
doc/manual.pdf
No preview for this file type
doc/manual.tex
@@ -38,10 +38,10 @@ The current version of the program facilitates the automatic mention detection, | @@ -38,10 +38,10 @@ The current version of the program facilitates the automatic mention detection, | ||
38 | MentionDetector uses information provided in its input to produce mentions for coreference resolution. It merges entities provided by named entity recognition tools, shallow parsers and taggers. | 38 | MentionDetector uses information provided in its input to produce mentions for coreference resolution. It merges entities provided by named entity recognition tools, shallow parsers and taggers. |
39 | 39 | ||
40 | It also finds zero subjects in clauses and marks the verbs using zero subjects as mentions, using the algorithm presented in \cite{kop:14:eacl:short}, for which a model was trained using the full Polish Coreference Corpus, version 0.92 (corpus description in \cite{ogro:etal:13:ltc}). Training data had 15875 positive and 37798 negative examples; 10-fold cross validation yielded an accuracy of 86.14\% for the task of finding zero subjects. Precision of 79.8\% and recall of 71.2\% for the zero subject class of verbs was obtained. | 40 | It also finds zero subjects in clauses and marks the verbs using zero subjects as mentions, using the algorithm presented in \cite{kop:14:eacl:short}, for which a model was trained using the full Polish Coreference Corpus, version 0.92 (corpus description in \cite{ogro:etal:13:ltc}). Training data had 15875 positive and 37798 negative examples; 10-fold cross validation yielded an accuracy of 86.14\% for the task of finding zero subjects. Precision of 79.8\% and recall of 71.2\% for the zero subject class of verbs was obtained. |
41 | - | 41 | + |
42 | \textbf{Homepage:} \url{http://zil.ipipan.waw.pl/MentionDetector} \\ | 42 | \textbf{Homepage:} \url{http://zil.ipipan.waw.pl/MentionDetector} \\ |
43 | \textbf{Contact person:} Mateusz Kopeć [mateusz.kopec@ipipan.waw.pl] \\ | 43 | \textbf{Contact person:} Mateusz Kopeć [mateusz.kopec@ipipan.waw.pl] \\ |
44 | -\textbf{Author:} Mateusz Kopeć \\ | 44 | +\textbf{Author:} Mateusz Kopeć \\ |
45 | \textbf{License:} CC BY v.3 | 45 | \textbf{License:} CC BY v.3 |
46 | 46 | ||
47 | 47 | ||
@@ -49,7 +49,7 @@ It also finds zero subjects in clauses and marks the verbs using zero subjects a | @@ -49,7 +49,7 @@ It also finds zero subjects in clauses and marks the verbs using zero subjects a | ||
49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
50 | 50 | ||
51 | \section{Requirements} | 51 | \section{Requirements} |
52 | -Java Runtime Environment (JRE) 1.7 or newer. | 52 | +Java Runtime Environment (JRE) 1.8 or newer. |
53 | 53 | ||
54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% | 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
@@ -143,7 +143,7 @@ Zero subjects are distinguished from other mentions by having an additional feat | @@ -143,7 +143,7 @@ Zero subjects are distinguished from other mentions by having an additional feat | ||
143 | 143 | ||
144 | Standalone jar doesn't need any installation. To run it, simply execute:\\ | 144 | Standalone jar doesn't need any installation. To run it, simply execute:\\ |
145 | 145 | ||
146 | -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts>}\\ | 146 | +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts>}\\ |
147 | 147 | ||
148 | All texts recursively found in \texttt{<dir with input texts>} are going to be annotated with mentions layer and saved in \texttt{<dir for output texts>}.\\ | 148 | All texts recursively found in \texttt{<dir with input texts>} are going to be annotated with mentions layer and saved in \texttt{<dir for output texts>}.\\ |
149 | 149 | ||
@@ -153,7 +153,7 @@ All texts recursively found in \texttt{<dir with input texts>} are going to be a | @@ -153,7 +153,7 @@ All texts recursively found in \texttt{<dir with input texts>} are going to be a | ||
153 | \section{Custom zero subject detection model} | 153 | \section{Custom zero subject detection model} |
154 | If you want to use custom zero subject detection model, you may try:\\ | 154 | If you want to use custom zero subject detection model, you may try:\\ |
155 | 155 | ||
156 | -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts> <model\_path>} | 156 | +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts> <model\_path>} |
157 | 157 | ||
158 | To create such model, use the \texttt{pl.waw.ipipan.zil.core.md.detection.zero.Trainer} class. | 158 | To create such model, use the \texttt{pl.waw.ipipan.zil.core.md.detection.zero.Trainer} class. |
159 | 159 |
pom.xml
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" | 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
2 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | ||
3 | - <modelVersion>4.0.0</modelVersion> | ||
4 | - <groupId>pl.waw.ipipan.zil.core</groupId> | ||
5 | - <artifactId>md</artifactId> | ||
6 | - <version>1.2-SNAPSHOT</version> | ||
7 | - <properties> | ||
8 | - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
9 | - </properties> | ||
10 | - <build> | ||
11 | - <plugins> | ||
12 | - <plugin> | ||
13 | - <artifactId>maven-compiler-plugin</artifactId> | ||
14 | - <version>2.3.2</version> | ||
15 | - <configuration> | ||
16 | - <source>1.7</source> | ||
17 | - <target>1.7</target> | ||
18 | - </configuration> | ||
19 | - </plugin> | ||
20 | - <plugin> | ||
21 | - <artifactId>maven-source-plugin</artifactId> | ||
22 | - <version>2.4</version> | ||
23 | - <executions> | ||
24 | - <execution> | ||
25 | - <id>attach-sources</id> | ||
26 | - <phase>deploy</phase> | ||
27 | - <goals> | ||
28 | - <goal>jar-no-fork</goal> | ||
29 | - </goals> | ||
30 | - </execution> | ||
31 | - </executions> | ||
32 | - </plugin> | ||
33 | - <plugin> | ||
34 | - <artifactId>maven-javadoc-plugin</artifactId> | ||
35 | - <version>2.10.3</version> | ||
36 | - <executions> | ||
37 | - <execution> | ||
38 | - <id>attach-javadocs</id> | ||
39 | - <phase>deploy</phase> | ||
40 | - <goals> | ||
41 | - <goal>jar</goal> | ||
42 | - </goals> | ||
43 | - </execution> | ||
44 | - </executions> | ||
45 | - </plugin> | ||
46 | - <plugin> | ||
47 | - <!-- explicitly define maven-deploy-plugin after other to force exec | ||
48 | - order --> | ||
49 | - <artifactId>maven-deploy-plugin</artifactId> | ||
50 | - <version>2.7</version> | ||
51 | - <executions> | ||
52 | - <execution> | ||
53 | - <id>deploy</id> | ||
54 | - <phase>deploy</phase> | ||
55 | - <goals> | ||
56 | - <goal>deploy</goal> | ||
57 | - </goals> | ||
58 | - </execution> | ||
59 | - </executions> | ||
60 | - </plugin> | ||
61 | - <plugin> | ||
62 | - <groupId>org.dstovall</groupId> | ||
63 | - <artifactId>onejar-maven-plugin</artifactId> | ||
64 | - <version>1.4.4</version> | ||
65 | - <executions> | ||
66 | - <execution> | ||
67 | - <configuration> | ||
68 | - <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass> | ||
69 | - </configuration> | ||
70 | - <goals> | ||
71 | - <goal>one-jar</goal> | ||
72 | - </goals> | ||
73 | - </execution> | ||
74 | - </executions> | ||
75 | - </plugin> | ||
76 | - </plugins> | ||
77 | - </build> | ||
78 | - <dependencies> | ||
79 | - <dependency> | ||
80 | - <groupId>log4j</groupId> | ||
81 | - <artifactId>log4j</artifactId> | ||
82 | - <version>1.2.17</version> | ||
83 | - </dependency> | ||
84 | - <dependency> | ||
85 | - <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
86 | - <artifactId>utils</artifactId> | ||
87 | - <version>1.0-SNAPSHOT</version> | ||
88 | - </dependency> | ||
89 | - <dependency> | ||
90 | - <groupId>pl.waw.ipipan.zil.nkjp</groupId> | ||
91 | - <artifactId>teiapi</artifactId> | ||
92 | - <version>1.0-SNAPSHOT</version> | ||
93 | - </dependency> | ||
94 | - <dependency> | ||
95 | - <groupId>junit</groupId> | ||
96 | - <artifactId>junit</artifactId> | ||
97 | - <version>4.11</version> | ||
98 | - </dependency> | ||
99 | - <dependency> | ||
100 | - <groupId>nz.ac.waikato.cms.weka</groupId> | ||
101 | - <artifactId>weka-stable</artifactId> | ||
102 | - <version>3.6.10</version> | ||
103 | - </dependency> | ||
104 | - </dependencies> | 2 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> |
3 | + <modelVersion>4.0.0</modelVersion> | ||
105 | 4 | ||
106 | - <repositories> | ||
107 | - <repository> | ||
108 | - <id>zil-maven-repo</id> | ||
109 | - <name>ZIL maven repository</name> | ||
110 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url> | ||
111 | - </repository> | ||
112 | - </repositories> | 5 | + <groupId>pl.waw.ipipan.zil.core</groupId> |
6 | + <artifactId>md</artifactId> | ||
7 | + <version>1.3</version> | ||
113 | 8 | ||
114 | - <pluginRepositories> | ||
115 | - <pluginRepository> | ||
116 | - <id>onejar-maven-plugin.googlecode.com</id> | ||
117 | - <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url> | ||
118 | - </pluginRepository> | ||
119 | - </pluginRepositories> | 9 | + <developers> |
10 | + <developer> | ||
11 | + <name>Mateusz Kopeć</name> | ||
12 | + <organization>ICS PAS</organization> | ||
13 | + <email>m.kopec@ipipan.waw.pl</email> | ||
14 | + </developer> | ||
15 | + </developers> | ||
120 | 16 | ||
121 | - <distributionManagement> | ||
122 | - <repository> | ||
123 | - <id>deployment</id> | ||
124 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | ||
125 | - </repository> | ||
126 | - <snapshotRepository> | ||
127 | - <id>deployment</id> | ||
128 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | ||
129 | - </snapshotRepository> | ||
130 | - </distributionManagement> | 17 | + <properties> |
18 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | ||
19 | + <java.version>1.8</java.version> | ||
20 | + | ||
21 | + <junit.version>4.12</junit.version> | ||
22 | + <slf4j.version>1.7.21</slf4j.version> | ||
23 | + </properties> | ||
24 | + | ||
25 | + <prerequisites> | ||
26 | + <maven>3.0.5</maven> | ||
27 | + </prerequisites> | ||
28 | + | ||
29 | + <build> | ||
30 | + <pluginManagement> | ||
31 | + <plugins> | ||
32 | + <plugin> | ||
33 | + <artifactId>maven-compiler-plugin</artifactId> | ||
34 | + <version>3.5.1</version> | ||
35 | + <configuration> | ||
36 | + <source>${java.version}</source> | ||
37 | + <target>${java.version}</target> | ||
38 | + </configuration> | ||
39 | + </plugin> | ||
40 | + <plugin> | ||
41 | + <artifactId>maven-clean-plugin</artifactId> | ||
42 | + <version>3.0.0</version> | ||
43 | + </plugin> | ||
44 | + <plugin> | ||
45 | + <artifactId>maven-install-plugin</artifactId> | ||
46 | + <version>2.5.2</version> | ||
47 | + </plugin> | ||
48 | + <plugin> | ||
49 | + <artifactId>maven-jar-plugin</artifactId> | ||
50 | + <version>3.0.2</version> | ||
51 | + </plugin> | ||
52 | + <plugin> | ||
53 | + <artifactId>maven-resources-plugin</artifactId> | ||
54 | + <version>3.0.1</version> | ||
55 | + </plugin> | ||
56 | + <plugin> | ||
57 | + <artifactId>maven-site-plugin</artifactId> | ||
58 | + <version>3.5.1</version> | ||
59 | + </plugin> | ||
60 | + <plugin> | ||
61 | + <artifactId>maven-surefire-plugin</artifactId> | ||
62 | + <version>2.19.1</version> | ||
63 | + </plugin> | ||
64 | + | ||
65 | + <plugin> | ||
66 | + <artifactId>maven-source-plugin</artifactId> | ||
67 | + <version>3.0.1</version> | ||
68 | + <executions> | ||
69 | + <execution> | ||
70 | + <id>attach-sources</id> | ||
71 | + <phase>deploy</phase> | ||
72 | + <goals> | ||
73 | + <goal>jar-no-fork</goal> | ||
74 | + </goals> | ||
75 | + </execution> | ||
76 | + </executions> | ||
77 | + </plugin> | ||
78 | + <plugin> | ||
79 | + <artifactId>maven-javadoc-plugin</artifactId> | ||
80 | + <version>2.10.4</version> | ||
81 | + <executions> | ||
82 | + <execution> | ||
83 | + <id>attach-javadocs</id> | ||
84 | + <phase>deploy</phase> | ||
85 | + <goals> | ||
86 | + <goal>jar</goal> | ||
87 | + </goals> | ||
88 | + </execution> | ||
89 | + </executions> | ||
90 | + </plugin> | ||
91 | + <plugin> | ||
92 | + <!-- explicitly define maven-deploy-plugin after other to force exec order --> | ||
93 | + <artifactId>maven-deploy-plugin</artifactId> | ||
94 | + <version>2.8.2</version> | ||
95 | + <executions> | ||
96 | + <execution> | ||
97 | + <id>deploy</id> | ||
98 | + <phase>deploy</phase> | ||
99 | + <goals> | ||
100 | + <goal>deploy</goal> | ||
101 | + </goals> | ||
102 | + </execution> | ||
103 | + </executions> | ||
104 | + </plugin> | ||
105 | + <plugin> | ||
106 | + <artifactId>maven-assembly-plugin</artifactId> | ||
107 | + <version>2.6</version> | ||
108 | + </plugin> | ||
109 | + </plugins> | ||
110 | + </pluginManagement> | ||
111 | + | ||
112 | + <plugins> | ||
113 | + <plugin> | ||
114 | + <artifactId>maven-assembly-plugin</artifactId> | ||
115 | + <configuration> | ||
116 | + <descriptorRefs> | ||
117 | + <descriptorRef>jar-with-dependencies</descriptorRef> | ||
118 | + </descriptorRefs> | ||
119 | + <archive> | ||
120 | + <manifest> | ||
121 | + <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass> | ||
122 | + </manifest> | ||
123 | + </archive> | ||
124 | + </configuration> | ||
125 | + <executions> | ||
126 | + <execution> | ||
127 | + <id>make-assembly</id> | ||
128 | + <phase>package</phase> | ||
129 | + <goals> | ||
130 | + <goal>single</goal> | ||
131 | + </goals> | ||
132 | + </execution> | ||
133 | + </executions> | ||
134 | + </plugin> | ||
135 | + </plugins> | ||
136 | + </build> | ||
137 | + | ||
138 | + <dependencies> | ||
139 | + <!-- internal --> | ||
140 | + <dependency> | ||
141 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | ||
142 | + <artifactId>utils</artifactId> | ||
143 | + <version>1.0</version> | ||
144 | + </dependency> | ||
145 | + <dependency> | ||
146 | + <groupId>pl.waw.ipipan.zil.nkjp</groupId> | ||
147 | + <artifactId>teiapi</artifactId> | ||
148 | + <version>1.0</version> | ||
149 | + </dependency> | ||
150 | + | ||
151 | + <!-- third party --> | ||
152 | + <dependency> | ||
153 | + <groupId>nz.ac.waikato.cms.weka</groupId> | ||
154 | + <artifactId>weka-stable</artifactId> | ||
155 | + <version>3.6.10</version> | ||
156 | + </dependency> | ||
157 | + | ||
158 | + <!-- logging --> | ||
159 | + <dependency> | ||
160 | + <groupId>org.slf4j</groupId> | ||
161 | + <artifactId>slf4j-api</artifactId> | ||
162 | + <version>1.7.21</version> | ||
163 | + </dependency> | ||
164 | + <dependency> | ||
165 | + <groupId>org.slf4j</groupId> | ||
166 | + <artifactId>slf4j-simple</artifactId> | ||
167 | + <version>1.7.21</version> | ||
168 | + <scope>runtime</scope> | ||
169 | + </dependency> | ||
170 | + | ||
171 | + <!-- test --> | ||
172 | + <dependency> | ||
173 | + <groupId>junit</groupId> | ||
174 | + <artifactId>junit</artifactId> | ||
175 | + <version>4.12</version> | ||
176 | + <scope>test</scope> | ||
177 | + </dependency> | ||
178 | + | ||
179 | + </dependencies> | ||
180 | + | ||
181 | + <repositories> | ||
182 | + <repository> | ||
183 | + <id>zil-maven-snapshot-repo</id> | ||
184 | + <name>ZIL maven snapshot repository</name> | ||
185 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | ||
186 | + </repository> | ||
187 | + <repository> | ||
188 | + <id>zil-maven-release-repo</id> | ||
189 | + <name>ZIL maven release repository</name> | ||
190 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | ||
191 | + </repository> | ||
192 | + <repository> | ||
193 | + <id>zil-maven-repo-3rdparty</id> | ||
194 | + <name>ZIL maven repository 3rdparty</name> | ||
195 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/thirdparty/</url> | ||
196 | + </repository> | ||
197 | + </repositories> | ||
198 | + | ||
199 | + <distributionManagement> | ||
200 | + <repository> | ||
201 | + <id>deployment</id> | ||
202 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | ||
203 | + </repository> | ||
204 | + <snapshotRepository> | ||
205 | + <id>deployment</id> | ||
206 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | ||
207 | + </snapshotRepository> | ||
208 | + </distributionManagement> | ||
131 | </project> | 209 | </project> |
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
1 | package pl.waw.ipipan.zil.core.md; | 1 | package pl.waw.ipipan.zil.core.md; |
2 | 2 | ||
3 | -import java.io.File; | ||
4 | -import java.io.FileInputStream; | ||
5 | -import java.io.IOException; | ||
6 | -import java.io.InputStream; | ||
7 | - | ||
8 | -import org.apache.log4j.Logger; | ||
9 | - | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
10 | import pl.waw.ipipan.zil.core.md.detection.Detector; | 5 | import pl.waw.ipipan.zil.core.md.detection.Detector; |
11 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | 6 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
12 | import pl.waw.ipipan.zil.core.md.entities.Text; | 7 | import pl.waw.ipipan.zil.core.md.entities.Text; |
@@ -20,134 +15,128 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | @@ -20,134 +15,128 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
20 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | 15 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
21 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | 16 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
22 | 17 | ||
23 | -/** | ||
24 | - * @author Mateusz Kopeć | ||
25 | - * | ||
26 | - */ | 18 | +import java.io.File; |
19 | +import java.io.FileInputStream; | ||
20 | +import java.io.IOException; | ||
21 | +import java.io.InputStream; | ||
22 | + | ||
27 | public class Main { | 23 | public class Main { |
28 | 24 | ||
29 | - private final static Logger logger = Logger.getLogger(Main.class); | ||
30 | - private final static boolean GZIP_OUTPUT = true; | ||
31 | - | ||
32 | - private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | ||
33 | - | ||
34 | - private static ZeroSubjectDetector zeroSubjectModel; | ||
35 | - static { | ||
36 | - InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); | ||
37 | - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | ||
38 | - } | ||
39 | - | ||
40 | - /** | ||
41 | - * Main method for detecting mentions in corpus encoded in Tei format. | ||
42 | - * | ||
43 | - * @param args | ||
44 | - * arguments | ||
45 | - */ | ||
46 | - public static void main(String[] args) { | ||
47 | - | ||
48 | - if (args.length != 2 && args.length != 3) { | ||
49 | - logger.error("Wrong usage! should be: " + Main.class.getSimpleName() | ||
50 | - + " input_dir result_dir [zero_subject_model]"); | ||
51 | - return; | ||
52 | - } | ||
53 | - | ||
54 | - File inputDir = new File(args[0]); | ||
55 | - File outputDir = new File(args[1]); | ||
56 | - | ||
57 | - if (!inputDir.isDirectory()) { | ||
58 | - logger.error(inputDir + " is not a directory!"); | ||
59 | - return; | ||
60 | - } | ||
61 | - if (!outputDir.isDirectory()) { | ||
62 | - logger.error(outputDir + " is not a directory!"); | ||
63 | - return; | ||
64 | - } | ||
65 | - if (args.length == 3) { | ||
66 | - try { | ||
67 | - InputStream zeroSubjectDetectionModelStream; | ||
68 | - zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2])); | ||
69 | - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | ||
70 | - if (zeroSubjectModel == null) | ||
71 | - throw new IOException(); | ||
72 | - } catch (IOException e) { | ||
73 | - logger.error("Unable to load model from file: " + args[2] + ": " + e); | ||
74 | - return; | ||
75 | - } | ||
76 | - } | ||
77 | - | ||
78 | - int all = 0; | ||
79 | - int errors = 0; | ||
80 | - for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { | ||
81 | - all++; | ||
82 | - try { | ||
83 | - File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | ||
84 | - TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | ||
85 | - annotateTeiText(teiText); | ||
86 | - TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | ||
87 | - } catch (IOException e) { | ||
88 | - logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage()); | ||
89 | - errors++; | ||
90 | - } | ||
91 | - } | ||
92 | - | ||
93 | - logger.info(all + " texts processed succesfully."); | ||
94 | - if (errors > 0) | ||
95 | - logger.info(errors + " texts not processed."); | ||
96 | - logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected."); | ||
97 | - logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected."); | ||
98 | - } | ||
99 | - | ||
100 | - /** | ||
101 | - * Find relative path of text directory in the corpus directory and create | ||
102 | - * similar directory structure in the output corpus directory. | ||
103 | - * | ||
104 | - * @param inputCorpusDir | ||
105 | - * input corpus directory | ||
106 | - * @param outputCorpusDir | ||
107 | - * output corpus directory | ||
108 | - * @param textDir | ||
109 | - * input text dir | ||
110 | - * @return target text dir | ||
111 | - * @throws IOException | ||
112 | - * when an error occurs | ||
113 | - */ | ||
114 | - private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException { | ||
115 | - String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length()); | ||
116 | - File targetDir = new File(outputCorpusDir, relativeDirPath); | ||
117 | - targetDir.mkdirs(); | ||
118 | - if (!targetDir.exists() || !targetDir.isDirectory()) | ||
119 | - throw new IOException("Failed to create output directory at: " + targetDir); | ||
120 | - return targetDir; | ||
121 | - } | ||
122 | - | ||
123 | - /** | ||
124 | - * Find mentions in Thrift text and update this Thrift text with mention | ||
125 | - * annotation. | ||
126 | - * | ||
127 | - * @param thriftText | ||
128 | - * text to annotate with mentions | ||
129 | - * @throws MultiserviceException | ||
130 | - * when an error occures | ||
131 | - */ | ||
132 | - public static void annotateThriftText(TText thriftText) throws MultiserviceException { | ||
133 | - Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | ||
134 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | ||
135 | - ThriftSaver.updateThriftText(responseText, thriftText); | ||
136 | - } | ||
137 | - | ||
138 | - /** | ||
139 | - * Find mentions in Tei text and update this Tei text with mention | ||
140 | - * annotation. This method does not save this Tei text on disk. | ||
141 | - * | ||
142 | - * @param teiText | ||
143 | - * text to annotate with mentions | ||
144 | - * @throws TEIException | ||
145 | - * when an error occurs | ||
146 | - */ | ||
147 | - public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | ||
148 | - Text responseText = TeiLoader.loadTextFromTei(teiText); | ||
149 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | ||
150 | - TeiSaver.updateTeiText(responseText, teiText); | ||
151 | - } | 25 | + private static final Logger logger = LoggerFactory.getLogger(Main.class); |
26 | + | ||
27 | + private static final boolean GZIP_OUTPUT = true; | ||
28 | + private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | ||
29 | + | ||
30 | + private static ZeroSubjectDetector zeroSubjectModel; | ||
31 | + | ||
32 | + static { | ||
33 | + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); | ||
34 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | ||
35 | + } | ||
36 | + | ||
37 | + private Main() { | ||
38 | + } | ||
39 | + | ||
40 | + /** | ||
41 | + * Main method for detecting mentions in corpus encoded in Tei format. | ||
42 | + * | ||
43 | + * @param args arguments | ||
44 | + */ | ||
45 | + public static void main(String[] args) { | ||
46 | + | ||
47 | + if (args.length != 2 && args.length != 3) { | ||
48 | + logger.error("Wrong usage! should be: " + Main.class.getSimpleName() | ||
49 | + + " input_dir result_dir [zero_subject_model]"); | ||
50 | + return; | ||
51 | + } | ||
52 | + | ||
53 | + File inputDir = new File(args[0]); | ||
54 | + File outputDir = new File(args[1]); | ||
55 | + | ||
56 | + if (!inputDir.isDirectory()) { | ||
57 | + logger.error(inputDir + " is not a directory!"); | ||
58 | + return; | ||
59 | + } | ||
60 | + if (!outputDir.isDirectory()) { | ||
61 | + logger.error(outputDir + " is not a directory!"); | ||
62 | + return; | ||
63 | + } | ||
64 | + if (args.length == 3) { | ||
65 | + try { | ||
66 | + InputStream zeroSubjectDetectionModelStream; | ||
67 | + zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2])); | ||
68 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | ||
69 | + } catch (IOException e) { | ||
70 | + logger.error("Unable to load model from file: " + args[2] + ": " + e, e); | ||
71 | + return; | ||
72 | + } | ||
73 | + } | ||
74 | + | ||
75 | + int all = 0; | ||
76 | + int errors = 0; | ||
77 | + for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { | ||
78 | + all++; | ||
79 | + try { | ||
80 | + File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | ||
81 | + TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | ||
82 | + annotateTeiText(teiText); | ||
83 | + TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | ||
84 | + } catch (IOException e) { | ||
85 | + logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); | ||
86 | + errors++; | ||
87 | + } | ||
88 | + } | ||
89 | + | ||
90 | + logger.info(all + " texts processed succesfully."); | ||
91 | + if (errors > 0) | ||
92 | + logger.info(errors + " texts not processed."); | ||
93 | + logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected."); | ||
94 | + logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected."); | ||
95 | + } | ||
96 | + | ||
97 | + /** | ||
98 | + * Find relative path of text directory in the corpus directory and create | ||
99 | + * similar directory structure in the output corpus directory. | ||
100 | + * | ||
101 | + * @param inputCorpusDir input corpus directory | ||
102 | + * @param outputCorpusDir output corpus directory | ||
103 | + * @param textDir input text dir | ||
104 | + * @return target text dir | ||
105 | + * @throws IOException when an error occurs | ||
106 | + */ | ||
107 | + private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException { | ||
108 | + String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length()); | ||
109 | + File targetDir = new File(outputCorpusDir, relativeDirPath); | ||
110 | + targetDir.mkdirs(); | ||
111 | + if (!targetDir.exists() || !targetDir.isDirectory()) | ||
112 | + throw new IOException("Failed to create output directory at: " + targetDir); | ||
113 | + return targetDir; | ||
114 | + } | ||
115 | + | ||
116 | + /** | ||
117 | + * Find mentions in Thrift text and update this Thrift text with mention | ||
118 | + * annotation. | ||
119 | + * | ||
120 | + * @param thriftText text to annotate with mentions | ||
121 | + * @throws MultiserviceException when an error occurs | ||
122 | + */ | ||
123 | + public static void annotateThriftText(TText thriftText) throws MultiserviceException { | ||
124 | + Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | ||
125 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | ||
126 | + ThriftSaver.updateThriftText(responseText, thriftText); | ||
127 | + } | ||
128 | + | ||
129 | + /** | ||
130 | + * Find mentions in Tei text and update this Tei text with mention | ||
131 | + * annotation. This method does not save this Tei text on disk. | ||
132 | + * | ||
133 | + * @param teiText text to annotate with mentions | ||
134 | + * @throws TEIException when an error occurs | ||
135 | + */ | ||
136 | + public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | ||
137 | + Text responseText = TeiLoader.loadTextFromTei(teiText); | ||
138 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | ||
139 | + TeiSaver.updateTeiText(responseText, teiText); | ||
140 | + } | ||
152 | 141 | ||
153 | } | 142 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 | package pl.waw.ipipan.zil.core.md.detection; | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | ||
3 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
4 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Token; | ||
6 | + | ||
3 | import java.util.Collection; | 7 | import java.util.Collection; |
4 | import java.util.HashSet; | 8 | import java.util.HashSet; |
5 | import java.util.List; | 9 | import java.util.List; |
6 | import java.util.Set; | 10 | import java.util.Set; |
7 | 11 | ||
8 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
9 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
10 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
11 | - | ||
12 | public class Cleaner { | 12 | public class Cleaner { |
13 | - public static void cleanUnnecessarySentenceMentions(Sentence sentence) { | ||
14 | - List<Mention> mentions = sentence.getMentions(); | ||
15 | - Collection<Mention> unnecessaryMentions = new HashSet<Mention>(); | ||
16 | - | ||
17 | - for (int i = 0; i < mentions.size(); i++) { | ||
18 | - Mention m1 = mentions.get(i); | ||
19 | - for (int j = i + 1; j < mentions.size(); j++) { | ||
20 | - Mention m2 = mentions.get(j); | ||
21 | - | ||
22 | - Mention lessImportantMention = getLessImportantMention(m1, m2); | ||
23 | - Mention moreImportantMention = m1 == lessImportantMention ? m2 | ||
24 | - : m1; | ||
25 | - | ||
26 | - // same mention borders | ||
27 | - if (m1.getSegments().equals(m2.getSegments())) { | ||
28 | - unnecessaryMentions.add(lessImportantMention); | ||
29 | - // System.out.println("Same borders: "+ m1 +", "+ | ||
30 | - // m2+": "+getLessImportantMention(m1, m2)+" removed"); | ||
31 | - continue; | ||
32 | - } | ||
33 | - // same mention heads | ||
34 | - if (!m1.getHeadSegments().isEmpty() | ||
35 | - && !m2.getHeadSegments().isEmpty()) { | ||
36 | - if (m1.getHeadSegments().equals(m2.getHeadSegments())) { | ||
37 | - | ||
38 | - List<Token> segments = moreImportantMention | ||
39 | - .getSegments(); | ||
40 | - | ||
41 | - boolean isConj = false; | ||
42 | - for (Token seg : segments) { | ||
43 | - if (seg.getChosenInterpretation().getCtag() | ||
44 | - .equals("conj")) { | ||
45 | - isConj = true; | ||
46 | - break; | ||
47 | - } | ||
48 | - } | ||
49 | - | ||
50 | - if (!isConj) { | ||
51 | - unnecessaryMentions.add(lessImportantMention); | ||
52 | - // System.out.println("Same heads: " + m1 + ", " + | ||
53 | - // m2 + ": " + lessImportantMention | ||
54 | - // + " removed"); | ||
55 | - | ||
56 | - continue; | ||
57 | - } | ||
58 | - } | ||
59 | - } | ||
60 | - | ||
61 | - // mention head equals whole other mention | ||
62 | - if (m1.getHeadSegments().isEmpty() | ||
63 | - && !m2.getHeadSegments().isEmpty()) { | ||
64 | - if (m2.getHeadSegments().equals(m1.getSegments())) { | ||
65 | - unnecessaryMentions.add(lessImportantMention); | ||
66 | - continue; | ||
67 | - // System.out.println("head is other mention: " + m1 + | ||
68 | - // ", " + m2 + ": " | ||
69 | - // + getLessImportantMention(m1, m2) + " removed"); | ||
70 | - } | ||
71 | - } | ||
72 | - | ||
73 | - // the same, but other way round | ||
74 | - if (m2.getHeadSegments().isEmpty() | ||
75 | - && !m1.getHeadSegments().isEmpty()) { | ||
76 | - | ||
77 | - if (m1.getHeadSegments().equals(m2.getSegments())) { | ||
78 | - unnecessaryMentions.add(lessImportantMention); | ||
79 | - continue; | ||
80 | - // System.out.println("head is other mention: " + m1 + | ||
81 | - // ", " + m2 + ": " | ||
82 | - // + getLessImportantMention(m1, m2) + " removed"); | ||
83 | - } | ||
84 | - } | ||
85 | - | ||
86 | - // nie zawieraja sie w sobie, lecz maja czesc wspolna | ||
87 | - boolean intersect = false; | ||
88 | - | ||
89 | - Set<Token> notInM1 = new HashSet<Token>(m2.getSegments()); | ||
90 | - notInM1.removeAll(m1.getSegments()); | ||
91 | - if (notInM1.size() < m2.getSegments().size()) | ||
92 | - intersect = true; | ||
93 | - | ||
94 | - Set<Token> notInM2 = new HashSet<Token>(m1.getSegments()); | ||
95 | - notInM2.removeAll(m2.getSegments()); | ||
96 | - if (notInM2.size() < m1.getSegments().size()) | ||
97 | - intersect = true; | ||
98 | - | ||
99 | - // if (intersect) | ||
100 | - // System.out.println(m1+","+m2); | ||
101 | - | ||
102 | - if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) { | ||
103 | - unnecessaryMentions.add(lessImportantMention); | ||
104 | - continue; | ||
105 | - // System.out.println("intersection!" + m1 + ", " + m2 + | ||
106 | - // ": " | ||
107 | - // + getLessImportantMention(m1, m2) + " removed"); | ||
108 | - } | ||
109 | - | ||
110 | - } | ||
111 | - } | ||
112 | - | ||
113 | - for (Mention m : unnecessaryMentions) | ||
114 | - sentence.removeMention(m); | ||
115 | - | ||
116 | - // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]] | ||
117 | - unnecessaryMentions.clear(); | ||
118 | - | ||
119 | - OUTER: for (Mention m : sentence.getMentions()) { | ||
120 | - for (Token seg : m.getSegments()) | ||
121 | - if (seg.getOrth().toLowerCase().equals(seg.getOrth())) | ||
122 | - continue OUTER; | ||
123 | - | ||
124 | - //only for children of fully capitalized mentions | ||
125 | - Set<Mention> allMentions = new HashSet<Mention>(); | ||
126 | - for (Token seg : m.getSegments()) | ||
127 | - for (Mention m2 : seg.getMentions()) | ||
128 | - if (m.getSegments().containsAll(m2.getSegments())) | ||
129 | - allMentions.add(m2); | ||
130 | - | ||
131 | - allMentions.remove(m); | ||
132 | - | ||
133 | - unnecessaryMentions.addAll(allMentions); | ||
134 | - } | ||
135 | - for (Mention m : unnecessaryMentions) | ||
136 | - sentence.removeMention(m); | ||
137 | - } | ||
138 | - | ||
139 | - private static Mention getLessImportantMention(Mention m1, Mention m2) { | ||
140 | - if (m1.getSegments().size() > m2.getSegments().size()) | ||
141 | - return m2; | ||
142 | - else | ||
143 | - return m1; | ||
144 | - } | 13 | + public static void cleanUnnecessarySentenceMentions(Sentence sentence) { |
14 | + List<Mention> mentions = sentence.getMentions(); | ||
15 | + Collection<Mention> unnecessaryMentions = new HashSet<>(); | ||
16 | + | ||
17 | + for (int i = 0; i < mentions.size(); i++) { | ||
18 | + Mention m1 = mentions.get(i); | ||
19 | + for (int j = i + 1; j < mentions.size(); j++) { | ||
20 | + Mention m2 = mentions.get(j); | ||
21 | + | ||
22 | + Mention lessImportantMention = getLessImportantMention(m1, m2); | ||
23 | + Mention moreImportantMention = m1 == lessImportantMention ? m2 | ||
24 | + : m1; | ||
25 | + | ||
26 | + // same mention borders | ||
27 | + if (m1.getSegments().equals(m2.getSegments())) { | ||
28 | + unnecessaryMentions.add(lessImportantMention); | ||
29 | + continue; | ||
30 | + } | ||
31 | + // same mention heads | ||
32 | + if (!m1.getHeadSegments().isEmpty() | ||
33 | + && !m2.getHeadSegments().isEmpty()) { | ||
34 | + if (m1.getHeadSegments().equals(m2.getHeadSegments())) { | ||
35 | + | ||
36 | + List<Token> segments = moreImportantMention | ||
37 | + .getSegments(); | ||
38 | + | ||
39 | + boolean isConj = false; | ||
40 | + for (Token seg : segments) { | ||
41 | + if (seg.getChosenInterpretation().getCtag() | ||
42 | + .equals("conj")) { | ||
43 | + isConj = true; | ||
44 | + break; | ||
45 | + } | ||
46 | + } | ||
47 | + | ||
48 | + if (!isConj) { | ||
49 | + unnecessaryMentions.add(lessImportantMention); | ||
50 | + continue; | ||
51 | + } | ||
52 | + } | ||
53 | + } | ||
54 | + | ||
55 | + // mention head equals whole other mention | ||
56 | + if (m1.getHeadSegments().isEmpty() | ||
57 | + && !m2.getHeadSegments().isEmpty()) { | ||
58 | + if (m2.getHeadSegments().equals(m1.getSegments())) { | ||
59 | + unnecessaryMentions.add(lessImportantMention); | ||
60 | + continue; | ||
61 | + } | ||
62 | + } | ||
63 | + | ||
64 | + // the same, but other way round | ||
65 | + if (m2.getHeadSegments().isEmpty() | ||
66 | + && !m1.getHeadSegments().isEmpty()) { | ||
67 | + | ||
68 | + if (m1.getHeadSegments().equals(m2.getSegments())) { | ||
69 | + unnecessaryMentions.add(lessImportantMention); | ||
70 | + continue; | ||
71 | + } | ||
72 | + } | ||
73 | + | ||
74 | + // nie zawieraja sie w sobie, lecz maja czesc wspolna | ||
75 | + boolean intersect = false; | ||
76 | + | ||
77 | + Set<Token> notInM1 = new HashSet<>(m2.getSegments()); | ||
78 | + notInM1.removeAll(m1.getSegments()); | ||
79 | + if (notInM1.size() < m2.getSegments().size()) | ||
80 | + intersect = true; | ||
81 | + | ||
82 | + Set<Token> notInM2 = new HashSet<>(m1.getSegments()); | ||
83 | + notInM2.removeAll(m2.getSegments()); | ||
84 | + if (notInM2.size() < m1.getSegments().size()) | ||
85 | + intersect = true; | ||
86 | + | ||
87 | + if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) { | ||
88 | + unnecessaryMentions.add(lessImportantMention); | ||
89 | + continue; | ||
90 | + } | ||
91 | + | ||
92 | + } | ||
93 | + } | ||
94 | + | ||
95 | + for (Mention m : unnecessaryMentions) | ||
96 | + sentence.removeMention(m); | ||
97 | + | ||
98 | + // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]] | ||
99 | + unnecessaryMentions.clear(); | ||
100 | + | ||
101 | + OUTER: | ||
102 | + for (Mention m : sentence.getMentions()) { | ||
103 | + for (Token seg : m.getSegments()) | ||
104 | + if (seg.getOrth().toLowerCase().equals(seg.getOrth())) | ||
105 | + continue OUTER; | ||
106 | + | ||
107 | + //only for children of fully capitalized mentions | ||
108 | + Set<Mention> allMentions = new HashSet<>(); | ||
109 | + for (Token seg : m.getSegments()) | ||
110 | + for (Mention m2 : seg.getMentions()) | ||
111 | + if (m.getSegments().containsAll(m2.getSegments())) | ||
112 | + allMentions.add(m2); | ||
113 | + | ||
114 | + allMentions.remove(m); | ||
115 | + | ||
116 | + unnecessaryMentions.addAll(allMentions); | ||
117 | + } | ||
118 | + for (Mention m : unnecessaryMentions) | ||
119 | + sentence.removeMention(m); | ||
120 | + } | ||
121 | + | ||
122 | + private static Mention getLessImportantMention(Mention m1, Mention m2) { | ||
123 | + if (m1.getSegments().size() > m2.getSegments().size()) | ||
124 | + return m2; | ||
125 | + else | ||
126 | + return m1; | ||
127 | + } | ||
145 | } | 128 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 | package pl.waw.ipipan.zil.core.md.detection; | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | ||
3 | public class Constants { | 3 | public class Constants { |
4 | - public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger"; | ||
5 | - public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt"; | ||
6 | - public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12"; | ||
7 | - public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" | ||
8 | - + MORPHO_PRONOUN_CTAGS; | ||
9 | - public static final String WORDS_CTAGS = "Noun|Ppron.*"; | 4 | + public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger"; |
5 | + public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt"; | ||
6 | + public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12"; | ||
7 | + public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" | ||
8 | + + MORPHO_PRONOUN_CTAGS; | ||
9 | + public static final String WORDS_CTAGS = "Noun|Ppron.*"; | ||
10 | + | ||
11 | + private Constants() { | ||
12 | + } | ||
10 | } | 13 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
1 | package pl.waw.ipipan.zil.core.md.detection; | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | ||
6 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
7 | + | ||
3 | import java.util.ArrayList; | 8 | import java.util.ArrayList; |
4 | import java.util.HashSet; | 9 | import java.util.HashSet; |
5 | import java.util.List; | 10 | import java.util.List; |
6 | import java.util.Set; | 11 | import java.util.Set; |
7 | 12 | ||
8 | -import org.apache.log4j.Logger; | ||
9 | - | ||
10 | -import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | ||
11 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
12 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | ||
17 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
18 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
19 | - | ||
20 | public class Detector { | 13 | public class Detector { |
21 | - private static Logger logger = Logger.getLogger(Detector.class); | ||
22 | - | ||
23 | - public static void findMentionsInText(Text text, | ||
24 | - ZeroSubjectDetector zeroSubjectModel) { | ||
25 | - text.clearMentions(); | ||
26 | - logger.debug("Detecting mentions in text " + text.getId()); | ||
27 | - for (Paragraph p : text) | ||
28 | - for (Sentence s : p) | ||
29 | - detectMentionsInSentence(s, zeroSubjectModel); | ||
30 | - } | ||
31 | - | ||
32 | - private static void detectMentionsInSentence(Sentence sentence, | ||
33 | - ZeroSubjectDetector zeroSubjectModel) { | ||
34 | - // adding mentions | ||
35 | - addMentionsByTokenCtag(sentence); | ||
36 | - addMentionsBySyntacticWordsCtag(sentence); | ||
37 | - addMentionsByNamedEntities(sentence); | ||
38 | - addMentionsByGroups(sentence); | ||
39 | - addSpeakerMentionsInSpoken(sentence); | ||
40 | - | ||
41 | - // zero subject detection | ||
42 | - zeroSubjectModel.addZeroSubjectMentions(sentence); | ||
43 | - | ||
44 | - // removing mentions | ||
45 | - removeTo(sentence); | ||
46 | - Cleaner.cleanUnnecessarySentenceMentions(sentence); | ||
47 | - | ||
48 | - // updating mention heads | ||
49 | - updateMentionHeads(sentence); | ||
50 | - } | ||
51 | - | ||
52 | - /** | ||
53 | - * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak | ||
54 | - * | ||
55 | - * @param sentence | ||
56 | - */ | ||
57 | - private static void updateMentionHeads(Sentence sentence) { | ||
58 | - for (Mention m : sentence.getMentions()) | ||
59 | - if (m.getHeadSegments().isEmpty()) | ||
60 | - m.addHeadSegment(m.getFirstSegment()); | ||
61 | - } | ||
62 | - | ||
63 | - /** | ||
64 | - * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro"" | ||
65 | - * | ||
66 | - * @param sentence | ||
67 | - */ | ||
68 | - private static void removeTo(Sentence sentence) { | ||
69 | - Set<String> orths = new HashSet<String>(); | ||
70 | - for (Token morph : sentence) | ||
71 | - orths.add(morph.getOrth()); | ||
72 | - | ||
73 | - if (orths.contains("jeśli") || orths.contains("jeżeli") | ||
74 | - || orths.contains("skoro")) { | ||
75 | - for (Mention mention : sentence.getMentions()) { | ||
76 | - List<Token> mentSegs = mention.getSegments(); | ||
77 | - if (mentSegs.size() == 1 | ||
78 | - && mentSegs.get(0).getBase().equals("to")) { | ||
79 | - sentence.removeMention(mention); | ||
80 | - } | ||
81 | - } | ||
82 | - } | ||
83 | - } | ||
84 | - | ||
85 | - private static void addSpeakerMentionsInSpoken(Sentence sentence) { | ||
86 | - // heurystyka dla sp1:, sp2:, MarszałekJAkistam: | ||
87 | - if (sentence.size() > 2) { | ||
88 | - Token first = sentence.get(0); | ||
89 | - Token second = sentence.get(1); | ||
90 | - if (second.getOrth().equals(":")) { | ||
91 | - sentence.addMention(new Mention(first)); | ||
92 | - } | ||
93 | - } | ||
94 | - } | ||
95 | - | ||
96 | - /** | ||
97 | - * Wyszukuję i oznaczam wszystkie NG* | ||
98 | - * | ||
99 | - * @param sentence | ||
100 | - */ | ||
101 | - private static void addMentionsByGroups(Sentence sentence) { | ||
102 | - for (SyntacticGroup group : sentence.getGroups()) { | ||
103 | - if (group.getType().startsWith("NG")) { | ||
104 | - List<Token> segments = group.getTokens(); | ||
105 | - List<Token> heads = group.getSemanticHeadTokens(); | ||
106 | - | ||
107 | - sentence.addMention(new Mention(segments, heads)); | ||
108 | - } | ||
109 | - } | ||
110 | - } | ||
111 | - | ||
112 | - /** | ||
113 | - * Wyszukuję i oznaczam wszystkie NER | ||
114 | - * | ||
115 | - * @param sentence | ||
116 | - */ | ||
117 | - private static void addMentionsByNamedEntities(Sentence sentence) { | ||
118 | - for (NamedEntity ne : sentence.getNamedEntities()) { | ||
119 | - | ||
120 | - List<Token> headTokens = new ArrayList<Token>(); | ||
121 | - List<Token> tokens = ne.getTokens(); | ||
122 | - | ||
123 | - boolean containsNoun = false; | ||
124 | - for (Token seg : tokens) { | ||
125 | - if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) { | ||
126 | - containsNoun = true; | ||
127 | - break; | ||
128 | - } | ||
129 | - } | ||
130 | - if (!containsNoun) | ||
131 | - continue; | ||
132 | - | ||
133 | - sentence.addMention(new Mention(tokens, headTokens)); | ||
134 | - } | ||
135 | - } | ||
136 | - | ||
137 | - /** | ||
138 | - * @param sentence | ||
139 | - */ | ||
140 | - private static void addMentionsBySyntacticWordsCtag(Sentence sentence) { | ||
141 | - for (SyntacticWord w : sentence.getSyntacticWords()) | ||
142 | - if (w.getCtag().matches(Constants.WORDS_CTAGS)) { | ||
143 | - List<Token> tokens = w.getTokens(); | ||
144 | - if (tokens.size() == 1) { | ||
145 | - sentence.addMention(new Mention(tokens.get(0))); | ||
146 | - } else { | ||
147 | - List<Token> heads = new ArrayList<Token>(); | ||
148 | - sentence.addMention(new Mention(tokens, heads)); | ||
149 | - } | ||
150 | - } | ||
151 | - } | ||
152 | - | ||
153 | - /** | ||
154 | - * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow | ||
155 | - * skladniowych, to korzystam z niego zamiast morfoskladni | ||
156 | - * | ||
157 | - * @param sentence | ||
158 | - */ | ||
159 | - private static void addMentionsByTokenCtag(Sentence sentence) { | ||
160 | - for (Token token : sentence) | ||
161 | - if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | ||
162 | - sentence.addMention(new Mention(token)); | ||
163 | - } | 14 | + |
15 | + private static final Logger logger = LoggerFactory.getLogger(Detector.class); | ||
16 | + | ||
17 | + private Detector() { | ||
18 | + } | ||
19 | + | ||
20 | + public static void findMentionsInText(Text text, | ||
21 | + ZeroSubjectDetector zeroSubjectModel) { | ||
22 | + text.clearMentions(); | ||
23 | + logger.debug("Detecting mentions in text " + text.getId()); | ||
24 | + for (Paragraph p : text) | ||
25 | + for (Sentence s : p) | ||
26 | + detectMentionsInSentence(s, zeroSubjectModel); | ||
27 | + } | ||
28 | + | ||
29 | + private static void detectMentionsInSentence(Sentence sentence, | ||
30 | + ZeroSubjectDetector zeroSubjectModel) { | ||
31 | + // adding mentions | ||
32 | + addMentionsByTokenCtag(sentence); | ||
33 | + addMentionsBySyntacticWordsCtag(sentence); | ||
34 | + addMentionsByNamedEntities(sentence); | ||
35 | + addMentionsByGroups(sentence); | ||
36 | + addSpeakerMentionsInSpoken(sentence); | ||
37 | + | ||
38 | + // zero subject detection | ||
39 | + zeroSubjectModel.addZeroSubjectMentions(sentence); | ||
40 | + | ||
41 | + // removing mentions | ||
42 | + removeTo(sentence); | ||
43 | + Cleaner.cleanUnnecessarySentenceMentions(sentence); | ||
44 | + | ||
45 | + // updating mention heads | ||
46 | + updateMentionHeads(sentence); | ||
47 | + } | ||
48 | + | ||
49 | + /** | ||
50 | + * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak | ||
51 | + * | ||
52 | + * @param sentence | ||
53 | + */ | ||
54 | + private static void updateMentionHeads(Sentence sentence) { | ||
55 | + for (Mention m : sentence.getMentions()) | ||
56 | + if (m.getHeadSegments().isEmpty()) | ||
57 | + m.addHeadSegment(m.getFirstSegment()); | ||
58 | + } | ||
59 | + | ||
60 | + /** | ||
61 | + * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro"" | ||
62 | + * | ||
63 | + * @param sentence | ||
64 | + */ | ||
65 | + private static void removeTo(Sentence sentence) { | ||
66 | + Set<String> orths = new HashSet<>(); | ||
67 | + for (Token morph : sentence) | ||
68 | + orths.add(morph.getOrth()); | ||
69 | + | ||
70 | + if (orths.contains("jeśli") || orths.contains("jeżeli") | ||
71 | + || orths.contains("skoro")) { | ||
72 | + for (Mention mention : sentence.getMentions()) { | ||
73 | + List<Token> mentSegs = mention.getSegments(); | ||
74 | + if (mentSegs.size() == 1 | ||
75 | + && "to".equals(mentSegs.get(0).getBase())) { | ||
76 | + sentence.removeMention(mention); | ||
77 | + } | ||
78 | + } | ||
79 | + } | ||
80 | + } | ||
81 | + | ||
82 | + private static void addSpeakerMentionsInSpoken(Sentence sentence) { | ||
83 | + // heurystyka dla sp1:, sp2:, MarszałekJAkistam: | ||
84 | + if (sentence.size() > 2) { | ||
85 | + Token first = sentence.get(0); | ||
86 | + Token second = sentence.get(1); | ||
87 | + if (":".equals(second.getOrth())) { | ||
88 | + sentence.addMention(new Mention(first)); | ||
89 | + } | ||
90 | + } | ||
91 | + } | ||
92 | + | ||
93 | + /** | ||
94 | + * Wyszukuję i oznaczam wszystkie NG* | ||
95 | + * | ||
96 | + * @param sentence | ||
97 | + */ | ||
98 | + private static void addMentionsByGroups(Sentence sentence) { | ||
99 | + for (SyntacticGroup group : sentence.getGroups()) { | ||
100 | + if (group.getType().startsWith("NG")) { | ||
101 | + List<Token> segments = group.getTokens(); | ||
102 | + List<Token> heads = group.getSemanticHeadTokens(); | ||
103 | + | ||
104 | + sentence.addMention(new Mention(segments, heads)); | ||
105 | + } | ||
106 | + } | ||
107 | + } | ||
108 | + | ||
109 | + /** | ||
110 | + * Wyszukuję i oznaczam wszystkie NER | ||
111 | + * | ||
112 | + * @param sentence | ||
113 | + */ | ||
114 | + private static void addMentionsByNamedEntities(Sentence sentence) { | ||
115 | + for (NamedEntity ne : sentence.getNamedEntities()) { | ||
116 | + | ||
117 | + List<Token> headTokens = new ArrayList<>(); | ||
118 | + List<Token> tokens = ne.getTokens(); | ||
119 | + | ||
120 | + boolean containsNoun = false; | ||
121 | + for (Token seg : tokens) { | ||
122 | + if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) { | ||
123 | + containsNoun = true; | ||
124 | + break; | ||
125 | + } | ||
126 | + } | ||
127 | + if (!containsNoun) | ||
128 | + continue; | ||
129 | + | ||
130 | + sentence.addMention(new Mention(tokens, headTokens)); | ||
131 | + } | ||
132 | + } | ||
133 | + | ||
134 | + private static void addMentionsBySyntacticWordsCtag(Sentence sentence) { | ||
135 | + for (SyntacticWord w : sentence.getSyntacticWords()) | ||
136 | + if (w.getCtag().matches(Constants.WORDS_CTAGS)) { | ||
137 | + List<Token> tokens = w.getTokens(); | ||
138 | + if (tokens.size() == 1) { | ||
139 | + sentence.addMention(new Mention(tokens.get(0))); | ||
140 | + } else { | ||
141 | + List<Token> heads = new ArrayList<>(); | ||
142 | + sentence.addMention(new Mention(tokens, heads)); | ||
143 | + } | ||
144 | + } | ||
145 | + } | ||
146 | + | ||
147 | + /** | ||
148 | + * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow | ||
149 | + * skladniowych, to korzystam z niego zamiast morfoskladni | ||
150 | + * | ||
151 | + * @param sentence | ||
152 | + */ | ||
153 | + private static void addMentionsByTokenCtag(Sentence sentence) { | ||
154 | + for (Token token : sentence) | ||
155 | + if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | ||
156 | + sentence.addMention(new Mention(token)); | ||
157 | + } | ||
164 | } | 158 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.util.ArrayList; | ||
4 | -import java.util.Arrays; | ||
5 | -import java.util.HashMap; | ||
6 | -import java.util.HashSet; | ||
7 | -import java.util.Iterator; | ||
8 | -import java.util.LinkedList; | ||
9 | -import java.util.List; | ||
10 | -import java.util.Map; | ||
11 | -import java.util.Set; | ||
12 | - | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | ||
17 | -import pl.waw.ipipan.zil.core.md.entities.Token; | 3 | +import pl.waw.ipipan.zil.core.md.entities.*; |
18 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | 4 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; |
19 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | 5 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; |
20 | 6 | ||
7 | +import java.util.*; | ||
8 | + | ||
21 | public class FeatureGeneration { | 9 | public class FeatureGeneration { |
22 | final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo", | 10 | final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo", |
23 | "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" })); | 11 | "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" })); |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.io.File; | ||
4 | -import java.util.ArrayList; | ||
5 | -import java.util.HashSet; | ||
6 | -import java.util.List; | ||
7 | -import java.util.Map.Entry; | ||
8 | -import java.util.Set; | ||
9 | -import java.util.TreeMap; | ||
10 | -import java.util.TreeSet; | ||
11 | - | ||
12 | -import org.apache.log4j.Logger; | ||
13 | - | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
17 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
18 | -import pl.waw.ipipan.zil.core.md.entities.Token; | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
19 | import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | 6 | import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; |
20 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | 7 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; |
21 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
@@ -25,154 +12,161 @@ import weka.core.FastVector; | @@ -25,154 +12,161 @@ import weka.core.FastVector; | ||
25 | import weka.core.Instance; | 12 | import weka.core.Instance; |
26 | import weka.core.Instances; | 13 | import weka.core.Instances; |
27 | 14 | ||
15 | +import java.io.File; | ||
16 | +import java.util.*; | ||
17 | +import java.util.Map.Entry; | ||
18 | + | ||
28 | public class InstanceCreator { | 19 | public class InstanceCreator { |
29 | 20 | ||
30 | - final private static Logger logger = Logger.getLogger(InstanceCreator.class); | ||
31 | - final private static TEI_IO teiIO = TEI_IO.getInstance(); | ||
32 | - | ||
33 | - public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { | ||
34 | - int allTexts = 0; | ||
35 | - int exceptions = 0; | ||
36 | - int allSentences = 0; | ||
37 | - | ||
38 | - List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
39 | - for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | ||
40 | - try { | ||
41 | - allTexts++; | ||
42 | - logger.info("Processing text " + textDir); | ||
43 | - TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | ||
44 | - Text text = TeiLoader.loadTextFromTei(ct); | ||
45 | - | ||
46 | - for (Paragraph p : text) | ||
47 | - for (Sentence s : p) { | ||
48 | - allSentences++; | ||
49 | - loadExamplesFromSentence(quasiVerbs, examples, s); | ||
50 | - } | ||
51 | - | ||
52 | - } catch (Exception e) { | ||
53 | - logger.error(e.getLocalizedMessage()); | ||
54 | - exceptions++; | ||
55 | - } | ||
56 | - } | ||
57 | - | ||
58 | - logger.info(allTexts + " texts found."); | ||
59 | - if (exceptions != 0) | ||
60 | - logger.error(exceptions + " texts with exceptions."); | ||
61 | - logger.info(allSentences + " sentences found."); | ||
62 | - | ||
63 | - return examples; | ||
64 | - } | ||
65 | - | ||
66 | - public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, | ||
67 | - Sentence s) { | ||
68 | - | ||
69 | - // collect positive examples | ||
70 | - Set<Token> positive = new HashSet<>(); | ||
71 | - for (Mention m : s.getMentions()) { | ||
72 | - if (FeatureGeneration.isVerb(m)) { | ||
73 | - positive.addAll(m.getSegments()); | ||
74 | - } | ||
75 | - } | ||
76 | - | ||
77 | - for (Token m : s) { | ||
78 | - if (!FeatureGeneration.isVerb(m)) | ||
79 | - continue; | ||
80 | - | ||
81 | - TreeMap<String, Object> features = new TreeMap<>(); | ||
82 | - if (positive.contains(m)) { | ||
83 | - features.put("class", Boolean.valueOf(true)); | ||
84 | - } else { | ||
85 | - features.put("class", Boolean.valueOf(false)); | ||
86 | - } | ||
87 | - | ||
88 | - FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); | ||
89 | - examples.add(features); | ||
90 | - } | ||
91 | - } | ||
92 | - | ||
93 | - public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | ||
94 | - | ||
95 | - TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | ||
96 | - TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | ||
97 | - TreeMap<String, Set<String>> att2values = new TreeMap<>(); | ||
98 | - for (TreeMap<String, Object> example : examples) { | ||
99 | - for (Entry<String, Object> e : example.entrySet()) { | ||
100 | - String key = e.getKey(); | ||
101 | - Object val = e.getValue(); | ||
102 | - if (val instanceof Integer || val instanceof Double) { | ||
103 | - doubleAttsOccurred.add(key); | ||
104 | - continue; | ||
105 | - } | ||
106 | - if (val instanceof Boolean) { | ||
107 | - booleanAttsOccurred.add(key); | ||
108 | - continue; | ||
109 | - } | ||
110 | - if (!att2values.containsKey(key)) | ||
111 | - att2values.put(key, new HashSet<String>()); | ||
112 | - att2values.get(key).add(val.toString()); | ||
113 | - } | ||
114 | - } | ||
115 | - | ||
116 | - List<Attribute> atts = new ArrayList<>(); | ||
117 | - | ||
118 | - // double attributes | ||
119 | - for (String attName : doubleAttsOccurred) { | ||
120 | - Attribute att = new Attribute(attName); | ||
121 | - atts.add(att); | ||
122 | - } | ||
123 | - | ||
124 | - // boolean attributes (treated as nominal) | ||
125 | - FastVector values = new FastVector(2); | ||
126 | - values.addElement("false"); | ||
127 | - values.addElement("true"); | ||
128 | - for (String attName : booleanAttsOccurred) { | ||
129 | - Attribute att = new Attribute(attName, values); | ||
130 | - atts.add(att); | ||
131 | - } | ||
132 | - | ||
133 | - // nominal attributes | ||
134 | - for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | ||
135 | - FastVector vals = new FastVector(attVals.getValue().size()); | ||
136 | - for (String val : attVals.getValue()) | ||
137 | - vals.addElement(val); | ||
138 | - Attribute att = new Attribute(attVals.getKey(), vals); | ||
139 | - atts.add(att); | ||
140 | - } | ||
141 | - | ||
142 | - FastVector fvWekaAttributes = new FastVector(atts.size()); | ||
143 | - for (Attribute attr : atts) { | ||
144 | - fvWekaAttributes.addElement(attr); | ||
145 | - } | ||
146 | - | ||
147 | - Instances data = new Instances("Zero", fvWekaAttributes, 10); | ||
148 | - data.setClass(data.attribute(classFeatureName)); | ||
149 | - return data; | ||
150 | - } | ||
151 | - | ||
152 | - public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | ||
153 | - for (TreeMap<String, Object> example : examples) { | ||
154 | - Instance instance = new Instance(instances.numAttributes()); | ||
155 | - | ||
156 | - for (Entry<String, Object> e : example.entrySet()) { | ||
157 | - Object val = e.getValue(); | ||
158 | - String name = e.getKey(); | ||
159 | - if (val instanceof Integer) { | ||
160 | - instance.setValue(instances.attribute(name), (int) val); | ||
161 | - } else if (val instanceof Boolean) { | ||
162 | - instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | ||
163 | - } else { | ||
164 | - int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | ||
165 | - if (indexOfValue == -1) { | ||
166 | - logger.debug("Unkown value: " + val.toString() + " of feature: " + name | ||
167 | - + ". Marking as missing value."); | ||
168 | - instance.setMissing(instances.attribute(name)); | ||
169 | - } else | ||
170 | - instance.setValue(instances.attribute(name), indexOfValue); | ||
171 | - } | ||
172 | - } | ||
173 | - | ||
174 | - instance.setDataset(instances); | ||
175 | - instances.add(instance); | ||
176 | - } | ||
177 | - } | 21 | + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); |
22 | + private static final TEI_IO teiIO = TEI_IO.getInstance(); | ||
23 | + | ||
24 | + private InstanceCreator() { | ||
25 | + } | ||
26 | + | ||
27 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { | ||
28 | + int allTexts = 0; | ||
29 | + int exceptions = 0; | ||
30 | + int allSentences = 0; | ||
31 | + | ||
32 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
33 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | ||
34 | + try { | ||
35 | + allTexts++; | ||
36 | + logger.info("Processing text " + textDir); | ||
37 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | ||
38 | + Text text = TeiLoader.loadTextFromTei(ct); | ||
39 | + | ||
40 | + for (Paragraph p : text) | ||
41 | + for (Sentence s : p) { | ||
42 | + allSentences++; | ||
43 | + loadExamplesFromSentence(quasiVerbs, examples, s); | ||
44 | + } | ||
45 | + | ||
46 | + } catch (Exception e) { | ||
47 | + logger.error(e.getLocalizedMessage()); | ||
48 | + exceptions++; | ||
49 | + } | ||
50 | + } | ||
51 | + | ||
52 | + logger.info(allTexts + " texts found."); | ||
53 | + if (exceptions != 0) | ||
54 | + logger.error(exceptions + " texts with exceptions."); | ||
55 | + logger.info(allSentences + " sentences found."); | ||
56 | + | ||
57 | + return examples; | ||
58 | + } | ||
59 | + | ||
60 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, | ||
61 | + Sentence s) { | ||
62 | + | ||
63 | + // collect positive examples | ||
64 | + Set<Token> positive = new HashSet<>(); | ||
65 | + for (Mention m : s.getMentions()) { | ||
66 | + if (FeatureGeneration.isVerb(m)) { | ||
67 | + positive.addAll(m.getSegments()); | ||
68 | + } | ||
69 | + } | ||
70 | + | ||
71 | + for (Token m : s) { | ||
72 | + if (!FeatureGeneration.isVerb(m)) | ||
73 | + continue; | ||
74 | + | ||
75 | + TreeMap<String, Object> features = new TreeMap<>(); | ||
76 | + if (positive.contains(m)) { | ||
77 | + features.put("class", Boolean.valueOf(true)); | ||
78 | + } else { | ||
79 | + features.put("class", Boolean.valueOf(false)); | ||
80 | + } | ||
81 | + | ||
82 | + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); | ||
83 | + examples.add(features); | ||
84 | + } | ||
85 | + } | ||
86 | + | ||
87 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | ||
88 | + | ||
89 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | ||
90 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | ||
91 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | ||
92 | + for (TreeMap<String, Object> example : examples) { | ||
93 | + for (Entry<String, Object> e : example.entrySet()) { | ||
94 | + String key = e.getKey(); | ||
95 | + Object val = e.getValue(); | ||
96 | + if (val instanceof Integer || val instanceof Double) { | ||
97 | + doubleAttsOccurred.add(key); | ||
98 | + continue; | ||
99 | + } | ||
100 | + if (val instanceof Boolean) { | ||
101 | + booleanAttsOccurred.add(key); | ||
102 | + continue; | ||
103 | + } | ||
104 | + if (!att2values.containsKey(key)) | ||
105 | + att2values.put(key, new HashSet<>()); | ||
106 | + att2values.get(key).add(val.toString()); | ||
107 | + } | ||
108 | + } | ||
109 | + | ||
110 | + List<Attribute> atts = new ArrayList<>(); | ||
111 | + | ||
112 | + // double attributes | ||
113 | + for (String attName : doubleAttsOccurred) { | ||
114 | + Attribute att = new Attribute(attName); | ||
115 | + atts.add(att); | ||
116 | + } | ||
117 | + | ||
118 | + // boolean attributes (treated as nominal) | ||
119 | + FastVector values = new FastVector(2); | ||
120 | + values.addElement("false"); | ||
121 | + values.addElement("true"); | ||
122 | + for (String attName : booleanAttsOccurred) { | ||
123 | + Attribute att = new Attribute(attName, values); | ||
124 | + atts.add(att); | ||
125 | + } | ||
126 | + | ||
127 | + // nominal attributes | ||
128 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | ||
129 | + FastVector vals = new FastVector(attVals.getValue().size()); | ||
130 | + for (String val : attVals.getValue()) | ||
131 | + vals.addElement(val); | ||
132 | + Attribute att = new Attribute(attVals.getKey(), vals); | ||
133 | + atts.add(att); | ||
134 | + } | ||
135 | + | ||
136 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | ||
137 | + for (Attribute attr : atts) { | ||
138 | + fvWekaAttributes.addElement(attr); | ||
139 | + } | ||
140 | + | ||
141 | + Instances data = new Instances("Zero", fvWekaAttributes, 10); | ||
142 | + data.setClass(data.attribute(classFeatureName)); | ||
143 | + return data; | ||
144 | + } | ||
145 | + | ||
146 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | ||
147 | + for (TreeMap<String, Object> example : examples) { | ||
148 | + Instance instance = new Instance(instances.numAttributes()); | ||
149 | + | ||
150 | + for (Entry<String, Object> e : example.entrySet()) { | ||
151 | + Object val = e.getValue(); | ||
152 | + String name = e.getKey(); | ||
153 | + if (val instanceof Integer) { | ||
154 | + instance.setValue(instances.attribute(name), (int) val); | ||
155 | + } else if (val instanceof Boolean) { | ||
156 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | ||
157 | + } else { | ||
158 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | ||
159 | + if (indexOfValue == -1) { | ||
160 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | ||
161 | + + ". Marking as missing value."); | ||
162 | + instance.setMissing(instances.attribute(name)); | ||
163 | + } else | ||
164 | + instance.setValue(instances.attribute(name), indexOfValue); | ||
165 | + } | ||
166 | + } | ||
167 | + | ||
168 | + instance.setDataset(instances); | ||
169 | + instances.add(instance); | ||
170 | + } | ||
171 | + } | ||
178 | } | 172 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.io.Serializable; | ||
4 | -import java.util.List; | ||
5 | -import java.util.Set; | ||
6 | -import java.util.TreeMap; | ||
7 | - | ||
8 | -import org.apache.log4j.Logger; | ||
9 | - | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
10 | import pl.waw.ipipan.zil.core.md.entities.Sentence; | 5 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
11 | import weka.classifiers.Classifier; | 6 | import weka.classifiers.Classifier; |
12 | import weka.core.Instance; | 7 | import weka.core.Instance; |
13 | import weka.core.Instances; | 8 | import weka.core.Instances; |
14 | 9 | ||
10 | +import java.io.Serializable; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.TreeMap; | ||
14 | + | ||
15 | public class Model implements Serializable { | 15 | public class Model implements Serializable { |
16 | 16 | ||
17 | - private static final long serialVersionUID = 3351727361273283076L; | ||
18 | - private static final Logger logger = Logger.getLogger(Model.class); | ||
19 | - | ||
20 | - private Classifier classifier; | ||
21 | - private Set<String> quasiVerbs; | ||
22 | - private Instances instances; | ||
23 | - | ||
24 | - public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | ||
25 | - this.classifier = classifier; | ||
26 | - this.instances = instances; | ||
27 | - this.quasiVerbs = quasiVerbs; | ||
28 | - } | ||
29 | - | ||
30 | - public boolean isZeroSubject(Instance instance, Sentence sentence) { | ||
31 | - try { | ||
32 | - double response = this.classifier.classifyInstance(instance); | ||
33 | - return response > 0; | ||
34 | - } catch (Exception e) { | ||
35 | - logger.error("Error classyfing verb in sentence: " + sentence); | ||
36 | - return false; | ||
37 | - } | ||
38 | - } | ||
39 | - | ||
40 | - public Instances getInstances(List<TreeMap<String, Object>> examples) { | ||
41 | - Instances instances = new Instances(this.instances); | ||
42 | - InstanceCreator.fillInstances(examples, instances); | ||
43 | - return instances; | ||
44 | - } | ||
45 | - | ||
46 | - public Set<String> getQuasiVerbs() { | ||
47 | - return quasiVerbs; | ||
48 | - } | 17 | + private static final long serialVersionUID = 3351727361273283076L; |
18 | + private static final Logger logger = LoggerFactory.getLogger(Model.class); | ||
19 | + | ||
20 | + private Classifier classifier; | ||
21 | + private Set<String> quasiVerbs; | ||
22 | + private Instances instances; | ||
23 | + | ||
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | ||
25 | + this.classifier = classifier; | ||
26 | + this.instances = instances; | ||
27 | + this.quasiVerbs = quasiVerbs; | ||
28 | + } | ||
29 | + | ||
30 | + public boolean isZeroSubject(Instance instance, Sentence sentence) { | ||
31 | + try { | ||
32 | + double response = this.classifier.classifyInstance(instance); | ||
33 | + return response > 0; | ||
34 | + } catch (Exception e) { | ||
35 | + logger.error("Error classyfing verb in sentence: " + sentence, e); | ||
36 | + return false; | ||
37 | + } | ||
38 | + } | ||
39 | + | ||
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | ||
41 | + Instances instances = new Instances(this.instances); | ||
42 | + InstanceCreator.fillInstances(examples, instances); | ||
43 | + return instances; | ||
44 | + } | ||
45 | + | ||
46 | + public Set<String> getQuasiVerbs() { | ||
47 | + return quasiVerbs; | ||
48 | + } | ||
49 | } | 49 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.io.InputStream; | ||
4 | - | ||
5 | import weka.core.SerializationHelper; | 3 | import weka.core.SerializationHelper; |
6 | 4 | ||
5 | +import java.io.InputStream; | ||
6 | + | ||
7 | public class Serializer { | 7 | public class Serializer { |
8 | 8 | ||
9 | public static void saveModel(Model m, String targetModelFilePath) throws Exception { | 9 | public static void saveModel(Model m, String targetModelFilePath) throws Exception { |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.io.BufferedReader; | ||
4 | -import java.io.File; | ||
5 | -import java.io.IOException; | ||
6 | -import java.io.InputStream; | ||
7 | -import java.io.InputStreamReader; | ||
8 | -import java.util.HashSet; | ||
9 | -import java.util.List; | ||
10 | -import java.util.Random; | ||
11 | -import java.util.Set; | ||
12 | -import java.util.TreeMap; | ||
13 | - | ||
14 | -import org.apache.log4j.Logger; | ||
15 | - | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
16 | import weka.classifiers.Evaluation; | 5 | import weka.classifiers.Evaluation; |
17 | import weka.classifiers.rules.JRip; | 6 | import weka.classifiers.rules.JRip; |
18 | import weka.classifiers.rules.JRip.RipperRule; | 7 | import weka.classifiers.rules.JRip.RipperRule; |
@@ -20,104 +9,111 @@ import weka.core.Attribute; | @@ -20,104 +9,111 @@ import weka.core.Attribute; | ||
20 | import weka.core.Instance; | 9 | import weka.core.Instance; |
21 | import weka.core.Instances; | 10 | import weka.core.Instances; |
22 | 11 | ||
12 | +import java.io.*; | ||
13 | +import java.util.*; | ||
14 | + | ||
23 | public class Trainer { | 15 | public class Trainer { |
24 | 16 | ||
25 | - final private static Logger logger = Logger.getLogger(Trainer.class); | ||
26 | - | ||
27 | - private static final boolean DO_CV = false; | ||
28 | - private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | ||
29 | - | ||
30 | - public static void main(String[] args) { | ||
31 | - | ||
32 | - if (args.length != 2) { | ||
33 | - logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | ||
34 | - + " trainDir targetModelFile"); | ||
35 | - return; | ||
36 | - } | ||
37 | - | ||
38 | - File dataDir = new File(args[0]); | ||
39 | - String targetModelFilePath = args[1]; | ||
40 | - | ||
41 | - if (!dataDir.isDirectory()) { | ||
42 | - logger.error(dataDir + " is not a directory!"); | ||
43 | - return; | ||
44 | - } | ||
45 | - | ||
46 | - Set<String> quasiVerbs = loadQuasiVerbs(); | ||
47 | - | ||
48 | - List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | ||
49 | - Instances instances = InstanceCreator.createInstances(examples, "class"); | ||
50 | - InstanceCreator.fillInstances(examples, instances); | ||
51 | - | ||
52 | - printStats(instances); | ||
53 | - | ||
54 | - try { | ||
55 | - JRip model = new JRip(); | ||
56 | - | ||
57 | - if (DO_CV) { | ||
58 | - logger.info("Crossvalidation..."); | ||
59 | - Evaluation eval = new Evaluation(instances); | ||
60 | - eval.crossValidateModel(model, instances, 10, new Random(1)); | ||
61 | - logger.info(eval.toSummaryString()); | ||
62 | - logger.info(eval.toMatrixString()); | ||
63 | - logger.info(eval.toClassDetailsString()); | ||
64 | - } | ||
65 | - | ||
66 | - logger.info("Building final classifier..."); | ||
67 | - model = new JRip(); | ||
68 | - model.buildClassifier(instances); | ||
69 | - logger.info(model.getRuleset().size() + " rules generated."); | ||
70 | - for (int i = 0; i < model.getRuleset().size(); i++) { | ||
71 | - RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | ||
72 | - logger.info("\t" + v.toString(instances.classAttribute())); | ||
73 | - } | ||
74 | - | ||
75 | - instances.delete(); | ||
76 | - logger.info("Features stats:"); | ||
77 | - for (int i = 0; i < instances.numAttributes(); i++) { | ||
78 | - Attribute att = instances.attribute(i); | ||
79 | - logger.info(i + ".\t" + att.toString()); | ||
80 | - } | ||
81 | - | ||
82 | - logger.info("Saving classifier..."); | ||
83 | - Model m = new Model(model, instances, quasiVerbs); | ||
84 | - Serializer.saveModel(m, targetModelFilePath); | ||
85 | - logger.info("Done."); | ||
86 | - | ||
87 | - } catch (Exception e) { | ||
88 | - logger.error("Error: " + e); | ||
89 | - } | ||
90 | - } | ||
91 | - | ||
92 | - private static Set<String> loadQuasiVerbs() { | ||
93 | - Set<String> quasiVerbs = new HashSet<>(); | ||
94 | - InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
95 | - try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
96 | - String line = null; | ||
97 | - while ((line = br.readLine()) != null) { | ||
98 | - quasiVerbs.add(line.trim()); | ||
99 | - } | ||
100 | - } catch (IOException e) { | ||
101 | - logger.error(e.getLocalizedMessage()); | ||
102 | - } | ||
103 | - return quasiVerbs; | ||
104 | - } | ||
105 | - | ||
106 | - private static void printStats(Instances instances) { | ||
107 | - int positive = 0; | ||
108 | - int negative = 0; | ||
109 | - for (int i = 0; i < instances.numInstances(); i++) { | ||
110 | - Instance inst = instances.instance(i); | ||
111 | - if (inst.classValue() > 0) | ||
112 | - negative++; | ||
113 | - else | ||
114 | - positive++; | ||
115 | - } | ||
116 | - logger.info(positive + " positive examples"); | ||
117 | - logger.info(negative + " negative examples"); | ||
118 | - logger.info((positive + negative) + " examples total"); | ||
119 | - logger.info((instances.numAttributes() - 1) + " attributes"); | ||
120 | - logger.info(instances.toSummaryString()); | ||
121 | - } | 17 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); |
18 | + | ||
19 | + private static final boolean DO_CV = false; | ||
20 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | ||
21 | + | ||
22 | + private Trainer() { | ||
23 | + } | ||
24 | + | ||
25 | + public static void main(String[] args) { | ||
26 | + | ||
27 | + if (args.length != 2) { | ||
28 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | ||
29 | + + " trainDir targetModelFile"); | ||
30 | + return; | ||
31 | + } | ||
32 | + | ||
33 | + File dataDir = new File(args[0]); | ||
34 | + String targetModelFilePath = args[1]; | ||
35 | + | ||
36 | + if (!dataDir.isDirectory()) { | ||
37 | + logger.error(dataDir + " is not a directory!"); | ||
38 | + return; | ||
39 | + } | ||
40 | + | ||
41 | + Set<String> quasiVerbs = loadQuasiVerbs(); | ||
42 | + | ||
43 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | ||
44 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | ||
45 | + InstanceCreator.fillInstances(examples, instances); | ||
46 | + | ||
47 | + printStats(instances); | ||
48 | + | ||
49 | + try { | ||
50 | + JRip model; | ||
51 | + | ||
52 | + if (DO_CV) { | ||
53 | + logger.info("Crossvalidation..."); | ||
54 | + model = new JRip(); | ||
55 | + Evaluation eval = new Evaluation(instances); | ||
56 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | ||
57 | + logger.info(eval.toSummaryString()); | ||
58 | + logger.info(eval.toMatrixString()); | ||
59 | + logger.info(eval.toClassDetailsString()); | ||
60 | + } | ||
61 | + | ||
62 | + logger.info("Building final classifier..."); | ||
63 | + model = new JRip(); | ||
64 | + model.buildClassifier(instances); | ||
65 | + logger.info(model.getRuleset().size() + " rules generated."); | ||
66 | + for (int i = 0; i < model.getRuleset().size(); i++) { | ||
67 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | ||
68 | + logger.info("\t" + v.toString(instances.classAttribute())); | ||
69 | + } | ||
70 | + | ||
71 | + instances.delete(); | ||
72 | + logger.info("Features stats:"); | ||
73 | + for (int i = 0; i < instances.numAttributes(); i++) { | ||
74 | + Attribute att = instances.attribute(i); | ||
75 | + logger.info(i + ".\t" + att.toString()); | ||
76 | + } | ||
77 | + | ||
78 | + logger.info("Saving classifier..."); | ||
79 | + Model m = new Model(model, instances, quasiVerbs); | ||
80 | + Serializer.saveModel(m, targetModelFilePath); | ||
81 | + logger.info("Done."); | ||
82 | + | ||
83 | + } catch (Exception e) { | ||
84 | + logger.error("Error: " + e); | ||
85 | + } | ||
86 | + } | ||
87 | + | ||
88 | + private static Set<String> loadQuasiVerbs() { | ||
89 | + Set<String> quasiVerbs = new HashSet<>(); | ||
90 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
91 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
92 | + String line; | ||
93 | + while ((line = br.readLine()) != null) { | ||
94 | + quasiVerbs.add(line.trim()); | ||
95 | + } | ||
96 | + } catch (IOException e) { | ||
97 | + logger.error(e.getLocalizedMessage(), e); | ||
98 | + } | ||
99 | + return quasiVerbs; | ||
100 | + } | ||
101 | + | ||
102 | + private static void printStats(Instances instances) { | ||
103 | + int positive = 0; | ||
104 | + int negative = 0; | ||
105 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
106 | + Instance inst = instances.instance(i); | ||
107 | + if (inst.classValue() > 0) | ||
108 | + negative++; | ||
109 | + else | ||
110 | + positive++; | ||
111 | + } | ||
112 | + logger.info(positive + " positive examples"); | ||
113 | + logger.info(negative + " negative examples"); | ||
114 | + logger.info((positive + negative) + " examples total"); | ||
115 | + logger.info((instances.numAttributes() - 1) + " attributes"); | ||
116 | + logger.info(instances.toSummaryString()); | ||
117 | + } | ||
122 | 118 | ||
123 | } | 119 | } |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
1 | package pl.waw.ipipan.zil.core.md.detection.zero; | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | ||
3 | -import java.io.File; | ||
4 | -import java.io.InputStream; | ||
5 | -import java.util.ArrayList; | ||
6 | -import java.util.HashSet; | ||
7 | -import java.util.List; | ||
8 | -import java.util.Set; | ||
9 | -import java.util.TreeMap; | ||
10 | - | ||
11 | -import org.apache.log4j.Logger; | ||
12 | - | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
13 | import pl.waw.ipipan.zil.core.md.entities.Mention; | 5 | import pl.waw.ipipan.zil.core.md.entities.Mention; |
14 | import pl.waw.ipipan.zil.core.md.entities.Sentence; | 6 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
15 | import pl.waw.ipipan.zil.core.md.entities.Token; | 7 | import pl.waw.ipipan.zil.core.md.entities.Token; |
16 | import weka.core.Instances; | 8 | import weka.core.Instances; |
17 | 9 | ||
10 | +import java.io.File; | ||
11 | +import java.io.InputStream; | ||
12 | +import java.util.*; | ||
13 | + | ||
18 | public class ZeroSubjectDetector { | 14 | public class ZeroSubjectDetector { |
19 | - final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class); | ||
20 | 15 | ||
21 | - private Model model; | ||
22 | - private Set<String> quasiVerbs = new HashSet<>(); | 16 | + final private static Logger logger = LoggerFactory.getLogger(ZeroSubjectDetector.class); |
17 | + | ||
18 | + private Model model; | ||
19 | + private Set<String> quasiVerbs = new HashSet<>(); | ||
23 | 20 | ||
24 | - public static int verbsWithoutSubject = 0; | ||
25 | - public static int verbsWithSubject = 0; | 21 | + public static int verbsWithoutSubject = 0; |
22 | + public static int verbsWithSubject = 0; | ||
26 | 23 | ||
27 | - public void addZeroSubjectMentions(Sentence sentence) { | ||
28 | - List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
29 | - InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); | ||
30 | - if (examples.isEmpty()) | ||
31 | - return; | 24 | + public void addZeroSubjectMentions(Sentence sentence) { |
25 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
26 | + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); | ||
27 | + if (examples.isEmpty()) | ||
28 | + return; | ||
32 | 29 | ||
33 | - Instances instances = model.getInstances(examples); | 30 | + Instances instances = model.getInstances(examples); |
34 | 31 | ||
35 | - // label instances | ||
36 | - List<Boolean> areZeros = new ArrayList<>(); | ||
37 | - for (int i = 0; i < instances.numInstances(); i++) { | ||
38 | - boolean isZero = model.isZeroSubject(instances.instance(i), sentence); | ||
39 | - areZeros.add(isZero); | ||
40 | - if (isZero) | ||
41 | - verbsWithoutSubject++; | ||
42 | - else | ||
43 | - verbsWithSubject++; | ||
44 | - } | 32 | + // label instances |
33 | + List<Boolean> areZeros = new ArrayList<>(); | ||
34 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
35 | + boolean isZero = model.isZeroSubject(instances.instance(i), sentence); | ||
36 | + areZeros.add(isZero); | ||
37 | + if (isZero) | ||
38 | + verbsWithoutSubject++; | ||
39 | + else | ||
40 | + verbsWithSubject++; | ||
41 | + } | ||
45 | 42 | ||
46 | - int i = 0; | ||
47 | - for (Token m : sentence) { | ||
48 | - if (!FeatureGeneration.isVerb(m)) | ||
49 | - continue; | ||
50 | - if (areZeros.get(i)) | ||
51 | - sentence.addMention(new Mention(m, true)); | ||
52 | - i++; | ||
53 | - } | ||
54 | - } | 43 | + int i = 0; |
44 | + for (Token m : sentence) { | ||
45 | + if (!FeatureGeneration.isVerb(m)) | ||
46 | + continue; | ||
47 | + if (areZeros.get(i)) | ||
48 | + sentence.addMention(new Mention(m, true)); | ||
49 | + i++; | ||
50 | + } | ||
51 | + } | ||
55 | 52 | ||
56 | - public ZeroSubjectDetector(File zeroSubjectDetectionModel) { | ||
57 | - try { | ||
58 | - this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | ||
59 | - this.quasiVerbs = this.model.getQuasiVerbs(); | ||
60 | - } catch (Exception e) { | ||
61 | - logger.error("Error loading model:" + e); | ||
62 | - } | ||
63 | - } | 53 | + public ZeroSubjectDetector(File zeroSubjectDetectionModel) { |
54 | + try { | ||
55 | + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | ||
56 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
57 | + } catch (Exception e) { | ||
58 | + logger.error("Error loading model:" + e); | ||
59 | + } | ||
60 | + } | ||
64 | 61 | ||
65 | - public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) { | ||
66 | - try { | ||
67 | - this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | ||
68 | - this.quasiVerbs = this.model.getQuasiVerbs(); | ||
69 | - } catch (Exception e) { | ||
70 | - logger.error("Error loading model:" + e); | ||
71 | - } | ||
72 | - } | 62 | + public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) { |
63 | + try { | ||
64 | + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | ||
65 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
66 | + } catch (Exception e) { | ||
67 | + logger.error("Error loading model:" + e); | ||
68 | + } | ||
69 | + } | ||
73 | } | 70 | } |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
1 | package pl.waw.ipipan.zil.core.md.entities; | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | ||
3 | -import java.util.ArrayList; | ||
4 | -import java.util.Collection; | ||
5 | -import java.util.Collections; | ||
6 | -import java.util.HashSet; | ||
7 | -import java.util.List; | ||
8 | -import java.util.Set; | 3 | +import java.util.*; |
9 | 4 | ||
10 | public class Token implements Comparable<Token> { | 5 | public class Token implements Comparable<Token> { |
11 | private Sentence sentence; | 6 | private Sentence sentence; |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
1 | package pl.waw.ipipan.zil.core.md.io.tei; | 1 | package pl.waw.ipipan.zil.core.md.io.tei; |
2 | 2 | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*; | ||
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | ||
8 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
9 | + | ||
3 | import java.io.File; | 10 | import java.io.File; |
4 | import java.util.ArrayList; | 11 | import java.util.ArrayList; |
5 | import java.util.HashMap; | 12 | import java.util.HashMap; |
6 | import java.util.List; | 13 | import java.util.List; |
7 | import java.util.Map; | 14 | import java.util.Map; |
8 | 15 | ||
9 | -import org.apache.log4j.Logger; | ||
10 | - | ||
11 | -import pl.waw.ipipan.zil.core.md.entities.Interpretation; | ||
12 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | ||
17 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | ||
18 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
19 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
20 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
21 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIGroup; | ||
22 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIInterpretation; | ||
23 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
24 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
25 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEINamedEntity; | ||
26 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph; | ||
27 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence; | ||
28 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISyntacticEntity; | ||
29 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIWord; | ||
30 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | ||
31 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
32 | - | ||
33 | public class TeiLoader { | 16 | public class TeiLoader { |
34 | 17 | ||
35 | - private static Logger logger = Logger.getLogger(TeiLoader.class); | ||
36 | - private static TEI_IO teiAPI = TEI_IO.getInstance(); | ||
37 | - | ||
38 | - public static TEICorpusText readTeiText(File teiDir) throws TEIException { | ||
39 | - return teiAPI.readFromNKJPDirectory(teiDir); | ||
40 | - } | ||
41 | - | ||
42 | - public static Text loadTextFromTei(TEICorpusText teiText) { | ||
43 | - Text text = new Text(teiText.getCorpusHeader().getId()); | ||
44 | - | ||
45 | - logger.debug("Loading tei text " + text.getId() + "..."); | ||
46 | - for (TEIParagraph teiP : teiText.getParagraphs()) | ||
47 | - loadParagraph(text, teiP); | ||
48 | - logger.debug("Tei text loaded."); | ||
49 | - | ||
50 | - return text; | ||
51 | - } | ||
52 | - | ||
53 | - private static void loadParagraph(Text text, TEIParagraph teiP) { | ||
54 | - Paragraph p = new Paragraph(); | ||
55 | - text.add(p); | ||
56 | - for (TEISentence teiS : teiP.getSentences()) | ||
57 | - loadSentence(p, teiS); | ||
58 | - } | ||
59 | - | ||
60 | - private static void loadSentence(Paragraph p, TEISentence teiS) { | ||
61 | - Sentence s = new Sentence(); | ||
62 | - p.add(s); | ||
63 | - Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); | ||
64 | - for (TEIMorph teiM : teiS.getMorphs()) { | ||
65 | - Token token = loadToken(s, teiM); | ||
66 | - teiMorph2Segment.put(teiM, token); | ||
67 | - } | ||
68 | - for (TEINamedEntity ne : teiS.getAllNamedEntities()) | ||
69 | - loadNE(s, ne, teiMorph2Segment); | ||
70 | - for (TEIWord w : teiS.getAllWords()) | ||
71 | - loadSyntacticWord(s, w, teiMorph2Segment); | ||
72 | - for (TEIGroup g : teiS.getAllGroups()) | ||
73 | - loadSyntacticGroup(s, g, teiMorph2Segment); | ||
74 | - for (TEIMention m : teiS.getAllMentions()) | ||
75 | - loadMentions(s, m, teiMorph2Segment); | ||
76 | - } | ||
77 | - | ||
78 | - private static void loadMentions(Sentence s, TEIMention m, | ||
79 | - Map<TEIMorph, Token> teiMorph2Segment) { | ||
80 | - List<Token> tokens = new ArrayList<>(); | ||
81 | - for (TEIMorph mo : m.getMorphs()) | ||
82 | - tokens.add(teiMorph2Segment.get(mo)); | ||
83 | - List<Token> headTokens = new ArrayList<>(); | ||
84 | - for (TEIMorph mo : m.getHeadMorphs()) | ||
85 | - headTokens.add(teiMorph2Segment.get(mo)); | ||
86 | - s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | ||
87 | - } | ||
88 | - | ||
89 | - private static void loadSyntacticGroup(Sentence s, TEIGroup g, | ||
90 | - Map<TEIMorph, Token> teiMorph2Segment) { | ||
91 | - String type = g.getType(); | ||
92 | - | ||
93 | - List<Token> tokens = new ArrayList<>(); | ||
94 | - for (TEIMorph m : g.getLeaves()) | ||
95 | - tokens.add(teiMorph2Segment.get(m)); | ||
96 | - | ||
97 | - List<Token> headTokens = new ArrayList<>(); | ||
98 | - TEISyntacticEntity semanticHead = g; | ||
99 | - while (semanticHead.isGroup() | ||
100 | - && semanticHead.asGroup().getSemanticHead() != null) | ||
101 | - semanticHead = semanticHead.asGroup().getSemanticHead(); | ||
102 | - for (TEIMorph m : semanticHead.getLeaves()) | ||
103 | - headTokens.add(teiMorph2Segment.get(m)); | ||
104 | - | ||
105 | - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | ||
106 | - } | ||
107 | - | ||
108 | - private static void loadSyntacticWord(Sentence s, TEIWord w, | ||
109 | - Map<TEIMorph, Token> teiMorph2Segment) { | ||
110 | - String ctag = w.getInterpretation().getCtag(); | ||
111 | - List<Token> tokens = new ArrayList<>(); | ||
112 | - for (TEIMorph m : w.getAllMorphs()) | ||
113 | - tokens.add(teiMorph2Segment.get(m)); | ||
114 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | ||
115 | - } | ||
116 | - | ||
117 | - private static void loadNE(Sentence s, TEINamedEntity ne, | ||
118 | - Map<TEIMorph, Token> teiMorph2Segment) { | ||
119 | - List<Token> tokens = new ArrayList<>(); | ||
120 | - for (TEIMorph m : ne.getLeaves()) | ||
121 | - tokens.add(teiMorph2Segment.get(m)); | ||
122 | - s.addNamedEntity(new NamedEntity(tokens)); | ||
123 | - } | ||
124 | - | ||
125 | - private static Token loadToken(Sentence s, TEIMorph teiM) { | ||
126 | - Token seg = new Token(); | ||
127 | - s.add(seg); | ||
128 | - | ||
129 | - seg.setOrth(teiM.getOrth()); | ||
130 | - TEIInterpretation interp = teiM.getChosenInterpretation(); | ||
131 | - Interpretation chosenIterpretation = new Interpretation( | ||
132 | - interp.getCtag(), interp.getMorph(), interp.getBase()); | ||
133 | - seg.addChosenInterpretation(chosenIterpretation); | ||
134 | - | ||
135 | - for (TEIInterpretation interp2 : teiM.getAllInterpretations()) { | ||
136 | - Interpretation inter = new Interpretation(interp2.getCtag(), | ||
137 | - interp2.getMorph(), interp.getBase()); | ||
138 | - seg.addInterpretation(inter); | ||
139 | - } | ||
140 | - | ||
141 | - return seg; | ||
142 | - } | 18 | + private static Logger logger = LoggerFactory.getLogger(TeiLoader.class); |
19 | + private static TEI_IO teiAPI = TEI_IO.getInstance(); | ||
20 | + | ||
21 | + private TeiLoader() { | ||
22 | + } | ||
23 | + | ||
24 | + public static TEICorpusText readTeiText(File teiDir) throws TEIException { | ||
25 | + return teiAPI.readFromNKJPDirectory(teiDir); | ||
26 | + } | ||
27 | + | ||
28 | + public static Text loadTextFromTei(TEICorpusText teiText) { | ||
29 | + Text text = new Text(teiText.getCorpusHeader().getId()); | ||
30 | + | ||
31 | + logger.debug("Loading tei text " + text.getId() + "..."); | ||
32 | + for (TEIParagraph teiP : teiText.getParagraphs()) | ||
33 | + loadParagraph(text, teiP); | ||
34 | + logger.debug("Tei text loaded."); | ||
35 | + | ||
36 | + return text; | ||
37 | + } | ||
38 | + | ||
39 | + private static void loadParagraph(Text text, TEIParagraph teiP) { | ||
40 | + Paragraph p = new Paragraph(); | ||
41 | + text.add(p); | ||
42 | + for (TEISentence teiS : teiP.getSentences()) | ||
43 | + loadSentence(p, teiS); | ||
44 | + } | ||
45 | + | ||
46 | + private static void loadSentence(Paragraph p, TEISentence teiS) { | ||
47 | + Sentence s = new Sentence(); | ||
48 | + p.add(s); | ||
49 | + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); | ||
50 | + for (TEIMorph teiM : teiS.getMorphs()) { | ||
51 | + Token token = loadToken(s, teiM); | ||
52 | + teiMorph2Segment.put(teiM, token); | ||
53 | + } | ||
54 | + for (TEINamedEntity ne : teiS.getAllNamedEntities()) | ||
55 | + loadNE(s, ne, teiMorph2Segment); | ||
56 | + for (TEIWord w : teiS.getAllWords()) | ||
57 | + loadSyntacticWord(s, w, teiMorph2Segment); | ||
58 | + for (TEIGroup g : teiS.getAllGroups()) | ||
59 | + loadSyntacticGroup(s, g, teiMorph2Segment); | ||
60 | + for (TEIMention m : teiS.getAllMentions()) | ||
61 | + loadMentions(s, m, teiMorph2Segment); | ||
62 | + } | ||
63 | + | ||
64 | + private static void loadMentions(Sentence s, TEIMention m, | ||
65 | + Map<TEIMorph, Token> teiMorph2Segment) { | ||
66 | + List<Token> tokens = new ArrayList<>(); | ||
67 | + for (TEIMorph mo : m.getMorphs()) | ||
68 | + tokens.add(teiMorph2Segment.get(mo)); | ||
69 | + List<Token> headTokens = new ArrayList<>(); | ||
70 | + for (TEIMorph mo : m.getHeadMorphs()) | ||
71 | + headTokens.add(teiMorph2Segment.get(mo)); | ||
72 | + s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | ||
73 | + } | ||
74 | + | ||
75 | + private static void loadSyntacticGroup(Sentence s, TEIGroup g, | ||
76 | + Map<TEIMorph, Token> teiMorph2Segment) { | ||
77 | + String type = g.getType(); | ||
78 | + | ||
79 | + List<Token> tokens = new ArrayList<>(); | ||
80 | + for (TEIMorph m : g.getLeaves()) | ||
81 | + tokens.add(teiMorph2Segment.get(m)); | ||
82 | + | ||
83 | + List<Token> headTokens = new ArrayList<>(); | ||
84 | + TEISyntacticEntity semanticHead = g; | ||
85 | + while (semanticHead.isGroup() | ||
86 | + && semanticHead.asGroup().getSemanticHead() != null) | ||
87 | + semanticHead = semanticHead.asGroup().getSemanticHead(); | ||
88 | + for (TEIMorph m : semanticHead.getLeaves()) | ||
89 | + headTokens.add(teiMorph2Segment.get(m)); | ||
90 | + | ||
91 | + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | ||
92 | + } | ||
93 | + | ||
94 | + private static void loadSyntacticWord(Sentence s, TEIWord w, | ||
95 | + Map<TEIMorph, Token> teiMorph2Segment) { | ||
96 | + String ctag = w.getInterpretation().getCtag(); | ||
97 | + List<Token> tokens = new ArrayList<>(); | ||
98 | + for (TEIMorph m : w.getAllMorphs()) | ||
99 | + tokens.add(teiMorph2Segment.get(m)); | ||
100 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | ||
101 | + } | ||
102 | + | ||
103 | + private static void loadNE(Sentence s, TEINamedEntity ne, | ||
104 | + Map<TEIMorph, Token> teiMorph2Segment) { | ||
105 | + List<Token> tokens = new ArrayList<>(); | ||
106 | + for (TEIMorph m : ne.getLeaves()) | ||
107 | + tokens.add(teiMorph2Segment.get(m)); | ||
108 | + s.addNamedEntity(new NamedEntity(tokens)); | ||
109 | + } | ||
110 | + | ||
111 | + private static Token loadToken(Sentence s, TEIMorph teiM) { | ||
112 | + Token seg = new Token(); | ||
113 | + s.add(seg); | ||
114 | + | ||
115 | + seg.setOrth(teiM.getOrth()); | ||
116 | + TEIInterpretation interp = teiM.getChosenInterpretation(); | ||
117 | + Interpretation chosenIterpretation = new Interpretation( | ||
118 | + interp.getCtag(), interp.getMorph(), interp.getBase()); | ||
119 | + seg.addChosenInterpretation(chosenIterpretation); | ||
120 | + | ||
121 | + for (TEIInterpretation interp2 : teiM.getAllInterpretations()) { | ||
122 | + Interpretation inter = new Interpretation(interp2.getCtag(), | ||
123 | + interp2.getMorph(), interp.getBase()); | ||
124 | + seg.addInterpretation(inter); | ||
125 | + } | ||
126 | + | ||
127 | + return seg; | ||
128 | + } | ||
143 | 129 | ||
144 | } | 130 | } |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
1 | package pl.waw.ipipan.zil.core.md.io.tei; | 1 | package pl.waw.ipipan.zil.core.md.io.tei; |
2 | 2 | ||
3 | -import java.io.File; | ||
4 | -import java.util.ArrayList; | ||
5 | -import java.util.HashMap; | ||
6 | -import java.util.Iterator; | ||
7 | -import java.util.List; | ||
8 | -import java.util.Map; | ||
9 | - | ||
10 | -import org.apache.log4j.Logger; | ||
11 | - | ||
12 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
17 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.AnnotationLayer; | ||
18 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.EntitiesFactory; | ||
19 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICoreference; | ||
20 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
21 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
22 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
23 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph; | ||
24 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence; | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*; | ||
25 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | 7 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
26 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; |
27 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod; | 9 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod; |
28 | 10 | ||
11 | +import java.io.File; | ||
12 | +import java.util.*; | ||
13 | + | ||
29 | public class TeiSaver { | 14 | public class TeiSaver { |
30 | 15 | ||
31 | - private static Logger logger = Logger.getLogger(TeiSaver.class); | ||
32 | - private static TEI_IO teiAPI = TEI_IO.getInstance(); | ||
33 | - final private static EntitiesFactory ef = EntitiesFactory.getInstance(); | ||
34 | - | ||
35 | - public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException { | ||
36 | - logger.debug("Saving text in " + targetDir); | ||
37 | - CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE; | ||
38 | - teiAPI.writeToNKJPDirectory(teiText, targetDir, cm); | ||
39 | - } | ||
40 | - | ||
41 | - public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException { | ||
42 | - Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>(); | ||
43 | - | ||
44 | - Iterator<Paragraph> pIt = t.iterator(); | ||
45 | - Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator(); | ||
46 | - int mentionId = 0; | ||
47 | - while (pIt.hasNext() && pItTei.hasNext()) { | ||
48 | - Paragraph p = pIt.next(); | ||
49 | - TEIParagraph pTei = pItTei.next(); | ||
50 | - | ||
51 | - mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei); | ||
52 | - } | ||
53 | - checkIterators(pIt, pItTei, "paragraph"); | ||
54 | - | ||
55 | - teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, | ||
56 | - EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS)); | ||
57 | - | ||
58 | - // clear coreference as we have new mentions it became invalid | ||
59 | - teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE); | ||
60 | - teiText.setCoreferences(new ArrayList<TEICoreference>()); | ||
61 | - | ||
62 | - logger.debug(mentionId + " mentions added"); | ||
63 | - } | ||
64 | - | ||
65 | - private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p, | ||
66 | - TEIParagraph pTei) throws TEIException { | ||
67 | - Iterator<Sentence> sIt = p.iterator(); | ||
68 | - Iterator<TEISentence> sItTei = pTei.getSentences().iterator(); | ||
69 | - | ||
70 | - while (sIt.hasNext() && sItTei.hasNext()) { | ||
71 | - Sentence s = sIt.next(); | ||
72 | - TEISentence sTei = sItTei.next(); | ||
73 | - mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei); | ||
74 | - } | ||
75 | - checkIterators(sIt, sItTei, "sentence"); | ||
76 | - return mentionId; | ||
77 | - } | ||
78 | - | ||
79 | - private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s, | ||
80 | - TEISentence sTei) throws TEIException { | ||
81 | - sTei.getAllMentions().clear(); | ||
82 | - | ||
83 | - Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>(); | ||
84 | - | ||
85 | - Iterator<Token> segIt = s.iterator(); | ||
86 | - Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator(); | ||
87 | - | ||
88 | - while (segIt.hasNext() && segItTei.hasNext()) { | ||
89 | - seg2morph.put(segIt.next(), segItTei.next()); | ||
90 | - } | ||
91 | - checkIterators(segIt, segItTei, "token"); | ||
92 | - | ||
93 | - List<TEIMention> mentions = new ArrayList<TEIMention>(); | ||
94 | - | ||
95 | - for (Mention m : s.getMentions()) { | ||
96 | - List<TEIMorph> morphs = new ArrayList<TEIMorph>(); | ||
97 | - List<TEIMorph> heads = new ArrayList<TEIMorph>(); | ||
98 | - | ||
99 | - for (Token seg : m.getSegments()) | ||
100 | - morphs.add(seg2morph.get(seg)); | ||
101 | - | ||
102 | - for (Token seg : m.getHeadSegments()) | ||
103 | - heads.add(seg2morph.get(seg)); | ||
104 | - | ||
105 | - TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()); | ||
106 | - mentions.add(mention); | ||
107 | - mention2mention.put(m, mention); | ||
108 | - } | ||
109 | - sTei.setMentions(mentions); | ||
110 | - return mentionId; | ||
111 | - } | ||
112 | - | ||
113 | - private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level) | ||
114 | - throws TEIException { | ||
115 | - if (one.hasNext() || other.hasNext()) | ||
116 | - throw new TEIException("Problem mapping tei to thrift for level " + level); | ||
117 | - } | 16 | + private static final Logger logger = LoggerFactory.getLogger(TeiSaver.class); |
17 | + private static final TEI_IO teiAPI = TEI_IO.getInstance(); | ||
18 | + private static final EntitiesFactory ef = EntitiesFactory.getInstance(); | ||
19 | + | ||
20 | + private TeiSaver() { | ||
21 | + } | ||
22 | + | ||
23 | + public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException { | ||
24 | + logger.debug("Saving text in " + targetDir); | ||
25 | + CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE; | ||
26 | + teiAPI.writeToNKJPDirectory(teiText, targetDir, cm); | ||
27 | + } | ||
28 | + | ||
29 | + public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException { | ||
30 | + Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>(); | ||
31 | + | ||
32 | + Iterator<Paragraph> pIt = t.iterator(); | ||
33 | + Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator(); | ||
34 | + int mentionId = 0; | ||
35 | + while (pIt.hasNext() && pItTei.hasNext()) { | ||
36 | + Paragraph p = pIt.next(); | ||
37 | + TEIParagraph pTei = pItTei.next(); | ||
38 | + | ||
39 | + mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei); | ||
40 | + } | ||
41 | + checkIterators(pIt, pItTei, "paragraph"); | ||
42 | + | ||
43 | + teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, | ||
44 | + EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS)); | ||
45 | + | ||
46 | + // clear coreference as we have new mentions it became invalid | ||
47 | + teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE); | ||
48 | + teiText.setCoreferences(new ArrayList<TEICoreference>()); | ||
49 | + | ||
50 | + logger.debug(mentionId + " mentions added"); | ||
51 | + } | ||
52 | + | ||
53 | + private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p, | ||
54 | + TEIParagraph pTei) throws TEIException { | ||
55 | + Iterator<Sentence> sIt = p.iterator(); | ||
56 | + Iterator<TEISentence> sItTei = pTei.getSentences().iterator(); | ||
57 | + | ||
58 | + while (sIt.hasNext() && sItTei.hasNext()) { | ||
59 | + Sentence s = sIt.next(); | ||
60 | + TEISentence sTei = sItTei.next(); | ||
61 | + mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei); | ||
62 | + } | ||
63 | + checkIterators(sIt, sItTei, "sentence"); | ||
64 | + return mentionId; | ||
65 | + } | ||
66 | + | ||
67 | + private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s, | ||
68 | + TEISentence sTei) throws TEIException { | ||
69 | + sTei.getAllMentions().clear(); | ||
70 | + | ||
71 | + Map<Token, TEIMorph> seg2morph = new HashMap<>(); | ||
72 | + | ||
73 | + Iterator<Token> segIt = s.iterator(); | ||
74 | + Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator(); | ||
75 | + | ||
76 | + while (segIt.hasNext() && segItTei.hasNext()) { | ||
77 | + seg2morph.put(segIt.next(), segItTei.next()); | ||
78 | + } | ||
79 | + checkIterators(segIt, segItTei, "token"); | ||
80 | + | ||
81 | + List<TEIMention> mentions = new ArrayList<>(); | ||
82 | + | ||
83 | + for (Mention m : s.getMentions()) { | ||
84 | + List<TEIMorph> morphs = new ArrayList<>(); | ||
85 | + List<TEIMorph> heads = new ArrayList<>(); | ||
86 | + | ||
87 | + for (Token seg : m.getSegments()) | ||
88 | + morphs.add(seg2morph.get(seg)); | ||
89 | + | ||
90 | + for (Token seg : m.getHeadSegments()) | ||
91 | + heads.add(seg2morph.get(seg)); | ||
92 | + | ||
93 | + TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()); | ||
94 | + mentions.add(mention); | ||
95 | + mention2mention.put(m, mention); | ||
96 | + } | ||
97 | + sTei.setMentions(mentions); | ||
98 | + return mentionId; | ||
99 | + } | ||
100 | + | ||
101 | + private static void checkIterators(Iterator<?> one, Iterator<?> other, String level) | ||
102 | + throws TEIException { | ||
103 | + if (one.hasNext() || other.hasNext()) | ||
104 | + throw new TEIException("Problem mapping tei to thrift for level " + level); | ||
105 | + } | ||
118 | 106 | ||
119 | } | 107 | } |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
1 | package pl.waw.ipipan.zil.core.md.io.thrift; | 1 | package pl.waw.ipipan.zil.core.md.io.thrift; |
2 | 2 | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
7 | + | ||
3 | import java.util.ArrayList; | 8 | import java.util.ArrayList; |
4 | import java.util.HashMap; | 9 | import java.util.HashMap; |
5 | import java.util.List; | 10 | import java.util.List; |
6 | import java.util.Map; | 11 | import java.util.Map; |
7 | 12 | ||
8 | -import org.apache.log4j.Logger; | ||
9 | - | ||
10 | -import pl.waw.ipipan.zil.core.md.entities.Interpretation; | ||
11 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | ||
12 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | ||
16 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
17 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
18 | -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | ||
19 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; | ||
20 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TNamedEntity; | ||
21 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
22 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
23 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticGroup; | ||
24 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticWord; | ||
25 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
26 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | ||
27 | - | ||
28 | public class ThriftLoader { | 13 | public class ThriftLoader { |
29 | 14 | ||
30 | - private static Logger logger = Logger.getLogger(ThriftLoader.class); | ||
31 | - | ||
32 | - public static Text loadTextFromThrift(TText thriftText) | ||
33 | - throws MultiserviceException { | ||
34 | - Text text = new Text(thriftText.getTextHeader() == null ? "null" | ||
35 | - : thriftText.getTextHeader().getId()); | ||
36 | - | ||
37 | - logger.debug("Loading text " + text.getId() + " from thrift format..."); | ||
38 | - for (TParagraph teiP : thriftText.getParagraphs()) | ||
39 | - loadParagraph(text, teiP); | ||
40 | - logger.debug("Thrift text loaded."); | ||
41 | - | ||
42 | - return text; | ||
43 | - } | ||
44 | - | ||
45 | - private static void loadParagraph(Text text, TParagraph teiP) | ||
46 | - throws MultiserviceException { | ||
47 | - Paragraph p = new Paragraph(); | ||
48 | - text.add(p); | ||
49 | - | ||
50 | - for (TSentence teiS : teiP.getSentences()) | ||
51 | - loadSentence(p, teiS); | ||
52 | - } | ||
53 | - | ||
54 | - private static void loadSentence(Paragraph p, TSentence thriftSent) | ||
55 | - throws MultiserviceException { | ||
56 | - Sentence s = new Sentence(); | ||
57 | - p.add(s); | ||
58 | - | ||
59 | - Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent); | ||
60 | - | ||
61 | - Map<String, Token> thiftTokenId2Token = new HashMap<>(); | ||
62 | - for (TToken teiM : thriftSent.getTokens()) { | ||
63 | - Token token = loadToken(s, teiM); | ||
64 | - thiftTokenId2Token.put(teiM.getId(), token); | ||
65 | - } | ||
66 | - if (thriftSent.isSetNames()) | ||
67 | - for (TNamedEntity ne : thriftSent.getNames()) | ||
68 | - loadNE(s, ne, thirftId2Entity, thiftTokenId2Token); | ||
69 | - if (thriftSent.isSetWords()) | ||
70 | - for (TSyntacticWord w : thriftSent.getWords()) | ||
71 | - loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token); | ||
72 | - if (thriftSent.isSetGroups()) | ||
73 | - for (TSyntacticGroup g : thriftSent.getGroups()) | ||
74 | - loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token); | ||
75 | - } | ||
76 | - | ||
77 | - private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g, | ||
78 | - Map<String, Object> thirftId2Entity, | ||
79 | - Map<String, Token> thiftTokenId2Token) { | ||
80 | - String type = g.getType(); | ||
81 | - List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity, | ||
82 | - thiftTokenId2Token, false); | ||
83 | - List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity, | ||
84 | - thiftTokenId2Token, true); | ||
85 | - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | ||
86 | - } | ||
87 | - | ||
88 | - private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | ||
89 | - Map<String, Object> thirftId2Entity, | ||
90 | - Map<String, Token> thiftTokenId2Token) { | ||
91 | - String ctag = w.getChosenInterpretation().getCtag(); | ||
92 | - List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, | ||
93 | - thiftTokenId2Token, false); | ||
94 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | ||
95 | - } | ||
96 | - | ||
97 | - private static void loadNE(Sentence s, TNamedEntity ne, | ||
98 | - Map<String, Object> thirftId2Entity, | ||
99 | - Map<String, Token> thiftTokenId2Token) { | ||
100 | - List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, | ||
101 | - thiftTokenId2Token, false); | ||
102 | - s.addNamedEntity(new NamedEntity(tokens)); | ||
103 | - } | ||
104 | - | ||
105 | - private static Map<String, Object> getThriftId2EntityMap( | ||
106 | - TSentence thriftSent) { | ||
107 | - Map<String, Object> idToEntity = new HashMap<>(); | ||
108 | - for (TToken tok : thriftSent.getTokens()) | ||
109 | - idToEntity.put(tok.getId(), tok); | ||
110 | - if (thriftSent.isSetWords()) | ||
111 | - for (TSyntacticWord w : thriftSent.getWords()) | ||
112 | - idToEntity.put(w.getId(), w); | ||
113 | - if (thriftSent.isSetNames()) | ||
114 | - for (TNamedEntity ne : thriftSent.getNames()) | ||
115 | - idToEntity.put(ne.getId(), ne); | ||
116 | - if (thriftSent.isSetGroups()) | ||
117 | - for (TSyntacticGroup group : thriftSent.getGroups()) | ||
118 | - idToEntity.put(group.getId(), group); | ||
119 | - return idToEntity; | ||
120 | - } | ||
121 | - | ||
122 | - private static Token loadToken(Sentence s, TToken teiM) | ||
123 | - throws MultiserviceException { | ||
124 | - Token seg = new Token(); | ||
125 | - s.add(seg); | ||
126 | - | ||
127 | - seg.setOrth(teiM.getOrth()); | ||
128 | - TInterpretation interp = getTokenChosenInt(teiM); | ||
129 | - Interpretation chosenIterpretation = new Interpretation( | ||
130 | - interp.getCtag(), interp.getMsd(), interp.getBase()); | ||
131 | - seg.addChosenInterpretation(chosenIterpretation); | ||
132 | - | ||
133 | - for (TInterpretation interp2 : teiM.getInterpretations()) { | ||
134 | - Interpretation inter = new Interpretation(interp2.getCtag(), | ||
135 | - interp2.getMsd(), interp.getBase()); | ||
136 | - seg.addInterpretation(inter); | ||
137 | - } | ||
138 | - return seg; | ||
139 | - } | ||
140 | - | ||
141 | - private static TInterpretation getTokenChosenInt(TToken token) | ||
142 | - throws MultiserviceException { | ||
143 | - TInterpretation interp = token.getChosenInterpretation(); | ||
144 | - if (interp == null || interp.getBase() == null | ||
145 | - || interp.getBase().equals("")) { | ||
146 | - if (token.getCandidateInterpretations() == null | ||
147 | - || token.getCandidateInterpretations().size() == 0 | ||
148 | - || token.getCandidateInterpretations().get(0).getBase() == null | ||
149 | - || token.getCandidateInterpretations().get(0).getBase() | ||
150 | - .equals("")) | ||
151 | - throw new MultiserviceException( | ||
152 | - "No proper chosen or candidate interpretation for segment: " | ||
153 | - + token.id); | ||
154 | - interp = token.getCandidateInterpretations().get(0); | ||
155 | - } | ||
156 | - return interp; | ||
157 | - } | ||
158 | - | ||
159 | - private static List<Token> getUnderlyingSegments(Object entity, | ||
160 | - Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment, | ||
161 | - boolean headsOnly) { | ||
162 | - List<Token> result = new ArrayList<>(); | ||
163 | - | ||
164 | - if (entity instanceof TToken) { | ||
165 | - result.add(tokenId2Segment.get(((TToken) entity).getId())); | ||
166 | - return result; | ||
167 | - } | ||
168 | - | ||
169 | - List<String> childIds = new ArrayList<>(); | ||
170 | - if (entity instanceof TSyntacticWord) | ||
171 | - childIds = ((TSyntacticWord) entity).getChildIds(); | ||
172 | - else if (entity instanceof TNamedEntity) | ||
173 | - childIds = ((TNamedEntity) entity).getChildIds(); | ||
174 | - else if (entity instanceof TSyntacticGroup) | ||
175 | - if (headsOnly) { | ||
176 | - childIds = new ArrayList<String>(); | ||
177 | - childIds.add(((TSyntacticGroup) entity).getSemanticHeadId()); | ||
178 | - } else | ||
179 | - childIds = ((TSyntacticGroup) entity).getChildIds(); | ||
180 | - | ||
181 | - for (String id : childIds) | ||
182 | - result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity, | ||
183 | - tokenId2Segment, headsOnly)); | ||
184 | - | ||
185 | - return result; | ||
186 | - } | 15 | + private static Logger logger = LoggerFactory.getLogger(ThriftLoader.class); |
16 | + | ||
17 | + public static Text loadTextFromThrift(TText thriftText) | ||
18 | + throws MultiserviceException { | ||
19 | + Text text = new Text(thriftText.getTextHeader() == null ? "null" | ||
20 | + : thriftText.getTextHeader().getId()); | ||
21 | + | ||
22 | + logger.debug("Loading text " + text.getId() + " from thrift format..."); | ||
23 | + for (TParagraph teiP : thriftText.getParagraphs()) | ||
24 | + loadParagraph(text, teiP); | ||
25 | + logger.debug("Thrift text loaded."); | ||
26 | + | ||
27 | + return text; | ||
28 | + } | ||
29 | + | ||
30 | + private static void loadParagraph(Text text, TParagraph teiP) | ||
31 | + throws MultiserviceException { | ||
32 | + Paragraph p = new Paragraph(); | ||
33 | + text.add(p); | ||
34 | + | ||
35 | + for (TSentence teiS : teiP.getSentences()) | ||
36 | + loadSentence(p, teiS); | ||
37 | + } | ||
38 | + | ||
39 | + private static void loadSentence(Paragraph p, TSentence thriftSent) | ||
40 | + throws MultiserviceException { | ||
41 | + Sentence s = new Sentence(); | ||
42 | + p.add(s); | ||
43 | + | ||
44 | + Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent); | ||
45 | + | ||
46 | + Map<String, Token> thiftTokenId2Token = new HashMap<>(); | ||
47 | + for (TToken teiM : thriftSent.getTokens()) { | ||
48 | + Token token = loadToken(s, teiM); | ||
49 | + thiftTokenId2Token.put(teiM.getId(), token); | ||
50 | + } | ||
51 | + if (thriftSent.isSetNames()) | ||
52 | + for (TNamedEntity ne : thriftSent.getNames()) | ||
53 | + loadNE(s, ne, thirftId2Entity, thiftTokenId2Token); | ||
54 | + if (thriftSent.isSetWords()) | ||
55 | + for (TSyntacticWord w : thriftSent.getWords()) | ||
56 | + loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token); | ||
57 | + if (thriftSent.isSetGroups()) | ||
58 | + for (TSyntacticGroup g : thriftSent.getGroups()) | ||
59 | + loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token); | ||
60 | + } | ||
61 | + | ||
62 | + private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g, | ||
63 | + Map<String, Object> thirftId2Entity, | ||
64 | + Map<String, Token> thiftTokenId2Token) { | ||
65 | + String type = g.getType(); | ||
66 | + List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity, | ||
67 | + thiftTokenId2Token, false); | ||
68 | + List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity, | ||
69 | + thiftTokenId2Token, true); | ||
70 | + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | ||
71 | + } | ||
72 | + | ||
73 | + private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | ||
74 | + Map<String, Object> thirftId2Entity, | ||
75 | + Map<String, Token> thiftTokenId2Token) { | ||
76 | + String ctag = w.getChosenInterpretation().getCtag(); | ||
77 | + List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, | ||
78 | + thiftTokenId2Token, false); | ||
79 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | ||
80 | + } | ||
81 | + | ||
82 | + private static void loadNE(Sentence s, TNamedEntity ne, | ||
83 | + Map<String, Object> thirftId2Entity, | ||
84 | + Map<String, Token> thiftTokenId2Token) { | ||
85 | + List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, | ||
86 | + thiftTokenId2Token, false); | ||
87 | + s.addNamedEntity(new NamedEntity(tokens)); | ||
88 | + } | ||
89 | + | ||
90 | + private static Map<String, Object> getThriftId2EntityMap( | ||
91 | + TSentence thriftSent) { | ||
92 | + Map<String, Object> idToEntity = new HashMap<>(); | ||
93 | + for (TToken tok : thriftSent.getTokens()) | ||
94 | + idToEntity.put(tok.getId(), tok); | ||
95 | + if (thriftSent.isSetWords()) | ||
96 | + for (TSyntacticWord w : thriftSent.getWords()) | ||
97 | + idToEntity.put(w.getId(), w); | ||
98 | + if (thriftSent.isSetNames()) | ||
99 | + for (TNamedEntity ne : thriftSent.getNames()) | ||
100 | + idToEntity.put(ne.getId(), ne); | ||
101 | + if (thriftSent.isSetGroups()) | ||
102 | + for (TSyntacticGroup group : thriftSent.getGroups()) | ||
103 | + idToEntity.put(group.getId(), group); | ||
104 | + return idToEntity; | ||
105 | + } | ||
106 | + | ||
107 | + private static Token loadToken(Sentence s, TToken teiM) | ||
108 | + throws MultiserviceException { | ||
109 | + Token seg = new Token(); | ||
110 | + s.add(seg); | ||
111 | + | ||
112 | + seg.setOrth(teiM.getOrth()); | ||
113 | + TInterpretation interp = getTokenChosenInt(teiM); | ||
114 | + Interpretation chosenIterpretation = new Interpretation( | ||
115 | + interp.getCtag(), interp.getMsd(), interp.getBase()); | ||
116 | + seg.addChosenInterpretation(chosenIterpretation); | ||
117 | + | ||
118 | + for (TInterpretation interp2 : teiM.getInterpretations()) { | ||
119 | + Interpretation inter = new Interpretation(interp2.getCtag(), | ||
120 | + interp2.getMsd(), interp.getBase()); | ||
121 | + seg.addInterpretation(inter); | ||
122 | + } | ||
123 | + return seg; | ||
124 | + } | ||
125 | + | ||
126 | + private static TInterpretation getTokenChosenInt(TToken token) | ||
127 | + throws MultiserviceException { | ||
128 | + TInterpretation interp = token.getChosenInterpretation(); | ||
129 | + if (interp == null || interp.getBase() == null | ||
130 | + || "".equals(interp.getBase())) { | ||
131 | + if (token.getCandidateInterpretations() == null | ||
132 | + || token.getCandidateInterpretations().isEmpty() | ||
133 | + || token.getCandidateInterpretations().get(0).getBase() == null | ||
134 | + || "".equals(token.getCandidateInterpretations().get(0).getBase())) | ||
135 | + throw new MultiserviceException( | ||
136 | + "No proper chosen or candidate interpretation for segment: " | ||
137 | + + token.id); | ||
138 | + interp = token.getCandidateInterpretations().get(0); | ||
139 | + } | ||
140 | + return interp; | ||
141 | + } | ||
142 | + | ||
143 | + private static List<Token> getUnderlyingSegments(Object entity, | ||
144 | + Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment, | ||
145 | + boolean headsOnly) { | ||
146 | + List<Token> result = new ArrayList<>(); | ||
147 | + | ||
148 | + if (entity instanceof TToken) { | ||
149 | + result.add(tokenId2Segment.get(((TToken) entity).getId())); | ||
150 | + return result; | ||
151 | + } | ||
152 | + | ||
153 | + List<String> childIds = new ArrayList<>(); | ||
154 | + if (entity instanceof TSyntacticWord) | ||
155 | + childIds = ((TSyntacticWord) entity).getChildIds(); | ||
156 | + else if (entity instanceof TNamedEntity) | ||
157 | + childIds = ((TNamedEntity) entity).getChildIds(); | ||
158 | + else if (entity instanceof TSyntacticGroup) | ||
159 | + if (headsOnly) { | ||
160 | + childIds = new ArrayList<>(); | ||
161 | + childIds.add(((TSyntacticGroup) entity).getSemanticHeadId()); | ||
162 | + } else | ||
163 | + childIds = ((TSyntacticGroup) entity).getChildIds(); | ||
164 | + | ||
165 | + for (String id : childIds) | ||
166 | + result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity, | ||
167 | + tokenId2Segment, headsOnly)); | ||
168 | + | ||
169 | + return result; | ||
170 | + } | ||
187 | } | 171 | } |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
1 | package pl.waw.ipipan.zil.core.md.io.thrift; | 1 | package pl.waw.ipipan.zil.core.md.io.thrift; |
2 | 2 | ||
3 | -import java.util.ArrayList; | ||
4 | -import java.util.HashMap; | ||
5 | -import java.util.Iterator; | ||
6 | -import java.util.List; | ||
7 | -import java.util.Map; | ||
8 | - | ||
9 | -import org.apache.log4j.Logger; | ||
10 | - | ||
11 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
12 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | ||
13 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
14 | -import pl.waw.ipipan.zil.core.md.entities.Text; | ||
15 | -import pl.waw.ipipan.zil.core.md.entities.Token; | ||
16 | -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | ||
17 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | ||
18 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | ||
19 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | ||
20 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | ||
21 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | 3 | +import org.slf4j.Logger; |
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | ||
7 | + | ||
8 | +import java.util.*; | ||
22 | 9 | ||
23 | public class ThriftSaver { | 10 | public class ThriftSaver { |
24 | 11 | ||
25 | - private static Logger logger = Logger.getLogger(ThriftSaver.class); | ||
26 | - | ||
27 | - public static void updateThriftText(Text responseText, TText text) | ||
28 | - throws MultiserviceException { | ||
29 | - | ||
30 | - logger.debug("Updating thrift text..."); | ||
31 | - Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>(); | ||
32 | - | ||
33 | - Iterator<TParagraph> thrPI = text.getParagraphsIterator(); | ||
34 | - Iterator<Paragraph> teiPI = responseText.iterator(); | ||
35 | - int freeMentionId = 0; | ||
36 | - while (thrPI.hasNext() && teiPI.hasNext()) { | ||
37 | - TParagraph thrP = thrPI.next(); | ||
38 | - Paragraph teiP = teiPI.next(); | ||
39 | - | ||
40 | - freeMentionId = updateThriftParagraph(teiMention2ThriftMention, | ||
41 | - freeMentionId, thrP, teiP); | ||
42 | - } | ||
43 | - checkIterators(thrPI, teiPI, "paragraph"); | ||
44 | - } | ||
45 | - | ||
46 | - private static int updateThriftParagraph( | ||
47 | - Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId, | ||
48 | - TParagraph thrP, Paragraph teiP) throws MultiserviceException { | ||
49 | - Iterator<TSentence> thrSI = thrP.getSentencesIterator(); | ||
50 | - Iterator<Sentence> teiSI = teiP.iterator(); | ||
51 | - while (thrSI.hasNext() && teiSI.hasNext()) { | ||
52 | - TSentence thrS = thrSI.next(); | ||
53 | - Sentence teiS = teiSI.next(); | ||
54 | - freeMentionId = updateThriftSentence(teiMention2ThriftMention, | ||
55 | - freeMentionId, thrS, teiS); | ||
56 | - } | ||
57 | - checkIterators(thrSI, teiSI, "sentence"); | ||
58 | - return freeMentionId; | ||
59 | - } | ||
60 | - | ||
61 | - private static int updateThriftSentence( | ||
62 | - Map<Mention, TMention> teiMention2ThriftMention, int id, | ||
63 | - TSentence thrS, Sentence teiS) throws MultiserviceException { | ||
64 | - thrS.unsetMentions(); | ||
65 | - thrS.setMentions(new ArrayList<TMention>()); | ||
66 | - | ||
67 | - Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>(); | ||
68 | - Iterator<TToken> thrMI = thrS.getTokensIterator(); | ||
69 | - Iterator<Token> teiMI = teiS.iterator(); | ||
70 | - while (thrMI.hasNext() && teiMI.hasNext()) { | ||
71 | - teiMorph2ThriftToken.put(teiMI.next(), thrMI.next()); | ||
72 | - } | ||
73 | - checkIterators(thrMI, teiMI, "morph"); | ||
74 | - | ||
75 | - for (Mention m : teiS.getMentions()) { | ||
76 | - List<String> childIds = new ArrayList<>(); | ||
77 | - List<String> headIds = new ArrayList<>(); | ||
78 | - for (Token ch : m.getSegments()) | ||
79 | - childIds.add(teiMorph2ThriftToken.get(ch).getId()); | ||
80 | - for (Token h : m.getHeadSegments()) | ||
81 | - headIds.add(teiMorph2ThriftToken.get(h).getId()); | ||
82 | - | ||
83 | - TMention tm = new TMention("m-" + (id++), headIds, childIds, | ||
84 | - m.isZeroSubject()); | ||
85 | - teiMention2ThriftMention.put(m, tm); | ||
86 | - thrS.addToMentions(tm); | ||
87 | - } | ||
88 | - return id; | ||
89 | - } | ||
90 | - | ||
91 | - private static void checkIterators(Iterator<? extends Object> one, | ||
92 | - Iterator<? extends Object> other, String level) | ||
93 | - throws MultiserviceException { | ||
94 | - if (one.hasNext() || other.hasNext()) | ||
95 | - throw new MultiserviceException( | ||
96 | - "Problem mapping interal text representation to thrift for level " | ||
97 | - + level); | ||
98 | - } | 12 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftSaver.class); |
13 | + | ||
14 | + private ThriftSaver() { | ||
15 | + } | ||
16 | + | ||
17 | + public static void updateThriftText(Text responseText, TText text) | ||
18 | + throws MultiserviceException { | ||
19 | + | ||
20 | + LOG.debug("Updating thrift text..."); | ||
21 | + Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>(); | ||
22 | + | ||
23 | + Iterator<TParagraph> thrPI = text.getParagraphsIterator(); | ||
24 | + Iterator<Paragraph> teiPI = responseText.iterator(); | ||
25 | + int freeMentionId = 0; | ||
26 | + while (thrPI.hasNext() && teiPI.hasNext()) { | ||
27 | + TParagraph thrP = thrPI.next(); | ||
28 | + Paragraph teiP = teiPI.next(); | ||
29 | + | ||
30 | + freeMentionId = updateThriftParagraph(teiMention2ThriftMention, | ||
31 | + freeMentionId, thrP, teiP); | ||
32 | + } | ||
33 | + checkIterators(thrPI, teiPI, "paragraph"); | ||
34 | + } | ||
35 | + | ||
36 | + private static int updateThriftParagraph( | ||
37 | + Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId, | ||
38 | + TParagraph thrP, Paragraph teiP) throws MultiserviceException { | ||
39 | + Iterator<TSentence> thrSI = thrP.getSentencesIterator(); | ||
40 | + Iterator<Sentence> teiSI = teiP.iterator(); | ||
41 | + while (thrSI.hasNext() && teiSI.hasNext()) { | ||
42 | + TSentence thrS = thrSI.next(); | ||
43 | + Sentence teiS = teiSI.next(); | ||
44 | + freeMentionId = updateThriftSentence(teiMention2ThriftMention, | ||
45 | + freeMentionId, thrS, teiS); | ||
46 | + } | ||
47 | + checkIterators(thrSI, teiSI, "sentence"); | ||
48 | + return freeMentionId; | ||
49 | + } | ||
50 | + | ||
51 | + private static int updateThriftSentence( | ||
52 | + Map<Mention, TMention> teiMention2ThriftMention, int id, | ||
53 | + TSentence thrS, Sentence teiS) throws MultiserviceException { | ||
54 | + thrS.unsetMentions(); | ||
55 | + thrS.setMentions(new ArrayList<>()); | ||
56 | + | ||
57 | + Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>(); | ||
58 | + Iterator<TToken> thrMI = thrS.getTokensIterator(); | ||
59 | + Iterator<Token> teiMI = teiS.iterator(); | ||
60 | + while (thrMI.hasNext() && teiMI.hasNext()) { | ||
61 | + teiMorph2ThriftToken.put(teiMI.next(), thrMI.next()); | ||
62 | + } | ||
63 | + checkIterators(thrMI, teiMI, "morph"); | ||
64 | + | ||
65 | + for (Mention m : teiS.getMentions()) { | ||
66 | + List<String> childIds = new ArrayList<>(); | ||
67 | + List<String> headIds = new ArrayList<>(); | ||
68 | + for (Token ch : m.getSegments()) | ||
69 | + childIds.add(teiMorph2ThriftToken.get(ch).getId()); | ||
70 | + for (Token h : m.getHeadSegments()) | ||
71 | + headIds.add(teiMorph2ThriftToken.get(h).getId()); | ||
72 | + | ||
73 | + TMention tm = new TMention("m-" + (id++), headIds, childIds, | ||
74 | + m.isZeroSubject()); | ||
75 | + teiMention2ThriftMention.put(m, tm); | ||
76 | + thrS.addToMentions(tm); | ||
77 | + } | ||
78 | + return id; | ||
79 | + } | ||
80 | + | ||
81 | + private static void checkIterators(Iterator<?> one, | ||
82 | + Iterator<?> other, String level) | ||
83 | + throws MultiserviceException { | ||
84 | + if (one.hasNext() || other.hasNext()) | ||
85 | + throw new MultiserviceException( | ||
86 | + "Problem mapping interal text representation to thrift for level " | ||
87 | + + level); | ||
88 | + } | ||
99 | 89 | ||
100 | } | 90 | } |