Commit bd7f5abb07ff32954b95699545ac6194c0a44c7f
1 parent
62ccdfdc
1.3 release
Showing
19 changed files
with
1441 additions
and
1465 deletions
doc/compile.sh
0 → 100755
doc/manual.pdf
No preview for this file type
doc/manual.tex
... | ... | @@ -38,10 +38,10 @@ The current version of the program facilitates the automatic mention detection, |
38 | 38 | MentionDetector uses information provided in it's input to produce mentions for coreference resolution. It merges entities provided by named entity recognition tools, shallow parsers and taggers. |
39 | 39 | |
40 | 40 | It also finds zero subjects in clauses and marks the verbs using zero subjects as mentions, using the algorithm presented in \cite{kop:14:eacl:short}, for which a model was trained using the full Polish Coreference Corpus, version 0.92 (corpus description in \cite{ogro:etal:13:ltc}). Training data had 15875 positive and 37798 negative examples; 10-fold cross validation yielded an accuracy of 86.14\% for the task of finding zero subjects. Precision of 79.8\% and recall of 71.2\% for the zero subject class of verbs was obtained. |
41 | - | |
41 | + | |
42 | 42 | \textbf{Homepage:} \url{http://zil.ipipan.waw.pl/MentionDetector} \\ |
43 | 43 | \textbf{Contact person:} Mateusz Kopeć [mateusz.kopec@ipipan.waw.pl] \\ |
44 | -\textbf{Author:} Mateusz Kopeć \\ | |
44 | +\textbf{Author:} Mateusz Kopeć \\ | |
45 | 45 | \textbf{License:} CC BY v.3 |
46 | 46 | |
47 | 47 | |
... | ... | @@ -49,7 +49,7 @@ It also finds zero subjects in clauses and marks the verbs using zero subjects a |
49 | 49 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
50 | 50 | |
51 | 51 | \section{Requirements} |
52 | -Java Runtime Environment (JRE) 1.7 or newer. | |
52 | +Java Runtime Environment (JRE) 1.8 or newer. | |
53 | 53 | |
54 | 54 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
55 | 55 | %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% |
... | ... | @@ -143,7 +143,7 @@ Zero subjects are distinguished from other mentions by having an additional feat |
143 | 143 | |
144 | 144 | Standalone jar doesn't need any installation. To run it, simply execute:\\ |
145 | 145 | |
146 | -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts>}\\ | |
146 | +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts>}\\ | |
147 | 147 | |
148 | 148 | All texts recursively found in \texttt{<dir with input texts>} are going to be annotated with mentions layer and saved in \texttt{<dir for output texts>}.\\ |
149 | 149 | |
... | ... | @@ -153,7 +153,7 @@ All texts recursively found in \texttt{<dir with input texts>} are going to be a |
153 | 153 | \section{Custom zero subject detection model} |
154 | 154 | If you want to use custom zero subject detection model, you may try:\\ |
155 | 155 | |
156 | -\texttt{java -jar md-1.0-SNAPSHOT.one-jar.jar <dir with input texts> <dir for output texts> <model\_path>} | |
156 | +\texttt{java -jar md-1.3-jar-with-dependencies.jar <dir with input texts> <dir for output texts> <model\_path>} | |
157 | 157 | |
158 | 158 | To create such model, use the \texttt{pl.waw.ipipan.zil.core.md.detection.zero.Trainer} class. |
159 | 159 | |
... | ... |
pom.xml
1 | 1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" |
2 | - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
3 | - <modelVersion>4.0.0</modelVersion> | |
4 | - <groupId>pl.waw.ipipan.zil.core</groupId> | |
5 | - <artifactId>md</artifactId> | |
6 | - <version>1.2-SNAPSHOT</version> | |
7 | - <properties> | |
8 | - <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |
9 | - </properties> | |
10 | - <build> | |
11 | - <plugins> | |
12 | - <plugin> | |
13 | - <artifactId>maven-compiler-plugin</artifactId> | |
14 | - <version>2.3.2</version> | |
15 | - <configuration> | |
16 | - <source>1.7</source> | |
17 | - <target>1.7</target> | |
18 | - </configuration> | |
19 | - </plugin> | |
20 | - <plugin> | |
21 | - <artifactId>maven-source-plugin</artifactId> | |
22 | - <version>2.4</version> | |
23 | - <executions> | |
24 | - <execution> | |
25 | - <id>attach-sources</id> | |
26 | - <phase>deploy</phase> | |
27 | - <goals> | |
28 | - <goal>jar-no-fork</goal> | |
29 | - </goals> | |
30 | - </execution> | |
31 | - </executions> | |
32 | - </plugin> | |
33 | - <plugin> | |
34 | - <artifactId>maven-javadoc-plugin</artifactId> | |
35 | - <version>2.10.3</version> | |
36 | - <executions> | |
37 | - <execution> | |
38 | - <id>attach-javadocs</id> | |
39 | - <phase>deploy</phase> | |
40 | - <goals> | |
41 | - <goal>jar</goal> | |
42 | - </goals> | |
43 | - </execution> | |
44 | - </executions> | |
45 | - </plugin> | |
46 | - <plugin> | |
47 | - <!-- explicitly define maven-deploy-plugin after other to force exec | |
48 | - order --> | |
49 | - <artifactId>maven-deploy-plugin</artifactId> | |
50 | - <version>2.7</version> | |
51 | - <executions> | |
52 | - <execution> | |
53 | - <id>deploy</id> | |
54 | - <phase>deploy</phase> | |
55 | - <goals> | |
56 | - <goal>deploy</goal> | |
57 | - </goals> | |
58 | - </execution> | |
59 | - </executions> | |
60 | - </plugin> | |
61 | - <plugin> | |
62 | - <groupId>org.dstovall</groupId> | |
63 | - <artifactId>onejar-maven-plugin</artifactId> | |
64 | - <version>1.4.4</version> | |
65 | - <executions> | |
66 | - <execution> | |
67 | - <configuration> | |
68 | - <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass> | |
69 | - </configuration> | |
70 | - <goals> | |
71 | - <goal>one-jar</goal> | |
72 | - </goals> | |
73 | - </execution> | |
74 | - </executions> | |
75 | - </plugin> | |
76 | - </plugins> | |
77 | - </build> | |
78 | - <dependencies> | |
79 | - <dependency> | |
80 | - <groupId>log4j</groupId> | |
81 | - <artifactId>log4j</artifactId> | |
82 | - <version>1.2.17</version> | |
83 | - </dependency> | |
84 | - <dependency> | |
85 | - <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
86 | - <artifactId>utils</artifactId> | |
87 | - <version>1.0-SNAPSHOT</version> | |
88 | - </dependency> | |
89 | - <dependency> | |
90 | - <groupId>pl.waw.ipipan.zil.nkjp</groupId> | |
91 | - <artifactId>teiapi</artifactId> | |
92 | - <version>1.0-SNAPSHOT</version> | |
93 | - </dependency> | |
94 | - <dependency> | |
95 | - <groupId>junit</groupId> | |
96 | - <artifactId>junit</artifactId> | |
97 | - <version>4.11</version> | |
98 | - </dependency> | |
99 | - <dependency> | |
100 | - <groupId>nz.ac.waikato.cms.weka</groupId> | |
101 | - <artifactId>weka-stable</artifactId> | |
102 | - <version>3.6.10</version> | |
103 | - </dependency> | |
104 | - </dependencies> | |
2 | + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> | |
3 | + <modelVersion>4.0.0</modelVersion> | |
105 | 4 | |
106 | - <repositories> | |
107 | - <repository> | |
108 | - <id>zil-maven-repo</id> | |
109 | - <name>ZIL maven repository</name> | |
110 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots</url> | |
111 | - </repository> | |
112 | - </repositories> | |
5 | + <groupId>pl.waw.ipipan.zil.core</groupId> | |
6 | + <artifactId>md</artifactId> | |
7 | + <version>1.3</version> | |
113 | 8 | |
114 | - <pluginRepositories> | |
115 | - <pluginRepository> | |
116 | - <id>onejar-maven-plugin.googlecode.com</id> | |
117 | - <url>http://onejar-maven-plugin.googlecode.com/svn/mavenrepo</url> | |
118 | - </pluginRepository> | |
119 | - </pluginRepositories> | |
9 | + <developers> | |
10 | + <developer> | |
11 | + <name>Mateusz Kopeć</name> | |
12 | + <organization>ICS PAS</organization> | |
13 | + <email>m.kopec@ipipan.waw.pl</email> | |
14 | + </developer> | |
15 | + </developers> | |
120 | 16 | |
121 | - <distributionManagement> | |
122 | - <repository> | |
123 | - <id>deployment</id> | |
124 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | |
125 | - </repository> | |
126 | - <snapshotRepository> | |
127 | - <id>deployment</id> | |
128 | - <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | |
129 | - </snapshotRepository> | |
130 | - </distributionManagement> | |
17 | + <properties> | |
18 | + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> | |
19 | + <java.version>1.8</java.version> | |
20 | + | |
21 | + <junit.version>4.12</junit.version> | |
22 | + <slf4j.version>1.7.21</slf4j.version> | |
23 | + </properties> | |
24 | + | |
25 | + <prerequisites> | |
26 | + <maven>3.0.5</maven> | |
27 | + </prerequisites> | |
28 | + | |
29 | + <build> | |
30 | + <pluginManagement> | |
31 | + <plugins> | |
32 | + <plugin> | |
33 | + <artifactId>maven-compiler-plugin</artifactId> | |
34 | + <version>3.5.1</version> | |
35 | + <configuration> | |
36 | + <source>${java.version}</source> | |
37 | + <target>${java.version}</target> | |
38 | + </configuration> | |
39 | + </plugin> | |
40 | + <plugin> | |
41 | + <artifactId>maven-clean-plugin</artifactId> | |
42 | + <version>3.0.0</version> | |
43 | + </plugin> | |
44 | + <plugin> | |
45 | + <artifactId>maven-install-plugin</artifactId> | |
46 | + <version>2.5.2</version> | |
47 | + </plugin> | |
48 | + <plugin> | |
49 | + <artifactId>maven-jar-plugin</artifactId> | |
50 | + <version>3.0.2</version> | |
51 | + </plugin> | |
52 | + <plugin> | |
53 | + <artifactId>maven-resources-plugin</artifactId> | |
54 | + <version>3.0.1</version> | |
55 | + </plugin> | |
56 | + <plugin> | |
57 | + <artifactId>maven-site-plugin</artifactId> | |
58 | + <version>3.5.1</version> | |
59 | + </plugin> | |
60 | + <plugin> | |
61 | + <artifactId>maven-surefire-plugin</artifactId> | |
62 | + <version>2.19.1</version> | |
63 | + </plugin> | |
64 | + | |
65 | + <plugin> | |
66 | + <artifactId>maven-source-plugin</artifactId> | |
67 | + <version>3.0.1</version> | |
68 | + <executions> | |
69 | + <execution> | |
70 | + <id>attach-sources</id> | |
71 | + <phase>deploy</phase> | |
72 | + <goals> | |
73 | + <goal>jar-no-fork</goal> | |
74 | + </goals> | |
75 | + </execution> | |
76 | + </executions> | |
77 | + </plugin> | |
78 | + <plugin> | |
79 | + <artifactId>maven-javadoc-plugin</artifactId> | |
80 | + <version>2.10.4</version> | |
81 | + <executions> | |
82 | + <execution> | |
83 | + <id>attach-javadocs</id> | |
84 | + <phase>deploy</phase> | |
85 | + <goals> | |
86 | + <goal>jar</goal> | |
87 | + </goals> | |
88 | + </execution> | |
89 | + </executions> | |
90 | + </plugin> | |
91 | + <plugin> | |
92 | + <!-- explicitly define maven-deploy-plugin after other to force exec order --> | |
93 | + <artifactId>maven-deploy-plugin</artifactId> | |
94 | + <version>2.8.2</version> | |
95 | + <executions> | |
96 | + <execution> | |
97 | + <id>deploy</id> | |
98 | + <phase>deploy</phase> | |
99 | + <goals> | |
100 | + <goal>deploy</goal> | |
101 | + </goals> | |
102 | + </execution> | |
103 | + </executions> | |
104 | + </plugin> | |
105 | + <plugin> | |
106 | + <artifactId>maven-assembly-plugin</artifactId> | |
107 | + <version>2.6</version> | |
108 | + </plugin> | |
109 | + </plugins> | |
110 | + </pluginManagement> | |
111 | + | |
112 | + <plugins> | |
113 | + <plugin> | |
114 | + <artifactId>maven-assembly-plugin</artifactId> | |
115 | + <configuration> | |
116 | + <descriptorRefs> | |
117 | + <descriptorRef>jar-with-dependencies</descriptorRef> | |
118 | + </descriptorRefs> | |
119 | + <archive> | |
120 | + <manifest> | |
121 | + <mainClass>pl.waw.ipipan.zil.core.md.Main</mainClass> | |
122 | + </manifest> | |
123 | + </archive> | |
124 | + </configuration> | |
125 | + <executions> | |
126 | + <execution> | |
127 | + <id>make-assembly</id> | |
128 | + <phase>package</phase> | |
129 | + <goals> | |
130 | + <goal>single</goal> | |
131 | + </goals> | |
132 | + </execution> | |
133 | + </executions> | |
134 | + </plugin> | |
135 | + </plugins> | |
136 | + </build> | |
137 | + | |
138 | + <dependencies> | |
139 | + <!-- internal --> | |
140 | + <dependency> | |
141 | + <groupId>pl.waw.ipipan.zil.multiservice</groupId> | |
142 | + <artifactId>utils</artifactId> | |
143 | + <version>1.0</version> | |
144 | + </dependency> | |
145 | + <dependency> | |
146 | + <groupId>pl.waw.ipipan.zil.nkjp</groupId> | |
147 | + <artifactId>teiapi</artifactId> | |
148 | + <version>1.0</version> | |
149 | + </dependency> | |
150 | + | |
151 | + <!-- third party --> | |
152 | + <dependency> | |
153 | + <groupId>nz.ac.waikato.cms.weka</groupId> | |
154 | + <artifactId>weka-stable</artifactId> | |
155 | + <version>3.6.10</version> | |
156 | + </dependency> | |
157 | + | |
158 | + <!-- logging --> | |
159 | + <dependency> | |
160 | + <groupId>org.slf4j</groupId> | |
161 | + <artifactId>slf4j-api</artifactId> | |
162 | + <version>1.7.21</version> | |
163 | + </dependency> | |
164 | + <dependency> | |
165 | + <groupId>org.slf4j</groupId> | |
166 | + <artifactId>slf4j-simple</artifactId> | |
167 | + <version>1.7.21</version> | |
168 | + <scope>runtime</scope> | |
169 | + </dependency> | |
170 | + | |
171 | + <!-- test --> | |
172 | + <dependency> | |
173 | + <groupId>junit</groupId> | |
174 | + <artifactId>junit</artifactId> | |
175 | + <version>4.12</version> | |
176 | + <scope>test</scope> | |
177 | + </dependency> | |
178 | + | |
179 | + </dependencies> | |
180 | + | |
181 | + <repositories> | |
182 | + <repository> | |
183 | + <id>zil-maven-snapshot-repo</id> | |
184 | + <name>ZIL maven snapshot repository</name> | |
185 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | |
186 | + </repository> | |
187 | + <repository> | |
188 | + <id>zil-maven-release-repo</id> | |
189 | + <name>ZIL maven release repository</name> | |
190 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | |
191 | + </repository> | |
192 | + <repository> | |
193 | + <id>zil-maven-repo-3rdparty</id> | |
194 | + <name>ZIL maven repository 3rdparty</name> | |
195 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/thirdparty/</url> | |
196 | + </repository> | |
197 | + </repositories> | |
198 | + | |
199 | + <distributionManagement> | |
200 | + <repository> | |
201 | + <id>deployment</id> | |
202 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/releases/</url> | |
203 | + </repository> | |
204 | + <snapshotRepository> | |
205 | + <id>deployment</id> | |
206 | + <url>http://maven.nlp.ipipan.waw.pl/content/repositories/snapshots/</url> | |
207 | + </snapshotRepository> | |
208 | + </distributionManagement> | |
131 | 209 | </project> |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
1 | 1 | package pl.waw.ipipan.zil.core.md; |
2 | 2 | |
3 | -import java.io.File; | |
4 | -import java.io.FileInputStream; | |
5 | -import java.io.IOException; | |
6 | -import java.io.InputStream; | |
7 | - | |
8 | -import org.apache.log4j.Logger; | |
9 | - | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
10 | 5 | import pl.waw.ipipan.zil.core.md.detection.Detector; |
11 | 6 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
12 | 7 | import pl.waw.ipipan.zil.core.md.entities.Text; |
... | ... | @@ -20,134 +15,128 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; |
20 | 15 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
21 | 16 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
22 | 17 | |
23 | -/** | |
24 | - * @author Mateusz Kopeć | |
25 | - * | |
26 | - */ | |
18 | +import java.io.File; | |
19 | +import java.io.FileInputStream; | |
20 | +import java.io.IOException; | |
21 | +import java.io.InputStream; | |
22 | + | |
27 | 23 | public class Main { |
28 | 24 | |
29 | - private final static Logger logger = Logger.getLogger(Main.class); | |
30 | - private final static boolean GZIP_OUTPUT = true; | |
31 | - | |
32 | - private final static String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | |
33 | - | |
34 | - private static ZeroSubjectDetector zeroSubjectModel; | |
35 | - static { | |
36 | - InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); | |
37 | - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
38 | - } | |
39 | - | |
40 | - /** | |
41 | - * Main method for detecting mentions in corpus encoded in Tei format. | |
42 | - * | |
43 | - * @param args | |
44 | - * arguments | |
45 | - */ | |
46 | - public static void main(String[] args) { | |
47 | - | |
48 | - if (args.length != 2 && args.length != 3) { | |
49 | - logger.error("Wrong usage! should be: " + Main.class.getSimpleName() | |
50 | - + " input_dir result_dir [zero_subject_model]"); | |
51 | - return; | |
52 | - } | |
53 | - | |
54 | - File inputDir = new File(args[0]); | |
55 | - File outputDir = new File(args[1]); | |
56 | - | |
57 | - if (!inputDir.isDirectory()) { | |
58 | - logger.error(inputDir + " is not a directory!"); | |
59 | - return; | |
60 | - } | |
61 | - if (!outputDir.isDirectory()) { | |
62 | - logger.error(outputDir + " is not a directory!"); | |
63 | - return; | |
64 | - } | |
65 | - if (args.length == 3) { | |
66 | - try { | |
67 | - InputStream zeroSubjectDetectionModelStream; | |
68 | - zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2])); | |
69 | - zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
70 | - if (zeroSubjectModel == null) | |
71 | - throw new IOException(); | |
72 | - } catch (IOException e) { | |
73 | - logger.error("Unable to load model from file: " + args[2] + ": " + e); | |
74 | - return; | |
75 | - } | |
76 | - } | |
77 | - | |
78 | - int all = 0; | |
79 | - int errors = 0; | |
80 | - for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { | |
81 | - all++; | |
82 | - try { | |
83 | - File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | |
84 | - TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | |
85 | - annotateTeiText(teiText); | |
86 | - TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | |
87 | - } catch (IOException e) { | |
88 | - logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage()); | |
89 | - errors++; | |
90 | - } | |
91 | - } | |
92 | - | |
93 | - logger.info(all + " texts processed succesfully."); | |
94 | - if (errors > 0) | |
95 | - logger.info(errors + " texts not processed."); | |
96 | - logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected."); | |
97 | - logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected."); | |
98 | - } | |
99 | - | |
100 | - /** | |
101 | - * Find relative path of text directory in the corpus directory and create | |
102 | - * similar directory structure in the output corpus directory. | |
103 | - * | |
104 | - * @param inputCorpusDir | |
105 | - * input corpus directory | |
106 | - * @param outputCorpusDir | |
107 | - * output corpus directory | |
108 | - * @param textDir | |
109 | - * input text dir | |
110 | - * @return target text dir | |
111 | - * @throws IOException | |
112 | - * when an error occurs | |
113 | - */ | |
114 | - private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException { | |
115 | - String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length()); | |
116 | - File targetDir = new File(outputCorpusDir, relativeDirPath); | |
117 | - targetDir.mkdirs(); | |
118 | - if (!targetDir.exists() || !targetDir.isDirectory()) | |
119 | - throw new IOException("Failed to create output directory at: " + targetDir); | |
120 | - return targetDir; | |
121 | - } | |
122 | - | |
123 | - /** | |
124 | - * Find mentions in Thrift text and update this Thrift text with mention | |
125 | - * annotation. | |
126 | - * | |
127 | - * @param thriftText | |
128 | - * text to annotate with mentions | |
129 | - * @throws MultiserviceException | |
130 | - * when an error occures | |
131 | - */ | |
132 | - public static void annotateThriftText(TText thriftText) throws MultiserviceException { | |
133 | - Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | |
134 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | |
135 | - ThriftSaver.updateThriftText(responseText, thriftText); | |
136 | - } | |
137 | - | |
138 | - /** | |
139 | - * Find mentions in Tei text and update this Tei text with mention | |
140 | - * annotation. This method does not save this Tei text on disk. | |
141 | - * | |
142 | - * @param teiText | |
143 | - * text to annotate with mentions | |
144 | - * @throws TEIException | |
145 | - * when an error occurs | |
146 | - */ | |
147 | - public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | |
148 | - Text responseText = TeiLoader.loadTextFromTei(teiText); | |
149 | - Detector.findMentionsInText(responseText, zeroSubjectModel); | |
150 | - TeiSaver.updateTeiText(responseText, teiText); | |
151 | - } | |
25 | + private static final Logger logger = LoggerFactory.getLogger(Main.class); | |
26 | + | |
27 | + private static final boolean GZIP_OUTPUT = true; | |
28 | + private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; | |
29 | + | |
30 | + private static ZeroSubjectDetector zeroSubjectModel; | |
31 | + | |
32 | + static { | |
33 | + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); | |
34 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
35 | + } | |
36 | + | |
37 | + private Main() { | |
38 | + } | |
39 | + | |
40 | + /** | |
41 | + * Main method for detecting mentions in corpus encoded in Tei format. | |
42 | + * | |
43 | + * @param args arguments | |
44 | + */ | |
45 | + public static void main(String[] args) { | |
46 | + | |
47 | + if (args.length != 2 && args.length != 3) { | |
48 | + logger.error("Wrong usage! should be: " + Main.class.getSimpleName() | |
49 | + + " input_dir result_dir [zero_subject_model]"); | |
50 | + return; | |
51 | + } | |
52 | + | |
53 | + File inputDir = new File(args[0]); | |
54 | + File outputDir = new File(args[1]); | |
55 | + | |
56 | + if (!inputDir.isDirectory()) { | |
57 | + logger.error(inputDir + " is not a directory!"); | |
58 | + return; | |
59 | + } | |
60 | + if (!outputDir.isDirectory()) { | |
61 | + logger.error(outputDir + " is not a directory!"); | |
62 | + return; | |
63 | + } | |
64 | + if (args.length == 3) { | |
65 | + try { | |
66 | + InputStream zeroSubjectDetectionModelStream; | |
67 | + zeroSubjectDetectionModelStream = new FileInputStream(new File(args[2])); | |
68 | + zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); | |
69 | + } catch (IOException e) { | |
70 | + logger.error("Unable to load model from file: " + args[2] + ": " + e, e); | |
71 | + return; | |
72 | + } | |
73 | + } | |
74 | + | |
75 | + int all = 0; | |
76 | + int errors = 0; | |
77 | + for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { | |
78 | + all++; | |
79 | + try { | |
80 | + File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); | |
81 | + TEICorpusText teiText = TeiLoader.readTeiText(teiDir); | |
82 | + annotateTeiText(teiText); | |
83 | + TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); | |
84 | + } catch (IOException e) { | |
85 | + logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); | |
86 | + errors++; | |
87 | + } | |
88 | + } | |
89 | + | |
90 | + logger.info(all + " texts processed succesfully."); | |
91 | + if (errors > 0) | |
92 | + logger.info(errors + " texts not processed."); | |
93 | + logger.info(ZeroSubjectDetector.verbsWithoutSubject + " verbs with zero subject detected."); | |
94 | + logger.info(ZeroSubjectDetector.verbsWithSubject + " verbs with explicit subject detected."); | |
95 | + } | |
96 | + | |
97 | + /** | |
98 | + * Find relative path of text directory in the corpus directory and create | |
99 | + * similar directory structure in the output corpus directory. | |
100 | + * | |
101 | + * @param inputCorpusDir input corpus directory | |
102 | + * @param outputCorpusDir output corpus directory | |
103 | + * @param textDir input text dir | |
104 | + * @return target text dir | |
105 | + * @throws IOException when an error occurs | |
106 | + */ | |
107 | + private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir) throws IOException { | |
108 | + String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length()); | |
109 | + File targetDir = new File(outputCorpusDir, relativeDirPath); | |
110 | + targetDir.mkdirs(); | |
111 | + if (!targetDir.exists() || !targetDir.isDirectory()) | |
112 | + throw new IOException("Failed to create output directory at: " + targetDir); | |
113 | + return targetDir; | |
114 | + } | |
115 | + | |
116 | + /** | |
117 | + * Find mentions in Thrift text and update this Thrift text with mention | |
118 | + * annotation. | |
119 | + * | |
120 | + * @param thriftText text to annotate with mentions | |
121 | + * @throws MultiserviceException when an error occures | |
122 | + */ | |
123 | + public static void annotateThriftText(TText thriftText) throws MultiserviceException { | |
124 | + Text responseText = ThriftLoader.loadTextFromThrift(thriftText); | |
125 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | |
126 | + ThriftSaver.updateThriftText(responseText, thriftText); | |
127 | + } | |
128 | + | |
129 | + /** | |
130 | + * Find mentions in Tei text and update this Tei text with mention | |
131 | + * annotation. This method does not save this Tei text on disk. | |
132 | + * | |
133 | + * @param teiText text to annotate with mentions | |
134 | + * @throws TEIException when an error occurs | |
135 | + */ | |
136 | + public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | |
137 | + Text responseText = TeiLoader.loadTextFromTei(teiText); | |
138 | + Detector.findMentionsInText(responseText, zeroSubjectModel); | |
139 | + TeiSaver.updateTeiText(responseText, teiText); | |
140 | + } | |
152 | 141 | |
153 | 142 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
4 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
6 | + | |
3 | 7 | import java.util.Collection; |
4 | 8 | import java.util.HashSet; |
5 | 9 | import java.util.List; |
6 | 10 | import java.util.Set; |
7 | 11 | |
8 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
9 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
10 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
11 | - | |
12 | 12 | public class Cleaner { |
13 | - public static void cleanUnnecessarySentenceMentions(Sentence sentence) { | |
14 | - List<Mention> mentions = sentence.getMentions(); | |
15 | - Collection<Mention> unnecessaryMentions = new HashSet<Mention>(); | |
16 | - | |
17 | - for (int i = 0; i < mentions.size(); i++) { | |
18 | - Mention m1 = mentions.get(i); | |
19 | - for (int j = i + 1; j < mentions.size(); j++) { | |
20 | - Mention m2 = mentions.get(j); | |
21 | - | |
22 | - Mention lessImportantMention = getLessImportantMention(m1, m2); | |
23 | - Mention moreImportantMention = m1 == lessImportantMention ? m2 | |
24 | - : m1; | |
25 | - | |
26 | - // same mention borders | |
27 | - if (m1.getSegments().equals(m2.getSegments())) { | |
28 | - unnecessaryMentions.add(lessImportantMention); | |
29 | - // System.out.println("Same borders: "+ m1 +", "+ | |
30 | - // m2+": "+getLessImportantMention(m1, m2)+" removed"); | |
31 | - continue; | |
32 | - } | |
33 | - // same mention heads | |
34 | - if (!m1.getHeadSegments().isEmpty() | |
35 | - && !m2.getHeadSegments().isEmpty()) { | |
36 | - if (m1.getHeadSegments().equals(m2.getHeadSegments())) { | |
37 | - | |
38 | - List<Token> segments = moreImportantMention | |
39 | - .getSegments(); | |
40 | - | |
41 | - boolean isConj = false; | |
42 | - for (Token seg : segments) { | |
43 | - if (seg.getChosenInterpretation().getCtag() | |
44 | - .equals("conj")) { | |
45 | - isConj = true; | |
46 | - break; | |
47 | - } | |
48 | - } | |
49 | - | |
50 | - if (!isConj) { | |
51 | - unnecessaryMentions.add(lessImportantMention); | |
52 | - // System.out.println("Same heads: " + m1 + ", " + | |
53 | - // m2 + ": " + lessImportantMention | |
54 | - // + " removed"); | |
55 | - | |
56 | - continue; | |
57 | - } | |
58 | - } | |
59 | - } | |
60 | - | |
61 | - // mention head equals whole other mention | |
62 | - if (m1.getHeadSegments().isEmpty() | |
63 | - && !m2.getHeadSegments().isEmpty()) { | |
64 | - if (m2.getHeadSegments().equals(m1.getSegments())) { | |
65 | - unnecessaryMentions.add(lessImportantMention); | |
66 | - continue; | |
67 | - // System.out.println("head is other mention: " + m1 + | |
68 | - // ", " + m2 + ": " | |
69 | - // + getLessImportantMention(m1, m2) + " removed"); | |
70 | - } | |
71 | - } | |
72 | - | |
73 | - // the same, but other way round | |
74 | - if (m2.getHeadSegments().isEmpty() | |
75 | - && !m1.getHeadSegments().isEmpty()) { | |
76 | - | |
77 | - if (m1.getHeadSegments().equals(m2.getSegments())) { | |
78 | - unnecessaryMentions.add(lessImportantMention); | |
79 | - continue; | |
80 | - // System.out.println("head is other mention: " + m1 + | |
81 | - // ", " + m2 + ": " | |
82 | - // + getLessImportantMention(m1, m2) + " removed"); | |
83 | - } | |
84 | - } | |
85 | - | |
86 | - // nie zawieraja sie w sobie, lecz maja czesc wspolna | |
87 | - boolean intersect = false; | |
88 | - | |
89 | - Set<Token> notInM1 = new HashSet<Token>(m2.getSegments()); | |
90 | - notInM1.removeAll(m1.getSegments()); | |
91 | - if (notInM1.size() < m2.getSegments().size()) | |
92 | - intersect = true; | |
93 | - | |
94 | - Set<Token> notInM2 = new HashSet<Token>(m1.getSegments()); | |
95 | - notInM2.removeAll(m2.getSegments()); | |
96 | - if (notInM2.size() < m1.getSegments().size()) | |
97 | - intersect = true; | |
98 | - | |
99 | - // if (intersect) | |
100 | - // System.out.println(m1+","+m2); | |
101 | - | |
102 | - if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) { | |
103 | - unnecessaryMentions.add(lessImportantMention); | |
104 | - continue; | |
105 | - // System.out.println("intersection!" + m1 + ", " + m2 + | |
106 | - // ": " | |
107 | - // + getLessImportantMention(m1, m2) + " removed"); | |
108 | - } | |
109 | - | |
110 | - } | |
111 | - } | |
112 | - | |
113 | - for (Mention m : unnecessaryMentions) | |
114 | - sentence.removeMention(m); | |
115 | - | |
116 | - // heurystyka dla usuwania rzeczy w stylu: [[Ernest][Kwiecien]] | |
117 | - unnecessaryMentions.clear(); | |
118 | - | |
119 | - OUTER: for (Mention m : sentence.getMentions()) { | |
120 | - for (Token seg : m.getSegments()) | |
121 | - if (seg.getOrth().toLowerCase().equals(seg.getOrth())) | |
122 | - continue OUTER; | |
123 | - | |
124 | - //only for children of fully capitalized mentions | |
125 | - Set<Mention> allMentions = new HashSet<Mention>(); | |
126 | - for (Token seg : m.getSegments()) | |
127 | - for (Mention m2 : seg.getMentions()) | |
128 | - if (m.getSegments().containsAll(m2.getSegments())) | |
129 | - allMentions.add(m2); | |
130 | - | |
131 | - allMentions.remove(m); | |
132 | - | |
133 | - unnecessaryMentions.addAll(allMentions); | |
134 | - } | |
135 | - for (Mention m : unnecessaryMentions) | |
136 | - sentence.removeMention(m); | |
137 | - } | |
138 | - | |
139 | - private static Mention getLessImportantMention(Mention m1, Mention m2) { | |
140 | - if (m1.getSegments().size() > m2.getSegments().size()) | |
141 | - return m2; | |
142 | - else | |
143 | - return m1; | |
144 | - } | |
    /**
     * Removes redundant mentions from a single sentence, in place. Two passes:
     * <ol>
     * <li>for each pair of mentions, the less important one (see
     * {@link #getLessImportantMention}) is dropped when the pair has identical
     * borders; identical heads (unless the larger mention contains a "conj"
     * token); when one mention's head set equals the other mention's whole
     * segment list; or when the two mentions partially overlap without either
     * containing the other;</li>
     * <li>a heuristic drops sub-mentions nested inside fully capitalized
     * mentions, e.g. [[Ernest][Kwiecien]].</li>
     * </ol>
     *
     * @param sentence sentence whose mention list is cleaned in place
     */
    public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
        List<Mention> mentions = sentence.getMentions();
        Collection<Mention> unnecessaryMentions = new HashSet<>();

        // pass 1: pairwise comparison of all mentions in the sentence
        for (int i = 0; i < mentions.size(); i++) {
            Mention m1 = mentions.get(i);
            for (int j = i + 1; j < mentions.size(); j++) {
                Mention m2 = mentions.get(j);

                Mention lessImportantMention = getLessImportantMention(m1, m2);
                Mention moreImportantMention = m1 == lessImportantMention ? m2
                        : m1;

                // same mention borders
                if (m1.getSegments().equals(m2.getSegments())) {
                    unnecessaryMentions.add(lessImportantMention);
                    continue;
                }
                // same mention heads
                if (!m1.getHeadSegments().isEmpty()
                        && !m2.getHeadSegments().isEmpty()) {
                    if (m1.getHeadSegments().equals(m2.getHeadSegments())) {

                        List<Token> segments = moreImportantMention
                                .getSegments();

                        // if the larger mention contains a conjunction token,
                        // keep both mentions (presumably a coordination —
                        // NOTE(review): confirm against PCC guidelines)
                        boolean isConj = false;
                        for (Token seg : segments) {
                            if (seg.getChosenInterpretation().getCtag()
                                    .equals("conj")) {
                                isConj = true;
                                break;
                            }
                        }

                        if (!isConj) {
                            unnecessaryMentions.add(lessImportantMention);
                            continue;
                        }
                    }
                }

                // mention head equals whole other mention
                if (m1.getHeadSegments().isEmpty()
                        && !m2.getHeadSegments().isEmpty()) {
                    if (m2.getHeadSegments().equals(m1.getSegments())) {
                        unnecessaryMentions.add(lessImportantMention);
                        continue;
                    }
                }

                // the same, but other way round
                if (m2.getHeadSegments().isEmpty()
                        && !m1.getHeadSegments().isEmpty()) {

                    if (m1.getHeadSegments().equals(m2.getSegments())) {
                        unnecessaryMentions.add(lessImportantMention);
                        continue;
                    }
                }

                // mentions do not contain each other, but share a common part
                boolean intersect = false;

                Set<Token> notInM1 = new HashSet<>(m2.getSegments());
                notInM1.removeAll(m1.getSegments());
                if (notInM1.size() < m2.getSegments().size())
                    intersect = true;

                Set<Token> notInM2 = new HashSet<>(m1.getSegments());
                notInM2.removeAll(m2.getSegments());
                if (notInM2.size() < m1.getSegments().size())
                    intersect = true;

                // both difference sets non-empty means a proper partial
                // overlap: neither mention contains the other
                if (intersect && !notInM1.isEmpty() && !notInM2.isEmpty()) {
                    unnecessaryMentions.add(lessImportantMention);
                    continue;
                }

            }
        }

        for (Mention m : unnecessaryMentions)
            sentence.removeMention(m);

        // heuristic for removing things like: [[Ernest][Kwiecien]]
        unnecessaryMentions.clear();

        OUTER:
        for (Mention m : sentence.getMentions()) {
            // skip mentions containing any segment that is not capitalized
            for (Token seg : m.getSegments())
                if (seg.getOrth().toLowerCase().equals(seg.getOrth()))
                    continue OUTER;

            // only for children of fully capitalized mentions: collect every
            // mention fully contained in m's segment span
            Set<Mention> allMentions = new HashSet<>();
            for (Token seg : m.getSegments())
                for (Mention m2 : seg.getMentions())
                    if (m.getSegments().containsAll(m2.getSegments()))
                        allMentions.add(m2);

            allMentions.remove(m);

            unnecessaryMentions.addAll(allMentions);
        }
        for (Mention m : unnecessaryMentions)
            sentence.removeMention(m);
    }
121 | + | |
122 | + private static Mention getLessImportantMention(Mention m1, Mention m2) { | |
123 | + if (m1.getSegments().size() > m2.getSegments().size()) | |
124 | + return m2; | |
125 | + else | |
126 | + return m1; | |
127 | + } | |
145 | 128 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
/**
 * Morphosyntactic tag (ctag) regex fragments used by the mention detector.
 * Each constant is a '|'-separated alternation intended for
 * {@link String#matches(String)} against a token's ctag.
 */
public final class Constants {
    /** Ctags of nominal tokens: nouns, depreciative forms, gerunds. */
    public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger";
    /** Ctags of finite/auxiliary verb forms. */
    public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt";
    /** Ctags of third- and first/second-person pronouns. */
    public static final String MORPHO_PRONOUN_CTAGS = "ppron3|ppron12";
    /** Union of noun and pronoun ctags — tokens eligible as mentions. */
    public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|"
            + MORPHO_PRONOUN_CTAGS;
    /** Ctags on the syntactic-words level eligible as mentions. */
    public static final String WORDS_CTAGS = "Noun|Ppron.*";

    private Constants() {
        // utility class; never instantiated
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection; |
2 | 2 | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | |
6 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
7 | + | |
3 | 8 | import java.util.ArrayList; |
4 | 9 | import java.util.HashSet; |
5 | 10 | import java.util.List; |
6 | 11 | import java.util.Set; |
7 | 12 | |
8 | -import org.apache.log4j.Logger; | |
9 | - | |
10 | -import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | |
11 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
12 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
13 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
17 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
18 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
19 | - | |
20 | 13 | public class Detector { |
21 | - private static Logger logger = Logger.getLogger(Detector.class); | |
22 | - | |
23 | - public static void findMentionsInText(Text text, | |
24 | - ZeroSubjectDetector zeroSubjectModel) { | |
25 | - text.clearMentions(); | |
26 | - logger.debug("Detecting mentions in text " + text.getId()); | |
27 | - for (Paragraph p : text) | |
28 | - for (Sentence s : p) | |
29 | - detectMentionsInSentence(s, zeroSubjectModel); | |
30 | - } | |
31 | - | |
32 | - private static void detectMentionsInSentence(Sentence sentence, | |
33 | - ZeroSubjectDetector zeroSubjectModel) { | |
34 | - // adding mentions | |
35 | - addMentionsByTokenCtag(sentence); | |
36 | - addMentionsBySyntacticWordsCtag(sentence); | |
37 | - addMentionsByNamedEntities(sentence); | |
38 | - addMentionsByGroups(sentence); | |
39 | - addSpeakerMentionsInSpoken(sentence); | |
40 | - | |
41 | - // zero subject detection | |
42 | - zeroSubjectModel.addZeroSubjectMentions(sentence); | |
43 | - | |
44 | - // removing mentions | |
45 | - removeTo(sentence); | |
46 | - Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
47 | - | |
48 | - // updating mention heads | |
49 | - updateMentionHeads(sentence); | |
50 | - } | |
51 | - | |
52 | - /** | |
53 | - * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak | |
54 | - * | |
55 | - * @param sentence | |
56 | - */ | |
57 | - private static void updateMentionHeads(Sentence sentence) { | |
58 | - for (Mention m : sentence.getMentions()) | |
59 | - if (m.getHeadSegments().isEmpty()) | |
60 | - m.addHeadSegment(m.getFirstSegment()); | |
61 | - } | |
62 | - | |
63 | - /** | |
64 | - * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro"" | |
65 | - * | |
66 | - * @param sentence | |
67 | - */ | |
68 | - private static void removeTo(Sentence sentence) { | |
69 | - Set<String> orths = new HashSet<String>(); | |
70 | - for (Token morph : sentence) | |
71 | - orths.add(morph.getOrth()); | |
72 | - | |
73 | - if (orths.contains("jeśli") || orths.contains("jeżeli") | |
74 | - || orths.contains("skoro")) { | |
75 | - for (Mention mention : sentence.getMentions()) { | |
76 | - List<Token> mentSegs = mention.getSegments(); | |
77 | - if (mentSegs.size() == 1 | |
78 | - && mentSegs.get(0).getBase().equals("to")) { | |
79 | - sentence.removeMention(mention); | |
80 | - } | |
81 | - } | |
82 | - } | |
83 | - } | |
84 | - | |
85 | - private static void addSpeakerMentionsInSpoken(Sentence sentence) { | |
86 | - // heurystyka dla sp1:, sp2:, MarszałekJAkistam: | |
87 | - if (sentence.size() > 2) { | |
88 | - Token first = sentence.get(0); | |
89 | - Token second = sentence.get(1); | |
90 | - if (second.getOrth().equals(":")) { | |
91 | - sentence.addMention(new Mention(first)); | |
92 | - } | |
93 | - } | |
94 | - } | |
95 | - | |
96 | - /** | |
97 | - * Wyszukuję i oznaczam wszystkie NG* | |
98 | - * | |
99 | - * @param sentence | |
100 | - */ | |
101 | - private static void addMentionsByGroups(Sentence sentence) { | |
102 | - for (SyntacticGroup group : sentence.getGroups()) { | |
103 | - if (group.getType().startsWith("NG")) { | |
104 | - List<Token> segments = group.getTokens(); | |
105 | - List<Token> heads = group.getSemanticHeadTokens(); | |
106 | - | |
107 | - sentence.addMention(new Mention(segments, heads)); | |
108 | - } | |
109 | - } | |
110 | - } | |
111 | - | |
112 | - /** | |
113 | - * Wyszukuję i oznaczam wszystkie NER | |
114 | - * | |
115 | - * @param sentence | |
116 | - */ | |
117 | - private static void addMentionsByNamedEntities(Sentence sentence) { | |
118 | - for (NamedEntity ne : sentence.getNamedEntities()) { | |
119 | - | |
120 | - List<Token> headTokens = new ArrayList<Token>(); | |
121 | - List<Token> tokens = ne.getTokens(); | |
122 | - | |
123 | - boolean containsNoun = false; | |
124 | - for (Token seg : tokens) { | |
125 | - if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) { | |
126 | - containsNoun = true; | |
127 | - break; | |
128 | - } | |
129 | - } | |
130 | - if (!containsNoun) | |
131 | - continue; | |
132 | - | |
133 | - sentence.addMention(new Mention(tokens, headTokens)); | |
134 | - } | |
135 | - } | |
136 | - | |
137 | - /** | |
138 | - * @param sentence | |
139 | - */ | |
140 | - private static void addMentionsBySyntacticWordsCtag(Sentence sentence) { | |
141 | - for (SyntacticWord w : sentence.getSyntacticWords()) | |
142 | - if (w.getCtag().matches(Constants.WORDS_CTAGS)) { | |
143 | - List<Token> tokens = w.getTokens(); | |
144 | - if (tokens.size() == 1) { | |
145 | - sentence.addMention(new Mention(tokens.get(0))); | |
146 | - } else { | |
147 | - List<Token> heads = new ArrayList<Token>(); | |
148 | - sentence.addMention(new Mention(tokens, heads)); | |
149 | - } | |
150 | - } | |
151 | - } | |
152 | - | |
153 | - /** | |
154 | - * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow | |
155 | - * skladniowych, to korzystam z niego zamiast morfoskladni | |
156 | - * | |
157 | - * @param sentence | |
158 | - */ | |
159 | - private static void addMentionsByTokenCtag(Sentence sentence) { | |
160 | - for (Token token : sentence) | |
161 | - if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | |
162 | - sentence.addMention(new Mention(token)); | |
163 | - } | |
14 | + | |
15 | + private static final Logger logger = LoggerFactory.getLogger(Detector.class); | |
16 | + | |
17 | + private Detector() { | |
18 | + } | |
19 | + | |
20 | + public static void findMentionsInText(Text text, | |
21 | + ZeroSubjectDetector zeroSubjectModel) { | |
22 | + text.clearMentions(); | |
23 | + logger.debug("Detecting mentions in text " + text.getId()); | |
24 | + for (Paragraph p : text) | |
25 | + for (Sentence s : p) | |
26 | + detectMentionsInSentence(s, zeroSubjectModel); | |
27 | + } | |
28 | + | |
29 | + private static void detectMentionsInSentence(Sentence sentence, | |
30 | + ZeroSubjectDetector zeroSubjectModel) { | |
31 | + // adding mentions | |
32 | + addMentionsByTokenCtag(sentence); | |
33 | + addMentionsBySyntacticWordsCtag(sentence); | |
34 | + addMentionsByNamedEntities(sentence); | |
35 | + addMentionsByGroups(sentence); | |
36 | + addSpeakerMentionsInSpoken(sentence); | |
37 | + | |
38 | + // zero subject detection | |
39 | + zeroSubjectModel.addZeroSubjectMentions(sentence); | |
40 | + | |
41 | + // removing mentions | |
42 | + removeTo(sentence); | |
43 | + Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
44 | + | |
45 | + // updating mention heads | |
46 | + updateMentionHeads(sentence); | |
47 | + } | |
48 | + | |
49 | + /** | |
50 | + * heurystyka ustawiajaca jako glowe pierwszy segment gdy glowy brak | |
51 | + * | |
52 | + * @param sentence | |
53 | + */ | |
54 | + private static void updateMentionHeads(Sentence sentence) { | |
55 | + for (Mention m : sentence.getMentions()) | |
56 | + if (m.getHeadSegments().isEmpty()) | |
57 | + m.addHeadSegment(m.getFirstSegment()); | |
58 | + } | |
59 | + | |
60 | + /** | |
61 | + * heurystyka dla "to" w zdaniu z ""jeśli"/"jeżeli"/"skoro"" | |
62 | + * | |
63 | + * @param sentence | |
64 | + */ | |
65 | + private static void removeTo(Sentence sentence) { | |
66 | + Set<String> orths = new HashSet<>(); | |
67 | + for (Token morph : sentence) | |
68 | + orths.add(morph.getOrth()); | |
69 | + | |
70 | + if (orths.contains("jeśli") || orths.contains("jeżeli") | |
71 | + || orths.contains("skoro")) { | |
72 | + for (Mention mention : sentence.getMentions()) { | |
73 | + List<Token> mentSegs = mention.getSegments(); | |
74 | + if (mentSegs.size() == 1 | |
75 | + && "to".equals(mentSegs.get(0).getBase())) { | |
76 | + sentence.removeMention(mention); | |
77 | + } | |
78 | + } | |
79 | + } | |
80 | + } | |
81 | + | |
82 | + private static void addSpeakerMentionsInSpoken(Sentence sentence) { | |
83 | + // heurystyka dla sp1:, sp2:, MarszałekJAkistam: | |
84 | + if (sentence.size() > 2) { | |
85 | + Token first = sentence.get(0); | |
86 | + Token second = sentence.get(1); | |
87 | + if (":".equals(second.getOrth())) { | |
88 | + sentence.addMention(new Mention(first)); | |
89 | + } | |
90 | + } | |
91 | + } | |
92 | + | |
93 | + /** | |
94 | + * Wyszukuję i oznaczam wszystkie NG* | |
95 | + * | |
96 | + * @param sentence | |
97 | + */ | |
98 | + private static void addMentionsByGroups(Sentence sentence) { | |
99 | + for (SyntacticGroup group : sentence.getGroups()) { | |
100 | + if (group.getType().startsWith("NG")) { | |
101 | + List<Token> segments = group.getTokens(); | |
102 | + List<Token> heads = group.getSemanticHeadTokens(); | |
103 | + | |
104 | + sentence.addMention(new Mention(segments, heads)); | |
105 | + } | |
106 | + } | |
107 | + } | |
108 | + | |
109 | + /** | |
110 | + * Wyszukuję i oznaczam wszystkie NER | |
111 | + * | |
112 | + * @param sentence | |
113 | + */ | |
114 | + private static void addMentionsByNamedEntities(Sentence sentence) { | |
115 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
116 | + | |
117 | + List<Token> headTokens = new ArrayList<>(); | |
118 | + List<Token> tokens = ne.getTokens(); | |
119 | + | |
120 | + boolean containsNoun = false; | |
121 | + for (Token seg : tokens) { | |
122 | + if (seg.getCtag().matches(Constants.MORPHO_NOUN_CTAGS)) { | |
123 | + containsNoun = true; | |
124 | + break; | |
125 | + } | |
126 | + } | |
127 | + if (!containsNoun) | |
128 | + continue; | |
129 | + | |
130 | + sentence.addMention(new Mention(tokens, headTokens)); | |
131 | + } | |
132 | + } | |
133 | + | |
134 | + private static void addMentionsBySyntacticWordsCtag(Sentence sentence) { | |
135 | + for (SyntacticWord w : sentence.getSyntacticWords()) | |
136 | + if (w.getCtag().matches(Constants.WORDS_CTAGS)) { | |
137 | + List<Token> tokens = w.getTokens(); | |
138 | + if (tokens.size() == 1) { | |
139 | + sentence.addMention(new Mention(tokens.get(0))); | |
140 | + } else { | |
141 | + List<Token> heads = new ArrayList<>(); | |
142 | + sentence.addMention(new Mention(tokens, heads)); | |
143 | + } | |
144 | + } | |
145 | + } | |
146 | + | |
147 | + /** | |
148 | + * Wyszukuję wszystkie interesujace czesci mowy jesli jest poziom slow | |
149 | + * skladniowych, to korzystam z niego zamiast morfoskladni | |
150 | + * | |
151 | + * @param sentence | |
152 | + */ | |
153 | + private static void addMentionsByTokenCtag(Sentence sentence) { | |
154 | + for (Token token : sentence) | |
155 | + if (token.getCtag().matches(Constants.MORPHO_CTAGS)) | |
156 | + sentence.addMention(new Mention(token)); | |
157 | + } | |
164 | 158 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/FeatureGeneration.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.util.ArrayList; | |
4 | -import java.util.Arrays; | |
5 | -import java.util.HashMap; | |
6 | -import java.util.HashSet; | |
7 | -import java.util.Iterator; | |
8 | -import java.util.LinkedList; | |
9 | -import java.util.List; | |
10 | -import java.util.Map; | |
11 | -import java.util.Set; | |
12 | - | |
13 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
17 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
3 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
18 | 4 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; |
19 | 5 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; |
20 | 6 | |
7 | +import java.util.*; | |
8 | + | |
21 | 9 | public class FeatureGeneration { |
22 | 10 | final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo", |
23 | 11 | "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" })); |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.io.File; | |
4 | -import java.util.ArrayList; | |
5 | -import java.util.HashSet; | |
6 | -import java.util.List; | |
7 | -import java.util.Map.Entry; | |
8 | -import java.util.Set; | |
9 | -import java.util.TreeMap; | |
10 | -import java.util.TreeSet; | |
11 | - | |
12 | -import org.apache.log4j.Logger; | |
13 | - | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
17 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
18 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
19 | 6 | import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; |
20 | 7 | import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; |
21 | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
... | ... | @@ -25,154 +12,161 @@ import weka.core.FastVector; |
25 | 12 | import weka.core.Instance; |
26 | 13 | import weka.core.Instances; |
27 | 14 | |
15 | +import java.io.File; | |
16 | +import java.util.*; | |
17 | +import java.util.Map.Entry; | |
18 | + | |
28 | 19 | public class InstanceCreator { |
29 | 20 | |
30 | - final private static Logger logger = Logger.getLogger(InstanceCreator.class); | |
31 | - final private static TEI_IO teiIO = TEI_IO.getInstance(); | |
32 | - | |
33 | - public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { | |
34 | - int allTexts = 0; | |
35 | - int exceptions = 0; | |
36 | - int allSentences = 0; | |
37 | - | |
38 | - List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
39 | - for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | |
40 | - try { | |
41 | - allTexts++; | |
42 | - logger.info("Processing text " + textDir); | |
43 | - TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | |
44 | - Text text = TeiLoader.loadTextFromTei(ct); | |
45 | - | |
46 | - for (Paragraph p : text) | |
47 | - for (Sentence s : p) { | |
48 | - allSentences++; | |
49 | - loadExamplesFromSentence(quasiVerbs, examples, s); | |
50 | - } | |
51 | - | |
52 | - } catch (Exception e) { | |
53 | - logger.error(e.getLocalizedMessage()); | |
54 | - exceptions++; | |
55 | - } | |
56 | - } | |
57 | - | |
58 | - logger.info(allTexts + " texts found."); | |
59 | - if (exceptions != 0) | |
60 | - logger.error(exceptions + " texts with exceptions."); | |
61 | - logger.info(allSentences + " sentences found."); | |
62 | - | |
63 | - return examples; | |
64 | - } | |
65 | - | |
66 | - public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, | |
67 | - Sentence s) { | |
68 | - | |
69 | - // collect positive examples | |
70 | - Set<Token> positive = new HashSet<>(); | |
71 | - for (Mention m : s.getMentions()) { | |
72 | - if (FeatureGeneration.isVerb(m)) { | |
73 | - positive.addAll(m.getSegments()); | |
74 | - } | |
75 | - } | |
76 | - | |
77 | - for (Token m : s) { | |
78 | - if (!FeatureGeneration.isVerb(m)) | |
79 | - continue; | |
80 | - | |
81 | - TreeMap<String, Object> features = new TreeMap<>(); | |
82 | - if (positive.contains(m)) { | |
83 | - features.put("class", Boolean.valueOf(true)); | |
84 | - } else { | |
85 | - features.put("class", Boolean.valueOf(false)); | |
86 | - } | |
87 | - | |
88 | - FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); | |
89 | - examples.add(features); | |
90 | - } | |
91 | - } | |
92 | - | |
93 | - public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | |
94 | - | |
95 | - TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | |
96 | - TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | |
97 | - TreeMap<String, Set<String>> att2values = new TreeMap<>(); | |
98 | - for (TreeMap<String, Object> example : examples) { | |
99 | - for (Entry<String, Object> e : example.entrySet()) { | |
100 | - String key = e.getKey(); | |
101 | - Object val = e.getValue(); | |
102 | - if (val instanceof Integer || val instanceof Double) { | |
103 | - doubleAttsOccurred.add(key); | |
104 | - continue; | |
105 | - } | |
106 | - if (val instanceof Boolean) { | |
107 | - booleanAttsOccurred.add(key); | |
108 | - continue; | |
109 | - } | |
110 | - if (!att2values.containsKey(key)) | |
111 | - att2values.put(key, new HashSet<String>()); | |
112 | - att2values.get(key).add(val.toString()); | |
113 | - } | |
114 | - } | |
115 | - | |
116 | - List<Attribute> atts = new ArrayList<>(); | |
117 | - | |
118 | - // double attributes | |
119 | - for (String attName : doubleAttsOccurred) { | |
120 | - Attribute att = new Attribute(attName); | |
121 | - atts.add(att); | |
122 | - } | |
123 | - | |
124 | - // boolean attributes (treated as nominal) | |
125 | - FastVector values = new FastVector(2); | |
126 | - values.addElement("false"); | |
127 | - values.addElement("true"); | |
128 | - for (String attName : booleanAttsOccurred) { | |
129 | - Attribute att = new Attribute(attName, values); | |
130 | - atts.add(att); | |
131 | - } | |
132 | - | |
133 | - // nominal attributes | |
134 | - for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | |
135 | - FastVector vals = new FastVector(attVals.getValue().size()); | |
136 | - for (String val : attVals.getValue()) | |
137 | - vals.addElement(val); | |
138 | - Attribute att = new Attribute(attVals.getKey(), vals); | |
139 | - atts.add(att); | |
140 | - } | |
141 | - | |
142 | - FastVector fvWekaAttributes = new FastVector(atts.size()); | |
143 | - for (Attribute attr : atts) { | |
144 | - fvWekaAttributes.addElement(attr); | |
145 | - } | |
146 | - | |
147 | - Instances data = new Instances("Zero", fvWekaAttributes, 10); | |
148 | - data.setClass(data.attribute(classFeatureName)); | |
149 | - return data; | |
150 | - } | |
151 | - | |
152 | - public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | |
153 | - for (TreeMap<String, Object> example : examples) { | |
154 | - Instance instance = new Instance(instances.numAttributes()); | |
155 | - | |
156 | - for (Entry<String, Object> e : example.entrySet()) { | |
157 | - Object val = e.getValue(); | |
158 | - String name = e.getKey(); | |
159 | - if (val instanceof Integer) { | |
160 | - instance.setValue(instances.attribute(name), (int) val); | |
161 | - } else if (val instanceof Boolean) { | |
162 | - instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | |
163 | - } else { | |
164 | - int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | |
165 | - if (indexOfValue == -1) { | |
166 | - logger.debug("Unkown value: " + val.toString() + " of feature: " + name | |
167 | - + ". Marking as missing value."); | |
168 | - instance.setMissing(instances.attribute(name)); | |
169 | - } else | |
170 | - instance.setValue(instances.attribute(name), indexOfValue); | |
171 | - } | |
172 | - } | |
173 | - | |
174 | - instance.setDataset(instances); | |
175 | - instances.add(instance); | |
176 | - } | |
177 | - } | |
21 | + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); | |
22 | + private static final TEI_IO teiIO = TEI_IO.getInstance(); | |
23 | + | |
24 | + private InstanceCreator() { | |
25 | + } | |
26 | + | |
27 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { | |
28 | + int allTexts = 0; | |
29 | + int exceptions = 0; | |
30 | + int allSentences = 0; | |
31 | + | |
32 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
33 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | |
34 | + try { | |
35 | + allTexts++; | |
36 | + logger.info("Processing text " + textDir); | |
37 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | |
38 | + Text text = TeiLoader.loadTextFromTei(ct); | |
39 | + | |
40 | + for (Paragraph p : text) | |
41 | + for (Sentence s : p) { | |
42 | + allSentences++; | |
43 | + loadExamplesFromSentence(quasiVerbs, examples, s); | |
44 | + } | |
45 | + | |
46 | + } catch (Exception e) { | |
47 | + logger.error(e.getLocalizedMessage()); | |
48 | + exceptions++; | |
49 | + } | |
50 | + } | |
51 | + | |
52 | + logger.info(allTexts + " texts found."); | |
53 | + if (exceptions != 0) | |
54 | + logger.error(exceptions + " texts with exceptions."); | |
55 | + logger.info(allSentences + " sentences found."); | |
56 | + | |
57 | + return examples; | |
58 | + } | |
59 | + | |
60 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, | |
61 | + Sentence s) { | |
62 | + | |
63 | + // collect positive examples | |
64 | + Set<Token> positive = new HashSet<>(); | |
65 | + for (Mention m : s.getMentions()) { | |
66 | + if (FeatureGeneration.isVerb(m)) { | |
67 | + positive.addAll(m.getSegments()); | |
68 | + } | |
69 | + } | |
70 | + | |
71 | + for (Token m : s) { | |
72 | + if (!FeatureGeneration.isVerb(m)) | |
73 | + continue; | |
74 | + | |
75 | + TreeMap<String, Object> features = new TreeMap<>(); | |
76 | + if (positive.contains(m)) { | |
77 | + features.put("class", Boolean.valueOf(true)); | |
78 | + } else { | |
79 | + features.put("class", Boolean.valueOf(false)); | |
80 | + } | |
81 | + | |
82 | + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); | |
83 | + examples.add(features); | |
84 | + } | |
85 | + } | |
86 | + | |
87 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | |
88 | + | |
89 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | |
90 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | |
91 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | |
92 | + for (TreeMap<String, Object> example : examples) { | |
93 | + for (Entry<String, Object> e : example.entrySet()) { | |
94 | + String key = e.getKey(); | |
95 | + Object val = e.getValue(); | |
96 | + if (val instanceof Integer || val instanceof Double) { | |
97 | + doubleAttsOccurred.add(key); | |
98 | + continue; | |
99 | + } | |
100 | + if (val instanceof Boolean) { | |
101 | + booleanAttsOccurred.add(key); | |
102 | + continue; | |
103 | + } | |
104 | + if (!att2values.containsKey(key)) | |
105 | + att2values.put(key, new HashSet<>()); | |
106 | + att2values.get(key).add(val.toString()); | |
107 | + } | |
108 | + } | |
109 | + | |
110 | + List<Attribute> atts = new ArrayList<>(); | |
111 | + | |
112 | + // double attributes | |
113 | + for (String attName : doubleAttsOccurred) { | |
114 | + Attribute att = new Attribute(attName); | |
115 | + atts.add(att); | |
116 | + } | |
117 | + | |
118 | + // boolean attributes (treated as nominal) | |
119 | + FastVector values = new FastVector(2); | |
120 | + values.addElement("false"); | |
121 | + values.addElement("true"); | |
122 | + for (String attName : booleanAttsOccurred) { | |
123 | + Attribute att = new Attribute(attName, values); | |
124 | + atts.add(att); | |
125 | + } | |
126 | + | |
127 | + // nominal attributes | |
128 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | |
129 | + FastVector vals = new FastVector(attVals.getValue().size()); | |
130 | + for (String val : attVals.getValue()) | |
131 | + vals.addElement(val); | |
132 | + Attribute att = new Attribute(attVals.getKey(), vals); | |
133 | + atts.add(att); | |
134 | + } | |
135 | + | |
136 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | |
137 | + for (Attribute attr : atts) { | |
138 | + fvWekaAttributes.addElement(attr); | |
139 | + } | |
140 | + | |
141 | + Instances data = new Instances("Zero", fvWekaAttributes, 10); | |
142 | + data.setClass(data.attribute(classFeatureName)); | |
143 | + return data; | |
144 | + } | |
145 | + | |
146 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | |
147 | + for (TreeMap<String, Object> example : examples) { | |
148 | + Instance instance = new Instance(instances.numAttributes()); | |
149 | + | |
150 | + for (Entry<String, Object> e : example.entrySet()) { | |
151 | + Object val = e.getValue(); | |
152 | + String name = e.getKey(); | |
153 | + if (val instanceof Integer) { | |
154 | + instance.setValue(instances.attribute(name), (int) val); | |
155 | + } else if (val instanceof Boolean) { | |
156 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | |
157 | + } else { | |
158 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | |
159 | + if (indexOfValue == -1) { | |
160 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | |
161 | + + ". Marking as missing value."); | |
162 | + instance.setMissing(instances.attribute(name)); | |
163 | + } else | |
164 | + instance.setValue(instances.attribute(name), indexOfValue); | |
165 | + } | |
166 | + } | |
167 | + | |
168 | + instance.setDataset(instances); | |
169 | + instances.add(instance); | |
170 | + } | |
171 | + } | |
178 | 172 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Model.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.io.Serializable; | |
4 | -import java.util.List; | |
5 | -import java.util.Set; | |
6 | -import java.util.TreeMap; | |
7 | - | |
8 | -import org.apache.log4j.Logger; | |
9 | - | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
10 | 5 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
11 | 6 | import weka.classifiers.Classifier; |
12 | 7 | import weka.core.Instance; |
13 | 8 | import weka.core.Instances; |
14 | 9 | |
10 | +import java.io.Serializable; | |
11 | +import java.util.List; | |
12 | +import java.util.Set; | |
13 | +import java.util.TreeMap; | |
14 | + | |
15 | 15 | public class Model implements Serializable { |
16 | 16 | |
17 | - private static final long serialVersionUID = 3351727361273283076L; | |
18 | - private static final Logger logger = Logger.getLogger(Model.class); | |
19 | - | |
20 | - private Classifier classifier; | |
21 | - private Set<String> quasiVerbs; | |
22 | - private Instances instances; | |
23 | - | |
24 | - public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | |
25 | - this.classifier = classifier; | |
26 | - this.instances = instances; | |
27 | - this.quasiVerbs = quasiVerbs; | |
28 | - } | |
29 | - | |
30 | - public boolean isZeroSubject(Instance instance, Sentence sentence) { | |
31 | - try { | |
32 | - double response = this.classifier.classifyInstance(instance); | |
33 | - return response > 0; | |
34 | - } catch (Exception e) { | |
35 | - logger.error("Error classyfing verb in sentence: " + sentence); | |
36 | - return false; | |
37 | - } | |
38 | - } | |
39 | - | |
40 | - public Instances getInstances(List<TreeMap<String, Object>> examples) { | |
41 | - Instances instances = new Instances(this.instances); | |
42 | - InstanceCreator.fillInstances(examples, instances); | |
43 | - return instances; | |
44 | - } | |
45 | - | |
46 | - public Set<String> getQuasiVerbs() { | |
47 | - return quasiVerbs; | |
48 | - } | |
17 | + private static final long serialVersionUID = 3351727361273283076L; | |
18 | + private static final Logger logger = LoggerFactory.getLogger(Model.class); | |
19 | + | |
20 | + private Classifier classifier; | |
21 | + private Set<String> quasiVerbs; | |
22 | + private Instances instances; | |
23 | + | |
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | |
25 | + this.classifier = classifier; | |
26 | + this.instances = instances; | |
27 | + this.quasiVerbs = quasiVerbs; | |
28 | + } | |
29 | + | |
30 | + public boolean isZeroSubject(Instance instance, Sentence sentence) { | |
31 | + try { | |
32 | + double response = this.classifier.classifyInstance(instance); | |
33 | + return response > 0; | |
34 | + } catch (Exception e) { | |
35 | + logger.error("Error classifying verb in sentence: " + sentence, e); | |
36 | + return false; | |
37 | + } | |
38 | + } | |
39 | + | |
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | |
41 | + Instances instances = new Instances(this.instances); | |
42 | + InstanceCreator.fillInstances(examples, instances); | |
43 | + return instances; | |
44 | + } | |
45 | + | |
46 | + public Set<String> getQuasiVerbs() { | |
47 | + return quasiVerbs; | |
48 | + } | |
49 | 49 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Serializer.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.io.InputStream; | |
4 | - | |
5 | 3 | import weka.core.SerializationHelper; |
6 | 4 | |
5 | +import java.io.InputStream; | |
6 | + | |
7 | 7 | public class Serializer { |
8 | 8 | |
9 | 9 | public static void saveModel(Model m, String targetModelFilePath) throws Exception { |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/Trainer.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.io.BufferedReader; | |
4 | -import java.io.File; | |
5 | -import java.io.IOException; | |
6 | -import java.io.InputStream; | |
7 | -import java.io.InputStreamReader; | |
8 | -import java.util.HashSet; | |
9 | -import java.util.List; | |
10 | -import java.util.Random; | |
11 | -import java.util.Set; | |
12 | -import java.util.TreeMap; | |
13 | - | |
14 | -import org.apache.log4j.Logger; | |
15 | - | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
16 | 5 | import weka.classifiers.Evaluation; |
17 | 6 | import weka.classifiers.rules.JRip; |
18 | 7 | import weka.classifiers.rules.JRip.RipperRule; |
... | ... | @@ -20,104 +9,111 @@ import weka.core.Attribute; |
20 | 9 | import weka.core.Instance; |
21 | 10 | import weka.core.Instances; |
22 | 11 | |
12 | +import java.io.*; | |
13 | +import java.util.*; | |
14 | + | |
23 | 15 | public class Trainer { |
24 | 16 | |
25 | - final private static Logger logger = Logger.getLogger(Trainer.class); | |
26 | - | |
27 | - private static final boolean DO_CV = false; | |
28 | - private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | |
29 | - | |
30 | - public static void main(String[] args) { | |
31 | - | |
32 | - if (args.length != 2) { | |
33 | - logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | |
34 | - + " trainDir targetModelFile"); | |
35 | - return; | |
36 | - } | |
37 | - | |
38 | - File dataDir = new File(args[0]); | |
39 | - String targetModelFilePath = args[1]; | |
40 | - | |
41 | - if (!dataDir.isDirectory()) { | |
42 | - logger.error(dataDir + " is not a directory!"); | |
43 | - return; | |
44 | - } | |
45 | - | |
46 | - Set<String> quasiVerbs = loadQuasiVerbs(); | |
47 | - | |
48 | - List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | |
49 | - Instances instances = InstanceCreator.createInstances(examples, "class"); | |
50 | - InstanceCreator.fillInstances(examples, instances); | |
51 | - | |
52 | - printStats(instances); | |
53 | - | |
54 | - try { | |
55 | - JRip model = new JRip(); | |
56 | - | |
57 | - if (DO_CV) { | |
58 | - logger.info("Crossvalidation..."); | |
59 | - Evaluation eval = new Evaluation(instances); | |
60 | - eval.crossValidateModel(model, instances, 10, new Random(1)); | |
61 | - logger.info(eval.toSummaryString()); | |
62 | - logger.info(eval.toMatrixString()); | |
63 | - logger.info(eval.toClassDetailsString()); | |
64 | - } | |
65 | - | |
66 | - logger.info("Building final classifier..."); | |
67 | - model = new JRip(); | |
68 | - model.buildClassifier(instances); | |
69 | - logger.info(model.getRuleset().size() + " rules generated."); | |
70 | - for (int i = 0; i < model.getRuleset().size(); i++) { | |
71 | - RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | |
72 | - logger.info("\t" + v.toString(instances.classAttribute())); | |
73 | - } | |
74 | - | |
75 | - instances.delete(); | |
76 | - logger.info("Features stats:"); | |
77 | - for (int i = 0; i < instances.numAttributes(); i++) { | |
78 | - Attribute att = instances.attribute(i); | |
79 | - logger.info(i + ".\t" + att.toString()); | |
80 | - } | |
81 | - | |
82 | - logger.info("Saving classifier..."); | |
83 | - Model m = new Model(model, instances, quasiVerbs); | |
84 | - Serializer.saveModel(m, targetModelFilePath); | |
85 | - logger.info("Done."); | |
86 | - | |
87 | - } catch (Exception e) { | |
88 | - logger.error("Error: " + e); | |
89 | - } | |
90 | - } | |
91 | - | |
92 | - private static Set<String> loadQuasiVerbs() { | |
93 | - Set<String> quasiVerbs = new HashSet<>(); | |
94 | - InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | |
95 | - try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | |
96 | - String line = null; | |
97 | - while ((line = br.readLine()) != null) { | |
98 | - quasiVerbs.add(line.trim()); | |
99 | - } | |
100 | - } catch (IOException e) { | |
101 | - logger.error(e.getLocalizedMessage()); | |
102 | - } | |
103 | - return quasiVerbs; | |
104 | - } | |
105 | - | |
106 | - private static void printStats(Instances instances) { | |
107 | - int positive = 0; | |
108 | - int negative = 0; | |
109 | - for (int i = 0; i < instances.numInstances(); i++) { | |
110 | - Instance inst = instances.instance(i); | |
111 | - if (inst.classValue() > 0) | |
112 | - negative++; | |
113 | - else | |
114 | - positive++; | |
115 | - } | |
116 | - logger.info(positive + " positive examples"); | |
117 | - logger.info(negative + " negative examples"); | |
118 | - logger.info((positive + negative) + " examples total"); | |
119 | - logger.info((instances.numAttributes() - 1) + " attributes"); | |
120 | - logger.info(instances.toSummaryString()); | |
121 | - } | |
17 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); | |
18 | + | |
19 | + private static final boolean DO_CV = false; | |
20 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | |
21 | + | |
22 | + private Trainer() { | |
23 | + } | |
24 | + | |
25 | + public static void main(String[] args) { | |
26 | + | |
27 | + if (args.length != 2) { | |
28 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | |
29 | + + " trainDir targetModelFile"); | |
30 | + return; | |
31 | + } | |
32 | + | |
33 | + File dataDir = new File(args[0]); | |
34 | + String targetModelFilePath = args[1]; | |
35 | + | |
36 | + if (!dataDir.isDirectory()) { | |
37 | + logger.error(dataDir + " is not a directory!"); | |
38 | + return; | |
39 | + } | |
40 | + | |
41 | + Set<String> quasiVerbs = loadQuasiVerbs(); | |
42 | + | |
43 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | |
44 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | |
45 | + InstanceCreator.fillInstances(examples, instances); | |
46 | + | |
47 | + printStats(instances); | |
48 | + | |
49 | + try { | |
50 | + JRip model; | |
51 | + | |
52 | + if (DO_CV) { | |
53 | + logger.info("Crossvalidation..."); | |
54 | + model = new JRip(); | |
55 | + Evaluation eval = new Evaluation(instances); | |
56 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | |
57 | + logger.info(eval.toSummaryString()); | |
58 | + logger.info(eval.toMatrixString()); | |
59 | + logger.info(eval.toClassDetailsString()); | |
60 | + } | |
61 | + | |
62 | + logger.info("Building final classifier..."); | |
63 | + model = new JRip(); | |
64 | + model.buildClassifier(instances); | |
65 | + logger.info(model.getRuleset().size() + " rules generated."); | |
66 | + for (int i = 0; i < model.getRuleset().size(); i++) { | |
67 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | |
68 | + logger.info("\t" + v.toString(instances.classAttribute())); | |
69 | + } | |
70 | + | |
71 | + instances.delete(); | |
72 | + logger.info("Features stats:"); | |
73 | + for (int i = 0; i < instances.numAttributes(); i++) { | |
74 | + Attribute att = instances.attribute(i); | |
75 | + logger.info(i + ".\t" + att.toString()); | |
76 | + } | |
77 | + | |
78 | + logger.info("Saving classifier..."); | |
79 | + Model m = new Model(model, instances, quasiVerbs); | |
80 | + Serializer.saveModel(m, targetModelFilePath); | |
81 | + logger.info("Done."); | |
82 | + | |
83 | + } catch (Exception e) { | |
84 | + logger.error("Error: " + e); | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + private static Set<String> loadQuasiVerbs() { | |
89 | + Set<String> quasiVerbs = new HashSet<>(); | |
90 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | |
91 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | |
92 | + String line; | |
93 | + while ((line = br.readLine()) != null) { | |
94 | + quasiVerbs.add(line.trim()); | |
95 | + } | |
96 | + } catch (IOException e) { | |
97 | + logger.error(e.getLocalizedMessage(), e); | |
98 | + } | |
99 | + return quasiVerbs; | |
100 | + } | |
101 | + | |
102 | + private static void printStats(Instances instances) { | |
103 | + int positive = 0; | |
104 | + int negative = 0; | |
105 | + for (int i = 0; i < instances.numInstances(); i++) { | |
106 | + Instance inst = instances.instance(i); | |
107 | + if (inst.classValue() > 0) | |
108 | + negative++; | |
109 | + else | |
110 | + positive++; | |
111 | + } | |
112 | + logger.info(positive + " positive examples"); | |
113 | + logger.info(negative + " negative examples"); | |
114 | + logger.info((positive + negative) + " examples total"); | |
115 | + logger.info((instances.numAttributes() - 1) + " attributes"); | |
116 | + logger.info(instances.toSummaryString()); | |
117 | + } | |
122 | 118 | |
123 | 119 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/ZeroSubjectDetector.java
1 | 1 | package pl.waw.ipipan.zil.core.md.detection.zero; |
2 | 2 | |
3 | -import java.io.File; | |
4 | -import java.io.InputStream; | |
5 | -import java.util.ArrayList; | |
6 | -import java.util.HashSet; | |
7 | -import java.util.List; | |
8 | -import java.util.Set; | |
9 | -import java.util.TreeMap; | |
10 | - | |
11 | -import org.apache.log4j.Logger; | |
12 | - | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
13 | 5 | import pl.waw.ipipan.zil.core.md.entities.Mention; |
14 | 6 | import pl.waw.ipipan.zil.core.md.entities.Sentence; |
15 | 7 | import pl.waw.ipipan.zil.core.md.entities.Token; |
16 | 8 | import weka.core.Instances; |
17 | 9 | |
10 | +import java.io.File; | |
11 | +import java.io.InputStream; | |
12 | +import java.util.*; | |
13 | + | |
18 | 14 | public class ZeroSubjectDetector { |
19 | - final private static Logger logger = Logger.getLogger(ZeroSubjectDetector.class); | |
20 | 15 | |
21 | - private Model model; | |
22 | - private Set<String> quasiVerbs = new HashSet<>(); | |
16 | + final private static Logger logger = LoggerFactory.getLogger(ZeroSubjectDetector.class); | |
17 | + | |
18 | + private Model model; | |
19 | + private Set<String> quasiVerbs = new HashSet<>(); | |
23 | 20 | |
24 | - public static int verbsWithoutSubject = 0; | |
25 | - public static int verbsWithSubject = 0; | |
21 | + public static int verbsWithoutSubject = 0; | |
22 | + public static int verbsWithSubject = 0; | |
26 | 23 | |
27 | - public void addZeroSubjectMentions(Sentence sentence) { | |
28 | - List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
29 | - InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); | |
30 | - if (examples.isEmpty()) | |
31 | - return; | |
24 | + public void addZeroSubjectMentions(Sentence sentence) { | |
25 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
26 | + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); | |
27 | + if (examples.isEmpty()) | |
28 | + return; | |
32 | 29 | |
33 | - Instances instances = model.getInstances(examples); | |
30 | + Instances instances = model.getInstances(examples); | |
34 | 31 | |
35 | - // label instances | |
36 | - List<Boolean> areZeros = new ArrayList<>(); | |
37 | - for (int i = 0; i < instances.numInstances(); i++) { | |
38 | - boolean isZero = model.isZeroSubject(instances.instance(i), sentence); | |
39 | - areZeros.add(isZero); | |
40 | - if (isZero) | |
41 | - verbsWithoutSubject++; | |
42 | - else | |
43 | - verbsWithSubject++; | |
44 | - } | |
32 | + // label instances | |
33 | + List<Boolean> areZeros = new ArrayList<>(); | |
34 | + for (int i = 0; i < instances.numInstances(); i++) { | |
35 | + boolean isZero = model.isZeroSubject(instances.instance(i), sentence); | |
36 | + areZeros.add(isZero); | |
37 | + if (isZero) | |
38 | + verbsWithoutSubject++; | |
39 | + else | |
40 | + verbsWithSubject++; | |
41 | + } | |
45 | 42 | |
46 | - int i = 0; | |
47 | - for (Token m : sentence) { | |
48 | - if (!FeatureGeneration.isVerb(m)) | |
49 | - continue; | |
50 | - if (areZeros.get(i)) | |
51 | - sentence.addMention(new Mention(m, true)); | |
52 | - i++; | |
53 | - } | |
54 | - } | |
43 | + int i = 0; | |
44 | + for (Token m : sentence) { | |
45 | + if (!FeatureGeneration.isVerb(m)) | |
46 | + continue; | |
47 | + if (areZeros.get(i)) | |
48 | + sentence.addMention(new Mention(m, true)); | |
49 | + i++; | |
50 | + } | |
51 | + } | |
55 | 52 | |
56 | - public ZeroSubjectDetector(File zeroSubjectDetectionModel) { | |
57 | - try { | |
58 | - this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | |
59 | - this.quasiVerbs = this.model.getQuasiVerbs(); | |
60 | - } catch (Exception e) { | |
61 | - logger.error("Error loading model:" + e); | |
62 | - } | |
63 | - } | |
53 | + public ZeroSubjectDetector(File zeroSubjectDetectionModel) { | |
54 | + try { | |
55 | + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | |
56 | + this.quasiVerbs = this.model.getQuasiVerbs(); | |
57 | + } catch (Exception e) { | |
58 | + logger.error("Error loading model:" + e); | |
59 | + } | |
60 | + } | |
64 | 61 | |
65 | - public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) { | |
66 | - try { | |
67 | - this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | |
68 | - this.quasiVerbs = this.model.getQuasiVerbs(); | |
69 | - } catch (Exception e) { | |
70 | - logger.error("Error loading model:" + e); | |
71 | - } | |
72 | - } | |
62 | + public ZeroSubjectDetector(InputStream zeroSubjectDetectionModelStream) { | |
63 | + try { | |
64 | + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | |
65 | + this.quasiVerbs = this.model.getQuasiVerbs(); | |
66 | + } catch (Exception e) { | |
67 | + logger.error("Error loading model:" + e); | |
68 | + } | |
69 | + } | |
73 | 70 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
1 | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | |
3 | -import java.util.ArrayList; | |
4 | -import java.util.Collection; | |
5 | -import java.util.Collections; | |
6 | -import java.util.HashSet; | |
7 | -import java.util.List; | |
8 | -import java.util.Set; | |
3 | +import java.util.*; | |
9 | 4 | |
10 | 5 | public class Token implements Comparable<Token> { |
11 | 6 | private Sentence sentence; |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
1 | 1 | package pl.waw.ipipan.zil.core.md.io.tei; |
2 | 2 | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*; | |
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | |
8 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | |
9 | + | |
3 | 10 | import java.io.File; |
4 | 11 | import java.util.ArrayList; |
5 | 12 | import java.util.HashMap; |
6 | 13 | import java.util.List; |
7 | 14 | import java.util.Map; |
8 | 15 | |
9 | -import org.apache.log4j.Logger; | |
10 | - | |
11 | -import pl.waw.ipipan.zil.core.md.entities.Interpretation; | |
12 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
13 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
17 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
18 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
19 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
20 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | |
21 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIGroup; | |
22 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIInterpretation; | |
23 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | |
24 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | |
25 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEINamedEntity; | |
26 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph; | |
27 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence; | |
28 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISyntacticEntity; | |
29 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIWord; | |
30 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; | |
31 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | |
32 | - | |
33 | 16 | public class TeiLoader { |
34 | 17 | |
35 | - private static Logger logger = Logger.getLogger(TeiLoader.class); | |
36 | - private static TEI_IO teiAPI = TEI_IO.getInstance(); | |
37 | - | |
38 | - public static TEICorpusText readTeiText(File teiDir) throws TEIException { | |
39 | - return teiAPI.readFromNKJPDirectory(teiDir); | |
40 | - } | |
41 | - | |
42 | - public static Text loadTextFromTei(TEICorpusText teiText) { | |
43 | - Text text = new Text(teiText.getCorpusHeader().getId()); | |
44 | - | |
45 | - logger.debug("Loading tei text " + text.getId() + "..."); | |
46 | - for (TEIParagraph teiP : teiText.getParagraphs()) | |
47 | - loadParagraph(text, teiP); | |
48 | - logger.debug("Tei text loaded."); | |
49 | - | |
50 | - return text; | |
51 | - } | |
52 | - | |
53 | - private static void loadParagraph(Text text, TEIParagraph teiP) { | |
54 | - Paragraph p = new Paragraph(); | |
55 | - text.add(p); | |
56 | - for (TEISentence teiS : teiP.getSentences()) | |
57 | - loadSentence(p, teiS); | |
58 | - } | |
59 | - | |
60 | - private static void loadSentence(Paragraph p, TEISentence teiS) { | |
61 | - Sentence s = new Sentence(); | |
62 | - p.add(s); | |
63 | - Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); | |
64 | - for (TEIMorph teiM : teiS.getMorphs()) { | |
65 | - Token token = loadToken(s, teiM); | |
66 | - teiMorph2Segment.put(teiM, token); | |
67 | - } | |
68 | - for (TEINamedEntity ne : teiS.getAllNamedEntities()) | |
69 | - loadNE(s, ne, teiMorph2Segment); | |
70 | - for (TEIWord w : teiS.getAllWords()) | |
71 | - loadSyntacticWord(s, w, teiMorph2Segment); | |
72 | - for (TEIGroup g : teiS.getAllGroups()) | |
73 | - loadSyntacticGroup(s, g, teiMorph2Segment); | |
74 | - for (TEIMention m : teiS.getAllMentions()) | |
75 | - loadMentions(s, m, teiMorph2Segment); | |
76 | - } | |
77 | - | |
78 | - private static void loadMentions(Sentence s, TEIMention m, | |
79 | - Map<TEIMorph, Token> teiMorph2Segment) { | |
80 | - List<Token> tokens = new ArrayList<>(); | |
81 | - for (TEIMorph mo : m.getMorphs()) | |
82 | - tokens.add(teiMorph2Segment.get(mo)); | |
83 | - List<Token> headTokens = new ArrayList<>(); | |
84 | - for (TEIMorph mo : m.getHeadMorphs()) | |
85 | - headTokens.add(teiMorph2Segment.get(mo)); | |
86 | - s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | |
87 | - } | |
88 | - | |
89 | - private static void loadSyntacticGroup(Sentence s, TEIGroup g, | |
90 | - Map<TEIMorph, Token> teiMorph2Segment) { | |
91 | - String type = g.getType(); | |
92 | - | |
93 | - List<Token> tokens = new ArrayList<>(); | |
94 | - for (TEIMorph m : g.getLeaves()) | |
95 | - tokens.add(teiMorph2Segment.get(m)); | |
96 | - | |
97 | - List<Token> headTokens = new ArrayList<>(); | |
98 | - TEISyntacticEntity semanticHead = g; | |
99 | - while (semanticHead.isGroup() | |
100 | - && semanticHead.asGroup().getSemanticHead() != null) | |
101 | - semanticHead = semanticHead.asGroup().getSemanticHead(); | |
102 | - for (TEIMorph m : semanticHead.getLeaves()) | |
103 | - headTokens.add(teiMorph2Segment.get(m)); | |
104 | - | |
105 | - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
106 | - } | |
107 | - | |
108 | - private static void loadSyntacticWord(Sentence s, TEIWord w, | |
109 | - Map<TEIMorph, Token> teiMorph2Segment) { | |
110 | - String ctag = w.getInterpretation().getCtag(); | |
111 | - List<Token> tokens = new ArrayList<>(); | |
112 | - for (TEIMorph m : w.getAllMorphs()) | |
113 | - tokens.add(teiMorph2Segment.get(m)); | |
114 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
115 | - } | |
116 | - | |
117 | - private static void loadNE(Sentence s, TEINamedEntity ne, | |
118 | - Map<TEIMorph, Token> teiMorph2Segment) { | |
119 | - List<Token> tokens = new ArrayList<>(); | |
120 | - for (TEIMorph m : ne.getLeaves()) | |
121 | - tokens.add(teiMorph2Segment.get(m)); | |
122 | - s.addNamedEntity(new NamedEntity(tokens)); | |
123 | - } | |
124 | - | |
125 | - private static Token loadToken(Sentence s, TEIMorph teiM) { | |
126 | - Token seg = new Token(); | |
127 | - s.add(seg); | |
128 | - | |
129 | - seg.setOrth(teiM.getOrth()); | |
130 | - TEIInterpretation interp = teiM.getChosenInterpretation(); | |
131 | - Interpretation chosenIterpretation = new Interpretation( | |
132 | - interp.getCtag(), interp.getMorph(), interp.getBase()); | |
133 | - seg.addChosenInterpretation(chosenIterpretation); | |
134 | - | |
135 | - for (TEIInterpretation interp2 : teiM.getAllInterpretations()) { | |
136 | - Interpretation inter = new Interpretation(interp2.getCtag(), | |
137 | - interp2.getMorph(), interp.getBase()); | |
138 | - seg.addInterpretation(inter); | |
139 | - } | |
140 | - | |
141 | - return seg; | |
142 | - } | |
18 | + private static Logger logger = LoggerFactory.getLogger(TeiLoader.class); | |
19 | + private static TEI_IO teiAPI = TEI_IO.getInstance(); | |
20 | + | |
21 | + private TeiLoader() { | |
22 | + } | |
23 | + | |
24 | + public static TEICorpusText readTeiText(File teiDir) throws TEIException { | |
25 | + return teiAPI.readFromNKJPDirectory(teiDir); | |
26 | + } | |
27 | + | |
28 | + public static Text loadTextFromTei(TEICorpusText teiText) { | |
29 | + Text text = new Text(teiText.getCorpusHeader().getId()); | |
30 | + | |
31 | + logger.debug("Loading tei text " + text.getId() + "..."); | |
32 | + for (TEIParagraph teiP : teiText.getParagraphs()) | |
33 | + loadParagraph(text, teiP); | |
34 | + logger.debug("Tei text loaded."); | |
35 | + | |
36 | + return text; | |
37 | + } | |
38 | + | |
39 | + private static void loadParagraph(Text text, TEIParagraph teiP) { | |
40 | + Paragraph p = new Paragraph(); | |
41 | + text.add(p); | |
42 | + for (TEISentence teiS : teiP.getSentences()) | |
43 | + loadSentence(p, teiS); | |
44 | + } | |
45 | + | |
46 | + private static void loadSentence(Paragraph p, TEISentence teiS) { | |
47 | + Sentence s = new Sentence(); | |
48 | + p.add(s); | |
49 | + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); | |
50 | + for (TEIMorph teiM : teiS.getMorphs()) { | |
51 | + Token token = loadToken(s, teiM); | |
52 | + teiMorph2Segment.put(teiM, token); | |
53 | + } | |
54 | + for (TEINamedEntity ne : teiS.getAllNamedEntities()) | |
55 | + loadNE(s, ne, teiMorph2Segment); | |
56 | + for (TEIWord w : teiS.getAllWords()) | |
57 | + loadSyntacticWord(s, w, teiMorph2Segment); | |
58 | + for (TEIGroup g : teiS.getAllGroups()) | |
59 | + loadSyntacticGroup(s, g, teiMorph2Segment); | |
60 | + for (TEIMention m : teiS.getAllMentions()) | |
61 | + loadMentions(s, m, teiMorph2Segment); | |
62 | + } | |
63 | + | |
64 | + private static void loadMentions(Sentence s, TEIMention m, | |
65 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
66 | + List<Token> tokens = new ArrayList<>(); | |
67 | + for (TEIMorph mo : m.getMorphs()) | |
68 | + tokens.add(teiMorph2Segment.get(mo)); | |
69 | + List<Token> headTokens = new ArrayList<>(); | |
70 | + for (TEIMorph mo : m.getHeadMorphs()) | |
71 | + headTokens.add(teiMorph2Segment.get(mo)); | |
72 | + s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); | |
73 | + } | |
74 | + | |
75 | + private static void loadSyntacticGroup(Sentence s, TEIGroup g, | |
76 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
77 | + String type = g.getType(); | |
78 | + | |
79 | + List<Token> tokens = new ArrayList<>(); | |
80 | + for (TEIMorph m : g.getLeaves()) | |
81 | + tokens.add(teiMorph2Segment.get(m)); | |
82 | + | |
83 | + List<Token> headTokens = new ArrayList<>(); | |
84 | + TEISyntacticEntity semanticHead = g; | |
85 | + while (semanticHead.isGroup() | |
86 | + && semanticHead.asGroup().getSemanticHead() != null) | |
87 | + semanticHead = semanticHead.asGroup().getSemanticHead(); | |
88 | + for (TEIMorph m : semanticHead.getLeaves()) | |
89 | + headTokens.add(teiMorph2Segment.get(m)); | |
90 | + | |
91 | + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
92 | + } | |
93 | + | |
94 | + private static void loadSyntacticWord(Sentence s, TEIWord w, | |
95 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
96 | + String ctag = w.getInterpretation().getCtag(); | |
97 | + List<Token> tokens = new ArrayList<>(); | |
98 | + for (TEIMorph m : w.getAllMorphs()) | |
99 | + tokens.add(teiMorph2Segment.get(m)); | |
100 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
101 | + } | |
102 | + | |
103 | + private static void loadNE(Sentence s, TEINamedEntity ne, | |
104 | + Map<TEIMorph, Token> teiMorph2Segment) { | |
105 | + List<Token> tokens = new ArrayList<>(); | |
106 | + for (TEIMorph m : ne.getLeaves()) | |
107 | + tokens.add(teiMorph2Segment.get(m)); | |
108 | + s.addNamedEntity(new NamedEntity(tokens)); | |
109 | + } | |
110 | + | |
111 | + private static Token loadToken(Sentence s, TEIMorph teiM) { | |
112 | + Token seg = new Token(); | |
113 | + s.add(seg); | |
114 | + | |
115 | + seg.setOrth(teiM.getOrth()); | |
116 | + TEIInterpretation interp = teiM.getChosenInterpretation(); | |
117 | + Interpretation chosenInterpretation = new Interpretation( | |
118 | + interp.getCtag(), interp.getMorph(), interp.getBase()); | |
119 | + seg.addChosenInterpretation(chosenInterpretation); | |
120 | + | |
121 | + for (TEIInterpretation interp2 : teiM.getAllInterpretations()) { | |
122 | + Interpretation inter = new Interpretation(interp2.getCtag(), | |
123 | + interp2.getMorph(), interp2.getBase()); | |
124 | + seg.addInterpretation(inter); | |
125 | + } | |
126 | + | |
127 | + return seg; | |
128 | + } | |
143 | 129 | |
144 | 130 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiSaver.java
1 | 1 | package pl.waw.ipipan.zil.core.md.io.tei; |
2 | 2 | |
3 | -import java.io.File; | |
4 | -import java.util.ArrayList; | |
5 | -import java.util.HashMap; | |
6 | -import java.util.Iterator; | |
7 | -import java.util.List; | |
8 | -import java.util.Map; | |
9 | - | |
10 | -import org.apache.log4j.Logger; | |
11 | - | |
12 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
13 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
17 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.AnnotationLayer; | |
18 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.EntitiesFactory; | |
19 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICoreference; | |
20 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | |
21 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | |
22 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | |
23 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIParagraph; | |
24 | -import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEISentence; | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*; | |
25 | 7 | import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
26 | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; |
27 | 9 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod; |
28 | 10 | |
11 | +import java.io.File; | |
12 | +import java.util.*; | |
13 | + | |
29 | 14 | public class TeiSaver { |
30 | 15 | |
31 | - private static Logger logger = Logger.getLogger(TeiSaver.class); | |
32 | - private static TEI_IO teiAPI = TEI_IO.getInstance(); | |
33 | - final private static EntitiesFactory ef = EntitiesFactory.getInstance(); | |
34 | - | |
35 | - public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException { | |
36 | - logger.debug("Saving text in " + targetDir); | |
37 | - CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE; | |
38 | - teiAPI.writeToNKJPDirectory(teiText, targetDir, cm); | |
39 | - } | |
40 | - | |
41 | - public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException { | |
42 | - Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>(); | |
43 | - | |
44 | - Iterator<Paragraph> pIt = t.iterator(); | |
45 | - Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator(); | |
46 | - int mentionId = 0; | |
47 | - while (pIt.hasNext() && pItTei.hasNext()) { | |
48 | - Paragraph p = pIt.next(); | |
49 | - TEIParagraph pTei = pItTei.next(); | |
50 | - | |
51 | - mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei); | |
52 | - } | |
53 | - checkIterators(pIt, pItTei, "paragraph"); | |
54 | - | |
55 | - teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, | |
56 | - EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS)); | |
57 | - | |
58 | - // clear coreference as we have new mentions it became invalid | |
59 | - teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE); | |
60 | - teiText.setCoreferences(new ArrayList<TEICoreference>()); | |
61 | - | |
62 | - logger.debug(mentionId + " mentions added"); | |
63 | - } | |
64 | - | |
65 | - private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p, | |
66 | - TEIParagraph pTei) throws TEIException { | |
67 | - Iterator<Sentence> sIt = p.iterator(); | |
68 | - Iterator<TEISentence> sItTei = pTei.getSentences().iterator(); | |
69 | - | |
70 | - while (sIt.hasNext() && sItTei.hasNext()) { | |
71 | - Sentence s = sIt.next(); | |
72 | - TEISentence sTei = sItTei.next(); | |
73 | - mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei); | |
74 | - } | |
75 | - checkIterators(sIt, sItTei, "sentence"); | |
76 | - return mentionId; | |
77 | - } | |
78 | - | |
79 | - private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s, | |
80 | - TEISentence sTei) throws TEIException { | |
81 | - sTei.getAllMentions().clear(); | |
82 | - | |
83 | - Map<Token, TEIMorph> seg2morph = new HashMap<Token, TEIMorph>(); | |
84 | - | |
85 | - Iterator<Token> segIt = s.iterator(); | |
86 | - Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator(); | |
87 | - | |
88 | - while (segIt.hasNext() && segItTei.hasNext()) { | |
89 | - seg2morph.put(segIt.next(), segItTei.next()); | |
90 | - } | |
91 | - checkIterators(segIt, segItTei, "token"); | |
92 | - | |
93 | - List<TEIMention> mentions = new ArrayList<TEIMention>(); | |
94 | - | |
95 | - for (Mention m : s.getMentions()) { | |
96 | - List<TEIMorph> morphs = new ArrayList<TEIMorph>(); | |
97 | - List<TEIMorph> heads = new ArrayList<TEIMorph>(); | |
98 | - | |
99 | - for (Token seg : m.getSegments()) | |
100 | - morphs.add(seg2morph.get(seg)); | |
101 | - | |
102 | - for (Token seg : m.getHeadSegments()) | |
103 | - heads.add(seg2morph.get(seg)); | |
104 | - | |
105 | - TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()); | |
106 | - mentions.add(mention); | |
107 | - mention2mention.put(m, mention); | |
108 | - } | |
109 | - sTei.setMentions(mentions); | |
110 | - return mentionId; | |
111 | - } | |
112 | - | |
113 | - private static void checkIterators(Iterator<? extends Object> one, Iterator<? extends Object> other, String level) | |
114 | - throws TEIException { | |
115 | - if (one.hasNext() || other.hasNext()) | |
116 | - throw new TEIException("Problem mapping tei to thrift for level " + level); | |
117 | - } | |
16 | + private static final Logger logger = LoggerFactory.getLogger(TeiSaver.class); | |
17 | + private static final TEI_IO teiAPI = TEI_IO.getInstance(); | |
18 | + private static final EntitiesFactory ef = EntitiesFactory.getInstance(); | |
19 | + | |
20 | + private TeiSaver() { | |
21 | + } | |
22 | + | |
23 | + public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException { | |
24 | + logger.debug("Saving text in " + targetDir); | |
25 | + CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE; | |
26 | + teiAPI.writeToNKJPDirectory(teiText, targetDir, cm); | |
27 | + } | |
28 | + | |
29 | + public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException { | |
30 | + Map<Mention, TEIMention> mention2mention = new HashMap<Mention, TEIMention>(); | |
31 | + | |
32 | + Iterator<Paragraph> pIt = t.iterator(); | |
33 | + Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator(); | |
34 | + int mentionId = 0; | |
35 | + while (pIt.hasNext() && pItTei.hasNext()) { | |
36 | + Paragraph p = pIt.next(); | |
37 | + TEIParagraph pTei = pItTei.next(); | |
38 | + | |
39 | + mentionId = updateTeiParagraph(mention2mention, mentionId, p, pTei); | |
40 | + } | |
41 | + checkIterators(pIt, pItTei, "paragraph"); | |
42 | + | |
43 | + teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, | |
44 | + EntitiesFactory.getInstance().createHeader(AnnotationLayer.MENTIONS)); | |
45 | + | |
46 | + // clear coreference as we have new mentions it became invalid | |
47 | + teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE); | |
48 | + teiText.setCoreferences(new ArrayList<TEICoreference>()); | |
49 | + | |
50 | + logger.debug(mentionId + " mentions added"); | |
51 | + } | |
52 | + | |
53 | + private static int updateTeiParagraph(Map<Mention, TEIMention> mention2mention, int mentionId, Paragraph p, | |
54 | + TEIParagraph pTei) throws TEIException { | |
55 | + Iterator<Sentence> sIt = p.iterator(); | |
56 | + Iterator<TEISentence> sItTei = pTei.getSentences().iterator(); | |
57 | + | |
58 | + while (sIt.hasNext() && sItTei.hasNext()) { | |
59 | + Sentence s = sIt.next(); | |
60 | + TEISentence sTei = sItTei.next(); | |
61 | + mentionId = updateTeiSentence(mention2mention, mentionId, s, sTei); | |
62 | + } | |
63 | + checkIterators(sIt, sItTei, "sentence"); | |
64 | + return mentionId; | |
65 | + } | |
66 | + | |
67 | + private static int updateTeiSentence(Map<Mention, TEIMention> mention2mention, int mentionId, Sentence s, | |
68 | + TEISentence sTei) throws TEIException { | |
69 | + sTei.getAllMentions().clear(); | |
70 | + | |
71 | + Map<Token, TEIMorph> seg2morph = new HashMap<>(); | |
72 | + | |
73 | + Iterator<Token> segIt = s.iterator(); | |
74 | + Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator(); | |
75 | + | |
76 | + while (segIt.hasNext() && segItTei.hasNext()) { | |
77 | + seg2morph.put(segIt.next(), segItTei.next()); | |
78 | + } | |
79 | + checkIterators(segIt, segItTei, "token"); | |
80 | + | |
81 | + List<TEIMention> mentions = new ArrayList<>(); | |
82 | + | |
83 | + for (Mention m : s.getMentions()) { | |
84 | + List<TEIMorph> morphs = new ArrayList<>(); | |
85 | + List<TEIMorph> heads = new ArrayList<>(); | |
86 | + | |
87 | + for (Token seg : m.getSegments()) | |
88 | + morphs.add(seg2morph.get(seg)); | |
89 | + | |
90 | + for (Token seg : m.getHeadSegments()) | |
91 | + heads.add(seg2morph.get(seg)); | |
92 | + | |
93 | + TEIMention mention = ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()); | |
94 | + mentions.add(mention); | |
95 | + mention2mention.put(m, mention); | |
96 | + } | |
97 | + sTei.setMentions(mentions); | |
98 | + return mentionId; | |
99 | + } | |
100 | + | |
101 | + private static void checkIterators(Iterator<?> one, Iterator<?> other, String level) | |
102 | + throws TEIException { | |
103 | + if (one.hasNext() || other.hasNext()) | |
104 | + throw new TEIException("Problem mapping tei to thrift for level " + level); | |
105 | + } | |
118 | 106 | |
119 | 107 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
1 | 1 | package pl.waw.ipipan.zil.core.md.io.thrift; |
2 | 2 | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
7 | + | |
3 | 8 | import java.util.ArrayList; |
4 | 9 | import java.util.HashMap; |
5 | 10 | import java.util.List; |
6 | 11 | import java.util.Map; |
7 | 12 | |
8 | -import org.apache.log4j.Logger; | |
9 | - | |
10 | -import pl.waw.ipipan.zil.core.md.entities.Interpretation; | |
11 | -import pl.waw.ipipan.zil.core.md.entities.NamedEntity; | |
12 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
13 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.SyntacticWord; | |
16 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
17 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
18 | -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | |
19 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TInterpretation; | |
20 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TNamedEntity; | |
21 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | |
22 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
23 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticGroup; | |
24 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSyntacticWord; | |
25 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
26 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
27 | - | |
28 | 13 | public class ThriftLoader { |
29 | 14 | |
30 | - private static Logger logger = Logger.getLogger(ThriftLoader.class); | |
31 | - | |
32 | - public static Text loadTextFromThrift(TText thriftText) | |
33 | - throws MultiserviceException { | |
34 | - Text text = new Text(thriftText.getTextHeader() == null ? "null" | |
35 | - : thriftText.getTextHeader().getId()); | |
36 | - | |
37 | - logger.debug("Loading text " + text.getId() + " from thrift format..."); | |
38 | - for (TParagraph teiP : thriftText.getParagraphs()) | |
39 | - loadParagraph(text, teiP); | |
40 | - logger.debug("Thrift text loaded."); | |
41 | - | |
42 | - return text; | |
43 | - } | |
44 | - | |
45 | - private static void loadParagraph(Text text, TParagraph teiP) | |
46 | - throws MultiserviceException { | |
47 | - Paragraph p = new Paragraph(); | |
48 | - text.add(p); | |
49 | - | |
50 | - for (TSentence teiS : teiP.getSentences()) | |
51 | - loadSentence(p, teiS); | |
52 | - } | |
53 | - | |
54 | - private static void loadSentence(Paragraph p, TSentence thriftSent) | |
55 | - throws MultiserviceException { | |
56 | - Sentence s = new Sentence(); | |
57 | - p.add(s); | |
58 | - | |
59 | - Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent); | |
60 | - | |
61 | - Map<String, Token> thiftTokenId2Token = new HashMap<>(); | |
62 | - for (TToken teiM : thriftSent.getTokens()) { | |
63 | - Token token = loadToken(s, teiM); | |
64 | - thiftTokenId2Token.put(teiM.getId(), token); | |
65 | - } | |
66 | - if (thriftSent.isSetNames()) | |
67 | - for (TNamedEntity ne : thriftSent.getNames()) | |
68 | - loadNE(s, ne, thirftId2Entity, thiftTokenId2Token); | |
69 | - if (thriftSent.isSetWords()) | |
70 | - for (TSyntacticWord w : thriftSent.getWords()) | |
71 | - loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token); | |
72 | - if (thriftSent.isSetGroups()) | |
73 | - for (TSyntacticGroup g : thriftSent.getGroups()) | |
74 | - loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token); | |
75 | - } | |
76 | - | |
77 | - private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g, | |
78 | - Map<String, Object> thirftId2Entity, | |
79 | - Map<String, Token> thiftTokenId2Token) { | |
80 | - String type = g.getType(); | |
81 | - List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity, | |
82 | - thiftTokenId2Token, false); | |
83 | - List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity, | |
84 | - thiftTokenId2Token, true); | |
85 | - s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
86 | - } | |
87 | - | |
88 | - private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | |
89 | - Map<String, Object> thirftId2Entity, | |
90 | - Map<String, Token> thiftTokenId2Token) { | |
91 | - String ctag = w.getChosenInterpretation().getCtag(); | |
92 | - List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, | |
93 | - thiftTokenId2Token, false); | |
94 | - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
95 | - } | |
96 | - | |
97 | - private static void loadNE(Sentence s, TNamedEntity ne, | |
98 | - Map<String, Object> thirftId2Entity, | |
99 | - Map<String, Token> thiftTokenId2Token) { | |
100 | - List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, | |
101 | - thiftTokenId2Token, false); | |
102 | - s.addNamedEntity(new NamedEntity(tokens)); | |
103 | - } | |
104 | - | |
105 | - private static Map<String, Object> getThriftId2EntityMap( | |
106 | - TSentence thriftSent) { | |
107 | - Map<String, Object> idToEntity = new HashMap<>(); | |
108 | - for (TToken tok : thriftSent.getTokens()) | |
109 | - idToEntity.put(tok.getId(), tok); | |
110 | - if (thriftSent.isSetWords()) | |
111 | - for (TSyntacticWord w : thriftSent.getWords()) | |
112 | - idToEntity.put(w.getId(), w); | |
113 | - if (thriftSent.isSetNames()) | |
114 | - for (TNamedEntity ne : thriftSent.getNames()) | |
115 | - idToEntity.put(ne.getId(), ne); | |
116 | - if (thriftSent.isSetGroups()) | |
117 | - for (TSyntacticGroup group : thriftSent.getGroups()) | |
118 | - idToEntity.put(group.getId(), group); | |
119 | - return idToEntity; | |
120 | - } | |
121 | - | |
122 | - private static Token loadToken(Sentence s, TToken teiM) | |
123 | - throws MultiserviceException { | |
124 | - Token seg = new Token(); | |
125 | - s.add(seg); | |
126 | - | |
127 | - seg.setOrth(teiM.getOrth()); | |
128 | - TInterpretation interp = getTokenChosenInt(teiM); | |
129 | - Interpretation chosenIterpretation = new Interpretation( | |
130 | - interp.getCtag(), interp.getMsd(), interp.getBase()); | |
131 | - seg.addChosenInterpretation(chosenIterpretation); | |
132 | - | |
133 | - for (TInterpretation interp2 : teiM.getInterpretations()) { | |
134 | - Interpretation inter = new Interpretation(interp2.getCtag(), | |
135 | - interp2.getMsd(), interp.getBase()); | |
136 | - seg.addInterpretation(inter); | |
137 | - } | |
138 | - return seg; | |
139 | - } | |
140 | - | |
141 | - private static TInterpretation getTokenChosenInt(TToken token) | |
142 | - throws MultiserviceException { | |
143 | - TInterpretation interp = token.getChosenInterpretation(); | |
144 | - if (interp == null || interp.getBase() == null | |
145 | - || interp.getBase().equals("")) { | |
146 | - if (token.getCandidateInterpretations() == null | |
147 | - || token.getCandidateInterpretations().size() == 0 | |
148 | - || token.getCandidateInterpretations().get(0).getBase() == null | |
149 | - || token.getCandidateInterpretations().get(0).getBase() | |
150 | - .equals("")) | |
151 | - throw new MultiserviceException( | |
152 | - "No proper chosen or candidate interpretation for segment: " | |
153 | - + token.id); | |
154 | - interp = token.getCandidateInterpretations().get(0); | |
155 | - } | |
156 | - return interp; | |
157 | - } | |
158 | - | |
159 | - private static List<Token> getUnderlyingSegments(Object entity, | |
160 | - Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment, | |
161 | - boolean headsOnly) { | |
162 | - List<Token> result = new ArrayList<>(); | |
163 | - | |
164 | - if (entity instanceof TToken) { | |
165 | - result.add(tokenId2Segment.get(((TToken) entity).getId())); | |
166 | - return result; | |
167 | - } | |
168 | - | |
169 | - List<String> childIds = new ArrayList<>(); | |
170 | - if (entity instanceof TSyntacticWord) | |
171 | - childIds = ((TSyntacticWord) entity).getChildIds(); | |
172 | - else if (entity instanceof TNamedEntity) | |
173 | - childIds = ((TNamedEntity) entity).getChildIds(); | |
174 | - else if (entity instanceof TSyntacticGroup) | |
175 | - if (headsOnly) { | |
176 | - childIds = new ArrayList<String>(); | |
177 | - childIds.add(((TSyntacticGroup) entity).getSemanticHeadId()); | |
178 | - } else | |
179 | - childIds = ((TSyntacticGroup) entity).getChildIds(); | |
180 | - | |
181 | - for (String id : childIds) | |
182 | - result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity, | |
183 | - tokenId2Segment, headsOnly)); | |
184 | - | |
185 | - return result; | |
186 | - } | |
15 | + private static Logger logger = LoggerFactory.getLogger(ThriftLoader.class); | |
16 | + | |
17 | + public static Text loadTextFromThrift(TText thriftText) | |
18 | + throws MultiserviceException { | |
19 | + Text text = new Text(thriftText.getTextHeader() == null ? "null" | |
20 | + : thriftText.getTextHeader().getId()); | |
21 | + | |
22 | + logger.debug("Loading text " + text.getId() + " from thrift format..."); | |
23 | + for (TParagraph teiP : thriftText.getParagraphs()) | |
24 | + loadParagraph(text, teiP); | |
25 | + logger.debug("Thrift text loaded."); | |
26 | + | |
27 | + return text; | |
28 | + } | |
29 | + | |
30 | + private static void loadParagraph(Text text, TParagraph teiP) | |
31 | + throws MultiserviceException { | |
32 | + Paragraph p = new Paragraph(); | |
33 | + text.add(p); | |
34 | + | |
35 | + for (TSentence teiS : teiP.getSentences()) | |
36 | + loadSentence(p, teiS); | |
37 | + } | |
38 | + | |
39 | + private static void loadSentence(Paragraph p, TSentence thriftSent) | |
40 | + throws MultiserviceException { | |
41 | + Sentence s = new Sentence(); | |
42 | + p.add(s); | |
43 | + | |
44 | + Map<String, Object> thirftId2Entity = getThriftId2EntityMap(thriftSent); | |
45 | + | |
46 | + Map<String, Token> thiftTokenId2Token = new HashMap<>(); | |
47 | + for (TToken teiM : thriftSent.getTokens()) { | |
48 | + Token token = loadToken(s, teiM); | |
49 | + thiftTokenId2Token.put(teiM.getId(), token); | |
50 | + } | |
51 | + if (thriftSent.isSetNames()) | |
52 | + for (TNamedEntity ne : thriftSent.getNames()) | |
53 | + loadNE(s, ne, thirftId2Entity, thiftTokenId2Token); | |
54 | + if (thriftSent.isSetWords()) | |
55 | + for (TSyntacticWord w : thriftSent.getWords()) | |
56 | + loadSyntacticWord(s, w, thirftId2Entity, thiftTokenId2Token); | |
57 | + if (thriftSent.isSetGroups()) | |
58 | + for (TSyntacticGroup g : thriftSent.getGroups()) | |
59 | + loadSyntacticGroup(s, g, thirftId2Entity, thiftTokenId2Token); | |
60 | + } | |
61 | + | |
62 | + private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g, | |
63 | + Map<String, Object> thirftId2Entity, | |
64 | + Map<String, Token> thiftTokenId2Token) { | |
65 | + String type = g.getType(); | |
66 | + List<Token> tokens = getUnderlyingSegments(g, thirftId2Entity, | |
67 | + thiftTokenId2Token, false); | |
68 | + List<Token> headTokens = getUnderlyingSegments(g, thirftId2Entity, | |
69 | + thiftTokenId2Token, true); | |
70 | + s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens)); | |
71 | + } | |
72 | + | |
73 | + private static void loadSyntacticWord(Sentence s, TSyntacticWord w, | |
74 | + Map<String, Object> thirftId2Entity, | |
75 | + Map<String, Token> thiftTokenId2Token) { | |
76 | + String ctag = w.getChosenInterpretation().getCtag(); | |
77 | + List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, | |
78 | + thiftTokenId2Token, false); | |
79 | + s.addSyntacticWord(new SyntacticWord(ctag, tokens)); | |
80 | + } | |
81 | + | |
82 | + private static void loadNE(Sentence s, TNamedEntity ne, | |
83 | + Map<String, Object> thirftId2Entity, | |
84 | + Map<String, Token> thiftTokenId2Token) { | |
85 | + List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, | |
86 | + thiftTokenId2Token, false); | |
87 | + s.addNamedEntity(new NamedEntity(tokens)); | |
88 | + } | |
89 | + | |
90 | + private static Map<String, Object> getThriftId2EntityMap( | |
91 | + TSentence thriftSent) { | |
92 | + Map<String, Object> idToEntity = new HashMap<>(); | |
93 | + for (TToken tok : thriftSent.getTokens()) | |
94 | + idToEntity.put(tok.getId(), tok); | |
95 | + if (thriftSent.isSetWords()) | |
96 | + for (TSyntacticWord w : thriftSent.getWords()) | |
97 | + idToEntity.put(w.getId(), w); | |
98 | + if (thriftSent.isSetNames()) | |
99 | + for (TNamedEntity ne : thriftSent.getNames()) | |
100 | + idToEntity.put(ne.getId(), ne); | |
101 | + if (thriftSent.isSetGroups()) | |
102 | + for (TSyntacticGroup group : thriftSent.getGroups()) | |
103 | + idToEntity.put(group.getId(), group); | |
104 | + return idToEntity; | |
105 | + } | |
106 | + | |
107 | + private static Token loadToken(Sentence s, TToken teiM) | |
108 | + throws MultiserviceException { | |
109 | + Token seg = new Token(); | |
110 | + s.add(seg); | |
111 | + | |
112 | + seg.setOrth(teiM.getOrth()); | |
113 | + TInterpretation interp = getTokenChosenInt(teiM); | |
114 | + Interpretation chosenIterpretation = new Interpretation( | |
115 | + interp.getCtag(), interp.getMsd(), interp.getBase()); | |
116 | + seg.addChosenInterpretation(chosenIterpretation); | |
117 | + | |
118 | + for (TInterpretation interp2 : teiM.getInterpretations()) { | |
119 | + Interpretation inter = new Interpretation(interp2.getCtag(), | |
120 | + interp2.getMsd(), interp.getBase()); | |
121 | + seg.addInterpretation(inter); | |
122 | + } | |
123 | + return seg; | |
124 | + } | |
125 | + | |
126 | + private static TInterpretation getTokenChosenInt(TToken token) | |
127 | + throws MultiserviceException { | |
128 | + TInterpretation interp = token.getChosenInterpretation(); | |
129 | + if (interp == null || interp.getBase() == null | |
130 | + || "".equals(interp.getBase())) { | |
131 | + if (token.getCandidateInterpretations() == null | |
132 | + || token.getCandidateInterpretations().isEmpty() | |
133 | + || token.getCandidateInterpretations().get(0).getBase() == null | |
134 | + || "".equals(token.getCandidateInterpretations().get(0).getBase())) | |
135 | + throw new MultiserviceException( | |
136 | + "No proper chosen or candidate interpretation for segment: " | |
137 | + + token.id); | |
138 | + interp = token.getCandidateInterpretations().get(0); | |
139 | + } | |
140 | + return interp; | |
141 | + } | |
142 | + | |
143 | + private static List<Token> getUnderlyingSegments(Object entity, | |
144 | + Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment, | |
145 | + boolean headsOnly) { | |
146 | + List<Token> result = new ArrayList<>(); | |
147 | + | |
148 | + if (entity instanceof TToken) { | |
149 | + result.add(tokenId2Segment.get(((TToken) entity).getId())); | |
150 | + return result; | |
151 | + } | |
152 | + | |
153 | + List<String> childIds = new ArrayList<>(); | |
154 | + if (entity instanceof TSyntacticWord) | |
155 | + childIds = ((TSyntacticWord) entity).getChildIds(); | |
156 | + else if (entity instanceof TNamedEntity) | |
157 | + childIds = ((TNamedEntity) entity).getChildIds(); | |
158 | + else if (entity instanceof TSyntacticGroup) | |
159 | + if (headsOnly) { | |
160 | + childIds = new ArrayList<>(); | |
161 | + childIds.add(((TSyntacticGroup) entity).getSemanticHeadId()); | |
162 | + } else | |
163 | + childIds = ((TSyntacticGroup) entity).getChildIds(); | |
164 | + | |
165 | + for (String id : childIds) | |
166 | + result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity, | |
167 | + tokenId2Segment, headsOnly)); | |
168 | + | |
169 | + return result; | |
170 | + } | |
187 | 171 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftSaver.java
1 | 1 | package pl.waw.ipipan.zil.core.md.io.thrift; |
2 | 2 | |
3 | -import java.util.ArrayList; | |
4 | -import java.util.HashMap; | |
5 | -import java.util.Iterator; | |
6 | -import java.util.List; | |
7 | -import java.util.Map; | |
8 | - | |
9 | -import org.apache.log4j.Logger; | |
10 | - | |
11 | -import pl.waw.ipipan.zil.core.md.entities.Mention; | |
12 | -import pl.waw.ipipan.zil.core.md.entities.Paragraph; | |
13 | -import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
14 | -import pl.waw.ipipan.zil.core.md.entities.Text; | |
15 | -import pl.waw.ipipan.zil.core.md.entities.Token; | |
16 | -import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException; | |
17 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TMention; | |
18 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TParagraph; | |
19 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TSentence; | |
20 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TText; | |
21 | -import pl.waw.ipipan.zil.multiservice.thrift.types.TToken; | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
6 | +import pl.waw.ipipan.zil.multiservice.thrift.types.*; | |
7 | + | |
8 | +import java.util.*; | |
22 | 9 | |
23 | 10 | public class ThriftSaver { |
24 | 11 | |
25 | - private static Logger logger = Logger.getLogger(ThriftSaver.class); | |
26 | - | |
27 | - public static void updateThriftText(Text responseText, TText text) | |
28 | - throws MultiserviceException { | |
29 | - | |
30 | - logger.debug("Updating thrift text..."); | |
31 | - Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>(); | |
32 | - | |
33 | - Iterator<TParagraph> thrPI = text.getParagraphsIterator(); | |
34 | - Iterator<Paragraph> teiPI = responseText.iterator(); | |
35 | - int freeMentionId = 0; | |
36 | - while (thrPI.hasNext() && teiPI.hasNext()) { | |
37 | - TParagraph thrP = thrPI.next(); | |
38 | - Paragraph teiP = teiPI.next(); | |
39 | - | |
40 | - freeMentionId = updateThriftParagraph(teiMention2ThriftMention, | |
41 | - freeMentionId, thrP, teiP); | |
42 | - } | |
43 | - checkIterators(thrPI, teiPI, "paragraph"); | |
44 | - } | |
45 | - | |
46 | - private static int updateThriftParagraph( | |
47 | - Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId, | |
48 | - TParagraph thrP, Paragraph teiP) throws MultiserviceException { | |
49 | - Iterator<TSentence> thrSI = thrP.getSentencesIterator(); | |
50 | - Iterator<Sentence> teiSI = teiP.iterator(); | |
51 | - while (thrSI.hasNext() && teiSI.hasNext()) { | |
52 | - TSentence thrS = thrSI.next(); | |
53 | - Sentence teiS = teiSI.next(); | |
54 | - freeMentionId = updateThriftSentence(teiMention2ThriftMention, | |
55 | - freeMentionId, thrS, teiS); | |
56 | - } | |
57 | - checkIterators(thrSI, teiSI, "sentence"); | |
58 | - return freeMentionId; | |
59 | - } | |
60 | - | |
61 | - private static int updateThriftSentence( | |
62 | - Map<Mention, TMention> teiMention2ThriftMention, int id, | |
63 | - TSentence thrS, Sentence teiS) throws MultiserviceException { | |
64 | - thrS.unsetMentions(); | |
65 | - thrS.setMentions(new ArrayList<TMention>()); | |
66 | - | |
67 | - Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>(); | |
68 | - Iterator<TToken> thrMI = thrS.getTokensIterator(); | |
69 | - Iterator<Token> teiMI = teiS.iterator(); | |
70 | - while (thrMI.hasNext() && teiMI.hasNext()) { | |
71 | - teiMorph2ThriftToken.put(teiMI.next(), thrMI.next()); | |
72 | - } | |
73 | - checkIterators(thrMI, teiMI, "morph"); | |
74 | - | |
75 | - for (Mention m : teiS.getMentions()) { | |
76 | - List<String> childIds = new ArrayList<>(); | |
77 | - List<String> headIds = new ArrayList<>(); | |
78 | - for (Token ch : m.getSegments()) | |
79 | - childIds.add(teiMorph2ThriftToken.get(ch).getId()); | |
80 | - for (Token h : m.getHeadSegments()) | |
81 | - headIds.add(teiMorph2ThriftToken.get(h).getId()); | |
82 | - | |
83 | - TMention tm = new TMention("m-" + (id++), headIds, childIds, | |
84 | - m.isZeroSubject()); | |
85 | - teiMention2ThriftMention.put(m, tm); | |
86 | - thrS.addToMentions(tm); | |
87 | - } | |
88 | - return id; | |
89 | - } | |
90 | - | |
91 | - private static void checkIterators(Iterator<? extends Object> one, | |
92 | - Iterator<? extends Object> other, String level) | |
93 | - throws MultiserviceException { | |
94 | - if (one.hasNext() || other.hasNext()) | |
95 | - throw new MultiserviceException( | |
96 | - "Problem mapping interal text representation to thrift for level " | |
97 | - + level); | |
98 | - } | |
12 | + private static final Logger LOG = LoggerFactory.getLogger(ThriftSaver.class); | |
13 | + | |
14 | + private ThriftSaver() { | |
15 | + } | |
16 | + | |
17 | + public static void updateThriftText(Text responseText, TText text) | |
18 | + throws MultiserviceException { | |
19 | + | |
20 | + LOG.debug("Updating thrift text..."); | |
21 | + Map<Mention, TMention> teiMention2ThriftMention = new HashMap<>(); | |
22 | + | |
23 | + Iterator<TParagraph> thrPI = text.getParagraphsIterator(); | |
24 | + Iterator<Paragraph> teiPI = responseText.iterator(); | |
25 | + int freeMentionId = 0; | |
26 | + while (thrPI.hasNext() && teiPI.hasNext()) { | |
27 | + TParagraph thrP = thrPI.next(); | |
28 | + Paragraph teiP = teiPI.next(); | |
29 | + | |
30 | + freeMentionId = updateThriftParagraph(teiMention2ThriftMention, | |
31 | + freeMentionId, thrP, teiP); | |
32 | + } | |
33 | + checkIterators(thrPI, teiPI, "paragraph"); | |
34 | + } | |
35 | + | |
36 | + private static int updateThriftParagraph( | |
37 | + Map<Mention, TMention> teiMention2ThriftMention, int freeMentionId, | |
38 | + TParagraph thrP, Paragraph teiP) throws MultiserviceException { | |
39 | + Iterator<TSentence> thrSI = thrP.getSentencesIterator(); | |
40 | + Iterator<Sentence> teiSI = teiP.iterator(); | |
41 | + while (thrSI.hasNext() && teiSI.hasNext()) { | |
42 | + TSentence thrS = thrSI.next(); | |
43 | + Sentence teiS = teiSI.next(); | |
44 | + freeMentionId = updateThriftSentence(teiMention2ThriftMention, | |
45 | + freeMentionId, thrS, teiS); | |
46 | + } | |
47 | + checkIterators(thrSI, teiSI, "sentence"); | |
48 | + return freeMentionId; | |
49 | + } | |
50 | + | |
51 | + private static int updateThriftSentence( | |
52 | + Map<Mention, TMention> teiMention2ThriftMention, int id, | |
53 | + TSentence thrS, Sentence teiS) throws MultiserviceException { | |
54 | + thrS.unsetMentions(); | |
55 | + thrS.setMentions(new ArrayList<>()); | |
56 | + | |
57 | + Map<Token, TToken> teiMorph2ThriftToken = new HashMap<>(); | |
58 | + Iterator<TToken> thrMI = thrS.getTokensIterator(); | |
59 | + Iterator<Token> teiMI = teiS.iterator(); | |
60 | + while (thrMI.hasNext() && teiMI.hasNext()) { | |
61 | + teiMorph2ThriftToken.put(teiMI.next(), thrMI.next()); | |
62 | + } | |
63 | + checkIterators(thrMI, teiMI, "morph"); | |
64 | + | |
65 | + for (Mention m : teiS.getMentions()) { | |
66 | + List<String> childIds = new ArrayList<>(); | |
67 | + List<String> headIds = new ArrayList<>(); | |
68 | + for (Token ch : m.getSegments()) | |
69 | + childIds.add(teiMorph2ThriftToken.get(ch).getId()); | |
70 | + for (Token h : m.getHeadSegments()) | |
71 | + headIds.add(teiMorph2ThriftToken.get(h).getId()); | |
72 | + | |
73 | + TMention tm = new TMention("m-" + (id++), headIds, childIds, | |
74 | + m.isZeroSubject()); | |
75 | + teiMention2ThriftMention.put(m, tm); | |
76 | + thrS.addToMentions(tm); | |
77 | + } | |
78 | + return id; | |
79 | + } | |
80 | + | |
81 | + private static void checkIterators(Iterator<?> one, | |
82 | + Iterator<?> other, String level) | |
83 | + throws MultiserviceException { | |
84 | + if (one.hasNext() || other.hasNext()) | |
85 | + throw new MultiserviceException( | |
86 | + "Problem mapping interal text representation to thrift for level " | |
87 | + + level); | |
88 | + } | |
99 | 89 | |
100 | 90 | } |
... | ... |