Main.java
9.57 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
package pl.waw.ipipan.zil.core.md;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.detection.Detector;
import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.Text;
import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
import pl.waw.ipipan.zil.core.md.io.tei.TeiSaver;
import pl.waw.ipipan.zil.core.md.io.thrift.ThriftLoader;
import pl.waw.ipipan.zil.core.md.io.thrift.ThriftSaver;
import pl.waw.ipipan.zil.multiservice.thrift.types.MultiserviceException;
import pl.waw.ipipan.zil.multiservice.thrift.types.TText;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.EnumMap;
import java.util.HashMap;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Pattern;
/**
 * Command-line entry point and static facade for mention detection.
 *
 * <p>When the class is initialized, the default head, nominal-mention and zero-subject models
 * and the two valence dictionaries are loaded from classpath resources.
 * {@link #main(String[])} may replace the three models with ones read from files given on the
 * command line.
 */
public class Main {

    private static final Logger logger = LoggerFactory.getLogger(Main.class);

    private static final boolean GZIP_OUTPUT = true;

    private static final String DEFAULT_HEAD_MODEL = "/head_model.bin";
    private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin";
    private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";

    private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
    private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";

    /** Zero-based index of the schema field in a colon-separated dictionary line. */
    private static final int SCHEMA_FIELD_INDEX = 5;

    /**
     * Separator between the positions of a schema: a plus sign with one whitespace character on
     * each side. Must be declared above the static initializer, which uses it indirectly through
     * {@link #readWalenty(InputStream)}.
     */
    private static final Pattern POSITION_SEPARATOR = Pattern.compile("\\s\\+\\s");

    private static HeadDetector headModel;
    private static NominalMentionDetector nominalMentionModel;
    private static ZeroSubjectDetector zeroSubjectModel;

    /** Keys of the valence dictionaries handed to the detector. */
    public enum ValenceDicts {
        VerbsValence,
        NounsValence
    }

    private static final Map<ValenceDicts, Map<String, ArrayList<String>>> valence =
            new EnumMap<>(ValenceDicts.class);

    static {
        // NOTE(review): the model streams opened here are never closed. Whether the detector
        // constructors consume the stream eagerly is not visible from this file, so closing
        // them right after construction has not been introduced - confirm and close if safe.
        InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL);
        headModel = new HeadDetector(headDetectionModelStream);

        InputStream nominalMentionDetectionModelStream =
                Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL);
        nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream);

        InputStream zeroSubjectDetectionModelStream =
                Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
        zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);

        InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
        valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

        InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
        valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
    }

    /**
     * Reads a valence dictionary into a map from lemma to the list of its schemata.
     *
     * <p>The input is read as UTF-8 text, line by line. A byte order mark on the first line is
     * removed. Lines starting with {@code %} are skipped. Every other line is split on
     * {@code ':'}; the first field is the lemma and the sixth field is the schema. Lines whose
     * schema is missing or blank are skipped. When the lemma is a single word and
     * {@link #schemaContainsSie(String)} accepts the schema, {@code " się"} is appended to the
     * lemma before it is used as the key. Schemata of the same lemma are collected in file order.
     *
     * <p>The stream is always closed before this method returns or throws.
     *
     * @param walentySchemataStream stream with the dictionary contents; must not be {@code null}
     * @return map from lemma to its schemata
     * @throws NullPointerException if {@code walentySchemataStream} is {@code null}
     * @throws RuntimeException if reading the stream fails; the {@link IOException} is the cause
     */
    public static Map<String, ArrayList<String>> readWalenty(InputStream walentySchemataStream) {
        Objects.requireNonNull(walentySchemataStream, "walentySchemataStream");

        Map<String, ArrayList<String>> map = new HashMap<>();
        // UTF-8 is requested explicitly: the BOM handling below and the non-ASCII " się" suffix
        // only make sense for Unicode input, and the platform default charset may differ.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(walentySchemataStream, StandardCharsets.UTF_8))) {
            String line;
            boolean firstLine = true;
            while ((line = br.readLine()) != null) {
                if (firstLine) {
                    line = line.replace("\uFEFF", ""); // remove BOM character
                    firstLine = false;
                }
                if (line.startsWith("%")) {
                    continue;
                }

                String[] lineParts = line.split(":");
                // String.split drops trailing empty fields, so a blank line or a line with an
                // empty schema yields fewer than six parts; treat it like any empty schema.
                if (lineParts.length <= SCHEMA_FIELD_INDEX) {
                    continue;
                }

                String lemma = lineParts[0].trim();
                String schema = lineParts[SCHEMA_FIELD_INDEX].trim();
                if (schema.isEmpty()) {
                    continue;
                }

                if (lemma.split(" ").length == 1 && schemaContainsSie(schema)) {
                    lemma = lemma + " się";
                }

                ArrayList<String> schemata = map.get(lemma);
                if (schemata == null) {
                    schemata = new ArrayList<>();
                    map.put(lemma, schemata);
                }
                schemata.add(schema);
            }
        } catch (IOException ex) {
            throw new RuntimeException("Unable to read valence dictionary: " + ex, ex);
        }
        return map;
    }

    /**
     * Tells whether any position of the schema lists {@code refl} or {@code recip}.
     *
     * <p>The schema is split into positions on {@code " + "}. Each position is trimmed, its
     * first and last characters are dropped (presumably the enclosing braces - confirm against
     * the dictionary format), and the remainder is split on {@code ';'} and compared with the
     * two keywords.
     *
     * @param schema schema text
     * @return {@code true} if a matching item was found
     */
    private static boolean schemaContainsSie(String schema) {
        for (String position : POSITION_SEPARATOR.split(schema)) {
            position = position.trim();
            if (position.length() < 2) {
                // Too short to strip both ends; it cannot hold either keyword anyway.
                continue;
            }
            position = position.substring(1, position.length() - 1);
            for (String phrT : position.split(";")) {
                if (phrT.equals("refl") || phrT.equals("recip")) {
                    return true;
                }
            }
        }
        return false;
    }

    private Main() {
    }

    /**
     * Main method for detecting mentions in corpus encoded in Tei format.
     *
     * <p>Expects either two arguments (input directory, output directory) or five (the same two
     * followed by paths to the head, nominal mention and zero subject models). Every text
     * directory found under the input directory is loaded, annotated and saved under the
     * corresponding path in the output directory. A text that fails with an {@link IOException}
     * is logged and counted, and processing continues with the next one.
     *
     * @param args arguments
     */
    public static void main(String[] args) {
        if (args.length != 2 && args.length != 5) {
            logger.error("Wrong usage! should be: " + Main.class.getSimpleName()
                    + " input_dir result_dir [head_model] [nominal_mention_model] [zero_subject_model]");
            return;
        }

        File inputDir = new File(args[0]);
        File outputDir = new File(args[1]);

        if (!inputDir.isDirectory()) {
            logger.error("{} is not a directory!", inputDir);
            return;
        }
        if (!outputDir.isDirectory()) {
            logger.error("{} is not a directory!", outputDir);
            return;
        }

        if (args.length == 5) {
            try {
                loadModelsFromFiles(args[2], args[3], args[4]);
            } catch (IOException e) {
                logger.error("Unable to load model: " + e, e);
                return;
            }
        }

        int all = 0;
        int errors = 0;
        for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
            all++;
            try {
                File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
                TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
                annotateTeiText(teiText, teiDir);
                TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
            } catch (IOException e) {
                logger.error("Error processing text in dir:" + teiDir + " Error details: "
                        + e.getLocalizedMessage(), e);
                errors++;
            }
        }

        // Report only the texts that did not fail; failures are reported separately below.
        logger.info("{} texts processed succesfully.", all - errors);
        if (errors > 0) {
            logger.info("{} texts not processed.", errors);
        }

        logger.info("{} verbs with zero subject detected.", ZeroSubjectDetector.verbsWithoutSubject);
        logger.info("{} verbs with explicit subject detected.", ZeroSubjectDetector.verbsWithSubject);
    }

    /**
     * Replaces the three detection models with ones read from the given files, in the order
     * head, nominal mention, zero subject. If a later file cannot be opened, the models
     * assigned before the failure stay replaced.
     *
     * @param headModelPath path to the head model file
     * @param nominalMentionModelPath path to the nominal mention model file
     * @param zeroSubjectModelPath path to the zero subject model file
     * @throws IOException when a model file cannot be opened
     */
    private static void loadModelsFromFiles(String headModelPath, String nominalMentionModelPath,
                                            String zeroSubjectModelPath) throws IOException {
        // NOTE(review): these file streams are never closed. Whether the detector constructors
        // read the stream eagerly is not visible from this file - confirm and close if safe.
        InputStream headDetectionModelStream = new FileInputStream(headModelPath);
        headModel = new HeadDetector(headDetectionModelStream);

        InputStream nominalMentionsDetectionModelStream = new FileInputStream(nominalMentionModelPath);
        nominalMentionModel = new NominalMentionDetector(nominalMentionsDetectionModelStream);

        InputStream zeroSubjectDetectionModelStream = new FileInputStream(zeroSubjectModelPath);
        zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
    }

    /**
     * Find relative path of text directory in the corpus directory and create
     * similar directory structure in the output corpus directory.
     *
     * <p>The relative path is obtained by cutting the input corpus path, as a string prefix,
     * off the text directory path.
     *
     * @param inputCorpusDir  input corpus directory
     * @param outputCorpusDir output corpus directory
     * @param textDir         input text dir
     * @return target text dir
     * @throws IOException when the target directory does not exist after the creation attempt
     */
    private static File createTargetTextDir(File inputCorpusDir, File outputCorpusDir, File textDir)
            throws IOException {
        String relativeDirPath = textDir.toString().substring(inputCorpusDir.toString().length());
        File targetDir = new File(outputCorpusDir, relativeDirPath);
        // The result of mkdirs() is deliberately ignored: it is also false when the directory
        // already exists. The check below covers both cases.
        targetDir.mkdirs();
        if (!targetDir.isDirectory()) {
            throw new IOException("Failed to create output directory at: " + targetDir);
        }
        return targetDir;
    }

    /**
     * Find mentions in Thrift text and update this Thrift text with mention
     * annotation.
     *
     * @param thriftText text to annotate with mentions
     * @throws MultiserviceException when an error occurs
     */
    public static void annotateThriftText(TText thriftText) throws MultiserviceException {
        Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence);
        ThriftSaver.updateThriftText(responseText, thriftText);
    }

    /**
     * Find mentions in Tei text and update this Tei text with mention
     * annotation. This method does not save this Tei text on disk.
     *
     * @param teiText text to annotate with mentions
     * @param textDir directory of the text, passed on to the Tei loader
     * @throws TEIException when an error occurs
     */
    public static void annotateTeiText(TEICorpusText teiText, File textDir) throws TEIException {
        Text responseText = TeiLoader.loadTextFromTei(teiText, textDir);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence);
        TeiSaver.updateTeiText(responseText, teiText);
    }
}