Commit 3d23a642e950208184da7ac7d198861326de7415
1 parent
2d60e476
Added missing files.
Showing
14 changed files
with
2360 additions
and
0 deletions
.gitignore
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java
0 → 100644
package pl.waw.ipipan.zil.core.md.detection.head;

import pl.waw.ipipan.zil.core.md.detection.Constants;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;

import java.util.*;

/**
 * Feature generation for mention head detection: builds a feature map for a
 * single token of a sentence and provides clause segmentation helpers used by
 * the (currently disabled) clause-level features.
 */
public class FeatureGeneration {

    /** Conjunction/punctuation lemmas that split a clause when a verb was seen. */
    private static final Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(
            "i", "albo", "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież",
            ",", ";", "-", "–", ":"));

    /** Additional conjunction lemmas that split a clause when a verb was seen. */
    private static final Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(
            "a", "ale", "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem",
            "natomiast", "tylko", "dlatego", "jedynie", "przecież", "tymczasem",
            "ponieważ", "więc", "toteż", "zatem"));

    /** Lemmas that split a clause unconditionally (no preceding verb required). */
    private static final Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList("?", "!"));

    /** Paired delimiters: opening lemma mapped to its closing lemma. */
    private static final Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }

    /** Ctags treated as nominal (head candidate) tags. */
    private static final Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(
            "subst", "depr", "ppron12", "ppron3", "ger", "num", "numcol"));

    /** Ctags of personal pronouns (carry an explicit person attribute). */
    private static final Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList("ppron12", "ppron3"));

    /** Ctags of finite verb forms. */
    private static final Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(
            "fin", "bedzie", "aglt", "praet", "winien"));

    /** Relative pronoun lemmas ("zaimki względne"). */
    private static final Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList("jaki", "który"));

    /**
     * Computes features for one token and stores them in the given map.
     *
     * @param features   target map: feature name to feature value
     * @param t          token to describe
     * @param s          sentence containing the token
     * @param quasiVerbs quasi-verb lemmas; currently unused, kept for signature
     *                   compatibility with the other detectors
     */
    public static void generateFeatures(Map<String, Object> features, Token t, Sentence s, Set<String> quasiVerbs) {

        features.put("ctag", t.getChosenInterpretation().getCtag());
        features.put("number", t.getChosenInterpretation().getNumber());

        features.put("NGHead", isNGHead(t, s));
        features.put("isNextColon", isNextColon(t, s));
        features.put("wordCtag", wordCtag(t, s));
        features.put("isPartOfNE", isPartOfNE(t, s));
        features.put("isFirstInNE", isFirstInNE(t, s));
        features.put("nextCtag", getNeighbouringTag(s, t, 1));
        features.put("prevCtag", getNeighbouringTag(s, t, -1));
        features.put("sentLength", s.size());

        features.put("tokenOrthLength", t.getOrth().length());
        features.put("tokenBaseLength", t.getBase().length());
        features.put("isNextDot", isNextDot(t, s));
        features.put("closestNEDistance", closestNEDistance(t, s));
        features.put("startsWithUpperOrth", Character.isUpperCase(t.getOrth().codePointAt(0)));
        features.put("startsWithUpperBase", Character.isUpperCase(t.getBase().codePointAt(0)));

        // Candidate features below were evaluated but are currently disabled:
        //features.put("isPartOfFrazeo", isPartOfFrazeo(t, s));
        //features.put("gender", t.getChosenInterpretation().getGender());
        //features.put("person", t.getChosenInterpretation().getPerson());
        //features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
        //features.put("isPrevPraet", isPrevPraet(t, s));
        //features.put("isPrevComma", isPrevComma(t, s));
        //features.put("isPrev2Pred", isPrev2Pred(t, s));
        //features.put("isNextInf", isNextInf(t, s));

        //List<Token> clause = getClause(s, m);
        //features.put("clauseLength", clause.size());
        //addFeatures(features, clause, "clause", m);
        /*addFeatures(features, s, "sent", t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, 0), "window_" + i + "_" + 0, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, 0, i), "window_" + 0 + "_" + i, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, i), "window_" + i + "_" + i, t);*/
    }

    ///////////////////////////////////

    /** True if the token is the semantic head of some NG* syntactic group. */
    private static boolean isNGHead(Token t, Sentence s) {
        for (SyntacticGroup group : s.getGroups()) {
            if (group.getType().startsWith("NG") && group.getSemanticHeadTokens().contains(t)) {
                return true;
            }
        }
        return false;
    }

    /** True if the token directly following {@code t} has orth ":". */
    private static boolean isNextColon(Token t, Sentence s) {
        int idx = s.indexOf(t) + 1;
        if (idx >= s.size() || idx < 0)
            return false;
        return s.get(idx).getOrth().equals(":");
    }

    /** True if the token directly following {@code t} has orth ".". */
    private static boolean isNextDot(Token t, Sentence s) {
        int idx = s.indexOf(t) + 1;
        if (idx >= s.size() || idx < 0)
            return false;
        return s.get(idx).getOrth().equals(".");
    }

    /**
     * Returns the ctag of the syntactic word containing the token,
     * or "None" when the token belongs to no syntactic word.
     */
    private static String wordCtag(Token t, Sentence s) {
        for (SyntacticWord w : s.getSyntacticWords()) {
            if (w.getTokens().contains(t)) {
                return w.getCtag();
            }
        }
        return "None";
    }

    /** True if the token belongs to any named entity of the sentence. */
    private static boolean isPartOfNE(Token t, Sentence s) {
        for (NamedEntity ne : s.getNamedEntities()) {
            if (ne.getTokens().contains(t)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Distance (in tokens) from {@code t} to the start of the nearest named
     * entity beginning at or after the token; -1 when there is none.
     */
    private static int closestNEDistance(Token t, Sentence s) {
        int lowestDistance = -1;
        for (NamedEntity ne : s.getNamedEntities()) {
            int distance = ne.getTokens().get(0).getSentencePosition() - t.getSentencePosition();
            if (distance >= 0 && (distance < lowestDistance || lowestDistance < 0)) {
                lowestDistance = distance;
            }
        }
        return lowestDistance;
    }

    /** True if the token is the first token of some named entity. */
    private static boolean isFirstInNE(Token t, Sentence s) {
        for (NamedEntity ne : s.getNamedEntities()) {
            if (ne.getTokens().get(0).compareTo(t) == 0) {
                return true;
            }
        }
        return false;
    }

    /** True if the token belongs to a syntactic word with a phraseological ctag. */
    private static boolean isPartOfFrazeo(Token t, Sentence s) {
        for (SyntacticWord word : s.getSyntacticWords()) {
            if (word.getTokens().contains(t) &&
                    Constants.FRAZEO_CTAGS.contains(word.getCtag())) {
                return true;
            }
        }
        return false;
    }

    ///////////////////////////////////

    /** True if the token directly following {@code m} is an infinitive. */
    private static boolean isNextInf(Token m, Sentence s) {
        boolean now = false;
        for (Token morph : s) {
            if (now)
                return morph.getChosenInterpretation().getCtag().equals("inf");
            if (m.equals(morph))
                now = true;
        }
        return false;
    }

    /** True if either of the two tokens preceding {@code m} is a predicative. */
    private static boolean isPrev2Pred(Token m, Sentence s) {
        Token prev = null;
        Token prev2 = null;
        for (Token morph : s) {
            if (m.equals(morph))
                break;
            prev2 = prev;
            prev = morph;
        }
        return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred"))
                || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred"));
    }

    /** True if the token directly preceding {@code m} is a comma. */
    private static boolean isPrevComma(Token m, Sentence s) {
        Token prev = null;
        for (Token morph : s) {
            if (m.equals(morph))
                break;
            prev = morph;
        }
        return prev != null && prev.getChosenInterpretation().getBase().equals(",");
    }

    /**
     * Ctag of the token at offset {@code i} from {@code m}, or "None" when the
     * offset falls outside the sentence.
     */
    private static String getNeighbouringTag(Sentence s, Token m, int i) {
        int idx = s.indexOf(m) + i;
        if (idx >= s.size() || idx < 0)
            return "None";
        return s.get(idx).getChosenInterpretation().getCtag();
    }

    /**
     * Adds agreement features (case, number, gender/person combinations) for
     * noun candidates within the given token span.
     *
     * @param features target feature map
     * @param clause   token span to scan for candidates
     * @param prefix   suffix distinguishing the span (e.g. "sent", "clause")
     * @param m        keyword token that candidates are compared against
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1
        boolean hasNum = false; // 2
        boolean hasPOG = false; // 3

        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // only nouns count; a noun after "jak"/"jako" is a comparison, skip it
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }

    /** Tokens within a window of {@code pre} before and {@code post} after {@code m}. */
    private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {

        int idx = s.indexOf(m);
        int from = Math.max(0, idx - pre);
        int to = Math.min(s.size(), idx + post + 1);

        return new ArrayList<>(s.subList(from, to));
    }

    /** True if the token directly preceding {@code m} is a praet form. */
    private static boolean isPrevPraet(Token m, Sentence s) {
        Token prev = null;
        for (Token morph : s) {
            if (m.equals(morph))
                break;
            prev = morph;
        }
        return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
    }

    /**
     * Splits the sentence into clauses (on commas and conjunctions such as
     * "i", "albo", "lub"; as in summarization, a split requires a finite verb
     * inside the clause) and returns the clause containing the given token.
     *
     * @param s  sentence
     * @param m2 token
     * @return clause with the token, or {@code null} when not found
     */
    public static List<Token> getClause(Sentence s, Token m2) {

        List<List<Token>> sublists = getClauses(s);

        for (List<Token> sub : sublists)
            for (Token m : sub)
                if (m.equals(m2))
                    return sub;

        return null;
    }

    /**
     * Splits the sentence into clauses. Splits never occur inside a syntactic
     * group or word; a trailing verb-less fragment is merged into the previous
     * clause, and clauses opening with a relative pronoun are merged backwards.
     *
     * @param s sentence to split
     * @return list of clauses in sentence order
     */
    public static List<List<Token>> getClauses(Sentence s) {

        // tokens that must not be a split point: all but the last token of
        // every syntactic group and syntactic word
        Set<Token> noSplitMorphs = new HashSet<>();
        for (SyntacticGroup g : s.getGroups()) {
            for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
                noSplitMorphs.add(m);
            }
        }
        for (SyntacticWord g : s.getSyntacticWords()) {
            for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
                noSplitMorphs.add(m);
            }
        }

        LinkedList<List<Token>> sublists = new LinkedList<>();
        List<Token> currentSublist = new ArrayList<>();
        boolean clauseHasVerb = false;
        for (Token m : s) {
            String base = m.getChosenInterpretation().getBase();
            if (!noSplitMorphs.contains(m)
                    && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
                            .contains(base)) && clauseHasVerb))) {
                sublists.add(currentSublist);
                currentSublist = new ArrayList<>();
                clauseHasVerb = false;
            } else {
                if (isVerb(m))
                    clauseHasVerb = true;
            }
            currentSublist.add(m);
        }
        if (!currentSublist.isEmpty()) {
            // guard against an empty list: a verb-less sentence with no split
            // point would otherwise throw NoSuchElementException on getLast()
            if (clauseHasVerb || sublists.isEmpty())
                sublists.add(currentSublist);
            else
                sublists.getLast().addAll(currentSublist);
        }

        // merge clause beginning with zaimek wzgl. etc to previous clause
        List<Token> prev = null;
        Iterator<List<Token>> it = sublists.iterator();
        while (it.hasNext()) {
            List<Token> sublist = it.next();
            boolean containsRelPron = false;
            int i = 1;
            // a relative pronoun counts only within the first two tokens
            for (Token m : sublist) {
                if (i > 2)
                    break;
                if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
                    containsRelPron = true;
                    break;
                }
                i++;
            }
            if (prev != null && containsRelPron) {
                prev.addAll(sublist);
                it.remove();
            } else
                prev = sublist;
        }

        return sublists;
    }

    /** True if candidate and keyword agree in grammatical number. */
    private static boolean agreedNum(Token candidate, Token keyword) {
        String keywordNum = keyword.getNumber();
        String wordNum = candidate.getNumber();
        return keywordNum.equals(wordNum);
    }

    /** True if candidate and keyword agree in gender (praet) or person (other verbs). */
    private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
        if (isPraet(keyword)) {
            // praet has number:gender
            String keywordGender = keyword.getGender();
            String wordGender = candidate.getGender();
            return keywordGender.equals(wordGender);
        } else {
            // other verbs have number:person
            String keywordPerson = keyword.getPerson();
            String wordPerson = "ter"; // default: non-pronouns are third person
            if (PRONOUN_TAGS.contains(candidate.getCtag()))
                wordPerson = candidate.getPerson();
            return wordPerson.equals(keywordPerson);
        }
    }

    /** True if the previous token is "jak" or "jako" (comparison marker). */
    private static boolean isJakJako(Token prev) {
        if (prev == null)
            return false;
        String base = prev.getBase();
        return base.equals("jak") || base.equals("jako");
    }

    /** True if the token is a praet verb form. */
    private static boolean isPraet(Token keyword) {
        return keyword.getCtag().equals("praet");
    }

    /** True if the token is in nominative case (works for nouns only!). */
    private static boolean isNom(Token candidate) {
        return "nom".equals(candidate.getCase());
    }

    /** True if the token's ctag is one of the nominal tags. */
    private static boolean isNoun(Token m) {
        return NOUN_TAGS.contains(m.getCtag());
    }

    /** True if the token's ctag is a finite verb tag. */
    public static boolean isVerb(Token morph) {
        return VERB_TAGS.contains(morph.getCtag());
    }

    /** True if every segment of the mention is a verb. */
    public static boolean isVerb(Mention m) {
        boolean hasOnlyVerbs = true;
        for (Token morph : m.getSegments())
            if (!isVerb(morph)) {
                hasOnlyVerbs = false;
                break;
            }
        return hasOnlyVerbs;
    }

    /** True if every morph of the TEI mention is a verb. */
    public static boolean isVerb(TEIMention m) {
        boolean hasOnlyVerbs = true;
        for (TEIMorph morph : m.getMorphs())
            if (!isVerb(morph)) {
                hasOnlyVerbs = false;
                break;
            }
        return hasOnlyVerbs;
    }

    /** True if the TEI morph's chosen interpretation carries a verb ctag. */
    private static boolean isVerb(TEIMorph morph) {
        return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag());
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java
0 → 100644
package pl.waw.ipipan.zil.core.md.detection.head;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.Sentence;
import pl.waw.ipipan.zil.core.md.entities.Token;
import weka.core.Instances;

import java.io.File;
import java.io.InputStream;
import java.util.*;

/**
 * Detects mention head tokens in a sentence using a serialized Weka model.
 * Verb tokens are never considered head candidates.
 */
public class HeadDetector {

    final private static Logger logger = LoggerFactory.getLogger(HeadDetector.class);

    // null when loading failed; detectHeads() guards against this
    private Model model;
    private Set<String> quasiVerbs = new HashSet<>();

    /** Running total of heads detected by all instances (not thread-safe). */
    public static int detectedHeads = 0;

    /**
     * Loads the detection model from a file.
     *
     * @param headDetectionModel file with a serialized {@link Model}
     */
    public HeadDetector(File headDetectionModel) {
        try {
            this.model = Serializer.loadModel(headDetectionModel.getAbsolutePath());
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            // keep the stack trace for diagnosis
            logger.error("Error loading model:" + e, e);
        }
    }

    /**
     * Loads the detection model from a stream (e.g. a classpath resource).
     *
     * @param headDetectionModelStream stream with a serialized {@link Model}
     */
    public HeadDetector(InputStream headDetectionModelStream) {
        try {
            this.model = Serializer.loadModelFromStream(headDetectionModelStream);
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e, e);
        }
    }

    /**
     * Detects head tokens in the given sentence.
     *
     * @param sentence sentence to process
     * @return head tokens in sentence order, or {@code null} when the sentence
     *         yields no candidate tokens or the model failed to load
     */
    public List<Token> detectHeads(Sentence sentence) {
        if (model == null) {
            // constructor already logged the loading failure; avoid NPE here
            logger.error("Model not loaded, cannot detect heads.");
            return null;
        }

        List<TreeMap<String, Object>> examples = new ArrayList<>();
        InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
        if (examples.isEmpty())
            return null;

        Instances instances = model.getInstances(examples);

        // label instances; examples are generated for non-verb tokens only,
        // in sentence order, so index i below matches the i-th non-verb token
        List<Boolean> areHeads = new ArrayList<>();
        List<Token> heads = new ArrayList<>();
        for (int i = 0; i < instances.numInstances(); i++) {
            boolean isHead = model.isHead(instances.instance(i), sentence);
            areHeads.add(isHead);
            if (isHead)
                detectedHeads++;
        }

        int i = 0;
        for (Token m : sentence) {
            if (FeatureGeneration.isVerb(m))
                continue;
            if (areHeads.get(i))
                heads.add(m);
            // sentence.addMention(new Mention(m, false));
            i++;
        }
        return heads;
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java
0 → 100644
package pl.waw.ipipan.zil.core.md.detection.head;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
import weka.core.Attribute;
import weka.core.FastVector;
import weka.core.Instance;
import weka.core.Instances;

import java.io.File;
import java.util.*;
import java.util.Map.Entry;

/**
 * Builds Weka training/classification data for head detection from
 * TEI-encoded corpus texts. One example is produced per non-verb token; the
 * boolean "class" feature marks whether the token is a mention head.
 */
public class InstanceCreator {

    private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
    private static final TEI_IO teiIO = TEI_IO.getInstance();

    private InstanceCreator() {
        // utility class, no instances
    }

    /**
     * Loads training examples from every NKJP text directory under the given
     * data directory. Texts that fail to parse are logged and skipped.
     *
     * @param dataDir    root directory with NKJP texts
     * @param quasiVerbs quasi-verb lemmas passed through to feature generation
     * @return feature maps, one per candidate token
     */
    public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
        int allTexts = 0;
        int exceptions = 0;
        int allSentences = 0;

        List<TreeMap<String, Object>> examples = new ArrayList<>();
        for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
            try {
                allTexts++;
                logger.info("Processing text " + textDir);
                TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
                Text text = TeiLoader.loadTextFromTei(ct, textDir);

                for (Paragraph p : text)
                    for (Sentence s : p) {
                        allSentences++;
                        loadExamplesFromSentence(quasiVerbs, examples, s);
                    }

            } catch (Exception e) {
                // log with stack trace; message alone is not enough to debug
                logger.error(e.getLocalizedMessage(), e);
                exceptions++;
            }
        }

        logger.info(allTexts + " texts found.");
        if (exceptions != 0)
            logger.error(exceptions + " texts with exceptions.");
        logger.info(allSentences + " sentences found.");

        return examples;
    }

    /**
     * Appends one example per non-verb token of the sentence to
     * {@code examples}. The "class" feature is true iff the token is a head
     * segment of some non-verbal mention.
     *
     * @param quasiVerbs quasi-verb lemmas passed through to feature generation
     * @param examples   output list to append to
     * @param s          sentence to process
     */
    public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
            Sentence s) {

        // collect positive examples
        Set<Token> positive = new HashSet<>();
        for (Mention m : s.getMentions()) {
            if (!FeatureGeneration.isVerb(m)) {
                positive.addAll(m.getHeadSegments());
            }
        }

        for (Token m : s) {
            if (FeatureGeneration.isVerb(m))
                continue;

            TreeMap<String, Object> features = new TreeMap<>();
            features.put("class", Boolean.valueOf(positive.contains(m)));

            FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
            examples.add(features);
        }
    }

    /**
     * Builds an empty Instances object whose attributes cover every feature
     * seen in the examples: numeric for Integer/Double values, "false"/"true"
     * nominal for Booleans, and nominal over observed values otherwise.
     *
     * @param examples         feature maps to derive the schema from
     * @param classFeatureName name of the feature to use as the class
     * @return empty dataset with the derived schema and class attribute set
     */
    public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {

        TreeSet<String> booleanAttsOccurred = new TreeSet<>();
        TreeSet<String> doubleAttsOccurred = new TreeSet<>();
        TreeMap<String, Set<String>> att2values = new TreeMap<>();
        for (TreeMap<String, Object> example : examples) {
            for (Entry<String, Object> e : example.entrySet()) {
                String key = e.getKey();
                Object val = e.getValue();
                if (val instanceof Integer || val instanceof Double) {
                    doubleAttsOccurred.add(key);
                    continue;
                }
                if (val instanceof Boolean) {
                    booleanAttsOccurred.add(key);
                    continue;
                }
                if (!att2values.containsKey(key))
                    att2values.put(key, new HashSet<>());
                att2values.get(key).add(val.toString());
            }
        }

        List<Attribute> atts = new ArrayList<>();

        // double attributes
        for (String attName : doubleAttsOccurred) {
            atts.add(new Attribute(attName));
        }

        // boolean attributes (treated as nominal)
        FastVector values = new FastVector(2);
        values.addElement("false");
        values.addElement("true");
        for (String attName : booleanAttsOccurred) {
            atts.add(new Attribute(attName, values));
        }

        // nominal attributes
        for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
            FastVector vals = new FastVector(attVals.getValue().size());
            for (String val : attVals.getValue())
                vals.addElement(val);
            atts.add(new Attribute(attVals.getKey(), vals));
        }

        FastVector fvWekaAttributes = new FastVector(atts.size());
        for (Attribute attr : atts) {
            fvWekaAttributes.addElement(attr);
        }

        Instances data = new Instances("Head", fvWekaAttributes, 10);
        data.setClass(data.attribute(classFeatureName));
        return data;
    }

    /**
     * Converts each feature map into a Weka Instance and adds it to the
     * dataset. Nominal values unseen in the schema become missing values.
     *
     * @param examples  feature maps to convert
     * @param instances target dataset whose schema defines the attributes
     */
    public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
        for (TreeMap<String, Object> example : examples) {
            Instance instance = new Instance(instances.numAttributes());

            for (Entry<String, Object> e : example.entrySet()) {
                Object val = e.getValue();
                String name = e.getKey();
                if (val instanceof Integer) {
                    instance.setValue(instances.attribute(name), (int) val);
                } else if (val instanceof Boolean) {
                    instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
                } else {
                    int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
                    if (indexOfValue == -1) {
                        logger.debug("Unknown value: " + val.toString() + " of feature: " + name
                                + ". Marking as missing value.");
                        instance.setMissing(instances.attribute(name));
                    } else
                        instance.setValue(instances.attribute(name), indexOfValue);
                }
            }

            instance.setDataset(instances);
            instances.add(instance);
        }
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java
0 → 100644
package pl.waw.ipipan.zil.core.md.detection.head;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.Sentence;
import weka.classifiers.Classifier;
import weka.core.Instance;
import weka.core.Instances;

import java.io.Serializable;
import java.util.List;
import java.util.Set;
import java.util.TreeMap;

/**
 * Serializable bundle of a trained head-detection classifier, the dataset
 * schema (attribute definitions) it expects, and the quasi-verb lemma list it
 * was trained with.
 */
public class Model implements Serializable {

    private static final long serialVersionUID = 3351727361273283076L;
    private static final Logger logger = LoggerFactory.getLogger(Model.class);

    // final: the model is immutable once constructed
    private final Classifier classifier;
    private final Set<String> quasiVerbs;
    private final Instances instances;

    /**
     * @param classifier trained Weka classifier
     * @param instances  (possibly empty) dataset defining the attribute schema
     * @param quasiVerbs quasi-verb lemmas used during training
     */
    public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
        this.classifier = classifier;
        this.instances = instances;
        this.quasiVerbs = quasiVerbs;
    }

    /**
     * Classifies a single token instance as head or not.
     *
     * @param instance instance built against this model's schema
     * @param sentence source sentence, used only for error reporting
     * @return true iff the classifier predicts the positive class; false on
     *         classification failure (logged, never rethrown)
     */
    public boolean isHead(Instance instance, Sentence sentence) {
        try {
            // class attribute is nominal {false, true}: index > 0 means "true"
            double response = this.classifier.classifyInstance(instance);
            return response > 0;
        } catch (Exception e) {
            logger.error("Error classifying head in sentence: " + sentence, e);
            return false;
        }
    }

    /**
     * Builds a dataset with this model's schema filled with the given examples.
     *
     * @param examples feature maps, one per candidate token
     * @return dataset ready for classification
     */
    public Instances getInstances(List<TreeMap<String, Object>> examples) {
        Instances instances = new Instances(this.instances);
        InstanceCreator.fillInstances(examples, instances);
        return instances;
    }

    /** Quasi-verb lemmas the model was trained with. */
    public Set<String> getQuasiVerbs() {
        return quasiVerbs;
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java
0 → 100644
package pl.waw.ipipan.zil.core.md.detection.head;

import weka.core.SerializationHelper;

import java.io.InputStream;

/**
 * Persistence helpers for {@link Model} objects, backed by Weka's
 * {@code SerializationHelper}.
 */
public class Serializer {

    private Serializer() {
        // utility class, no instances
    }

    /**
     * Serializes the model to the given file path.
     *
     * @throws Exception when writing fails (propagated from Weka)
     */
    public static void saveModel(Model m, String targetModelFilePath) throws Exception {
        SerializationHelper.write(targetModelFilePath, m);
    }

    /**
     * Deserializes a model from the given file path.
     *
     * @throws Exception when reading fails (propagated from Weka)
     */
    public static Model loadModel(String path) throws Exception {
        return (Model) SerializationHelper.read(path);
    }

    /**
     * Deserializes a model from an open input stream.
     *
     * @throws Exception when reading fails (propagated from Weka)
     */
    public static Model loadModelFromStream(InputStream stream) throws Exception {
        return (Model) SerializationHelper.read(stream);
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | |
2 | + | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import weka.classifiers.Evaluation; | |
6 | +import weka.classifiers.rules.JRip; | |
7 | +import weka.classifiers.rules.JRip.RipperRule; | |
8 | +import weka.core.Attribute; | |
9 | +import weka.core.Instance; | |
10 | +import weka.core.Instances; | |
11 | + | |
12 | +import java.io.*; | |
13 | +import java.util.*; | |
14 | + | |
15 | +public class Trainer { | |
16 | + | |
17 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); | |
18 | + | |
19 | + private static final boolean DO_CV = false; | |
20 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | |
21 | + | |
22 | + private Trainer() { | |
23 | + } | |
24 | + | |
25 | + public static void main(String[] args) { | |
26 | + | |
27 | + if (args.length != 2) { | |
28 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | |
29 | + + " trainDir targetModelFile"); | |
30 | + return; | |
31 | + } | |
32 | + | |
33 | + File dataDir = new File(args[0]); | |
34 | + String targetModelFilePath = args[1]; | |
35 | + | |
36 | + if (!dataDir.isDirectory()) { | |
37 | + logger.error(dataDir + " is not a directory!"); | |
38 | + return; | |
39 | + } | |
40 | + | |
41 | + Set<String> quasiVerbs = loadQuasiVerbs(); | |
42 | + | |
43 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | |
44 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | |
45 | + InstanceCreator.fillInstances(examples, instances); | |
46 | + | |
47 | + printStats(instances); | |
48 | + | |
49 | + try { | |
50 | + JRip model; | |
51 | + | |
52 | + if (DO_CV) { | |
53 | + logger.info("Crossvalidation..."); | |
54 | + model = new JRip(); | |
55 | + Evaluation eval = new Evaluation(instances); | |
56 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | |
57 | + logger.info(eval.toSummaryString()); | |
58 | + logger.info(eval.toMatrixString()); | |
59 | + logger.info(eval.toClassDetailsString()); | |
60 | + } | |
61 | + | |
62 | + logger.info("Building final classifier..."); | |
63 | + model = new JRip(); | |
64 | + model.buildClassifier(instances); | |
65 | + logger.info(model.getRuleset().size() + " rules generated."); | |
66 | + for (int i = 0; i < model.getRuleset().size(); i++) { | |
67 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | |
68 | + logger.info("\t" + v.toString(instances.classAttribute())); | |
69 | + } | |
70 | + | |
71 | + instances.delete(); | |
72 | + logger.info("Features stats:"); | |
73 | + for (int i = 0; i < instances.numAttributes(); i++) { | |
74 | + Attribute att = instances.attribute(i); | |
75 | + logger.info(i + ".\t" + att.toString()); | |
76 | + } | |
77 | + | |
78 | + logger.info("Saving classifier..."); | |
79 | + Model m = new Model(model, instances, quasiVerbs); | |
80 | + Serializer.saveModel(m, targetModelFilePath); | |
81 | + logger.info("Done."); | |
82 | + | |
83 | + } catch (Exception e) { | |
84 | + logger.error("Error: " + e); | |
85 | + } | |
86 | + } | |
87 | + | |
88 | + private static Set<String> loadQuasiVerbs() { | |
89 | + Set<String> quasiVerbs = new HashSet<>(); | |
90 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | |
91 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | |
92 | + String line; | |
93 | + while ((line = br.readLine()) != null) { | |
94 | + quasiVerbs.add(line.trim()); | |
95 | + } | |
96 | + } catch (IOException e) { | |
97 | + logger.error(e.getLocalizedMessage(), e); | |
98 | + } | |
99 | + return quasiVerbs; | |
100 | + } | |
101 | + | |
102 | + private static void printStats(Instances instances) { | |
103 | + int positive = 0; | |
104 | + int negative = 0; | |
105 | + for (int i = 0; i < instances.numInstances(); i++) { | |
106 | + Instance inst = instances.instance(i); | |
107 | + if (inst.classValue() > 0) | |
108 | + negative++; | |
109 | + else | |
110 | + positive++; | |
111 | + } | |
112 | + logger.info(positive + " positive examples"); | |
113 | + logger.info(negative + " negative examples"); | |
114 | + logger.info((positive + negative) + " examples total"); | |
115 | + logger.info((instances.numAttributes() - 1) + " attributes"); | |
116 | + logger.info(instances.toSummaryString()); | |
117 | + } | |
118 | + | |
119 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
4 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | |
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | |
8 | + | |
9 | +import java.util.*; | |
10 | + | |
11 | + | |
/**
 * Generates classifier features for nominal mention detection: given a mention
 * head token and a candidate token, builds the attribute map consumed by the
 * trained model.
 */
public class FeatureGeneration {
	// Lemmas / punctuation that end a clause once it already contains a finite verb.
	final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
			"lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

	// Additional (mostly adversative) conjunctions that also split clauses.
	final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
			"lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
			"przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

	// Tokens that split clauses unconditionally.
	final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
			Arrays.asList(new String[] { "?", "!" }));

	// Paired delimiters: opening token -> its closing counterpart.
	final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
	static {
		CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
		CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
		CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
	}

	// Ctags treated as nominal (nouns, pronouns, gerunds, numerals).
	final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
			"ppron3", "ger", "num", "numcol" }));

	// Ctags of personal pronouns.
	final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

	// Ctags of finite verb forms.
	final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
			"praet", "winien" }));

	// Lemmas of relative pronouns; a clause starting with one is merged into the previous clause.
	final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
			"który" }));
40 | + | |
	/**
	 * Populates the feature map for a (head, candidate) token pair, used to decide
	 * whether the candidate token belongs to the mention anchored at the head.
	 *
	 * @param features  output map: feature name -> value
	 * @param valence   Walenty valence dictionaries; the nouns dictionary is used here
	 * @param head      mention head token
	 * @param candidate candidate token for inclusion in the mention
	 * @param s         sentence containing both tokens
	 * @param heads     all mention head tokens detected in the sentence
	 */
	public static void generateFeatures(Map<String, Object> features, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
			Token head, Token candidate, Sentence s, List<Token> heads) {

		//addTokenFeatures(features, "head", head, s);
		addTokenFeatures(features, "candidate", candidate, s);

		//features.put("sentLength", s.size()); // last feature checked
		features.put("sameWord", sameWord(head, candidate, s));
		features.put("sameNE", sameNE(head, candidate, s));
		features.put("sameNG", sameNG(head, candidate, s));

		// Linear distance between head and candidate within the sentence.
		features.put("distance", Math.abs(head.getSentencePosition() - candidate.getSentencePosition()));
		//features.put("headIsFirst", Boolean.valueOf(head.compareTo(candidate) < 0));
		features.put("candidateIsFirst", Boolean.valueOf(head.compareTo(candidate) > 0));

		features.put("sameWalentyConstruction", sameWalentyConstruction(head, candidate, s, valence));
		features.put("sameToken", sameToken(head, candidate));

		features.put("candidateIsAlsoHead", Boolean.valueOf(heads.contains(candidate)));
		features.put("isNextToCandidateColon", isNextColon(candidate, s));

		// Orthographic / capitalization features.
		features.put("candidateStartsWithUpperOrth", Character.isUpperCase(candidate.getOrth().codePointAt(0)));
		features.put("candidateStartsWithUpperBase", Character.isUpperCase(candidate.getBase().codePointAt(0)));
		features.put("isDotNextToHead", isNextDot(head, s));
		features.put("closestNEDistance", closestNEDistance(head, candidate, s));
		features.put("headStartsWithUpperOrth", Character.isUpperCase(head.getOrth().codePointAt(0)));
		features.put("headStartsWithUpperBase", Character.isUpperCase(head.getBase().codePointAt(0))); // the optimal feature set ends here


		// candidate in head in closest NE distance

//		features.put("candidateOrthLength", candidate.getOrth().length());
//		features.put("candidateBaseLength", candidate.getBase().length());
//		features.put("headOrthLength", head.getOrth().length());
//		features.put("headBaseLength", head.getBase().length());

		//features.put("isNextToHeadColon", isNextColon(head, s));
		//features.put("isCandidateColon", Boolean.valueOf(candidate.getOrth().equals(":"))); // just do a run; this has not been checked yet

/*		features.put("isClauseSplitLemmaStrict", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(candidate.getBase())));
		features.put("isClauseSplitLemma", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS.contains(candidate.getBase())));
		features.put("isClauseSplitLemma2", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(candidate.getBase())));*/

/*		Token next = getNeighbouringToken(s, candidate, 1);
		if (next != null) {
			features.put("nextIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(next.getBase())));
			features.put("nextIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(next.getBase())));
			features.put("nextIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(next.getBase())));
		} else {
			features.put("nextIsClauseSplitLemmaStrict", "sentEnd");
			features.put("nextIsClauseSplitLemma", "sentEnd");
			features.put("nextIsClauseSplitLemma2", "sentEnd");
		}

		Token previous = getNeighbouringToken(s, candidate, -1);
		if (previous != null) {
			features.put("previousIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(previous.getBase())));
			features.put("previousIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(previous.getBase())));
			features.put("previousIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(previous.getBase())));
		} else {
			features.put("previousIsClauseSplitLemmaStrict", "sentStart");
			features.put("previousIsClauseSplitLemma", "sentStart");
			features.put("previousIsClauseSplitLemma2", "sentStart");
		}*/


		//features.put("candidateIsClosingBracket", candidateIsClosingBracket(head, candidate, s));
		//features.put("candidateIsQM", candidateIsClosingQM(head, candidate, s));
		//features.put("candidateIsClosingBracket", Boolean.valueOf(candidate.getOrth().equals(")")));

		// Leftover experiment notes (translated from Polish):
		// position of the head within the mention — can be simulated!! something doesn't work
		// unfortunately right-hand continuity probably needs to be re-checked once more
		// add head-NG-group length and Walenty-construction-group length — may combine well with distance
		// add is-stop-word for candidate, and maybe some solutions from head detection
		// also check whether the preceding token is part of the mention
		// experiment with the separators too
		// word Ctag !!
/*
		Token next = getNeighbouringToken(s, candidate, 1);
		if (next != null) {
			features.put(String.format("%sCtag", "nextToCandidate"), next.getChosenInterpretation().getCtag());
			features.put(String.format("%sNumber", "nextToCandidate"), next.getChosenInterpretation().getNumber());
			features.put(String.format("%sGender", "nextToCandidate"), next.getChosenInterpretation().getGender());
			features.put(String.format("%sPerson", "nextToCandidate"), next.getChosenInterpretation().getPerson());
		} else {
			features.put(String.format("%sCtag", "nextToCandidate"), "null");
			features.put(String.format("%sNumber", "nextToCandidate"), "null");
			features.put(String.format("%sGender", "nextToCandidate"), "null");
			features.put(String.format("%sPerson", "nextToCandidate"), "null");
		}

		Token previous = getNeighbouringToken(s, candidate, -1);
		if (previous != null) {
			features.put(String.format("%sCtag", "previousToCandidate"), previous.getChosenInterpretation().getCtag());
			features.put(String.format("%sNumber", "previousToCandidate"), previous.getChosenInterpretation().getNumber());
			features.put(String.format("%sGender", "previousToCandidate"), previous.getChosenInterpretation().getGender());
			features.put(String.format("%sPerson", "previousToCandidate"), previous.getChosenInterpretation().getPerson());
		} else {
			features.put(String.format("%sCtag", "previousToCandidate"), "null");
			features.put(String.format("%sNumber", "previousToCandidate"), "null");
			features.put(String.format("%sGender", "previousToCandidate"), "null");
			features.put(String.format("%sPerson", "previousToCandidate"), "null");
		}
		*/


	}
148 | + | |
149 | + private static int closestNEDistance(Token head, Token candidate, Sentence s) { | |
150 | + int lowestDistance = -1; | |
151 | + for (NamedEntity ne : s.getNamedEntities()) { | |
152 | + int distance = ne.getTokens().get(0).getSentencePosition() - head.getSentencePosition(); | |
153 | + if ( distance >= 0 && ne.getTokens().contains(candidate) && (distance < lowestDistance || lowestDistance < 0)) { | |
154 | + lowestDistance = distance; | |
155 | + } | |
156 | + } | |
157 | + return lowestDistance; | |
158 | + } | |
159 | + | |
160 | + ///////////////////////////// | |
161 | + | |
162 | +/* private static boolean candidateIsClosingBracket(Token head, Token candidate, Sentence s) { | |
163 | + | |
164 | + | |
165 | + | |
166 | + if (!candidate.getOrth().equals(")")) { | |
167 | + return Boolean.valueOf(false); | |
168 | + } | |
169 | + | |
170 | + int openedBrackets = 0; | |
171 | + int closedBrackets = 0; | |
172 | + for (Token t : s) { | |
173 | + if (candidate.getSentencePosition() == t.getSentencePosition()) { | |
174 | + break; | |
175 | + } | |
176 | + | |
177 | + if (t.getSentencePosition() >= head.getSentencePosition()) { | |
178 | + if (t.getOrth().equals("(")) | |
179 | + openedBrackets++; | |
180 | + if (t.getOrth().equals(")")) | |
181 | + closedBrackets++; | |
182 | + } | |
183 | + } | |
184 | + | |
185 | + if (openedBrackets - closedBrackets > 0) { | |
186 | + return Boolean.valueOf(true); | |
187 | + } | |
188 | + | |
189 | + return Boolean.valueOf(false); | |
190 | + }*/ | |
191 | + | |
192 | + private static boolean isNextColon(Token t, Sentence s) { | |
193 | + int idx = s.indexOf(t) + 1; | |
194 | + if (idx >= s.size() || idx < 0) | |
195 | + return Boolean.valueOf(false); | |
196 | + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); | |
197 | + } | |
198 | + | |
199 | + private static boolean isNextDot(Token t, Sentence s) { | |
200 | + int idx = s.indexOf(t) + 1; | |
201 | + if (idx >= s.size() || idx < 0) | |
202 | + return Boolean.valueOf(false); | |
203 | + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); | |
204 | + } | |
205 | + | |
	/**
	 * True iff the candidate is a '"' that closes a quotation opened between the
	 * head and the candidate: quotation marks in the scanned span are counted, and
	 * an odd count means the candidate would close an open quote.
	 * NOTE(review): currently referenced only from commented-out code above.
	 */
	private static boolean candidateIsClosingQM(Token head, Token candidate, Sentence s) {

		if (!candidate.getOrth().equals("\"")) {
			return Boolean.valueOf(false);
		}

		// Scan span runs from the earlier of the two tokens up to (but excluding)
		// the sentence position 'end'; the span excludes the candidate itself.
		int start = head.getSentencePosition();
		int end = candidate.getSentencePosition() - 1;
		if (head.compareTo(candidate) > 0) {
			start = candidate.getSentencePosition() + 1;
			end = head.getSentencePosition();
		}

		int QMs = 0;
		for (Token t : s) {
			// Stop as soon as the end position is reached (it is not counted).
			if (end == t.getSentencePosition()) {
				break;
			}

			if (t.getSentencePosition() >= start) {
				if (t.getOrth().equals("\""))
					QMs++;
			}
		}

		// An odd number of marks means a quote is still open before the candidate.
		if ((QMs % 2) != 0) {
			return Boolean.valueOf(true);
		}

		return Boolean.valueOf(false);
	}
237 | + | |
238 | + private static boolean sameWord(Token t1, Token t2, Sentence s) { | |
239 | + | |
240 | + for (SyntacticWord w : s.getSyntacticWords()) { | |
241 | + if (w.getTokens().contains(t1) && w.getTokens().contains(t2)) { | |
242 | + return Boolean.valueOf(true); | |
243 | + } | |
244 | + } | |
245 | + return Boolean.valueOf(false); | |
246 | + } | |
247 | + | |
248 | + private static boolean sameNE(Token t1, Token t2, Sentence s) { | |
249 | + | |
250 | + for (NamedEntity ne : s.getNamedEntities()) { | |
251 | + if (ne.getTokens().contains(t1) && ne.getTokens().contains(t2)) { | |
252 | + return Boolean.valueOf(true); | |
253 | + } | |
254 | + } | |
255 | + return Boolean.valueOf(false); | |
256 | + } | |
257 | + | |
258 | + private static boolean sameNG(Token head, Token candidate, Sentence s) { | |
259 | + | |
260 | + for (SyntacticGroup group : s.getGroups()) { | |
261 | + if (group.getType().startsWith("NG")) { | |
262 | + if (group.getSemanticHeadTokens().contains(head) && group.getTokens().contains(candidate)) { | |
263 | + return Boolean.valueOf(true); | |
264 | + } | |
265 | + } | |
266 | + } | |
267 | + return Boolean.valueOf(false); | |
268 | + } | |
269 | + | |
	/**
	 * True iff head and candidate fall into one "extended" nominal group: an NG
	 * group — possibly extended with following groups realizing a Walenty valence
	 * schema of its head noun — whose semantic heads contain {@code head} and whose
	 * token span contains {@code candidate}.
	 */
	private static boolean sameWalentyConstruction(Token head, Token candidate, Sentence s,
			Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {

		for (SyntacticGroup group : s.getGroups()) {
			if (group.getType().startsWith("NG")) {
				// Collect this group and the chain of groups directly following it.
				ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>();
				nestedGroups.add(group);

				SyntacticGroup nextGroup = group.getFollowingGroup();
				while (nextGroup != null) {
					nestedGroups.add(nextGroup);
					nextGroup = nextGroup.getFollowingGroup();
				}

				List<Token> extendedGroupSegments = getExtendedGroupSegments(nestedGroups, valence.get(ValenceDicts.NounsValence));
				List<Token> extendedGroupHeads = getExtendedGroupHeads(nestedGroups);
				if (extendedGroupHeads.contains(head) && extendedGroupSegments.contains(candidate))
					return Boolean.valueOf(true);
			}
		}
		return Boolean.valueOf(false);
	}
292 | + | |
293 | + private static List<Token> getExtendedGroupSegments(ArrayList<SyntacticGroup> nestedGroups, | |
294 | + Map<String,ArrayList<String>> walentyNouns) { | |
295 | + | |
296 | + SyntacticGroup initialGroup = nestedGroups.get(0); | |
297 | + String initialGroupHead = initialGroup.getSemanticHeadTokens().get(0).getBase(); | |
298 | + | |
299 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | |
300 | + List<Token> segments = new ArrayList<Token>(); | |
301 | + | |
302 | + if (!walentyNouns.containsKey(initialGroupHead)) { | |
303 | + segments.addAll(initialGroup.getTokens()); | |
304 | + } else { | |
305 | + | |
306 | + ArrayList<String> schemata = walentyNouns.get(initialGroupHead); | |
307 | + ArrayList<ArrayList<String>> groupsRealizations = new ArrayList<ArrayList<String>>(); | |
308 | + ArrayList<SyntacticGroup> largestMatch = new ArrayList<SyntacticGroup>(); | |
309 | + largestMatch.add(initialGroup); | |
310 | + | |
311 | + for (int i=1; i < nestedGroups.size(); i++) { | |
312 | + SyntacticGroup group = nestedGroups.get(i); | |
313 | + ArrayList<String> realizations = group.getWalentyRealizations(); | |
314 | + groupsRealizations.add(realizations); | |
315 | + if (realizationsMatch(schemata, groupsRealizations)) { | |
316 | + largestMatch.add(group); | |
317 | + } else { | |
318 | + break; | |
319 | + } | |
320 | + } | |
321 | + | |
322 | + for (SyntacticGroup group : largestMatch) { | |
323 | + segments.addAll(group.getTokens()); | |
324 | + } | |
325 | + | |
326 | + } | |
327 | + return segments; | |
328 | + } | |
329 | + | |
330 | + private static List<Token> getExtendedGroupHeads(ArrayList<SyntacticGroup> nestedGroups) { | |
331 | + | |
332 | + SyntacticGroup initialGroup = nestedGroups.get(0); | |
333 | + | |
334 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | |
335 | + | |
336 | + return heads; | |
337 | + } | |
338 | + | |
339 | + private static boolean realizationsMatch(ArrayList<String> schemata, | |
340 | + ArrayList<ArrayList<String>> groupsRealizations) { | |
341 | + for (String schema : schemata) { | |
342 | + if (isProperSchema(schema, groupsRealizations)) { | |
343 | + return true; | |
344 | + } | |
345 | + } | |
346 | + return false; | |
347 | + } | |
348 | + | |
	/**
	 * True iff every group's realizations match at least one position of the schema.
	 * NOTE(review): the positions are not required to be pairwise distinct — the
	 * stricter cartesian-product check below is intentionally disabled.
	 */
	private static boolean isProperSchema(String schema,
			ArrayList<ArrayList<String>> groupsRealizations) {

		// One entry per group: the schema positions that group can realize.
		ArrayList<ArrayList<String>> matchingPositions = new ArrayList<ArrayList<String>>();
		for (ArrayList<String> realizations : groupsRealizations) {
			matchingPositions.add(getMatchingPositions(schema, realizations));
		}

		if (matchingPositionsExists(matchingPositions)) {
			return true;
			/*ArrayList<ArrayList<String>> product = cartesianProduct(matchingPositions);
			for (ArrayList<String> combination : product) {
				Set<String> combinationSet = new HashSet<String>(combination);
				if (combinationSet.size() == matchingPositions.size()) {
					return true;
				}
			}*/
		}
		return false;
	}
369 | + | |
370 | + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { | |
371 | + ArrayList<String> positions = new ArrayList<String>(); | |
372 | + for (String position : schema.split("\\s\\+\\s")) { | |
373 | + position = position.trim(); | |
374 | + position = position.substring(1, position.length()-1); | |
375 | + for (String phrT : position.split(";")) { | |
376 | + if (phraseRealizations.contains(phrT.trim())) { | |
377 | + positions.add(position); | |
378 | + break; | |
379 | + } | |
380 | + } | |
381 | + } | |
382 | + return positions; | |
383 | + } | |
384 | + | |
385 | + private static boolean matchingPositionsExists(ArrayList<ArrayList<String>> matchingPositions) { | |
386 | + for (ArrayList<String> positions : matchingPositions) { | |
387 | + if (positions.isEmpty()) { | |
388 | + return false; | |
389 | + } | |
390 | + } | |
391 | + return true; | |
392 | + } | |
393 | + | |
394 | + private static boolean sameToken(Token t1, Token t2) { | |
395 | + if (t1.compareTo(t2) == 0) { | |
396 | + return Boolean.valueOf(true); | |
397 | + } | |
398 | + return Boolean.valueOf(false); | |
399 | + } | |
400 | + ////////////////////////////////// | |
401 | + | |
	/**
	 * Adds morphosyntactic features of a single token (all keys prefixed with
	 * {@code label}) to the feature map: its own ctag/number/gender/person, the
	 * ctag of the syntactic word containing it, and the (word) ctags of its
	 * immediate neighbours.
	 */
	private static void addTokenFeatures(Map<String, Object> features, String label, Token t, Sentence s) {
		features.put(String.format("%sCtag", label), t.getChosenInterpretation().getCtag());
		features.put(String.format("%sNumber", label), t.getChosenInterpretation().getNumber());
		features.put(String.format("%sGender", label), t.getChosenInterpretation().getGender());
		features.put(String.format("%sPerson", label), t.getChosenInterpretation().getPerson());
		features.put(String.format("%sWordCtag", label), wordCtag(t, s));

		features.put(String.format("%sNextCtag", label), getNeighbouringTag(s, t, 1));
		features.put(String.format("%sPrevCtag", label), getNeighbouringTag(s, t, -1));


		Token next = getNeighbouringToken(s, t, 1);
		if (next != null) {
			features.put(String.format("%sNextWordCtag", label), wordCtag(next, s));
		} else {
			// "None" marks the sentence boundary.
			features.put(String.format("%sNextWordCtag", label), "None");
		}

		Token previous = getNeighbouringToken(s, t, -1);
		if (previous != null) {
			features.put(String.format("%sPrevWordCtag", label), wordCtag(previous, s));
		} else {
			features.put(String.format("%sPrevWordCtag", label), "None");
		}

//		features.put(String.format("%sNextNextCtag", label), getNeighbouringTag(s, t, 2));
//		features.put(String.format("%sPrevPrevCtag", label), getNeighbouringTag(s, t, -2));

//		features.put(String.format("%sSentPosition", label), t.getSentencePosition());


//		features.put(String.format("%sPrevPraet", label), isPrevPraet(t, s));
//		features.put(String.format("%sPrevComma", label), isPrevComma(t, s));
//		features.put(String.format("%sPrev2Pred", label), isPrev2Pred(t, s));
//		features.put(String.format("%sNextInf", label), isNextInf(t, s));

/*		List<Token> clause = getClause(s, t);
		if (clause != null)
			features.put(String.format("%sClauseLength", label), clause.size());
		else
			features.put(String.format("%sClauseLength", label), 0);*/

		/*addFeatures(features, clause, String.format("%sClause", label), t);
		addFeatures(features, s, String.format("%sSent", label), t);*/
//		for (int i = 1; i < 6; i++) // do this, but in a window spanning from head to candidate
//			addFeatures(features, getWindow(s, t, i, 0), String.format("%sWindow_", label) + i + "_" + 0, t);
//		for (int i = 1; i < 6; i++)
//			addFeatures(features, getWindow(s, t, 0, i), String.format("%sWindow_", label) + 0 + "_" + i, t);
//		for (int i = 1; i < 6; i++)
//			addFeatures(features, getWindow(s, t, i, i), String.format("%sWindow_", label) + i + "_" + i, t);
	}
453 | + | |
454 | + private static String wordCtag(Token t, Sentence s) { | |
455 | + for (SyntacticWord w : s.getSyntacticWords()) { | |
456 | + if (w.getTokens().contains(t)) { | |
457 | + return w.getCtag(); | |
458 | + } | |
459 | + } | |
460 | + return "None"; | |
461 | + } | |
462 | + | |
463 | + private static boolean isNextInf(Token m, Sentence s) { | |
464 | + boolean now = false; | |
465 | + for (Token morph : s) { | |
466 | + if (now) | |
467 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | |
468 | + if (m.equals(morph)) | |
469 | + now = true; | |
470 | + } | |
471 | + return false; | |
472 | + } | |
473 | + | |
474 | + private static boolean isPrev2Pred(Token m, Sentence s) { | |
475 | + Token prev = null; | |
476 | + Token prev2 = null; | |
477 | + for (Token morph : s) { | |
478 | + if (m.equals(morph)) | |
479 | + break; | |
480 | + prev2 = prev; | |
481 | + prev = morph; | |
482 | + } | |
483 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | |
484 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | |
485 | + } | |
486 | + | |
487 | + private static Object isPrevComma(Token m, Sentence s) { | |
488 | + Token prev = null; | |
489 | + for (Token morph : s) { | |
490 | + if (m.equals(morph)) | |
491 | + break; | |
492 | + prev = morph; | |
493 | + } | |
494 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | |
495 | + } | |
496 | + | |
497 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | |
498 | + int idx = s.indexOf(m) + i; | |
499 | + if (idx >= s.size() || idx < 0) | |
500 | + return "None"; | |
501 | + return s.get(idx).getChosenInterpretation().getCtag(); | |
502 | + } | |
503 | + | |
504 | + private static Token getNeighbouringToken(Sentence s, Token m, int i) { | |
505 | + int idx = s.indexOf(m) + i; | |
506 | + if (idx >= s.size() || idx < 0) | |
507 | + return null; | |
508 | + return s.get(idx); | |
509 | + } | |
510 | + | |
	/**
	 * Adds agreement features describing nominal tokens of {@code clause} relative
	 * to keyword token {@code m}: presence of nominative forms, number agreement,
	 * gender-or-person agreement, and their combinations (including "at least two"
	 * variants). Nouns directly preceded by "jak"/"jako" are skipped.
	 */
	private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

		boolean hasNom = false; // 1 — some nominative nominal present
		boolean hasNum = false; // 2 — some nominal agreeing in number
		boolean hasPOG = false; // 3 — some nominal agreeing in gender or person

		boolean hasNomNum = false;
		boolean hasNumPOG = false;
		boolean hasNomPOG = false;
		boolean hasNomNumPOG = false;

		boolean has2Nom = false;
		boolean has2NomPOG = false;
		boolean has2POG = false;

		Token prev = null;
		for (Token candidate : clause) {

			// Only nouns count; "jak X" / "jako X" comparisons are excluded.
			if (!isNoun(candidate) || isJakJako(prev)) {
				prev = candidate;
				continue;
			}

			// nom, nom2
			if (isNom(candidate)) {
				if (hasNom)
					has2Nom = true;
				hasNom = true;
			}
			// num
			if (agreedNum(candidate, m)) {
				hasNum = true;
			}
			// pog, pog2
			if (agreedGenderOrPerson(candidate, m)) {
				if (hasPOG)
					has2POG = true;
				hasPOG = true;
			}

			// nom num, nom num pog
			if (isNom(candidate) && agreedNum(candidate, m)) {
				if (agreedGenderOrPerson(candidate, m))
					hasNomNumPOG = true;
				hasNomNum = true;
			}

			// nom pog, num pog
			if (agreedGenderOrPerson(candidate, m))
				if (isNom(candidate)) {
					if (hasNomPOG)
						has2NomPOG = true;
					hasNomPOG = true;
				} else if (agreedNum(candidate, m))
					hasNumPOG = true;

			prev = candidate;
		}

		// features.put("conj_" + prefix, hasConj);
		features.put("cand_2_nom_" + prefix, has2Nom);
		features.put("cand_2_POG_" + prefix, has2POG);
		features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

		features.put("cand_nom_" + prefix, hasNom);
		features.put("cand_num_" + prefix, hasNum);
		features.put("cand_POG_" + prefix, hasPOG);

		features.put("cand_nom+num_" + prefix, hasNomNum);
		features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
		features.put("cand_nom+POG_" + prefix, hasNomPOG);
		features.put("cand_num+POG_" + prefix, hasNumPOG);
	}
584 | + | |
585 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | |
586 | + | |
587 | + int idx = s.indexOf(m); | |
588 | + int from = Math.max(0, idx - pre); | |
589 | + int to = Math.min(s.size(), idx + post + 1); | |
590 | + | |
591 | + return new ArrayList<>(s.subList(from, to)); | |
592 | + } | |
593 | + | |
594 | + private static boolean isPrevPraet(Token m, Sentence s) { | |
595 | + Token prev = null; | |
596 | + for (Token morph : s) { | |
597 | + if (m.equals(morph)) | |
598 | + break; | |
599 | + prev = morph; | |
600 | + } | |
601 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | |
602 | + } | |
603 | + | |
604 | + /** | |
605 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | |
606 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | |
607 | + * osobowej), | |
608 | + * | |
609 | + * @param s | |
610 | + * sentence | |
611 | + * @param m2 | |
612 | + * token | |
613 | + * @return clause with the token | |
614 | + */ | |
615 | + public static List<Token> getClause(Sentence s, Token m2) { | |
616 | + | |
617 | + List<List<Token>> sublists = getClauses(s); | |
618 | + | |
619 | + for (List<Token> sub : sublists) | |
620 | + for (Token m : sub) | |
621 | + if (m.equals(m2)) | |
622 | + return sub; | |
623 | + | |
624 | + return null; | |
625 | + } | |
626 | + | |
	/**
	 * Splits the sentence into clauses. Split points are the strict splitters
	 * ("?", "!"), or lemmas from the split lists once the current clause already
	 * contains a finite verb. Tokens inside syntactic groups/words are never split
	 * apart. A trailing verbless fragment is merged into the previous clause, and
	 * a clause whose first two tokens include a relative pronoun ("jaki", "który")
	 * is merged into the preceding clause.
	 */
	public static List<List<Token>> getClauses(Sentence s) {

		// Tokens that must stay glued to their successor: all but the last token
		// of every syntactic group and of every syntactic word.
		Set<Token> noSplitMorphs = new HashSet<>();
		for (SyntacticGroup g : s.getGroups()) {
			for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
				noSplitMorphs.add(m);
			}
		}
		for (SyntacticWord g : s.getSyntacticWords()) {
			for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
				noSplitMorphs.add(m);
			}
		}

		LinkedList<List<Token>> sublists = new LinkedList<>();
		List<Token> currentSublist = new ArrayList<>();
		boolean clauseHasVerb = false;
		for (Token m : s) {
			String base = m.getChosenInterpretation().getBase();
			if (!noSplitMorphs.contains(m)
					&& (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
							.contains(base)) && clauseHasVerb))) {
				sublists.add(currentSublist);
				currentSublist = new ArrayList<>();
				clauseHasVerb = false;
			} else {
				if (isVerb(m))
					clauseHasVerb = true;
			}
			// The splitting token itself opens the next clause.
			currentSublist.add(m);
		}
		if (currentSublist.size() > 0) {
			if (clauseHasVerb)
				sublists.add(currentSublist);
			else if (!sublists.isEmpty())
				// A verbless tail is not a clause of its own; append it to the last one.
				sublists.getLast().addAll(currentSublist);
		}

		// merge clause beginning with zaimek wzgl. (relative pronoun) etc to previous clause
		List<Token> prev = null;
		Iterator<List<Token>> it = sublists.iterator();
		while (it.hasNext()) {
			List<Token> sublist = it.next();
			boolean containsRelPron = false;
			int i = 1;
			// Only the first two tokens of the clause are inspected.
			for (Token m : sublist) {
				if (i > 2)
					break;
				if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
					containsRelPron = true;
					break;
				}
				i++;
			}
			if (prev != null && containsRelPron) {
				prev.addAll(sublist);
				it.remove();
			} else
				prev = sublist;
		}

		return sublists;
	}
690 | + | |
691 | + private static boolean agreedNum(Token candidate, Token keyword) { | |
692 | + String keywordNum = keyword.getNumber(); | |
693 | + String wordNum = candidate.getNumber(); | |
694 | + return keywordNum.equals(wordNum); | |
695 | + } | |
696 | + | |
697 | + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) { | |
698 | + if (isPraet(keyword)) { | |
699 | + // praet has number:gender | |
700 | + String keywordGender = keyword.getGender(); | |
701 | + String wordGender = candidate.getGender(); | |
702 | + return keywordGender.equals(wordGender); | |
703 | + } else { | |
704 | + // other verbs have number:person | |
705 | + String keywordPerson = keyword.getPerson(); | |
706 | + String wordPerson = "ter"; // default | |
707 | + if (PRONOUN_TAGS.contains(candidate.getCtag())) | |
708 | + wordPerson = candidate.getPerson(); | |
709 | + return wordPerson.equals(keywordPerson); | |
710 | + } | |
711 | + } | |
712 | + | |
713 | + private static boolean isJakJako(Token prev) { | |
714 | + String base = prev == null ? null : prev.getBase(); | |
715 | + return prev != null && (base.equals("jak") || base.equals("jako")); | |
716 | + } | |
717 | + | |
718 | + private static boolean isPraet(Token keyword) { | |
719 | + return keyword.getCtag().equals("praet"); | |
720 | + } | |
721 | + | |
722 | + private static boolean isNom(Token candidate) { | |
723 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | |
724 | + // tylko! | |
725 | + } | |
726 | + | |
727 | + public static boolean isNoun(Token m) { | |
728 | + return NOUN_TAGS.contains(m.getCtag()); | |
729 | + } | |
730 | + | |
731 | + public static boolean isNoun(Mention m) { | |
732 | + return NOUN_TAGS.contains(m.getHeadSegments().get(0).getCtag()); | |
733 | + } | |
734 | + | |
735 | + public static boolean isVerb(Token morph) { | |
736 | + return VERB_TAGS.contains(morph.getCtag()); | |
737 | + } | |
738 | + | |
739 | + public static boolean isVerb(Mention m) { | |
740 | + boolean hasOnlyVerbs = true; | |
741 | + for (Token morph : m.getSegments()) | |
742 | + if (!isVerb(morph)) { | |
743 | + hasOnlyVerbs = false; | |
744 | + break; | |
745 | + } | |
746 | + return hasOnlyVerbs; | |
747 | + } | |
748 | + | |
749 | + public static boolean isVerb(TEIMention m) { | |
750 | + boolean hasOnlyVerbs = true; | |
751 | + for (TEIMorph morph : m.getMorphs()) | |
752 | + if (!isVerb(morph)) { | |
753 | + hasOnlyVerbs = false; | |
754 | + break; | |
755 | + } | |
756 | + return hasOnlyVerbs; | |
757 | + } | |
758 | + | |
759 | + private static boolean isVerb(TEIMorph morph) { | |
760 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | |
761 | + } | |
762 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | + | |
6 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
7 | +import pl.waw.ipipan.zil.core.md.entities.*; | |
8 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | |
9 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | |
10 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | |
11 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | |
12 | +import weka.core.Attribute; | |
13 | +import weka.core.FastVector; | |
14 | +import weka.core.Instance; | |
15 | +import weka.core.Instances; | |
16 | + | |
17 | +import java.io.File; | |
18 | +import java.util.*; | |
19 | +import java.util.Map.Entry; | |
20 | + | |
21 | +public class InstanceCreator { | |
22 | + | |
23 | + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); | |
24 | + private static final TEI_IO teiIO = TEI_IO.getInstance(); | |
25 | + | |
26 | + private InstanceCreator() { | |
27 | + } | |
28 | + | |
29 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs, | |
30 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
31 | + int allTexts = 0; | |
32 | + int exceptions = 0; | |
33 | + int allSentences = 0; | |
34 | + | |
35 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | |
36 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | |
37 | + try { | |
38 | + allTexts++; | |
39 | + logger.info("Processing text " + textDir); | |
40 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | |
41 | + Text text = TeiLoader.loadTextFromTei(ct, textDir); | |
42 | + | |
43 | + for (Paragraph p : text) | |
44 | + for (Sentence s : p) { | |
45 | + allSentences++; | |
46 | + loadExamplesFromSentence(quasiVerbs, valence, examples, s); | |
47 | + } | |
48 | + | |
49 | + } catch (Exception e) { | |
50 | + //logger.error(e.getLocalizedMessage()); | |
51 | + e.printStackTrace(); | |
52 | + exceptions++; | |
53 | + } | |
54 | + } | |
55 | + | |
56 | + logger.info(allTexts + " texts found."); | |
57 | + if (exceptions != 0) | |
58 | + logger.error(exceptions + " texts with exceptions."); | |
59 | + logger.info(allSentences + " sentences found."); | |
60 | + | |
61 | + return examples; | |
62 | + } | |
63 | + | |
64 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
65 | + List<TreeMap<String, Object>> examples, Sentence s) { | |
66 | + | |
67 | + | |
68 | + ArrayList<Token> heads = new ArrayList<>(); | |
69 | + for (Mention m : s.getMentions()) { | |
70 | + heads.addAll(m.getHeadSegments()); | |
71 | + } | |
72 | + | |
73 | + // collect positive examples | |
74 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | |
75 | + for (Mention m : s.getMentions()) { | |
76 | + if (heads.containsAll(m.getHeadSegments())) { | |
77 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | |
78 | + } | |
79 | + } | |
80 | + | |
81 | + for (Token head : s) { | |
82 | + if (heads.contains(head)) { | |
83 | + for (Token t : s) { | |
84 | + //if (head.compareTo(t) != 0) {// && Math.abs(head.getSentencePosition() - t.getSentencePosition()) <= window) { | |
85 | + TreeMap<String, Object> features = new TreeMap<>(); | |
86 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | |
87 | + features.put("class", Boolean.valueOf(true)); | |
88 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | |
89 | + | |
90 | + } else { | |
91 | + features.put("class", Boolean.valueOf(false)); | |
92 | + //features.put("candidatePositionInMention", 0); | |
93 | + } | |
94 | + | |
95 | + | |
96 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | |
97 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | |
98 | + addPreviousStates(features, head, t, s); | |
99 | + | |
100 | + examples.add(features); | |
101 | + // } | |
102 | + } | |
103 | + } | |
104 | + } | |
105 | + } | |
106 | + | |
107 | + public static void addPreviousStates(Map<String, Object> features, Token head, Token candidate, Sentence s) { | |
108 | + int context = 1; | |
109 | + int candidateLocation = candidate.getSentencePosition(); | |
110 | + for (int i = 1; i <= context; i++) { | |
111 | + if (candidateLocation - i < 0) { | |
112 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | |
113 | + } else if (sameMention(s.get(candidateLocation - i), head, s) ) { | |
114 | + features.put(String.format("location-%d", i), Boolean.valueOf(true)); | |
115 | + } else { | |
116 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | |
117 | + } | |
118 | + } | |
119 | + } | |
120 | + | |
121 | + public static int positionInMention(Token head, Token t, Sentence s) { | |
122 | + | |
123 | + Token previous = null; | |
124 | + if (t.getSentencePosition()-1 >= 0) { | |
125 | + previous = s.get(t.getSentencePosition()-1); | |
126 | + } else { | |
127 | + return 0; | |
128 | + } | |
129 | + | |
130 | + for (Mention m : s.getMentions()) { | |
131 | + if (m.getHeadSegments().contains(head) && m.getSegments().contains(previous)) { | |
132 | +/* if (m.getSegments().get(0).getSentencePosition() - t.getSentencePosition() <= -1) { | |
133 | + System.out.println(m.getSegments().get(0)); | |
134 | + System.out.println(t); | |
135 | + System.out.println(m.getSegments()); | |
136 | + }*/ | |
137 | + return previous.getSentencePosition() - m.getSegments().get(0).getSentencePosition(); | |
138 | + } | |
139 | + } | |
140 | + return 0; | |
141 | + } | |
142 | + | |
143 | + private static boolean sameMention(Token t1, Token t2, Sentence s) { | |
144 | + for (Mention m : s.getMentions()) { | |
145 | + if (m.getSegments().contains(t1) && m.getSegments().contains(t2)) { | |
146 | + return true; | |
147 | + } | |
148 | + } | |
149 | + return false; | |
150 | + } | |
151 | + | |
152 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
153 | + List<TreeMap<String, Object>> examples, Sentence s, List<Token> heads) { | |
154 | + | |
155 | + | |
156 | + if (heads == null || heads.isEmpty()) | |
157 | + return; | |
158 | + | |
159 | + // collect positive examples | |
160 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | |
161 | + for (Mention m : s.getMentions()) { | |
162 | + if (heads.containsAll(m.getHeadSegments())) { | |
163 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | |
164 | + } | |
165 | + } | |
166 | + | |
167 | + for (Token head : s) { | |
168 | + if (heads.contains(head)) { | |
169 | + for (Token t : s) { | |
170 | + TreeMap<String, Object> features = new TreeMap<>(); | |
171 | + | |
172 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | |
173 | + features.put("class", Boolean.valueOf(true)); | |
174 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | |
175 | + | |
176 | + } else { | |
177 | + features.put("class", Boolean.valueOf(false)); | |
178 | + //features.put("candidatePositionInMention", 0); | |
179 | + } | |
180 | + | |
181 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | |
182 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | |
183 | + addPreviousStates(features, head, t, s); | |
184 | + examples.add(features); | |
185 | + } | |
186 | + } | |
187 | + } | |
188 | + } | |
189 | + | |
190 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | |
191 | + | |
192 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | |
193 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | |
194 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | |
195 | + for (TreeMap<String, Object> example : examples) { | |
196 | + for (Entry<String, Object> e : example.entrySet()) { | |
197 | + String key = e.getKey(); | |
198 | + Object val = e.getValue(); | |
199 | + if (val instanceof Integer || val instanceof Double) { | |
200 | + doubleAttsOccurred.add(key); | |
201 | + continue; | |
202 | + } | |
203 | + if (val instanceof Boolean) { | |
204 | + booleanAttsOccurred.add(key); | |
205 | + continue; | |
206 | + } | |
207 | + if (!att2values.containsKey(key)) | |
208 | + att2values.put(key, new HashSet<>()); | |
209 | + att2values.get(key).add(val.toString()); | |
210 | + } | |
211 | + } | |
212 | + | |
213 | + List<Attribute> atts = new ArrayList<>(); | |
214 | + | |
215 | + // double attributes | |
216 | + for (String attName : doubleAttsOccurred) { | |
217 | + Attribute att = new Attribute(attName); | |
218 | + atts.add(att); | |
219 | + } | |
220 | + | |
221 | + // boolean attributes (treated as nominal) | |
222 | + FastVector values = new FastVector(2); | |
223 | + values.addElement("false"); | |
224 | + values.addElement("true"); | |
225 | + for (String attName : booleanAttsOccurred) { | |
226 | + Attribute att = new Attribute(attName, values); | |
227 | + atts.add(att); | |
228 | + } | |
229 | + | |
230 | + // nominal attributes | |
231 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | |
232 | + FastVector vals = new FastVector(attVals.getValue().size()); | |
233 | + for (String val : attVals.getValue()) | |
234 | + vals.addElement(val); | |
235 | + Attribute att = new Attribute(attVals.getKey(), vals); | |
236 | + atts.add(att); | |
237 | + } | |
238 | + | |
239 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | |
240 | + for (Attribute attr : atts) { | |
241 | + fvWekaAttributes.addElement(attr); | |
242 | + } | |
243 | + | |
244 | + Instances data = new Instances("Nominal", fvWekaAttributes, 10); | |
245 | + data.setClass(data.attribute(classFeatureName)); | |
246 | + return data; | |
247 | + } | |
248 | + | |
249 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | |
250 | + for (TreeMap<String, Object> example : examples) { | |
251 | + addInstance(example, instances); | |
252 | + } | |
253 | + } | |
254 | + | |
255 | + public static void addInstance(TreeMap<String, Object> example, Instances instances) { | |
256 | + Instance instance = new Instance(instances.numAttributes()); | |
257 | + | |
258 | + for (Entry<String, Object> e : example.entrySet()) { | |
259 | + Object val = e.getValue(); | |
260 | + String name = e.getKey(); | |
261 | + if (val instanceof Integer) { | |
262 | + instance.setValue(instances.attribute(name), (int) val); | |
263 | + } else if (val instanceof Boolean) { | |
264 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | |
265 | + } else { | |
266 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | |
267 | + if (indexOfValue == -1) { | |
268 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | |
269 | + + ". Marking as missing value."); | |
270 | + instance.setMissing(instances.attribute(name)); | |
271 | + } else | |
272 | + instance.setValue(instances.attribute(name), indexOfValue); | |
273 | + } | |
274 | + } | |
275 | + | |
276 | + instance.setDataset(instances); | |
277 | + instances.add(instance); | |
278 | + } | |
279 | + | |
280 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
6 | +import weka.classifiers.Classifier; | |
7 | +import weka.core.Instance; | |
8 | +import weka.core.Instances; | |
9 | + | |
10 | +import java.io.Serializable; | |
11 | +import java.util.List; | |
12 | +import java.util.Set; | |
13 | +import java.util.TreeMap; | |
14 | + | |
15 | +public class Model implements Serializable { | |
16 | + | |
17 | + private static final long serialVersionUID = 3351727361273283076L; | |
18 | + private static final Logger logger = LoggerFactory.getLogger(Model.class); | |
19 | + | |
20 | + private Classifier classifier; | |
21 | + private Set<String> quasiVerbs; | |
22 | + private Instances instances; | |
23 | + | |
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | |
25 | + this.classifier = classifier; | |
26 | + this.instances = instances; | |
27 | + this.quasiVerbs = quasiVerbs; | |
28 | + } | |
29 | + | |
30 | + public boolean arePartOfSameMention(Instance instance, Sentence sentence) { | |
31 | + try { | |
32 | + double response = this.classifier.classifyInstance(instance); | |
33 | + return response > 0; | |
34 | + } catch (Exception e) { | |
35 | + logger.error("Error classyfing verb in sentence: " + sentence, e); | |
36 | + return false; | |
37 | + } | |
38 | + } | |
39 | + | |
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | |
41 | + Instances instances = new Instances(this.instances); | |
42 | + InstanceCreator.fillInstances(examples, instances); | |
43 | + return instances; | |
44 | + } | |
45 | + | |
46 | + public Instances getInstances() { | |
47 | + Instances instances = new Instances(this.instances); | |
48 | + return instances; | |
49 | + } | |
50 | + | |
51 | + public Set<String> getQuasiVerbs() { | |
52 | + return quasiVerbs; | |
53 | + } | |
54 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import java.io.File; | |
4 | +import java.io.InputStream; | |
5 | +import java.util.ArrayList; | |
6 | +import java.util.HashSet; | |
7 | +import java.util.List; | |
8 | +import java.util.Map; | |
9 | +import java.util.Set; | |
10 | +import java.util.TreeMap; | |
11 | +import java.util.Map.Entry; | |
12 | + | |
13 | +import org.slf4j.Logger; | |
14 | +import org.slf4j.LoggerFactory; | |
15 | + | |
16 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
17 | +import pl.waw.ipipan.zil.core.md.detection.nominal.FeatureGeneration; | |
18 | +import pl.waw.ipipan.zil.core.md.detection.nominal.InstanceCreator; | |
19 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Model; | |
20 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Serializer; | |
21 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | |
22 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | |
23 | +import pl.waw.ipipan.zil.core.md.entities.Token; | |
24 | +import weka.core.Instances; | |
25 | + | |
/**
 * Detects full nominal mention spans. Given the head tokens found by an
 * earlier stage, a trained classifier decides for every (head, token) pair
 * of a sentence whether the token belongs to that head's mention.
 */
public class NominalMentionDetector {
    final private static Logger logger = LoggerFactory.getLogger(NominalMentionDetector.class);

    // Trained span classifier, loaded in the constructor.
    private Model model;
    // Quasi-verb lemmas stored inside the loaded model.
    private Set<String> quasiVerbs = new HashSet<>();

    /**
     * Adds one mention per head to the sentence (in place). For each
     * (head, token) pair the classifier predicts whether the token belongs
     * to the head's mention; predicted segments are collected and boundary
     * prepositions/conjunctions are trimmed off.
     *
     * NOTE(review): the example index below assumes
     * InstanceCreator.loadExamplesFromSentence emits examples in exactly
     * (head outer, token inner) sentence order — keep both in sync.
     *
     * @param sentence sentence to annotate
     * @param valence valence dictionaries used for feature generation
     * @param heads previously detected mention head tokens
     */
    public void addNominalMentions(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, List<Token> heads) {
        List<TreeMap<String, Object>> examples = new ArrayList<>();
        InstanceCreator.loadExamplesFromSentence(quasiVerbs, valence, examples, sentence, heads);
        if (examples.isEmpty())
            return;

        Instances instances = model.getInstances();

        // label instances
        // Sequential classification: each example receives a "location-1"
        // feature holding the previous example's decision before being
        // classified itself.
        // NOTE(review): the feature is only reset at i == 0, so the decision
        // for the last token of one head leaks into the first token of the
        // next head — confirm intended.
        List<Boolean> areInSameMention = new ArrayList<>();
        for (int i = 0; i < examples.size(); i++) {
            TreeMap<String, Object> example = examples.get(i);
            if (i - 1 < 0) {
                example.put("location-1", Boolean.valueOf(false));
                //example.put("candidatePositionInMention", 0);
            } else {
                example.put("location-1", Boolean.valueOf(areInSameMention.get(i-1)));
//                int positionInMention = 1;
//                while (i - positionInMention >= 0 && areInSameMention.get(i-positionInMention)) {
//                    positionInMention++;
//                }
//                example.put("candidatePositionInMention", positionInMention-1);
            }

            InstanceCreator.addInstance(example, instances);
            boolean inSameMention = model.arePartOfSameMention(instances.instance(i), sentence);
            areInSameMention.add(inSameMention);
        }

        // Walk heads and tokens in the same order the examples were built,
        // consuming one decision per (head, token) pair via index i.
        int i = 0;
        for (Token head : sentence) {
            if (heads.contains(head)) {
                ArrayList<Token> mSegments = new ArrayList<Token>();
                ArrayList<Token> mHead = new ArrayList<Token>();
                mHead.add(head);
                for (Token t : sentence) {
                    if (head.compareTo(t) != 0) {
                        if (areInSameMention.get(i)) {
                            mSegments.add(t);
                        }
                    } else {
                        // The head itself always belongs to its mention, so
                        // mSegments is never empty below.
                        mSegments.add(t);
                    }
                    i++;
                }

                // cleaning
                // Trim a trailing, then a leading preposition/conjunction/
                // complementizer from the predicted span.
                if(mSegments.get(mSegments.size()-1).getCtag().equals("prep") || mSegments.get(mSegments.size()-1).getCtag().equals("conj") ||
                        mSegments.get(mSegments.size()-1).getCtag().equals("comp")) {
                    mSegments.remove(mSegments.size()-1);
                }
                if(mSegments.get(0).getCtag().equals("prep") || mSegments.get(0).getCtag().equals("conj") ||
                        mSegments.get(0).getCtag().equals("comp")) {
                    mSegments.remove(0);
                }

                sentence.addMention(new Mention(mSegments, mHead));
            }
        }
    }

    /**
     * Loads the span-classification model from a file.
     * On failure the error is logged and the detector is left without a model.
     */
    public NominalMentionDetector(File zeroSubjectDetectionModel) {
        try {
            this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }

    /**
     * Loads the span-classification model from an input stream.
     * On failure the error is logged and the detector is left without a model.
     */
    public NominalMentionDetector(InputStream zeroSubjectDetectionModelStream) {
        try {
            this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }
}
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import weka.core.SerializationHelper; | |
4 | + | |
5 | +import java.io.InputStream; | |
6 | + | |
7 | +public class Serializer { | |
8 | + | |
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | |
10 | + SerializationHelper.write(targetModelFilePath, m); | |
11 | + } | |
12 | + | |
13 | + public static Model loadModel(String path) throws Exception { | |
14 | + Model m = (Model) SerializationHelper.read(path); | |
15 | + return m; | |
16 | + } | |
17 | + | |
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | |
19 | + Model m = (Model) SerializationHelper.read(stream); | |
20 | + return m; | |
21 | + } | |
22 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | |
2 | + | |
3 | +import org.slf4j.Logger; | |
4 | +import org.slf4j.LoggerFactory; | |
5 | + | |
6 | +import pl.waw.ipipan.zil.core.md.Main; | |
7 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | |
8 | +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; | |
9 | +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; | |
10 | +import weka.classifiers.Evaluation; | |
11 | +import weka.classifiers.rules.JRip; | |
12 | +import weka.classifiers.rules.JRip.RipperRule; | |
13 | +import weka.classifiers.trees.J48; | |
14 | +import weka.core.Attribute; | |
15 | +import weka.core.Instance; | |
16 | +import weka.core.Instances; | |
17 | + | |
18 | +import java.io.*; | |
19 | +import java.util.*; | |
20 | + | |
21 | +public class Trainer { | |
22 | + | |
23 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); | |
24 | + | |
25 | + private static final boolean DO_CV = false; | |
26 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | |
27 | + private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt"; | |
28 | + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt"; | |
29 | + | |
30 | + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = | |
31 | + new EnumMap(ValenceDicts.class); | |
32 | + | |
33 | + static { | |
34 | + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE); | |
35 | + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream)); | |
36 | + | |
37 | + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); | |
38 | + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); | |
39 | + } | |
40 | + | |
41 | + private Trainer() { | |
42 | + } | |
43 | + | |
44 | + public static void main(String[] args) { | |
45 | + | |
46 | + if (args.length != 2) { | |
47 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | |
48 | + + " trainDir targetModelFile"); | |
49 | + return; | |
50 | + } | |
51 | + | |
52 | + File dataDir = new File(args[0]); | |
53 | + String targetModelFilePath = args[1]; | |
54 | + | |
55 | + if (!dataDir.isDirectory()) { | |
56 | + logger.error(dataDir + " is not a directory!"); | |
57 | + return; | |
58 | + } | |
59 | + | |
60 | + Set<String> quasiVerbs = loadQuasiVerbs(); | |
61 | + | |
62 | + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE); | |
63 | + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream)); | |
64 | + | |
65 | + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); | |
66 | + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); | |
67 | + | |
68 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs, valence); | |
69 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | |
70 | + InstanceCreator.fillInstances(examples, instances); | |
71 | + | |
72 | + printStats(instances); | |
73 | + | |
74 | + try { | |
75 | + J48 model; | |
76 | + | |
77 | + logger.info("Building final classifier..."); | |
78 | + model = new J48(); | |
79 | + model.buildClassifier(instances); | |
80 | + logger.info("J48 tree:"); | |
81 | + logger.info(model.toString()); | |
82 | + | |
83 | + instances.delete(); | |
84 | + logger.info("Features stats:"); | |
85 | + for (int i = 0; i < instances.numAttributes(); i++) { | |
86 | + Attribute att = instances.attribute(i); | |
87 | + logger.info(i + ".\t" + att.toString()); | |
88 | + } | |
89 | + | |
90 | + logger.info("Saving classifier..."); | |
91 | + Model m = new Model(model, instances, quasiVerbs); | |
92 | + Serializer.saveModel(m, targetModelFilePath); | |
93 | + logger.info("Done."); | |
94 | + | |
95 | + } catch (Exception e) { | |
96 | + logger.error("Error: " + e); | |
97 | + } | |
98 | + | |
99 | +/* try { | |
100 | + JRip model; | |
101 | + | |
102 | + if (DO_CV) { | |
103 | + logger.info("Crossvalidation..."); | |
104 | + model = new JRip(); | |
105 | + Evaluation eval = new Evaluation(instances); | |
106 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | |
107 | + logger.info(eval.toSummaryString()); | |
108 | + logger.info(eval.toMatrixString()); | |
109 | + logger.info(eval.toClassDetailsString()); | |
110 | + } | |
111 | + | |
112 | + logger.info("Building final classifier..."); | |
113 | + model = new JRip(); | |
114 | + model.buildClassifier(instances); | |
115 | + logger.info(model.getRuleset().size() + " rules generated."); | |
116 | + for (int i = 0; i < model.getRuleset().size(); i++) { | |
117 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | |
118 | + logger.info("\t" + v.toString(instances.classAttribute())); | |
119 | + } | |
120 | + | |
121 | + instances.delete(); | |
122 | + logger.info("Features stats:"); | |
123 | + for (int i = 0; i < instances.numAttributes(); i++) { | |
124 | + Attribute att = instances.attribute(i); | |
125 | + logger.info(i + ".\t" + att.toString()); | |
126 | + } | |
127 | + | |
128 | + logger.info("Saving classifier..."); | |
129 | + Model m = new Model(model, instances, quasiVerbs); | |
130 | + Serializer.saveModel(m, targetModelFilePath); | |
131 | + logger.info("Done."); | |
132 | + | |
133 | + } catch (Exception e) { | |
134 | + logger.error("Error: " + e); | |
135 | + }*/ | |
136 | + } | |
137 | + | |
138 | + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) | |
139 | + { | |
140 | + Map<String,ArrayList<String>> map; | |
141 | + try { | |
142 | + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); | |
143 | + map = new HashMap<String,ArrayList<String>>(); | |
144 | + String line; | |
145 | + boolean firstLine = true; | |
146 | + while((line = br.readLine()) != null) { | |
147 | + if (firstLine) { | |
148 | + line = line.replace("\uFEFF", ""); // remove BOM character | |
149 | + firstLine = false; | |
150 | + } | |
151 | + | |
152 | + if (!line.startsWith("%")) { | |
153 | + String[] lineParts = line.split(":"); | |
154 | + String lemma = lineParts[0].trim(); | |
155 | + String schema = lineParts[5].trim(); | |
156 | + | |
157 | + if (schema.trim().isEmpty()) { | |
158 | + continue; | |
159 | + } | |
160 | + | |
161 | + String[] lemmaParts = lemma.split(" "); | |
162 | + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { | |
163 | + lemma = lemma + " się"; | |
164 | + } | |
165 | + | |
166 | + ArrayList<String> schemata; | |
167 | + if (!map.containsKey(lemma)) { | |
168 | + schemata = new ArrayList<String>(); | |
169 | + schemata.add(schema); | |
170 | + map.put(lemma, schemata); | |
171 | + } else { | |
172 | + schemata = map.get(lemma); | |
173 | + schemata.add(schema); | |
174 | + map.put(lemma, schemata); | |
175 | + } | |
176 | + } | |
177 | + } | |
178 | + br.close(); | |
179 | + } catch (IOException ex) { | |
180 | + ex.printStackTrace(); | |
181 | + throw new RuntimeException(ex); | |
182 | + } | |
183 | + return map; | |
184 | + } | |
185 | + | |
186 | + private static boolean schemaContainsSie(String schema) { | |
187 | + for (String position : schema.split("\\s\\+\\s")) { | |
188 | + position = position.trim(); | |
189 | + position = position.substring(1, position.length()-1); | |
190 | + for (String phrT : position.split(";")) { | |
191 | + if (phrT.equals("refl") || phrT.equals("recip")) { | |
192 | + return true; | |
193 | + } | |
194 | + } | |
195 | + } | |
196 | + | |
197 | + return false; | |
198 | + } | |
199 | + | |
200 | + private static Set<String> loadQuasiVerbs() { | |
201 | + Set<String> quasiVerbs = new HashSet<>(); | |
202 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | |
203 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | |
204 | + String line; | |
205 | + while ((line = br.readLine()) != null) { | |
206 | + quasiVerbs.add(line.trim()); | |
207 | + } | |
208 | + } catch (IOException e) { | |
209 | + logger.error(e.getLocalizedMessage(), e); | |
210 | + } | |
211 | + return quasiVerbs; | |
212 | + } | |
213 | + | |
214 | + private static void printStats(Instances instances) { | |
215 | + int positive = 0; | |
216 | + int negative = 0; | |
217 | + for (int i = 0; i < instances.numInstances(); i++) { | |
218 | + Instance inst = instances.instance(i); | |
219 | + if (inst.classValue() > 0) | |
220 | + negative++; | |
221 | + else | |
222 | + positive++; | |
223 | + } | |
224 | + logger.info(positive + " positive examples"); | |
225 | + logger.info(negative + " negative examples"); | |
226 | + logger.info((positive + negative) + " examples total"); | |
227 | + logger.info((instances.numAttributes() - 1) + " attributes"); | |
228 | + logger.info(instances.toSummaryString()); | |
229 | + } | |
230 | + | |
231 | +} | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.entities; | |
2 | + | |
3 | +public class Relation { | |
4 | + | |
5 | + private String name; | |
6 | + private Token target; | |
7 | + | |
8 | + public Relation(String name, Token target) { | |
9 | + this.name = name; | |
10 | + this.target = target; | |
11 | + } | |
12 | + | |
13 | + public String getName() { | |
14 | + return name; | |
15 | + } | |
16 | + | |
17 | + public Token getTarget() { | |
18 | + return target; | |
19 | + } | |
20 | + | |
21 | +} | |
... | ... |