Commit 3d23a642e950208184da7ac7d198861326de7415
1 parent
2d60e476
Added missing files.
Showing
14 changed files
with
2360 additions
and
0 deletions
.gitignore
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | ||
4 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
5 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
7 | + | ||
8 | +import java.util.*; | ||
9 | + | ||
10 | +public class FeatureGeneration { | ||
    // Lemmas that end a clause once the clause already contains a finite verb:
    // coordinating conjunctions and clause-level punctuation.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
            "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

    // Additional clause-splitting lemmas: adversative / resultative conjunctions.
    // NOTE(review): "dlatego" is listed twice; harmless in a set, but likely a typo.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
            "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
            "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

    // Lemmas that always split a clause, regardless of whether a verb was seen yet.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList(new String[] { "?", "!" }));

    // Paired delimiters (opening -> closing). NOTE(review): declared and populated
    // but not referenced anywhere in this class as visible here.
    final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }
27 | + | ||
    // Ctags treated as nominal head candidates: nouns, depreciative forms,
    // personal pronouns, gerunds and numerals.
    final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
            "ppron3", "ger", "num", "numcol" }));

    // Personal pronoun ctags; only these carry an explicit person value
    // (see agreedGenderOrPerson, which defaults others to 3rd person).
    final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

    // Finite verb ctags, used for clause segmentation and for filtering verb tokens
    // out of head candidates.
    final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
            "praet", "winien" }));

    // Relative pronoun lemmas ("zaimki względne"); a clause starting with one of
    // these is merged back into the preceding clause in getClauses().
    final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
            "który" }));
    /**
     * Fills the feature map for one candidate token, used to decide whether the
     * token is a mention head.
     *
     * @param features   output map: feature name -> value (Boolean / Integer / String)
     * @param t          candidate token
     * @param s          sentence containing the token
     * @param quasiVerbs quasi-verb lemmas; currently unused here (only referenced by
     *                   the disabled "quasi" feature below), kept for signature
     *                   compatibility with the training pipeline
     */
    public static void generateFeatures(Map<String, Object> features, Token t, Sentence s, Set<String> quasiVerbs) {

        // Morphosyntactic tag and grammatical number of the chosen interpretation.
        features.put("ctag", t.getChosenInterpretation().getCtag());
        features.put("number", t.getChosenInterpretation().getNumber());

        // Syntactic-context features.
        features.put("NGHead", NGHead(t, s));
        features.put("isNextColon", isNextColon(t, s));
        features.put("wordCtag", wordCtag(t, s));
        features.put("isPartOfNE", isPartOfNE(t, s));
        features.put("isFirstInNE", isFirstInNE(t, s));
        features.put("nextCtag", getNeighbouringTag(s, t, 1));
        features.put("prevCtag", getNeighbouringTag(s, t, -1));
        features.put("sentLength", s.size());

        // Surface-form features.
        features.put("tokenOrthLength", t.getOrth().length());
        features.put("tokenBaseLength", t.getBase().length());
        features.put("isNextDot", isNextDot(t, s));
        features.put("closestNEDistance", closestNEDistance(t, s));
        features.put("startsWithUpperOrth", Character.isUpperCase(t.getOrth().codePointAt(0)));
        features.put("startsWithUpperBase", Character.isUpperCase(t.getBase().codePointAt(0)));

        // The features below were evidently tried and disabled; kept for reference.
        //features.put("isPartOfFrazeo", isPartOfFrazeo(t, s));
        //features.put("gender", t.getChosenInterpretation().getGender());
        //features.put("person", t.getChosenInterpretation().getPerson());
        //features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
        //features.put("isPrevPraet", isPrevPraet(t, s));
        //features.put("isPrevComma", isPrevComma(t, s));
        //features.put("isPrev2Pred", isPrev2Pred(t, s));
        //features.put("isNextInf", isNextInf(t, s));

        //List<Token> clause = getClause(s, m);
        // features.put("clauseLength", clause.size());

        //addFeatures(features, clause, "clause", m);
/*        addFeatures(features, s, "sent", t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, 0), "window_" + i + "_" + 0, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, 0, i), "window_" + 0 + "_" + i, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, i), "window_" + i + "_" + i, t);*/
    }
83 | + | ||
84 | + /////////////////////////////////// | ||
85 | + | ||
86 | + private static boolean NGHead(Token t, Sentence s) { | ||
87 | + | ||
88 | + for (SyntacticGroup group : s.getGroups()) { | ||
89 | + if (group.getType().startsWith("NG") && group.getSemanticHeadTokens().contains(t)) { | ||
90 | + return Boolean.valueOf(true); | ||
91 | + } | ||
92 | + } | ||
93 | + return Boolean.valueOf(false); | ||
94 | + } | ||
95 | + | ||
96 | + private static boolean isNextColon(Token t, Sentence s) { | ||
97 | + int idx = s.indexOf(t) + 1; | ||
98 | + if (idx >= s.size() || idx < 0) | ||
99 | + return Boolean.valueOf(false); | ||
100 | + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); | ||
101 | + } | ||
102 | + | ||
103 | + private static boolean isNextDot(Token t, Sentence s) { | ||
104 | + int idx = s.indexOf(t) + 1; | ||
105 | + if (idx >= s.size() || idx < 0) | ||
106 | + return Boolean.valueOf(false); | ||
107 | + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); | ||
108 | + } | ||
109 | + | ||
110 | + private static String wordCtag(Token t, Sentence s) { | ||
111 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
112 | + if (w.getTokens().contains(t)) { | ||
113 | + return w.getCtag(); | ||
114 | + } | ||
115 | + } | ||
116 | + return "None"; | ||
117 | + } | ||
118 | + | ||
119 | + private static boolean isPartOfNE(Token t, Sentence s) { | ||
120 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
121 | + if (ne.getTokens().contains(t)) { | ||
122 | + return Boolean.valueOf(true); | ||
123 | + } | ||
124 | + } | ||
125 | + return Boolean.valueOf(false); | ||
126 | + } | ||
127 | + | ||
128 | + private static int closestNEDistance(Token t, Sentence s) { | ||
129 | + int lowestDistance = -1; | ||
130 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
131 | + int distance = ne.getTokens().get(0).getSentencePosition() - t.getSentencePosition(); | ||
132 | + if ( distance >= 0 && (distance < lowestDistance || lowestDistance < 0)) { | ||
133 | + lowestDistance = distance; | ||
134 | + } | ||
135 | + } | ||
136 | + return lowestDistance; | ||
137 | + } | ||
138 | + | ||
139 | + private static boolean isFirstInNE(Token t, Sentence s) { | ||
140 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
141 | + if (ne.getTokens().get(0).compareTo(t) == 0) { | ||
142 | + return Boolean.valueOf(true); | ||
143 | + } | ||
144 | + } | ||
145 | + return Boolean.valueOf(false); | ||
146 | + } | ||
147 | + | ||
148 | + private static boolean isPartOfFrazeo(Token t, Sentence s) { | ||
149 | + for (SyntacticWord word : s.getSyntacticWords()) { | ||
150 | + if (word.getTokens().contains(t) && | ||
151 | + Constants.FRAZEO_CTAGS.contains(word.getCtag())) { | ||
152 | + return true; | ||
153 | + } | ||
154 | + } | ||
155 | + return false; | ||
156 | + } | ||
157 | + | ||
158 | + /////////////////////////////////// | ||
159 | + | ||
160 | + private static boolean isNextInf(Token m, Sentence s) { | ||
161 | + boolean now = false; | ||
162 | + for (Token morph : s) { | ||
163 | + if (now) | ||
164 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | ||
165 | + if (m.equals(morph)) | ||
166 | + now = true; | ||
167 | + } | ||
168 | + return false; | ||
169 | + } | ||
170 | + | ||
171 | + private static boolean isPrev2Pred(Token m, Sentence s) { | ||
172 | + Token prev = null; | ||
173 | + Token prev2 = null; | ||
174 | + for (Token morph : s) { | ||
175 | + if (m.equals(morph)) | ||
176 | + break; | ||
177 | + prev2 = prev; | ||
178 | + prev = morph; | ||
179 | + } | ||
180 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | ||
181 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | ||
182 | + } | ||
183 | + | ||
184 | + private static Object isPrevComma(Token m, Sentence s) { | ||
185 | + Token prev = null; | ||
186 | + for (Token morph : s) { | ||
187 | + if (m.equals(morph)) | ||
188 | + break; | ||
189 | + prev = morph; | ||
190 | + } | ||
191 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | ||
192 | + } | ||
193 | + | ||
194 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | ||
195 | + int idx = s.indexOf(m) + i; | ||
196 | + if (idx >= s.size() || idx < 0) | ||
197 | + return "None"; | ||
198 | + return s.get(idx).getChosenInterpretation().getCtag(); | ||
199 | + } | ||
200 | + | ||
    /**
     * Adds agreement features computed over a span of tokens (clause, window, or
     * whole sentence) relative to keyword token {@code m}. For every noun candidate
     * in the span (candidates directly preceded by "jak"/"jako" are skipped), flags
     * record nominative case, number agreement and gender-or-person agreement with
     * the keyword, their combinations on a single candidate, and "seen on at least
     * two candidates" variants.
     *
     * @param features output feature map; keys are suffixed with {@code prefix}
     * @param clause   token span to scan
     * @param prefix   identifies the span in feature names (e.g. "sent", "window_2_0")
     * @param m        keyword token agreement is checked against
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1: some candidate is in nominative case
        boolean hasNum = false; // 2: some candidate agrees in number
        boolean hasPOG = false; // 3: some candidate agrees in gender or person

        // Combinations of the three base conditions holding on one candidate.
        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        // Condition observed on at least two distinct candidates.
        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // Only nouns count as candidates; skip comparative/role phrases
            // ("jak X" / "jako X"), where X is not an independent mention head.
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }
274 | + | ||
275 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | ||
276 | + | ||
277 | + int idx = s.indexOf(m); | ||
278 | + int from = Math.max(0, idx - pre); | ||
279 | + int to = Math.min(s.size(), idx + post + 1); | ||
280 | + | ||
281 | + return new ArrayList<>(s.subList(from, to)); | ||
282 | + } | ||
283 | + | ||
284 | + private static boolean isPrevPraet(Token m, Sentence s) { | ||
285 | + Token prev = null; | ||
286 | + for (Token morph : s) { | ||
287 | + if (m.equals(morph)) | ||
288 | + break; | ||
289 | + prev = morph; | ||
290 | + } | ||
291 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | ||
292 | + } | ||
293 | + | ||
294 | + /** | ||
295 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | ||
296 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | ||
297 | + * osobowej), | ||
298 | + * | ||
299 | + * @param s | ||
300 | + * sentence | ||
301 | + * @param m2 | ||
302 | + * token | ||
303 | + * @return clause with the token | ||
304 | + */ | ||
305 | + public static List<Token> getClause(Sentence s, Token m2) { | ||
306 | + | ||
307 | + List<List<Token>> sublists = getClauses(s); | ||
308 | + | ||
309 | + for (List<Token> sub : sublists) | ||
310 | + for (Token m : sub) | ||
311 | + if (m.equals(m2)) | ||
312 | + return sub; | ||
313 | + | ||
314 | + return null; | ||
315 | + } | ||
316 | + | ||
317 | + public static List<List<Token>> getClauses(Sentence s) { | ||
318 | + | ||
319 | + Set<Token> noSplitMorphs = new HashSet<>(); | ||
320 | + for (SyntacticGroup g : s.getGroups()) { | ||
321 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | ||
322 | + noSplitMorphs.add(m); | ||
323 | + } | ||
324 | + } | ||
325 | + for (SyntacticWord g : s.getSyntacticWords()) { | ||
326 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | ||
327 | + noSplitMorphs.add(m); | ||
328 | + } | ||
329 | + } | ||
330 | + | ||
331 | + LinkedList<List<Token>> sublists = new LinkedList<>(); | ||
332 | + List<Token> currentSublist = new ArrayList<>(); | ||
333 | + boolean clauseHasVerb = false; | ||
334 | + for (Token m : s) { | ||
335 | + String base = m.getChosenInterpretation().getBase(); | ||
336 | + if (!noSplitMorphs.contains(m) | ||
337 | + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2 | ||
338 | + .contains(base)) && clauseHasVerb))) { | ||
339 | + sublists.add(currentSublist); | ||
340 | + currentSublist = new ArrayList<>(); | ||
341 | + clauseHasVerb = false; | ||
342 | + } else { | ||
343 | + if (isVerb(m)) | ||
344 | + clauseHasVerb = true; | ||
345 | + } | ||
346 | + currentSublist.add(m); | ||
347 | + } | ||
348 | + if (currentSublist.size() > 0) { | ||
349 | + if (clauseHasVerb) | ||
350 | + sublists.add(currentSublist); | ||
351 | + else | ||
352 | + sublists.getLast().addAll(currentSublist); | ||
353 | + } | ||
354 | + | ||
355 | + // merge clause beginning with zaimek wzgl. etc to previous clause | ||
356 | + List<Token> prev = null; | ||
357 | + Iterator<List<Token>> it = sublists.iterator(); | ||
358 | + while (it.hasNext()) { | ||
359 | + List<Token> sublist = it.next(); | ||
360 | + boolean containsRelPron = false; | ||
361 | + int i = 1; | ||
362 | + for (Token m : sublist) { | ||
363 | + if (i > 2) | ||
364 | + break; | ||
365 | + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) { | ||
366 | + containsRelPron = true; | ||
367 | + break; | ||
368 | + } | ||
369 | + i++; | ||
370 | + } | ||
371 | + if (prev != null && containsRelPron) { | ||
372 | + prev.addAll(sublist); | ||
373 | + it.remove(); | ||
374 | + } else | ||
375 | + prev = sublist; | ||
376 | + } | ||
377 | + | ||
378 | + return sublists; | ||
379 | + } | ||
380 | + | ||
381 | + private static boolean agreedNum(Token candidate, Token keyword) { | ||
382 | + String keywordNum = keyword.getNumber(); | ||
383 | + String wordNum = candidate.getNumber(); | ||
384 | + return keywordNum.equals(wordNum); | ||
385 | + } | ||
386 | + | ||
    /**
     * Checks gender-or-person agreement between a candidate noun and the keyword.
     * For "praet" keywords (inflected as number:gender) genders are compared;
     * for other verb forms (number:person) persons are compared, with non-pronoun
     * candidates defaulting to third person ("ter").
     */
    private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
        if (isPraet(keyword)) {
            // praet has number:gender
            String keywordGender = keyword.getGender();
            String wordGender = candidate.getGender();
            return keywordGender.equals(wordGender);
        } else {
            // other verbs have number:person
            String keywordPerson = keyword.getPerson();
            String wordPerson = "ter"; // default: non-pronouns are 3rd person
            // Only personal pronouns carry an explicit person value.
            if (PRONOUN_TAGS.contains(candidate.getCtag()))
                wordPerson = candidate.getPerson();
            return wordPerson.equals(keywordPerson);
        }
    }
402 | + | ||
403 | + private static boolean isJakJako(Token prev) { | ||
404 | + String base = prev == null ? null : prev.getBase(); | ||
405 | + return prev != null && (base.equals("jak") || base.equals("jako")); | ||
406 | + } | ||
407 | + | ||
408 | + private static boolean isPraet(Token keyword) { | ||
409 | + return keyword.getCtag().equals("praet"); | ||
410 | + } | ||
411 | + | ||
412 | + private static boolean isNom(Token candidate) { | ||
413 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | ||
414 | + // tylko! | ||
415 | + } | ||
416 | + | ||
417 | + private static boolean isNoun(Token m) { | ||
418 | + return NOUN_TAGS.contains(m.getCtag()); | ||
419 | + } | ||
420 | + | ||
421 | + public static boolean isVerb(Token morph) { | ||
422 | + return VERB_TAGS.contains(morph.getCtag()); | ||
423 | + } | ||
424 | + | ||
425 | + public static boolean isVerb(Mention m) { | ||
426 | + boolean hasOnlyVerbs = true; | ||
427 | + for (Token morph : m.getSegments()) | ||
428 | + if (!isVerb(morph)) { | ||
429 | + hasOnlyVerbs = false; | ||
430 | + break; | ||
431 | + } | ||
432 | + return hasOnlyVerbs; | ||
433 | + } | ||
434 | + | ||
435 | + public static boolean isVerb(TEIMention m) { | ||
436 | + boolean hasOnlyVerbs = true; | ||
437 | + for (TEIMorph morph : m.getMorphs()) | ||
438 | + if (!isVerb(morph)) { | ||
439 | + hasOnlyVerbs = false; | ||
440 | + break; | ||
441 | + } | ||
442 | + return hasOnlyVerbs; | ||
443 | + } | ||
444 | + | ||
445 | + private static boolean isVerb(TEIMorph morph) { | ||
446 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | ||
447 | + } | ||
448 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import pl.waw.ipipan.zil.core.md.entities.Token; | ||
7 | +import weka.core.Instances; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.io.InputStream; | ||
11 | +import java.util.*; | ||
12 | + | ||
/**
 * Detects mention head tokens in sentences using a serialized Weka model.
 */
public class HeadDetector {

    final private static Logger logger = LoggerFactory.getLogger(HeadDetector.class);

    // Loaded classification model. NOTE(review): remains null if loading fails in
    // the constructor, which would make detectHeads throw an NPE later -- confirm
    // callers guard against construction failure.
    private Model model;
    // Quasi-verb lemmas taken from the loaded model, forwarded to feature generation.
    private Set<String> quasiVerbs = new HashSet<>();

    // Running total of heads detected across all instances (static, not thread-safe).
    public static int detectedHeads = 0;

    /**
     * Detects head tokens in the given sentence.
     *
     * @param sentence sentence to process
     * @return list of detected head tokens, or null when the sentence yields no
     *         candidate examples
     */
    public List<Token> detectHeads(Sentence sentence) {
        List<TreeMap<String, Object>> examples = new ArrayList<>();
        InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
        if (examples.isEmpty())
            return null;

        Instances instances = model.getInstances(examples);

        // label instances
        List<Boolean> areHeads = new ArrayList<>();
        List<Token> heads = new ArrayList<>();
        for (int i = 0; i < instances.numInstances(); i++) {
            boolean isHead = model.isHead(instances.instance(i), sentence);
            areHeads.add(isHead);
            if (isHead)
                detectedHeads++;
        }

        // Map classifier decisions back onto tokens. This relies on the same
        // verb-filtering order as InstanceCreator.loadExamplesFromSentence, so the
        // i-th non-verb token corresponds to the i-th classified instance.
        int i = 0;
        for (Token m : sentence) {
            if (FeatureGeneration.isVerb(m))
                continue;
            if (areHeads.get(i))
                heads.add(m);
            // sentence.addMention(new Mention(m, false));
            i++;
        }
        return heads;
    }

    /**
     * Loads the detection model from a file. On failure the error is logged and
     * the detector is left without a usable model (see NOTE on {@code model}).
     *
     * @param zeroSubjectDetectionModel serialized model file
     */
    public HeadDetector(File zeroSubjectDetectionModel) {
        try {
            this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }

    /**
     * Loads the detection model from a stream; same failure behavior as
     * {@link #HeadDetector(File)}.
     *
     * @param zeroSubjectDetectionModelStream stream with the serialized model
     */
    public HeadDetector(InputStream zeroSubjectDetectionModelStream) {
        try {
            this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | ||
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
8 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | ||
9 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
10 | +import weka.core.Attribute; | ||
11 | +import weka.core.FastVector; | ||
12 | +import weka.core.Instance; | ||
13 | +import weka.core.Instances; | ||
14 | + | ||
15 | +import java.io.File; | ||
16 | +import java.util.*; | ||
17 | +import java.util.Map.Entry; | ||
18 | + | ||
/**
 * Builds Weka training/prediction data for head detection: converts corpus
 * sentences into per-token feature maps and those maps into Weka
 * {@link Instances}.
 */
public class InstanceCreator {

    private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
    private static final TEI_IO teiIO = TEI_IO.getInstance();

    // Utility class; not meant to be instantiated.
    private InstanceCreator() {
    }

    /**
     * Loads training examples (one feature map per non-verb token) from all
     * NKJP-style text directories under {@code dataDir}. Texts failing to parse
     * are logged and skipped (best-effort corpus load).
     *
     * @param dataDir    root directory containing NKJP text directories
     * @param quasiVerbs quasi-verb lemmas forwarded to feature generation
     * @return list of examples, each mapping feature name -> value (incl. "class")
     */
    public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
        int allTexts = 0;
        int exceptions = 0;
        int allSentences = 0;

        List<TreeMap<String, Object>> examples = new ArrayList<>();
        for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
            try {
                allTexts++;
                logger.info("Processing text " + textDir);
                TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
                Text text = TeiLoader.loadTextFromTei(ct, textDir);

                for (Paragraph p : text)
                    for (Sentence s : p) {
                        allSentences++;
                        loadExamplesFromSentence(quasiVerbs, examples, s);
                    }

            } catch (Exception e) {
                // Skip unreadable texts but keep processing the rest.
                logger.error(e.getLocalizedMessage());
                exceptions++;
            }
        }

        logger.info(allTexts + " texts found.");
        if (exceptions != 0)
            logger.error(exceptions + " texts with exceptions.");
        logger.info(allSentences + " sentences found.");

        return examples;
    }

    /**
     * Appends one example per non-verb token of sentence {@code s} to
     * {@code examples}. The "class" feature is true iff the token is a head
     * segment of some non-verbal gold mention.
     *
     * @param quasiVerbs quasi-verb lemmas forwarded to feature generation
     * @param examples   output list the new examples are appended to
     * @param s          sentence to extract examples from
     */
    public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
            Sentence s) {

        // collect positive examples: head segments of gold (non-verbal) mentions
        Set<Token> positive = new HashSet<>();
        for (Mention m : s.getMentions()) {
            if (!FeatureGeneration.isVerb(m)) {
                positive.addAll(m.getHeadSegments());
            }
        }

        for (Token m : s) {
            // Verb tokens are never head candidates. The same filter is applied at
            // prediction time (HeadDetector) so example order stays aligned.
            if (FeatureGeneration.isVerb(m))
                continue;

            TreeMap<String, Object> features = new TreeMap<>();
            if (positive.contains(m)) {
                features.put("class", Boolean.valueOf(true));
            } else {
                features.put("class", Boolean.valueOf(false));
            }

            FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
            examples.add(features);
        }
    }

    /**
     * Builds an empty Weka {@link Instances} schema from the observed examples:
     * numeric attributes for Integer/Double features, nominal {false,true}
     * attributes for Boolean features, and nominal attributes enumerating all
     * observed values for everything else. The class attribute is selected by name.
     *
     * @param examples         feature maps the schema is inferred from
     * @param classFeatureName name of the feature to use as the class attribute
     * @return empty Instances with the inferred attribute schema and class set
     */
    public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {

        // Partition feature names by value type across all examples.
        TreeSet<String> booleanAttsOccurred = new TreeSet<>();
        TreeSet<String> doubleAttsOccurred = new TreeSet<>();
        TreeMap<String, Set<String>> att2values = new TreeMap<>();
        for (TreeMap<String, Object> example : examples) {
            for (Entry<String, Object> e : example.entrySet()) {
                String key = e.getKey();
                Object val = e.getValue();
                if (val instanceof Integer || val instanceof Double) {
                    doubleAttsOccurred.add(key);
                    continue;
                }
                if (val instanceof Boolean) {
                    booleanAttsOccurred.add(key);
                    continue;
                }
                // Everything else is nominal: collect its observed value set.
                if (!att2values.containsKey(key))
                    att2values.put(key, new HashSet<>());
                att2values.get(key).add(val.toString());
            }
        }

        List<Attribute> atts = new ArrayList<>();

        // double attributes
        for (String attName : doubleAttsOccurred) {
            Attribute att = new Attribute(attName);
            atts.add(att);
        }

        // boolean attributes (treated as nominal)
        FastVector values = new FastVector(2);
        values.addElement("false");
        values.addElement("true");
        for (String attName : booleanAttsOccurred) {
            Attribute att = new Attribute(attName, values);
            atts.add(att);
        }

        // nominal attributes
        for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
            FastVector vals = new FastVector(attVals.getValue().size());
            for (String val : attVals.getValue())
                vals.addElement(val);
            Attribute att = new Attribute(attVals.getKey(), vals);
            atts.add(att);
        }

        FastVector fvWekaAttributes = new FastVector(atts.size());
        for (Attribute attr : atts) {
            fvWekaAttributes.addElement(attr);
        }

        Instances data = new Instances("Head", fvWekaAttributes, 10);
        data.setClass(data.attribute(classFeatureName));
        return data;
    }

    /**
     * Converts each example map into a Weka {@link Instance} and adds it to
     * {@code instances}. Nominal values not present in the schema are logged
     * and marked as missing.
     *
     * @param examples  feature maps to convert
     * @param instances dataset (with schema) receiving the new instances
     */
    public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
        for (TreeMap<String, Object> example : examples) {
            Instance instance = new Instance(instances.numAttributes());

            for (Entry<String, Object> e : example.entrySet()) {
                Object val = e.getValue();
                String name = e.getKey();
                if (val instanceof Integer) {
                    instance.setValue(instances.attribute(name), (int) val);
                } else if (val instanceof Boolean) {
                    instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
                } else {
                    int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
                    if (indexOfValue == -1) {
                        logger.debug("Unkown value: " + val.toString() + " of feature: " + name
                                + ". Marking as missing value.");
                        instance.setMissing(instances.attribute(name));
                    } else
                        instance.setValue(instances.attribute(name), indexOfValue);
                }
            }

            instance.setDataset(instances);
            instances.add(instance);
        }
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | +import weka.core.Instance; | ||
8 | +import weka.core.Instances; | ||
9 | + | ||
10 | +import java.io.Serializable; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.TreeMap; | ||
14 | + | ||
/**
 * Serializable bundle of a trained head-detection classifier together with the
 * attribute schema and the quasi-verb lemma list it was trained with.
 */
public class Model implements Serializable {

    private static final long serialVersionUID = 3351727361273283076L;
    private static final Logger logger = LoggerFactory.getLogger(Model.class);

    // Trained Weka classifier used to label head candidates.
    private Classifier classifier;
    // Quasi-verb lemmas used during training; exposed so prediction-time feature
    // generation matches training-time feature generation.
    private Set<String> quasiVerbs;
    // Dataset defining the attribute schema (header) the classifier expects.
    private Instances instances;

    /**
     * @param classifier trained classifier
     * @param instances  dataset whose attribute structure and class index match training
     * @param quasiVerbs quasi-verb lemma set used during training
     */
    public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
        this.classifier = classifier;
        this.instances = instances;
        this.quasiVerbs = quasiVerbs;
    }

    /**
     * Classifies a single candidate instance as head / not-head. Classification
     * errors are logged and reported as not-head.
     *
     * @param instance feature vector of the candidate token
     * @param sentence source sentence (used only for error logging)
     * @return true iff the classifier predicts the positive class
     */
    public boolean isHead(Instance instance, Sentence sentence) {
        try {
            double response = this.classifier.classifyInstance(instance);
            // Class attribute is nominal {false, true}: a class index > 0 means "true".
            return response > 0;
        } catch (Exception e) {
            logger.error("Error classyfing head in sentence: " + sentence, e);
            return false;
        }
    }

    /**
     * Creates a dataset with this model's schema filled with the given examples.
     *
     * @param examples feature maps to convert
     * @return Instances ready to be passed to {@link #isHead}
     */
    public Instances getInstances(List<TreeMap<String, Object>> examples) {
        Instances instances = new Instances(this.instances);
        InstanceCreator.fillInstances(examples, instances);
        return instances;
    }

    /** @return quasi-verb lemmas this model was trained with */
    public Set<String> getQuasiVerbs() {
        return quasiVerbs;
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import weka.core.SerializationHelper; | ||
4 | + | ||
5 | +import java.io.InputStream; | ||
6 | + | ||
7 | +public class Serializer { | ||
8 | + | ||
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | ||
10 | + SerializationHelper.write(targetModelFilePath, m); | ||
11 | + } | ||
12 | + | ||
13 | + public static Model loadModel(String path) throws Exception { | ||
14 | + Model m = (Model) SerializationHelper.read(path); | ||
15 | + return m; | ||
16 | + } | ||
17 | + | ||
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | ||
19 | + Model m = (Model) SerializationHelper.read(stream); | ||
20 | + return m; | ||
21 | + } | ||
22 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import weka.classifiers.Evaluation; | ||
6 | +import weka.classifiers.rules.JRip; | ||
7 | +import weka.classifiers.rules.JRip.RipperRule; | ||
8 | +import weka.core.Attribute; | ||
9 | +import weka.core.Instance; | ||
10 | +import weka.core.Instances; | ||
11 | + | ||
12 | +import java.io.*; | ||
13 | +import java.util.*; | ||
14 | + | ||
15 | +public class Trainer { | ||
16 | + | ||
17 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); | ||
18 | + | ||
19 | + private static final boolean DO_CV = false; | ||
20 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | ||
21 | + | ||
22 | + private Trainer() { | ||
23 | + } | ||
24 | + | ||
25 | + public static void main(String[] args) { | ||
26 | + | ||
27 | + if (args.length != 2) { | ||
28 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | ||
29 | + + " trainDir targetModelFile"); | ||
30 | + return; | ||
31 | + } | ||
32 | + | ||
33 | + File dataDir = new File(args[0]); | ||
34 | + String targetModelFilePath = args[1]; | ||
35 | + | ||
36 | + if (!dataDir.isDirectory()) { | ||
37 | + logger.error(dataDir + " is not a directory!"); | ||
38 | + return; | ||
39 | + } | ||
40 | + | ||
41 | + Set<String> quasiVerbs = loadQuasiVerbs(); | ||
42 | + | ||
43 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | ||
44 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | ||
45 | + InstanceCreator.fillInstances(examples, instances); | ||
46 | + | ||
47 | + printStats(instances); | ||
48 | + | ||
49 | + try { | ||
50 | + JRip model; | ||
51 | + | ||
52 | + if (DO_CV) { | ||
53 | + logger.info("Crossvalidation..."); | ||
54 | + model = new JRip(); | ||
55 | + Evaluation eval = new Evaluation(instances); | ||
56 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | ||
57 | + logger.info(eval.toSummaryString()); | ||
58 | + logger.info(eval.toMatrixString()); | ||
59 | + logger.info(eval.toClassDetailsString()); | ||
60 | + } | ||
61 | + | ||
62 | + logger.info("Building final classifier..."); | ||
63 | + model = new JRip(); | ||
64 | + model.buildClassifier(instances); | ||
65 | + logger.info(model.getRuleset().size() + " rules generated."); | ||
66 | + for (int i = 0; i < model.getRuleset().size(); i++) { | ||
67 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | ||
68 | + logger.info("\t" + v.toString(instances.classAttribute())); | ||
69 | + } | ||
70 | + | ||
71 | + instances.delete(); | ||
72 | + logger.info("Features stats:"); | ||
73 | + for (int i = 0; i < instances.numAttributes(); i++) { | ||
74 | + Attribute att = instances.attribute(i); | ||
75 | + logger.info(i + ".\t" + att.toString()); | ||
76 | + } | ||
77 | + | ||
78 | + logger.info("Saving classifier..."); | ||
79 | + Model m = new Model(model, instances, quasiVerbs); | ||
80 | + Serializer.saveModel(m, targetModelFilePath); | ||
81 | + logger.info("Done."); | ||
82 | + | ||
83 | + } catch (Exception e) { | ||
84 | + logger.error("Error: " + e); | ||
85 | + } | ||
86 | + } | ||
87 | + | ||
88 | + private static Set<String> loadQuasiVerbs() { | ||
89 | + Set<String> quasiVerbs = new HashSet<>(); | ||
90 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
91 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
92 | + String line; | ||
93 | + while ((line = br.readLine()) != null) { | ||
94 | + quasiVerbs.add(line.trim()); | ||
95 | + } | ||
96 | + } catch (IOException e) { | ||
97 | + logger.error(e.getLocalizedMessage(), e); | ||
98 | + } | ||
99 | + return quasiVerbs; | ||
100 | + } | ||
101 | + | ||
102 | + private static void printStats(Instances instances) { | ||
103 | + int positive = 0; | ||
104 | + int negative = 0; | ||
105 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
106 | + Instance inst = instances.instance(i); | ||
107 | + if (inst.classValue() > 0) | ||
108 | + negative++; | ||
109 | + else | ||
110 | + positive++; | ||
111 | + } | ||
112 | + logger.info(positive + " positive examples"); | ||
113 | + logger.info(negative + " negative examples"); | ||
114 | + logger.info((positive + negative) + " examples total"); | ||
115 | + logger.info((instances.numAttributes() - 1) + " attributes"); | ||
116 | + logger.info(instances.toSummaryString()); | ||
117 | + } | ||
118 | + | ||
119 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
4 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
8 | + | ||
9 | +import java.util.*; | ||
10 | + | ||
11 | + | ||
12 | +public class FeatureGeneration { | ||
    // Lemmas (coordinating conjunctions and weak punctuation) that split the
    // sentence into clauses — but only once the current clause already
    // contains a finite verb (see getClauses below).
    final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
            "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

    // Adversative/consecutive conjunctions — a second group of clause
    // splitters, likewise requiring a finite verb in the current clause.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
            "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
            "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

    // Tokens that always split a clause, whether or not a verb was seen.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList(new String[] { "?", "!" }));

    // Paired delimiters: opening lemma -> the closing lemma expected for it.
    final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }

    // Morphosyntactic ctags treated as nominal (nouns, pronouns, gerunds,
    // numerals).
    final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
            "ppron3", "ger", "num", "numcol" }));

    // Ctags of personal pronouns.
    final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

    // Ctags of finite verb forms.
    final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
            "praet", "winien" }));

    // Relative-pronoun lemmas ("jaki", "który" — Polish "which/what"),
    // presumably used when merging relative clauses — see getClauses.
    final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
            "który" }));
40 | + | ||
    /**
     * Fills {@code features} with classifier features describing whether
     * {@code candidate} should belong to the mention headed by {@code head}.
     *
     * @param features  output map, feature name -> value
     * @param valence   Walenty valence dictionaries (nouns' valence is used)
     * @param head      head token of the mention under construction
     * @param candidate token evaluated for inclusion in the mention
     * @param s         sentence containing both tokens
     * @param heads     all head tokens detected in the sentence
     */
    public static void generateFeatures(Map<String, Object> features, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            Token head, Token candidate, Sentence s, List<Token> heads) {

        //addTokenFeatures(features, "head", head, s);
        addTokenFeatures(features, "candidate", candidate, s);

        //features.put("sentLength", s.size()); // last one checked
        features.put("sameWord", sameWord(head, candidate, s));
        features.put("sameNE", sameNE(head, candidate, s));
        features.put("sameNG", sameNG(head, candidate, s));

        // Linear token distance between head and candidate.
        features.put("distance", Math.abs(head.getSentencePosition() - candidate.getSentencePosition()));
        //features.put("headIsFirst", Boolean.valueOf(head.compareTo(candidate) < 0));
        features.put("candidateIsFirst", Boolean.valueOf(head.compareTo(candidate) > 0));

        features.put("sameWalentyConstruction", sameWalentyConstruction(head, candidate, s, valence));
        features.put("sameToken", sameToken(head, candidate));

        // Whether the candidate token is itself a detected mention head.
        features.put("candidateIsAlsoHead", Boolean.valueOf(heads.contains(candidate)));
        features.put("isNextToCandidateColon", isNextColon(candidate, s));

        features.put("candidateStartsWithUpperOrth", Character.isUpperCase(candidate.getOrth().codePointAt(0)));
        features.put("candidateStartsWithUpperBase", Character.isUpperCase(candidate.getBase().codePointAt(0)));
        features.put("isDotNextToHead", isNextDot(head, s));
        features.put("closestNEDistance", closestNEDistance(head, candidate, s));
        features.put("headStartsWithUpperOrth", Character.isUpperCase(head.getOrth().codePointAt(0)));
        features.put("headStartsWithUpperBase", Character.isUpperCase(head.getBase().codePointAt(0))); // the optimal feature set ends here


        // candidate in head in closest NE distance

//        features.put("candidateOrthLength", candidate.getOrth().length());
//        features.put("candidateBaseLength", candidate.getBase().length());
//        features.put("headOrthLength", head.getOrth().length());
//        features.put("headBaseLength", head.getBase().length());

        //features.put("isNextToHeadColon", isNextColon(head, s));
        //features.put("isCandidateColon", Boolean.valueOf(candidate.getOrth().equals(":"))); // just needs a run, not checked yet

/*        features.put("isClauseSplitLemmaStrict", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(candidate.getBase())));
        features.put("isClauseSplitLemma", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS.contains(candidate.getBase())));
        features.put("isClauseSplitLemma2", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(candidate.getBase())));*/

/*        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put("nextIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(next.getBase())));
        } else {
            features.put("nextIsClauseSplitLemmaStrict", "sentEnd");
            features.put("nextIsClauseSplitLemma", "sentEnd");
            features.put("nextIsClauseSplitLemma2", "sentEnd");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put("previousIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(previous.getBase())));
        } else {
            features.put("previousIsClauseSplitLemmaStrict", "sentStart");
            features.put("previousIsClauseSplitLemma", "sentStart");
            features.put("previousIsClauseSplitLemma2", "sentStart");
        }*/


        //features.put("candidateIsClosingBracket", candidateIsClosingBracket(head, candidate, s));
        //features.put("candidateIsQM", candidateIsClosingQM(head, candidate, s));
        //features.put("candidateIsClosingBracket", Boolean.valueOf(candidate.getOrth().equals(")")));

        // TODO (translated from Polish): head position within the mention could
        // be simulated — something is not working; right-hand continuity
        // probably needs re-checking; add head-NG-group length and
        // Walenty-construction group length — may combine well with distance;
        // add is-stop-word for the candidate, and maybe reuse solutions from
        // head detection; also check whether the preceding token is part of the
        // mention; experiment with the separators too; word ctag!!
/*
        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put(String.format("%sCtag", "nextToCandidate"), next.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "nextToCandidate"), next.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "nextToCandidate"), next.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "nextToCandidate"), next.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "nextToCandidate"), "null");
            features.put(String.format("%sNumber", "nextToCandidate"), "null");
            features.put(String.format("%sGender", "nextToCandidate"), "null");
            features.put(String.format("%sPerson", "nextToCandidate"), "null");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put(String.format("%sCtag", "previousToCandidate"), previous.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "previousToCandidate"), previous.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "previousToCandidate"), previous.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "previousToCandidate"), previous.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "previousToCandidate"), "null");
            features.put(String.format("%sNumber", "previousToCandidate"), "null");
            features.put(String.format("%sGender", "previousToCandidate"), "null");
            features.put(String.format("%sPerson", "previousToCandidate"), "null");
        }
        */


    }
148 | + | ||
149 | + private static int closestNEDistance(Token head, Token candidate, Sentence s) { | ||
150 | + int lowestDistance = -1; | ||
151 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
152 | + int distance = ne.getTokens().get(0).getSentencePosition() - head.getSentencePosition(); | ||
153 | + if ( distance >= 0 && ne.getTokens().contains(candidate) && (distance < lowestDistance || lowestDistance < 0)) { | ||
154 | + lowestDistance = distance; | ||
155 | + } | ||
156 | + } | ||
157 | + return lowestDistance; | ||
158 | + } | ||
159 | + | ||
160 | + ///////////////////////////// | ||
161 | + | ||
162 | +/* private static boolean candidateIsClosingBracket(Token head, Token candidate, Sentence s) { | ||
163 | + | ||
164 | + | ||
165 | + | ||
166 | + if (!candidate.getOrth().equals(")")) { | ||
167 | + return Boolean.valueOf(false); | ||
168 | + } | ||
169 | + | ||
170 | + int openedBrackets = 0; | ||
171 | + int closedBrackets = 0; | ||
172 | + for (Token t : s) { | ||
173 | + if (candidate.getSentencePosition() == t.getSentencePosition()) { | ||
174 | + break; | ||
175 | + } | ||
176 | + | ||
177 | + if (t.getSentencePosition() >= head.getSentencePosition()) { | ||
178 | + if (t.getOrth().equals("(")) | ||
179 | + openedBrackets++; | ||
180 | + if (t.getOrth().equals(")")) | ||
181 | + closedBrackets++; | ||
182 | + } | ||
183 | + } | ||
184 | + | ||
185 | + if (openedBrackets - closedBrackets > 0) { | ||
186 | + return Boolean.valueOf(true); | ||
187 | + } | ||
188 | + | ||
189 | + return Boolean.valueOf(false); | ||
190 | + }*/ | ||
191 | + | ||
192 | + private static boolean isNextColon(Token t, Sentence s) { | ||
193 | + int idx = s.indexOf(t) + 1; | ||
194 | + if (idx >= s.size() || idx < 0) | ||
195 | + return Boolean.valueOf(false); | ||
196 | + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); | ||
197 | + } | ||
198 | + | ||
199 | + private static boolean isNextDot(Token t, Sentence s) { | ||
200 | + int idx = s.indexOf(t) + 1; | ||
201 | + if (idx >= s.size() || idx < 0) | ||
202 | + return Boolean.valueOf(false); | ||
203 | + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); | ||
204 | + } | ||
205 | + | ||
206 | + private static boolean candidateIsClosingQM(Token head, Token candidate, Sentence s) { | ||
207 | + | ||
208 | + if (!candidate.getOrth().equals("\"")) { | ||
209 | + return Boolean.valueOf(false); | ||
210 | + } | ||
211 | + | ||
212 | + int start = head.getSentencePosition(); | ||
213 | + int end = candidate.getSentencePosition() - 1; | ||
214 | + if (head.compareTo(candidate) > 0) { | ||
215 | + start = candidate.getSentencePosition() + 1; | ||
216 | + end = head.getSentencePosition(); | ||
217 | + } | ||
218 | + | ||
219 | + int QMs = 0; | ||
220 | + for (Token t : s) { | ||
221 | + if (end == t.getSentencePosition()) { | ||
222 | + break; | ||
223 | + } | ||
224 | + | ||
225 | + if (t.getSentencePosition() >= start) { | ||
226 | + if (t.getOrth().equals("\"")) | ||
227 | + QMs++; | ||
228 | + } | ||
229 | + } | ||
230 | + | ||
231 | + if ((QMs % 2) != 0) { | ||
232 | + return Boolean.valueOf(true); | ||
233 | + } | ||
234 | + | ||
235 | + return Boolean.valueOf(false); | ||
236 | + } | ||
237 | + | ||
238 | + private static boolean sameWord(Token t1, Token t2, Sentence s) { | ||
239 | + | ||
240 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
241 | + if (w.getTokens().contains(t1) && w.getTokens().contains(t2)) { | ||
242 | + return Boolean.valueOf(true); | ||
243 | + } | ||
244 | + } | ||
245 | + return Boolean.valueOf(false); | ||
246 | + } | ||
247 | + | ||
248 | + private static boolean sameNE(Token t1, Token t2, Sentence s) { | ||
249 | + | ||
250 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
251 | + if (ne.getTokens().contains(t1) && ne.getTokens().contains(t2)) { | ||
252 | + return Boolean.valueOf(true); | ||
253 | + } | ||
254 | + } | ||
255 | + return Boolean.valueOf(false); | ||
256 | + } | ||
257 | + | ||
258 | + private static boolean sameNG(Token head, Token candidate, Sentence s) { | ||
259 | + | ||
260 | + for (SyntacticGroup group : s.getGroups()) { | ||
261 | + if (group.getType().startsWith("NG")) { | ||
262 | + if (group.getSemanticHeadTokens().contains(head) && group.getTokens().contains(candidate)) { | ||
263 | + return Boolean.valueOf(true); | ||
264 | + } | ||
265 | + } | ||
266 | + } | ||
267 | + return Boolean.valueOf(false); | ||
268 | + } | ||
269 | + | ||
270 | + private static boolean sameWalentyConstruction(Token head, Token candidate, Sentence s, | ||
271 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | ||
272 | + | ||
273 | + for (SyntacticGroup group : s.getGroups()) { | ||
274 | + if (group.getType().startsWith("NG")) { | ||
275 | + ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>(); | ||
276 | + nestedGroups.add(group); | ||
277 | + | ||
278 | + SyntacticGroup nextGroup = group.getFollowingGroup(); | ||
279 | + while (nextGroup != null) { | ||
280 | + nestedGroups.add(nextGroup); | ||
281 | + nextGroup = nextGroup.getFollowingGroup(); | ||
282 | + } | ||
283 | + | ||
284 | + List<Token> extendedGroupSegments = getExtendedGroupSegments(nestedGroups, valence.get(ValenceDicts.NounsValence)); | ||
285 | + List<Token> extendedGroupHeads = getExtendedGroupHeads(nestedGroups); | ||
286 | + if (extendedGroupHeads.contains(head) && extendedGroupSegments.contains(candidate)) | ||
287 | + return Boolean.valueOf(true); | ||
288 | + } | ||
289 | + } | ||
290 | + return Boolean.valueOf(false); | ||
291 | + } | ||
292 | + | ||
293 | + private static List<Token> getExtendedGroupSegments(ArrayList<SyntacticGroup> nestedGroups, | ||
294 | + Map<String,ArrayList<String>> walentyNouns) { | ||
295 | + | ||
296 | + SyntacticGroup initialGroup = nestedGroups.get(0); | ||
297 | + String initialGroupHead = initialGroup.getSemanticHeadTokens().get(0).getBase(); | ||
298 | + | ||
299 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | ||
300 | + List<Token> segments = new ArrayList<Token>(); | ||
301 | + | ||
302 | + if (!walentyNouns.containsKey(initialGroupHead)) { | ||
303 | + segments.addAll(initialGroup.getTokens()); | ||
304 | + } else { | ||
305 | + | ||
306 | + ArrayList<String> schemata = walentyNouns.get(initialGroupHead); | ||
307 | + ArrayList<ArrayList<String>> groupsRealizations = new ArrayList<ArrayList<String>>(); | ||
308 | + ArrayList<SyntacticGroup> largestMatch = new ArrayList<SyntacticGroup>(); | ||
309 | + largestMatch.add(initialGroup); | ||
310 | + | ||
311 | + for (int i=1; i < nestedGroups.size(); i++) { | ||
312 | + SyntacticGroup group = nestedGroups.get(i); | ||
313 | + ArrayList<String> realizations = group.getWalentyRealizations(); | ||
314 | + groupsRealizations.add(realizations); | ||
315 | + if (realizationsMatch(schemata, groupsRealizations)) { | ||
316 | + largestMatch.add(group); | ||
317 | + } else { | ||
318 | + break; | ||
319 | + } | ||
320 | + } | ||
321 | + | ||
322 | + for (SyntacticGroup group : largestMatch) { | ||
323 | + segments.addAll(group.getTokens()); | ||
324 | + } | ||
325 | + | ||
326 | + } | ||
327 | + return segments; | ||
328 | + } | ||
329 | + | ||
330 | + private static List<Token> getExtendedGroupHeads(ArrayList<SyntacticGroup> nestedGroups) { | ||
331 | + | ||
332 | + SyntacticGroup initialGroup = nestedGroups.get(0); | ||
333 | + | ||
334 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | ||
335 | + | ||
336 | + return heads; | ||
337 | + } | ||
338 | + | ||
339 | + private static boolean realizationsMatch(ArrayList<String> schemata, | ||
340 | + ArrayList<ArrayList<String>> groupsRealizations) { | ||
341 | + for (String schema : schemata) { | ||
342 | + if (isProperSchema(schema, groupsRealizations)) { | ||
343 | + return true; | ||
344 | + } | ||
345 | + } | ||
346 | + return false; | ||
347 | + } | ||
348 | + | ||
349 | + private static boolean isProperSchema(String schema, | ||
350 | + ArrayList<ArrayList<String>> groupsRealizations) { | ||
351 | + | ||
352 | + ArrayList<ArrayList<String>> matchingPositions = new ArrayList<ArrayList<String>>(); | ||
353 | + for (ArrayList<String> realizations : groupsRealizations) { | ||
354 | + matchingPositions.add(getMatchingPositions(schema, realizations)); | ||
355 | + } | ||
356 | + | ||
357 | + if (matchingPositionsExists(matchingPositions)) { | ||
358 | + return true; | ||
359 | + /*ArrayList<ArrayList<String>> product = cartesianProduct(matchingPositions); | ||
360 | + for (ArrayList<String> combination : product) { | ||
361 | + Set<String> combinationSet = new HashSet<String>(combination); | ||
362 | + if (combinationSet.size() == matchingPositions.size()) { | ||
363 | + return true; | ||
364 | + } | ||
365 | + }*/ | ||
366 | + } | ||
367 | + return false; | ||
368 | + } | ||
369 | + | ||
370 | + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { | ||
371 | + ArrayList<String> positions = new ArrayList<String>(); | ||
372 | + for (String position : schema.split("\\s\\+\\s")) { | ||
373 | + position = position.trim(); | ||
374 | + position = position.substring(1, position.length()-1); | ||
375 | + for (String phrT : position.split(";")) { | ||
376 | + if (phraseRealizations.contains(phrT.trim())) { | ||
377 | + positions.add(position); | ||
378 | + break; | ||
379 | + } | ||
380 | + } | ||
381 | + } | ||
382 | + return positions; | ||
383 | + } | ||
384 | + | ||
385 | + private static boolean matchingPositionsExists(ArrayList<ArrayList<String>> matchingPositions) { | ||
386 | + for (ArrayList<String> positions : matchingPositions) { | ||
387 | + if (positions.isEmpty()) { | ||
388 | + return false; | ||
389 | + } | ||
390 | + } | ||
391 | + return true; | ||
392 | + } | ||
393 | + | ||
394 | + private static boolean sameToken(Token t1, Token t2) { | ||
395 | + if (t1.compareTo(t2) == 0) { | ||
396 | + return Boolean.valueOf(true); | ||
397 | + } | ||
398 | + return Boolean.valueOf(false); | ||
399 | + } | ||
400 | + ////////////////////////////////// | ||
401 | + | ||
    /**
     * Adds morphosyntactic features of token {@code t} to the map under the
     * given {@code label} prefix: the token's ctag/number/gender/person, the
     * ctag of the syntactic word containing it, and the (word-)ctags of its
     * direct neighbours ("None" at sentence boundaries).
     */
    private static void addTokenFeatures(Map<String, Object> features, String label, Token t, Sentence s) {
        features.put(String.format("%sCtag", label), t.getChosenInterpretation().getCtag());
        features.put(String.format("%sNumber", label), t.getChosenInterpretation().getNumber());
        features.put(String.format("%sGender", label), t.getChosenInterpretation().getGender());
        features.put(String.format("%sPerson", label), t.getChosenInterpretation().getPerson());
        features.put(String.format("%sWordCtag", label), wordCtag(t, s));

        features.put(String.format("%sNextCtag", label), getNeighbouringTag(s, t, 1));
        features.put(String.format("%sPrevCtag", label), getNeighbouringTag(s, t, -1));


        Token next = getNeighbouringToken(s, t, 1);
        if (next != null) {
            features.put(String.format("%sNextWordCtag", label), wordCtag(next, s));
        } else {
            features.put(String.format("%sNextWordCtag", label), "None");
        }

        Token previous = getNeighbouringToken(s, t, -1);
        if (previous != null) {
            features.put(String.format("%sPrevWordCtag", label), wordCtag(previous, s));
        } else {
            features.put(String.format("%sPrevWordCtag", label), "None");
        }

//        features.put(String.format("%sNextNextCtag", label), getNeighbouringTag(s, t, 2));
//        features.put(String.format("%sPrevPrevCtag", label), getNeighbouringTag(s, t, -2));

//        features.put(String.format("%sSentPosition", label), t.getSentencePosition());


//        features.put(String.format("%sPrevPraet", label), isPrevPraet(t, s));
//        features.put(String.format("%sPrevComma", label), isPrevComma(t, s));
//        features.put(String.format("%sPrev2Pred", label), isPrev2Pred(t, s));
//        features.put(String.format("%sNextInf", label), isNextInf(t, s));

/*        List<Token> clause = getClause(s, t);
        if (clause != null)
            features.put(String.format("%sClauseLength", label), clause.size());
        else
            features.put(String.format("%sClauseLength", label), 0);*/

        /*addFeatures(features, clause, String.format("%sClause", label), t);
        addFeatures(features, s, String.format("%sSent", label), t);*/
//        for (int i = 1; i < 6; i++) // do this, but in a window from head to candidate
//            addFeatures(features, getWindow(s, t, i, 0), String.format("%sWindow_", label) + i + "_" + 0, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, 0, i), String.format("%sWindow_", label) + 0 + "_" + i, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, i, i), String.format("%sWindow_", label) + i + "_" + i, t);
    }
453 | + | ||
454 | + private static String wordCtag(Token t, Sentence s) { | ||
455 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
456 | + if (w.getTokens().contains(t)) { | ||
457 | + return w.getCtag(); | ||
458 | + } | ||
459 | + } | ||
460 | + return "None"; | ||
461 | + } | ||
462 | + | ||
463 | + private static boolean isNextInf(Token m, Sentence s) { | ||
464 | + boolean now = false; | ||
465 | + for (Token morph : s) { | ||
466 | + if (now) | ||
467 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | ||
468 | + if (m.equals(morph)) | ||
469 | + now = true; | ||
470 | + } | ||
471 | + return false; | ||
472 | + } | ||
473 | + | ||
474 | + private static boolean isPrev2Pred(Token m, Sentence s) { | ||
475 | + Token prev = null; | ||
476 | + Token prev2 = null; | ||
477 | + for (Token morph : s) { | ||
478 | + if (m.equals(morph)) | ||
479 | + break; | ||
480 | + prev2 = prev; | ||
481 | + prev = morph; | ||
482 | + } | ||
483 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | ||
484 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | ||
485 | + } | ||
486 | + | ||
487 | + private static Object isPrevComma(Token m, Sentence s) { | ||
488 | + Token prev = null; | ||
489 | + for (Token morph : s) { | ||
490 | + if (m.equals(morph)) | ||
491 | + break; | ||
492 | + prev = morph; | ||
493 | + } | ||
494 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | ||
495 | + } | ||
496 | + | ||
497 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | ||
498 | + int idx = s.indexOf(m) + i; | ||
499 | + if (idx >= s.size() || idx < 0) | ||
500 | + return "None"; | ||
501 | + return s.get(idx).getChosenInterpretation().getCtag(); | ||
502 | + } | ||
503 | + | ||
504 | + private static Token getNeighbouringToken(Sentence s, Token m, int i) { | ||
505 | + int idx = s.indexOf(m) + i; | ||
506 | + if (idx >= s.size() || idx < 0) | ||
507 | + return null; | ||
508 | + return s.get(idx); | ||
509 | + } | ||
510 | + | ||
    /**
     * Adds agreement features computed over the noun tokens of {@code clause}
     * relative to token {@code m}: presence of nominative nouns, nouns with
     * agreeing number, nouns with agreeing gender-or-person, their
     * combinations, and "two or more" variants. Tokens whose predecessor is
     * "jak"/"jako" are skipped.
     * NOTE(review): relies on helpers isNoun/isJakJako/isNom/agreedNum/
     * agreedGenderOrPerson defined elsewhere in this class (not visible here)
     * — semantics inferred from their names, verify against their bodies.
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1
        boolean hasNum = false; // 2
        boolean hasPOG = false; // 3

        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // Only nouns count; skip tokens preceded by "jak"/"jako".
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }
584 | + | ||
585 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | ||
586 | + | ||
587 | + int idx = s.indexOf(m); | ||
588 | + int from = Math.max(0, idx - pre); | ||
589 | + int to = Math.min(s.size(), idx + post + 1); | ||
590 | + | ||
591 | + return new ArrayList<>(s.subList(from, to)); | ||
592 | + } | ||
593 | + | ||
594 | + private static boolean isPrevPraet(Token m, Sentence s) { | ||
595 | + Token prev = null; | ||
596 | + for (Token morph : s) { | ||
597 | + if (m.equals(morph)) | ||
598 | + break; | ||
599 | + prev = morph; | ||
600 | + } | ||
601 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | ||
602 | + } | ||
603 | + | ||
604 | + /** | ||
605 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | ||
606 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | ||
607 | + * osobowej), | ||
608 | + * | ||
609 | + * @param s | ||
610 | + * sentence | ||
611 | + * @param m2 | ||
612 | + * token | ||
613 | + * @return clause with the token | ||
614 | + */ | ||
615 | + public static List<Token> getClause(Sentence s, Token m2) { | ||
616 | + | ||
617 | + List<List<Token>> sublists = getClauses(s); | ||
618 | + | ||
619 | + for (List<Token> sub : sublists) | ||
620 | + for (Token m : sub) | ||
621 | + if (m.equals(m2)) | ||
622 | + return sub; | ||
623 | + | ||
624 | + return null; | ||
625 | + } | ||
626 | + | ||
/**
 * Splits a sentence into clauses. A split happens at strict delimiters
 * ("?", "!") always, and at weak delimiters (conjunctions / punctuation in
 * CLAUSE_SPLIT_LEMMAS / CLAUSE_SPLIT_LEMMAS2) only once the current clause
 * already contains a verb. Tokens inside syntactic groups/words are never
 * split points. Afterwards, a clause whose first two tokens include a
 * relative pronoun (ZAIMKI_WZGLEDNE_LEMMAS) is merged back into the
 * preceding clause.
 *
 * @param s sentence to segment
 * @return list of clauses, each a list of tokens in sentence order
 */
public static List<List<Token>> getClauses(Sentence s) {

    // Tokens that are NOT the last token of a syntactic group/word must not
    // end a clause, so they are excluded as split points.
    Set<Token> noSplitMorphs = new HashSet<>();
    for (SyntacticGroup g : s.getGroups()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }
    for (SyntacticWord g : s.getSyntacticWords()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }

    LinkedList<List<Token>> sublists = new LinkedList<>();
    List<Token> currentSublist = new ArrayList<>();
    boolean clauseHasVerb = false;
    for (Token m : s) {
        String base = m.getChosenInterpretation().getBase();
        // Strict delimiters always split; weak ones only if a verb was seen.
        if (!noSplitMorphs.contains(m)
                && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
                        .contains(base)) && clauseHasVerb))) {
            sublists.add(currentSublist);
            currentSublist = new ArrayList<>();
            clauseHasVerb = false;
        } else {
            if (isVerb(m))
                clauseHasVerb = true;
        }
        // The delimiter token itself starts the next clause.
        currentSublist.add(m);
    }
    // Trailing tokens: keep as a clause only if they contain a verb,
    // otherwise append them to the last clause.
    // NOTE(review): when sublists is empty AND the tail has no verb, the
    // tail is silently dropped — confirm this is intended.
    if (currentSublist.size() > 0) {
        if (clauseHasVerb)
            sublists.add(currentSublist);
        else if (!sublists.isEmpty())
            sublists.getLast().addAll(currentSublist);
    }

    // merge clause beginning with zaimek wzgl. etc to previous clause
    List<Token> prev = null;
    Iterator<List<Token>> it = sublists.iterator();
    while (it.hasNext()) {
        List<Token> sublist = it.next();
        boolean containsRelPron = false;
        int i = 1;
        // Only the first two tokens of the clause are inspected.
        for (Token m : sublist) {
            if (i > 2)
                break;
            if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
                containsRelPron = true;
                break;
            }
            i++;
        }
        if (prev != null && containsRelPron) {
            prev.addAll(sublist);
            it.remove();
        } else
            prev = sublist;
    }

    return sublists;
}
690 | + | ||
691 | + private static boolean agreedNum(Token candidate, Token keyword) { | ||
692 | + String keywordNum = keyword.getNumber(); | ||
693 | + String wordNum = candidate.getNumber(); | ||
694 | + return keywordNum.equals(wordNum); | ||
695 | + } | ||
696 | + | ||
697 | + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) { | ||
698 | + if (isPraet(keyword)) { | ||
699 | + // praet has number:gender | ||
700 | + String keywordGender = keyword.getGender(); | ||
701 | + String wordGender = candidate.getGender(); | ||
702 | + return keywordGender.equals(wordGender); | ||
703 | + } else { | ||
704 | + // other verbs have number:person | ||
705 | + String keywordPerson = keyword.getPerson(); | ||
706 | + String wordPerson = "ter"; // default | ||
707 | + if (PRONOUN_TAGS.contains(candidate.getCtag())) | ||
708 | + wordPerson = candidate.getPerson(); | ||
709 | + return wordPerson.equals(keywordPerson); | ||
710 | + } | ||
711 | + } | ||
712 | + | ||
713 | + private static boolean isJakJako(Token prev) { | ||
714 | + String base = prev == null ? null : prev.getBase(); | ||
715 | + return prev != null && (base.equals("jak") || base.equals("jako")); | ||
716 | + } | ||
717 | + | ||
718 | + private static boolean isPraet(Token keyword) { | ||
719 | + return keyword.getCtag().equals("praet"); | ||
720 | + } | ||
721 | + | ||
722 | + private static boolean isNom(Token candidate) { | ||
723 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | ||
724 | + // tylko! | ||
725 | + } | ||
726 | + | ||
727 | + public static boolean isNoun(Token m) { | ||
728 | + return NOUN_TAGS.contains(m.getCtag()); | ||
729 | + } | ||
730 | + | ||
731 | + public static boolean isNoun(Mention m) { | ||
732 | + return NOUN_TAGS.contains(m.getHeadSegments().get(0).getCtag()); | ||
733 | + } | ||
734 | + | ||
735 | + public static boolean isVerb(Token morph) { | ||
736 | + return VERB_TAGS.contains(morph.getCtag()); | ||
737 | + } | ||
738 | + | ||
739 | + public static boolean isVerb(Mention m) { | ||
740 | + boolean hasOnlyVerbs = true; | ||
741 | + for (Token morph : m.getSegments()) | ||
742 | + if (!isVerb(morph)) { | ||
743 | + hasOnlyVerbs = false; | ||
744 | + break; | ||
745 | + } | ||
746 | + return hasOnlyVerbs; | ||
747 | + } | ||
748 | + | ||
749 | + public static boolean isVerb(TEIMention m) { | ||
750 | + boolean hasOnlyVerbs = true; | ||
751 | + for (TEIMorph morph : m.getMorphs()) | ||
752 | + if (!isVerb(morph)) { | ||
753 | + hasOnlyVerbs = false; | ||
754 | + break; | ||
755 | + } | ||
756 | + return hasOnlyVerbs; | ||
757 | + } | ||
758 | + | ||
759 | + private static boolean isVerb(TEIMorph morph) { | ||
760 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | ||
761 | + } | ||
762 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | + | ||
6 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
7 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
8 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | ||
9 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
10 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | ||
11 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
12 | +import weka.core.Attribute; | ||
13 | +import weka.core.FastVector; | ||
14 | +import weka.core.Instance; | ||
15 | +import weka.core.Instances; | ||
16 | + | ||
17 | +import java.io.File; | ||
18 | +import java.util.*; | ||
19 | +import java.util.Map.Entry; | ||
20 | + | ||
21 | +public class InstanceCreator { | ||
22 | + | ||
23 | + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); | ||
24 | + private static final TEI_IO teiIO = TEI_IO.getInstance(); | ||
25 | + | ||
26 | + private InstanceCreator() { | ||
27 | + } | ||
28 | + | ||
29 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs, | ||
30 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | ||
31 | + int allTexts = 0; | ||
32 | + int exceptions = 0; | ||
33 | + int allSentences = 0; | ||
34 | + | ||
35 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
36 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | ||
37 | + try { | ||
38 | + allTexts++; | ||
39 | + logger.info("Processing text " + textDir); | ||
40 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | ||
41 | + Text text = TeiLoader.loadTextFromTei(ct, textDir); | ||
42 | + | ||
43 | + for (Paragraph p : text) | ||
44 | + for (Sentence s : p) { | ||
45 | + allSentences++; | ||
46 | + loadExamplesFromSentence(quasiVerbs, valence, examples, s); | ||
47 | + } | ||
48 | + | ||
49 | + } catch (Exception e) { | ||
50 | + //logger.error(e.getLocalizedMessage()); | ||
51 | + e.printStackTrace(); | ||
52 | + exceptions++; | ||
53 | + } | ||
54 | + } | ||
55 | + | ||
56 | + logger.info(allTexts + " texts found."); | ||
57 | + if (exceptions != 0) | ||
58 | + logger.error(exceptions + " texts with exceptions."); | ||
59 | + logger.info(allSentences + " sentences found."); | ||
60 | + | ||
61 | + return examples; | ||
62 | + } | ||
63 | + | ||
64 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
65 | + List<TreeMap<String, Object>> examples, Sentence s) { | ||
66 | + | ||
67 | + | ||
68 | + ArrayList<Token> heads = new ArrayList<>(); | ||
69 | + for (Mention m : s.getMentions()) { | ||
70 | + heads.addAll(m.getHeadSegments()); | ||
71 | + } | ||
72 | + | ||
73 | + // collect positive examples | ||
74 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | ||
75 | + for (Mention m : s.getMentions()) { | ||
76 | + if (heads.containsAll(m.getHeadSegments())) { | ||
77 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | ||
78 | + } | ||
79 | + } | ||
80 | + | ||
81 | + for (Token head : s) { | ||
82 | + if (heads.contains(head)) { | ||
83 | + for (Token t : s) { | ||
84 | + //if (head.compareTo(t) != 0) {// && Math.abs(head.getSentencePosition() - t.getSentencePosition()) <= window) { | ||
85 | + TreeMap<String, Object> features = new TreeMap<>(); | ||
86 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | ||
87 | + features.put("class", Boolean.valueOf(true)); | ||
88 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
89 | + | ||
90 | + } else { | ||
91 | + features.put("class", Boolean.valueOf(false)); | ||
92 | + //features.put("candidatePositionInMention", 0); | ||
93 | + } | ||
94 | + | ||
95 | + | ||
96 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | ||
97 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
98 | + addPreviousStates(features, head, t, s); | ||
99 | + | ||
100 | + examples.add(features); | ||
101 | + // } | ||
102 | + } | ||
103 | + } | ||
104 | + } | ||
105 | + } | ||
106 | + | ||
107 | + public static void addPreviousStates(Map<String, Object> features, Token head, Token candidate, Sentence s) { | ||
108 | + int context = 1; | ||
109 | + int candidateLocation = candidate.getSentencePosition(); | ||
110 | + for (int i = 1; i <= context; i++) { | ||
111 | + if (candidateLocation - i < 0) { | ||
112 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | ||
113 | + } else if (sameMention(s.get(candidateLocation - i), head, s) ) { | ||
114 | + features.put(String.format("location-%d", i), Boolean.valueOf(true)); | ||
115 | + } else { | ||
116 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | ||
117 | + } | ||
118 | + } | ||
119 | + } | ||
120 | + | ||
121 | + public static int positionInMention(Token head, Token t, Sentence s) { | ||
122 | + | ||
123 | + Token previous = null; | ||
124 | + if (t.getSentencePosition()-1 >= 0) { | ||
125 | + previous = s.get(t.getSentencePosition()-1); | ||
126 | + } else { | ||
127 | + return 0; | ||
128 | + } | ||
129 | + | ||
130 | + for (Mention m : s.getMentions()) { | ||
131 | + if (m.getHeadSegments().contains(head) && m.getSegments().contains(previous)) { | ||
132 | +/* if (m.getSegments().get(0).getSentencePosition() - t.getSentencePosition() <= -1) { | ||
133 | + System.out.println(m.getSegments().get(0)); | ||
134 | + System.out.println(t); | ||
135 | + System.out.println(m.getSegments()); | ||
136 | + }*/ | ||
137 | + return previous.getSentencePosition() - m.getSegments().get(0).getSentencePosition(); | ||
138 | + } | ||
139 | + } | ||
140 | + return 0; | ||
141 | + } | ||
142 | + | ||
143 | + private static boolean sameMention(Token t1, Token t2, Sentence s) { | ||
144 | + for (Mention m : s.getMentions()) { | ||
145 | + if (m.getSegments().contains(t1) && m.getSegments().contains(t2)) { | ||
146 | + return true; | ||
147 | + } | ||
148 | + } | ||
149 | + return false; | ||
150 | + } | ||
151 | + | ||
152 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
153 | + List<TreeMap<String, Object>> examples, Sentence s, List<Token> heads) { | ||
154 | + | ||
155 | + | ||
156 | + if (heads == null || heads.isEmpty()) | ||
157 | + return; | ||
158 | + | ||
159 | + // collect positive examples | ||
160 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | ||
161 | + for (Mention m : s.getMentions()) { | ||
162 | + if (heads.containsAll(m.getHeadSegments())) { | ||
163 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | ||
164 | + } | ||
165 | + } | ||
166 | + | ||
167 | + for (Token head : s) { | ||
168 | + if (heads.contains(head)) { | ||
169 | + for (Token t : s) { | ||
170 | + TreeMap<String, Object> features = new TreeMap<>(); | ||
171 | + | ||
172 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | ||
173 | + features.put("class", Boolean.valueOf(true)); | ||
174 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
175 | + | ||
176 | + } else { | ||
177 | + features.put("class", Boolean.valueOf(false)); | ||
178 | + //features.put("candidatePositionInMention", 0); | ||
179 | + } | ||
180 | + | ||
181 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | ||
182 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
183 | + addPreviousStates(features, head, t, s); | ||
184 | + examples.add(features); | ||
185 | + } | ||
186 | + } | ||
187 | + } | ||
188 | + } | ||
189 | + | ||
190 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | ||
191 | + | ||
192 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | ||
193 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | ||
194 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | ||
195 | + for (TreeMap<String, Object> example : examples) { | ||
196 | + for (Entry<String, Object> e : example.entrySet()) { | ||
197 | + String key = e.getKey(); | ||
198 | + Object val = e.getValue(); | ||
199 | + if (val instanceof Integer || val instanceof Double) { | ||
200 | + doubleAttsOccurred.add(key); | ||
201 | + continue; | ||
202 | + } | ||
203 | + if (val instanceof Boolean) { | ||
204 | + booleanAttsOccurred.add(key); | ||
205 | + continue; | ||
206 | + } | ||
207 | + if (!att2values.containsKey(key)) | ||
208 | + att2values.put(key, new HashSet<>()); | ||
209 | + att2values.get(key).add(val.toString()); | ||
210 | + } | ||
211 | + } | ||
212 | + | ||
213 | + List<Attribute> atts = new ArrayList<>(); | ||
214 | + | ||
215 | + // double attributes | ||
216 | + for (String attName : doubleAttsOccurred) { | ||
217 | + Attribute att = new Attribute(attName); | ||
218 | + atts.add(att); | ||
219 | + } | ||
220 | + | ||
221 | + // boolean attributes (treated as nominal) | ||
222 | + FastVector values = new FastVector(2); | ||
223 | + values.addElement("false"); | ||
224 | + values.addElement("true"); | ||
225 | + for (String attName : booleanAttsOccurred) { | ||
226 | + Attribute att = new Attribute(attName, values); | ||
227 | + atts.add(att); | ||
228 | + } | ||
229 | + | ||
230 | + // nominal attributes | ||
231 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | ||
232 | + FastVector vals = new FastVector(attVals.getValue().size()); | ||
233 | + for (String val : attVals.getValue()) | ||
234 | + vals.addElement(val); | ||
235 | + Attribute att = new Attribute(attVals.getKey(), vals); | ||
236 | + atts.add(att); | ||
237 | + } | ||
238 | + | ||
239 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | ||
240 | + for (Attribute attr : atts) { | ||
241 | + fvWekaAttributes.addElement(attr); | ||
242 | + } | ||
243 | + | ||
244 | + Instances data = new Instances("Nominal", fvWekaAttributes, 10); | ||
245 | + data.setClass(data.attribute(classFeatureName)); | ||
246 | + return data; | ||
247 | + } | ||
248 | + | ||
249 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | ||
250 | + for (TreeMap<String, Object> example : examples) { | ||
251 | + addInstance(example, instances); | ||
252 | + } | ||
253 | + } | ||
254 | + | ||
255 | + public static void addInstance(TreeMap<String, Object> example, Instances instances) { | ||
256 | + Instance instance = new Instance(instances.numAttributes()); | ||
257 | + | ||
258 | + for (Entry<String, Object> e : example.entrySet()) { | ||
259 | + Object val = e.getValue(); | ||
260 | + String name = e.getKey(); | ||
261 | + if (val instanceof Integer) { | ||
262 | + instance.setValue(instances.attribute(name), (int) val); | ||
263 | + } else if (val instanceof Boolean) { | ||
264 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | ||
265 | + } else { | ||
266 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | ||
267 | + if (indexOfValue == -1) { | ||
268 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | ||
269 | + + ". Marking as missing value."); | ||
270 | + instance.setMissing(instances.attribute(name)); | ||
271 | + } else | ||
272 | + instance.setValue(instances.attribute(name), indexOfValue); | ||
273 | + } | ||
274 | + } | ||
275 | + | ||
276 | + instance.setDataset(instances); | ||
277 | + instances.add(instance); | ||
278 | + } | ||
279 | + | ||
280 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | +import weka.core.Instance; | ||
8 | +import weka.core.Instances; | ||
9 | + | ||
10 | +import java.io.Serializable; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.TreeMap; | ||
14 | + | ||
15 | +public class Model implements Serializable { | ||
16 | + | ||
17 | + private static final long serialVersionUID = 3351727361273283076L; | ||
18 | + private static final Logger logger = LoggerFactory.getLogger(Model.class); | ||
19 | + | ||
20 | + private Classifier classifier; | ||
21 | + private Set<String> quasiVerbs; | ||
22 | + private Instances instances; | ||
23 | + | ||
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | ||
25 | + this.classifier = classifier; | ||
26 | + this.instances = instances; | ||
27 | + this.quasiVerbs = quasiVerbs; | ||
28 | + } | ||
29 | + | ||
30 | + public boolean arePartOfSameMention(Instance instance, Sentence sentence) { | ||
31 | + try { | ||
32 | + double response = this.classifier.classifyInstance(instance); | ||
33 | + return response > 0; | ||
34 | + } catch (Exception e) { | ||
35 | + logger.error("Error classyfing verb in sentence: " + sentence, e); | ||
36 | + return false; | ||
37 | + } | ||
38 | + } | ||
39 | + | ||
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | ||
41 | + Instances instances = new Instances(this.instances); | ||
42 | + InstanceCreator.fillInstances(examples, instances); | ||
43 | + return instances; | ||
44 | + } | ||
45 | + | ||
46 | + public Instances getInstances() { | ||
47 | + Instances instances = new Instances(this.instances); | ||
48 | + return instances; | ||
49 | + } | ||
50 | + | ||
51 | + public Set<String> getQuasiVerbs() { | ||
52 | + return quasiVerbs; | ||
53 | + } | ||
54 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import java.io.File; | ||
4 | +import java.io.InputStream; | ||
5 | +import java.util.ArrayList; | ||
6 | +import java.util.HashSet; | ||
7 | +import java.util.List; | ||
8 | +import java.util.Map; | ||
9 | +import java.util.Set; | ||
10 | +import java.util.TreeMap; | ||
11 | +import java.util.Map.Entry; | ||
12 | + | ||
13 | +import org.slf4j.Logger; | ||
14 | +import org.slf4j.LoggerFactory; | ||
15 | + | ||
16 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
17 | +import pl.waw.ipipan.zil.core.md.detection.nominal.FeatureGeneration; | ||
18 | +import pl.waw.ipipan.zil.core.md.detection.nominal.InstanceCreator; | ||
19 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Model; | ||
20 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Serializer; | ||
21 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
22 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
23 | +import pl.waw.ipipan.zil.core.md.entities.Token; | ||
24 | +import weka.core.Instances; | ||
25 | + | ||
26 | +public class NominalMentionDetector { | ||
27 | + final private static Logger logger = LoggerFactory.getLogger(NominalMentionDetector.class); | ||
28 | + | ||
29 | + private Model model; | ||
30 | + private Set<String> quasiVerbs = new HashSet<>(); | ||
31 | + | ||
32 | + public void addNominalMentions(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, List<Token> heads) { | ||
33 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
34 | + InstanceCreator.loadExamplesFromSentence(quasiVerbs, valence, examples, sentence, heads); | ||
35 | + if (examples.isEmpty()) | ||
36 | + return; | ||
37 | + | ||
38 | + Instances instances = model.getInstances(); | ||
39 | + | ||
40 | + // label instances | ||
41 | + List<Boolean> areInSameMention = new ArrayList<>(); | ||
42 | + for (int i = 0; i < examples.size(); i++) { | ||
43 | + TreeMap<String, Object> example = examples.get(i); | ||
44 | + if (i - 1 < 0) { | ||
45 | + example.put("location-1", Boolean.valueOf(false)); | ||
46 | + //example.put("candidatePositionInMention", 0); | ||
47 | + } else { | ||
48 | + example.put("location-1", Boolean.valueOf(areInSameMention.get(i-1))); | ||
49 | +// int positionInMention = 1; | ||
50 | +// while (i - positionInMention >= 0 && areInSameMention.get(i-positionInMention)) { | ||
51 | +// positionInMention++; | ||
52 | +// } | ||
53 | +// example.put("candidatePositionInMention", positionInMention-1); | ||
54 | + } | ||
55 | + | ||
56 | + InstanceCreator.addInstance(example, instances); | ||
57 | + boolean inSameMention = model.arePartOfSameMention(instances.instance(i), sentence); | ||
58 | + areInSameMention.add(inSameMention); | ||
59 | + } | ||
60 | + | ||
61 | + int i = 0; | ||
62 | + for (Token head : sentence) { | ||
63 | + if (heads.contains(head)) { | ||
64 | + ArrayList<Token> mSegments = new ArrayList<Token>(); | ||
65 | + ArrayList<Token> mHead = new ArrayList<Token>(); | ||
66 | + mHead.add(head); | ||
67 | + for (Token t : sentence) { | ||
68 | + if (head.compareTo(t) != 0) { | ||
69 | + if (areInSameMention.get(i)) { | ||
70 | + mSegments.add(t); | ||
71 | + } | ||
72 | + } else { | ||
73 | + mSegments.add(t); | ||
74 | + } | ||
75 | + i++; | ||
76 | + } | ||
77 | + | ||
78 | + // cleaning | ||
79 | + if(mSegments.get(mSegments.size()-1).getCtag().equals("prep") || mSegments.get(mSegments.size()-1).getCtag().equals("conj") || | ||
80 | + mSegments.get(mSegments.size()-1).getCtag().equals("comp")) { | ||
81 | + mSegments.remove(mSegments.size()-1); | ||
82 | + } | ||
83 | + if(mSegments.get(0).getCtag().equals("prep") || mSegments.get(0).getCtag().equals("conj") || | ||
84 | + mSegments.get(0).getCtag().equals("comp")) { | ||
85 | + mSegments.remove(0); | ||
86 | + } | ||
87 | + | ||
88 | + sentence.addMention(new Mention(mSegments, mHead)); | ||
89 | + } | ||
90 | + } | ||
91 | + } | ||
92 | + | ||
93 | + public NominalMentionDetector(File zeroSubjectDetectionModel) { | ||
94 | + try { | ||
95 | + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | ||
96 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
97 | + } catch (Exception e) { | ||
98 | + logger.error("Error loading model:" + e); | ||
99 | + } | ||
100 | + } | ||
101 | + | ||
102 | + public NominalMentionDetector(InputStream zeroSubjectDetectionModelStream) { | ||
103 | + try { | ||
104 | + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | ||
105 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
106 | + } catch (Exception e) { | ||
107 | + logger.error("Error loading model:" + e); | ||
108 | + } | ||
109 | + } | ||
110 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import weka.core.SerializationHelper; | ||
4 | + | ||
5 | +import java.io.InputStream; | ||
6 | + | ||
7 | +public class Serializer { | ||
8 | + | ||
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | ||
10 | + SerializationHelper.write(targetModelFilePath, m); | ||
11 | + } | ||
12 | + | ||
13 | + public static Model loadModel(String path) throws Exception { | ||
14 | + Model m = (Model) SerializationHelper.read(path); | ||
15 | + return m; | ||
16 | + } | ||
17 | + | ||
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | ||
19 | + Model m = (Model) SerializationHelper.read(stream); | ||
20 | + return m; | ||
21 | + } | ||
22 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import pl.waw.ipipan.zil.core.md.Main;
import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import weka.classifiers.Evaluation;
import weka.classifiers.rules.JRip;
import weka.classifiers.rules.JRip.RipperRule;
import weka.classifiers.trees.J48;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
20 | + | ||
21 | +public class Trainer { | ||
22 | + | ||
private static final Logger logger = LoggerFactory.getLogger(Trainer.class);

// Cross-validation is expensive; enabled only for experiments.
private static final boolean DO_CV = false;
// Classpath resources bundled with the trainer.
private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";

// Valence dictionaries keyed by dictionary kind; filled once at class load.
// (was a raw `new EnumMap(...)` — now properly parameterized)
private static Map<ValenceDicts, Map<String, ArrayList<String>>> valence =
        new EnumMap<>(ValenceDicts.class);

static {
    InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
    valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

    InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
    valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
}

private Trainer() {
    // utility class — entry point is main()
}
43 | + | ||
/**
 * Trains the nominal mention detection model.
 * Usage: Trainer trainDir targetModelFile — reads NKJP texts from trainDir,
 * builds a J48 decision tree over token-pair examples and serializes it
 * (together with the dataset header and quasi-verb list) to targetModelFile.
 */
public static void main(String[] args) {

    if (args.length != 2) {
        logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
                + " trainDir targetModelFile");
        return;
    }

    File dataDir = new File(args[0]);
    String targetModelFilePath = args[1];

    if (!dataDir.isDirectory()) {
        logger.error(dataDir + " is not a directory!");
        return;
    }

    Set<String> quasiVerbs = loadQuasiVerbs();

    // NOTE(review): the static initializer already populated `valence` with
    // exactly these entries; this re-read looks redundant — confirm.
    InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
    valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

    InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
    valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));

    List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs, valence);
    Instances instances = InstanceCreator.createInstances(examples, "class");
    InstanceCreator.fillInstances(examples, instances);

    printStats(instances);

    try {
        J48 model;

        logger.info("Building final classifier...");
        model = new J48();
        model.buildClassifier(instances);
        logger.info("J48 tree:");
        logger.info(model.toString());

        // delete() empties the data rows so only the dataset header
        // (attribute definitions) is serialized with the model below
        instances.delete();
        logger.info("Features stats:");
        for (int i = 0; i < instances.numAttributes(); i++) {
            Attribute att = instances.attribute(i);
            logger.info(i + ".\t" + att.toString());
        }

        logger.info("Saving classifier...");
        Model m = new Model(model, instances, quasiVerbs);
        Serializer.saveModel(m, targetModelFilePath);
        logger.info("Done.");

    } catch (Exception e) {
        logger.error("Error: " + e);
    }

/* try {
    JRip model;

    if (DO_CV) {
        logger.info("Crossvalidation...");
        model = new JRip();
        Evaluation eval = new Evaluation(instances);
        eval.crossValidateModel(model, instances, 10, new Random(1));
        logger.info(eval.toSummaryString());
        logger.info(eval.toMatrixString());
        logger.info(eval.toClassDetailsString());
    }

    logger.info("Building final classifier...");
    model = new JRip();
    model.buildClassifier(instances);
    logger.info(model.getRuleset().size() + " rules generated.");
    for (int i = 0; i < model.getRuleset().size(); i++) {
        RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
        logger.info("\t" + v.toString(instances.classAttribute()));
    }

    instances.delete();
    logger.info("Features stats:");
    for (int i = 0; i < instances.numAttributes(); i++) {
        Attribute att = instances.attribute(i);
        logger.info(i + ".\t" + att.toString());
    }

    logger.info("Saving classifier...");
    Model m = new Model(model, instances, quasiVerbs);
    Serializer.saveModel(m, targetModelFilePath);
    logger.info("Done.");

} catch (Exception e) {
    logger.error("Error: " + e);
}*/
}
137 | + | ||
138 | + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) | ||
139 | + { | ||
140 | + Map<String,ArrayList<String>> map; | ||
141 | + try { | ||
142 | + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); | ||
143 | + map = new HashMap<String,ArrayList<String>>(); | ||
144 | + String line; | ||
145 | + boolean firstLine = true; | ||
146 | + while((line = br.readLine()) != null) { | ||
147 | + if (firstLine) { | ||
148 | + line = line.replace("\uFEFF", ""); // remove BOM character | ||
149 | + firstLine = false; | ||
150 | + } | ||
151 | + | ||
152 | + if (!line.startsWith("%")) { | ||
153 | + String[] lineParts = line.split(":"); | ||
154 | + String lemma = lineParts[0].trim(); | ||
155 | + String schema = lineParts[5].trim(); | ||
156 | + | ||
157 | + if (schema.trim().isEmpty()) { | ||
158 | + continue; | ||
159 | + } | ||
160 | + | ||
161 | + String[] lemmaParts = lemma.split(" "); | ||
162 | + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { | ||
163 | + lemma = lemma + " się"; | ||
164 | + } | ||
165 | + | ||
166 | + ArrayList<String> schemata; | ||
167 | + if (!map.containsKey(lemma)) { | ||
168 | + schemata = new ArrayList<String>(); | ||
169 | + schemata.add(schema); | ||
170 | + map.put(lemma, schemata); | ||
171 | + } else { | ||
172 | + schemata = map.get(lemma); | ||
173 | + schemata.add(schema); | ||
174 | + map.put(lemma, schemata); | ||
175 | + } | ||
176 | + } | ||
177 | + } | ||
178 | + br.close(); | ||
179 | + } catch (IOException ex) { | ||
180 | + ex.printStackTrace(); | ||
181 | + throw new RuntimeException(ex); | ||
182 | + } | ||
183 | + return map; | ||
184 | + } | ||
185 | + | ||
186 | + private static boolean schemaContainsSie(String schema) { | ||
187 | + for (String position : schema.split("\\s\\+\\s")) { | ||
188 | + position = position.trim(); | ||
189 | + position = position.substring(1, position.length()-1); | ||
190 | + for (String phrT : position.split(";")) { | ||
191 | + if (phrT.equals("refl") || phrT.equals("recip")) { | ||
192 | + return true; | ||
193 | + } | ||
194 | + } | ||
195 | + } | ||
196 | + | ||
197 | + return false; | ||
198 | + } | ||
199 | + | ||
200 | + private static Set<String> loadQuasiVerbs() { | ||
201 | + Set<String> quasiVerbs = new HashSet<>(); | ||
202 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
203 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
204 | + String line; | ||
205 | + while ((line = br.readLine()) != null) { | ||
206 | + quasiVerbs.add(line.trim()); | ||
207 | + } | ||
208 | + } catch (IOException e) { | ||
209 | + logger.error(e.getLocalizedMessage(), e); | ||
210 | + } | ||
211 | + return quasiVerbs; | ||
212 | + } | ||
213 | + | ||
214 | + private static void printStats(Instances instances) { | ||
215 | + int positive = 0; | ||
216 | + int negative = 0; | ||
217 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
218 | + Instance inst = instances.instance(i); | ||
219 | + if (inst.classValue() > 0) | ||
220 | + negative++; | ||
221 | + else | ||
222 | + positive++; | ||
223 | + } | ||
224 | + logger.info(positive + " positive examples"); | ||
225 | + logger.info(negative + " negative examples"); | ||
226 | + logger.info((positive + negative) + " examples total"); | ||
227 | + logger.info((instances.numAttributes() - 1) + " attributes"); | ||
228 | + logger.info(instances.toSummaryString()); | ||
229 | + } | ||
230 | + | ||
231 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.entities; | ||
2 | + | ||
3 | +public class Relation { | ||
4 | + | ||
5 | + private String name; | ||
6 | + private Token target; | ||
7 | + | ||
8 | + public Relation(String name, Token target) { | ||
9 | + this.name = name; | ||
10 | + this.target = target; | ||
11 | + } | ||
12 | + | ||
13 | + public String getName() { | ||
14 | + return name; | ||
15 | + } | ||
16 | + | ||
17 | + public Token getTarget() { | ||
18 | + return target; | ||
19 | + } | ||
20 | + | ||
21 | +} |