Commit 2d60e476d9f47fbd460efb5c02d1f76b50decb08
1 parent
86cf20ea
Fully statistical mention detector version (2.0).
Showing
13 changed files
with
680 additions
and
37 deletions
pom.xml
... | ... | @@ -4,13 +4,13 @@ |
4 | 4 | |
5 | 5 | <groupId>pl.waw.ipipan.zil.core</groupId> |
6 | 6 | <artifactId>md</artifactId> |
7 | - <version>1.3</version> | |
7 | + <version>2.0</version> | |
8 | 8 | |
9 | 9 | <developers> |
10 | 10 | <developer> |
11 | - <name>Mateusz Kopeć</name> | |
11 | + <name>Bartłomiej Nitoń</name> | |
12 | 12 | <organization>ICS PAS</organization> |
13 | - <email>m.kopec@ipipan.waw.pl</email> | |
13 | + <email>bartek.niton@gmail.com</email> | |
14 | 14 | </developer> |
15 | 15 | </developers> |
16 | 16 | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... | ... | @@ -4,6 +4,8 @@ import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
5 | 5 | |
6 | 6 | import pl.waw.ipipan.zil.core.md.detection.Detector; |
7 | +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; | |
8 | +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector; | |
7 | 9 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
8 | 10 | import pl.waw.ipipan.zil.core.md.entities.Text; |
9 | 11 | import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; |
... | ... | @@ -19,9 +21,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; |
19 | 21 | import java.io.BufferedReader; |
20 | 22 | import java.io.File; |
21 | 23 | import java.io.FileInputStream; |
24 | +import java.io.FileNotFoundException; | |
22 | 25 | import java.io.IOException; |
23 | 26 | import java.io.InputStream; |
24 | 27 | import java.io.InputStreamReader; |
28 | +import java.io.PrintWriter; | |
25 | 29 | import java.util.ArrayList; |
26 | 30 | import java.util.EnumMap; |
27 | 31 | import java.util.HashMap; |
... | ... | @@ -32,13 +36,17 @@ public class Main { |
32 | 36 | private static final Logger logger = LoggerFactory.getLogger(Main.class); |
33 | 37 | |
34 | 38 | private static final boolean GZIP_OUTPUT = true; |
39 | + private static final String DEFAULT_HEAD_MODEL = "/head_model.bin"; | |
40 | + private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin"; | |
35 | 41 | private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; |
36 | 42 | private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt"; |
37 | 43 | private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt"; |
38 | 44 | |
45 | + private static HeadDetector headModel; | |
46 | + private static NominalMentionDetector nominalMentionModel; | |
39 | 47 | private static ZeroSubjectDetector zeroSubjectModel; |
40 | 48 | |
41 | - public static enum ValenceDicts { | |
49 | + public static enum ValenceDicts { | |
42 | 50 | VerbsValence, |
43 | 51 | NounsValence |
44 | 52 | } |
... | ... | @@ -47,6 +55,12 @@ public class Main { |
47 | 55 | new EnumMap(ValenceDicts.class); |
48 | 56 | |
49 | 57 | static { |
58 | + InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL); | |
59 | + headModel = new HeadDetector(headDetectionModelStream); | |
60 | + | |
61 | + InputStream nominalMentionDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL); | |
62 | + nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream); | |
63 | + | |
50 | 64 | InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); |
51 | 65 | zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); |
52 | 66 | |
... | ... | @@ -138,6 +152,14 @@ public class Main { |
138 | 152 | |
139 | 153 | File inputDir = new File(args[0]); |
140 | 154 | File outputDir = new File(args[1]); |
155 | + File defsOutputFile = new File(args[1], "definitions.csv"); | |
156 | + PrintWriter defsWriter = null; | |
157 | + try { | |
158 | + defsWriter = new PrintWriter(defsOutputFile); | |
159 | + } catch (FileNotFoundException e1) { | |
160 | + // TODO Auto-generated catch block | |
161 | + e1.printStackTrace(); | |
162 | + } | |
141 | 163 | |
142 | 164 | if (!inputDir.isDirectory()) { |
143 | 165 | logger.error(inputDir + " is not a directory!"); |
... | ... | @@ -159,7 +181,6 @@ public class Main { |
159 | 181 | } |
160 | 182 | |
161 | 183 | |
162 | - | |
163 | 184 | int all = 0; |
164 | 185 | int errors = 0; |
165 | 186 | for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { |
... | ... | @@ -167,13 +188,15 @@ public class Main { |
167 | 188 | try { |
168 | 189 | File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); |
169 | 190 | TEICorpusText teiText = TeiLoader.readTeiText(teiDir); |
170 | - annotateTeiText(teiText); | |
191 | + annotateTeiText(teiText, teiDir, defsWriter); | |
171 | 192 | TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); |
172 | 193 | } catch (IOException e) { |
173 | 194 | logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); |
174 | 195 | errors++; |
175 | 196 | } |
176 | 197 | } |
198 | + | |
199 | + defsWriter.close(); | |
177 | 200 | |
178 | 201 | logger.info(all + " texts processed succesfully."); |
179 | 202 | if (errors > 0) |
... | ... | @@ -208,9 +231,9 @@ public class Main { |
208 | 231 | * @param thriftText text to annotate with mentions |
209 | 232 | * @throws MultiserviceException when an error occures |
210 | 233 | */ |
211 | - public static void annotateThriftText(TText thriftText) throws MultiserviceException { | |
234 | + public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException { | |
212 | 235 | Text responseText = ThriftLoader.loadTextFromThrift(thriftText); |
213 | - Detector.findMentionsInText(responseText, zeroSubjectModel, valence); | |
236 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
214 | 237 | ThriftSaver.updateThriftText(responseText, thriftText); |
215 | 238 | } |
216 | 239 | |
... | ... | @@ -221,9 +244,9 @@ public class Main { |
221 | 244 | * @param teiText text to annotate with mentions |
222 | 245 | * @throws TEIException when an error occurs |
223 | 246 | */ |
224 | - public static void annotateTeiText(TEICorpusText teiText) throws TEIException { | |
225 | - Text responseText = TeiLoader.loadTextFromTei(teiText); | |
226 | - Detector.findMentionsInText(responseText, zeroSubjectModel, valence); | |
247 | + public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException { | |
248 | + Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); | |
249 | + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
227 | 250 | TeiSaver.updateTeiText(responseText, teiText); |
228 | 251 | } |
229 | 252 | |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
... | ... | @@ -15,6 +15,8 @@ public class Constants { |
15 | 15 | "Adj", "Conj", "Comp"); |
16 | 16 | |
17 | 17 | public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin"); |
18 | + | |
19 | + public static final List<String> DEPPARSE_MLABELS = Arrays.asList("subj", "obj", "comp");//, "pd"); | |
18 | 20 | |
19 | 21 | private Constants() { |
20 | 22 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... | ... | @@ -4,10 +4,15 @@ import org.slf4j.Logger; |
4 | 4 | import org.slf4j.LoggerFactory; |
5 | 5 | |
6 | 6 | import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; |
7 | +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; | |
8 | +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector; | |
7 | 9 | import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; |
8 | 10 | import pl.waw.ipipan.zil.core.md.entities.*; |
9 | 11 | |
12 | +import java.io.PrintWriter; | |
10 | 13 | import java.util.ArrayList; |
14 | +import java.util.Arrays; | |
15 | +import java.util.Collections; | |
11 | 16 | import java.util.HashSet; |
12 | 17 | import java.util.List; |
13 | 18 | import java.util.Map; |
... | ... | @@ -21,36 +26,47 @@ public class Detector { |
21 | 26 | } |
22 | 27 | |
23 | 28 | public static void findMentionsInText(Text text, |
29 | + HeadDetector headModel, | |
24 | 30 | ZeroSubjectDetector zeroSubjectModel, |
25 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
31 | + NominalMentionDetector nominalMentionModel, | |
32 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
33 | + PrintWriter defsWriter) { | |
26 | 34 | text.clearMentions(); |
27 | 35 | logger.debug("Detecting mentions in text " + text.getId()); |
28 | 36 | for (Paragraph p : text) |
29 | 37 | for (Sentence s : p) |
30 | - detectMentionsInSentence(s, zeroSubjectModel, valence); | |
38 | + detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); | |
31 | 39 | } |
32 | 40 | |
33 | 41 | private static void detectMentionsInSentence(Sentence sentence, |
42 | + HeadDetector headModel, | |
34 | 43 | ZeroSubjectDetector zeroSubjectModel, |
35 | - Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | |
44 | + NominalMentionDetector nominalMentionModel, | |
45 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | |
46 | + PrintWriter defsWriter) { | |
36 | 47 | // adding mentions |
37 | - addMentionsByTokenCtag(sentence); | |
38 | - addMentionsBySyntacticWordsCtag(sentence); | |
39 | - addMentionsByNamedEntities(sentence); | |
40 | - addMentionsByGroups(sentence, valence); | |
41 | - addSpeakerMentionsInSpoken(sentence); | |
48 | +// addMentionsByTokenCtag(sentence); | |
49 | +// addMentionsBySyntacticWordsCtag(sentence); | |
50 | +// addMentionsByNamedEntities(sentence); | |
51 | +// addMentionsByGroups(sentence, valence); | |
52 | +// //addMentionsByDeppParse(sentence); | |
53 | +// addSpeakerMentionsInSpoken(sentence); | |
42 | 54 | |
43 | 55 | // zero subject detection |
44 | 56 | zeroSubjectModel.addZeroSubjectMentions(sentence); |
57 | + | |
58 | + List<Token> heads = headModel.detectHeads(sentence); | |
59 | + nominalMentionModel.addNominalMentions(sentence, valence, heads); | |
45 | 60 | |
46 | 61 | // removing mentions |
47 | - removeTo(sentence); | |
48 | - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
49 | - Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
50 | - Cleaner.cleanFrazeos(sentence); | |
62 | + // removeTo(sentence); to nic nie daje, jeszcze ponizsze spradzic | |
63 | +// Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); | |
64 | +// Cleaner.cleanUnnecessarySentenceMentions(sentence); | |
65 | +// Cleaner.cleanFrazeos(sentence); | |
66 | + | |
51 | 67 | |
52 | 68 | // updating mention heads |
53 | - updateMentionHeads(sentence); | |
69 | + // updateMentionHeads(sentence); | |
54 | 70 | } |
55 | 71 | |
56 | 72 | /** |
... | ... | @@ -106,7 +122,7 @@ public class Detector { |
106 | 122 | private static void addMentionsByGroups(Sentence sentence, |
107 | 123 | Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { |
108 | 124 | |
109 | - for (SyntacticGroup group : sentence.getGroups()) { | |
125 | + for (SyntacticGroup group : sentence.getGroups()) { | |
110 | 126 | if (group.getType().startsWith("NG")) { |
111 | 127 | ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>(); |
112 | 128 | nestedGroups.add(group); |
... | ... | @@ -286,4 +302,431 @@ public class Detector { |
286 | 302 | sentence.addMention(new Mention(token)); |
287 | 303 | } |
288 | 304 | } |
305 | + | |
306 | + private static void addMentionsByDeppParse(Sentence sentence) { | |
307 | + for (Token tok : sentence) { | |
308 | + // sprawdzac czy wzmianka jest ciagla tekstowo, bo czasami depparser zwraca dziwne drzewka | |
309 | + /*HashSet<Relation> relations = tok.getRelations(); | |
310 | + for (Relation rel : relations) { | |
311 | + if (Constants.DEPPARSE_MLABELS.contains(rel.getName()) | |
312 | + && !rel.getTarget().getCtag().matches(Constants.MORPHO_CTAGS) | |
313 | + && !rel.getTarget().getCtag().equals("prep")) { | |
314 | + Mention mention = buildMentionFromSubtree(rel.getTarget()); | |
315 | + if (mention != null && !sentence.getMentions().contains(mention)) { | |
316 | + sentence.addMention(mention); | |
317 | + } | |
318 | + } | |
319 | + }*/ | |
320 | + if (tok.getCtag().matches(Constants.MORPHO_CTAGS) || tok.getCtag().equals("num")) { | |
321 | + Mention mention = buildMentionFromSubtree(tok); | |
322 | + if (mention != null && !sentence.getMentions().contains(mention)) { | |
323 | + sentence.addMention(mention); | |
324 | + } | |
325 | + } | |
326 | + } | |
327 | + } | |
328 | + | |
329 | + private static Mention buildMentionFromSubtree(Token head) { | |
330 | + List<Token> heads = new ArrayList<Token>(); | |
331 | + List<Token> segments = new ArrayList<Token>(); | |
332 | + heads.add(head); | |
333 | + //segments.add(head); | |
334 | + segments.addAll(getTreeSegments(head)); | |
335 | + Collections.sort(segments); | |
336 | + Mention mention = null; | |
337 | + try { | |
338 | + segments = removeBorderingSegments(segments, Arrays.asList("qub", "interp")); | |
339 | + if (!segments.isEmpty()) { | |
340 | + mention = new Mention(segments, heads); | |
341 | + } | |
342 | + } catch (ArrayIndexOutOfBoundsException e) { | |
343 | + logger.warn("Strange dependency structure"); | |
344 | + } | |
345 | + return mention; | |
346 | + } | |
347 | + | |
348 | + private static List<Token> removeBorderingSegments(List<Token> segments, List<String> tags2Remove) { | |
349 | + Token firstSeg = segments.get(0); | |
350 | + while(tags2Remove.contains(firstSeg.getCtag())) { | |
351 | + segments.remove(firstSeg); | |
352 | + if (segments.isEmpty()) { | |
353 | + return segments; | |
354 | + } | |
355 | + firstSeg = segments.get(0); | |
356 | + } | |
357 | + | |
358 | + Token lastSeg = segments.get(segments.size() - 1); | |
359 | + while(tags2Remove.contains(lastSeg.getCtag())) { | |
360 | + segments.remove(lastSeg); | |
361 | + if (segments.isEmpty()) { | |
362 | + return segments; | |
363 | + } | |
364 | + lastSeg = segments.get(segments.size() - 1); | |
365 | + } | |
366 | + | |
367 | + return segments; | |
368 | + } | |
369 | + | |
370 | + private static List<Token> removePrecedingAdjs(List<Token> segments) { | |
371 | + Token firstSeg = segments.get(0); | |
372 | + while(firstSeg.getCtag().equals("adj")) { | |
373 | + segments.remove(firstSeg); | |
374 | + if (segments.isEmpty()) { | |
375 | + return segments; | |
376 | + } | |
377 | + firstSeg = segments.get(0); | |
378 | + } | |
379 | + return segments; | |
380 | + } | |
381 | + | |
382 | + private static HashSet<Token> getTreeSegments(Token tok) { | |
383 | + HashSet<Token> segments = new HashSet<Token>(); | |
384 | + segments.add(tok); | |
385 | + for (Relation rel : tok.getRelations()) { | |
386 | + segments.addAll(getTreeSegments(rel.getTarget())); | |
387 | + } | |
388 | + return segments; | |
389 | + } | |
390 | + | |
391 | + | |
392 | + private static final List<String> DEF_CONJS_ORTHS = | |
393 | + Arrays.asList(//"to", | |
394 | + "to jest", "jest to", "zwane inaczej", "czyli", "inaczej mówiąc", | |
395 | + "inaczej nazywane", "zwane też", "zwane także", "zwane również", "zwane często", | |
396 | + "zwane zwykle", "definiowane jako", "znaczy tyle co", "rozumiane jako", "rozumiane jest", | |
397 | + "ktoś kto", "coś co", "nazywa się", "tak definiuje się"); | |
398 | + | |
399 | + private static final List<String> DEF_CONJS_BASES = | |
400 | + Arrays.asList(//"to", | |
401 | + "to być", "być to", "zwać inaczej", "czyli", "inaczej mówić", | |
402 | + "inaczej nazywać", "zwać też", "zwać także", "zwać również", "zwać często", | |
403 | + "zwać zwykle", "definiować jako", "znaczyć tyle co", "rozumieć jako", "rozumieć być", | |
404 | + "ktoś kto", "kto być kto", | |
405 | + "coś co", "co być co", | |
406 | + "nazywać się", "tak definiować się"); | |
407 | + | |
408 | + | |
409 | + private static final List<String> ANN_SOURCE_TO_OMMIT = | |
410 | + Arrays.asList("pan", "pani"); | |
411 | + | |
412 | + | |
413 | + private static void getDefinitionsByGroups(Sentence sentence, String form, PrintWriter defsWriter) { | |
414 | + List<String> def_conjs = DEF_CONJS_ORTHS; | |
415 | + if (form.equals("base")) { | |
416 | + def_conjs = DEF_CONJS_BASES; | |
417 | + } | |
418 | + for (SyntacticGroup group : sentence.getGroups()) { | |
419 | + if (group.getType().startsWith("NG")) { | |
420 | + SyntacticGroup nextGroup = group.getClosestNGroup(); | |
421 | + | |
422 | + if (nextGroup != null) { | |
423 | + int conjStart = group.getSentenceEndPosition() + 1; | |
424 | + int conjEnd = nextGroup.getSentenceStartPosition() - 1; | |
425 | + String conj = ""; | |
426 | + if (conjEnd > conjStart && (group.containsNE() || nextGroup.containsNE())) { | |
427 | + conj = getText(sentence, conjStart, conjEnd, form); | |
428 | + if (def_conjs.contains(conj)) { | |
429 | + String definition = String.format("%s\t[%s%s%s]\t%s\t%s", | |
430 | + group.toString(), | |
431 | + conj, "/groups/", form, | |
432 | + nextGroup.toString(), | |
433 | + sentence.toStringWithoutMentions()); | |
434 | + defsWriter.println(definition); | |
435 | + } | |
436 | + } | |
437 | + } | |
438 | + | |
439 | + } | |
440 | + } | |
441 | + } | |
442 | + | |
443 | + private static void getDefinitionsByMentions(Sentence sentence, String form, PrintWriter defsWriter) { | |
444 | + List<String> def_conjs = DEF_CONJS_ORTHS; | |
445 | + if (form.equals("base")) { | |
446 | + def_conjs = DEF_CONJS_BASES; | |
447 | + } | |
448 | + for (Mention mnt1 : sentence.getMentions()) { | |
449 | + int mnt1End = mnt1.getSentenceEndPosition(); | |
450 | + for (Mention mnt2 : sentence.getMentions()) { | |
451 | + int mnt2Start = mnt2.getSentenceStartPosition(); | |
452 | + int conjStart = mnt1End + 1; | |
453 | + int conjEnd = mnt2Start - 1; | |
454 | + if (conjEnd > conjStart) { | |
455 | + String conj = getText(sentence, conjStart, conjEnd, form); | |
456 | + if (def_conjs.contains(conj)) { | |
457 | + String definition = String.format("%s\t[%s%s%s]\t%s\t%s", | |
458 | + mnt1.toStringWithoutBrackets(), | |
459 | + conj, "/mentions/", form, | |
460 | + mnt2.toStringWithoutBrackets(), | |
461 | + sentence.toStringWithoutMentions()); | |
462 | + defsWriter.println(definition); | |
463 | + } | |
464 | + } | |
465 | + } | |
466 | + } | |
467 | + } | |
468 | + | |
469 | + /*==> buildDefinitionsFromSubtree, | |
470 | + zwrocic dla drzewa o korzeniu subj, wszystkie poddrzewa | |
471 | + rozpoczynane relacja app, to co pod samym subj, to keyword: | |
472 | + patrz zdanie: | |
473 | + | |
474 | + Dr David Warner , neurofizjolog Akademii Medycznej Loma Linda w Kalifornii , wspólnie | |
475 | + ze specjalistami z Uniwersytetu Stanforda opracował urządzenie reagujące na ruchy mięśni twarzy . | |
476 | + */ | |
477 | + | |
478 | + private static void getDefinitionsByDeppParse(Sentence sentence, PrintWriter defsWriter) { | |
479 | + | |
480 | + // podzielic mention przez relacje apozycji | |
481 | + | |
482 | + for (Token source : sentence) { | |
483 | + HashSet<Relation> relations = source.getRelations(); | |
484 | + for (Relation rel : relations) { | |
485 | + if (//Constants.DEPPARSE_MLABELS.contains(rel.getName()) | |
486 | + //rel.getName().equals("subj") | |
487 | + rel.getName().equals("app") && | |
488 | + source.getReturnRelation() != null && | |
489 | + //Constants.DEPPARSE_MLABELS.contains(source.getReturnRelation().getName()) | |
490 | + ((source.getCase().equals("nom") && rel.getTarget().getCase().equals("nom") | |
491 | + && source.getNumber().equals(rel.getTarget().getNumber()) | |
492 | + && source.getGender().equals(rel.getTarget().getGender()) | |
493 | + && !source.isPpron() && !rel.getTarget().isPpron()) | |
494 | + //|| source.getCtag().equals("brev") | |
495 | + ) //cos z tym brevem zrobic trzeba | |
496 | + ) { | |
497 | + ArrayList<List<Token>> appositions = getAppositionsFromSubtree(source, rel.getTarget()); | |
498 | + if (appositions.size() > 1 && containsNE(appositions)) { | |
499 | + appositions = mergeNEs(appositions); | |
500 | + } | |
501 | + if (appositions.size() > 1 && containsNE(appositions)) { | |
502 | + ArrayList<String> appsStrList = appositionsToString(appositions); | |
503 | + String appositionsStr = String.join("\t", appsStrList); | |
504 | + | |
505 | + String definition = String.format("%s\t!!!!!\t%s", | |
506 | + //source.getOrth(), | |
507 | + appositionsStr, | |
508 | + sentence.toStringWithoutMentions()); | |
509 | + defsWriter.println(definition); | |
510 | + } | |
511 | + } | |
512 | + } | |
513 | + } | |
514 | + } | |
515 | + | |
516 | + private static ArrayList<List<Token>> getAppositionsFromSubtree(Token root) { | |
517 | + | |
518 | + ArrayList<List<Token>> appositions = new ArrayList<List<Token>>(); | |
519 | + | |
520 | + List<Token> segments = new ArrayList<Token>(); | |
521 | + segments.addAll(getTreeSegments(root, "app")); | |
522 | + List<Token> allSegments = new ArrayList<Token>(); | |
523 | + allSegments.addAll(extendByNEs(segments)); | |
524 | + | |
525 | + Collections.sort(allSegments); | |
526 | + if (!ommitApp(allSegments)) { | |
527 | + appositions.add(allSegments); | |
528 | + } | |
529 | + | |
530 | + | |
531 | + | |
532 | + for (Token tok : allSegments) { | |
533 | + for (Relation rel : tok.getRelations()) { | |
534 | + if (rel.getName().equals("app") && !sameNE(tok, rel.getTarget())) { | |
535 | + appositions.addAll(getAppositionsFromSubtree(rel.getTarget())); | |
536 | + } | |
537 | + } | |
538 | + } | |
539 | + | |
540 | + return appositions; | |
541 | + } | |
542 | + | |
543 | + private static ArrayList<List<Token>> getAppositionsFromSubtree(Token source, Token target) { | |
544 | + | |
545 | + ArrayList<List<Token>> appositions = new ArrayList<List<Token>>(); | |
546 | + if (sameNE(source, target)) { | |
547 | + return appositions; | |
548 | + } | |
549 | + | |
550 | + List<Token> sourceSegments = new ArrayList<Token>(); | |
551 | + sourceSegments.addAll(getTreeSegments(source, target)); | |
552 | + List<Token> allSourceSegments = new ArrayList<Token>(); | |
553 | + allSourceSegments.addAll(extendByNEs(sourceSegments)); | |
554 | + | |
555 | + Collections.sort(allSourceSegments); | |
556 | + if (!ommitApp(allSourceSegments)) { | |
557 | + appositions.add(allSourceSegments); | |
558 | + } | |
559 | + | |
560 | + List<Token> targetSegments = new ArrayList<Token>(); | |
561 | + targetSegments.addAll(getTreeSegments(target)); | |
562 | + List<Token> allTargetSegments = new ArrayList<Token>(); | |
563 | + allTargetSegments.addAll(extendByNEs(targetSegments)); | |
564 | + | |
565 | + Collections.sort(allTargetSegments); | |
566 | + if (!ommitApp(allTargetSegments)) { | |
567 | + appositions.add(allTargetSegments); | |
568 | + } | |
569 | + | |
570 | + return appositions; | |
571 | + } | |
572 | + | |
573 | + private static ArrayList<List<Token>> mergeNEs(ArrayList<List<Token>> appositions) { | |
574 | + ArrayList<List<Token>> appositionsCopy = new ArrayList<List<Token>>(appositions); | |
575 | + Sentence sentence = appositions.get(0).get(0).getSentence(); | |
576 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
577 | + if (ne.getType().equals("persName") | |
578 | + && (ne.getSubtype() == null || ne.getSubtype().isEmpty())) { | |
579 | + HashSet<Token> mergedNE = new HashSet<Token>(); | |
580 | + for (List<Token> app : appositionsCopy) { | |
581 | + if (ne.getTokens().containsAll(app)) { | |
582 | + mergedNE.addAll(app); | |
583 | + appositions.remove(app); | |
584 | + } | |
585 | + } | |
586 | + if (mergedNE.size() > 0) { | |
587 | + ArrayList newApposition = new ArrayList<Token>(); | |
588 | + newApposition.addAll(mergedNE); | |
589 | + Collections.sort(newApposition); | |
590 | + appositions.add(newApposition); | |
591 | + } | |
592 | + appositionsCopy = new ArrayList<List<Token>>(appositions); | |
593 | + } | |
594 | + } | |
595 | + return appositions; | |
596 | + } | |
597 | + | |
598 | + public static boolean containsNE(ArrayList<List<Token>> appositions) { | |
599 | + for (List<Token> app : appositions) { | |
600 | + if (isNE(app)) { | |
601 | + return true; | |
602 | + } | |
603 | + /*for (Token tok : app) { | |
604 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
605 | + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) { | |
606 | + continue; | |
607 | + } | |
608 | + if (ne.getTokens().contains(tok)) { | |
609 | + return true; | |
610 | + } | |
611 | + } | |
612 | + }*/ | |
613 | + } | |
614 | + return false; | |
615 | + } | |
616 | + | |
617 | + private static boolean isNE(List<Token> segments) { | |
618 | + Sentence sentence = segments.get(0).getSentence(); | |
619 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
620 | + if (ne.getTokens().containsAll(segments) && | |
621 | + segments.containsAll(ne.getTokens())) { | |
622 | + return true; | |
623 | + } | |
624 | + } | |
625 | + return false; | |
626 | + } | |
627 | + | |
628 | + private static ArrayList<String> appositionsToString(ArrayList<List<Token>> appositions) { | |
629 | + ArrayList<String> apposistionsStrs = new ArrayList<String>(); | |
630 | + for (List<Token> apposition : appositions) { | |
631 | + String appText = getText(apposition, "orth"); | |
632 | + apposistionsStrs.add(appText); | |
633 | + } | |
634 | + return apposistionsStrs; | |
635 | + } | |
636 | + | |
637 | + | |
638 | + private static boolean ommitApp(List<Token> segments) { | |
639 | + segments = removeBorderingSegments(segments, Arrays.asList("interp")); | |
640 | + if (segments.size() == 0) { | |
641 | + return true; | |
642 | + } | |
643 | + String appositionBase = getText(segments, "base"); | |
644 | + if (ANN_SOURCE_TO_OMMIT.contains(segments.get(0).getBase().toLowerCase()) || | |
645 | + appositionBase.length() < 2) { | |
646 | + return true; | |
647 | + } | |
648 | + return false; | |
649 | + } | |
650 | + | |
651 | + private static HashSet<Token> getTreeSegments(Token tok, String divRel) { | |
652 | + HashSet<Token> segments = new HashSet<Token>(); | |
653 | + segments.add(tok); | |
654 | + | |
655 | + for (Relation rel : tok.getRelations()) { | |
656 | + if (!rel.getName().equals(divRel)) { | |
657 | + segments.addAll(getTreeSegments(rel.getTarget(), divRel)); | |
658 | + } | |
659 | + | |
660 | + } | |
661 | + return segments; | |
662 | + } | |
663 | + | |
664 | + private static HashSet<Token> getTreeSegments(Token tok, Token nGoThere) { | |
665 | + HashSet<Token> segments = new HashSet<Token>(); | |
666 | + segments.add(tok); | |
667 | + | |
668 | + for (Relation rel : tok.getRelations()) { | |
669 | + if (!rel.getTarget().equals(nGoThere)) { | |
670 | + segments.addAll(getTreeSegments(rel.getTarget(), nGoThere)); | |
671 | + } | |
672 | + | |
673 | + } | |
674 | + return segments; | |
675 | + } | |
676 | + | |
677 | + private static HashSet<Token> extendByNEs(List<Token> segments) { | |
678 | + HashSet<Token> allSegments = new HashSet<Token>(); | |
679 | + allSegments.addAll(segments); | |
680 | + for (Token tok : segments) { | |
681 | + Token neTok = tok; | |
682 | + while (neTok.getReturnRelation() != null | |
683 | + && (//neTok.getReturnRelation().getName().equals("ne") | |
684 | + //|| | |
685 | + sameNE(neTok, neTok.getReturnRelation().getTarget()))) { | |
686 | + neTok = neTok.getReturnRelation().getTarget(); | |
687 | + allSegments.add(neTok); | |
688 | + } | |
689 | + } | |
690 | + return allSegments; | |
691 | + } | |
692 | + | |
693 | + private static boolean sameNE(Token tok1, Token tok2) { | |
694 | + Sentence sentence = tok1.getSentence(); | |
695 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
696 | + if (ne.getTokens().contains(tok1) | |
697 | + && ne.getTokens().contains(tok2) | |
698 | + && ne.getType().equals("persName")) { | |
699 | + return true; | |
700 | + } | |
701 | + } | |
702 | + return false; | |
703 | + } | |
704 | + | |
705 | + // TODO: przeniesc do klasy Sentence i wywalic static | |
706 | + private static String getText(Sentence sentence, int start, int end, String form) { | |
707 | + String conj = ""; | |
708 | + for (Token tok : sentence.subList(start, end+1)) { | |
709 | + if (!tok.getCtag().equals("interp")) { | |
710 | + if (form.equals("orth")) { | |
711 | + conj += " " + tok.getOrth(); | |
712 | + } else if (form.equals("base")) { | |
713 | + conj += " " + tok.getBase(); | |
714 | + } | |
715 | + } | |
716 | + } | |
717 | + return conj.trim(); | |
718 | + } | |
719 | + | |
720 | + private static String getText(List<Token> segments, String form) { | |
721 | + String conj = ""; | |
722 | + for (Token tok : segments) { | |
723 | + if (form.equals("orth")) { | |
724 | + conj += " " + tok.getOrth(); | |
725 | + } else if (form.equals("base")) { | |
726 | + conj += " " + tok.getBase(); | |
727 | + } | |
728 | + } | |
729 | + return conj.trim(); | |
730 | + } | |
731 | + | |
289 | 732 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
... | ... | @@ -35,7 +35,7 @@ public class InstanceCreator { |
35 | 35 | allTexts++; |
36 | 36 | logger.info("Processing text " + textDir); |
37 | 37 | TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); |
38 | - Text text = TeiLoader.loadTextFromTei(ct); | |
38 | + Text text = TeiLoader.loadTextFromTei(ct, textDir); | |
39 | 39 | |
40 | 40 | for (Paragraph p : text) |
41 | 41 | for (Sentence s : p) { |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
... | ... | @@ -83,6 +83,14 @@ public class Mention implements Comparable<Mention> { |
83 | 83 | sb.append("]"); |
84 | 84 | return sb.toString(); |
85 | 85 | } |
86 | + | |
87 | + public String toStringWithoutBrackets() { | |
88 | + StringBuffer sb = new StringBuffer(); | |
89 | + for (Token seg : segments) { | |
90 | + sb.append(seg.toString() + " "); | |
91 | + } | |
92 | + return sb.toString(); | |
93 | + } | |
86 | 94 | |
87 | 95 | public MentionGroup getMentionGroup() { |
88 | 96 | return mentionGroup; |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
... | ... | @@ -6,14 +6,26 @@ import java.util.List; |
6 | 6 | public class NamedEntity implements Comparable<NamedEntity> { |
7 | 7 | |
8 | 8 | private List<Token> tokens; |
9 | + private String type; | |
10 | + private String subtype; | |
9 | 11 | |
10 | - public NamedEntity(List<Token> tokens) { | |
12 | + public NamedEntity(List<Token> tokens, String type, String subType) { | |
11 | 13 | this.tokens = tokens; |
14 | + this.type = type; | |
15 | + this.subtype = subType; | |
12 | 16 | } |
13 | 17 | |
14 | 18 | public List<Token> getTokens() { |
15 | 19 | return this.tokens; |
16 | 20 | } |
21 | + | |
22 | + public String getType() { | |
23 | + return this.type; | |
24 | + } | |
25 | + | |
26 | + public String getSubtype() { | |
27 | + return this.subtype; | |
28 | + } | |
17 | 29 | |
18 | 30 | @Override |
19 | 31 | public int compareTo(NamedEntity o) { |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 | 1 | package pl.waw.ipipan.zil.core.md.entities; |
2 | 2 | |
3 | 3 | import java.util.ArrayList; |
4 | +import java.util.Arrays; | |
4 | 5 | import java.util.Iterator; |
5 | 6 | import java.util.List; |
6 | 7 | |
... | ... | @@ -143,6 +144,30 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
143 | 144 | return largestGroup; |
144 | 145 | } |
145 | 146 | |
147 | + public SyntacticGroup getClosestNGroup() { | |
148 | + SyntacticGroup nextGroup = null; | |
149 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
150 | + int nextTokenPosition = this.getSentenceEndPosition() + 1; | |
151 | + while (nextTokenPosition <= sentence.size()) { | |
152 | + | |
153 | + for (SyntacticGroup group : sentence.getGroups()) { | |
154 | + if (group.getType().startsWith("NG") && | |
155 | + group.getSentenceStartPosition() == nextTokenPosition) { | |
156 | + if (nextGroup == null || | |
157 | + nextGroup.getTokens().size() < group.getTokens().size()) { | |
158 | + nextGroup = group; | |
159 | + } | |
160 | + } | |
161 | + } | |
162 | + if (nextGroup != null) { | |
163 | + break; | |
164 | + } | |
165 | + nextTokenPosition++; | |
166 | + } | |
167 | + | |
168 | + return nextGroup; | |
169 | + } | |
170 | + | |
146 | 171 | public SyntacticWord getPrecedingVerb() { |
147 | 172 | int precedingTokenPosition = this.getSentenceStartPosition() - 1; |
148 | 173 | Sentence sentence = this.tokens.get(0).getSentence(); |
... | ... | @@ -190,5 +215,28 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { |
190 | 215 | } |
191 | 216 | return parentPrepNG; |
192 | 217 | } |
218 | + | |
219 | + public String toString() { | |
220 | + String textRep = ""; | |
221 | + for (Token tok : tokens) { | |
222 | + textRep += " " + tok.getOrth(); | |
223 | + } | |
224 | + return textRep.trim(); | |
225 | + } | |
226 | + | |
227 | + public boolean containsNE() { | |
228 | + Sentence sentence = this.tokens.get(0).getSentence(); | |
229 | + for (Token tok : tokens) { | |
230 | + for (NamedEntity ne : sentence.getNamedEntities()) { | |
231 | + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) { | |
232 | + continue; | |
233 | + } | |
234 | + if (ne.getTokens().contains(tok)) { | |
235 | + return true; | |
236 | + } | |
237 | + } | |
238 | + } | |
239 | + return false; | |
240 | + } | |
193 | 241 | |
194 | 242 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
... | ... | @@ -7,6 +7,8 @@ public class Token implements Comparable<Token> { |
7 | 7 | private int sentencePosition; |
8 | 8 | |
9 | 9 | private Set<Mention> mentions = null; |
10 | + private HashSet<Relation> relations = new HashSet<Relation>(); | |
11 | + private Relation returnRelation = null; | |
10 | 12 | |
11 | 13 | private String orth; |
12 | 14 | private Interpretation chosenInterpretation; |
... | ... | @@ -119,10 +121,33 @@ public class Token implements Comparable<Token> { |
119 | 121 | public String getCtag() { |
120 | 122 | return getChosenInterpretation().getCtag(); |
121 | 123 | } |
124 | + | |
125 | + public boolean isPpron() { | |
126 | + if (this.getCtag().startsWith("ppron")) { | |
127 | + return true; | |
128 | + } | |
129 | + return false; | |
130 | + } | |
122 | 131 | |
123 | 132 | @Override |
124 | 133 | public int compareTo(Token o) { |
125 | 134 | return getSentencePosition().compareTo(o.getSentencePosition()); |
126 | 135 | } |
136 | + | |
137 | + public void addRelation(Relation relation) { | |
138 | + relations.add(relation); | |
139 | + } | |
140 | + | |
141 | + public HashSet<Relation> getRelations() { | |
142 | + return relations; | |
143 | + } | |
144 | + | |
145 | + public void setReturnRelation(Relation relation) { | |
146 | + returnRelation = relation; | |
147 | + } | |
148 | + | |
149 | + public Relation getReturnRelation() { | |
150 | + return returnRelation; | |
151 | + } | |
127 | 152 | |
128 | 153 | } |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... | ... | @@ -8,11 +8,19 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; |
8 | 8 | import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; |
9 | 9 | |
10 | 10 | import java.io.File; |
11 | +import java.io.IOException; | |
12 | +import java.nio.charset.StandardCharsets; | |
13 | +import java.nio.file.Files; | |
14 | +import java.nio.file.Paths; | |
11 | 15 | import java.util.ArrayList; |
12 | 16 | import java.util.HashMap; |
13 | 17 | import java.util.List; |
14 | 18 | import java.util.Map; |
15 | 19 | |
20 | +import org.json.JSONArray; | |
21 | +import org.json.JSONObject; | |
22 | + | |
23 | + | |
16 | 24 | public class TeiLoader { |
17 | 25 | |
18 | 26 | private static Logger logger = LoggerFactory.getLogger(TeiLoader.class); |
... | ... | @@ -24,28 +32,75 @@ public class TeiLoader { |
24 | 32 | public static TEICorpusText readTeiText(File teiDir) throws TEIException { |
25 | 33 | return teiAPI.readFromNKJPDirectory(teiDir); |
26 | 34 | } |
27 | - | |
28 | - public static Text loadTextFromTei(TEICorpusText teiText) { | |
35 | + | |
36 | + public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { | |
29 | 37 | Text text = new Text(teiText.getCorpusHeader().getId()); |
38 | + | |
39 | + String textId = textDir.getName(); | |
40 | + | |
41 | + System.out.println(textId); | |
42 | + | |
43 | + byte[] encoded; | |
44 | + JSONArray jsonParagraphs = null; | |
45 | + try { | |
46 | + //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json")); | |
47 | + encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json")); | |
48 | + String jsonContent = new String(encoded, StandardCharsets.UTF_8); | |
49 | + JSONObject jsonObject = new JSONObject(jsonContent); | |
50 | + | |
51 | + jsonParagraphs = jsonObject.getJSONArray("paragraphs"); | |
52 | + } catch (IOException e) { | |
53 | + // TODO Auto-generated catch block | |
54 | + //e.printStackTrace(); | |
55 | + logger.debug("No depparse layer."); | |
56 | + } | |
30 | 57 | |
31 | 58 | logger.debug("Loading tei text " + text.getId() + "..."); |
32 | - for (TEIParagraph teiP : teiText.getParagraphs()) | |
33 | - loadParagraph(text, teiP); | |
59 | + | |
60 | + List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); | |
61 | + | |
62 | + for (int i=0; i < teiParagraphs.size(); i++) { | |
63 | + TEIParagraph teiP = teiParagraphs.get(i); | |
64 | + JSONObject jsonP = null; | |
65 | + if (jsonParagraphs != null) { | |
66 | + jsonP = new JSONObject(jsonParagraphs.get(i).toString()); | |
67 | + } | |
68 | + loadParagraph(text, teiP, jsonP); | |
69 | + } | |
34 | 70 | logger.debug("Tei text loaded."); |
35 | 71 | |
36 | 72 | return text; |
37 | 73 | } |
38 | 74 | |
39 | - private static void loadParagraph(Text text, TEIParagraph teiP) { | |
75 | + private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) { | |
40 | 76 | Paragraph p = new Paragraph(); |
41 | 77 | text.add(p); |
42 | - for (TEISentence teiS : teiP.getSentences()) | |
43 | - loadSentence(p, teiS); | |
78 | + | |
79 | + List<TEISentence> teiSentences = teiP.getSentences(); | |
80 | + | |
81 | + JSONArray jsonSentences = null; | |
82 | + if (jsonP != null) { | |
83 | + jsonSentences = jsonP.getJSONArray("sentences"); | |
84 | + } | |
85 | + | |
86 | + for (int i=0; i < teiSentences.size(); i++) { | |
87 | + TEISentence teiS = teiSentences.get(i); | |
88 | + | |
89 | + JSONObject jsonS = null; | |
90 | + if (jsonP != null) { | |
91 | + if (i < jsonSentences.length()) { | |
92 | + jsonS = new JSONObject(jsonSentences.get(i).toString()); | |
93 | + } | |
94 | + } | |
95 | + | |
96 | + loadSentence(p, teiS, jsonS); | |
97 | + } | |
44 | 98 | } |
45 | 99 | |
46 | - private static void loadSentence(Paragraph p, TEISentence teiS) { | |
100 | + private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) { | |
47 | 101 | Sentence s = new Sentence(); |
48 | 102 | p.add(s); |
103 | + | |
49 | 104 | Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); |
50 | 105 | for (TEIMorph teiM : teiS.getMorphs()) { |
51 | 106 | Token token = loadToken(s, teiM); |
... | ... | @@ -59,6 +114,33 @@ public class TeiLoader { |
59 | 114 | loadSyntacticGroup(s, g, teiMorph2Segment); |
60 | 115 | for (TEIMention m : teiS.getAllMentions()) |
61 | 116 | loadMentions(s, m, teiMorph2Segment); |
117 | + | |
118 | + if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) { | |
119 | + JSONArray relations = jsonS.getJSONArray("dependencyParse"); | |
120 | + for (int i=0; i<relations.length(); i++) { | |
121 | + loadRelation(s, new JSONObject(relations.get(i).toString())); | |
122 | + } | |
123 | + } else { | |
124 | + //System.out.println(s.toStringWithoutMentions()); | |
125 | + } | |
126 | + } | |
127 | + | |
128 | + private static void loadRelation(Sentence s, JSONObject jsonRelation) { | |
129 | + String label = jsonRelation.getString("label"); | |
130 | + if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() && | |
131 | + jsonRelation.get("startTokenId").getClass() == String.class) { | |
132 | + String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\."); | |
133 | + String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\."); | |
134 | + | |
135 | + int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]); | |
136 | + int targetId = Integer.parseInt(targetIdParts[targetIdParts.length-1]); | |
137 | + | |
138 | + Token source = s.get(sourceId); | |
139 | + Token target = s.get(targetId); | |
140 | + | |
141 | + source.addRelation(new Relation(label, target)); | |
142 | + target.setReturnRelation(new Relation(label, source)); | |
143 | + } | |
62 | 144 | } |
63 | 145 | |
64 | 146 | private static void loadMentions(Sentence s, TEIMention m, |
... | ... | @@ -107,7 +189,7 @@ public class TeiLoader { |
107 | 189 | List<Token> tokens = new ArrayList<>(); |
108 | 190 | for (TEIMorph m : ne.getLeaves()) |
109 | 191 | tokens.add(teiMorph2Segment.get(m)); |
110 | - s.addNamedEntity(new NamedEntity(tokens)); | |
192 | + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype())); | |
111 | 193 | } |
112 | 194 | |
113 | 195 | private static Token loadToken(Sentence s, TEIMorph teiM) { |
... | ... |
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
... | ... | @@ -86,7 +86,7 @@ public class ThriftLoader { |
86 | 86 | Map<String, Token> thiftTokenId2Token) { |
87 | 87 | List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, |
88 | 88 | thiftTokenId2Token, false); |
89 | - s.addNamedEntity(new NamedEntity(tokens)); | |
89 | + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype())); | |
90 | 90 | } |
91 | 91 | |
92 | 92 | private static Map<String, Object> getThriftId2EntityMap( |
... | ... |
src/main/resources/head_model.bin
0 → 100644
No preview for this file type
src/main/resources/nominal_model.bin
0 → 100644
No preview for this file type