Commit 2d60e476d9f47fbd460efb5c02d1f76b50decb08

Authored by Bartłomiej Nitoń
1 parent 86cf20ea

Fully statistical mention detector version (2.0).

... ... @@ -4,13 +4,13 @@
4 4  
5 5 <groupId>pl.waw.ipipan.zil.core</groupId>
6 6 <artifactId>md</artifactId>
7   - <version>1.3</version>
  7 + <version>2.0</version>
8 8  
9 9 <developers>
10 10 <developer>
11   - <name>Mateusz Kopeć</name>
  11 + <name>Bartłomiej Nitoń</name>
12 12 <organization>ICS PAS</organization>
13   - <email>m.kopec@ipipan.waw.pl</email>
  13 + <email>bartek.niton@gmail.com</email>
14 14 </developer>
15 15 </developers>
16 16  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... ... @@ -4,6 +4,8 @@ import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
5 5  
6 6 import pl.waw.ipipan.zil.core.md.detection.Detector;
  7 +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
  8 +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
7 9 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
8 10 import pl.waw.ipipan.zil.core.md.entities.Text;
9 11 import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
... ... @@ -19,9 +21,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
19 21 import java.io.BufferedReader;
20 22 import java.io.File;
21 23 import java.io.FileInputStream;
  24 +import java.io.FileNotFoundException;
22 25 import java.io.IOException;
23 26 import java.io.InputStream;
24 27 import java.io.InputStreamReader;
  28 +import java.io.PrintWriter;
25 29 import java.util.ArrayList;
26 30 import java.util.EnumMap;
27 31 import java.util.HashMap;
... ... @@ -32,13 +36,17 @@ public class Main {
32 36 private static final Logger logger = LoggerFactory.getLogger(Main.class);
33 37  
34 38 private static final boolean GZIP_OUTPUT = true;
  39 + private static final String DEFAULT_HEAD_MODEL = "/head_model.bin";
  40 + private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin";
35 41 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
36 42 private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
37 43 private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";
38 44  
  45 + private static HeadDetector headModel;
  46 + private static NominalMentionDetector nominalMentionModel;
39 47 private static ZeroSubjectDetector zeroSubjectModel;
40 48  
41   - public static enum ValenceDicts {
  49 + public static enum ValenceDicts {
42 50 VerbsValence,
43 51 NounsValence
44 52 }
... ... @@ -47,6 +55,12 @@ public class Main {
47 55 new EnumMap(ValenceDicts.class);
48 56  
49 57 static {
  58 + InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL);
  59 + headModel = new HeadDetector(headDetectionModelStream);
  60 +
  61 + InputStream nominalMentionDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL);
  62 + nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream);
  63 +
50 64 InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
51 65 zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
52 66  
... ... @@ -138,6 +152,14 @@ public class Main {
138 152  
139 153 File inputDir = new File(args[0]);
140 154 File outputDir = new File(args[1]);
  155 + File defsOutputFile = new File(args[1], "definitions.csv");
  156 + PrintWriter defsWriter = null;
  157 + try {
  158 + defsWriter = new PrintWriter(defsOutputFile);
  159 + } catch (FileNotFoundException e1) {
  160 + // TODO Auto-generated catch block
  161 + e1.printStackTrace();
  162 + }
141 163  
142 164 if (!inputDir.isDirectory()) {
143 165 logger.error(inputDir + " is not a directory!");
... ... @@ -159,7 +181,6 @@ public class Main {
159 181 }
160 182  
161 183  
162   -
163 184 int all = 0;
164 185 int errors = 0;
165 186 for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
... ... @@ -167,13 +188,15 @@ public class Main {
167 188 try {
168 189 File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
169 190 TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
170   - annotateTeiText(teiText);
  191 + annotateTeiText(teiText, teiDir, defsWriter);
171 192 TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
172 193 } catch (IOException e) {
173 194 logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e);
174 195 errors++;
175 196 }
176 197 }
  198 +
  199 + defsWriter.close();
177 200  
178 201 logger.info(all + " texts processed succesfully.");
179 202 if (errors > 0)
... ... @@ -208,9 +231,9 @@ public class Main {
208 231 * @param thriftText text to annotate with mentions
209 232 * @throws MultiserviceException when an error occures
210 233 */
    public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException {
        Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
        // defsWriter is threaded through detection to collect candidate definition lines
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
        ThriftSaver.updateThriftText(responseText, thriftText);
    }
216 239  
... ... @@ -221,9 +244,9 @@ public class Main {
221 244 * @param teiText text to annotate with mentions
222 245 * @throws TEIException when an error occurs
223 246 */
    public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException {
        // textDir is needed to locate the optional side-car dependency-parse JSON for this text
        Text responseText = TeiLoader.loadTextFromTei(teiText, textDir);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
        TeiSaver.updateTeiText(responseText, teiText);
    }
229 252  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
... ... @@ -15,6 +15,8 @@ public class Constants {
15 15 "Adj", "Conj", "Comp");
16 16  
17 17 public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin");
  18 +
    // Dependency-parse labels whose subtrees are treated as mention candidates ("pd" disabled pending evaluation).
    public static final List<String> DEPPARSE_MLABELS = Arrays.asList("subj", "obj", "comp");//, "pd");
18 20  
19 21 private Constants() {
20 22 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... ... @@ -4,10 +4,15 @@ import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
5 5  
6 6 import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
  7 +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
  8 +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
7 9 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
8 10 import pl.waw.ipipan.zil.core.md.entities.*;
9 11  
  12 +import java.io.PrintWriter;
10 13 import java.util.ArrayList;
  14 +import java.util.Arrays;
  15 +import java.util.Collections;
11 16 import java.util.HashSet;
12 17 import java.util.List;
13 18 import java.util.Map;
... ... @@ -21,36 +26,47 @@ public class Detector {
21 26 }
22 27  
    /**
     * Detects mentions in every sentence of the text, discarding any mentions
     * annotated before.
     *
     * @param text             text to annotate (its mentions are cleared first)
     * @param headModel        statistical mention-head detector
     * @param zeroSubjectModel statistical zero (omitted) subject detector
     * @param nominalMentionModel statistical nominal-mention detector
     * @param valence          verb/noun valence dictionaries
     * @param defsWriter       sink for definition candidates; currently only
     *                         threaded through (see the getDefinitionsBy* helpers)
     */
    public static void findMentionsInText(Text text,
            HeadDetector headModel,
            ZeroSubjectDetector zeroSubjectModel,
            NominalMentionDetector nominalMentionModel,
            Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            PrintWriter defsWriter) {
        text.clearMentions();
        logger.debug("Detecting mentions in text " + text.getId());
        for (Paragraph p : text)
            for (Sentence s : p)
                detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
    }
32 40  
    /**
     * Runs mention detection on a single sentence. In version 2.0 the
     * rule-based adders and cleaners are disabled (kept commented out for
     * reference) in favour of the statistical head and nominal-mention models.
     */
    private static void detectMentionsInSentence(Sentence sentence,
            HeadDetector headModel,
            ZeroSubjectDetector zeroSubjectModel,
            NominalMentionDetector nominalMentionModel,
            Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            PrintWriter defsWriter) {
        // adding mentions — former rule-based pipeline, superseded by the statistical models below
//        addMentionsByTokenCtag(sentence);
//        addMentionsBySyntacticWordsCtag(sentence);
//        addMentionsByNamedEntities(sentence);
//        addMentionsByGroups(sentence, valence);
//        //addMentionsByDeppParse(sentence);
//        addSpeakerMentionsInSpoken(sentence);

        // zero subject detection
        zeroSubjectModel.addZeroSubjectMentions(sentence);

        // statistical detection: first find head tokens, then grow nominal mentions around them
        List<Token> heads = headModel.detectHeads(sentence);
        nominalMentionModel.addNominalMentions(sentence, valence, heads);

        // removing mentions
        // removeTo(sentence); — gives no improvement; the cleaners below still need re-evaluation
//        Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
//        Cleaner.cleanUnnecessarySentenceMentions(sentence);
//        Cleaner.cleanFrazeos(sentence);


        // updating mention heads
        // updateMentionHeads(sentence);
    }
55 71  
56 72 /**
... ... @@ -106,7 +122,7 @@ public class Detector {
106 122 private static void addMentionsByGroups(Sentence sentence,
107 123 Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
108 124  
109   - for (SyntacticGroup group : sentence.getGroups()) {
  125 + for (SyntacticGroup group : sentence.getGroups()) {
110 126 if (group.getType().startsWith("NG")) {
111 127 ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>();
112 128 nestedGroups.add(group);
... ... @@ -286,4 +302,431 @@ public class Detector {
286 302 sentence.addMention(new Mention(token));
287 303 }
288 304 }
  305 +
  306 + private static void addMentionsByDeppParse(Sentence sentence) {
  307 + for (Token tok : sentence) {
  308 + // sprawdzac czy wzmianka jest ciagla tekstowo, bo czasami depparser zwraca dziwne drzewka
  309 + /*HashSet<Relation> relations = tok.getRelations();
  310 + for (Relation rel : relations) {
  311 + if (Constants.DEPPARSE_MLABELS.contains(rel.getName())
  312 + && !rel.getTarget().getCtag().matches(Constants.MORPHO_CTAGS)
  313 + && !rel.getTarget().getCtag().equals("prep")) {
  314 + Mention mention = buildMentionFromSubtree(rel.getTarget());
  315 + if (mention != null && !sentence.getMentions().contains(mention)) {
  316 + sentence.addMention(mention);
  317 + }
  318 + }
  319 + }*/
  320 + if (tok.getCtag().matches(Constants.MORPHO_CTAGS) || tok.getCtag().equals("num")) {
  321 + Mention mention = buildMentionFromSubtree(tok);
  322 + if (mention != null && !sentence.getMentions().contains(mention)) {
  323 + sentence.addMention(mention);
  324 + }
  325 + }
  326 + }
  327 + }
  328 +
  329 + private static Mention buildMentionFromSubtree(Token head) {
  330 + List<Token> heads = new ArrayList<Token>();
  331 + List<Token> segments = new ArrayList<Token>();
  332 + heads.add(head);
  333 + //segments.add(head);
  334 + segments.addAll(getTreeSegments(head));
  335 + Collections.sort(segments);
  336 + Mention mention = null;
  337 + try {
  338 + segments = removeBorderingSegments(segments, Arrays.asList("qub", "interp"));
  339 + if (!segments.isEmpty()) {
  340 + mention = new Mention(segments, heads);
  341 + }
  342 + } catch (ArrayIndexOutOfBoundsException e) {
  343 + logger.warn("Strange dependency structure");
  344 + }
  345 + return mention;
  346 + }
  347 +
  348 + private static List<Token> removeBorderingSegments(List<Token> segments, List<String> tags2Remove) {
  349 + Token firstSeg = segments.get(0);
  350 + while(tags2Remove.contains(firstSeg.getCtag())) {
  351 + segments.remove(firstSeg);
  352 + if (segments.isEmpty()) {
  353 + return segments;
  354 + }
  355 + firstSeg = segments.get(0);
  356 + }
  357 +
  358 + Token lastSeg = segments.get(segments.size() - 1);
  359 + while(tags2Remove.contains(lastSeg.getCtag())) {
  360 + segments.remove(lastSeg);
  361 + if (segments.isEmpty()) {
  362 + return segments;
  363 + }
  364 + lastSeg = segments.get(segments.size() - 1);
  365 + }
  366 +
  367 + return segments;
  368 + }
  369 +
  370 + private static List<Token> removePrecedingAdjs(List<Token> segments) {
  371 + Token firstSeg = segments.get(0);
  372 + while(firstSeg.getCtag().equals("adj")) {
  373 + segments.remove(firstSeg);
  374 + if (segments.isEmpty()) {
  375 + return segments;
  376 + }
  377 + firstSeg = segments.get(0);
  378 + }
  379 + return segments;
  380 + }
  381 +
    /**
     * Collects the full dependency subtree rooted at {@code tok}, including
     * the root itself.
     * NOTE(review): recursion assumes the relation graph is acyclic — a cycle
     * would cause a StackOverflowError; TODO confirm the parser guarantees this.
     */
    private static HashSet<Token> getTreeSegments(Token tok) {
        HashSet<Token> segments = new HashSet<Token>();
        segments.add(tok);
        for (Relation rel : tok.getRelations()) {
            segments.addAll(getTreeSegments(rel.getTarget()));
        }
        return segments;
    }
  390 +
  391 +
    // Surface (orthographic) connector phrases signalling a definition,
    // e.g. "X, czyli Y" ("X, that is Y"). Matched against the text between two mentions/groups.
    private static final List<String> DEF_CONJS_ORTHS =
            Arrays.asList(//"to",
                    "to jest", "jest to", "zwane inaczej", "czyli", "inaczej mówiąc",
                    "inaczej nazywane", "zwane też", "zwane także", "zwane również", "zwane często",
                    "zwane zwykle", "definiowane jako", "znaczy tyle co", "rozumiane jako", "rozumiane jest",
                    "ktoś kto", "coś co", "nazywa się", "tak definiuje się");

    // Lemmatized (base-form) variants of the connectors above; used when form == "base".
    private static final List<String> DEF_CONJS_BASES =
            Arrays.asList(//"to",
                    "to być", "być to", "zwać inaczej", "czyli", "inaczej mówić",
                    "inaczej nazywać", "zwać też", "zwać także", "zwać również", "zwać często",
                    "zwać zwykle", "definiować jako", "znaczyć tyle co", "rozumieć jako", "rozumieć być",
                    "ktoś kto", "kto być kto",
                    "coś co", "co być co",
                    "nazywać się", "tak definiować się");

    // Honorific lead-ins ("pan"/"pani" = Mr/Mrs) that disqualify an apposition candidate.
    private static final List<String> ANN_SOURCE_TO_OMMIT =
            Arrays.asList("pan", "pani");
  411 +
  412 +
  413 + private static void getDefinitionsByGroups(Sentence sentence, String form, PrintWriter defsWriter) {
  414 + List<String> def_conjs = DEF_CONJS_ORTHS;
  415 + if (form.equals("base")) {
  416 + def_conjs = DEF_CONJS_BASES;
  417 + }
  418 + for (SyntacticGroup group : sentence.getGroups()) {
  419 + if (group.getType().startsWith("NG")) {
  420 + SyntacticGroup nextGroup = group.getClosestNGroup();
  421 +
  422 + if (nextGroup != null) {
  423 + int conjStart = group.getSentenceEndPosition() + 1;
  424 + int conjEnd = nextGroup.getSentenceStartPosition() - 1;
  425 + String conj = "";
  426 + if (conjEnd > conjStart && (group.containsNE() || nextGroup.containsNE())) {
  427 + conj = getText(sentence, conjStart, conjEnd, form);
  428 + if (def_conjs.contains(conj)) {
  429 + String definition = String.format("%s\t[%s%s%s]\t%s\t%s",
  430 + group.toString(),
  431 + conj, "/groups/", form,
  432 + nextGroup.toString(),
  433 + sentence.toStringWithoutMentions());
  434 + defsWriter.println(definition);
  435 + }
  436 + }
  437 + }
  438 +
  439 + }
  440 + }
  441 + }
  442 +
  443 + private static void getDefinitionsByMentions(Sentence sentence, String form, PrintWriter defsWriter) {
  444 + List<String> def_conjs = DEF_CONJS_ORTHS;
  445 + if (form.equals("base")) {
  446 + def_conjs = DEF_CONJS_BASES;
  447 + }
  448 + for (Mention mnt1 : sentence.getMentions()) {
  449 + int mnt1End = mnt1.getSentenceEndPosition();
  450 + for (Mention mnt2 : sentence.getMentions()) {
  451 + int mnt2Start = mnt2.getSentenceStartPosition();
  452 + int conjStart = mnt1End + 1;
  453 + int conjEnd = mnt2Start - 1;
  454 + if (conjEnd > conjStart) {
  455 + String conj = getText(sentence, conjStart, conjEnd, form);
  456 + if (def_conjs.contains(conj)) {
  457 + String definition = String.format("%s\t[%s%s%s]\t%s\t%s",
  458 + mnt1.toStringWithoutBrackets(),
  459 + conj, "/mentions/", form,
  460 + mnt2.toStringWithoutBrackets(),
  461 + sentence.toStringWithoutMentions());
  462 + defsWriter.println(definition);
  463 + }
  464 + }
  465 + }
  466 + }
  467 + }
  468 +
    /*==> buildDefinitionsFromSubtree:
      for a tree rooted at "subj", return every subtree introduced by an "app"
      relation; whatever sits directly under the "subj" itself is the keyword.
      See the sentence:

      Dr David Warner , neurofizjolog Akademii Medycznej Loma Linda w Kalifornii , wspólnie
      ze specjalistami z Uniwersytetu Stanforda opracował urządzenie reagujące na ruchy mięśni twarzy .
    */
  477 +
    /**
     * Extracts definition candidates from apposition ("app") edges of the
     * dependency parse: when a nominative noun and its apposed target agree in
     * case, number and gender (and neither is a personal pronoun), the chain
     * of appositions is split, NE fragments are merged, and the result is
     * written to {@code defsWriter}.
     */
    private static void getDefinitionsByDeppParse(Sentence sentence, PrintWriter defsWriter) {

        // split the mention along apposition relations

        for (Token source : sentence) {
            HashSet<Relation> relations = source.getRelations();
            for (Relation rel : relations) {
                if (//Constants.DEPPARSE_MLABELS.contains(rel.getName())
                    //rel.getName().equals("subj")
                    rel.getName().equals("app") &&
                    source.getReturnRelation() != null &&
                    //Constants.DEPPARSE_MLABELS.contains(source.getReturnRelation().getName())
                    ((source.getCase().equals("nom") && rel.getTarget().getCase().equals("nom")
                    && source.getNumber().equals(rel.getTarget().getNumber())
                    && source.getGender().equals(rel.getTarget().getGender())
                    && !source.isPpron() && !rel.getTarget().isPpron())
                    //|| source.getCtag().equals("brev")
                    ) // something still needs to be done about "brev" (abbreviations)
                    ) {
                    ArrayList<List<Token>> appositions = getAppositionsFromSubtree(source, rel.getTarget());
                    // merge fragments that together form one person name, then re-check
                    if (appositions.size() > 1 && containsNE(appositions)) {
                        appositions = mergeNEs(appositions);
                    }
                    if (appositions.size() > 1 && containsNE(appositions)) {
                        ArrayList<String> appsStrList = appositionsToString(appositions);
                        String appositionsStr = String.join("\t", appsStrList);

                        String definition = String.format("%s\t!!!!!\t%s",
                                //source.getOrth(),
                                appositionsStr,
                                sentence.toStringWithoutMentions());
                        defsWriter.println(definition);
                    }
                }
            }
        }
    }
  515 +
    /**
     * Recursively splits the subtree rooted at {@code root} into apposition
     * fragments: the part reachable without crossing "app" edges, extended by
     * NE membership, plus (recursively) the fragments of each "app" target
     * that is not part of the same person-name NE.
     */
    private static ArrayList<List<Token>> getAppositionsFromSubtree(Token root) {

        ArrayList<List<Token>> appositions = new ArrayList<List<Token>>();

        List<Token> segments = new ArrayList<Token>();
        segments.addAll(getTreeSegments(root, "app"));
        List<Token> allSegments = new ArrayList<Token>();
        allSegments.addAll(extendByNEs(segments));

        Collections.sort(allSegments);
        // note: ommitApp strips bordering punctuation from allSegments in place
        if (!ommitApp(allSegments)) {
            appositions.add(allSegments);
        }



        for (Token tok : allSegments) {
            for (Relation rel : tok.getRelations()) {
                if (rel.getName().equals("app") && !sameNE(tok, rel.getTarget())) {
                    appositions.addAll(getAppositionsFromSubtree(rel.getTarget()));
                }
            }
        }

        return appositions;
    }
  542 +
    /**
     * Splits an apposition edge {@code source -> target} into two fragments:
     * the source side (its subtree minus the branch leading to {@code target})
     * and the target side (its whole subtree), each extended by NE membership.
     * Returns an empty list when both tokens belong to the same person-name NE.
     */
    private static ArrayList<List<Token>> getAppositionsFromSubtree(Token source, Token target) {

        ArrayList<List<Token>> appositions = new ArrayList<List<Token>>();
        if (sameNE(source, target)) {
            return appositions;
        }

        List<Token> sourceSegments = new ArrayList<Token>();
        sourceSegments.addAll(getTreeSegments(source, target));
        List<Token> allSourceSegments = new ArrayList<Token>();
        allSourceSegments.addAll(extendByNEs(sourceSegments));

        Collections.sort(allSourceSegments);
        // note: ommitApp strips bordering punctuation from the list in place
        if (!ommitApp(allSourceSegments)) {
            appositions.add(allSourceSegments);
        }

        List<Token> targetSegments = new ArrayList<Token>();
        targetSegments.addAll(getTreeSegments(target));
        List<Token> allTargetSegments = new ArrayList<Token>();
        allTargetSegments.addAll(extendByNEs(targetSegments));

        Collections.sort(allTargetSegments);
        if (!ommitApp(allTargetSegments)) {
            appositions.add(allTargetSegments);
        }

        return appositions;
    }
  572 +
  573 + private static ArrayList<List<Token>> mergeNEs(ArrayList<List<Token>> appositions) {
  574 + ArrayList<List<Token>> appositionsCopy = new ArrayList<List<Token>>(appositions);
  575 + Sentence sentence = appositions.get(0).get(0).getSentence();
  576 + for (NamedEntity ne : sentence.getNamedEntities()) {
  577 + if (ne.getType().equals("persName")
  578 + && (ne.getSubtype() == null || ne.getSubtype().isEmpty())) {
  579 + HashSet<Token> mergedNE = new HashSet<Token>();
  580 + for (List<Token> app : appositionsCopy) {
  581 + if (ne.getTokens().containsAll(app)) {
  582 + mergedNE.addAll(app);
  583 + appositions.remove(app);
  584 + }
  585 + }
  586 + if (mergedNE.size() > 0) {
  587 + ArrayList newApposition = new ArrayList<Token>();
  588 + newApposition.addAll(mergedNE);
  589 + Collections.sort(newApposition);
  590 + appositions.add(newApposition);
  591 + }
  592 + appositionsCopy = new ArrayList<List<Token>>(appositions);
  593 + }
  594 + }
  595 + return appositions;
  596 + }
  597 +
  598 + public static boolean containsNE(ArrayList<List<Token>> appositions) {
  599 + for (List<Token> app : appositions) {
  600 + if (isNE(app)) {
  601 + return true;
  602 + }
  603 + /*for (Token tok : app) {
  604 + for (NamedEntity ne : sentence.getNamedEntities()) {
  605 + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) {
  606 + continue;
  607 + }
  608 + if (ne.getTokens().contains(tok)) {
  609 + return true;
  610 + }
  611 + }
  612 + }*/
  613 + }
  614 + return false;
  615 + }
  616 +
  617 + private static boolean isNE(List<Token> segments) {
  618 + Sentence sentence = segments.get(0).getSentence();
  619 + for (NamedEntity ne : sentence.getNamedEntities()) {
  620 + if (ne.getTokens().containsAll(segments) &&
  621 + segments.containsAll(ne.getTokens())) {
  622 + return true;
  623 + }
  624 + }
  625 + return false;
  626 + }
  627 +
  628 + private static ArrayList<String> appositionsToString(ArrayList<List<Token>> appositions) {
  629 + ArrayList<String> apposistionsStrs = new ArrayList<String>();
  630 + for (List<Token> apposition : appositions) {
  631 + String appText = getText(apposition, "orth");
  632 + apposistionsStrs.add(appText);
  633 + }
  634 + return apposistionsStrs;
  635 + }
  636 +
  637 +
  638 + private static boolean ommitApp(List<Token> segments) {
  639 + segments = removeBorderingSegments(segments, Arrays.asList("interp"));
  640 + if (segments.size() == 0) {
  641 + return true;
  642 + }
  643 + String appositionBase = getText(segments, "base");
  644 + if (ANN_SOURCE_TO_OMMIT.contains(segments.get(0).getBase().toLowerCase()) ||
  645 + appositionBase.length() < 2) {
  646 + return true;
  647 + }
  648 + return false;
  649 + }
  650 +
  651 + private static HashSet<Token> getTreeSegments(Token tok, String divRel) {
  652 + HashSet<Token> segments = new HashSet<Token>();
  653 + segments.add(tok);
  654 +
  655 + for (Relation rel : tok.getRelations()) {
  656 + if (!rel.getName().equals(divRel)) {
  657 + segments.addAll(getTreeSegments(rel.getTarget(), divRel));
  658 + }
  659 +
  660 + }
  661 + return segments;
  662 + }
  663 +
  664 + private static HashSet<Token> getTreeSegments(Token tok, Token nGoThere) {
  665 + HashSet<Token> segments = new HashSet<Token>();
  666 + segments.add(tok);
  667 +
  668 + for (Relation rel : tok.getRelations()) {
  669 + if (!rel.getTarget().equals(nGoThere)) {
  670 + segments.addAll(getTreeSegments(rel.getTarget(), nGoThere));
  671 + }
  672 +
  673 + }
  674 + return segments;
  675 + }
  676 +
    /**
     * Extends a token set upwards along return (head) relations as long as the
     * current token and its governor belong to the same person-name NE, so a
     * fragment never cuts a person name in half.
     */
    private static HashSet<Token> extendByNEs(List<Token> segments) {
        HashSet<Token> allSegments = new HashSet<Token>();
        allSegments.addAll(segments);
        for (Token tok : segments) {
            Token neTok = tok;
            // walk towards the root while still inside the same persName NE
            while (neTok.getReturnRelation() != null
                    && (//neTok.getReturnRelation().getName().equals("ne")
                        //||
                        sameNE(neTok, neTok.getReturnRelation().getTarget()))) {
                neTok = neTok.getReturnRelation().getTarget();
                allSegments.add(neTok);
            }
        }
        return allSegments;
    }
  692 +
  693 + private static boolean sameNE(Token tok1, Token tok2) {
  694 + Sentence sentence = tok1.getSentence();
  695 + for (NamedEntity ne : sentence.getNamedEntities()) {
  696 + if (ne.getTokens().contains(tok1)
  697 + && ne.getTokens().contains(tok2)
  698 + && ne.getType().equals("persName")) {
  699 + return true;
  700 + }
  701 + }
  702 + return false;
  703 + }
  704 +
  705 + // TODO: przeniesc do klasy Sentence i wywalic static
  706 + private static String getText(Sentence sentence, int start, int end, String form) {
  707 + String conj = "";
  708 + for (Token tok : sentence.subList(start, end+1)) {
  709 + if (!tok.getCtag().equals("interp")) {
  710 + if (form.equals("orth")) {
  711 + conj += " " + tok.getOrth();
  712 + } else if (form.equals("base")) {
  713 + conj += " " + tok.getBase();
  714 + }
  715 + }
  716 + }
  717 + return conj.trim();
  718 + }
  719 +
  720 + private static String getText(List<Token> segments, String form) {
  721 + String conj = "";
  722 + for (Token tok : segments) {
  723 + if (form.equals("orth")) {
  724 + conj += " " + tok.getOrth();
  725 + } else if (form.equals("base")) {
  726 + conj += " " + tok.getBase();
  727 + }
  728 + }
  729 + return conj.trim();
  730 + }
  731 +
289 732 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
... ... @@ -35,7 +35,7 @@ public class InstanceCreator {
35 35 allTexts++;
36 36 logger.info("Processing text " + textDir);
37 37 TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
38   - Text text = TeiLoader.loadTextFromTei(ct);
  38 + Text text = TeiLoader.loadTextFromTei(ct, textDir);
39 39  
40 40 for (Paragraph p : text)
41 41 for (Sentence s : p) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
... ... @@ -83,6 +83,14 @@ public class Mention implements Comparable&lt;Mention&gt; {
83 83 sb.append("]");
84 84 return sb.toString();
85 85 }
  86 +
  87 + public String toStringWithoutBrackets() {
  88 + StringBuffer sb = new StringBuffer();
  89 + for (Token seg : segments) {
  90 + sb.append(seg.toString() + " ");
  91 + }
  92 + return sb.toString();
  93 + }
86 94  
87 95 public MentionGroup getMentionGroup() {
88 96 return mentionGroup;
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
... ... @@ -6,14 +6,26 @@ import java.util.List;
6 6 public class NamedEntity implements Comparable<NamedEntity> {
7 7  
    // tokens spanned by this named entity, in sentence order
    private List<Token> tokens;
    // NE type, e.g. "persName"
    private String type;
    // NE subtype, e.g. "forename"; may be null or empty
    private String subtype;
9 11  
    /**
     * @param tokens  tokens covered by the entity
     * @param type    NE type (e.g. "persName")
     * @param subType NE subtype (e.g. "forename"; may be null or empty)
     */
    public NamedEntity(List<Token> tokens, String type, String subType) {
        this.tokens = tokens;
        this.type = type;
        this.subtype = subType;
    }
13 17  
    /** Tokens covered by this entity, in sentence order. */
    public List<Token> getTokens() {
        return this.tokens;
    }

    /** NE type, e.g. "persName". */
    public String getType() {
        return this.type;
    }

    /** NE subtype, e.g. "forename"; may be null or empty. */
    public String getSubtype() {
        return this.subtype;
    }
17 29  
18 30 @Override
19 31 public int compareTo(NamedEntity o) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
3 3 import java.util.ArrayList;
  4 +import java.util.Arrays;
4 5 import java.util.Iterator;
5 6 import java.util.List;
6 7  
... ... @@ -143,6 +144,30 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
143 144 return largestGroup;
144 145 }
145 146  
    /**
     * Finds the nearest nominal group ("NG*") starting after this group ends;
     * when several groups start at the same position, the one with the most
     * tokens wins. Returns {@code null} when no such group exists.
     * NOTE(review): the bound {@code <= sentence.size()} looks one past the
     * last valid position if positions are 0-based — harmless (no group starts
     * there) but worth confirming.
     */
    public SyntacticGroup getClosestNGroup() {
        SyntacticGroup nextGroup = null;
        Sentence sentence = this.tokens.get(0).getSentence();
        int nextTokenPosition = this.getSentenceEndPosition() + 1;
        while (nextTokenPosition <= sentence.size()) {

            for (SyntacticGroup group : sentence.getGroups()) {
                if (group.getType().startsWith("NG") &&
                        group.getSentenceStartPosition() == nextTokenPosition) {
                    if (nextGroup == null ||
                            nextGroup.getTokens().size() < group.getTokens().size()) {
                        nextGroup = group;
                    }
                }
            }
            if (nextGroup != null) {
                break;
            }
            nextTokenPosition ++;
        }

        return nextGroup;
    }
  170 +
146 171 public SyntacticWord getPrecedingVerb() {
147 172 int precedingTokenPosition = this.getSentenceStartPosition() - 1;
148 173 Sentence sentence = this.tokens.get(0).getSentence();
... ... @@ -190,5 +215,28 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
190 215 }
191 216 return parentPrepNG;
192 217 }
  218 +
  219 + public String toString() {
  220 + String textRep = "";
  221 + for (Token tok : tokens) {
  222 + textRep += " " + tok.getOrth();
  223 + }
  224 + return textRep.trim();
  225 + }
  226 +
  227 + public boolean containsNE() {
  228 + Sentence sentence = this.tokens.get(0).getSentence();
  229 + for (Token tok : tokens) {
  230 + for (NamedEntity ne : sentence.getNamedEntities()) {
  231 + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) {
  232 + continue;
  233 + }
  234 + if (ne.getTokens().contains(tok)) {
  235 + return true;
  236 + }
  237 + }
  238 + }
  239 + return false;
  240 + }
193 241  
194 242 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
... ... @@ -7,6 +7,8 @@ public class Token implements Comparable&lt;Token&gt; {
7 7 private int sentencePosition;
8 8  
9 9 private Set<Mention> mentions = null;
    // outgoing dependency relations for which this token is the source
    private HashSet<Relation> relations = new HashSet<Relation>();
    // presumably the inverse relation linking to this token's governor
    // (null for the root) — TODO confirm against the parser loader
    private Relation returnRelation = null;
10 12  
11 13 private String orth;
12 14 private Interpretation chosenInterpretation;
... ... @@ -119,10 +121,33 @@ public class Token implements Comparable&lt;Token&gt; {
119 121 public String getCtag() {
120 122 return getChosenInterpretation().getCtag();
121 123 }
  124 +
  125 + public boolean isPpron() {
  126 + if (this.getCtag().startsWith("ppron")) {
  127 + return true;
  128 + }
  129 + return false;
  130 + }
122 131  
123 132 @Override
124 133 public int compareTo(Token o) {
125 134 return getSentencePosition().compareTo(o.getSentencePosition());
126 135 }
  136 +
    /** Registers an outgoing dependency relation from this token. */
    public void addRelation(Relation relation) {
        relations.add(relation);
    }

    /** Outgoing dependency relations; note: the internal mutable set is exposed. */
    public HashSet<Relation> getRelations() {
        return relations;
    }

    /** Sets the inverse (governor) relation of this token. */
    public void setReturnRelation(Relation relation) {
        returnRelation = relation;
    }

    /** Inverse (governor) relation; {@code null} for the root token. */
    public Relation getReturnRelation() {
        return returnRelation;
    }
127 152  
128 153 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... ... @@ -8,11 +8,19 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
8 8 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
9 9  
10 10 import java.io.File;
  11 +import java.io.IOException;
  12 +import java.nio.charset.StandardCharsets;
  13 +import java.nio.file.Files;
  14 +import java.nio.file.Paths;
11 15 import java.util.ArrayList;
12 16 import java.util.HashMap;
13 17 import java.util.List;
14 18 import java.util.Map;
15 19  
  20 +import org.json.JSONArray;
  21 +import org.json.JSONObject;
  22 +
  23 +
16 24 public class TeiLoader {
17 25  
18 26 private static Logger logger = LoggerFactory.getLogger(TeiLoader.class);
... ... @@ -24,28 +32,75 @@ public class TeiLoader {
    /**
     * Reads a TEI corpus text from an NKJP-style directory by delegating to
     * the TEI API.
     *
     * @param teiDir directory containing the TEI files of one text
     * @return the parsed TEI corpus text
     * @throws TEIException if the directory cannot be read or parsed
     */
    public static TEICorpusText readTeiText(File teiDir) throws TEIException {
        return teiAPI.readFromNKJPDirectory(teiDir);
    }
27   -
28   - public static Text loadTextFromTei(TEICorpusText teiText) {
  35 +
  36 + public static Text loadTextFromTei(TEICorpusText teiText, File textDir) {
29 37 Text text = new Text(teiText.getCorpusHeader().getId());
  38 +
  39 + String textId = textDir.getName();
  40 +
  41 + System.out.println(textId);
  42 +
  43 + byte[] encoded;
  44 + JSONArray jsonParagraphs = null;
  45 + try {
  46 + //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json"));
  47 + encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json"));
  48 + String jsonContent = new String(encoded, StandardCharsets.UTF_8);
  49 + JSONObject jsonObject = new JSONObject(jsonContent);
  50 +
  51 + jsonParagraphs = jsonObject.getJSONArray("paragraphs");
  52 + } catch (IOException e) {
  53 + // TODO Auto-generated catch block
  54 + //e.printStackTrace();
  55 + logger.debug("No depparse layer.");
  56 + }
30 57  
31 58 logger.debug("Loading tei text " + text.getId() + "...");
32   - for (TEIParagraph teiP : teiText.getParagraphs())
33   - loadParagraph(text, teiP);
  59 +
  60 + List<TEIParagraph> teiParagraphs = teiText.getParagraphs();
  61 +
  62 + for (int i=0; i < teiParagraphs.size(); i++) {
  63 + TEIParagraph teiP = teiParagraphs.get(i);
  64 + JSONObject jsonP = null;
  65 + if (jsonParagraphs != null) {
  66 + jsonP = new JSONObject(jsonParagraphs.get(i).toString());
  67 + }
  68 + loadParagraph(text, teiP, jsonP);
  69 + }
34 70 logger.debug("Tei text loaded.");
35 71  
36 72 return text;
37 73 }
38 74  
39   - private static void loadParagraph(Text text, TEIParagraph teiP) {
  75 + private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) {
40 76 Paragraph p = new Paragraph();
41 77 text.add(p);
42   - for (TEISentence teiS : teiP.getSentences())
43   - loadSentence(p, teiS);
  78 +
  79 + List<TEISentence> teiSentences = teiP.getSentences();
  80 +
  81 + JSONArray jsonSentences = null;
  82 + if (jsonP != null) {
  83 + jsonSentences = jsonP.getJSONArray("sentences");
  84 + }
  85 +
  86 + for (int i=0; i < teiSentences.size(); i++) {
  87 + TEISentence teiS = teiSentences.get(i);
  88 +
  89 + JSONObject jsonS = null;
  90 + if (jsonP != null) {
  91 + if (i < jsonSentences.length()) {
  92 + jsonS = new JSONObject(jsonSentences.get(i).toString());
  93 + }
  94 + }
  95 +
  96 + loadSentence(p, teiS, jsonS);
  97 + }
44 98 }
45 99  
46   - private static void loadSentence(Paragraph p, TEISentence teiS) {
  100 + private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) {
47 101 Sentence s = new Sentence();
48 102 p.add(s);
  103 +
49 104 Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
50 105 for (TEIMorph teiM : teiS.getMorphs()) {
51 106 Token token = loadToken(s, teiM);
... ... @@ -59,6 +114,33 @@ public class TeiLoader {
59 114 loadSyntacticGroup(s, g, teiMorph2Segment);
60 115 for (TEIMention m : teiS.getAllMentions())
61 116 loadMentions(s, m, teiMorph2Segment);
  117 +
  118 + if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) {
  119 + JSONArray relations = jsonS.getJSONArray("dependencyParse");
  120 + for (int i=0; i<relations.length(); i++) {
  121 + loadRelation(s, new JSONObject(relations.get(i).toString()));
  122 + }
  123 + } else {
  124 + //System.out.println(s.toStringWithoutMentions());
  125 + }
  126 + }
  127 +
  128 + private static void loadRelation(Sentence s, JSONObject jsonRelation) {
  129 + String label = jsonRelation.getString("label");
  130 + if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() &&
  131 + jsonRelation.get("startTokenId").getClass() == String.class) {
  132 + String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\.");
  133 + String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\.");
  134 +
  135 + int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]);
  136 + int targetId = Integer.parseInt(targetIdParts[sourceIdParts.length-1]);
  137 +
  138 + Token source = s.get(sourceId);
  139 + Token target = s.get(targetId);
  140 +
  141 + source.addRelation(new Relation(label, target));
  142 + target.setReturnRelation(new Relation(label, source));
  143 + }
62 144 }
63 145  
64 146 private static void loadMentions(Sentence s, TEIMention m,
... ... @@ -107,7 +189,7 @@ public class TeiLoader {
107 189 List<Token> tokens = new ArrayList<>();
108 190 for (TEIMorph m : ne.getLeaves())
109 191 tokens.add(teiMorph2Segment.get(m));
110   - s.addNamedEntity(new NamedEntity(tokens));
  192 + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype()));
111 193 }
112 194  
113 195 private static Token loadToken(Sentence s, TEIMorph teiM) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
... ... @@ -86,7 +86,7 @@ public class ThriftLoader {
86 86 Map<String, Token> thiftTokenId2Token) {
87 87 List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
88 88 thiftTokenId2Token, false);
89   - s.addNamedEntity(new NamedEntity(tokens));
  89 + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype()));
90 90 }
91 91  
92 92 private static Map<String, Object> getThriftId2EntityMap(
... ...
src/main/resources/head_model.bin 0 → 100644
No preview for this file type
src/main/resources/nominal_model.bin 0 → 100644
No preview for this file type