From 2d60e476d9f47fbd460efb5c02d1f76b50decb08 Mon Sep 17 00:00:00 2001 From: bniton <bartek.niton@gmail.com> Date: Tue, 15 May 2018 15:01:42 +0200 Subject: [PATCH] Fully statistical mention detector version (2.0). --- pom.xml | 6 +++--- src/main/java/pl/waw/ipipan/zil/core/md/Main.java | 39 +++++++++++++++++++++++++++++++-------- src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java | 2 ++ src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java | 471 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------- src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java | 2 +- src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java | 8 ++++++++ src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java | 14 +++++++++++++- src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java | 48 ++++++++++++++++++++++++++++++++++++++++++++++++ src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java | 25 +++++++++++++++++++++++++ src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java | 2 +- src/main/resources/head_model.bin | Bin 0 -> 65359 bytes src/main/resources/nominal_model.bin | Bin 0 -> 3880515 bytes 13 files changed, 680 insertions(+), 37 deletions(-) create mode 100644 src/main/resources/head_model.bin create mode 100644 src/main/resources/nominal_model.bin diff --git a/pom.xml b/pom.xml index f8c4fb8..b86b5b2 100644 --- 
a/pom.xml +++ b/pom.xml @@ -4,13 +4,13 @@ <groupId>pl.waw.ipipan.zil.core</groupId> <artifactId>md</artifactId> - <version>1.3</version> + <version>2.0</version> <developers> <developer> - <name>Mateusz Kopeć</name> + <name>Bartłomiej Nitoń</name> <organization>ICS PAS</organization> - <email>m.kopec@ipipan.waw.pl</email> + <email>bartek.niton@gmail.com</email> </developer> </developers> diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java index 1820371..32dea8d 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java @@ -4,6 +4,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.core.md.detection.Detector; +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector; import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; import pl.waw.ipipan.zil.core.md.entities.Text; import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; @@ -19,9 +21,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.PrintWriter; import java.util.ArrayList; import java.util.EnumMap; import java.util.HashMap; @@ -32,13 +36,17 @@ public class Main { private static final Logger logger = LoggerFactory.getLogger(Main.class); private static final boolean GZIP_OUTPUT = true; + private static final String DEFAULT_HEAD_MODEL = "/head_model.bin"; + private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin"; private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt"; private static final String DEFAULT_NOUNS_VALENCE 
= "/walenty_nouns.txt"; + private static HeadDetector headModel; + private static NominalMentionDetector nominalMentionModel; private static ZeroSubjectDetector zeroSubjectModel; - public static enum ValenceDicts { + public static enum ValenceDicts { VerbsValence, NounsValence } @@ -47,6 +55,12 @@ public class Main { new EnumMap(ValenceDicts.class); static { + InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL); + headModel = new HeadDetector(headDetectionModelStream); + + InputStream nominalMentionDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL); + nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream); + InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream); @@ -138,6 +152,14 @@ public class Main { File inputDir = new File(args[0]); File outputDir = new File(args[1]); + File defsOutputFile = new File(args[1], "definitions.csv"); + PrintWriter defsWriter = null; + try { + defsWriter = new PrintWriter(defsOutputFile); + } catch (FileNotFoundException e1) { + /* processing continues without the definitions file; defsWriter stays null in that case */ + logger.error("Cannot open " + defsOutputFile + " for writing", e1); + } if (!inputDir.isDirectory()) { logger.error(inputDir + " is not a directory!"); @@ -159,7 +181,6 @@ public class Main { } - int all = 0; int errors = 0; for (File teiDir : IOUtils.getNKJPDirs(inputDir)) { @@ -167,13 +188,15 @@ public class Main { try { File targetDir = createTargetTextDir(inputDir, outputDir, teiDir); TEICorpusText teiText = TeiLoader.readTeiText(teiDir); - annotateTeiText(teiText); + annotateTeiText(teiText, teiDir, defsWriter); TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT); } catch (IOException e) { logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e); errors++; } } + + if (defsWriter != null) { defsWriter.close(); } /* null when definitions.csv could not be opened above */ logger.info(all + " texts processed succesfully."); if 
(errors > 0) @@ -208,9 +231,9 @@ public class Main { * @param thriftText text to annotate with mentions * @throws MultiserviceException when an error occures */ - public static void annotateThriftText(TText thriftText) throws MultiserviceException { + public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException { Text responseText = ThriftLoader.loadTextFromThrift(thriftText); - Detector.findMentionsInText(responseText, zeroSubjectModel, valence); + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); ThriftSaver.updateThriftText(responseText, thriftText); } @@ -221,9 +244,9 @@ public class Main { * @param teiText text to annotate with mentions * @throws TEIException when an error occurs */ - public static void annotateTeiText(TEICorpusText teiText) throws TEIException { - Text responseText = TeiLoader.loadTextFromTei(teiText); - Detector.findMentionsInText(responseText, zeroSubjectModel, valence); + public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException { + Text responseText = TeiLoader.loadTextFromTei(teiText, textDir); + Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); TeiSaver.updateTeiText(responseText, teiText); } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java index cc7aa2a..8da1fee 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java @@ -15,6 +15,8 @@ public class Constants { "Adj", "Conj", "Comp"); public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin"); + + public static final List<String> DEPPARSE_MLABELS = Arrays.asList("subj", "obj", "comp");//, "pd"); private Constants() { } diff --git 
a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java index 504ab4d..1c4197c 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java @@ -4,10 +4,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector; import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; import pl.waw.ipipan.zil.core.md.entities.*; +import java.io.PrintWriter; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -21,36 +26,47 @@ public class Detector { } public static void findMentionsInText(Text text, + HeadDetector headModel, ZeroSubjectDetector zeroSubjectModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { + NominalMentionDetector nominalMentionModel, + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, + PrintWriter defsWriter) { text.clearMentions(); logger.debug("Detecting mentions in text " + text.getId()); for (Paragraph p : text) for (Sentence s : p) - detectMentionsInSentence(s, zeroSubjectModel, valence); + detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter); } private static void detectMentionsInSentence(Sentence sentence, + HeadDetector headModel, ZeroSubjectDetector zeroSubjectModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { + NominalMentionDetector nominalMentionModel, + Map<ValenceDicts,Map<String,ArrayList<String>>> valence, + PrintWriter defsWriter) { // adding mentions - addMentionsByTokenCtag(sentence); - addMentionsBySyntacticWordsCtag(sentence); - addMentionsByNamedEntities(sentence); - 
addMentionsByGroups(sentence, valence); - addSpeakerMentionsInSpoken(sentence); +// addMentionsByTokenCtag(sentence); +// addMentionsBySyntacticWordsCtag(sentence); +// addMentionsByNamedEntities(sentence); +// addMentionsByGroups(sentence, valence); +// //addMentionsByDeppParse(sentence); +// addSpeakerMentionsInSpoken(sentence); // zero subject detection zeroSubjectModel.addZeroSubjectMentions(sentence); + + List<Token> heads = headModel.detectHeads(sentence); + nominalMentionModel.addNominalMentions(sentence, valence, heads); // removing mentions - removeTo(sentence); - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); - Cleaner.cleanUnnecessarySentenceMentions(sentence); - Cleaner.cleanFrazeos(sentence); + // removeTo(sentence); to nic nie daje, jeszcze ponizsze spradzic +// Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); +// Cleaner.cleanUnnecessarySentenceMentions(sentence); +// Cleaner.cleanFrazeos(sentence); + // updating mention heads - updateMentionHeads(sentence); + // updateMentionHeads(sentence); } /** @@ -106,7 +122,7 @@ public class Detector { private static void addMentionsByGroups(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { - for (SyntacticGroup group : sentence.getGroups()) { + for (SyntacticGroup group : sentence.getGroups()) { if (group.getType().startsWith("NG")) { ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>(); nestedGroups.add(group); @@ -286,4 +302,431 @@ public class Detector { sentence.addMention(new Mention(token)); } } + + private static void addMentionsByDeppParse(Sentence sentence) { + for (Token tok : sentence) { + // sprawdzac czy wzmianka jest ciagla tekstowo, bo czasami depparser zwraca dziwne drzewka + /*HashSet<Relation> relations = tok.getRelations(); + for (Relation rel : relations) { + if (Constants.DEPPARSE_MLABELS.contains(rel.getName()) + && 
!rel.getTarget().getCtag().matches(Constants.MORPHO_CTAGS) + && !rel.getTarget().getCtag().equals("prep")) { + Mention mention = buildMentionFromSubtree(rel.getTarget()); + if (mention != null && !sentence.getMentions().contains(mention)) { + sentence.addMention(mention); + } + } + }*/ + if (tok.getCtag().matches(Constants.MORPHO_CTAGS) || tok.getCtag().equals("num")) { + Mention mention = buildMentionFromSubtree(tok); + if (mention != null && !sentence.getMentions().contains(mention)) { + sentence.addMention(mention); + } + } + } + } + + private static Mention buildMentionFromSubtree(Token head) { + List<Token> heads = new ArrayList<Token>(); + List<Token> segments = new ArrayList<Token>(); + heads.add(head); + //segments.add(head); + segments.addAll(getTreeSegments(head)); + Collections.sort(segments); + Mention mention = null; + try { + segments = removeBorderingSegments(segments, Arrays.asList("qub", "interp")); + if (!segments.isEmpty()) { + mention = new Mention(segments, heads); + } + } catch (ArrayIndexOutOfBoundsException e) { + logger.warn("Strange dependency structure"); + } + return mention; + } + + private static List<Token> removeBorderingSegments(List<Token> segments, List<String> tags2Remove) { + Token firstSeg = segments.get(0); + while(tags2Remove.contains(firstSeg.getCtag())) { + segments.remove(firstSeg); + if (segments.isEmpty()) { + return segments; + } + firstSeg = segments.get(0); + } + + Token lastSeg = segments.get(segments.size() - 1); + while(tags2Remove.contains(lastSeg.getCtag())) { + segments.remove(lastSeg); + if (segments.isEmpty()) { + return segments; + } + lastSeg = segments.get(segments.size() - 1); + } + + return segments; + } + + private static List<Token> removePrecedingAdjs(List<Token> segments) { + Token firstSeg = segments.get(0); + while(firstSeg.getCtag().equals("adj")) { + segments.remove(firstSeg); + if (segments.isEmpty()) { + return segments; + } + firstSeg = segments.get(0); + } + return segments; + } + + private 
static HashSet<Token> getTreeSegments(Token tok) { + HashSet<Token> segments = new HashSet<Token>(); + segments.add(tok); + for (Relation rel : tok.getRelations()) { + segments.addAll(getTreeSegments(rel.getTarget())); + } + return segments; + } + + + private static final List<String> DEF_CONJS_ORTHS = + Arrays.asList(//"to", + "to jest", "jest to", "zwane inaczej", "czyli", "inaczej mówiąc", + "inaczej nazywane", "zwane też", "zwane także", "zwane również", "zwane często", + "zwane zwykle", "definiowane jako", "znaczy tyle co", "rozumiane jako", "rozumiane jest", + "ktoś kto", "coś co", "nazywa się", "tak definiuje się"); + + private static final List<String> DEF_CONJS_BASES = + Arrays.asList(//"to", + "to być", "być to", "zwać inaczej", "czyli", "inaczej mówić", + "inaczej nazywać", "zwać też", "zwać także", "zwać również", "zwać często", + "zwać zwykle", "definiować jako", "znaczyć tyle co", "rozumieć jako", "rozumieć być", + "ktoś kto", "kto być kto", + "coś co", "co być co", + "nazywać się", "tak definiować się"); + + + private static final List<String> ANN_SOURCE_TO_OMMIT = + Arrays.asList("pan", "pani"); + + + private static void getDefinitionsByGroups(Sentence sentence, String form, PrintWriter defsWriter) { + List<String> def_conjs = DEF_CONJS_ORTHS; + if (form.equals("base")) { + def_conjs = DEF_CONJS_BASES; + } + for (SyntacticGroup group : sentence.getGroups()) { + if (group.getType().startsWith("NG")) { + SyntacticGroup nextGroup = group.getClosestNGroup(); + + if (nextGroup != null) { + int conjStart = group.getSentenceEndPosition() + 1; + int conjEnd = nextGroup.getSentenceStartPosition() - 1; + String conj = ""; + if (conjEnd > conjStart && (group.containsNE() || nextGroup.containsNE())) { + conj = getText(sentence, conjStart, conjEnd, form); + if (def_conjs.contains(conj)) { + String definition = String.format("%s\t[%s%s%s]\t%s\t%s", + group.toString(), + conj, "/groups/", form, + nextGroup.toString(), + sentence.toStringWithoutMentions()); + 
defsWriter.println(definition); + } + } + } + + } + } + } + + private static void getDefinitionsByMentions(Sentence sentence, String form, PrintWriter defsWriter) { + List<String> def_conjs = DEF_CONJS_ORTHS; + if (form.equals("base")) { + def_conjs = DEF_CONJS_BASES; + } + for (Mention mnt1 : sentence.getMentions()) { + int mnt1End = mnt1.getSentenceEndPosition(); + for (Mention mnt2 : sentence.getMentions()) { + int mnt2Start = mnt2.getSentenceStartPosition(); + int conjStart = mnt1End + 1; + int conjEnd = mnt2Start - 1; + if (conjEnd > conjStart) { + String conj = getText(sentence, conjStart, conjEnd, form); + if (def_conjs.contains(conj)) { + String definition = String.format("%s\t[%s%s%s]\t%s\t%s", + mnt1.toStringWithoutBrackets(), + conj, "/mentions/", form, + mnt2.toStringWithoutBrackets(), + sentence.toStringWithoutMentions()); + defsWriter.println(definition); + } + } + } + } + } + + /*==> buildDefinitionsFromSubtree, + zwrocic dla drzewa o korzeniu subj, wszystkie poddrzewa + rozpoczynane relacja app, to co pod samym subj, to keyword: + patrz zdanie: + + Dr David Warner , neurofizjolog Akademii Medycznej Loma Linda w Kalifornii , wspólnie + ze specjalistami z Uniwersytetu Stanforda opracował urządzenie reagujące na ruchy mięśni twarzy . 
+ */ + + private static void getDefinitionsByDeppParse(Sentence sentence, PrintWriter defsWriter) { + + // podzielic mention przez relacje apozycji + + for (Token source : sentence) { + HashSet<Relation> relations = source.getRelations(); + for (Relation rel : relations) { + if (//Constants.DEPPARSE_MLABELS.contains(rel.getName()) + //rel.getName().equals("subj") + rel.getName().equals("app") && + source.getReturnRelation() != null && + //Constants.DEPPARSE_MLABELS.contains(source.getReturnRelation().getName()) + ((source.getCase().equals("nom") && rel.getTarget().getCase().equals("nom") + && source.getNumber().equals(rel.getTarget().getNumber()) + && source.getGender().equals(rel.getTarget().getGender()) + && !source.isPpron() && !rel.getTarget().isPpron()) + //|| source.getCtag().equals("brev") + ) //cos z tym brevem zrobic trzeba + ) { + ArrayList<List<Token>> appositions = getAppositionsFromSubtree(source, rel.getTarget()); + if (appositions.size() > 1 && containsNE(appositions)) { + appositions = mergeNEs(appositions); + } + if (appositions.size() > 1 && containsNE(appositions)) { + ArrayList<String> appsStrList = appositionsToString(appositions); + String appositionsStr = String.join("\t", appsStrList); + + String definition = String.format("%s\t!!!!!\t%s", + //source.getOrth(), + appositionsStr, + sentence.toStringWithoutMentions()); + defsWriter.println(definition); + } + } + } + } + } + + private static ArrayList<List<Token>> getAppositionsFromSubtree(Token root) { + + ArrayList<List<Token>> appositions = new ArrayList<List<Token>>(); + + List<Token> segments = new ArrayList<Token>(); + segments.addAll(getTreeSegments(root, "app")); + List<Token> allSegments = new ArrayList<Token>(); + allSegments.addAll(extendByNEs(segments)); + + Collections.sort(allSegments); + if (!ommitApp(allSegments)) { + appositions.add(allSegments); + } + + + + for (Token tok : allSegments) { + for (Relation rel : tok.getRelations()) { + if (rel.getName().equals("app") && 
!sameNE(tok, rel.getTarget())) { + appositions.addAll(getAppositionsFromSubtree(rel.getTarget())); + } + } + } + + return appositions; + } + + private static ArrayList<List<Token>> getAppositionsFromSubtree(Token source, Token target) { + + ArrayList<List<Token>> appositions = new ArrayList<List<Token>>(); + if (sameNE(source, target)) { + return appositions; + } + + List<Token> sourceSegments = new ArrayList<Token>(); + sourceSegments.addAll(getTreeSegments(source, target)); + List<Token> allSourceSegments = new ArrayList<Token>(); + allSourceSegments.addAll(extendByNEs(sourceSegments)); + + Collections.sort(allSourceSegments); + if (!ommitApp(allSourceSegments)) { + appositions.add(allSourceSegments); + } + + List<Token> targetSegments = new ArrayList<Token>(); + targetSegments.addAll(getTreeSegments(target)); + List<Token> allTargetSegments = new ArrayList<Token>(); + allTargetSegments.addAll(extendByNEs(targetSegments)); + + Collections.sort(allTargetSegments); + if (!ommitApp(allTargetSegments)) { + appositions.add(allTargetSegments); + } + + return appositions; + } + + private static ArrayList<List<Token>> mergeNEs(ArrayList<List<Token>> appositions) { + ArrayList<List<Token>> appositionsCopy = new ArrayList<List<Token>>(appositions); + Sentence sentence = appositions.get(0).get(0).getSentence(); + for (NamedEntity ne : sentence.getNamedEntities()) { + if (ne.getType().equals("persName") + && (ne.getSubtype() == null || ne.getSubtype().isEmpty())) { + HashSet<Token> mergedNE = new HashSet<Token>(); + for (List<Token> app : appositionsCopy) { + if (ne.getTokens().containsAll(app)) { + mergedNE.addAll(app); + appositions.remove(app); + } + } + if (mergedNE.size() > 0) { + ArrayList newApposition = new ArrayList<Token>(); + newApposition.addAll(mergedNE); + Collections.sort(newApposition); + appositions.add(newApposition); + } + appositionsCopy = new ArrayList<List<Token>>(appositions); + } + } + return appositions; + } + + public static boolean 
containsNE(ArrayList<List<Token>> appositions) { + for (List<Token> app : appositions) { + if (isNE(app)) { + return true; + } + /*for (Token tok : app) { + for (NamedEntity ne : sentence.getNamedEntities()) { + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) { + continue; + } + if (ne.getTokens().contains(tok)) { + return true; + } + } + }*/ + } + return false; + } + + private static boolean isNE(List<Token> segments) { + Sentence sentence = segments.get(0).getSentence(); + for (NamedEntity ne : sentence.getNamedEntities()) { + if (ne.getTokens().containsAll(segments) && + segments.containsAll(ne.getTokens())) { + return true; + } + } + return false; + } + + private static ArrayList<String> appositionsToString(ArrayList<List<Token>> appositions) { + ArrayList<String> apposistionsStrs = new ArrayList<String>(); + for (List<Token> apposition : appositions) { + String appText = getText(apposition, "orth"); + apposistionsStrs.add(appText); + } + return apposistionsStrs; + } + + + private static boolean ommitApp(List<Token> segments) { + segments = removeBorderingSegments(segments, Arrays.asList("interp")); + if (segments.size() == 0) { + return true; + } + String appositionBase = getText(segments, "base"); + if (ANN_SOURCE_TO_OMMIT.contains(segments.get(0).getBase().toLowerCase()) || + appositionBase.length() < 2) { + return true; + } + return false; + } + + private static HashSet<Token> getTreeSegments(Token tok, String divRel) { + HashSet<Token> segments = new HashSet<Token>(); + segments.add(tok); + + for (Relation rel : tok.getRelations()) { + if (!rel.getName().equals(divRel)) { + segments.addAll(getTreeSegments(rel.getTarget(), divRel)); + } + + } + return segments; + } + + private static HashSet<Token> getTreeSegments(Token tok, Token nGoThere) { + HashSet<Token> segments = new HashSet<Token>(); + segments.add(tok); + + for (Relation rel : tok.getRelations()) { + if (!rel.getTarget().equals(nGoThere)) { + 
segments.addAll(getTreeSegments(rel.getTarget(), nGoThere)); + } + + } + return segments; + } + + private static HashSet<Token> extendByNEs(List<Token> segments) { + HashSet<Token> allSegments = new HashSet<Token>(); + allSegments.addAll(segments); + for (Token tok : segments) { + Token neTok = tok; + while (neTok.getReturnRelation() != null + && (//neTok.getReturnRelation().getName().equals("ne") + //|| + sameNE(neTok, neTok.getReturnRelation().getTarget()))) { + neTok = neTok.getReturnRelation().getTarget(); + allSegments.add(neTok); + } + } + return allSegments; + } + + private static boolean sameNE(Token tok1, Token tok2) { + Sentence sentence = tok1.getSentence(); + for (NamedEntity ne : sentence.getNamedEntities()) { + if (ne.getTokens().contains(tok1) + && ne.getTokens().contains(tok2) + && ne.getType().equals("persName")) { + return true; + } + } + return false; + } + + // TODO: przeniesc do klasy Sentence i wywalic static + private static String getText(Sentence sentence, int start, int end, String form) { + String conj = ""; + for (Token tok : sentence.subList(start, end+1)) { + if (!tok.getCtag().equals("interp")) { + if (form.equals("orth")) { + conj += " " + tok.getOrth(); + } else if (form.equals("base")) { + conj += " " + tok.getBase(); + } + } + } + return conj.trim(); + } + + private static String getText(List<Token> segments, String form) { + String conj = ""; + for (Token tok : segments) { + if (form.equals("orth")) { + conj += " " + tok.getOrth(); + } else if (form.equals("base")) { + conj += " " + tok.getBase(); + } + } + return conj.trim(); + } + } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java index 376b415..32bfd84 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java @@ -35,7 +35,7 @@ public class 
InstanceCreator { allTexts++; logger.info("Processing text " + textDir); TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); - Text text = TeiLoader.loadTextFromTei(ct); + Text text = TeiLoader.loadTextFromTei(ct, textDir); for (Paragraph p : text) for (Sentence s : p) { diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java index 120d94b..6e03a8c 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java @@ -83,6 +83,14 @@ public class Mention implements Comparable<Mention> { sb.append("]"); return sb.toString(); } + + public String toStringWithoutBrackets() { + StringBuffer sb = new StringBuffer(); + for (Token seg : segments) { + sb.append(seg.toString() + " "); + } + return sb.toString(); + } public MentionGroup getMentionGroup() { return mentionGroup; diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java index 15bb533..aa1996c 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java @@ -6,14 +6,26 @@ import java.util.List; public class NamedEntity implements Comparable<NamedEntity> { private List<Token> tokens; + private String type; + private String subtype; - public NamedEntity(List<Token> tokens) { + public NamedEntity(List<Token> tokens, String type, String subType) { this.tokens = tokens; + this.type = type; + this.subtype = subType; } public List<Token> getTokens() { return this.tokens; } + + public String getType() { + return this.type; + } + + public String getSubtype() { + return this.subtype; + } @Override public int compareTo(NamedEntity o) { diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java 
index b10fa3d..7390a1b 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java @@ -1,6 +1,7 @@ package pl.waw.ipipan.zil.core.md.entities; import java.util.ArrayList; +import java.util.Arrays; import java.util.Iterator; import java.util.List; @@ -143,6 +144,30 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { return largestGroup; } + public SyntacticGroup getClosestNGroup() { + SyntacticGroup nextGroup = null; + Sentence sentence = this.tokens.get(0).getSentence(); + int nextTokenPosition = this.getSentenceEndPosition() + 1; + while (nextTokenPosition <= sentence.size()) { + + for (SyntacticGroup group : sentence.getGroups()) { + if (group.getType().startsWith("NG") && + group.getSentenceStartPosition() == nextTokenPosition) { + if (nextGroup == null || + nextGroup.getTokens().size() < group.getTokens().size()) { + nextGroup = group; + } + } + } + if (nextGroup != null) { + break; + } + nextTokenPosition ++; + } + + return nextGroup; + } + public SyntacticWord getPrecedingVerb() { int precedingTokenPosition = this.getSentenceStartPosition() - 1; Sentence sentence = this.tokens.get(0).getSentence(); @@ -190,5 +215,28 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { } return parentPrepNG; } + + public String toString() { + String textRep = ""; + for (Token tok : tokens) { + textRep += " " + tok.getOrth(); + } + return textRep.trim(); + } + + public boolean containsNE() { + Sentence sentence = this.tokens.get(0).getSentence(); + for (Token tok : tokens) { + for (NamedEntity ne : sentence.getNamedEntities()) { + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) { + continue; + } + if (ne.getTokens().contains(tok)) { + return true; + } + } + } + return false; + } } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java index 
6f5510d..596665f 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java @@ -7,6 +7,8 @@ public class Token implements Comparable<Token> { private int sentencePosition; private Set<Mention> mentions = null; + private HashSet<Relation> relations = new HashSet<Relation>(); + private Relation returnRelation = null; private String orth; private Interpretation chosenInterpretation; @@ -119,10 +121,33 @@ public class Token implements Comparable<Token> { public String getCtag() { return getChosenInterpretation().getCtag(); } + + public boolean isPpron() { + if (this.getCtag().startsWith("ppron")) { + return true; + } + return false; + } @Override public int compareTo(Token o) { return getSentencePosition().compareTo(o.getSentencePosition()); } + + public void addRelation(Relation relation) { + relations.add(relation); + } + + public HashSet<Relation> getRelations() { + return relations; + } + + public void setReturnRelation(Relation relation) { + returnRelation = relation; + } + + public Relation getReturnRelation() { + return returnRelation; + } } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java index e3216a5..df7a1ef 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java @@ -8,11 +8,19 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException; import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import org.json.JSONArray; +import org.json.JSONObject; + + public class TeiLoader { private static Logger logger = LoggerFactory.getLogger(TeiLoader.class); @@ 
-24,28 +32,75 @@ public class TeiLoader { public static TEICorpusText readTeiText(File teiDir) throws TEIException { return teiAPI.readFromNKJPDirectory(teiDir); } - - public static Text loadTextFromTei(TEICorpusText teiText) { + + public static Text loadTextFromTei(TEICorpusText teiText, File textDir) { Text text = new Text(teiText.getCorpusHeader().getId()); + + String textId = textDir.getName(); + + logger.debug("Looking for depparse layer of text " + textId); + + byte[] encoded; + JSONArray jsonParagraphs = null; + try { + //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json")); + encoded = Files.readAllBytes(Paths.get(System.getProperty("md.depparse.dir", "/home/bartek/Projekty/COTHEC/clients_data/"), textId+".json")); /* directory can be overridden with -Dmd.depparse.dir; the default is the previous hard-coded path */ + String jsonContent = new String(encoded, StandardCharsets.UTF_8); + JSONObject jsonObject = new JSONObject(jsonContent); + + jsonParagraphs = jsonObject.getJSONArray("paragraphs"); + } catch (IOException e) { + /* the depparse JSON could not be read: jsonParagraphs stays null and every paragraph is loaded with a null jsonP */ + //e.printStackTrace(); + logger.debug("No depparse layer."); + } logger.debug("Loading tei text " + text.getId() + "..."); - for (TEIParagraph teiP : teiText.getParagraphs()) - loadParagraph(text, teiP); + + List<TEIParagraph> teiParagraphs = teiText.getParagraphs(); + + for (int i=0; i < teiParagraphs.size(); i++) { + TEIParagraph teiP = teiParagraphs.get(i); + JSONObject jsonP = null; + if (jsonParagraphs != null && i < jsonParagraphs.length()) { /* bounds guard: the JSON may hold fewer paragraphs than the TEI text */ + jsonP = new JSONObject(jsonParagraphs.get(i).toString()); + } + loadParagraph(text, teiP, jsonP); + } logger.debug("Tei text loaded."); return text; } - private static void loadParagraph(Text text, TEIParagraph teiP) { + private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) { Paragraph p = new Paragraph(); text.add(p); - for (TEISentence teiS : teiP.getSentences()) - loadSentence(p, teiS); + + List<TEISentence> teiSentences = teiP.getSentences(); + + JSONArray jsonSentences = null; + if (jsonP != null) { + jsonSentences = 
jsonP.getJSONArray("sentences"); + } + + for (int i=0; i < teiSentences.size(); i++) { + TEISentence teiS = teiSentences.get(i); + + JSONObject jsonS = null; + if (jsonP != null) { + if (i < jsonSentences.length()) { + jsonS = new JSONObject(jsonSentences.get(i).toString()); + } + } + + loadSentence(p, teiS, jsonS); + } } - private static void loadSentence(Paragraph p, TEISentence teiS) { + private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) { Sentence s = new Sentence(); p.add(s); + Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>(); for (TEIMorph teiM : teiS.getMorphs()) { Token token = loadToken(s, teiM); @@ -59,6 +114,33 @@ public class TeiLoader { loadSyntacticGroup(s, g, teiMorph2Segment); for (TEIMention m : teiS.getAllMentions()) loadMentions(s, m, teiMorph2Segment); + + if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) { + JSONArray relations = jsonS.getJSONArray("dependencyParse"); + for (int i=0; i<relations.length(); i++) { + loadRelation(s, new JSONObject(relations.get(i).toString())); + } + } else { + //System.out.println(s.toStringWithoutMentions()); + } + } + + private static void loadRelation(Sentence s, JSONObject jsonRelation) { + String label = jsonRelation.getString("label"); + if (!label.equals("root") && jsonRelation.get("startTokenId").getClass() == String.class && + !jsonRelation.getString("startTokenId").isEmpty()) { /* the type is checked first so getString is only called on a String-valued id */ + String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\."); + String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\."); + /* the last dot-separated part of each id is parsed and used as the index passed to s.get(); each array is indexed with its own length */ + int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]); + int targetId = Integer.parseInt(targetIdParts[targetIdParts.length-1]); + + Token source = s.get(sourceId); + Token target = s.get(targetId); + + source.addRelation(new Relation(label, target)); + target.setReturnRelation(new Relation(label, source)); + } } private static void loadMentions(Sentence s, TEIMention m, @@ -107,7 +189,7 @@ 
public class TeiLoader { List<Token> tokens = new ArrayList<>(); for (TEIMorph m : ne.getLeaves()) tokens.add(teiMorph2Segment.get(m)); - s.addNamedEntity(new NamedEntity(tokens)); + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype())); } private static Token loadToken(Sentence s, TEIMorph teiM) { diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java index 2676122..f2bb9cd 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java @@ -86,7 +86,7 @@ public class ThriftLoader { Map<String, Token> thiftTokenId2Token) { List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity, thiftTokenId2Token, false); - s.addNamedEntity(new NamedEntity(tokens)); + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype())); } private static Map<String, Object> getThriftId2EntityMap( diff --git a/src/main/resources/head_model.bin b/src/main/resources/head_model.bin new file mode 100644 index 0000000..0385daa Binary files /dev/null and b/src/main/resources/head_model.bin differ diff --git a/src/main/resources/nominal_model.bin b/src/main/resources/nominal_model.bin new file mode 100644 index 0000000..a216fa7 Binary files /dev/null and b/src/main/resources/nominal_model.bin differ -- libgit2 0.22.2