diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java
index 2a8af13..4740d94 100644
--- a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java
+++ b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java
@@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import pl.waw.ipipan.zil.core.md.detection.Detector;
 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
 import pl.waw.ipipan.zil.core.md.entities.Text;
@@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
 import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
 
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
 
 public class Main {
 
@@ -26,12 +33,187 @@ public class Main {
 
     private static final boolean GZIP_OUTPUT = true;
     private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
+    private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt";
+    private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt";
+    private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt";
 
     private static ZeroSubjectDetector zeroSubjectModel;
+
+    /** Identifies one of the valence dictionaries loaded when this class is initialised. */
+    public enum ValenceDicts {
+        VerbsValence,
+        NounsValence
+    }
+
+    // Valence dictionaries keyed by kind; each maps a lemma to its schema strings.
+    private static final Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
+            new EnumMap<ValenceDicts,Map<String,ArrayList<String>>>(ValenceDicts.class);
+
+    // Data lines of the COMPLEX_PREPS resource, in file order.
+    private static final ArrayList<String> complexPreps;
 
     static {
         InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
         zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
+
+        InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
+        valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));
+
+        InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
+        valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
+
+        InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS);
+        complexPreps = readValues(complexPrepositionsStream);
+    }
+
+    /**
+     * Wraps a resource stream in a buffered reader that decodes UTF-8.
+     *
+     * @param stream stream to read; {@code null} is what {@code getResourceAsStream} returns
+     *               for a missing resource
+     * @return buffered UTF-8 reader over {@code stream}
+     * @throws NullPointerException if {@code stream} is {@code null}
+     * @throws IOException if the UTF-8 charset is not supported
+     */
+    private static BufferedReader newUtf8Reader(InputStream stream) throws IOException {
+        if (stream == null) {
+            throw new NullPointerException("resource stream is null (resource missing from classpath?)");
+        }
+        // The platform default charset is not guaranteed to decode the Polish text and the
+        // leading BOM correctly, so the charset is named explicitly.
+        // NOTE(review): assumes the bundled resources are UTF-8 encoded - confirm.
+        return new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+    }
+
+    /**
+     * Reads a Walenty valence dictionary into a map from lemma to its schemata.
+     * <p>
+     * Lines starting with {@code %} are skipped. Every other line is split on {@code ':'}: the
+     * first field is the lemma, the sixth field is the schema. Lines with fewer than six fields
+     * (for example blank lines) or with an empty schema are skipped. A single-word lemma whose
+     * schema lists {@code refl} or {@code recip} gets {@code " się"} appended.
+     *
+     * @param walentySchemataStream dictionary text; closed before this method returns
+     * @return map from lemma to its schemata, in file order
+     * @throws NullPointerException if {@code walentySchemataStream} is {@code null}
+     * @throws RuntimeException wrapping the {@link IOException} when reading fails
+     */
+    public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream)
+    {
+        Map<String,ArrayList<String>> map = new HashMap<String,ArrayList<String>>();
+        try {
+            BufferedReader br = newUtf8Reader(walentySchemataStream);
+            try {
+                String line;
+                boolean firstLine = true;
+                while ((line = br.readLine()) != null) {
+                    if (firstLine) {
+                        line = line.replace("\uFEFF", ""); // remove BOM character
+                        firstLine = false;
+                    }
+                    if (line.startsWith("%")) {
+                        continue;
+                    }
+
+                    String[] lineParts = line.split(":");
+                    if (lineParts.length < 6) {
+                        continue; // no schema field on this line
+                    }
+                    String lemma = lineParts[0].trim();
+                    String schema = lineParts[5].trim();
+                    if (schema.isEmpty()) {
+                        continue;
+                    }
+
+                    if (lemma.split(" ").length == 1 && schemaContainsSie(schema)) {
+                        lemma = lemma + " się";
+                    }
+
+                    ArrayList<String> schemata = map.get(lemma);
+                    if (schemata == null) {
+                        schemata = new ArrayList<String>();
+                        map.put(lemma, schemata);
+                    }
+                    schemata.add(schema);
+                }
+            } finally {
+                br.close(); // also runs when a line fails to parse
+            }
+        } catch (IOException ex) {
+            throw new RuntimeException("Cannot read Walenty dictionary", ex);
+        }
+        return map;
+    }
+
+    /**
+     * Tells whether any position of a schema lists the {@code refl} or {@code recip} phrase type.
+     *
+     * @param schema schema text whose positions are separated by {@code " + "}
+     * @return {@code true} if one of the positions lists {@code refl} or {@code recip}
+     */
+    private static boolean schemaContainsSie(String schema) {
+        for (String position : schema.split("\\s\\+\\s")) {
+            for (String phrT : phraseTypes(position)) {
+                if (phrT.equals("refl") || phrT.equals("recip")) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Splits the braces-enclosed part of one schema position into its phrase types.
+     * Text in front of the opening brace is ignored, so {@code {refl}} and {@code obj{refl}}
+     * both yield {@code refl}; a position without a {@code {...}} part yields nothing.
+     */
+    private static String[] phraseTypes(String position) {
+        int open = position.indexOf('{');
+        int close = position.lastIndexOf('}');
+        if (open < 0 || close < open) {
+            return new String[0];
+        }
+        return position.substring(open + 1, close).split(";");
+    }
+
+    /**
+     * Reads a list of values, one per line.
+     * <p>
+     * Lines starting with {@code %} are skipped; the remaining lines are trimmed and the
+     * non-empty ones are returned in file order.
+     *
+     * @param stream text to read; closed before this method returns
+     * @return the non-empty, trimmed data lines
+     * @throws NullPointerException if {@code stream} is {@code null}
+     * @throws RuntimeException wrapping the {@link IOException} when reading fails
+     */
+    public static ArrayList<String> readValues(InputStream stream) {
+        ArrayList<String> values = new ArrayList<String>();
+        try {
+            BufferedReader br = newUtf8Reader(stream);
+            try {
+                String line;
+                boolean firstLine = true;
+                while ((line = br.readLine()) != null) {
+                    if (firstLine) {
+                        line = line.replace("\uFEFF", ""); // remove BOM character
+                        firstLine = false;
+                    }
+                    if (line.startsWith("%")) {
+                        continue;
+                    }
+                    String value = line.trim();
+                    if (!value.isEmpty()) {
+                        values.add(value);
+                    }
+                }
+            } finally {
+                br.close();
+            }
+        } catch (IOException ex) {
+            throw new RuntimeException("Cannot read value list", ex);
+        }
+        return values;
     }
 
     private Main() {
@@ -71,6 +253,8 @@ public class Main {
                 return;
             }
         }
+
+
         int all = 0;
         int errors = 0;
 
@@ -122,7 +306,7 @@ public class Main {
      */
     public static void annotateThriftText(TText thriftText) throws MultiserviceException {
         Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
-        Detector.findMentionsInText(responseText, zeroSubjectModel);
+        Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
         ThriftSaver.updateThriftText(responseText, thriftText);
     }
 
@@ -135,7 +319,7 @@ public class Main {
      */
     public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
         Text responseText = TeiLoader.loadTextFromTei(teiText);
-        Detector.findMentionsInText(responseText, zeroSubjectModel);
+        Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
         TeiSaver.updateTeiText(responseText, teiText);
     }
 
diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
index 94eaedc..23a83b3 100644
--- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
+++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
@@ -1,12 +1,17 @@
 package pl.waw.ipipan.zil.core.md.detection;
 
+import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
 import pl.waw.ipipan.zil.core.md.entities.Mention;
 import 
pl.waw.ipipan.zil.core.md.entities.Sentence;
+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
 import pl.waw.ipipan.zil.core.md.entities.Token;
 
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 public class Cleaner {
@@ -125,4 +130,178 @@ public class Cleaner {
         else
             return m1;
     }
+
+    /**
+     * Removes mentions that span two separate arguments of the verb preceding them.
+     * <p>
+     * For each mention the first and the last syntactic group inside its span are looked up.
+     * The mention is removed when these are two different groups and some schema of the verb
+     * returned by {@code getPrecedingVerb()} of the first group has two different positions,
+     * one for a Walenty realization of each group.
+     *
+     * @param sentence sentence whose mentions are filtered in place
+     * @param verbsValence map from verb base form to its Walenty schemata
+     */
+    public static void cleanWalentyFramedMentions(Sentence sentence,
+            Map<String,ArrayList<String>> verbsValence) {
+        ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+        for (Mention mention : sentence.getMentions()) {
+            if (spansTwoVerbArguments(sentence, mention, verbsValence)) {
+                mentionsToRemove.add(mention);
+            }
+        }
+        removeMentions(sentence, mentionsToRemove);
+    }
+
+    /** Tells whether a mention covers two groups that a schema of the preceding verb keeps apart. */
+    private static boolean spansTwoVerbArguments(Sentence sentence, Mention mention,
+            Map<String,ArrayList<String>> verbsValence) {
+        int mentionStart = mention.getFirstSegment().getSentencePosition();
+        int mentionEnd = mention.getLastSegment().getSentencePosition();
+        SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);
+        SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);
+        if (startGroup == null || endGroup == null || startGroup.compareTo(endGroup) == 0) {
+            return false;
+        }
+
+        SyntacticWord verb = startGroup.getPrecedingVerb();
+        // NOTE(review): "mieć" is skipped on purpose; the reason is not visible in this patch.
+        if (verb == null || verb.getBase().equals("mieć")
+                || !verbsValence.containsKey(verb.getBase())) {
+            return false;
+        }
+
+        ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();
+        ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();
+        for (String schema : verbsValence.get(verb.getBase())) {
+            if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Removes every listed mention from the sentence. */
+    private static void removeMentions(Sentence sentence, List<Mention> mentionsToRemove) {
+        for (Mention mentionToRemove : mentionsToRemove) {
+            sentence.removeMention(mentionToRemove);
+        }
+    }
+
+    /**
+     * Tells whether a schema has two different positions, one listing a phrase type from
+     * {@code group1Types} and the other a phrase type from {@code group2Types}.
+     */
+    private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
+            ArrayList<String> group2Types) {
+        for (String group1Type : group1Types) {
+            for (String group2Type : group2Types) {
+                if (schemaContains(schema, group1Type, group2Type)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Tells whether {@code phraseType1} and {@code phraseType2} can be matched to two different
+     * positions of a schema.
+     * <p>
+     * Positions are separated by {@code '+'}; the phrase types of a position are the
+     * {@code ';'}-separated items between its braces. A position that lists both phrase types
+     * can stand for either of them, but not for both at once.
+     */
+    private static boolean schemaContains(String schema, String phraseType1,
+            String phraseType2) {
+        int onlyFirst = 0;
+        int onlySecond = 0;
+        int both = 0;
+        for (String position : schema.split("\\+")) {
+            boolean hasFirst = false;
+            boolean hasSecond = false;
+            for (String phrT : phraseTypes(position)) {
+                hasFirst = hasFirst || phrT.equals(phraseType1);
+                hasSecond = hasSecond || phrT.equals(phraseType2);
+            }
+            if (hasFirst && hasSecond) {
+                both++;
+            } else if (hasFirst) {
+                onlyFirst++;
+            } else if (hasSecond) {
+                onlySecond++;
+            }
+        }
+        // Two distinct positions exist when each type has a position of its own, or when a
+        // position listing both can be paired with any other matching position.
+        return (onlyFirst > 0 && onlySecond > 0)
+                || (both > 0 && onlyFirst + onlySecond + both > 1);
+    }
+
+    /**
+     * Splits the part of a schema position between its braces into phrase types.
+     * NOTE(review): text in front of the opening brace (presumably a label such as
+     * {@code subj}) is ignored - confirm against the dictionary format.
+     */
+    private static String[] phraseTypes(String position) {
+        int open = position.indexOf('{');
+        int close = position.lastIndexOf('}');
+        if (open < 0 || close < open) {
+            return new String[0];
+        }
+        return position.substring(open + 1, close).split(";");
+    }
+
+    /**
+     * Removes mentions for which {@code Mention.isPartOfQub()} holds, e.g. "wszystkim" as part
+     * of "przede wszystkim" (Qub).
+     */
+    public static void cleanQubs(Sentence sentence) {
+        ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+        for (Mention mention : sentence.getMentions()) {
+            if (mention.isPartOfQub()) {
+                mentionsToRemove.add(mention);
+            }
+        }
+        removeMentions(sentence, mentionsToRemove);
+    }
+
+    /** Removes mentions for which {@code Mention.isPartOfPrep()} holds. */
+    public static void cleanPreps(Sentence sentence) {
+        ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+        for (Mention mention : sentence.getMentions()) {
+            if (mention.isPartOfPrep()) {
+                mentionsToRemove.add(mention);
+            }
+        }
+        removeMentions(sentence, mentionsToRemove);
+    }
+
+    /** Removes mentions for which {@code Mention.isPartOfFrazeo()} holds. */
+    public static void cleanFrazeos(Sentence sentence) {
+        ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+        for (Mention mention : sentence.getMentions()) {
+            if (mention.isPartOfFrazeo()) {
+                mentionsToRemove.add(mention);
+            }
+        }
+        removeMentions(sentence, mentionsToRemove);
+    }
+
+    /**
+     * Removes mentions that are part of a complex preposition.
+     *
+     * @param complexPreps complex prepositions passed on to {@code Mention.isPartOfComplexPrep}
+     */
+    public static void cleanComplexPreps(Sentence sentence,
+            ArrayList<String> complexPreps) {
+        ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+        for (Mention mention : sentence.getMentions()) {
+            if (mention.isPartOfComplexPrep(complexPreps)) {
+                mentionsToRemove.add(mention);
+            }
+        }
+        removeMentions(sentence, mentionsToRemove);
+    }
+
 }
diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
index 90ae83f..c8c89b2 100644
--- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
+++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
@@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
 import pl.waw.ipipan.zil.core.md.entities.*;
 
 import java.util.ArrayList;
 import java.util.HashSet;
 import 
java.util.List;
+import java.util.Map;
 import java.util.Set;
 
 public class Detector {
@@ -18,21 +21,25 @@ public class Detector {
     }
 
     public static void findMentionsInText(Text text,
-                                          ZeroSubjectDetector zeroSubjectModel) {
+                                          ZeroSubjectDetector zeroSubjectModel,
+                                          Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+                                          ArrayList<String> complexPreps) {
         text.clearMentions();
         logger.debug("Detecting mentions in text " + text.getId());
         for (Paragraph p : text)
             for (Sentence s : p)
-                detectMentionsInSentence(s, zeroSubjectModel);
+                detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps);
     }
 
     private static void detectMentionsInSentence(Sentence sentence,
-                                                 ZeroSubjectDetector zeroSubjectModel) {
+                                                 ZeroSubjectDetector zeroSubjectModel,
+                                                 Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+                                                 ArrayList<String> complexPreps) {
         // adding mentions
         addMentionsByTokenCtag(sentence);
         addMentionsBySyntacticWordsCtag(sentence);
         addMentionsByNamedEntities(sentence);
-        addMentionsByGroups(sentence);
+        addMentionsByGroups(sentence, valence, complexPreps);
         addSpeakerMentionsInSpoken(sentence);
 
         // zero subject detection
@@ -41,6 +48,11 @@ public class Detector {
         // removing mentions
         removeTo(sentence);
         Cleaner.cleanUnnecessarySentenceMentions(sentence);
+        //Cleaner.cleanQubs(sentence);
+        //Cleaner.cleanPreps(sentence);
+        //Cleaner.cleanComplexPreps(sentence, complexPreps);
+        Cleaner.cleanFrazeos(sentence);
+        Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
 
         // updating mention heads
         updateMentionHeads(sentence);
@@ -95,16 +107,148 @@ public class Detector {
      *
      * @param sentence
      */
-    private static void addMentionsByGroups(Sentence sentence) {
-        for (SyntacticGroup group : sentence.getGroups()) {
-            if (group.getType().startsWith("NG")) {
-                List<Token> segments = group.getTokens();
-                List<Token> heads = group.getSemanticHeadTokens();
-
-                sentence.addMention(new Mention(segments, heads));
-            }
-        }
-    }
+    private static void addMentionsByGroups(Sentence sentence,
+            Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+            ArrayList<String> complexPreps) {
+        // complexPreps is not consulted here; it is kept because the caller passes it.
+        Map<String,ArrayList<String>> nounsValence = valence.get(ValenceDicts.NounsValence);
+        for (SyntacticGroup thisGroup : sentence.getGroups()) {
+            if (!thisGroup.getType().startsWith("NG")) {
+                continue;
+            }
+
+            List<Token> heads = thisGroup.getSemanticHeadTokens();
+            SyntacticGroup nextGroup = thisGroup.getFollowingGroup();
+            if (isNounArgument(thisGroup, nextGroup, sentence, nounsValence)) {
+                // the following group fills a valence slot of this group's head noun:
+                // one mention spans both groups, headed by this group's head
+                List<Token> segments = new ArrayList<Token>();
+                segments.addAll(thisGroup.getTokens());
+                segments.addAll(nextGroup.getTokens());
+                sentence.addMention(new Mention(segments, heads));
+            } else {
+                sentence.addMention(new Mention(thisGroup.getTokens(), heads));
+            }
+        }
+        // TODO (from the original notes): NG + "im./pt." NG is not handled; dates such as
+        // "21 kwietnia" are not marked; "instytut naukowy w montrealu" is not merged
+        // ==> NE(orgName) + w(prep) + NE(placeName).
+    }
+
+    /**
+     * Tells whether the group following a nominal group fills a valence slot of its head noun.
+     * Only following groups whose type starts with "PrepNG" or "NG" are considered.
+     */
+    private static boolean isNounArgument(SyntacticGroup nominalGroup, SyntacticGroup nextGroup,
+            Sentence sentence, Map<String,ArrayList<String>> nounsValence) {
+        if (nextGroup == null) {
+            return false;
+        }
+        String nextType = nextGroup.getType();
+        if (nextType.startsWith("PrepNG")) {
+            return NGPrepNGValenceCompatibility(nominalGroup, nextGroup, sentence, nounsValence);
+        }
+        if (nextType.startsWith("NG")) {
+            return NGNGValenceCompatibility(nominalGroup, nextGroup, sentence, nounsValence);
+        }
+        return false;
+    }
+
+    /**
+     * Looks up the schemata of the first semantic head token of a group.
+     *
+     * @return the schemata stored under the head's base form, or {@code null} when the group
+     *         has no semantic head token or the base form is not in the dictionary
+     */
+    private static ArrayList<String> headSchemata(SyntacticGroup group,
+            Map<String,ArrayList<String>> walentyMapping) {
+        List<Token> heads = group.getSemanticHeadTokens();
+        if (heads.isEmpty()) {
+            return null;
+        }
+        return walentyMapping.get(heads.get(0).getBase());
+    }
+
+    /**
+     * Tells whether the head noun of {@code NGGroup} has a schema with a slot for the
+     * prepositional group: {@code prepnp(base,case)} for a one-token preposition,
+     * {@code comprepnp(orth)} for a preposition word made of several tokens.
+     */
+    private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup,
+            SyntacticGroup PrepNGGroup, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        ArrayList<String> schemata = headSchemata(NGGroup, walentyMapping);
+        if (schemata == null) {
+            return false;
+        }
+
+        SyntacticWord prepWord = PrepNGGroup.getFirstWord();
+        List<Token> prepTokens = prepWord.getTokens();
+        String phraseType;
+        if (prepTokens.size() == 1) {
+            List<Token> prepNGHeads = PrepNGGroup.getSemanticHeadTokens();
+            if (prepNGHeads.isEmpty()) {
+                return false; // no head to take the case from
+            }
+            phraseType = String.format("prepnp(%s,%s)",
+                    prepTokens.get(0).getBase(), prepNGHeads.get(0).getCase());
+        } else if (prepTokens.size() > 1) {
+            phraseType = String.format("comprepnp(%s)", prepWord.getOrth().toLowerCase());
+        } else {
+            return false;
+        }
+        return anySchemaContains(schemata, phraseType);
+    }
+
+    /**
+     * Tells whether the head noun of {@code NG1} has a schema with a slot for one of the
+     * Walenty realizations of {@code NG2}.
+     */
+    private static boolean NGNGValenceCompatibility(SyntacticGroup NG1,
+            SyntacticGroup NG2, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        ArrayList<String> schemata = headSchemata(NG1, walentyMapping);
+        if (schemata == null) {
+            return false;
+        }
+        for (String realization : NG2.getWalentyRealizations()) {
+            if (anySchemaContains(schemata, realization)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /** Tells whether at least one of the schemata lists the phrase type in some position. */
+    private static boolean anySchemaContains(ArrayList<String> schemata, String phraseType) {
+        for (String schema : schemata) {
+            if (schemaContains(schema, phraseType)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Tells whether a position of the schema lists exactly the given phrase type.
+     * Positions are separated by {@code " + "}; the phrase types of a position are the
+     * {@code ';'}-separated items between its braces.
+     */
+    private static boolean schemaContains(String schema, String phraseType) {
+        for (String position : schema.split("\\s\\+\\s")) {
+            int open = position.indexOf('{');
+            int close = position.lastIndexOf('}');
+            if (open < 0 || close < open) {
+                continue; // nothing between braces in this position
+            }
+            for (String phrT : position.substring(open + 1, close).split(";")) {
+                if (phrT.equals(phraseType)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
 
     /**
      * Wyszukuję i oznaczam wszystkie NER
@@ -151,8 +295,9 @@ public class Detector {
      * @param sentence
      */
     private static void addMentionsByTokenCtag(Sentence sentence) {
-        for (Token token : sentence)
+        for (Token token : sentence) {
             if (token.getCtag().matches(Constants.MORPHO_CTAGS))
                 sentence.addMention(new Mention(token));
+        }
     }
 }
diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java 
b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java index 3f1a922..2fe1e86 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java @@ -1,6 +1,7 @@ package pl.waw.ipipan.zil.core.md.entities; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; /** @@ -203,4 +204,83 @@ public class Mention implements Comparable<Mention> { public boolean isZeroSubject() { return isZeroSubject; } + + public int getSentencePositionStart() { + Token startToken = this.getFirstSegment(); + return startToken.getSentencePosition(); + } + + public int getSentencePositionEnd() { + Token endToken = this.getLastSegment(); + return endToken.getSentencePosition(); + } + + public boolean isPartOfQub() { + if (this.segments.size() == 1) { + Sentence sentence = this.segments.get(0).getSentence(); + for (SyntacticWord word : sentence.getSyntacticWords()) { + if (word.getTokens().contains(this.segments.get(0)) && + word.getCtag().equals("Qub")) { + return true; + } + } + } + return false; + } + + public boolean isPartOfPrep() { + if (this.segments.size() == 1) { + Sentence sentence = this.segments.get(0).getSentence(); + for (SyntacticWord word : sentence.getSyntacticWords()) { + if (word.getTokens().contains(this.segments.get(0)) && + word.getCtag().equals("Prep")) { + return true; + } + } + } + return false; + } + + private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj", + "Adj", "Conj", "Comp"); + + public boolean isPartOfFrazeo() { + if (this.segments.size() == 1) { + Sentence sentence = this.segments.get(0).getSentence(); + for (SyntacticWord word : sentence.getSyntacticWords()) { + if (word.getTokens().contains(this.segments.get(0)) && + FRAZEOS.contains(word.getCtag())) { + return true; + } + } + } + return false; + } + + public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) { + if (this.segments.size() == 1) { + Sentence sentence = 
this.segments.get(0).getSentence(); + if (this.getSentencePositionStart() - 1 >= 0) { + String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth(); + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); + String possiblePrep = String.format("%s %s", prep, noun); + if (complexPreps.contains(possiblePrep)) { + return true; + } + } + + if (this.getSentencePositionStart() - 1 >= 0 && + this.getSentencePositionStart() + 1 < sentence.size()) { + String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth(); + String noun = sentence.get(this.getSentencePositionStart()).getOrth(); + String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth(); + String possiblePrep = String.format("%s %s %s", prep1, noun, prep2); + if (complexPreps.contains(possiblePrep)) { + return true; + } + } + } + return false; + } + } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java index 9f7f423..476cdf9 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java @@ -109,4 +109,118 @@ public class Sentence extends ArrayList<Token> { public void addNamedEntity(NamedEntity namedEntity) { namedEntities.add(namedEntity); } + + public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) { + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); + for (SyntacticGroup group : this.syntacticGroups) { + if (group.getSentencePositionStart() >= start && + group.getSentencePositionEnd() <= end) { + if (!(group.getSentencePositionStart() == start && + group.getSentencePositionEnd() == end)) { + groupsAtSpan.add(group); + } + } + } + return groupsAtSpan; + } + + public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) { + ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); + for 
(SyntacticGroup group : this.syntacticGroups) { + + if (group.getSentencePositionStart() >= start && + group.getSentencePositionEnd() <= end) { /* group lies inside start..end */ + if (!(group.getSentencePositionStart() == start && + group.getSentencePositionEnd() == end)) { /* but is not an exact start..end match */ + groupsAtSpan.add(group); + } + } + } + return groupsAtSpan; + } + + /** Tries start positions from start up to end and returns the result of getLargestGroupOnStartPoint for the first position that yields a group; null if none does. */ public SyntacticGroup getFirstGroup(int start, int end) { + SyntacticGroup largestGroup = null; + int step = start; + while (step <= end && largestGroup == null) { + largestGroup = getLargestGroupOnStartPoint(step, end); + step++; + } + return largestGroup; + } + + /** Among groups that begin exactly at start and end at or before end, excluding one spanning exactly start..end, returns the group with the most tokens (the first such group on ties); null if there is none. */ private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { + SyntacticGroup largestGroup = null; + for (SyntacticGroup group : this.getGroups()) { + int groupStart = group.getSentencePositionStart(); + int groupEnd = group.getSentencePositionEnd(); + if (groupStart == start && groupEnd <= end && + !(groupStart == start && groupEnd == end) && + (largestGroup == null || + largestGroup.getTokens().size() < group.getTokens().size())) { + largestGroup = group; + } + } + return largestGroup; + } + + /** Counterpart of getFirstGroup that tries end positions from end downwards. NOTE(review): the loop condition is step != start, so start itself is never tried as an end point (getFirstGroup uses step <= end), and when end is smaller than start the condition cannot become false before int wrap-around - confirm intended. */ public SyntacticGroup getLastGroup(int start, int end) { + SyntacticGroup largestGroup = null; + int step = end; + while (step != start && largestGroup == null) { + largestGroup = getLargestGroupOnEndPoint(start, step); + step--; + } + return largestGroup; + } + + /** Among groups that end exactly at end and begin at or after start, excluding one spanning exactly start..end, returns the group with the most tokens (the first such group on ties); null if there is none. */ private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { + SyntacticGroup largestGroup = null; + for (SyntacticGroup group : this.getGroups()) { + int groupStart = group.getSentencePositionStart(); + int groupEnd = group.getSentencePositionEnd(); + if (groupEnd == end && groupStart >= start && + !(groupStart == start && groupEnd == end) && + (largestGroup == null || + largestGroup.getTokens().size() < group.getTokens().size())) { + largestGroup = group; + } + } + return largestGroup; + } + + /** Returns the mentions whose first and last segment positions both fall within start..end (inclusive). */ public ArrayList<Mention> getMentionsInsideSpan(int start, int end) { + ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>();
+ for (Mention mention : this.mentions) { + if (mention.getSentencePositionStart() >= start && + mention.getSentencePositionEnd() <= end) { /* unlike the group variants, a mention spanning exactly start..end is kept */ + mentionsAtSpan.add(mention); + } + } + return mentionsAtSpan; + } + + /** Concatenates getOrth() of the tokens at positions start..end (inclusive), separated by single spaces; yields the empty string when start is greater than end. */ public String getTextInsideSpan(int start, int end) { + String text = ""; + int step = start; + while (step <= end) { + if (step != start) { + text += " "; + } + text += this.get(step).getOrth(); + step++; + } + return text; + } + + /** Returns the tokens at positions start..end (inclusive) in sentence order; empty when start is greater than end. */ public ArrayList<Token> getSegmentsInsideSpan(int start, int end) { + ArrayList<Token> tokensAtSpan = new ArrayList<Token>(); + int step = start; + while (step <= end) { + tokensAtSpan.add(this.get(step)); + step++; + } + return tokensAtSpan; + } + } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java index 83d6d35..ed6f234 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java @@ -1,5 +1,6 @@ package pl.waw.ipipan.zil.core.md.entities; +import java.util.ArrayList; import java.util.Iterator; import java.util.List; @@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { return getType().compareTo(o.getType()); } + + /** Sentence position of this group's first token. */ public int getSentencePositionStart() { + Token startToken = tokens.get(0); + return startToken.getSentencePosition(); + } + + /** Sentence position of this group's last token. */ public int getSentencePositionEnd() { + Token endToken = tokens.get(tokens.size()-1); + return endToken.getSentencePosition(); + } + + + /** Returns the syntactic word whose first token compares equal (compareTo == 0) to this group's first token, preferring the word with the most tokens; null when no word starts there. */ public SyntacticWord getFirstWord() { + SyntacticWord firstWord = null; + Token startToken = tokens.get(0); + Sentence sentence = startToken.getSentence(); + for (SyntacticWord word : sentence.getSyntacticWords()) { + if(startToken.compareTo(word.getTokens().get(0)) == 0 && + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { + firstWord = word; + } + } + return firstWord; + } + + // NG and PrepNG
only now + /** Builds phrase-type strings for this group. A type starting with PrepNG yields the getPrepnps variants when the first syntactic word has one token, or comprepnp(lower-cased orth) when it has several; a type starting with NG yields the getNps variants; any other type yields an empty list. NOTE(review): getFirstWord() can return null and getSemanticHeadTokens().get(0) is used unchecked - confirm both are guaranteed for these group types. */ public ArrayList<String> getWalentyRealizations() { + ArrayList<String> realizations = new ArrayList<String>(); + if (this.type.startsWith("PrepNG")) { + SyntacticWord prepWord = this.getFirstWord(); + if (prepWord.getTokens().size() == 1) { /* single-token preposition: use its base form and the case of the first semantic head token */ + + Token prep = prepWord.getTokens().get(0); + String prepBase = prep.getBase(); + String prepCase = this.getSemanticHeadTokens().get(0).getCase(); + realizations.addAll(getPrepnps(prepBase, prepCase)); + + } else if (prepWord.getTokens().size() > 1) { /* multi-token first word: emitted as one comprepnp entry */ + + String prepOrth = prepWord.getOrth().toLowerCase(); + String comprepnp = String.format("comprepnp(%s)", prepOrth); + realizations.add(comprepnp); + + } + } else if (this.type.startsWith("NG")) { + String npCase = this.getSemanticHeadTokens().get(0).getCase(); + realizations.addAll(getNps(npCase)); + } + return realizations; + } + + // compar ?? + /** Returns prepnp(base,case), plus prepnp(base,str) when the case is nom, gen or acc, plus prepnp(base,part) when the case is gen or acc. */ private ArrayList<String> getPrepnps(String prepBase, String prepCase) { + ArrayList<String> prepnps = new ArrayList<String>(); + prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); + if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { + prepnps.add(String.format("prepnp(%s,str)", prepBase)); + } + if (prepCase.equals("gen") || prepCase.equals("acc")) { + prepnps.add(String.format("prepnp(%s,part)", prepBase)); + } + return prepnps; + } + + /** Returns np(case), plus np(str) when the case is nom, gen or acc, plus np(part) when the case is gen or acc. */ private ArrayList<String> getNps(String npCase) { + ArrayList<String> nps = new ArrayList<String>(); + nps.add(String.format("np(%s)", npCase)); + if (npCase.equals("nom") || npCase.equals("gen") || npCase.equals("acc")) { + nps.add(String.format("np(str)")); /* NOTE(review): String.format with no arguments; the literal alone would do */ + } + if (npCase.equals("gen") || npCase.equals("acc")) { + nps.add(String.format("np(part)")); + } + return nps; + } + + /** True when some syntactic word ends at the position immediately before this group and has ctag Verbfin or Inf. */ public boolean precedingWordIsVerb() { + Sentence sentence = this.tokens.get(0).getSentence(); + int precedingTokenPosition = this.getSentencePositionStart() - 1; + for (SyntacticWord word : sentence.getSyntacticWords()) { + int lastWordPosition =
word.getSentencePositionEnd(); + if (precedingTokenPosition == lastWordPosition && + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { + return true; + } + } + return false; + } + + /** Walks positions from this group's end to the end of the sentence and returns the first group delivered by Sentence.getFirstGroup whose type starts with NG and which begins after this group ends; null if none is found. NOTE(review): getFirstGroup only delivers the largest group at a start position, so an NG sharing its start with a larger non-NG group is never seen here - confirm intended. */ public SyntacticGroup getNextNG() { + Sentence sentence = this.tokens.get(0).getSentence(); + int thisGroupEnd = this.getSentencePositionEnd(); + int sentenceLength = sentence.size(); + + SyntacticGroup nextNG = null; + for (int step = thisGroupEnd; step < sentenceLength; step++) { + nextNG = sentence.getFirstGroup(step, sentenceLength); + if (nextNG != null && nextNG.type.startsWith("NG") && + this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) { + break; + } else { + nextNG = null; /* reset so a rejected candidate is not returned after the loop */ + } + } + return nextNG; + } + + /** Returns the group with the most tokens among those whose type starts with PrepNG or NG and which start at the token right after this group; null if none. */ public SyntacticGroup getFollowingGroup() { + SyntacticGroup largestGroup = null; + Sentence sentence = this.tokens.get(0).getSentence(); + int nextTokenPosition = this.getSentencePositionEnd() + 1; + for (SyntacticGroup group : sentence.getGroups()) { + if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) && + group.getSentencePositionStart() == nextTokenPosition) { + if (largestGroup == null || + largestGroup.getTokens().size() < group.getTokens().size()) { + largestGroup = group; + } + } + } + return largestGroup; + } + + /** Returns the syntactic word with ctag Verbfin or Inf that ends immediately before this group, or - when isPartOfPrepNG() holds - immediately before the group returned by getParentPrepNG(); null if there is no such word. */ public SyntacticWord getPrecedingVerb() { + int precedingTokenPosition = this.getSentencePositionStart() - 1; + Sentence sentence = this.tokens.get(0).getSentence(); + if(this.isPartOfPrepNG()) { + SyntacticGroup parentNGGroup = this.getParentPrepNG(); + precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; + } + for (SyntacticWord word : sentence.getSyntacticWords()) { + int lastWordPosition = word.getSentencePositionEnd(); + if (precedingTokenPosition == lastWordPosition && + (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { + return word; + } + } + return null; + } + + /** True when some group whose type starts with PrepNG covers this group's span. */ private boolean isPartOfPrepNG() { + int NGGroupStart =
this.getSentencePositionStart(); + int NGGroupEnd = this.getSentencePositionEnd(); + Sentence sentence = this.tokens.get(0).getSentence(); + for (SyntacticGroup group : sentence.getGroups()) { + if (group.getType().startsWith("PrepNG") && + group.getSentencePositionStart() <= NGGroupStart && + group.getSentencePositionEnd() >= NGGroupEnd) { /* bounds are inclusive, so a PrepNG-typed group with the same span also matches */ + return true; + } + } + return false; + } + + /** Returns the group with the most tokens among those whose type starts with PrepNG and whose span covers this group's span (inclusive bounds); null if none. */ private SyntacticGroup getParentPrepNG() { + SyntacticGroup parentPrepNG = null; + int NGGroupStart = this.getSentencePositionStart(); + int NGGroupEnd = this.getSentencePositionEnd(); + Sentence sentence = this.tokens.get(0).getSentence(); + for (SyntacticGroup group : sentence.getGroups()) { + if (group.getType().startsWith("PrepNG") && + group.getSentencePositionStart() <= NGGroupStart && + group.getSentencePositionEnd() >= NGGroupEnd) { + if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { + parentPrepNG = group; + } + } + } + return parentPrepNG; + } + } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java index 7de1f53..aa80dec 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java @@ -6,11 +6,16 @@ import java.util.List; public class SyntacticWord implements Comparable<SyntacticWord> { + private String base; /* base string handed to the constructor */ private String ctag; + private String orth; /* orth string handed to the constructor */ private List<Token> tokens = new ArrayList<>(); - public SyntacticWord(String ctag, List<Token> tokens) { + public SyntacticWord(String ctag, List<Token> tokens, + String base, String orth) { /* NOTE(review): replaces the two-argument constructor; TeiLoader and ThriftLoader are updated in this patch, any other caller would stop compiling - verify */ + this.base = base; this.ctag = ctag; + this.orth = orth; this.tokens = tokens; } @@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable<SyntacticWord> { return getCtag().compareTo(o.getCtag()); } + + /** Sentence position of this word's first token. */ public int getSentencePositionStart() { + Token startToken = tokens.get(0); + return
startToken.getSentencePosition(); + } + + /** Sentence position of this word's last token. */ public int getSentencePositionEnd() { + Token endToken = tokens.get(tokens.size()-1); + return endToken.getSentencePosition(); + } + + /** Returns the base string given at construction. */ public String getBase() { + return this.base; + } + + /** Returns the orth string given at construction. */ public String getOrth() { + return this.orth; + } + + /** True when the ctag is Verbfin or Inf. */ public boolean isVerb() { + if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) { + return true; + } + return false; + } + + /** True when the ctag is Interp. */ public boolean isInterp() { + if (this.ctag.equals("Interp")) { + return true; + } + return false; + } } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java index 99ca78c..255a056 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java @@ -70,6 +70,7 @@ public class TeiLoader { for (TEIMorph mo : m.getHeadMorphs()) headTokens.add(teiMorph2Segment.get(mo)); s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); + System.out.println(tokens.toString()); /* NOTE(review): writes the token list of every loaded mention to stdout; looks like leftover debug output - consider removing it or using a logger */ } private static void loadSyntacticGroup(Sentence s, TEIGroup g, @@ -94,10 +95,12 @@ public class TeiLoader { private static void loadSyntacticWord(Sentence s, TEIWord w, Map<TEIMorph, Token> teiMorph2Segment) { String ctag = w.getInterpretation().getCtag(); + String base = w.getInterpretation().getBase(); /* taken from the same interpretation as ctag */ + String orth = w.getOrth(); /* orth of the TEI word itself */ List<Token> tokens = new ArrayList<>(); for (TEIMorph m : w.getAllMorphs()) tokens.add(teiMorph2Segment.get(m)); - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); } private static void loadNE(Sentence s, TEINamedEntity ne, diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java index 1076b57..2676122 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java +++
b/src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java @@ -73,10 +73,12 @@ public class ThriftLoader { private static void loadSyntacticWord(Sentence s, TSyntacticWord w, Map<String, Object> thirftId2Entity, Map<String, Token> thiftTokenId2Token) { + String base = w.getChosenInterpretation().getBase(); /* taken from the same chosen interpretation as ctag */ String ctag = w.getChosenInterpretation().getCtag(); + String orth = w.getOrth(); /* orth of the Thrift syntactic word itself */ List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity, thiftTokenId2Token, false); - s.addSyntacticWord(new SyntacticWord(ctag, tokens)); + s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth)); /* base and orth are forwarded to the extended SyntacticWord constructor */ } private static void loadNE(Sentence s, TNamedEntity ne,