From 8f86545e14f99bbf47ab83bf202e26af7a2716c4 Mon Sep 17 00:00:00 2001 From: Bartlomiej Niton <bartek.niton@gmail.com> Date: Wed, 25 Jan 2017 15:36:01 +0100 Subject: [PATCH] Cleaning unused experimental code. --- src/main/java/pl/waw/ipipan/zil/core/md/Main.java | 42 ++++-------------------------------------- src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java | 74 +------------------------------------------------------------------------- src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java | 8 ++++++++ src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java | 545 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java | 65 ++++++----------------------------------------------------------- src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java | 71 ++++------------------------------------------------------------------- src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java | 98 ++++++++++++++++++++++++++++++++------------------------------------------------------------------ src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java | 31 +++++++++++++------------------ src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java | 1 - 9 files changed, 233 insertions(+), 702 deletions(-) diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java index 4740d94..e70f152 100644 --- 
a/src/main/java/pl/waw/ipipan/zil/core/md/Main.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/Main.java @@ -33,9 +33,8 @@ public class Main { private static final boolean GZIP_OUTPUT = true; private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin"; - private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt"; - private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt"; - private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt"; + private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all_with_realizations.txt"; + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all_with_realizations.txt"; private static ZeroSubjectDetector zeroSubjectModel; @@ -46,8 +45,6 @@ public class Main { private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = new EnumMap(ValenceDicts.class); - - private static final ArrayList<String> complexPreps; static { InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL); @@ -58,9 +55,6 @@ public class Main { InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); - - InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS); - complexPreps = readValues(complexPrepositionsStream); } @@ -125,34 +119,6 @@ public class Main { return false; } - - public static ArrayList<String> readValues(InputStream stream) { - ArrayList<String> values; - try { - BufferedReader br=new BufferedReader(new InputStreamReader(stream)); - values = new ArrayList<String>(); - String line; - boolean firstLine = true; - while((line = br.readLine()) != null) { - if (firstLine) { - line = line.replace("\uFEFF", ""); // remove BOM character - firstLine = false; - } - - if (!line.startsWith("%")) { - String value = line.trim(); - if 
(!value.isEmpty()) { - values.add(value); - } - } - } - br.close(); - } catch (IOException ex) { - ex.printStackTrace(); - throw new RuntimeException(ex); - } - return values; - } private Main() { } @@ -244,7 +210,7 @@ public class Main { */ public static void annotateThriftText(TText thriftText) throws MultiserviceException { Text responseText = ThriftLoader.loadTextFromThrift(thriftText); - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); + Detector.findMentionsInText(responseText, zeroSubjectModel, valence); ThriftSaver.updateThriftText(responseText, thriftText); } @@ -257,7 +223,7 @@ public class Main { */ public static void annotateTeiText(TEICorpusText teiText) throws TEIException { Text responseText = TeiLoader.loadTextFromTei(teiText); - Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps); + Detector.findMentionsInText(responseText, zeroSubjectModel, valence); TeiSaver.updateTeiText(responseText, teiText); } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java index 23a83b3..9eb2ec8 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java @@ -1,6 +1,5 @@ package pl.waw.ipipan.zil.core.md.detection; -import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; import pl.waw.ipipan.zil.core.md.entities.Mention; import pl.waw.ipipan.zil.core.md.entities.Sentence; import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup; @@ -164,33 +163,6 @@ public class Cleaner { } } - /*private static void removeWalentyFramedMentions(Sentence sentence, - ArrayList<Mention> mentions, - ArrayList<String> schemata) { - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); - for (Mention mention : mentions) { - int mentionStart = mention.getFirstSegment().getSentencePosition(); - int mentionEnd = 
mention.getLastSegment().getSentencePosition(); - SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd); - SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd); - if (startGroup != null && endGroup != null - && startGroup.compareTo(endGroup) != 0) { - ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations(); - ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations(); - for (String schema : schemata) { - if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) { - mentionsToRemove.add(mention); - break; - } - } - } - } - - for (Mention mentionToRemove : mentionsToRemove) { - sentence.removeMention(mentionToRemove); - } - }*/ - private static boolean isProperSchema(String schema, ArrayList<String> group1Types, ArrayList<String> group2Types) { for (String group1Type : group1Types) { @@ -207,7 +179,7 @@ public class Cleaner { String phraseType2) { boolean phrType1Found = false; boolean phrType2Found = false; - for (String position : schema.split("\\+")) { + for (String position : schema.split("\\s\\+\\s")) { position = position.trim(); position = position.substring(1, position.length()-1); for (String phrT : position.split(";")) { @@ -226,34 +198,6 @@ public class Cleaner { return false; } - - // wykrywa, ze "wszystkim" jest czescia "przede wszystkim" (kublika Qub) - public static void cleanQubs(Sentence sentence) { - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); - for (Mention mention : sentence.getMentions()) { - if (mention.isPartOfQub()) { - mentionsToRemove.add(mention); - } - } - - for (Mention mentionToRemove : mentionsToRemove) { - sentence.removeMention(mentionToRemove); - } - } - - public static void cleanPreps(Sentence sentence) { - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); - for (Mention mention : sentence.getMentions()) { - if (mention.isPartOfPrep()) { - mentionsToRemove.add(mention); - } - } - - for (Mention 
mentionToRemove : mentionsToRemove) { - sentence.removeMention(mentionToRemove); - } - } - public static void cleanFrazeos(Sentence sentence) { ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); for (Mention mention : sentence.getMentions()) { @@ -267,20 +211,4 @@ public class Cleaner { } } - // wyrzuca wzmianki bedace czescia przyimkow zlozonych - public static void cleanComplexPreps(Sentence sentence, - ArrayList<String> complexPreps) { - - ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>(); - for (Mention mention : sentence.getMentions()) { - if (mention.isPartOfComplexPrep(complexPreps)) { - mentionsToRemove.add(mention); - } - } - - for (Mention mentionToRemove : mentionsToRemove) { - sentence.removeMention(mentionToRemove); - } - } - } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java index 19e6d56..cc7aa2a 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java @@ -1,5 +1,8 @@ package pl.waw.ipipan.zil.core.md.detection; +import java.util.Arrays; +import java.util.List; + public class Constants { public static final String MORPHO_NOUN_CTAGS = "subst|depr|ger"; public static final String MORPHO_VERB_CTAGS = "fin|bedzie|aglt|impt"; @@ -7,6 +10,11 @@ public class Constants { public static final String MORPHO_CTAGS = MORPHO_NOUN_CTAGS + "|" + MORPHO_PRONOUN_CTAGS; public static final String WORDS_CTAGS = "Noun|Ppron.*"; + + public static final List<String> FRAZEO_CTAGS = Arrays.asList("Prep", "Qub", "Adv", "Interj", + "Adj", "Conj", "Comp"); + + public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin"); private Constants() { } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java index c8c89b2..399e9e3 100644 --- 
a/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java @@ -22,24 +22,22 @@ public class Detector { public static void findMentionsInText(Text text, ZeroSubjectDetector zeroSubjectModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, - ArrayList<String> complexPreps) { + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { text.clearMentions(); logger.debug("Detecting mentions in text " + text.getId()); for (Paragraph p : text) for (Sentence s : p) - detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps); + detectMentionsInSentence(s, zeroSubjectModel, valence); } private static void detectMentionsInSentence(Sentence sentence, ZeroSubjectDetector zeroSubjectModel, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, - ArrayList<String> complexPreps) { + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { // adding mentions addMentionsByTokenCtag(sentence); addMentionsBySyntacticWordsCtag(sentence); addMentionsByNamedEntities(sentence); - addMentionsByGroups(sentence, valence, complexPreps); + addMentionsByGroups(sentence, valence); addSpeakerMentionsInSpoken(sentence); // zero subject detection @@ -47,12 +45,9 @@ public class Detector { // removing mentions removeTo(sentence); + Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); Cleaner.cleanUnnecessarySentenceMentions(sentence); - //Cleaner.cleanQubs(sentence); - //Cleaner.cleanPreps(sentence); - //Cleaner.cleanComplexPreps(sentence, complexPreps); Cleaner.cleanFrazeos(sentence); - Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence)); // updating mention heads updateMentionHeads(sentence); @@ -108,294 +103,64 @@ public class Detector { * @param sentence */ private static void addMentionsByGroups(Sentence sentence, - Map<ValenceDicts,Map<String,ArrayList<String>>> valence, - ArrayList<String> complexPreps) { - 
List<SyntacticGroup> groups = sentence.getGroups(); - for (int i = 0; i < groups.size(); i++) { - SyntacticGroup thisGroup = groups.get(i); - - /*SyntacticGroup nearPrepNG = null; - SyntacticGroup nextNG = null;*/ - - SyntacticGroup nextGroup = thisGroup.getFollowingGroup(); - - /*if (thisGroup.getType().startsWith("NG")) { - nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(), - sentence); - nextNG = thisGroup.getNextNG(); - }*/ - - /*if (nextNG != null) { - int prepStart = thisGroup.getSentencePositionEnd() + 1; - int prepEnd = nextNG.getSentencePositionStart() - 1; - String prep = sentence.getTextInsideSpan(prepStart, prepEnd); - if (complexPreps.contains(prep)) { - String cos = ""; - } - }*/ + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { + + for (SyntacticGroup group : sentence.getGroups()) { + SyntacticGroup nextGroup = group.getFollowingGroup(); + SyntacticGroup nextnextGroup = null; + SyntacticGroup nextnextnextGroup = null; + if (nextGroup != null) { + nextnextGroup = nextGroup.getFollowingGroup(); + if (nextnextGroup != null) { + nextnextnextGroup = nextnextGroup.getFollowingGroup(); + } + } - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && - //!isPartOfPrepNG(thisGroup, sentence) && - //getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && - precedingWordIsVerb(thisGroup, sentence) && - //!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && - !precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && - !sameSemanticHeads(thisGroup, nearPrepNG)) { - List<Token> heads = thisGroup.getSemanticHeadTokens(); - List<Token> segments = thisGroup.getTokens(); - segments.addAll(nearPrepNG.getTokens()); - - sentence.addMention(new Mention(segments, heads)); - }*/ - /*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && - // !precedingWordIsVerb(thisGroup, sentence) && - 
!isPartOfPrepNG(thisGroup, sentence) && - getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null && - //!precedingWordIsVerb(thisGroup, sentence) && - !precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && - //!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) && - !sameSemanticHeads(thisGroup, nearPrepNG)) { - List<Token> heads = thisGroup.getSemanticHeadTokens(); - List<Token> segments = thisGroup.getTokens(); - segments.addAll(nearPrepNG.getTokens()); - - sentence.addMention(new Mention(segments, heads)); - }*/ - if (thisGroup.getType().startsWith("NG") && - nextGroup != null && nextGroup.getType().startsWith("PrepNG") && - NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) { - List<Token> heads = thisGroup.getSemanticHeadTokens(); + if (group.getType().startsWith("NG") && nextGroup != null && + nextnextGroup != null && nextnextnextGroup != null && + quatroCompatibility(group, nextGroup, nextnextGroup, + nextnextnextGroup, valence.get(ValenceDicts.NounsValence))) { + List<Token> heads = group.getSemanticHeadTokens(); List<Token> segments = new ArrayList<Token>(); - segments.addAll(thisGroup.getTokens()); + segments.addAll(group.getTokens()); segments.addAll(nextGroup.getTokens()); + segments.addAll(nextnextGroup.getTokens()); + segments.addAll(nextnextnextGroup.getTokens()); sentence.addMention(new Mention(segments, heads)); - } else if (thisGroup.getType().startsWith("NG") && nextGroup != null && - nextGroup.getType().startsWith("NG") && - NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) - ) { - List<Token> heads = thisGroup.getSemanticHeadTokens(); + } else if (group.getType().startsWith("NG") && nextGroup != null && + nextnextGroup != null && tripleCompatibility(group, nextGroup, nextnextGroup, valence.get(ValenceDicts.NounsValence))) { + List<Token> 
heads = group.getSemanticHeadTokens(); List<Token> segments = new ArrayList<Token>(); - segments.addAll(thisGroup.getTokens()); + segments.addAll(group.getTokens()); segments.addAll(nextGroup.getTokens()); + segments.addAll(nextnextGroup.getTokens()); sentence.addMention(new Mention(segments, heads)); - } /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null && - NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) { - List<Token> heads = thisGroup.getSemanticHeadTokens(); - + } else if (group.getType().startsWith("NG") && nextGroup != null && + groupsValenceCompatibility(group, nextGroup, sentence, valence.get(ValenceDicts.NounsValence)) + ) { + List<Token> heads = group.getSemanticHeadTokens(); List<Token> segments = new ArrayList<Token>(); - segments.addAll(thisGroup.getTokens()); - - int prepStart = thisGroup.getSentencePositionEnd() + 1; - int prepEnd = nextNG.getSentencePositionStart() - 1; - ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd); - segments.addAll(prepSegments); - - segments.addAll(nextNG.getTokens()); + segments.addAll(group.getTokens()); + segments.addAll(nextGroup.getTokens()); sentence.addMention(new Mention(segments, heads)); - }*/ - //else if // NG + im./pt. NG - // daty nie sa oznaczane np 21 kwietnia itd. , to co z ta gramatyka - // "instytut naukowy w montrealu" czemu sie nie laczy? 
==> NE(orgName) + w(prep) + NE(placeName) - else if (thisGroup.getType().startsWith("NG")) { - List<Token> segments = thisGroup.getTokens(); - List<Token> heads = thisGroup.getSemanticHeadTokens(); - - sentence.addMention(new Mention(segments, heads)); - } - } - - // oryginalna wersja - /*for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("NG")) { + } else if (group.getType().startsWith("NG")) { List<Token> segments = group.getTokens(); List<Token> heads = group.getSemanticHeadTokens(); sentence.addMention(new Mention(segments, heads)); } - }*/ - } - - private static boolean followingWordIsInf(SyntacticGroup group, - Sentence sentence) { - int followingTokenPosition = group.getSentencePositionEnd() + 1; - for (SyntacticWord word : sentence.getSyntacticWords()) { - int firstWordPosition = word.getSentencePositionStart(); - if (followingTokenPosition == firstWordPosition && - (word.getCtag().equals("Inf"))) { - return true; - } - } - - return false; - } - - private static SyntacticGroup getFollowingPrepNGs(int sentencePosition, - Sentence sentence) { - SyntacticGroup largestGroup = null; - int nextTokenPosition = sentencePosition + 1; - for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("PrepNG") && - group.getSentencePositionStart() == nextTokenPosition) { - if (largestGroup == null || - largestGroup.getTokens().size() < group.getTokens().size()) { - largestGroup = group; - } - } - } - return largestGroup; - } - - private static boolean isPartOfPrepNG(SyntacticGroup NGGroup, - Sentence sentence) { - int NGGroupStart = NGGroup.getSentencePositionStart(); - int NGGroupEnd = NGGroup.getSentencePositionEnd(); - for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("PrepNG") && - group.getSentencePositionStart() <= NGGroupStart && - group.getSentencePositionEnd() >= NGGroupEnd) { - return true; - } - } - return false; - } - - private static boolean 
precedingWordIsVerb(SyntacticGroup group, - Sentence sentence) { - int precedingTokenPosition = group.getSentencePositionStart() - 1; - if(isPartOfPrepNG(group, sentence)) { - SyntacticGroup parentGroup = getParentPrepNG(group, sentence); - precedingTokenPosition = parentGroup.getSentencePositionStart() - 1; - } - - for (SyntacticWord word : sentence.getSyntacticWords()) { - int lastWordPosition = word.getSentencePositionEnd(); - if (precedingTokenPosition == lastWordPosition && - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { - return true; - } - } - return false; - } - - // czy sie w lemacie bedzie zgodne miedzy walentym a spejdem? - // czy prep moze sie skladac z wiecej niz jednego segmentu? - // dopasowywac refla i recip do sie spejdowego - private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup, - SyntacticGroup PrepNGGroup, Sentence sentence, - Map<String,ArrayList<String>> walentyMapping) { - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; - for (SyntacticWord word : sentence.getSyntacticWords()) { - int lastWordPosition = word.getSentencePositionEnd(); - if (precedingTokenPosition == lastWordPosition && - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { - String verb = word.getBase(); - if (!walentyMapping.containsKey(verb)) { - return true; - } else { - SyntacticWord prepWord = PrepNGGroup.getFirstWord(); - - if (prepWord.getTokens().size() == 1) { - Token prep = prepWord.getTokens().get(0); - String prepBase = prep.getBase(); - // sprawdzic czy glowa moze miec wiele tokenow - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); - ArrayList<String> prepnps = getPrepnps(prepBase, prepCase); - - ArrayList<String> schemata = walentyMapping.get(verb); - for (String schema : schemata) { - for (String prepnp : prepnps) { - if (schema.contains(prepnp)) { - return true; - } - } - } - } else if (prepWord.getTokens().size() > 1) { - String prepOrth = 
prepWord.getOrth().toLowerCase(); - String comprepnp = String.format("comprepnp(%s)", prepOrth); - ArrayList<String> schemata = walentyMapping.get(verb); - for (String schema : schemata) { - if (schema.contains(comprepnp)) { - return true; - } - } - - } - - - } - } } - return false; - } - - private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup, - SyntacticGroup PrepNGGroup, Sentence sentence, - Map<String,ArrayList<String>> walentyMapping) { - int precedingTokenPosition = NGGroup.getSentencePositionStart() - 1; - if(isPartOfPrepNG(NGGroup, sentence)) { - SyntacticGroup parentNGGroup = getParentPrepNG(NGGroup, sentence); - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; - } - for (SyntacticWord word : sentence.getSyntacticWords()) { - int lastWordPosition = word.getSentencePositionEnd(); - if (precedingTokenPosition == lastWordPosition && - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { - if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) { - return true; - } - if (!walentyMapping.containsKey(word.getBase())) { - return true; - } - - } - } - return false; - } - - private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup, - SyntacticGroup PrepNGGroup, Sentence sentence, - Map<String,ArrayList<String>> walentyMapping) { - String verbBase = verb.getBase(); - if (!walentyMapping.containsKey(verbBase)) { - return true; - } else { - ArrayList<String> schemata = walentyMapping.get(verbBase); - - // PrepNG + PrepNG - if (isPartOfPrepNG(NGGroup, sentence)) { - SyntacticGroup NGParentGroup = getParentPrepNG(NGGroup, sentence); - ArrayList<String> prepNG1Realizations = NGParentGroup.getWalentyRealizations(); - ArrayList<String> prepNG2Realizations = PrepNGGroup.getWalentyRealizations(); - for (String schema : schemata) { - if (isProperSchema(schema, prepNG1Realizations, prepNG2Realizations)) { - return true; - } - } - } - - // NG + PrepNG - else { - ArrayList<String> 
NGRealizations = NGGroup.getWalentyRealizations(); - ArrayList<String> prepNGRealizations = PrepNGGroup.getWalentyRealizations(); - for (String schema : schemata) { - if (isProperSchema(schema, NGRealizations, prepNGRealizations)) { - return true; - } - } - } - } - return false; } private static boolean isProperSchema(String schema, ArrayList<String> group1Types, ArrayList<String> group2Types) { for (String group1Type : group1Types) { - if (schema.contains(group1Type)) { + if (schemaContains(schema, group1Type)) { for (String group2Type : group2Types) { - if (schema.contains(group2Type)) { + if (schemaContains(schema, group2Type)) { return true; } } @@ -404,103 +169,71 @@ public class Detector { return false; } - private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup, - Sentence sentence) { - SyntacticGroup parentPrepNG = null; - int NGGroupStart = NGGroup.getSentencePositionStart(); - int NGGroupEnd = NGGroup.getSentencePositionEnd(); - for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("PrepNG") && - group.getSentencePositionStart() <= NGGroupStart && - group.getSentencePositionEnd() >= NGGroupEnd) { - if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { - parentPrepNG = group; - } - } - } - return parentPrepNG; - } - - private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup, - SyntacticGroup PrepNGGroup, Sentence sentence, + private static boolean groupsValenceCompatibility(SyntacticGroup NG1, + SyntacticGroup NG2, Sentence sentence, Map<String,ArrayList<String>> walentyMapping) { - Token NGHead = NGGroup.getSemanticHeadTokens().get(0); + Token NG1Head = NG1.getSemanticHeadTokens().get(0); - String NGHeadBase = NGHead.getBase(); + String NGHeadBase = NG1Head.getBase(); if (!walentyMapping.containsKey(NGHeadBase)) { return false; } else { - SyntacticWord prepWord = PrepNGGroup.getFirstWord(); + ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); - 
if (prepWord.getTokens().size() == 1) { - Token prep = prepWord.getTokens().get(0); - String prepBase = prep.getBase(); - String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase(); - String prepnp = String.format("prepnp(%s,%s)", prepBase, prepCase); - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); - for (String schema : schemata) { - if (schemaContains(schema, prepnp)) { - return true; - } - } - } else if (prepWord.getTokens().size() > 1) { - String prepOrth = prepWord.getOrth().toLowerCase(); - String comprepnp = String.format("comprepnp(%s)", prepOrth); - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); + ArrayList<String> schemata = walentyMapping.get(NGHeadBase); + for (String real : NG2realizations) { for (String schema : schemata) { - if (schemaContains(schema, comprepnp)) { + if (schemaContains(schema, real)) { return true; } } - } - } return false; } - private static boolean NGNGValenceCompatibility(SyntacticGroup NG1, - SyntacticGroup NG2, Sentence sentence, + private static boolean tripleCompatibility(SyntacticGroup group1, + SyntacticGroup group2, SyntacticGroup group3, Map<String,ArrayList<String>> walentyMapping) { - Token NG1Head = NG1.getSemanticHeadTokens().get(0); + Token group1Head = group1.getSemanticHeadTokens().get(0); - String NGHeadBase = NG1Head.getBase(); + String group1HeadBase = group1Head.getBase(); - if (!walentyMapping.containsKey(NGHeadBase)) { + if (!walentyMapping.containsKey(group1HeadBase)) { return false; } else { - ArrayList<String> NG2realizations = NG2.getWalentyRealizations(); + ArrayList<String> group2realizations = group2.getWalentyRealizations(); + ArrayList<String> group3realizations = group3.getWalentyRealizations(); - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); - for (String real : NG2realizations) { - for (String schema : schemata) { - if (schemaContains(schema, real)) { - return true; - } + ArrayList<String> schemata = walentyMapping.get(group1HeadBase); + for 
(String schema : schemata) { + if (isProperSchema(schema, group2realizations, group3realizations)) { + return true; } } } return false; } - private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1, - SyntacticGroup NGGroup2, Sentence sentence, + private static boolean quatroCompatibility(SyntacticGroup group1, + SyntacticGroup group2, SyntacticGroup group3, SyntacticGroup group4, Map<String,ArrayList<String>> walentyMapping) { - - Token NGHead = NGGroup1.getSemanticHeadTokens().get(0); - String NGHeadBase = NGHead.getBase(); + Token group1Head = group1.getSemanticHeadTokens().get(0); + + String group1HeadBase = group1Head.getBase(); - if (!walentyMapping.containsKey(NGHeadBase)) { + if (!walentyMapping.containsKey(group1HeadBase)) { return false; } else { - int prepStart = NGGroup1.getSentencePositionEnd() + 1; - int prepEnd = NGGroup2.getSentencePositionStart() - 1; - String complexPrep = sentence.getTextInsideSpan(prepStart, prepEnd); - String comprepnp = String.format("comprepnp(%s)", complexPrep); - ArrayList<String> schemata = walentyMapping.get(NGHeadBase); + ArrayList<String> group2realizations = group2.getWalentyRealizations(); + ArrayList<String> group3realizations = group3.getWalentyRealizations(); + ArrayList<String> group4realizations = group4.getWalentyRealizations(); + + ArrayList<String> schemata = walentyMapping.get(group1HeadBase); for (String schema : schemata) { - if (schemaContains(schema, comprepnp)) { + if (isTripleProperSchema(schema, group2realizations, group3realizations, + group4realizations)) { return true; } } @@ -508,67 +241,119 @@ public class Detector { return false; } - private static boolean schemaContains(String schema, String phraseType) { - for (String position : schema.split("\\s\\+\\s")) { - position = position.trim(); - position = position.substring(1, position.length()-1); - for (String phrT : position.split(";")) { - if (phrT.equals(phraseType)) { - return true; + private static boolean 
isTripleProperSchema(String schema, ArrayList<String> group1Types, + ArrayList<String> group2Types, ArrayList<String> group3Types) { + for (String group1Type : group1Types) { + if (schemaContains(schema, group1Type)) { + for (String group2Type : group2Types) { + if (schemaContains(schema, group2Type)) { + for (String group3Type : group3Types) { + if (schemaContains(schema, group3Type)) { + return true; + } + } + } } } } return false; } - private static boolean schemaContainsType(String schema, String type) { - // to lepiej dziala dla rzeczownikow - for (String position : schema.split("\\s\\+\\s")) { - position = position.trim(); - position = position.substring(1, position.length()-1); - for (String phrT : position.split(";")) { - - if (phrT.startsWith(type+"(")) { - return true; + /*private static boolean isTripleProperSchema(String schema, ArrayList<String> group1Types, + ArrayList<String> group2Types, ArrayList<String> group3Types) { + + ArrayList<String> group1MPositions = getMatchingPositions(schema, group1Types); + ArrayList<String> group2MPositions = getMatchingPositions(schema, group2Types); + ArrayList<String> group3MPositions = getMatchingPositions(schema, group3Types); + + + + ArrayList<String> group1MPositionsCopy = new ArrayList<String>(); + ArrayList<String> group2MPositionsCopy = getMatchingPositions(schema, group2Types); + ArrayList<String> group3MPositionsCopy = getMatchingPositions(schema, group3Types); + + + if (group1MPositions.isEmpty() || group2MPositions.isEmpty() || group3MPositions.isEmpty()) { + return false; + } + + boolean group1ok = false; + boolean group2ok = false; + boolean group3ok = false; + + for (String pos : group1MPositions) { + + } + + ArrayList<String> + + if (union(group1MPositions, group2MPositions).size() > group1MPositions.size() && + ) + + + for (String group1Type : group1Types) { + if (schemaContains(schema, group1Type)) { + for (String group2Type : group2Types) { + if (schemaContains(schema, group2Type)) { + for (String 
group3Type : group3Types) { + if (schemaContains(schema, group3Type)) { + return true; + } + } + } } } } return false; + }*/ + + public static List<String> union(List<String> list1, List<String> list2) { + HashSet<String> set = new HashSet<String>(); + + set.addAll(list1); + set.addAll(list2); + + return new ArrayList<String>(set); } + public static List<String> tripleUnion(List<String> list1, List<String> list2, + List<String> list3) { + HashSet<String> set = new HashSet<String>(); + + set.addAll(list1); + set.addAll(list2); + set.addAll(list3); + + return new ArrayList<String>(set); + } - // compar ?? - private static ArrayList<String> getPrepnps(String prepBase, String prepCase) { - ArrayList<String> prepnps = new ArrayList<String>(); - prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); - if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) { - prepnps.add(String.format("prepnp(%s,str)", prepBase)); - } - if (prepCase.equals("gen") || prepCase.equals("acc")) { - prepnps.add(String.format("prepnp(%s,part)", prepBase)); + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { + ArrayList<String> positions = new ArrayList<String>(); + for (String position : schema.split("\\s\\+\\s")) { + position = position.trim(); + position = position.substring(1, position.length()-1); + for (String phrT : position.split(";")) { + if (phraseRealizations.contains(phrT.trim())) { + positions.add(position); + break; + } + } } - return prepnps; + return positions; } - // eliminuje "od wsi do wsi" - private static boolean sameSemanticHeads(SyntacticGroup group1, - SyntacticGroup group2) { - - List<Token> group1HeadTokens = group1.getSemanticHeadTokens(); - List<Token> group2HeadTokens = group2.getSemanticHeadTokens(); - if (group1HeadTokens.size() != group2HeadTokens.size()) { - return false; - } - - for (int i=0; i < group1HeadTokens.size(); i++) { - if 
(!group1HeadTokens.get(i).getBase().equals(group2HeadTokens.get(i).getBase())) { - return false; + private static boolean schemaContains(String schema, String phraseType) { + for (String position : schema.split("\\s\\+\\s")) { + position = position.trim(); + position = position.substring(1, position.length()-1); + for (String phrT : position.split(";")) { + if (phrT.equals(phraseType)) { + return true; + } } } - - return true; + return false; } - /** * Wyszukuję i oznaczam wszystkie NER diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java index 2fe1e86..120d94b 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java @@ -1,11 +1,13 @@ package pl.waw.ipipan.zil.core.md.entities; import java.util.ArrayList; -import java.util.Arrays; import java.util.List; +import pl.waw.ipipan.zil.core.md.detection.Constants; + /** * @author Mateusz Kopec + * Modified 2017 by Bartlomiej Niton * */ public class Mention implements Comparable<Mention> { @@ -205,77 +207,22 @@ public class Mention implements Comparable<Mention> { return isZeroSubject; } - public int getSentencePositionStart() { + public int getSentenceStartPosition() { Token startToken = this.getFirstSegment(); return startToken.getSentencePosition(); } - public int getSentencePositionEnd() { + public int getSentenceEndPosition() { Token endToken = this.getLastSegment(); return endToken.getSentencePosition(); } - - public boolean isPartOfQub() { - if (this.segments.size() == 1) { - Sentence sentence = this.segments.get(0).getSentence(); - for (SyntacticWord word : sentence.getSyntacticWords()) { - if (word.getTokens().contains(this.segments.get(0)) && - word.getCtag().equals("Qub")) { - return true; - } - } - } - return false; - } - - public boolean isPartOfPrep() { - if (this.segments.size() == 1) { - Sentence sentence =
this.segments.get(0).getSentence(); - for (SyntacticWord word : sentence.getSyntacticWords()) { - if (word.getTokens().contains(this.segments.get(0)) && - word.getCtag().equals("Prep")) { - return true; - } - } - } - return false; - } - - private final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj", - "Adj", "Conj", "Comp"); public boolean isPartOfFrazeo() { if (this.segments.size() == 1) { Sentence sentence = this.segments.get(0).getSentence(); for (SyntacticWord word : sentence.getSyntacticWords()) { if (word.getTokens().contains(this.segments.get(0)) && - FRAZEOS.contains(word.getCtag())) { - return true; - } - } - } - return false; - } - - public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) { - if (this.segments.size() == 1) { - Sentence sentence = this.segments.get(0).getSentence(); - if (this.getSentencePositionStart() - 1 >= 0) { - String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth(); - String noun = sentence.get(this.getSentencePositionStart()).getOrth(); - String possiblePrep = String.format("%s %s", prep, noun); - if (complexPreps.contains(possiblePrep)) { - return true; - } - } - - if (this.getSentencePositionStart() - 1 >= 0 && - this.getSentencePositionStart() + 1 < sentence.size()) { - String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth(); - String noun = sentence.get(this.getSentencePositionStart()).getOrth(); - String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth(); - String possiblePrep = String.format("%s %s %s", prep1, noun, prep2); - if (complexPreps.contains(possiblePrep)) { + Constants.FRAZEO_CTAGS.contains(word.getCtag())) { return true; } } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java index 476cdf9..558a71c 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java 
@@ -110,35 +110,6 @@ public class Sentence extends ArrayList<Token> { namedEntities.add(namedEntity); } - public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) { - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); - for (SyntacticGroup group : this.syntacticGroups) { - if (group.getSentencePositionStart() >= start && - group.getSentencePositionEnd() <= end) { - if (!(group.getSentencePositionStart() == start && - group.getSentencePositionEnd() == end)) { - groupsAtSpan.add(group); - } - } - } - return groupsAtSpan; - } - - public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) { - ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>(); - for (SyntacticGroup group : this.syntacticGroups) { - - if (group.getSentencePositionStart() >= start && - group.getSentencePositionEnd() <= end) { - if (!(group.getSentencePositionStart() == start && - group.getSentencePositionEnd() == end)) { - groupsAtSpan.add(group); - } - } - } - return groupsAtSpan; - } - public SyntacticGroup getFirstGroup(int start, int end) { SyntacticGroup largestGroup = null; int step = start; @@ -152,8 +123,8 @@ public class Sentence extends ArrayList<Token> { private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) { SyntacticGroup largestGroup = null; for (SyntacticGroup group : this.getGroups()) { - int groupStart = group.getSentencePositionStart(); - int groupEnd = group.getSentencePositionEnd(); + int groupStart = group.getSentenceStartPosition(); + int groupEnd = group.getSentenceEndPosition(); if (groupStart == start && groupEnd <= end && !(groupStart == start && groupEnd == end) && (largestGroup == null || @@ -177,8 +148,8 @@ public class Sentence extends ArrayList<Token> { private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) { SyntacticGroup largestGroup = null; for (SyntacticGroup group : this.getGroups()) { - int groupStart = group.getSentencePositionStart(); - 
int groupEnd = group.getSentencePositionEnd(); + int groupStart = group.getSentenceStartPosition(); + int groupEnd = group.getSentenceEndPosition(); if (groupEnd == end && groupStart >= start && !(groupStart == start && groupEnd == end) && (largestGroup == null || @@ -189,38 +160,4 @@ public class Sentence extends ArrayList<Token> { return largestGroup; } - public ArrayList<Mention> getMentionsInsideSpan(int start, int end) { - ArrayList<Mention> mentionsAtSpan = new ArrayList<Mention>(); - for (Mention mention : this.mentions) { - if (mention.getSentencePositionStart() >= start && - mention.getSentencePositionEnd() <= end) { - mentionsAtSpan.add(mention); - } - } - return mentionsAtSpan; - } - - public String getTextInsideSpan(int start, int end) { - String text = ""; - int step = start; - while (step <= end) { - if (step != start) { - text += " "; - } - text += this.get(step).getOrth(); - step++; - } - return text; - } - - public ArrayList<Token> getSegmentsInsideSpan(int start, int end) { - ArrayList<Token> tokensAtSpan = new ArrayList<Token>(); - int step = start; - while (step <= end) { - tokensAtSpan.add(this.get(step)); - step++; - } - return tokensAtSpan; - } - } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java index ed6f234..b10fa3d 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java @@ -55,34 +55,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { return getType().compareTo(o.getType()); } - public int getSentencePositionStart() { + public int getSentenceStartPosition() { Token startToken = tokens.get(0); return startToken.getSentencePosition(); } - public int getSentencePositionEnd() { + public int getSentenceEndPosition() { Token endToken = tokens.get(tokens.size()-1); return endToken.getSentencePosition(); } - - public 
SyntacticWord getFirstWord() { - SyntacticWord firstWord = null; - Token startToken = tokens.get(0); - Sentence sentence = startToken.getSentence(); - for (SyntacticWord word : sentence.getSyntacticWords()) { - if(startToken.compareTo(word.getTokens().get(0)) == 0 && - (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { - firstWord = word; - } - } - return firstWord; - } - - // NG and PrepNG only now public ArrayList<String> getWalentyRealizations() { ArrayList<String> realizations = new ArrayList<String>(); - if (this.type.startsWith("PrepNG")) { + if (this.type.equals("PrepNG")) { SyntacticWord prepWord = this.getFirstWord(); if (prepWord.getTokens().size() == 1) { @@ -105,7 +90,19 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { return realizations; } - // compar ?? + public SyntacticWord getFirstWord() { + SyntacticWord firstWord = null; + Token startToken = tokens.get(0); + Sentence sentence = startToken.getSentence(); + for (SyntacticWord word : sentence.getSyntacticWords()) { + if(startToken.compareTo(word.getTokens().get(0)) == 0 && + (firstWord == null || firstWord.getTokens().size() < word.getTokens().size())) { + firstWord = word; + } + } + return firstWord; + } + private ArrayList<String> getPrepnps(String prepBase, String prepCase) { ArrayList<String> prepnps = new ArrayList<String>(); prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase)); @@ -130,44 +127,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { return nps; } - public boolean precedingWordIsVerb() { - Sentence sentence = this.tokens.get(0).getSentence(); - int precedingTokenPosition = this.getSentencePositionStart() - 1; - for (SyntacticWord word : sentence.getSyntacticWords()) { - int lastWordPosition = word.getSentencePositionEnd(); - if (precedingTokenPosition == lastWordPosition && - (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { - return true; - } - } - return false; - } - - public 
SyntacticGroup getNextNG() { - Sentence sentence = this.tokens.get(0).getSentence(); - int thisGroupEnd = this.getSentencePositionEnd(); - int sentenceLength = sentence.size(); - - SyntacticGroup nextNG = null; - for (int step = thisGroupEnd; step < sentenceLength; step++) { - nextNG = sentence.getFirstGroup(step, sentenceLength); - if (nextNG != null && nextNG.type.startsWith("NG") && - this.getSentencePositionEnd() < nextNG.getSentencePositionStart()) { - break; - } else { - nextNG = null; - } - } - return nextNG; - } - public SyntacticGroup getFollowingGroup() { SyntacticGroup largestGroup = null; Sentence sentence = this.tokens.get(0).getSentence(); - int nextTokenPosition = this.getSentencePositionEnd() + 1; + int nextTokenPosition = this.getSentenceEndPosition() + 1; for (SyntacticGroup group : sentence.getGroups()) { - if ((group.getType().startsWith("PrepNG") || group.getType().startsWith("NG")) && - group.getSentencePositionStart() == nextTokenPosition) { + if ((group.getType().equals("PrepNG") || group.getType().startsWith("NG")) && + group.getSentenceStartPosition() == nextTokenPosition) { if (largestGroup == null || largestGroup.getTokens().size() < group.getTokens().size()) { largestGroup = group; @@ -178,14 +144,14 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { } public SyntacticWord getPrecedingVerb() { - int precedingTokenPosition = this.getSentencePositionStart() - 1; + int precedingTokenPosition = this.getSentenceStartPosition() - 1; Sentence sentence = this.tokens.get(0).getSentence(); if(this.isPartOfPrepNG()) { SyntacticGroup parentNGGroup = this.getParentPrepNG(); - precedingTokenPosition = parentNGGroup.getSentencePositionStart() - 1; + precedingTokenPosition = parentNGGroup.getSentenceStartPosition() - 1; } for (SyntacticWord word : sentence.getSyntacticWords()) { - int lastWordPosition = word.getSentencePositionEnd(); + int lastWordPosition = word.getSentenceEndPosition(); if (precedingTokenPosition == 
lastWordPosition && (word.getCtag().equals("Verbfin") || word.getCtag().equals("Inf"))) { return word; @@ -195,13 +161,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { } private boolean isPartOfPrepNG() { - int NGGroupStart = this.getSentencePositionStart(); - int NGGroupEnd = this.getSentencePositionEnd(); + int NGGroupStart = this.getSentenceStartPosition(); + int NGGroupEnd = this.getSentenceEndPosition(); Sentence sentence = this.tokens.get(0).getSentence(); for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("PrepNG") && - group.getSentencePositionStart() <= NGGroupStart && - group.getSentencePositionEnd() >= NGGroupEnd) { + if (group.getType().equals("PrepNG") && + group.getSentenceStartPosition() <= NGGroupStart && + group.getSentenceEndPosition() >= NGGroupEnd) { return true; } } @@ -210,13 +176,13 @@ public class SyntacticGroup implements Comparable<SyntacticGroup> { private SyntacticGroup getParentPrepNG() { SyntacticGroup parentPrepNG = null; - int NGGroupStart = this.getSentencePositionStart(); - int NGGroupEnd = this.getSentencePositionEnd(); + int NGGroupStart = this.getSentenceStartPosition(); + int NGGroupEnd = this.getSentenceEndPosition(); Sentence sentence = this.tokens.get(0).getSentence(); for (SyntacticGroup group : sentence.getGroups()) { - if (group.getType().startsWith("PrepNG") && - group.getSentencePositionStart() <= NGGroupStart && - group.getSentencePositionEnd() >= NGGroupEnd) { + if (group.getType().equals("PrepNG") && + group.getSentenceStartPosition() <= NGGroupStart && + group.getSentenceEndPosition() >= NGGroupEnd) { if (parentPrepNG == null || group.getTokens().size() > parentPrepNG.getTokens().size()) { parentPrepNG = group; } diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java index aa80dec..14178b6 100644 --- 
a/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java @@ -4,6 +4,8 @@ import java.util.ArrayList; import java.util.Iterator; import java.util.List; +import pl.waw.ipipan.zil.core.md.detection.Constants; + public class SyntacticWord implements Comparable<SyntacticWord> { private String base; @@ -22,6 +24,14 @@ public class SyntacticWord implements Comparable<SyntacticWord> { public String getCtag() { return ctag; } + + public String getBase() { + return base; + } + + public String getOrth() { + return orth; + } public List<Token> getTokens() { return tokens; @@ -45,33 +55,18 @@ public class SyntacticWord implements Comparable<SyntacticWord> { return getCtag().compareTo(o.getCtag()); } - public int getSentencePositionStart() { + public int getSentenceStartPosition() { Token startToken = tokens.get(0); return startToken.getSentencePosition(); } - public int getSentencePositionEnd() { + public int getSentenceEndPosition() { Token endToken = tokens.get(tokens.size()-1); return endToken.getSentencePosition(); } - public String getBase() { - return this.base; - } - - public String getOrth() { - return this.orth; - } - public boolean isVerb() { - if (this.ctag.equals("Verbfin") || this.ctag.equals("Inf")) { - return true; - } - return false; - } - - public boolean isInterp() { - if (this.ctag.equals("Interp")) { + if (Constants.VERB_CTAGS.contains(this.ctag)) { return true; } return false; diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java index 255a056..e3216a5 100644 --- a/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java +++ b/src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java @@ -70,7 +70,6 @@ public class TeiLoader { for (TEIMorph mo : m.getHeadMorphs()) headTokens.add(teiMorph2Segment.get(mo)); s.addMention(new Mention(tokens, headTokens, m.isZeroSubject())); - 
System.out.println(tokens.toString()); } private static void loadSyntacticGroup(Sentence s, TEIGroup g, -- libgit2 0.22.2