Added new mention detection rules based on the Walenty valence dictionary.

Bartłomiej Nitoń
1 parent 3682bbf2
Showing 9 changed files with 1166 additions and 13 deletions
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
src/main/java/pl/waw/ipipan/zil/core/md/detection/Cleaner.java
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
src/main/java/pl/waw/ipipan/zil/core/md/entities/Sentence.java
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticWord.java
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
@@ -2,6 +2,7 @@ package pl.waw.ipipan.zil.core.md;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
 import pl.waw.ipipan.zil.core.md.detection.Detector;
 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
 import pl.waw.ipipan.zil.core.md.entities.Text;
@@ -15,10 +16,16 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
 import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
+import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.EnumMap;
+import java.util.HashMap;
+import java.util.Map;
 public class Main {
@@ -26,12 +33,125 @@ public class Main {
     private static final boolean GZIP_OUTPUT = true;
     private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
+    private static final String DEFAULT_VERBS_VALENCE = "/walenty_20170117_verbs_all.txt";
+    private static final String DEFAULT_NOUNS_VALENCE = "/walenty_20170117_nouns_all.txt";
+    private static final String COMPLEX_PREPS = "/complex_preps_Walenty_USJP.txt";
     private static ZeroSubjectDetector zeroSubjectModel;
+    
+    /**
+     * Names the Walenty valence dictionaries whose schemata are kept in memory.
+     */
+    public enum ValenceDicts {
+        /** Key for the schemata read from the verbs dictionary resource. */
+        VerbsValence,
+        /** Key for the schemata read from the nouns dictionary resource. */
+        NounsValence
+    }
+    
+    /** Walenty schemata: first keyed by source dictionary, then by lemma. */
+    private static final Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
+            new EnumMap<ValenceDicts,Map<String,ArrayList<String>>>(ValenceDicts.class);
+
+    /** Values read from the complex prepositions resource; assigned in the static initializer. */
+    private static final ArrayList<String> complexPreps;
     static {
+        // Runs once at class initialization and loads every bundled classpath resource.
+        // NOTE(review): getResourceAsStream returns null for a missing resource, which
+        // the readers below do not check for - confirm the files are always packaged.
         InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
         zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
+        
+        // Verb schemata, stored under ValenceDicts.VerbsValence.
+        InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
+        valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));
+        
+        // Noun schemata, stored under ValenceDicts.NounsValence.
+        InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
+        valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
+        
+        // Plain list of complex prepositions, one value per non-comment line.
+        InputStream complexPrepositionsStream = Main.class.getResourceAsStream(COMPLEX_PREPS);
+        complexPreps = readValues(complexPrepositionsStream);
+    }
+    
+    
+    /**
+     * Reads a Walenty schemata file and groups its schemata by lemma.
+     *
+     * Lines starting with '%' are skipped. Every other line is split on ':'; the first
+     * field is the lemma and the sixth field is the schema. Lines with fewer than six
+     * fields or with an empty schema are skipped. A single-word lemma whose schema
+     * lists the "refl" or "recip" phrase type is stored with " się" appended.
+     *
+     * @param walentySchemataStream stream with the dictionary text; decoded as UTF-8
+     *                              and closed before this method returns
+     * @return map from lemma to its schemata, in the order they appear in the stream
+     * @throws RuntimeException wrapping the IOException when reading fails
+     */
+    public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream)
+    {
+        Map<String,ArrayList<String>> map = new HashMap<String,ArrayList<String>>();
+        try {
+            // Decode explicitly as UTF-8: the BOM removal below and the non-ASCII lemmas
+            // must not depend on whatever the platform default charset happens to be.
+            BufferedReader br = new BufferedReader(
+                    new InputStreamReader(walentySchemataStream, "UTF-8"));
+            try {
+                String line;
+                boolean firstLine = true;
+                while ((line = br.readLine()) != null) {
+                    if (firstLine) {
+                        line = line.replace("\uFEFF", ""); // remove BOM character
+                        firstLine = false;
+                    }
+                    if (line.startsWith("%")) {
+                        continue;
+                    }
+
+                    String[] lineParts = line.split(":");
+                    if (lineParts.length < 6) {
+                        // Blank or malformed line: there is no schema field to read.
+                        continue;
+                    }
+                    String lemma = lineParts[0].trim();
+                    String schema = lineParts[5].trim();
+                    if (schema.isEmpty()) {
+                        continue;
+                    }
+
+                    if (lemma.split(" ").length == 1 && schemaContainsSie(schema)) {
+                        lemma = lemma + " się";
+                    }
+
+                    ArrayList<String> schemata = map.get(lemma);
+                    if (schemata == null) {
+                        schemata = new ArrayList<String>();
+                        map.put(lemma, schemata);
+                    }
+                    schemata.add(schema);
+                }
+            } finally {
+                // Release the reader even when a read fails half-way through.
+                br.close();
+            }
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+        return map;
+    }
+    
+    /**
+     * Tells whether any position of a schema lists the "refl" or "recip" phrase type.
+     *
+     * The schema is split into positions on " + "; each position loses its first and
+     * last character and is then split on ';' into phrase types.
+     * NOTE(review): presumably those two characters are the braces around a position -
+     * confirm against the dictionary format.
+     *
+     * @param schema schema text of one dictionary entry
+     * @return true when "refl" or "recip" is one of the phrase types
+     */
+    private static boolean schemaContainsSie(String schema) {
+        for (String position : schema.split("\\s\\+\\s")) {
+            position = position.trim();
+            if (position.length() < 2) {
+                // Too short to hold both delimiters; substring below would throw.
+                continue;
+            }
+            position = position.substring(1, position.length() - 1);
+            for (String phrT : position.split(";")) {
+                if (phrT.equals("refl") || phrT.equals("recip")) {
+                    return true;
+                }
+            }
+        }
+
+        return false;
+    }
+    
+    /**
+     * Reads a list of values, one per line.
+     *
+     * Lines starting with '%' are skipped; the remaining lines are trimmed and kept
+     * when non-empty. A BOM character on the first line is removed.
+     *
+     * @param stream stream with the values; decoded as UTF-8 and closed before this
+     *               method returns
+     * @return the values in the order they appear in the stream
+     * @throws RuntimeException wrapping the IOException when reading fails
+     */
+    public static ArrayList<String> readValues(InputStream stream) {
+        ArrayList<String> values = new ArrayList<String>();
+        try {
+            // Decode explicitly as UTF-8 so the BOM removal and any non-ASCII values do
+            // not depend on the platform default charset.
+            BufferedReader br = new BufferedReader(new InputStreamReader(stream, "UTF-8"));
+            try {
+                String line;
+                boolean firstLine = true;
+                while ((line = br.readLine()) != null) {
+                    if (firstLine) {
+                        line = line.replace("\uFEFF", ""); // remove BOM character
+                        firstLine = false;
+                    }
+                    if (line.startsWith("%")) {
+                        continue;
+                    }
+                    String value = line.trim();
+                    if (!value.isEmpty()) {
+                        values.add(value);
+                    }
+                }
+            } finally {
+                // Release the reader even when a read fails half-way through.
+                br.close();
+            }
+        } catch (IOException ex) {
+            throw new RuntimeException(ex);
+        }
+        return values;
     }
     private Main() {
@@ -71,6 +191,8 @@ public class Main {
                 return;
             }
         }
+        
+        
         int all = 0;
         int errors = 0;
@@ -122,7 +244,7 @@ public class Main {
      */
     public static void annotateThriftText(TText thriftText) throws MultiserviceException {
         Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
-        Detector.findMentionsInText(responseText, zeroSubjectModel);
+        // Detection also receives the loaded valence schemata and complex prepositions.
+        Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
         ThriftSaver.updateThriftText(responseText, thriftText);
     }
@@ -135,7 +257,7 @@ public class Main {
      */
     public static void annotateTeiText(TEICorpusText teiText) throws TEIException {
         Text responseText = TeiLoader.loadTextFromTei(teiText);
-        Detector.findMentionsInText(responseText, zeroSubjectModel);
+        // Detection also receives the loaded valence schemata and complex prepositions.
+        Detector.findMentionsInText(responseText, zeroSubjectModel, valence, complexPreps);
         TeiSaver.updateTeiText(responseText, teiText);
     }
 package pl.waw.ipipan.zil.core.md.detection;
+import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
 import pl.waw.ipipan.zil.core.md.entities.Mention;
 import pl.waw.ipipan.zil.core.md.entities.Sentence;
+import pl.waw.ipipan.zil.core.md.entities.SyntacticGroup;
+import pl.waw.ipipan.zil.core.md.entities.SyntacticWord;
 import pl.waw.ipipan.zil.core.md.entities.Token;
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 public class Cleaner {
@@ -125,4 +130,157 @@ public class Cleaner {
         else
             return m1;
     }
+    
+    /**
+     * Removes mentions that span two different syntactic groups when a schema of the
+     * verb returned by getPrecedingVerb() for the first group matches both groups.
+     * Mentions are kept when that verb is missing, is "mieć", or is absent from the
+     * valence map.
+     *
+     * @param sentence     sentence whose mentions are filtered in place
+     * @param verbsValence verb base form mapped to its Walenty schemata
+     */
+    public static void cleanWalentyFramedMentions(Sentence sentence,
+            Map<String,ArrayList<String>> verbsValence) {
+        ArrayList<Mention> framed = new ArrayList<Mention>();
+        for (Mention candidate : sentence.getMentions()) {
+            int from = candidate.getFirstSegment().getSentencePosition();
+            int to = candidate.getLastSegment().getSentencePosition();
+            SyntacticGroup firstGroup = sentence.getFirstGroup(from, to);
+            SyntacticGroup lastGroup = sentence.getLastGroup(from, to);
+
+            // Only mentions stretching over two distinct groups are of interest.
+            if (firstGroup == null || lastGroup == null
+                    || firstGroup.compareTo(lastGroup) == 0) {
+                continue;
+            }
+
+            SyntacticWord governor = firstGroup.getPrecedingVerb();
+            if (governor == null || governor.getBase().equals("mieć")
+                    || !verbsValence.containsKey(governor.getBase())) {
+                continue;
+            }
+
+            ArrayList<String> firstTypes = firstGroup.getWalentyRealizations();
+            ArrayList<String> lastTypes = lastGroup.getWalentyRealizations();
+            for (String schema : verbsValence.get(governor.getBase())) {
+                if (isProperSchema(schema, firstTypes, lastTypes)) {
+                    framed.add(candidate);
+                    break;
+                }
+            }
+        }
+
+        // Removal is deferred so the mention collection is not modified while iterated.
+        for (Mention doomed : framed) {
+            sentence.removeMention(doomed);
+        }
+    }
+    
+    /*private static void removeWalentyFramedMentions(Sentence sentence, 
+    		ArrayList<Mention> mentions, 
+    		ArrayList<String> schemata) {
+    	ArrayList<Mention> mentionsToRemove = new ArrayList<Mention>();
+    	for (Mention mention : mentions) {
+        	int mentionStart = mention.getFirstSegment().getSentencePosition();
+        	int mentionEnd = mention.getLastSegment().getSentencePosition();
+        	SyntacticGroup startGroup = sentence.getFirstGroup(mentionStart, mentionEnd);
+        	SyntacticGroup endGroup = sentence.getLastGroup(mentionStart, mentionEnd);
+        	if (startGroup != null && endGroup != null 
+        			&& startGroup.compareTo(endGroup) != 0) {
+        		ArrayList<String> startGroupRealizations = startGroup.getWalentyRealizations();
+        		ArrayList<String> endGroupRealizations = endGroup.getWalentyRealizations();
+        		for (String schema : schemata) {
+					if (isProperSchema(schema, startGroupRealizations, endGroupRealizations)) {
+						mentionsToRemove.add(mention);
+						break;
+					}
+				}
+        	}
+        }
+    	
+    	for (Mention mentionToRemove : mentionsToRemove) {
+    		sentence.removeMention(mentionToRemove);
+    	}
+    }*/
+    
+    /**
+     * Checks whether the schema matches any pairing of a type from the first list
+     * with a type from the second list, as decided by schemaContains.
+     */
+    private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
+            ArrayList<String> group2Types) {
+        for (int i = 0; i < group1Types.size(); i++) {
+            String first = group1Types.get(i);
+            for (int j = 0; j < group2Types.size(); j++) {
+                if (schemaContains(schema, first, group2Types.get(j))) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Checks whether the schema offers the two phrase types in two different positions.
+     *
+     * The schema is split into positions on '+'; each trimmed position loses its first
+     * and last character and is split on ';' into phrase types. Every position is
+     * examined for both types, so a position listing both of them can serve either
+     * one, and two equal types need two separate positions.
+     *
+     * @param schema      schema text of one dictionary entry
+     * @param phraseType1 phrase type of the first group
+     * @param phraseType2 phrase type of the second group
+     * @return true when distinct positions can be assigned to the two phrase types
+     */
+    private static boolean schemaContains(String schema, String phraseType1,
+            String phraseType2) {
+        int onlyFirst = 0;  // positions listing phraseType1 but not phraseType2
+        int onlySecond = 0; // positions listing phraseType2 but not phraseType1
+        int both = 0;       // positions listing both phrase types
+        for (String position : schema.split("\\+")) {
+            position = position.trim();
+            if (position.length() < 2) {
+                // Too short to hold both delimiters; substring below would throw.
+                continue;
+            }
+            position = position.substring(1, position.length() - 1);
+
+            boolean hasFirst = false;
+            boolean hasSecond = false;
+            for (String phrT : position.split(";")) {
+                if (phrT.equals(phraseType1)) {
+                    hasFirst = true;
+                }
+                if (phrT.equals(phraseType2)) {
+                    hasSecond = true;
+                }
+            }
+
+            if (hasFirst && hasSecond) {
+                both++;
+            } else if (hasFirst) {
+                onlyFirst++;
+            } else if (hasSecond) {
+                onlySecond++;
+            }
+        }
+        return (onlyFirst > 0 && onlySecond > 0)
+                || (both > 0 && onlyFirst + onlySecond > 0)
+                || both > 1;
+    }
+    
+    
+    // Detects that e.g. "wszystkim" is part of "przede wszystkim" (a Qub particle)
+    // and removes every mention for which isPartOfQub() holds.
+    public static void cleanQubs(Sentence sentence) {
+        ArrayList<Mention> insideQub = new ArrayList<Mention>();
+        for (Mention candidate : sentence.getMentions()) {
+            if (!candidate.isPartOfQub()) {
+                continue;
+            }
+            insideQub.add(candidate);
+        }
+
+        // Removal is deferred so the mention collection is not modified while iterated.
+        for (int i = 0; i < insideQub.size(); i++) {
+            sentence.removeMention(insideQub.get(i));
+        }
+    }
+    
+    /**
+     * Removes from the sentence every mention for which isPartOfPrep() holds.
+     *
+     * @param sentence sentence whose mentions are filtered in place
+     */
+    public static void cleanPreps(Sentence sentence) {
+        ArrayList<Mention> insidePrep = new ArrayList<Mention>();
+        for (Mention candidate : sentence.getMentions()) {
+            if (!candidate.isPartOfPrep()) {
+                continue;
+            }
+            insidePrep.add(candidate);
+        }
+
+        // Removal is deferred so the mention collection is not modified while iterated.
+        for (int i = 0; i < insidePrep.size(); i++) {
+            sentence.removeMention(insidePrep.get(i));
+        }
+    }
+    
+    /**
+     * Removes from the sentence every mention for which isPartOfFrazeo() holds.
+     *
+     * @param sentence sentence whose mentions are filtered in place
+     */
+    public static void cleanFrazeos(Sentence sentence) {
+        ArrayList<Mention> insideFrazeo = new ArrayList<Mention>();
+        for (Mention candidate : sentence.getMentions()) {
+            if (!candidate.isPartOfFrazeo()) {
+                continue;
+            }
+            insideFrazeo.add(candidate);
+        }
+
+        // Removal is deferred so the mention collection is not modified while iterated.
+        for (int i = 0; i < insideFrazeo.size(); i++) {
+            sentence.removeMention(insideFrazeo.get(i));
+        }
+    }
+    
+    // Removes mentions that are part of complex prepositions, i.e. every mention
+    // for which isPartOfComplexPrep(complexPreps) holds.
+    public static void cleanComplexPreps(Sentence sentence,
+            ArrayList<String> complexPreps) {
+        ArrayList<Mention> insideComplexPrep = new ArrayList<Mention>();
+        for (Mention candidate : sentence.getMentions()) {
+            if (!candidate.isPartOfComplexPrep(complexPreps)) {
+                continue;
+            }
+            insideComplexPrep.add(candidate);
+        }
+
+        // Removal is deferred so the mention collection is not modified while iterated.
+        for (int i = 0; i < insideComplexPrep.size(); i++) {
+            sentence.removeMention(insideComplexPrep.get(i));
+        }
+    }
+    
 }
@@ -2,12 +2,15 @@ package pl.waw.ipipan.zil.core.md.detection;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
+
+import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
 import pl.waw.ipipan.zil.core.md.entities.*;
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 public class Detector {
@@ -18,21 +21,25 @@ public class Detector {
     }
+    // Clears the mentions already stored on the text, then runs mention detection on
+    // every sentence of every paragraph, handing the zero subject model, the valence
+    // schemata and the complex prepositions down to the per-sentence detection.
     public static void findMentionsInText(Text text,
-                                          ZeroSubjectDetector zeroSubjectModel) {
+                                          ZeroSubjectDetector zeroSubjectModel,
+                                          Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+                                          ArrayList<String> complexPreps) {
         text.clearMentions();
         logger.debug("Detecting mentions in text " + text.getId());
         for (Paragraph p : text)
             for (Sentence s : p)
-                detectMentionsInSentence(s, zeroSubjectModel);
+                detectMentionsInSentence(s, zeroSubjectModel, valence, complexPreps);
     }
     private static void detectMentionsInSentence(Sentence sentence,
-                                                 ZeroSubjectDetector zeroSubjectModel) {
+                                                 ZeroSubjectDetector zeroSubjectModel,
+                                                 Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+                                                 ArrayList<String> complexPreps) {
         // adding mentions
         addMentionsByTokenCtag(sentence);
         addMentionsBySyntacticWordsCtag(sentence);
         addMentionsByNamedEntities(sentence);
-        addMentionsByGroups(sentence);
+        addMentionsByGroups(sentence, valence, complexPreps);
         addSpeakerMentionsInSpoken(sentence);
         // zero subject detection
@@ -41,6 +48,11 @@ public class Detector {
         // removing mentions
         removeTo(sentence);
         Cleaner.cleanUnnecessarySentenceMentions(sentence);
+        //Cleaner.cleanQubs(sentence);
+        //Cleaner.cleanPreps(sentence);
+        //Cleaner.cleanComplexPreps(sentence, complexPreps);
+        Cleaner.cleanFrazeos(sentence);
+        Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
         // updating mention heads
         updateMentionHeads(sentence);
@@ -95,16 +107,468 @@ public class Detector {
      *
      * @param sentence
      */
-    private static void addMentionsByGroups(Sentence sentence) {
-        for (SyntacticGroup group : sentence.getGroups()) {
+    private static void addMentionsByGroups(Sentence sentence, 
+    		Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
+    		ArrayList<String> complexPreps) {
+    	// For every group whose type starts with "NG" exactly one mention is added:
+    	// NG + following PrepNG or NG + following NG when the nouns valence schemata
+    	// accept the pair, otherwise the NG alone. The heads always come from the NG.
+    	// complexPreps is only referenced by the commented-out experiments below.
+    	List<SyntacticGroup> groups = sentence.getGroups();
+    	for (int i = 0; i < groups.size(); i++) {
+    		SyntacticGroup thisGroup = groups.get(i);
+    		
+    		/*SyntacticGroup nearPrepNG = null;
+    		SyntacticGroup nextNG = null;*/
+    		
+    		SyntacticGroup nextGroup = thisGroup.getFollowingGroup();
+    		
+    		/*if (thisGroup.getType().startsWith("NG")) {
+    			nearPrepNG = getFollowingPrepNGs(thisGroup.getSentencePositionEnd(),
+	    	    		sentence);
+    			nextNG = thisGroup.getNextNG();
+    		}*/
+    		
+    		/*if (nextNG != null) {
+    			int prepStart = thisGroup.getSentencePositionEnd() + 1;
+    	    	int prepEnd = nextNG.getSentencePositionStart() - 1;
+    	    	String prep = sentence.getTextInsideSpan(prepStart, prepEnd);
+    	    	if (complexPreps.contains(prep)) {
+    	    		String cos = "";
+    	    	}
+    		}*/
+    		
+    		/*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null && 
+    				//!isPartOfPrepNG(thisGroup, sentence) &&
+    				//getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
+    				precedingWordIsVerb(thisGroup, sentence) &&
+    				//!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
+    				!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
+    				!sameSemanticHeads(thisGroup, nearPrepNG)) {
+    			List<Token> heads = thisGroup.getSemanticHeadTokens();
+    			List<Token> segments = thisGroup.getTokens();
+    			segments.addAll(nearPrepNG.getTokens());
+    			
+    			sentence.addMention(new Mention(segments, heads));
+    		}*/
+    		/*if (thisGroup.getType().startsWith("NG") && nearPrepNG != null &&
+    			//	!precedingWordIsVerb(thisGroup, sentence) && 
+    				!isPartOfPrepNG(thisGroup, sentence) &&
+    				getFollowingPrepNGs(nearPrepNG.getSentencePositionEnd(), sentence) == null &&
+    				//!precedingWordIsVerb(thisGroup, sentence) &&
+    				!precedingVerbModifyPrepNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
+    				//!precedingVerbModifyNG(thisGroup, nearPrepNG, sentence, valence.get(ValenceDicts.VerbsValence)) &&
+    				!sameSemanticHeads(thisGroup, nearPrepNG)) {
+    			List<Token> heads = thisGroup.getSemanticHeadTokens();
+    			List<Token> segments = thisGroup.getTokens();
+    			segments.addAll(nearPrepNG.getTokens());
+    			
+    			sentence.addMention(new Mention(segments, heads));
+    		}*/
+    		if (thisGroup.getType().startsWith("NG") && 
+    				nextGroup != null && nextGroup.getType().startsWith("PrepNG") &&
+    				NGPrepNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))) {
+    			List<Token> heads = thisGroup.getSemanticHeadTokens();
+    			List<Token> segments = new ArrayList<Token>();
+    			segments.addAll(thisGroup.getTokens());
+    			segments.addAll(nextGroup.getTokens());
+    			
+    			sentence.addMention(new Mention(segments, heads));
+    		} else if (thisGroup.getType().startsWith("NG") && nextGroup != null && 
+    				nextGroup.getType().startsWith("NG") &&
+    				NGNGValenceCompatibility(thisGroup, nextGroup, sentence, valence.get(ValenceDicts.NounsValence))
+    				) {
+    			List<Token> heads = thisGroup.getSemanticHeadTokens();
+    			List<Token> segments = new ArrayList<Token>();
+    			segments.addAll(thisGroup.getTokens());
+    			segments.addAll(nextGroup.getTokens());
+    			
+    			sentence.addMention(new Mention(segments, heads));
+    		} /*else if (thisGroup.getType().startsWith("NG") && nextNG != null && nearPrepNG == null &&
+    				NGcomplexPrepNGValenceCompatibility(thisGroup, nextNG, sentence, valence.get(ValenceDicts.NounsValence))) {
+    			List<Token> heads = thisGroup.getSemanticHeadTokens();
+    			
+    			List<Token> segments = new ArrayList<Token>();
+    			segments.addAll(thisGroup.getTokens());
+    			
+    	    	int prepStart = thisGroup.getSentencePositionEnd() + 1;
+    	    	int prepEnd = nextNG.getSentencePositionStart() - 1;
+    	    	ArrayList<Token> prepSegments = sentence.getSegmentsInsideSpan(prepStart, prepEnd);
+    	    	segments.addAll(prepSegments);
+    			
+    			segments.addAll(nextNG.getTokens());
+    			
+    			sentence.addMention(new Mention(segments, heads));
+    		}*/
+    		//else if // NG + im./pt. NG
+    		// dates are not annotated, e.g. "21 kwietnia" etc., so what about this grammar?
+    		// why is "instytut naukowy w montrealu" not merged? ==> NE(orgName) + w(prep) + NE(placeName)
+    		else if (thisGroup.getType().startsWith("NG")) {
+                List<Token> segments = thisGroup.getTokens();
+                List<Token> heads = thisGroup.getSemanticHeadTokens();
+
+                sentence.addMention(new Mention(segments, heads));
+            }
+    	}
+    	
+       // original version
+       /*for (SyntacticGroup group : sentence.getGroups()) {
             if (group.getType().startsWith("NG")) {
                 List<Token> segments = group.getTokens();
                 List<Token> heads = group.getSemanticHeadTokens();
                 sentence.addMention(new Mention(segments, heads));
             }
-        }
+        }*/
+    }
+    
+    /**
+     * Tells whether a syntactic word with ctag "Inf" starts on the position right
+     * after the end of the given group.
+     */
+    private static boolean followingWordIsInf(SyntacticGroup group,
+            Sentence sentence) {
+        int expectedStart = group.getSentencePositionEnd() + 1;
+        for (SyntacticWord candidate : sentence.getSyntacticWords()) {
+            if (candidate.getSentencePositionStart() != expectedStart) {
+                continue;
+            }
+            if (candidate.getCtag().equals("Inf")) {
+                return true;
+            }
+        }
+
+        return false;
+    }
+    
+    /**
+     * Finds the PrepNG group that starts right after the given sentence position.
+     * When several qualify, the first one with the most tokens wins.
+     *
+     * @return the chosen group, or null when no PrepNG starts there
+     */
+    private static SyntacticGroup getFollowingPrepNGs(int sentencePosition,
+            Sentence sentence) {
+        int wantedStart = sentencePosition + 1;
+        SyntacticGroup best = null;
+        for (SyntacticGroup candidate : sentence.getGroups()) {
+            if (!candidate.getType().startsWith("PrepNG")) {
+                continue;
+            }
+            if (candidate.getSentencePositionStart() != wantedStart) {
+                continue;
+            }
+            if (best == null
+                    || best.getTokens().size() < candidate.getTokens().size()) {
+                best = candidate;
+            }
+        }
+        return best;
+    }
+    
+    /**
+     * Tells whether some PrepNG group of the sentence covers the whole span of the
+     * given group.
+     */
+    private static boolean isPartOfPrepNG(SyntacticGroup NGGroup,
+            Sentence sentence) {
+        int innerStart = NGGroup.getSentencePositionStart();
+        int innerEnd = NGGroup.getSentencePositionEnd();
+        for (SyntacticGroup candidate : sentence.getGroups()) {
+            if (!candidate.getType().startsWith("PrepNG")) {
+                continue;
+            }
+            if (candidate.getSentencePositionStart() > innerStart) {
+                continue;
+            }
+            if (candidate.getSentencePositionEnd() >= innerEnd) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether a syntactic word tagged "Verbfin" or "Inf" ends right before the
+     * group, or right before its enclosing PrepNG when the group lies inside one.
+     */
+    private static boolean precedingWordIsVerb(SyntacticGroup group,
+            Sentence sentence) {
+        int positionBefore = group.getSentencePositionStart() - 1;
+        if (isPartOfPrepNG(group, sentence)) {
+            // Measure from the enclosing PrepNG instead of the group itself.
+            positionBefore = getParentPrepNG(group, sentence).getSentencePositionStart() - 1;
+        }
+
+        for (SyntacticWord candidate : sentence.getSyntacticWords()) {
+            if (candidate.getSentencePositionEnd() != positionBefore) {
+                continue;
+            }
+            if (candidate.getCtag().equals("Verbfin") || candidate.getCtag().equals("Inf")) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    // Open questions: will "sie" in the lemma agree between Walenty and Spejd?
+    // Can a prep consist of more than one segment?
+    // Match refl and recip to the Spejd "sie".
+    //
+    // Returns true when a Verbfin/Inf word ending right before NGGroup is either
+    // missing from walentyMapping or has a schema whose text contains the phrase type
+    // built from the first word of PrepNGGroup.
+    private static boolean precedingVerbModifyPrepNG(SyntacticGroup NGGroup,
+            SyntacticGroup PrepNGGroup, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        int positionBefore = NGGroup.getSentencePositionStart() - 1;
+        for (SyntacticWord word : sentence.getSyntacticWords()) {
+            if (word.getSentencePositionEnd() != positionBefore) {
+                continue;
+            }
+            if (!word.getCtag().equals("Verbfin") && !word.getCtag().equals("Inf")) {
+                continue;
+            }
+
+            String verb = word.getBase();
+            if (!walentyMapping.containsKey(verb)) {
+                return true;
+            }
+
+            SyntacticWord prepWord = PrepNGGroup.getFirstWord();
+            int prepLength = prepWord.getTokens().size();
+            if (prepLength == 1) {
+                // Simple preposition: try every prepnp variant produced by getPrepnps.
+                String prepBase = prepWord.getTokens().get(0).getBase();
+                // TODO: check whether the head can consist of several tokens
+                String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
+                ArrayList<String> prepnps = getPrepnps(prepBase, prepCase);
+
+                for (String schema : walentyMapping.get(verb)) {
+                    for (String prepnp : prepnps) {
+                        if (schema.contains(prepnp)) {
+                            return true;
+                        }
+                    }
+                }
+            } else if (prepLength > 1) {
+                // Multi-token preposition: looked up as comprepnp(<lower-cased orth>).
+                String comprepnp = String.format("comprepnp(%s)",
+                        prepWord.getOrth().toLowerCase());
+                for (String schema : walentyMapping.get(verb)) {
+                    if (schema.contains(comprepnp)) {
+                        return true;
+                    }
+                }
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether a Verbfin/Inf word ending right before NGGroup (or before its
+     * enclosing PrepNG) passes verbModifyNG or is missing from walentyMapping.
+     */
+    private static boolean precedingVerbModifyNG(SyntacticGroup NGGroup,
+            SyntacticGroup PrepNGGroup, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        int positionBefore = NGGroup.getSentencePositionStart() - 1;
+        if (isPartOfPrepNG(NGGroup, sentence)) {
+            // Measure from the enclosing PrepNG instead of the group itself.
+            positionBefore = getParentPrepNG(NGGroup, sentence).getSentencePositionStart() - 1;
+        }
+
+        for (SyntacticWord word : sentence.getSyntacticWords()) {
+            if (word.getSentencePositionEnd() != positionBefore) {
+                continue;
+            }
+            if (!word.getCtag().equals("Verbfin") && !word.getCtag().equals("Inf")) {
+                continue;
+            }
+            if (verbModifyNG(word, NGGroup, PrepNGGroup, sentence, walentyMapping)) {
+                return true;
+            }
+            if (!walentyMapping.containsKey(word.getBase())) {
+                return true;
+            }
+        }
+        return false;
     }
+    
+    /**
+     * Tells whether the verb is missing from walentyMapping or has a schema matching
+     * the Walenty realizations of both groups. When NGGroup lies inside a PrepNG,
+     * the realizations of that enclosing PrepNG are used in its place.
+     */
+    private static boolean verbModifyNG(SyntacticWord verb, SyntacticGroup NGGroup,
+            SyntacticGroup PrepNGGroup, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        String verbBase = verb.getBase();
+        if (!walentyMapping.containsKey(verbBase)) {
+            return true;
+        }
+        ArrayList<String> schemata = walentyMapping.get(verbBase);
+
+        // PrepNG + PrepNG when the NG is nested in a PrepNG, NG + PrepNG otherwise.
+        SyntacticGroup firstGroup = isPartOfPrepNG(NGGroup, sentence)
+                ? getParentPrepNG(NGGroup, sentence)
+                : NGGroup;
+        ArrayList<String> firstRealizations = firstGroup.getWalentyRealizations();
+        ArrayList<String> secondRealizations = PrepNGGroup.getWalentyRealizations();
+
+        for (String schema : schemata) {
+            if (isProperSchema(schema, firstRealizations, secondRealizations)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether the schema text contains, as plain substrings, at least one type
+     * from the first list and at least one type from the second list.
+     */
+    private static boolean isProperSchema(String schema, ArrayList<String> group1Types,
+            ArrayList<String> group2Types) {
+        for (int i = 0; i < group1Types.size(); i++) {
+            if (!schema.contains(group1Types.get(i))) {
+                continue;
+            }
+            for (int j = 0; j < group2Types.size(); j++) {
+                if (schema.contains(group2Types.get(j))) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Finds the PrepNG group covering the whole span of the given group. When several
+     * qualify, the first one with the most tokens wins.
+     *
+     * @return the enclosing PrepNG, or null when there is none
+     */
+    private static SyntacticGroup getParentPrepNG(SyntacticGroup NGGroup,
+            Sentence sentence) {
+        int innerStart = NGGroup.getSentencePositionStart();
+        int innerEnd = NGGroup.getSentencePositionEnd();
+        SyntacticGroup widest = null;
+        for (SyntacticGroup candidate : sentence.getGroups()) {
+            if (!candidate.getType().startsWith("PrepNG")) {
+                continue;
+            }
+            if (candidate.getSentencePositionStart() > innerStart) {
+                continue;
+            }
+            if (candidate.getSentencePositionEnd() < innerEnd) {
+                continue;
+            }
+            if (widest == null
+                    || candidate.getTokens().size() > widest.getTokens().size()) {
+                widest = candidate;
+            }
+        }
+        return widest;
+    }
+    
+    /**
+     * Tells whether some schema of the NG head's base form lists the phrase type
+     * built from the first word of PrepNGGroup: prepnp(base,case) for a one-token
+     * preposition, comprepnp(lower-cased orth) for a longer one.
+     *
+     * @return false when the head's base form is not in walentyMapping
+     */
+    private static boolean NGPrepNGValenceCompatibility(SyntacticGroup NGGroup,
+            SyntacticGroup PrepNGGroup, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        Token NGHead = NGGroup.getSemanticHeadTokens().get(0);
+        String NGHeadBase = NGHead.getBase();
+        if (!walentyMapping.containsKey(NGHeadBase)) {
+            return false;
+        }
+
+        SyntacticWord prepWord = PrepNGGroup.getFirstWord();
+        int prepLength = prepWord.getTokens().size();
+        String phraseType;
+        if (prepLength == 1) {
+            String prepBase = prepWord.getTokens().get(0).getBase();
+            String prepCase = PrepNGGroup.getSemanticHeadTokens().get(0).getCase();
+            phraseType = String.format("prepnp(%s,%s)", prepBase, prepCase);
+        } else if (prepLength > 1) {
+            phraseType = String.format("comprepnp(%s)", prepWord.getOrth().toLowerCase());
+        } else {
+            // A first word without tokens gives nothing to look up.
+            return false;
+        }
+
+        for (String schema : walentyMapping.get(NGHeadBase)) {
+            if (schemaContains(schema, phraseType)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether some schema of NG1's head base form lists one of the Walenty
+     * realizations of NG2.
+     *
+     * @return false when the head's base form is not in walentyMapping
+     */
+    private static boolean NGNGValenceCompatibility(SyntacticGroup NG1,
+            SyntacticGroup NG2, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        String headBase = NG1.getSemanticHeadTokens().get(0).getBase();
+        if (!walentyMapping.containsKey(headBase)) {
+            return false;
+        }
+
+        ArrayList<String> realizations = NG2.getWalentyRealizations();
+        ArrayList<String> schemata = walentyMapping.get(headBase);
+        for (int r = 0; r < realizations.size(); r++) {
+            String realization = realizations.get(r);
+            for (int s = 0; s < schemata.size(); s++) {
+                if (schemaContains(schemata.get(s), realization)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether some schema of NGGroup1's head base form lists comprepnp(text),
+     * where text is the sentence text lying between the two groups.
+     *
+     * @return false when the head's base form is not in walentyMapping
+     */
+    private static boolean NGcomplexPrepNGValenceCompatibility(SyntacticGroup NGGroup1,
+            SyntacticGroup NGGroup2, Sentence sentence,
+            Map<String,ArrayList<String>> walentyMapping) {
+        String headBase = NGGroup1.getSemanticHeadTokens().get(0).getBase();
+        if (!walentyMapping.containsKey(headBase)) {
+            return false;
+        }
+
+        // The candidate preposition is whatever separates the two groups.
+        int gapStart = NGGroup1.getSentencePositionEnd() + 1;
+        int gapEnd = NGGroup2.getSentencePositionStart() - 1;
+        String comprepnp = String.format("comprepnp(%s)",
+                sentence.getTextInsideSpan(gapStart, gapEnd));
+
+        for (String schema : walentyMapping.get(headBase)) {
+            if (schemaContains(schema, comprepnp)) {
+                return true;
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether any position of the schema lists exactly the given phrase type.
+     *
+     * The schema is split into positions on " + "; each position loses its first and
+     * last character and is then split on ';' into phrase types.
+     * NOTE(review): presumably those two characters are the braces around a position -
+     * confirm against the dictionary format.
+     *
+     * @param schema     schema text of one dictionary entry
+     * @param phraseType phrase type to look for, compared with equals
+     * @return true when one of the phrase types equals phraseType
+     */
+    private static boolean schemaContains(String schema, String phraseType) {
+        for (String position : schema.split("\\s\\+\\s")) {
+            position = position.trim();
+            if (position.length() < 2) {
+                // Too short to hold both delimiters; substring below would throw.
+                continue;
+            }
+            position = position.substring(1, position.length() - 1);
+            for (String phrT : position.split(";")) {
+                if (phrT.equals(phraseType)) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    
+    /**
+     * Tells whether any position of the schema lists a phrase type that starts with
+     * the given type name followed by '('.
+     *
+     * The schema is split into positions on " + "; each position loses its first and
+     * last character and is then split on ';' into phrase types.
+     *
+     * @param schema schema text of one dictionary entry
+     * @param type   phrase type name without its arguments
+     * @return true when some phrase type starts with type + "("
+     */
+    private static boolean schemaContainsType(String schema, String type) {
+        // this works better for nouns
+        for (String position : schema.split("\\s\\+\\s")) {
+            position = position.trim();
+            if (position.length() < 2) {
+                // Too short to hold both delimiters; substring below would throw.
+                continue;
+            }
+            position = position.substring(1, position.length() - 1);
+            for (String phrT : position.split(";")) {
+                if (phrT.startsWith(type + "(")) {
+                    return true;
+                }
+            }
+        }
+        return false;
+    }
+    
+    
+    // compar ??
+    private static ArrayList<String> getPrepnps(String prepBase, String prepCase) {
+    	ArrayList<String> prepnps = new ArrayList<String>();
+    	prepnps.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
+    	if (prepCase.equals("nom") || prepCase.equals("gen") || prepCase.equals("acc")) {
+    		prepnps.add(String.format("prepnp(%s,str)", prepBase));
+    	}
+    	if (prepCase.equals("gen") || prepCase.equals("acc")) {
+    		prepnps.add(String.format("prepnp(%s,part)", prepBase));
+    	}
+    	return prepnps;
+    }
+    
+    // Rules out cases such as "od wsi do wsi": true when both groups have the same
+    // number of semantic head tokens and the base forms agree position by position.
+    private static boolean sameSemanticHeads(SyntacticGroup group1,
+            SyntacticGroup group2) {
+        List<Token> firstHeads = group1.getSemanticHeadTokens();
+        List<Token> secondHeads = group2.getSemanticHeadTokens();
+        if (firstHeads.size() != secondHeads.size()) {
+            return false;
+        }
+
+        int index = 0;
+        while (index < firstHeads.size()) {
+            String firstBase = firstHeads.get(index).getBase();
+            if (!firstBase.equals(secondHeads.get(index).getBase())) {
+                return false;
+            }
+            index++;
+        }
+        return true;
+    }
+    
     /**
      * Wyszukuję i oznaczam wszystkie NER
@@ -151,8 +615,9 @@ public class Detector {
      * @param sentence
      */
     private static void addMentionsByTokenCtag(Sentence sentence) {
         // Every token whose ctag matches Constants.MORPHO_CTAGS is added to
         // the sentence as a mention built from that single token.
-        for (Token token : sentence)
+        for (Token token : sentence) {
             if (token.getCtag().matches(Constants.MORPHO_CTAGS))
                 sentence.addMention(new Mention(token));
+        }
     }
 }
 package pl.waw.ipipan.zil.core.md.entities;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 /**
@@ -203,4 +204,83 @@ public class Mention implements Comparable&lt;Mention&gt; {
 	/**
 	 * @return the value of the {@code isZeroSubject} flag of this mention
 	 */
 	public boolean isZeroSubject() {
 		return this.isZeroSubject;
 	}
+	
+	/**
+	 * @return the sentence position reported by this mention's first segment
+	 */
+	public int getSentencePositionStart() {
+		return this.getFirstSegment().getSentencePosition();
+	}
+	
+	/**
+	 * @return the sentence position reported by this mention's last segment
+	 */
+	public int getSentencePositionEnd() {
+		return this.getLastSegment().getSentencePosition();
+	}
+	
+    /**
+     * Checks, for a single-segment mention, whether that segment belongs to a
+     * syntactic word of the sentence whose ctag equals "Qub".
+     *
+     * @return {@code true} if such a word exists; always {@code false} when the
+     *         mention does not consist of exactly one segment
+     */
+    public boolean isPartOfQub() {
+    	if (this.segments.size() != 1) {
+    		return false;
+    	}
+    	Token onlySegment = this.segments.get(0);
+    	for (SyntacticWord candidate : onlySegment.getSentence().getSyntacticWords()) {
+    		if (!candidate.getTokens().contains(onlySegment)) {
+    			continue;
+    		}
+    		if (candidate.getCtag().equals("Qub")) {
+    			return true;
+    		}
+    	}
+    	return false;
+    }
+    
+    /**
+     * Checks, for a single-segment mention, whether that segment belongs to a
+     * syntactic word of the sentence whose ctag equals "Prep".
+     *
+     * @return {@code true} if such a word exists; always {@code false} when the
+     *         mention does not consist of exactly one segment
+     */
+    public boolean isPartOfPrep() {
+    	if (this.segments.size() != 1) {
+    		return false;
+    	}
+    	Token onlySegment = this.segments.get(0);
+    	for (SyntacticWord candidate : onlySegment.getSentence().getSyntacticWords()) {
+    		if (!candidate.getTokens().contains(onlySegment)) {
+    			continue;
+    		}
+    		if (candidate.getCtag().equals("Prep")) {
+    			return true;
+    		}
+    	}
+    	return false;
+    }
+    
+    /**
+     * Ctags accepted by {@link #isPartOfFrazeo()}. Declared static so the list
+     * is built once for the class rather than once per Mention instance.
+     * NOTE(review): presumably these are the word classes that can form
+     * phraseological units - confirm.
+     */
+    private static final List<String> FRAZEOS = Arrays.asList("Prep", "Qub", "Adv", "Interj",
+    		"Adj", "Conj", "Comp");
+    
+    /**
+     * Checks, for a single-segment mention, whether that segment belongs to a
+     * syntactic word of the sentence whose ctag is listed in {@code FRAZEOS}.
+     *
+     * @return {@code true} if such a word exists; always {@code false} when the
+     *         mention does not consist of exactly one segment
+     */
+    public boolean isPartOfFrazeo() {
+    	if (this.segments.size() != 1) {
+    		return false;
+    	}
+    	Token onlySegment = this.segments.get(0);
+    	for (SyntacticWord word : onlySegment.getSentence().getSyntacticWords()) {
+    		if (word.getTokens().contains(onlySegment) &&
+    				FRAZEOS.contains(word.getCtag())) {
+    			return true;
+    		}
+    	}
+    	return false;
+    }
+    
+    public boolean isPartOfComplexPrep(ArrayList<String> complexPreps) {
+    	if (this.segments.size() == 1) {
+        	Sentence sentence = this.segments.get(0).getSentence();
+        	if (this.getSentencePositionStart() - 1 >= 0) {
+        		String prep = sentence.get(this.getSentencePositionStart() - 1).getOrth();
+        		String noun = sentence.get(this.getSentencePositionStart()).getOrth();
+        		String possiblePrep = String.format("%s %s", prep, noun);
+        		if (complexPreps.contains(possiblePrep)) {
+        			return true;
+        		}
+        	}
+        	
+        	if (this.getSentencePositionStart() - 1 >= 0 && 
+        			this.getSentencePositionStart() + 1 < sentence.size()) {
+        		String prep1 = sentence.get(this.getSentencePositionStart() - 1).getOrth();
+        		String noun = sentence.get(this.getSentencePositionStart()).getOrth();
+        		String prep2 = sentence.get(this.getSentencePositionStart() + 1).getOrth();
+        		String possiblePrep = String.format("%s %s %s", prep1, noun, prep2);
+        		if (complexPreps.contains(possiblePrep)) {
+        			return true;
+        		}
+        	}
+    	}
+    	return false;
+    }
+	
 }
@@ -109,4 +109,118 @@ public class Sentence extends ArrayList&lt;Token&gt; {
 	/**
 	 * Appends the given named entity to this sentence's named entities.
 	 *
 	 * @param namedEntity entity to add
 	 */
 	public void addNamedEntity(NamedEntity namedEntity) {
 		this.namedEntities.add(namedEntity);
 	}
+	
+	public ArrayList<SyntacticGroup> getGroupsInsideSpan(int start, int end) {
+		ArrayList<SyntacticGroup> groupsAtSpan = new ArrayList<SyntacticGroup>();
+		for (SyntacticGroup group : this.syntacticGroups) {
+			if (group.getSentencePositionStart() >= start &&
+					group.getSentencePositionEnd() <= end) {
+				if (!(group.getSentencePositionStart() == start &&
+						group.getSentencePositionEnd() == end)) {
+					groupsAtSpan.add(group);
+				}
+			}
+		}
+		return groupsAtSpan;
+	}
+	
+	/**
+	 * Returns the syntactic groups lying inside the span [start, end], leaving
+	 * out any group that covers exactly the whole span.
+	 * <p>
+	 * NOTE(review): despite the name, no "largest" or "non-intersecting"
+	 * filtering is performed - the selection is the same one
+	 * {@code getGroupsInsideSpan} makes, so this method delegates to it.
+	 *
+	 * @param start first sentence position of the span
+	 * @param end   last sentence position of the span
+	 * @return a new list with the matching groups
+	 */
+	public ArrayList<SyntacticGroup> getLargestNotIntersectingGroupsInsideSpan(int start, int end) {
+		return this.getGroupsInsideSpan(start, end);
+	}
+	
+	/**
+	 * Scans start positions from {@code start} up to {@code end} and returns
+	 * the result of {@code getLargestGroupOnStartPoint} for the first position
+	 * where it is not {@code null}.
+	 *
+	 * @param start first sentence position to try
+	 * @param end   last sentence position to try; also the span end passed on
+	 * @return the first group found, or {@code null} if no position yields one
+	 */
+	public SyntacticGroup getFirstGroup(int start, int end) {
+		for (int position = start; position <= end; position++) {
+			SyntacticGroup found = getLargestGroupOnStartPoint(position, end);
+			if (found != null) {
+				return found;
+			}
+		}
+		return null;
+	}
+	
+	/**
+	 * Among groups that begin exactly at {@code start} and end before
+	 * {@code end}, picks the one with the most tokens. On a tie the group
+	 * encountered first is kept.
+	 *
+	 * @param start required start position of the group
+	 * @param end   exclusive upper bound for the group's end position
+	 * @return the largest such group, or {@code null} if there is none
+	 */
+	private SyntacticGroup getLargestGroupOnStartPoint(int start, int end) {
+		SyntacticGroup best = null;
+		for (SyntacticGroup candidate : this.getGroups()) {
+			int candidateStart = candidate.getSentencePositionStart();
+			int candidateEnd = candidate.getSentencePositionEnd();
+			// "end <= span end, but not the whole span" reduces to a strict
+			// comparison once the start positions are known to be equal.
+			if (candidateStart != start || candidateEnd >= end) {
+				continue;
+			}
+			if (best == null || best.getTokens().size() < candidate.getTokens().size()) {
+				best = candidate;
+			}
+		}
+		return best;
+	}
+	
+	/**
+	 * Scans end positions from {@code end} down to {@code start + 1} and
+	 * returns the result of {@code getLargestGroupOnEndPoint} for the first
+	 * position where it is not {@code null}.
+	 * <p>
+	 * The loop guard is {@code > start} rather than {@code != start}: results
+	 * are unchanged for {@code start <= end}, while an inverted span
+	 * ({@code end < start}) now returns {@code null} immediately instead of
+	 * counting down through the whole int range.
+	 *
+	 * @param start lower bound of the span
+	 * @param end   last sentence position to try as a group end
+	 * @return the first group found, or {@code null} if no position yields one
+	 */
+	public SyntacticGroup getLastGroup(int start, int end) {
+		for (int endPoint = end; endPoint > start; endPoint--) {
+			SyntacticGroup found = getLargestGroupOnEndPoint(start, endPoint);
+			if (found != null) {
+				return found;
+			}
+		}
+		return null;
+	}
+	
+	/**
+	 * Among groups that end exactly at {@code end} and begin after
+	 * {@code start}, picks the one with the most tokens. On a tie the group
+	 * encountered first is kept.
+	 *
+	 * @param start exclusive lower bound for the group's start position
+	 * @param end   required end position of the group
+	 * @return the largest such group, or {@code null} if there is none
+	 */
+	private SyntacticGroup getLargestGroupOnEndPoint(int start, int end) {
+		SyntacticGroup best = null;
+		for (SyntacticGroup candidate : this.getGroups()) {
+			int candidateStart = candidate.getSentencePositionStart();
+			int candidateEnd = candidate.getSentencePositionEnd();
+			// "start >= span start, but not the whole span" reduces to a strict
+			// comparison once the end positions are known to be equal.
+			if (candidateEnd != end || candidateStart <= start) {
+				continue;
+			}
+			if (best == null || best.getTokens().size() < candidate.getTokens().size()) {
+				best = candidate;
+			}
+		}
+		return best;
+	}
+	
+	/**
+	 * Collects the mentions whose start and end positions both fall within
+	 * the inclusive span [start, end].
+	 *
+	 * @param start first sentence position of the span
+	 * @param end   last sentence position of the span
+	 * @return matching mentions, in the order they are stored in the sentence
+	 */
+	public ArrayList<Mention> getMentionsInsideSpan(int start, int end) {
+		ArrayList<Mention> inside = new ArrayList<Mention>();
+		for (Mention candidate : this.mentions) {
+			if (candidate.getSentencePositionStart() < start) {
+				continue;
+			}
+			if (candidate.getSentencePositionEnd() > end) {
+				continue;
+			}
+			inside.add(candidate);
+		}
+		return inside;
+	}
+	
+	/**
+	 * Joins the orths of the tokens at positions start..end (inclusive) with
+	 * single spaces.
+	 * <p>
+	 * Uses a {@link StringBuilder} so the text is not re-copied on every
+	 * appended token. An empty string is returned when {@code start > end}.
+	 *
+	 * @param start index of the first token to include
+	 * @param end   index of the last token to include
+	 * @return the space-separated orths
+	 */
+	public String getTextInsideSpan(int start, int end) {
+		StringBuilder text = new StringBuilder();
+		for (int position = start; position <= end; position++) {
+			if (position != start) {
+				text.append(" ");
+			}
+			text.append(this.get(position).getOrth());
+		}
+		return text.toString();
+	}
+	
+	/**
+	 * Copies the tokens at positions start..end (inclusive) into a new list.
+	 * The list is empty when {@code start > end}.
+	 *
+	 * @param start index of the first token to include
+	 * @param end   index of the last token to include
+	 * @return a new list with the selected tokens in sentence order
+	 */
+	public ArrayList<Token> getSegmentsInsideSpan(int start, int end) {
+		ArrayList<Token> selected = new ArrayList<Token>();
+		for (int position = start; position <= end; position++) {
+			selected.add(this.get(position));
+		}
+		return selected;
+	}
+	
 }
 package pl.waw.ipipan.zil.core.md.entities;
+import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
@@ -53,4 +54,175 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
 		return getType().compareTo(o.getType());
 	}
+	
+	/**
+	 * @return the sentence position reported by this group's first token
+	 */
+	public int getSentencePositionStart() {
+		return tokens.get(0).getSentencePosition();
+	}
+	
+	public int getSentencePositionEnd() {
+		Token endToken = tokens.get(tokens.size()-1);
+		return endToken.getSentencePosition();
+	}
+	
+	
+	/**
+	 * Finds the syntactic word of the sentence whose first token compares
+	 * equal (via {@code compareTo}) to this group's first token. When several
+	 * words qualify, the one with the most tokens wins; on a tie the word
+	 * encountered first is kept.
+	 *
+	 * @return the selected word, or {@code null} if no word starts on this
+	 *         group's first token
+	 */
+	public SyntacticWord getFirstWord() {
+		Token groupStart = tokens.get(0);
+		SyntacticWord longest = null;
+		for (SyntacticWord candidate : groupStart.getSentence().getSyntacticWords()) {
+			if (groupStart.compareTo(candidate.getTokens().get(0)) != 0) {
+				continue;
+			}
+			if (longest == null || longest.getTokens().size() < candidate.getTokens().size()) {
+				longest = candidate;
+			}
+		}
+		return longest;
+	}
+	
+	// NG and PrepNG only now
+	public ArrayList<String> getWalentyRealizations() {
+		ArrayList<String> realizations = new ArrayList<String>();
+		if (this.type.startsWith("PrepNG")) {
+			SyntacticWord prepWord = this.getFirstWord();
+			if (prepWord.getTokens().size() == 1) {
+				
+		    	Token prep = prepWord.getTokens().get(0);
+				String prepBase = prep.getBase();
+				String prepCase = this.getSemanticHeadTokens().get(0).getCase();
+				realizations.addAll(getPrepnps(prepBase, prepCase));
+				
+			} else if (prepWord.getTokens().size() > 1) {
+				
+				String prepOrth = prepWord.getOrth().toLowerCase();
+				String comprepnp = String.format("comprepnp(%s)", prepOrth);
+				realizations.add(comprepnp);
+
+			}
+		} else if (this.type.startsWith("NG")) {
+			String npCase = this.getSemanticHeadTokens().get(0).getCase();
+			realizations.addAll(getNps(npCase));
+		}
+		return realizations;
+	}
+	
+    /**
+     * Lists the {@code prepnp(...)} phrase-type strings generated for a
+     * preposition base and a grammatical case.
+     * <p>
+     * The exact-case variant always comes first; a {@code str} variant follows
+     * for nom/gen/acc and a {@code part} variant for gen/acc.
+     * NOTE(review): the original comment here was an unresolved "compar ??".
+     *
+     * @param prepBase base form of the preposition
+     * @param prepCase case value, compared against "nom", "gen" and "acc"
+     * @return the generated phrase-type strings, in the order described above
+     */
+    private ArrayList<String> getPrepnps(String prepBase, String prepCase) {
+    	ArrayList<String> variants = new ArrayList<String>();
+    	variants.add(String.format("prepnp(%s,%s)", prepBase, prepCase));
+
+    	boolean genitiveOrAccusative = prepCase.equals("gen") || prepCase.equals("acc");
+    	if (genitiveOrAccusative || prepCase.equals("nom")) {
+    		variants.add(String.format("prepnp(%s,str)", prepBase));
+    	}
+    	if (genitiveOrAccusative) {
+    		variants.add(String.format("prepnp(%s,part)", prepBase));
+    	}
+    	return variants;
+    }
+    
+    /**
+     * Lists the {@code np(...)} phrase-type strings generated for a
+     * grammatical case: the exact-case variant first, then {@code np(str)}
+     * for nom/gen/acc and {@code np(part)} for gen/acc.
+     *
+     * @param npCase case value, compared against "nom", "gen" and "acc"
+     * @return the generated phrase-type strings, in the order described above
+     */
+    private ArrayList<String> getNps(String npCase) {
+    	ArrayList<String> variants = new ArrayList<String>();
+    	variants.add(String.format("np(%s)", npCase));
+
+    	boolean genitiveOrAccusative = npCase.equals("gen") || npCase.equals("acc");
+    	if (genitiveOrAccusative || npCase.equals("nom")) {
+    		variants.add("np(str)");
+    	}
+    	if (genitiveOrAccusative) {
+    		variants.add("np(part)");
+    	}
+    	return variants;
+    }
+    
+    /**
+     * Checks whether some syntactic word of the sentence ends on the token
+     * directly before this group and has the ctag "Verbfin" or "Inf".
+     *
+     * @return {@code true} if such a word exists
+     */
+    public boolean precedingWordIsVerb() {
+    	Sentence sentence = this.tokens.get(0).getSentence();
+    	int positionBeforeGroup = this.getSentencePositionStart() - 1;
+    	for (SyntacticWord candidate : sentence.getSyntacticWords()) {
+    		if (candidate.getSentencePositionEnd() != positionBeforeGroup) {
+    			continue;
+    		}
+    		String ctag = candidate.getCtag();
+    		if (ctag.equals("Verbfin") || ctag.equals("Inf")) {
+    			return true;
+    		}
+    	}
+    	return false;
+    }
+    
+    /**
+     * Searches forward from this group's end position for a group whose type
+     * starts with "NG" and which begins after this group ends.
+     * <p>
+     * For each position from this group's end up to the last sentence index,
+     * {@code sentence.getFirstGroup(position, sentence.size())} is consulted;
+     * the first result that satisfies both conditions is returned.
+     *
+     * @return the next NG group, or {@code null} if none is found
+     */
+    public SyntacticGroup getNextNG() {
+    	Sentence sentence = this.tokens.get(0).getSentence();
+    	int ownEnd = this.getSentencePositionEnd();
+    	int sentenceLength = sentence.size();
+
+    	for (int position = ownEnd; position < sentenceLength; position++) {
+    		SyntacticGroup candidate = sentence.getFirstGroup(position, sentenceLength);
+    		if (candidate == null) {
+    			continue;
+    		}
+    		if (candidate.type.startsWith("NG") &&
+    				this.getSentencePositionEnd() < candidate.getSentencePositionStart()) {
+    			return candidate;
+    		}
+    	}
+    	return null;
+    }
+    
+    /**
+     * Finds the group that starts on the token directly after this group and
+     * whose type starts with "PrepNG" or "NG". When several qualify, the one
+     * with the most tokens wins; on a tie the group encountered first is kept.
+     *
+     * @return the selected group, or {@code null} if none qualifies
+     */
+    public SyntacticGroup getFollowingGroup() {
+    	Sentence sentence = this.tokens.get(0).getSentence();
+    	int positionAfterGroup = this.getSentencePositionEnd() + 1;
+    	SyntacticGroup best = null;
+    	for (SyntacticGroup candidate : sentence.getGroups()) {
+    		boolean nominal = candidate.getType().startsWith("PrepNG")
+    				|| candidate.getType().startsWith("NG");
+    		if (!nominal || candidate.getSentencePositionStart() != positionAfterGroup) {
+    			continue;
+    		}
+    		if (best == null || best.getTokens().size() < candidate.getTokens().size()) {
+    			best = candidate;
+    		}
+    	}
+    	return best;
+    }
+    
+    /**
+     * Returns the syntactic word with ctag "Verbfin" or "Inf" that ends on the
+     * token directly before this group. When this group lies inside a PrepNG
+     * group, the token directly before that enclosing PrepNG is used instead.
+     *
+     * @return the first matching word in the sentence's word list, or
+     *         {@code null} if there is none
+     */
+    public SyntacticWord getPrecedingVerb() {
+    	Sentence sentence = this.tokens.get(0).getSentence();
+    	int positionBefore;
+    	if (this.isPartOfPrepNG()) {
+    		positionBefore = this.getParentPrepNG().getSentencePositionStart() - 1;
+    	} else {
+    		positionBefore = this.getSentencePositionStart() - 1;
+    	}
+
+    	for (SyntacticWord candidate : sentence.getSyntacticWords()) {
+    		if (candidate.getSentencePositionEnd() != positionBefore) {
+    			continue;
+    		}
+    		String ctag = candidate.getCtag();
+    		if (ctag.equals("Verbfin") || ctag.equals("Inf")) {
+    			return candidate;
+    		}
+    	}
+    	return null;
+    }
+    
+    /**
+     * Checks whether some group of the sentence whose type starts with
+     * "PrepNG" spans this group, i.e. starts at or before this group's start
+     * and ends at or after this group's end. This group itself also counts
+     * if its own type starts with "PrepNG" and it is among the sentence's groups.
+     *
+     * @return {@code true} if such a group exists
+     */
+    private boolean isPartOfPrepNG() {
+    	int ownStart = this.getSentencePositionStart();
+    	int ownEnd = this.getSentencePositionEnd();
+    	Sentence sentence = this.tokens.get(0).getSentence();
+    	for (SyntacticGroup candidate : sentence.getGroups()) {
+    		if (!candidate.getType().startsWith("PrepNG")) {
+    			continue;
+    		}
+    		if (candidate.getSentencePositionStart() > ownStart) {
+    			continue;
+    		}
+    		if (candidate.getSentencePositionEnd() >= ownEnd) {
+    			return true;
+    		}
+    	}
+    	return false;
+    }
+    
+    /**
+     * Among the groups whose type starts with "PrepNG" and which span this
+     * group (start at or before its start, end at or after its end), returns
+     * the one with the most tokens. On a tie the group encountered first is kept.
+     *
+     * @return the largest spanning PrepNG group, or {@code null} if none exists
+     */
+    private SyntacticGroup getParentPrepNG() {
+    	int ownStart = this.getSentencePositionStart();
+    	int ownEnd = this.getSentencePositionEnd();
+    	Sentence sentence = this.tokens.get(0).getSentence();
+
+    	SyntacticGroup largest = null;
+    	for (SyntacticGroup candidate : sentence.getGroups()) {
+    		boolean spansThisGroup = candidate.getType().startsWith("PrepNG")
+    				&& candidate.getSentencePositionStart() <= ownStart
+    				&& candidate.getSentencePositionEnd() >= ownEnd;
+    		if (!spansThisGroup) {
+    			continue;
+    		}
+    		if (largest == null || candidate.getTokens().size() > largest.getTokens().size()) {
+    			largest = candidate;
+    		}
+    	}
+    	return largest;
+    }
+	
 }
@@ -6,11 +6,16 @@ import java.util.List;
 public class SyntacticWord implements Comparable<SyntacticWord> {
+	private String base;
 	private String ctag;
+	private String orth;
 	private List<Token> tokens = new ArrayList<>();
 	/**
 	 * Creates a syntactic word from its ctag, underlying tokens, base form
 	 * and orth. The token list is stored as passed in (no copy is made).
 	 *
 	 * @param ctag   tag of the word
 	 * @param tokens tokens the word consists of
 	 * @param base   base form of the word
 	 * @param orth   orth of the word
 	 */
-	public SyntacticWord(String ctag, List<Token> tokens) {
+	public SyntacticWord(String ctag, List<Token> tokens, 
+			String base, String orth) {
+		this.base = base;
 		this.ctag = ctag;
+		this.orth = orth;
 		this.tokens = tokens;
 	}
@@ -39,5 +44,37 @@ public class SyntacticWord implements Comparable&lt;SyntacticWord&gt; {
 		return getCtag().compareTo(o.getCtag());
 	}
+	
+	/**
+	 * @return the sentence position reported by this word's first token
+	 */
+	public int getSentencePositionStart() {
+		return tokens.get(0).getSentencePosition();
+	}
+	
+	public int getSentencePositionEnd() {
+		Token endToken = tokens.get(tokens.size()-1);
+		return endToken.getSentencePosition();
+	}
+	
+	/**
+	 * @return the base form given to the constructor
+	 */
+	public String getBase() {
+		return base;
+	}
+	
+	/**
+	 * @return the orth given to the constructor
+	 */
+	public String getOrth() {
+		return orth;
+	}
+	
+	/**
+	 * @return {@code true} if this word's ctag is "Verbfin" or "Inf"
+	 */
+	public boolean isVerb() {
+		return this.ctag.equals("Verbfin") || this.ctag.equals("Inf");
+	}
+	
+	/**
+	 * @return {@code true} if this word's ctag is "Interp"
+	 */
+	public boolean isInterp() {
+		return this.ctag.equals("Interp");
+	}
 }
@@ -70,6 +70,7 @@ public class TeiLoader {
         for (TEIMorph mo : m.getHeadMorphs())
             headTokens.add(teiMorph2Segment.get(mo));
         s.addMention(new Mention(tokens, headTokens, m.isZeroSubject()));
+        System.out.println(tokens.toString());
     }
     private static void loadSyntacticGroup(Sentence s, TEIGroup g,
@@ -94,10 +95,12 @@ public class TeiLoader {
     /**
      * Converts a TEI word into a SyntacticWord and adds it to the sentence.
      * The ctag and base come from the word's interpretation, the orth from
      * the word itself, and the tokens are looked up in
      * {@code teiMorph2Segment} for every morph of the word.
      * NOTE(review): a morph missing from the map would put {@code null}
      * into the token list - confirm the map always covers all morphs.
      *
      * @param s                sentence receiving the new syntactic word
      * @param w                TEI word to convert
      * @param teiMorph2Segment mapping from TEI morphs to already loaded tokens
      */
     private static void loadSyntacticWord(Sentence s, TEIWord w,
                                           Map<TEIMorph, Token> teiMorph2Segment) {
         String ctag = w.getInterpretation().getCtag();
+        String base = w.getInterpretation().getBase();
+        String orth = w.getOrth();
         List<Token> tokens = new ArrayList<>();
         for (TEIMorph m : w.getAllMorphs())
             tokens.add(teiMorph2Segment.get(m));
-        s.addSyntacticWord(new SyntacticWord(ctag, tokens));
+        s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth));
     }
     private static void loadNE(Sentence s, TEINamedEntity ne,
@@ -73,10 +73,12 @@ public class ThriftLoader {
     /**
      * Converts a Thrift syntactic word into a SyntacticWord and adds it to
      * the sentence. The base and ctag come from the word's chosen
      * interpretation, the orth from the word itself, and the tokens from
      * {@code getUnderlyingSegments}.
      *
      * @param s                  sentence receiving the new syntactic word
      * @param w                  Thrift word to convert
      * @param thirftId2Entity    id-to-entity map passed on to getUnderlyingSegments
      * @param thiftTokenId2Token token-id-to-token map passed on to getUnderlyingSegments
      */
     private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
                                           Map<String, Object> thirftId2Entity,
                                           Map<String, Token> thiftTokenId2Token) {
+    	String base = w.getChosenInterpretation().getBase();
         String ctag = w.getChosenInterpretation().getCtag();
+        String orth = w.getOrth();
         List<Token> tokens = getUnderlyingSegments(w, thirftId2Entity,
                 thiftTokenId2Token, false);
-        s.addSyntacticWord(new SyntacticWord(ctag, tokens));
+        s.addSyntacticWord(new SyntacticWord(ctag, tokens, base, orth));
     }
     private static void loadNE(Sentence s, TNamedEntity ne,