Commit 2d60e476d9f47fbd460efb5c02d1f76b50decb08

Authored by Bartłomiej Nitoń
1 parent 86cf20ea

Fully statistical mention detector version (2.0).

... ... @@ -4,13 +4,13 @@
4 4  
5 5 <groupId>pl.waw.ipipan.zil.core</groupId>
6 6 <artifactId>md</artifactId>
7   - <version>1.3</version>
  7 + <version>2.0</version>
8 8  
9 9 <developers>
10 10 <developer>
11   - <name>Mateusz Kopeć</name>
  11 + <name>Bartłomiej Nitoń</name>
12 12 <organization>ICS PAS</organization>
13   - <email>m.kopec@ipipan.waw.pl</email>
  13 + <email>bartek.niton@gmail.com</email>
14 14 </developer>
15 15 </developers>
16 16  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/Main.java
... ... @@ -4,6 +4,8 @@ import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
5 5  
6 6 import pl.waw.ipipan.zil.core.md.detection.Detector;
  7 +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
  8 +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
7 9 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
8 10 import pl.waw.ipipan.zil.core.md.entities.Text;
9 11 import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
... ... @@ -19,9 +21,11 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
19 21 import java.io.BufferedReader;
20 22 import java.io.File;
21 23 import java.io.FileInputStream;
  24 +import java.io.FileNotFoundException;
22 25 import java.io.IOException;
23 26 import java.io.InputStream;
24 27 import java.io.InputStreamReader;
  28 +import java.io.PrintWriter;
25 29 import java.util.ArrayList;
26 30 import java.util.EnumMap;
27 31 import java.util.HashMap;
... ... @@ -32,13 +36,17 @@ public class Main {
32 36 private static final Logger logger = LoggerFactory.getLogger(Main.class);
33 37  
34 38 private static final boolean GZIP_OUTPUT = true;
  39 + private static final String DEFAULT_HEAD_MODEL = "/head_model.bin";
  40 + private static final String DEFAULT_NOMINAL_MENTION_MODEL = "/nominal_model.bin";
35 41 private static final String DEFAULT_ZERO_SUBJECT_MODEL = "/zero_subject_model.bin";
36 42 private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
37 43 private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";
38 44  
  45 + private static HeadDetector headModel;
  46 + private static NominalMentionDetector nominalMentionModel;
39 47 private static ZeroSubjectDetector zeroSubjectModel;
40 48  
41   - public static enum ValenceDicts {
  49 + public static enum ValenceDicts {
42 50 VerbsValence,
43 51 NounsValence
44 52 }
... ... @@ -47,6 +55,12 @@ public class Main {
47 55 new EnumMap(ValenceDicts.class);
48 56  
49 57 static {
  58 + InputStream headDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_HEAD_MODEL);
  59 + headModel = new HeadDetector(headDetectionModelStream);
  60 +
  61 + InputStream nominalMentionDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_NOMINAL_MENTION_MODEL);
  62 + nominalMentionModel = new NominalMentionDetector(nominalMentionDetectionModelStream);
  63 +
50 64 InputStream zeroSubjectDetectionModelStream = Main.class.getResourceAsStream(DEFAULT_ZERO_SUBJECT_MODEL);
51 65 zeroSubjectModel = new ZeroSubjectDetector(zeroSubjectDetectionModelStream);
52 66  
... ... @@ -138,6 +152,14 @@ public class Main {
138 152  
139 153 File inputDir = new File(args[0]);
140 154 File outputDir = new File(args[1]);
  155 + File defsOutputFile = new File(args[1], "definitions.csv");
  156 + PrintWriter defsWriter = null;
  157 + try {
  158 + defsWriter = new PrintWriter(defsOutputFile);
  159 + } catch (FileNotFoundException e1) {
  160 + // TODO Auto-generated catch block
  161 + e1.printStackTrace();
  162 + }
141 163  
142 164 if (!inputDir.isDirectory()) {
143 165 logger.error(inputDir + " is not a directory!");
... ... @@ -159,7 +181,6 @@ public class Main {
159 181 }
160 182  
161 183  
162   -
163 184 int all = 0;
164 185 int errors = 0;
165 186 for (File teiDir : IOUtils.getNKJPDirs(inputDir)) {
... ... @@ -167,13 +188,15 @@ public class Main {
167 188 try {
168 189 File targetDir = createTargetTextDir(inputDir, outputDir, teiDir);
169 190 TEICorpusText teiText = TeiLoader.readTeiText(teiDir);
170   - annotateTeiText(teiText);
  191 + annotateTeiText(teiText, teiDir, defsWriter);
171 192 TeiSaver.saveTeiText(teiText, targetDir, GZIP_OUTPUT);
172 193 } catch (IOException e) {
173 194 logger.error("Error processing text in dir:" + teiDir + " Error details: " + e.getLocalizedMessage(), e);
174 195 errors++;
175 196 }
176 197 }
  198 +
  199 + defsWriter.close();
177 200  
178 201 logger.info(all + " texts processed succesfully.");
179 202 if (errors > 0)
... ... @@ -208,9 +231,9 @@ public class Main {
208 231 * @param thriftText text to annotate with mentions
209 232 * @throws MultiserviceException when an error occures
210 233 */
    public static void annotateThriftText(TText thriftText, PrintWriter defsWriter) throws MultiserviceException {
        Text responseText = ThriftLoader.loadTextFromThrift(thriftText);
        // defsWriter is threaded through detection to collect candidate definition lines
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
        ThriftSaver.updateThriftText(responseText, thriftText);
    }
216 239  
... ... @@ -221,9 +244,9 @@ public class Main {
221 244 * @param teiText text to annotate with mentions
222 245 * @throws TEIException when an error occurs
223 246 */
    public static void annotateTeiText(TEICorpusText teiText, File textDir, PrintWriter defsWriter) throws TEIException {
        // textDir is needed to locate the optional side-car dependency-parse JSON for this text
        Text responseText = TeiLoader.loadTextFromTei(teiText, textDir);
        Detector.findMentionsInText(responseText, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
        TeiSaver.updateTeiText(responseText, teiText);
    }
229 252  
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Constants.java
... ... @@ -15,6 +15,8 @@ public class Constants {
15 15 "Adj", "Conj", "Comp");
16 16  
17 17 public static final List<String> VERB_CTAGS = Arrays.asList("Inf", "Verbfin");
  18 +
    // Dependency-parse labels whose subtrees are treated as mention candidates ("pd" disabled pending evaluation).
    public static final List<String> DEPPARSE_MLABELS = Arrays.asList("subj", "obj", "comp");//, "pd");
18 20  
19 21 private Constants() {
20 22 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/Detector.java
... ... @@ -4,10 +4,15 @@ import org.slf4j.Logger;
4 4 import org.slf4j.LoggerFactory;
5 5  
6 6 import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
  7 +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
  8 +import pl.waw.ipipan.zil.core.md.detection.nominal.NominalMentionDetector;
7 9 import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
8 10 import pl.waw.ipipan.zil.core.md.entities.*;
9 11  
  12 +import java.io.PrintWriter;
10 13 import java.util.ArrayList;
  14 +import java.util.Arrays;
  15 +import java.util.Collections;
11 16 import java.util.HashSet;
12 17 import java.util.List;
13 18 import java.util.Map;
... ... @@ -21,36 +26,47 @@ public class Detector {
21 26 }
22 27  
    /**
     * Detects mentions in every sentence of the text, discarding any mentions
     * annotated before.
     *
     * @param text             text to annotate (its mentions are cleared first)
     * @param headModel        statistical mention-head detector
     * @param zeroSubjectModel statistical zero (omitted) subject detector
     * @param nominalMentionModel statistical nominal-mention detector
     * @param valence          verb/noun valence dictionaries
     * @param defsWriter       sink for definition candidates; currently only
     *                         threaded through (see the getDefinitionsBy* helpers)
     */
    public static void findMentionsInText(Text text,
            HeadDetector headModel,
            ZeroSubjectDetector zeroSubjectModel,
            NominalMentionDetector nominalMentionModel,
            Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            PrintWriter defsWriter) {
        text.clearMentions();
        logger.debug("Detecting mentions in text " + text.getId());
        for (Paragraph p : text)
            for (Sentence s : p)
                detectMentionsInSentence(s, headModel, zeroSubjectModel, nominalMentionModel, valence, defsWriter);
    }
32 40  
    /**
     * Runs mention detection on a single sentence. In version 2.0 the
     * rule-based adders and cleaners are disabled (kept commented out for
     * reference) in favour of the statistical head and nominal-mention models.
     */
    private static void detectMentionsInSentence(Sentence sentence,
            HeadDetector headModel,
            ZeroSubjectDetector zeroSubjectModel,
            NominalMentionDetector nominalMentionModel,
            Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            PrintWriter defsWriter) {
        // adding mentions — former rule-based pipeline, superseded by the statistical models below
//        addMentionsByTokenCtag(sentence);
//        addMentionsBySyntacticWordsCtag(sentence);
//        addMentionsByNamedEntities(sentence);
//        addMentionsByGroups(sentence, valence);
//        //addMentionsByDeppParse(sentence);
//        addSpeakerMentionsInSpoken(sentence);

        // zero subject detection
        zeroSubjectModel.addZeroSubjectMentions(sentence);

        // statistical detection: first find head tokens, then grow nominal mentions around them
        List<Token> heads = headModel.detectHeads(sentence);
        nominalMentionModel.addNominalMentions(sentence, valence, heads);

        // removing mentions
        // removeTo(sentence); — gives no improvement; the cleaners below still need re-evaluation
//        Cleaner.cleanWalentyFramedMentions(sentence, valence.get(ValenceDicts.VerbsValence));
//        Cleaner.cleanUnnecessarySentenceMentions(sentence);
//        Cleaner.cleanFrazeos(sentence);


        // updating mention heads
        // updateMentionHeads(sentence);
    }
55 71  
56 72 /**
... ... @@ -106,7 +122,7 @@ public class Detector {
106 122 private static void addMentionsByGroups(Sentence sentence,
107 123 Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
108 124  
109   - for (SyntacticGroup group : sentence.getGroups()) {
  125 + for (SyntacticGroup group : sentence.getGroups()) {
110 126 if (group.getType().startsWith("NG")) {
111 127 ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>();
112 128 nestedGroups.add(group);
... ... @@ -286,4 +302,431 @@ public class Detector {
286 302 sentence.addMention(new Mention(token));
287 303 }
288 304 }
  305 +
  306 + private static void addMentionsByDeppParse(Sentence sentence) {
  307 + for (Token tok : sentence) {
  308 + // sprawdzac czy wzmianka jest ciagla tekstowo, bo czasami depparser zwraca dziwne drzewka
  309 + /*HashSet<Relation> relations = tok.getRelations();
  310 + for (Relation rel : relations) {
  311 + if (Constants.DEPPARSE_MLABELS.contains(rel.getName())
  312 + && !rel.getTarget().getCtag().matches(Constants.MORPHO_CTAGS)
  313 + && !rel.getTarget().getCtag().equals("prep")) {
  314 + Mention mention = buildMentionFromSubtree(rel.getTarget());
  315 + if (mention != null && !sentence.getMentions().contains(mention)) {
  316 + sentence.addMention(mention);
  317 + }
  318 + }
  319 + }*/
  320 + if (tok.getCtag().matches(Constants.MORPHO_CTAGS) || tok.getCtag().equals("num")) {
  321 + Mention mention = buildMentionFromSubtree(tok);
  322 + if (mention != null && !sentence.getMentions().contains(mention)) {
  323 + sentence.addMention(mention);
  324 + }
  325 + }
  326 + }
  327 + }
  328 +
  329 + private static Mention buildMentionFromSubtree(Token head) {
  330 + List<Token> heads = new ArrayList<Token>();
  331 + List<Token> segments = new ArrayList<Token>();
  332 + heads.add(head);
  333 + //segments.add(head);
  334 + segments.addAll(getTreeSegments(head));
  335 + Collections.sort(segments);
  336 + Mention mention = null;
  337 + try {
  338 + segments = removeBorderingSegments(segments, Arrays.asList("qub", "interp"));
  339 + if (!segments.isEmpty()) {
  340 + mention = new Mention(segments, heads);
  341 + }
  342 + } catch (ArrayIndexOutOfBoundsException e) {
  343 + logger.warn("Strange dependency structure");
  344 + }
  345 + return mention;
  346 + }
  347 +
  348 + private static List<Token> removeBorderingSegments(List<Token> segments, List<String> tags2Remove) {
  349 + Token firstSeg = segments.get(0);
  350 + while(tags2Remove.contains(firstSeg.getCtag())) {
  351 + segments.remove(firstSeg);
  352 + if (segments.isEmpty()) {
  353 + return segments;
  354 + }
  355 + firstSeg = segments.get(0);
  356 + }
  357 +
  358 + Token lastSeg = segments.get(segments.size() - 1);
  359 + while(tags2Remove.contains(lastSeg.getCtag())) {
  360 + segments.remove(lastSeg);
  361 + if (segments.isEmpty()) {
  362 + return segments;
  363 + }
  364 + lastSeg = segments.get(segments.size() - 1);
  365 + }
  366 +
  367 + return segments;
  368 + }
  369 +
  370 + private static List<Token> removePrecedingAdjs(List<Token> segments) {
  371 + Token firstSeg = segments.get(0);
  372 + while(firstSeg.getCtag().equals("adj")) {
  373 + segments.remove(firstSeg);
  374 + if (segments.isEmpty()) {
  375 + return segments;
  376 + }
  377 + firstSeg = segments.get(0);
  378 + }
  379 + return segments;
  380 + }
  381 +
    /**
     * Collects the full dependency subtree rooted at {@code tok}, including
     * the root itself.
     * NOTE(review): recursion assumes the relation graph is acyclic — a cycle
     * would cause a StackOverflowError; TODO confirm the parser guarantees this.
     */
    private static HashSet<Token> getTreeSegments(Token tok) {
        HashSet<Token> segments = new HashSet<Token>();
        segments.add(tok);
        for (Relation rel : tok.getRelations()) {
            segments.addAll(getTreeSegments(rel.getTarget()));
        }
        return segments;
    }
  390 +
  391 +
    // Surface (orthographic) connector phrases signalling a definition,
    // e.g. "X, czyli Y" ("X, that is Y"). Matched against the text between two mentions/groups.
    private static final List<String> DEF_CONJS_ORTHS =
            Arrays.asList(//"to",
                    "to jest", "jest to", "zwane inaczej", "czyli", "inaczej mówiąc",
                    "inaczej nazywane", "zwane też", "zwane także", "zwane również", "zwane często",
                    "zwane zwykle", "definiowane jako", "znaczy tyle co", "rozumiane jako", "rozumiane jest",
                    "ktoś kto", "coś co", "nazywa się", "tak definiuje się");

    // Lemmatized (base-form) variants of the connectors above; used when form == "base".
    private static final List<String> DEF_CONJS_BASES =
            Arrays.asList(//"to",
                    "to być", "być to", "zwać inaczej", "czyli", "inaczej mówić",
                    "inaczej nazywać", "zwać też", "zwać także", "zwać również", "zwać często",
                    "zwać zwykle", "definiować jako", "znaczyć tyle co", "rozumieć jako", "rozumieć być",
                    "ktoś kto", "kto być kto",
                    "coś co", "co być co",
                    "nazywać się", "tak definiować się");

    // Honorific lead-ins ("pan"/"pani" = Mr/Mrs) that disqualify an apposition candidate.
    private static final List<String> ANN_SOURCE_TO_OMMIT =
            Arrays.asList("pan", "pani");
  411 +
  412 +
  413 + private static void getDefinitionsByGroups(Sentence sentence, String form, PrintWriter defsWriter) {
  414 + List<String> def_conjs = DEF_CONJS_ORTHS;
  415 + if (form.equals("base")) {
  416 + def_conjs = DEF_CONJS_BASES;
  417 + }
  418 + for (SyntacticGroup group : sentence.getGroups()) {
  419 + if (group.getType().startsWith("NG")) {
  420 + SyntacticGroup nextGroup = group.getClosestNGroup();
  421 +
  422 + if (nextGroup != null) {
  423 + int conjStart = group.getSentenceEndPosition() + 1;
  424 + int conjEnd = nextGroup.getSentenceStartPosition() - 1;
  425 + String conj = "";
  426 + if (conjEnd > conjStart && (group.containsNE() || nextGroup.containsNE())) {
  427 + conj = getText(sentence, conjStart, conjEnd, form);
  428 + if (def_conjs.contains(conj)) {
  429 + String definition = String.format("%s\t[%s%s%s]\t%s\t%s",
  430 + group.toString(),
  431 + conj, "/groups/", form,
  432 + nextGroup.toString(),
  433 + sentence.toStringWithoutMentions());
  434 + defsWriter.println(definition);
  435 + }
  436 + }
  437 + }
  438 +
  439 + }
  440 + }
  441 + }
  442 +
  443 + private static void getDefinitionsByMentions(Sentence sentence, String form, PrintWriter defsWriter) {
  444 + List<String> def_conjs = DEF_CONJS_ORTHS;
  445 + if (form.equals("base")) {
  446 + def_conjs = DEF_CONJS_BASES;
  447 + }
  448 + for (Mention mnt1 : sentence.getMentions()) {
  449 + int mnt1End = mnt1.getSentenceEndPosition();
  450 + for (Mention mnt2 : sentence.getMentions()) {
  451 + int mnt2Start = mnt2.getSentenceStartPosition();
  452 + int conjStart = mnt1End + 1;
  453 + int conjEnd = mnt2Start - 1;
  454 + if (conjEnd > conjStart) {
  455 + String conj = getText(sentence, conjStart, conjEnd, form);
  456 + if (def_conjs.contains(conj)) {
  457 + String definition = String.format("%s\t[%s%s%s]\t%s\t%s",
  458 + mnt1.toStringWithoutBrackets(),
  459 + conj, "/mentions/", form,
  460 + mnt2.toStringWithoutBrackets(),
  461 + sentence.toStringWithoutMentions());
  462 + defsWriter.println(definition);
  463 + }
  464 + }
  465 + }
  466 + }
  467 + }
  468 +
    /*==> buildDefinitionsFromSubtree:
      for a tree rooted at "subj", return every subtree introduced by an "app"
      relation; whatever sits directly under the "subj" itself is the keyword.
      See the sentence:

      Dr David Warner , neurofizjolog Akademii Medycznej Loma Linda w Kalifornii , wspólnie
      ze specjalistami z Uniwersytetu Stanforda opracował urządzenie reagujące na ruchy mięśni twarzy .
    */
  477 +
    /**
     * Extracts definition candidates from apposition ("app") edges of the
     * dependency parse: when a nominative noun and its apposed target agree in
     * case, number and gender (and neither is a personal pronoun), the chain
     * of appositions is split, NE fragments are merged, and the result is
     * written to {@code defsWriter}.
     */
    private static void getDefinitionsByDeppParse(Sentence sentence, PrintWriter defsWriter) {

        // split the mention along apposition relations

        for (Token source : sentence) {
            HashSet<Relation> relations = source.getRelations();
            for (Relation rel : relations) {
                if (//Constants.DEPPARSE_MLABELS.contains(rel.getName())
                    //rel.getName().equals("subj")
                    rel.getName().equals("app") &&
                    source.getReturnRelation() != null &&
                    //Constants.DEPPARSE_MLABELS.contains(source.getReturnRelation().getName())
                    ((source.getCase().equals("nom") && rel.getTarget().getCase().equals("nom")
                    && source.getNumber().equals(rel.getTarget().getNumber())
                    && source.getGender().equals(rel.getTarget().getGender())
                    && !source.isPpron() && !rel.getTarget().isPpron())
                    //|| source.getCtag().equals("brev")
                    ) // something still needs to be done about "brev" (abbreviations)
                    ) {
                    ArrayList<List<Token>> appositions = getAppositionsFromSubtree(source, rel.getTarget());
                    // merge fragments that together form one person name, then re-check
                    if (appositions.size() > 1 && containsNE(appositions)) {
                        appositions = mergeNEs(appositions);
                    }
                    if (appositions.size() > 1 && containsNE(appositions)) {
                        ArrayList<String> appsStrList = appositionsToString(appositions);
                        String appositionsStr = String.join("\t", appsStrList);

                        String definition = String.format("%s\t!!!!!\t%s",
                                //source.getOrth(),
                                appositionsStr,
                                sentence.toStringWithoutMentions());
                        defsWriter.println(definition);
                    }
                }
            }
        }
    }
  515 +
    /**
     * Recursively splits the subtree rooted at {@code root} into apposition
     * fragments: the part reachable without crossing "app" edges, extended by
     * NE membership, plus (recursively) the fragments of each "app" target
     * that is not part of the same person-name NE.
     */
    private static ArrayList<List<Token>> getAppositionsFromSubtree(Token root) {

        ArrayList<List<Token>> appositions = new ArrayList<List<Token>>();

        List<Token> segments = new ArrayList<Token>();
        segments.addAll(getTreeSegments(root, "app"));
        List<Token> allSegments = new ArrayList<Token>();
        allSegments.addAll(extendByNEs(segments));

        Collections.sort(allSegments);
        // note: ommitApp strips bordering punctuation from allSegments in place
        if (!ommitApp(allSegments)) {
            appositions.add(allSegments);
        }



        for (Token tok : allSegments) {
            for (Relation rel : tok.getRelations()) {
                if (rel.getName().equals("app") && !sameNE(tok, rel.getTarget())) {
                    appositions.addAll(getAppositionsFromSubtree(rel.getTarget()));
                }
            }
        }

        return appositions;
    }
  542 +
    /**
     * Splits an apposition edge {@code source -> target} into two fragments:
     * the source side (its subtree minus the branch leading to {@code target})
     * and the target side (its whole subtree), each extended by NE membership.
     * Returns an empty list when both tokens belong to the same person-name NE.
     */
    private static ArrayList<List<Token>> getAppositionsFromSubtree(Token source, Token target) {

        ArrayList<List<Token>> appositions = new ArrayList<List<Token>>();
        if (sameNE(source, target)) {
            return appositions;
        }

        List<Token> sourceSegments = new ArrayList<Token>();
        sourceSegments.addAll(getTreeSegments(source, target));
        List<Token> allSourceSegments = new ArrayList<Token>();
        allSourceSegments.addAll(extendByNEs(sourceSegments));

        Collections.sort(allSourceSegments);
        // note: ommitApp strips bordering punctuation from the list in place
        if (!ommitApp(allSourceSegments)) {
            appositions.add(allSourceSegments);
        }

        List<Token> targetSegments = new ArrayList<Token>();
        targetSegments.addAll(getTreeSegments(target));
        List<Token> allTargetSegments = new ArrayList<Token>();
        allTargetSegments.addAll(extendByNEs(targetSegments));

        Collections.sort(allTargetSegments);
        if (!ommitApp(allTargetSegments)) {
            appositions.add(allTargetSegments);
        }

        return appositions;
    }
  572 +
  573 + private static ArrayList<List<Token>> mergeNEs(ArrayList<List<Token>> appositions) {
  574 + ArrayList<List<Token>> appositionsCopy = new ArrayList<List<Token>>(appositions);
  575 + Sentence sentence = appositions.get(0).get(0).getSentence();
  576 + for (NamedEntity ne : sentence.getNamedEntities()) {
  577 + if (ne.getType().equals("persName")
  578 + && (ne.getSubtype() == null || ne.getSubtype().isEmpty())) {
  579 + HashSet<Token> mergedNE = new HashSet<Token>();
  580 + for (List<Token> app : appositionsCopy) {
  581 + if (ne.getTokens().containsAll(app)) {
  582 + mergedNE.addAll(app);
  583 + appositions.remove(app);
  584 + }
  585 + }
  586 + if (mergedNE.size() > 0) {
  587 + ArrayList newApposition = new ArrayList<Token>();
  588 + newApposition.addAll(mergedNE);
  589 + Collections.sort(newApposition);
  590 + appositions.add(newApposition);
  591 + }
  592 + appositionsCopy = new ArrayList<List<Token>>(appositions);
  593 + }
  594 + }
  595 + return appositions;
  596 + }
  597 +
  598 + public static boolean containsNE(ArrayList<List<Token>> appositions) {
  599 + for (List<Token> app : appositions) {
  600 + if (isNE(app)) {
  601 + return true;
  602 + }
  603 + /*for (Token tok : app) {
  604 + for (NamedEntity ne : sentence.getNamedEntities()) {
  605 + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) {
  606 + continue;
  607 + }
  608 + if (ne.getTokens().contains(tok)) {
  609 + return true;
  610 + }
  611 + }
  612 + }*/
  613 + }
  614 + return false;
  615 + }
  616 +
  617 + private static boolean isNE(List<Token> segments) {
  618 + Sentence sentence = segments.get(0).getSentence();
  619 + for (NamedEntity ne : sentence.getNamedEntities()) {
  620 + if (ne.getTokens().containsAll(segments) &&
  621 + segments.containsAll(ne.getTokens())) {
  622 + return true;
  623 + }
  624 + }
  625 + return false;
  626 + }
  627 +
  628 + private static ArrayList<String> appositionsToString(ArrayList<List<Token>> appositions) {
  629 + ArrayList<String> apposistionsStrs = new ArrayList<String>();
  630 + for (List<Token> apposition : appositions) {
  631 + String appText = getText(apposition, "orth");
  632 + apposistionsStrs.add(appText);
  633 + }
  634 + return apposistionsStrs;
  635 + }
  636 +
  637 +
  638 + private static boolean ommitApp(List<Token> segments) {
  639 + segments = removeBorderingSegments(segments, Arrays.asList("interp"));
  640 + if (segments.size() == 0) {
  641 + return true;
  642 + }
  643 + String appositionBase = getText(segments, "base");
  644 + if (ANN_SOURCE_TO_OMMIT.contains(segments.get(0).getBase().toLowerCase()) ||
  645 + appositionBase.length() < 2) {
  646 + return true;
  647 + }
  648 + return false;
  649 + }
  650 +
  651 + private static HashSet<Token> getTreeSegments(Token tok, String divRel) {
  652 + HashSet<Token> segments = new HashSet<Token>();
  653 + segments.add(tok);
  654 +
  655 + for (Relation rel : tok.getRelations()) {
  656 + if (!rel.getName().equals(divRel)) {
  657 + segments.addAll(getTreeSegments(rel.getTarget(), divRel));
  658 + }
  659 +
  660 + }
  661 + return segments;
  662 + }
  663 +
  664 + private static HashSet<Token> getTreeSegments(Token tok, Token nGoThere) {
  665 + HashSet<Token> segments = new HashSet<Token>();
  666 + segments.add(tok);
  667 +
  668 + for (Relation rel : tok.getRelations()) {
  669 + if (!rel.getTarget().equals(nGoThere)) {
  670 + segments.addAll(getTreeSegments(rel.getTarget(), nGoThere));
  671 + }
  672 +
  673 + }
  674 + return segments;
  675 + }
  676 +
    /**
     * Extends a token set upwards along return (head) relations as long as the
     * current token and its governor belong to the same person-name NE, so a
     * fragment never cuts a person name in half.
     */
    private static HashSet<Token> extendByNEs(List<Token> segments) {
        HashSet<Token> allSegments = new HashSet<Token>();
        allSegments.addAll(segments);
        for (Token tok : segments) {
            Token neTok = tok;
            // walk towards the root while still inside the same persName NE
            while (neTok.getReturnRelation() != null
                    && (//neTok.getReturnRelation().getName().equals("ne")
                        //||
                        sameNE(neTok, neTok.getReturnRelation().getTarget()))) {
                neTok = neTok.getReturnRelation().getTarget();
                allSegments.add(neTok);
            }
        }
        return allSegments;
    }
  692 +
  693 + private static boolean sameNE(Token tok1, Token tok2) {
  694 + Sentence sentence = tok1.getSentence();
  695 + for (NamedEntity ne : sentence.getNamedEntities()) {
  696 + if (ne.getTokens().contains(tok1)
  697 + && ne.getTokens().contains(tok2)
  698 + && ne.getType().equals("persName")) {
  699 + return true;
  700 + }
  701 + }
  702 + return false;
  703 + }
  704 +
  705 + // TODO: przeniesc do klasy Sentence i wywalic static
  706 + private static String getText(Sentence sentence, int start, int end, String form) {
  707 + String conj = "";
  708 + for (Token tok : sentence.subList(start, end+1)) {
  709 + if (!tok.getCtag().equals("interp")) {
  710 + if (form.equals("orth")) {
  711 + conj += " " + tok.getOrth();
  712 + } else if (form.equals("base")) {
  713 + conj += " " + tok.getBase();
  714 + }
  715 + }
  716 + }
  717 + return conj.trim();
  718 + }
  719 +
  720 + private static String getText(List<Token> segments, String form) {
  721 + String conj = "";
  722 + for (Token tok : segments) {
  723 + if (form.equals("orth")) {
  724 + conj += " " + tok.getOrth();
  725 + } else if (form.equals("base")) {
  726 + conj += " " + tok.getBase();
  727 + }
  728 + }
  729 + return conj.trim();
  730 + }
  731 +
289 732 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/zero/InstanceCreator.java
... ... @@ -35,7 +35,7 @@ public class InstanceCreator {
35 35 allTexts++;
36 36 logger.info("Processing text " + textDir);
37 37 TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
38   - Text text = TeiLoader.loadTextFromTei(ct);
  38 + Text text = TeiLoader.loadTextFromTei(ct, textDir);
39 39  
40 40 for (Paragraph p : text)
41 41 for (Sentence s : p) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Mention.java
... ... @@ -83,6 +83,14 @@ public class Mention implements Comparable&lt;Mention&gt; {
83 83 sb.append("]");
84 84 return sb.toString();
85 85 }
  86 +
  87 + public String toStringWithoutBrackets() {
  88 + StringBuffer sb = new StringBuffer();
  89 + for (Token seg : segments) {
  90 + sb.append(seg.toString() + " ");
  91 + }
  92 + return sb.toString();
  93 + }
86 94  
87 95 public MentionGroup getMentionGroup() {
88 96 return mentionGroup;
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/NamedEntity.java
... ... @@ -6,14 +6,26 @@ import java.util.List;
6 6 public class NamedEntity implements Comparable<NamedEntity> {
7 7  
    // tokens spanned by this named entity, in sentence order
    private List<Token> tokens;
    // NE type, e.g. "persName"
    private String type;
    // NE subtype, e.g. "forename"; may be null or empty
    private String subtype;
9 11  
    /**
     * @param tokens  tokens covered by the entity
     * @param type    NE type (e.g. "persName")
     * @param subType NE subtype (e.g. "forename"; may be null or empty)
     */
    public NamedEntity(List<Token> tokens, String type, String subType) {
        this.tokens = tokens;
        this.type = type;
        this.subtype = subType;
    }
13 17  
    /** Tokens covered by this entity, in sentence order. */
    public List<Token> getTokens() {
        return this.tokens;
    }

    /** NE type, e.g. "persName". */
    public String getType() {
        return this.type;
    }

    /** NE subtype, e.g. "forename"; may be null or empty. */
    public String getSubtype() {
        return this.subtype;
    }
17 29  
18 30 @Override
19 31 public int compareTo(NamedEntity o) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/SyntacticGroup.java
1 1 package pl.waw.ipipan.zil.core.md.entities;
2 2  
3 3 import java.util.ArrayList;
  4 +import java.util.Arrays;
4 5 import java.util.Iterator;
5 6 import java.util.List;
6 7  
... ... @@ -143,6 +144,30 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
143 144 return largestGroup;
144 145 }
145 146  
    /**
     * Finds the nearest nominal group ("NG*") starting after this group ends;
     * when several groups start at the same position, the one with the most
     * tokens wins. Returns {@code null} when no such group exists.
     * NOTE(review): the bound {@code <= sentence.size()} looks one past the
     * last valid position if positions are 0-based — harmless (no group starts
     * there) but worth confirming.
     */
    public SyntacticGroup getClosestNGroup() {
        SyntacticGroup nextGroup = null;
        Sentence sentence = this.tokens.get(0).getSentence();
        int nextTokenPosition = this.getSentenceEndPosition() + 1;
        while (nextTokenPosition <= sentence.size()) {

            for (SyntacticGroup group : sentence.getGroups()) {
                if (group.getType().startsWith("NG") &&
                        group.getSentenceStartPosition() == nextTokenPosition) {
                    if (nextGroup == null ||
                            nextGroup.getTokens().size() < group.getTokens().size()) {
                        nextGroup = group;
                    }
                }
            }
            if (nextGroup != null) {
                break;
            }
            nextTokenPosition ++;
        }

        return nextGroup;
    }
  170 +
146 171 public SyntacticWord getPrecedingVerb() {
147 172 int precedingTokenPosition = this.getSentenceStartPosition() - 1;
148 173 Sentence sentence = this.tokens.get(0).getSentence();
... ... @@ -190,5 +215,28 @@ public class SyntacticGroup implements Comparable&lt;SyntacticGroup&gt; {
190 215 }
191 216 return parentPrepNG;
192 217 }
  218 +
  219 + public String toString() {
  220 + String textRep = "";
  221 + for (Token tok : tokens) {
  222 + textRep += " " + tok.getOrth();
  223 + }
  224 + return textRep.trim();
  225 + }
  226 +
  227 + public boolean containsNE() {
  228 + Sentence sentence = this.tokens.get(0).getSentence();
  229 + for (Token tok : tokens) {
  230 + for (NamedEntity ne : sentence.getNamedEntities()) {
  231 + if (ne.getSubtype() != null && ne.getSubtype().equals("forename")) {
  232 + continue;
  233 + }
  234 + if (ne.getTokens().contains(tok)) {
  235 + return true;
  236 + }
  237 + }
  238 + }
  239 + return false;
  240 + }
193 241  
194 242 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Token.java
... ... @@ -7,6 +7,8 @@ public class Token implements Comparable&lt;Token&gt; {
7 7 private int sentencePosition;
8 8  
9 9 private Set<Mention> mentions = null;
    // outgoing dependency relations for which this token is the source
    private HashSet<Relation> relations = new HashSet<Relation>();
    // presumably the inverse relation linking to this token's governor
    // (null for the root) — TODO confirm against the parser loader
    private Relation returnRelation = null;
10 12  
11 13 private String orth;
12 14 private Interpretation chosenInterpretation;
... ... @@ -119,10 +121,33 @@ public class Token implements Comparable&lt;Token&gt; {
119 121 public String getCtag() {
120 122 return getChosenInterpretation().getCtag();
121 123 }
  124 +
  125 + public boolean isPpron() {
  126 + if (this.getCtag().startsWith("ppron")) {
  127 + return true;
  128 + }
  129 + return false;
  130 + }
122 131  
123 132 @Override
124 133 public int compareTo(Token o) {
125 134 return getSentencePosition().compareTo(o.getSentencePosition());
126 135 }
  136 +
    /** Registers an outgoing dependency relation from this token. */
    public void addRelation(Relation relation) {
        relations.add(relation);
    }

    /** Outgoing dependency relations; note: the internal mutable set is exposed. */
    public HashSet<Relation> getRelations() {
        return relations;
    }

    /** Sets the inverse (governor) relation of this token. */
    public void setReturnRelation(Relation relation) {
        returnRelation = relation;
    }

    /** Inverse (governor) relation; {@code null} for the root token. */
    public Relation getReturnRelation() {
        return returnRelation;
    }
127 152  
128 153 }
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/tei/TeiLoader.java
... ... @@ -8,11 +8,19 @@ import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
8 8 import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
9 9  
10 10 import java.io.File;
  11 +import java.io.IOException;
  12 +import java.nio.charset.StandardCharsets;
  13 +import java.nio.file.Files;
  14 +import java.nio.file.Paths;
11 15 import java.util.ArrayList;
12 16 import java.util.HashMap;
13 17 import java.util.List;
14 18 import java.util.Map;
15 19  
  20 +import org.json.JSONArray;
  21 +import org.json.JSONObject;
  22 +
  23 +
16 24 public class TeiLoader {
17 25  
18 26 private static Logger logger = LoggerFactory.getLogger(TeiLoader.class);
... ... @@ -24,28 +32,75 @@ public class TeiLoader {
    /**
     * Reads a TEI corpus text from an NKJP-style directory by delegating to
     * the TEI API.
     *
     * @param teiDir directory containing the TEI files of one text
     * @return the parsed TEI corpus text
     * @throws TEIException if the directory cannot be read or parsed
     */
    public static TEICorpusText readTeiText(File teiDir) throws TEIException {
        return teiAPI.readFromNKJPDirectory(teiDir);
    }
27   -
28   - public static Text loadTextFromTei(TEICorpusText teiText) {
  35 +
  36 + public static Text loadTextFromTei(TEICorpusText teiText, File textDir) {
29 37 Text text = new Text(teiText.getCorpusHeader().getId());
  38 +
  39 + String textId = textDir.getName();
  40 +
  41 + System.out.println(textId);
  42 +
  43 + byte[] encoded;
  44 + JSONArray jsonParagraphs = null;
  45 + try {
  46 + //encoded = Files.readAllBytes(Paths.get("/home/bniton/Projekty/koreferencja/COTHEC/clients/python/data/train-JSON-Concraft-old_grammar/"+textId+".json"));
  47 + encoded = Files.readAllBytes(Paths.get("/home/bartek/Projekty/COTHEC/clients_data/"+textId+".json"));
  48 + String jsonContent = new String(encoded, StandardCharsets.UTF_8);
  49 + JSONObject jsonObject = new JSONObject(jsonContent);
  50 +
  51 + jsonParagraphs = jsonObject.getJSONArray("paragraphs");
  52 + } catch (IOException e) {
  53 + // TODO Auto-generated catch block
  54 + //e.printStackTrace();
  55 + logger.debug("No depparse layer.");
  56 + }
30 57  
31 58 logger.debug("Loading tei text " + text.getId() + "...");
32   - for (TEIParagraph teiP : teiText.getParagraphs())
33   - loadParagraph(text, teiP);
  59 +
  60 + List<TEIParagraph> teiParagraphs = teiText.getParagraphs();
  61 +
  62 + for (int i=0; i < teiParagraphs.size(); i++) {
  63 + TEIParagraph teiP = teiParagraphs.get(i);
  64 + JSONObject jsonP = null;
  65 + if (jsonParagraphs != null) {
  66 + jsonP = new JSONObject(jsonParagraphs.get(i).toString());
  67 + }
  68 + loadParagraph(text, teiP, jsonP);
  69 + }
34 70 logger.debug("Tei text loaded.");
35 71  
36 72 return text;
37 73 }
38 74  
39   - private static void loadParagraph(Text text, TEIParagraph teiP) {
  75 + private static void loadParagraph(Text text, TEIParagraph teiP, JSONObject jsonP) {
40 76 Paragraph p = new Paragraph();
41 77 text.add(p);
42   - for (TEISentence teiS : teiP.getSentences())
43   - loadSentence(p, teiS);
  78 +
  79 + List<TEISentence> teiSentences = teiP.getSentences();
  80 +
  81 + JSONArray jsonSentences = null;
  82 + if (jsonP != null) {
  83 + jsonSentences = jsonP.getJSONArray("sentences");
  84 + }
  85 +
  86 + for (int i=0; i < teiSentences.size(); i++) {
  87 + TEISentence teiS = teiSentences.get(i);
  88 +
  89 + JSONObject jsonS = null;
  90 + if (jsonP != null) {
  91 + if (i < jsonSentences.length()) {
  92 + jsonS = new JSONObject(jsonSentences.get(i).toString());
  93 + }
  94 + }
  95 +
  96 + loadSentence(p, teiS, jsonS);
  97 + }
44 98 }
45 99  
46   - private static void loadSentence(Paragraph p, TEISentence teiS) {
  100 + private static void loadSentence(Paragraph p, TEISentence teiS, JSONObject jsonS) {
47 101 Sentence s = new Sentence();
48 102 p.add(s);
  103 +
49 104 Map<TEIMorph, Token> teiMorph2Segment = new HashMap<>();
50 105 for (TEIMorph teiM : teiS.getMorphs()) {
51 106 Token token = loadToken(s, teiM);
... ... @@ -59,6 +114,33 @@ public class TeiLoader {
59 114 loadSyntacticGroup(s, g, teiMorph2Segment);
60 115 for (TEIMention m : teiS.getAllMentions())
61 116 loadMentions(s, m, teiMorph2Segment);
  117 +
  118 + if (jsonS != null && s.size() == jsonS.getJSONArray("tokens").length()) {
  119 + JSONArray relations = jsonS.getJSONArray("dependencyParse");
  120 + for (int i=0; i<relations.length(); i++) {
  121 + loadRelation(s, new JSONObject(relations.get(i).toString()));
  122 + }
  123 + } else {
  124 + //System.out.println(s.toStringWithoutMentions());
  125 + }
  126 + }
  127 +
  128 + private static void loadRelation(Sentence s, JSONObject jsonRelation) {
  129 + String label = jsonRelation.getString("label");
  130 + if (!label.equals("root") && !jsonRelation.getString("startTokenId").isEmpty() &&
  131 + jsonRelation.get("startTokenId").getClass() == String.class) {
  132 + String[] sourceIdParts = jsonRelation.getString("startTokenId").split("\\.");
  133 + String[] targetIdParts = jsonRelation.getString("endTokenId").split("\\.");
  134 +
  135 + int sourceId = Integer.parseInt(sourceIdParts[sourceIdParts.length-1]);
  136 + int targetId = Integer.parseInt(targetIdParts[sourceIdParts.length-1]);
  137 +
  138 + Token source = s.get(sourceId);
  139 + Token target = s.get(targetId);
  140 +
  141 + source.addRelation(new Relation(label, target));
  142 + target.setReturnRelation(new Relation(label, source));
  143 + }
62 144 }
63 145  
64 146 private static void loadMentions(Sentence s, TEIMention m,
... ... @@ -107,7 +189,7 @@ public class TeiLoader {
107 189 List<Token> tokens = new ArrayList<>();
108 190 for (TEIMorph m : ne.getLeaves())
109 191 tokens.add(teiMorph2Segment.get(m));
110   - s.addNamedEntity(new NamedEntity(tokens));
  192 + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype()));
111 193 }
112 194  
113 195 private static Token loadToken(Sentence s, TEIMorph teiM) {
... ...
src/main/java/pl/waw/ipipan/zil/core/md/io/thrift/ThriftLoader.java
... ... @@ -86,7 +86,7 @@ public class ThriftLoader {
86 86 Map<String, Token> thiftTokenId2Token) {
87 87 List<Token> tokens = getUnderlyingSegments(ne, thirftId2Entity,
88 88 thiftTokenId2Token, false);
89   - s.addNamedEntity(new NamedEntity(tokens));
  89 + s.addNamedEntity(new NamedEntity(tokens, ne.getType(), ne.getSubtype()));
90 90 }
91 91  
92 92 private static Map<String, Object> getThriftId2EntityMap(
... ...
src/main/resources/head_model.bin 0 → 100644
No preview for this file type
src/main/resources/nominal_model.bin 0 → 100644
No preview for this file type