Commit 3d23a642e950208184da7ac7d198861326de7415
1 parent
2d60e476
Added missing files.
Showing
14 changed files
with
2360 additions
and
0 deletions
.gitignore
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | ||
4 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
5 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
7 | + | ||
8 | +import java.util.*; | ||
9 | + | ||
10 | +public class FeatureGeneration { | ||
    // Lemmas that end a clause once the clause already contains a finite verb:
    // coordinating conjunctions and clause-level punctuation.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
            "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

    // Additional clause-splitting lemmas: adversative / resultative conjunctions.
    // NOTE(review): "dlatego" is listed twice; harmless in a set, but likely a typo.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
            "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
            "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

    // Lemmas that always split a clause, regardless of whether a verb was seen yet.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList(new String[] { "?", "!" }));

    // Paired delimiters (opening -> closing). NOTE(review): declared and populated
    // but not referenced anywhere in this class as visible here.
    final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }
27 | + | ||
    // Ctags treated as nominal head candidates: nouns, depreciative forms,
    // personal pronouns, gerunds and numerals.
    final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
            "ppron3", "ger", "num", "numcol" }));

    // Personal pronoun ctags; only these carry an explicit person value
    // (see agreedGenderOrPerson, which defaults others to 3rd person).
    final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

    // Finite verb ctags, used for clause segmentation and for filtering verb tokens
    // out of head candidates.
    final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
            "praet", "winien" }));

    // Relative pronoun lemmas ("zaimki względne"); a clause starting with one of
    // these is merged back into the preceding clause in getClauses().
    final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
            "który" }));
    /**
     * Fills the feature map for one candidate token, used to decide whether the
     * token is a mention head.
     *
     * @param features   output map: feature name -> value (Boolean / Integer / String)
     * @param t          candidate token
     * @param s          sentence containing the token
     * @param quasiVerbs quasi-verb lemmas; currently unused here (only referenced by
     *                   the disabled "quasi" feature below), kept for signature
     *                   compatibility with the training pipeline
     */
    public static void generateFeatures(Map<String, Object> features, Token t, Sentence s, Set<String> quasiVerbs) {

        // Morphosyntactic tag and grammatical number of the chosen interpretation.
        features.put("ctag", t.getChosenInterpretation().getCtag());
        features.put("number", t.getChosenInterpretation().getNumber());

        // Syntactic-context features.
        features.put("NGHead", NGHead(t, s));
        features.put("isNextColon", isNextColon(t, s));
        features.put("wordCtag", wordCtag(t, s));
        features.put("isPartOfNE", isPartOfNE(t, s));
        features.put("isFirstInNE", isFirstInNE(t, s));
        features.put("nextCtag", getNeighbouringTag(s, t, 1));
        features.put("prevCtag", getNeighbouringTag(s, t, -1));
        features.put("sentLength", s.size());

        // Surface-form features.
        features.put("tokenOrthLength", t.getOrth().length());
        features.put("tokenBaseLength", t.getBase().length());
        features.put("isNextDot", isNextDot(t, s));
        features.put("closestNEDistance", closestNEDistance(t, s));
        features.put("startsWithUpperOrth", Character.isUpperCase(t.getOrth().codePointAt(0)));
        features.put("startsWithUpperBase", Character.isUpperCase(t.getBase().codePointAt(0)));

        // The features below were evidently tried and disabled; kept for reference.
        //features.put("isPartOfFrazeo", isPartOfFrazeo(t, s));
        //features.put("gender", t.getChosenInterpretation().getGender());
        //features.put("person", t.getChosenInterpretation().getPerson());
        //features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
        //features.put("isPrevPraet", isPrevPraet(t, s));
        //features.put("isPrevComma", isPrevComma(t, s));
        //features.put("isPrev2Pred", isPrev2Pred(t, s));
        //features.put("isNextInf", isNextInf(t, s));

        //List<Token> clause = getClause(s, m);
        // features.put("clauseLength", clause.size());

        //addFeatures(features, clause, "clause", m);
/*        addFeatures(features, s, "sent", t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, 0), "window_" + i + "_" + 0, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, 0, i), "window_" + 0 + "_" + i, t);
        for (int i = 1; i < 6; i++)
            addFeatures(features, getWindow(s, t, i, i), "window_" + i + "_" + i, t);*/
    }
83 | + | ||
84 | + /////////////////////////////////// | ||
85 | + | ||
86 | + private static boolean NGHead(Token t, Sentence s) { | ||
87 | + | ||
88 | + for (SyntacticGroup group : s.getGroups()) { | ||
89 | + if (group.getType().startsWith("NG") && group.getSemanticHeadTokens().contains(t)) { | ||
90 | + return Boolean.valueOf(true); | ||
91 | + } | ||
92 | + } | ||
93 | + return Boolean.valueOf(false); | ||
94 | + } | ||
95 | + | ||
96 | + private static boolean isNextColon(Token t, Sentence s) { | ||
97 | + int idx = s.indexOf(t) + 1; | ||
98 | + if (idx >= s.size() || idx < 0) | ||
99 | + return Boolean.valueOf(false); | ||
100 | + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); | ||
101 | + } | ||
102 | + | ||
103 | + private static boolean isNextDot(Token t, Sentence s) { | ||
104 | + int idx = s.indexOf(t) + 1; | ||
105 | + if (idx >= s.size() || idx < 0) | ||
106 | + return Boolean.valueOf(false); | ||
107 | + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); | ||
108 | + } | ||
109 | + | ||
110 | + private static String wordCtag(Token t, Sentence s) { | ||
111 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
112 | + if (w.getTokens().contains(t)) { | ||
113 | + return w.getCtag(); | ||
114 | + } | ||
115 | + } | ||
116 | + return "None"; | ||
117 | + } | ||
118 | + | ||
119 | + private static boolean isPartOfNE(Token t, Sentence s) { | ||
120 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
121 | + if (ne.getTokens().contains(t)) { | ||
122 | + return Boolean.valueOf(true); | ||
123 | + } | ||
124 | + } | ||
125 | + return Boolean.valueOf(false); | ||
126 | + } | ||
127 | + | ||
128 | + private static int closestNEDistance(Token t, Sentence s) { | ||
129 | + int lowestDistance = -1; | ||
130 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
131 | + int distance = ne.getTokens().get(0).getSentencePosition() - t.getSentencePosition(); | ||
132 | + if ( distance >= 0 && (distance < lowestDistance || lowestDistance < 0)) { | ||
133 | + lowestDistance = distance; | ||
134 | + } | ||
135 | + } | ||
136 | + return lowestDistance; | ||
137 | + } | ||
138 | + | ||
139 | + private static boolean isFirstInNE(Token t, Sentence s) { | ||
140 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
141 | + if (ne.getTokens().get(0).compareTo(t) == 0) { | ||
142 | + return Boolean.valueOf(true); | ||
143 | + } | ||
144 | + } | ||
145 | + return Boolean.valueOf(false); | ||
146 | + } | ||
147 | + | ||
148 | + private static boolean isPartOfFrazeo(Token t, Sentence s) { | ||
149 | + for (SyntacticWord word : s.getSyntacticWords()) { | ||
150 | + if (word.getTokens().contains(t) && | ||
151 | + Constants.FRAZEO_CTAGS.contains(word.getCtag())) { | ||
152 | + return true; | ||
153 | + } | ||
154 | + } | ||
155 | + return false; | ||
156 | + } | ||
157 | + | ||
158 | + /////////////////////////////////// | ||
159 | + | ||
160 | + private static boolean isNextInf(Token m, Sentence s) { | ||
161 | + boolean now = false; | ||
162 | + for (Token morph : s) { | ||
163 | + if (now) | ||
164 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | ||
165 | + if (m.equals(morph)) | ||
166 | + now = true; | ||
167 | + } | ||
168 | + return false; | ||
169 | + } | ||
170 | + | ||
171 | + private static boolean isPrev2Pred(Token m, Sentence s) { | ||
172 | + Token prev = null; | ||
173 | + Token prev2 = null; | ||
174 | + for (Token morph : s) { | ||
175 | + if (m.equals(morph)) | ||
176 | + break; | ||
177 | + prev2 = prev; | ||
178 | + prev = morph; | ||
179 | + } | ||
180 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | ||
181 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | ||
182 | + } | ||
183 | + | ||
184 | + private static Object isPrevComma(Token m, Sentence s) { | ||
185 | + Token prev = null; | ||
186 | + for (Token morph : s) { | ||
187 | + if (m.equals(morph)) | ||
188 | + break; | ||
189 | + prev = morph; | ||
190 | + } | ||
191 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | ||
192 | + } | ||
193 | + | ||
194 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | ||
195 | + int idx = s.indexOf(m) + i; | ||
196 | + if (idx >= s.size() || idx < 0) | ||
197 | + return "None"; | ||
198 | + return s.get(idx).getChosenInterpretation().getCtag(); | ||
199 | + } | ||
200 | + | ||
    /**
     * Adds agreement features computed over a span of tokens (clause, window, or
     * whole sentence) relative to keyword token {@code m}. For every noun candidate
     * in the span (candidates directly preceded by "jak"/"jako" are skipped), flags
     * record nominative case, number agreement and gender-or-person agreement with
     * the keyword, their combinations on a single candidate, and "seen on at least
     * two candidates" variants.
     *
     * @param features output feature map; keys are suffixed with {@code prefix}
     * @param clause   token span to scan
     * @param prefix   identifies the span in feature names (e.g. "sent", "window_2_0")
     * @param m        keyword token agreement is checked against
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1: some candidate is in nominative case
        boolean hasNum = false; // 2: some candidate agrees in number
        boolean hasPOG = false; // 3: some candidate agrees in gender or person

        // Combinations of the three base conditions holding on one candidate.
        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        // Condition observed on at least two distinct candidates.
        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // Only nouns count as candidates; skip comparative/role phrases
            // ("jak X" / "jako X"), where X is not an independent mention head.
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }
274 | + | ||
275 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | ||
276 | + | ||
277 | + int idx = s.indexOf(m); | ||
278 | + int from = Math.max(0, idx - pre); | ||
279 | + int to = Math.min(s.size(), idx + post + 1); | ||
280 | + | ||
281 | + return new ArrayList<>(s.subList(from, to)); | ||
282 | + } | ||
283 | + | ||
284 | + private static boolean isPrevPraet(Token m, Sentence s) { | ||
285 | + Token prev = null; | ||
286 | + for (Token morph : s) { | ||
287 | + if (m.equals(morph)) | ||
288 | + break; | ||
289 | + prev = morph; | ||
290 | + } | ||
291 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | ||
292 | + } | ||
293 | + | ||
294 | + /** | ||
295 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | ||
296 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | ||
297 | + * osobowej), | ||
298 | + * | ||
299 | + * @param s | ||
300 | + * sentence | ||
301 | + * @param m2 | ||
302 | + * token | ||
303 | + * @return clause with the token | ||
304 | + */ | ||
305 | + public static List<Token> getClause(Sentence s, Token m2) { | ||
306 | + | ||
307 | + List<List<Token>> sublists = getClauses(s); | ||
308 | + | ||
309 | + for (List<Token> sub : sublists) | ||
310 | + for (Token m : sub) | ||
311 | + if (m.equals(m2)) | ||
312 | + return sub; | ||
313 | + | ||
314 | + return null; | ||
315 | + } | ||
316 | + | ||
317 | + public static List<List<Token>> getClauses(Sentence s) { | ||
318 | + | ||
319 | + Set<Token> noSplitMorphs = new HashSet<>(); | ||
320 | + for (SyntacticGroup g : s.getGroups()) { | ||
321 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | ||
322 | + noSplitMorphs.add(m); | ||
323 | + } | ||
324 | + } | ||
325 | + for (SyntacticWord g : s.getSyntacticWords()) { | ||
326 | + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { | ||
327 | + noSplitMorphs.add(m); | ||
328 | + } | ||
329 | + } | ||
330 | + | ||
331 | + LinkedList<List<Token>> sublists = new LinkedList<>(); | ||
332 | + List<Token> currentSublist = new ArrayList<>(); | ||
333 | + boolean clauseHasVerb = false; | ||
334 | + for (Token m : s) { | ||
335 | + String base = m.getChosenInterpretation().getBase(); | ||
336 | + if (!noSplitMorphs.contains(m) | ||
337 | + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2 | ||
338 | + .contains(base)) && clauseHasVerb))) { | ||
339 | + sublists.add(currentSublist); | ||
340 | + currentSublist = new ArrayList<>(); | ||
341 | + clauseHasVerb = false; | ||
342 | + } else { | ||
343 | + if (isVerb(m)) | ||
344 | + clauseHasVerb = true; | ||
345 | + } | ||
346 | + currentSublist.add(m); | ||
347 | + } | ||
348 | + if (currentSublist.size() > 0) { | ||
349 | + if (clauseHasVerb) | ||
350 | + sublists.add(currentSublist); | ||
351 | + else | ||
352 | + sublists.getLast().addAll(currentSublist); | ||
353 | + } | ||
354 | + | ||
355 | + // merge clause beginning with zaimek wzgl. etc to previous clause | ||
356 | + List<Token> prev = null; | ||
357 | + Iterator<List<Token>> it = sublists.iterator(); | ||
358 | + while (it.hasNext()) { | ||
359 | + List<Token> sublist = it.next(); | ||
360 | + boolean containsRelPron = false; | ||
361 | + int i = 1; | ||
362 | + for (Token m : sublist) { | ||
363 | + if (i > 2) | ||
364 | + break; | ||
365 | + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) { | ||
366 | + containsRelPron = true; | ||
367 | + break; | ||
368 | + } | ||
369 | + i++; | ||
370 | + } | ||
371 | + if (prev != null && containsRelPron) { | ||
372 | + prev.addAll(sublist); | ||
373 | + it.remove(); | ||
374 | + } else | ||
375 | + prev = sublist; | ||
376 | + } | ||
377 | + | ||
378 | + return sublists; | ||
379 | + } | ||
380 | + | ||
381 | + private static boolean agreedNum(Token candidate, Token keyword) { | ||
382 | + String keywordNum = keyword.getNumber(); | ||
383 | + String wordNum = candidate.getNumber(); | ||
384 | + return keywordNum.equals(wordNum); | ||
385 | + } | ||
386 | + | ||
    /**
     * Checks gender-or-person agreement between a candidate noun and the keyword.
     * For "praet" keywords (inflected as number:gender) genders are compared;
     * for other verb forms (number:person) persons are compared, with non-pronoun
     * candidates defaulting to third person ("ter").
     */
    private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
        if (isPraet(keyword)) {
            // praet has number:gender
            String keywordGender = keyword.getGender();
            String wordGender = candidate.getGender();
            return keywordGender.equals(wordGender);
        } else {
            // other verbs have number:person
            String keywordPerson = keyword.getPerson();
            String wordPerson = "ter"; // default: non-pronouns are 3rd person
            // Only personal pronouns carry an explicit person value.
            if (PRONOUN_TAGS.contains(candidate.getCtag()))
                wordPerson = candidate.getPerson();
            return wordPerson.equals(keywordPerson);
        }
    }
402 | + | ||
403 | + private static boolean isJakJako(Token prev) { | ||
404 | + String base = prev == null ? null : prev.getBase(); | ||
405 | + return prev != null && (base.equals("jak") || base.equals("jako")); | ||
406 | + } | ||
407 | + | ||
408 | + private static boolean isPraet(Token keyword) { | ||
409 | + return keyword.getCtag().equals("praet"); | ||
410 | + } | ||
411 | + | ||
412 | + private static boolean isNom(Token candidate) { | ||
413 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | ||
414 | + // tylko! | ||
415 | + } | ||
416 | + | ||
417 | + private static boolean isNoun(Token m) { | ||
418 | + return NOUN_TAGS.contains(m.getCtag()); | ||
419 | + } | ||
420 | + | ||
421 | + public static boolean isVerb(Token morph) { | ||
422 | + return VERB_TAGS.contains(morph.getCtag()); | ||
423 | + } | ||
424 | + | ||
425 | + public static boolean isVerb(Mention m) { | ||
426 | + boolean hasOnlyVerbs = true; | ||
427 | + for (Token morph : m.getSegments()) | ||
428 | + if (!isVerb(morph)) { | ||
429 | + hasOnlyVerbs = false; | ||
430 | + break; | ||
431 | + } | ||
432 | + return hasOnlyVerbs; | ||
433 | + } | ||
434 | + | ||
435 | + public static boolean isVerb(TEIMention m) { | ||
436 | + boolean hasOnlyVerbs = true; | ||
437 | + for (TEIMorph morph : m.getMorphs()) | ||
438 | + if (!isVerb(morph)) { | ||
439 | + hasOnlyVerbs = false; | ||
440 | + break; | ||
441 | + } | ||
442 | + return hasOnlyVerbs; | ||
443 | + } | ||
444 | + | ||
445 | + private static boolean isVerb(TEIMorph morph) { | ||
446 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | ||
447 | + } | ||
448 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import pl.waw.ipipan.zil.core.md.entities.Token; | ||
7 | +import weka.core.Instances; | ||
8 | + | ||
9 | +import java.io.File; | ||
10 | +import java.io.InputStream; | ||
11 | +import java.util.*; | ||
12 | + | ||
/**
 * Detects mention head tokens in sentences using a serialized Weka model.
 */
public class HeadDetector {

    final private static Logger logger = LoggerFactory.getLogger(HeadDetector.class);

    // Loaded classification model. NOTE(review): remains null if loading fails in
    // the constructor, which would make detectHeads throw an NPE later -- confirm
    // callers guard against construction failure.
    private Model model;
    // Quasi-verb lemmas taken from the loaded model, forwarded to feature generation.
    private Set<String> quasiVerbs = new HashSet<>();

    // Running total of heads detected across all instances (static, not thread-safe).
    public static int detectedHeads = 0;

    /**
     * Detects head tokens in the given sentence.
     *
     * @param sentence sentence to process
     * @return list of detected head tokens, or null when the sentence yields no
     *         candidate examples
     */
    public List<Token> detectHeads(Sentence sentence) {
        List<TreeMap<String, Object>> examples = new ArrayList<>();
        InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
        if (examples.isEmpty())
            return null;

        Instances instances = model.getInstances(examples);

        // label instances
        List<Boolean> areHeads = new ArrayList<>();
        List<Token> heads = new ArrayList<>();
        for (int i = 0; i < instances.numInstances(); i++) {
            boolean isHead = model.isHead(instances.instance(i), sentence);
            areHeads.add(isHead);
            if (isHead)
                detectedHeads++;
        }

        // Map classifier decisions back onto tokens. This relies on the same
        // verb-filtering order as InstanceCreator.loadExamplesFromSentence, so the
        // i-th non-verb token corresponds to the i-th classified instance.
        int i = 0;
        for (Token m : sentence) {
            if (FeatureGeneration.isVerb(m))
                continue;
            if (areHeads.get(i))
                heads.add(m);
            // sentence.addMention(new Mention(m, false));
            i++;
        }
        return heads;
    }

    /**
     * Loads the detection model from a file. On failure the error is logged and
     * the detector is left without a usable model (see NOTE on {@code model}).
     *
     * @param zeroSubjectDetectionModel serialized model file
     */
    public HeadDetector(File zeroSubjectDetectionModel) {
        try {
            this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }

    /**
     * Loads the detection model from a stream; same failure behavior as
     * {@link #HeadDetector(File)}.
     *
     * @param zeroSubjectDetectionModelStream stream with the serialized model
     */
    public HeadDetector(InputStream zeroSubjectDetectionModelStream) {
        try {
            this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
            this.quasiVerbs = this.model.getQuasiVerbs();
        } catch (Exception e) {
            logger.error("Error loading model:" + e);
        }
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | ||
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
8 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | ||
9 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
10 | +import weka.core.Attribute; | ||
11 | +import weka.core.FastVector; | ||
12 | +import weka.core.Instance; | ||
13 | +import weka.core.Instances; | ||
14 | + | ||
15 | +import java.io.File; | ||
16 | +import java.util.*; | ||
17 | +import java.util.Map.Entry; | ||
18 | + | ||
/**
 * Builds Weka training/prediction data for head detection: converts corpus
 * sentences into per-token feature maps and those maps into Weka
 * {@link Instances}.
 */
public class InstanceCreator {

    private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
    private static final TEI_IO teiIO = TEI_IO.getInstance();

    // Utility class; not meant to be instantiated.
    private InstanceCreator() {
    }

    /**
     * Loads training examples (one feature map per non-verb token) from all
     * NKJP-style text directories under {@code dataDir}. Texts failing to parse
     * are logged and skipped (best-effort corpus load).
     *
     * @param dataDir    root directory containing NKJP text directories
     * @param quasiVerbs quasi-verb lemmas forwarded to feature generation
     * @return list of examples, each mapping feature name -> value (incl. "class")
     */
    public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
        int allTexts = 0;
        int exceptions = 0;
        int allSentences = 0;

        List<TreeMap<String, Object>> examples = new ArrayList<>();
        for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
            try {
                allTexts++;
                logger.info("Processing text " + textDir);
                TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
                Text text = TeiLoader.loadTextFromTei(ct, textDir);

                for (Paragraph p : text)
                    for (Sentence s : p) {
                        allSentences++;
                        loadExamplesFromSentence(quasiVerbs, examples, s);
                    }

            } catch (Exception e) {
                // Skip unreadable texts but keep processing the rest.
                logger.error(e.getLocalizedMessage());
                exceptions++;
            }
        }

        logger.info(allTexts + " texts found.");
        if (exceptions != 0)
            logger.error(exceptions + " texts with exceptions.");
        logger.info(allSentences + " sentences found.");

        return examples;
    }

    /**
     * Appends one example per non-verb token of sentence {@code s} to
     * {@code examples}. The "class" feature is true iff the token is a head
     * segment of some non-verbal gold mention.
     *
     * @param quasiVerbs quasi-verb lemmas forwarded to feature generation
     * @param examples   output list the new examples are appended to
     * @param s          sentence to extract examples from
     */
    public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
            Sentence s) {

        // collect positive examples: head segments of gold (non-verbal) mentions
        Set<Token> positive = new HashSet<>();
        for (Mention m : s.getMentions()) {
            if (!FeatureGeneration.isVerb(m)) {
                positive.addAll(m.getHeadSegments());
            }
        }

        for (Token m : s) {
            // Verb tokens are never head candidates. The same filter is applied at
            // prediction time (HeadDetector) so example order stays aligned.
            if (FeatureGeneration.isVerb(m))
                continue;

            TreeMap<String, Object> features = new TreeMap<>();
            if (positive.contains(m)) {
                features.put("class", Boolean.valueOf(true));
            } else {
                features.put("class", Boolean.valueOf(false));
            }

            FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
            examples.add(features);
        }
    }

    /**
     * Builds an empty Weka {@link Instances} schema from the observed examples:
     * numeric attributes for Integer/Double features, nominal {false,true}
     * attributes for Boolean features, and nominal attributes enumerating all
     * observed values for everything else. The class attribute is selected by name.
     *
     * @param examples         feature maps the schema is inferred from
     * @param classFeatureName name of the feature to use as the class attribute
     * @return empty Instances with the inferred attribute schema and class set
     */
    public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {

        // Partition feature names by value type across all examples.
        TreeSet<String> booleanAttsOccurred = new TreeSet<>();
        TreeSet<String> doubleAttsOccurred = new TreeSet<>();
        TreeMap<String, Set<String>> att2values = new TreeMap<>();
        for (TreeMap<String, Object> example : examples) {
            for (Entry<String, Object> e : example.entrySet()) {
                String key = e.getKey();
                Object val = e.getValue();
                if (val instanceof Integer || val instanceof Double) {
                    doubleAttsOccurred.add(key);
                    continue;
                }
                if (val instanceof Boolean) {
                    booleanAttsOccurred.add(key);
                    continue;
                }
                // Everything else is nominal: collect its observed value set.
                if (!att2values.containsKey(key))
                    att2values.put(key, new HashSet<>());
                att2values.get(key).add(val.toString());
            }
        }

        List<Attribute> atts = new ArrayList<>();

        // double attributes
        for (String attName : doubleAttsOccurred) {
            Attribute att = new Attribute(attName);
            atts.add(att);
        }

        // boolean attributes (treated as nominal)
        FastVector values = new FastVector(2);
        values.addElement("false");
        values.addElement("true");
        for (String attName : booleanAttsOccurred) {
            Attribute att = new Attribute(attName, values);
            atts.add(att);
        }

        // nominal attributes
        for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
            FastVector vals = new FastVector(attVals.getValue().size());
            for (String val : attVals.getValue())
                vals.addElement(val);
            Attribute att = new Attribute(attVals.getKey(), vals);
            atts.add(att);
        }

        FastVector fvWekaAttributes = new FastVector(atts.size());
        for (Attribute attr : atts) {
            fvWekaAttributes.addElement(attr);
        }

        Instances data = new Instances("Head", fvWekaAttributes, 10);
        data.setClass(data.attribute(classFeatureName));
        return data;
    }

    /**
     * Converts each example map into a Weka {@link Instance} and adds it to
     * {@code instances}. Nominal values not present in the schema are logged
     * and marked as missing.
     *
     * @param examples  feature maps to convert
     * @param instances dataset (with schema) receiving the new instances
     */
    public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
        for (TreeMap<String, Object> example : examples) {
            Instance instance = new Instance(instances.numAttributes());

            for (Entry<String, Object> e : example.entrySet()) {
                Object val = e.getValue();
                String name = e.getKey();
                if (val instanceof Integer) {
                    instance.setValue(instances.attribute(name), (int) val);
                } else if (val instanceof Boolean) {
                    instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
                } else {
                    int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
                    if (indexOfValue == -1) {
                        logger.debug("Unkown value: " + val.toString() + " of feature: " + name
                                + ". Marking as missing value.");
                        instance.setMissing(instances.attribute(name));
                    } else
                        instance.setValue(instances.attribute(name), indexOfValue);
                }
            }

            instance.setDataset(instances);
            instances.add(instance);
        }
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | +import weka.core.Instance; | ||
8 | +import weka.core.Instances; | ||
9 | + | ||
10 | +import java.io.Serializable; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.TreeMap; | ||
14 | + | ||
/**
 * Serializable bundle of a trained head-detection classifier together with the
 * attribute schema and the quasi-verb lemma list it was trained with.
 */
public class Model implements Serializable {

    private static final long serialVersionUID = 3351727361273283076L;
    private static final Logger logger = LoggerFactory.getLogger(Model.class);

    // Trained Weka classifier used to label head candidates.
    private Classifier classifier;
    // Quasi-verb lemmas used during training; exposed so prediction-time feature
    // generation matches training-time feature generation.
    private Set<String> quasiVerbs;
    // Dataset defining the attribute schema (header) the classifier expects.
    private Instances instances;

    /**
     * @param classifier trained classifier
     * @param instances  dataset whose attribute structure and class index match training
     * @param quasiVerbs quasi-verb lemma set used during training
     */
    public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
        this.classifier = classifier;
        this.instances = instances;
        this.quasiVerbs = quasiVerbs;
    }

    /**
     * Classifies a single candidate instance as head / not-head. Classification
     * errors are logged and reported as not-head.
     *
     * @param instance feature vector of the candidate token
     * @param sentence source sentence (used only for error logging)
     * @return true iff the classifier predicts the positive class
     */
    public boolean isHead(Instance instance, Sentence sentence) {
        try {
            double response = this.classifier.classifyInstance(instance);
            // Class attribute is nominal {false, true}: a class index > 0 means "true".
            return response > 0;
        } catch (Exception e) {
            logger.error("Error classyfing head in sentence: " + sentence, e);
            return false;
        }
    }

    /**
     * Creates a dataset with this model's schema filled with the given examples.
     *
     * @param examples feature maps to convert
     * @return Instances ready to be passed to {@link #isHead}
     */
    public Instances getInstances(List<TreeMap<String, Object>> examples) {
        Instances instances = new Instances(this.instances);
        InstanceCreator.fillInstances(examples, instances);
        return instances;
    }

    /** @return quasi-verb lemmas this model was trained with */
    public Set<String> getQuasiVerbs() {
        return quasiVerbs;
    }
}
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import weka.core.SerializationHelper; | ||
4 | + | ||
5 | +import java.io.InputStream; | ||
6 | + | ||
7 | +public class Serializer { | ||
8 | + | ||
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | ||
10 | + SerializationHelper.write(targetModelFilePath, m); | ||
11 | + } | ||
12 | + | ||
13 | + public static Model loadModel(String path) throws Exception { | ||
14 | + Model m = (Model) SerializationHelper.read(path); | ||
15 | + return m; | ||
16 | + } | ||
17 | + | ||
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | ||
19 | + Model m = (Model) SerializationHelper.read(stream); | ||
20 | + return m; | ||
21 | + } | ||
22 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.head; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import weka.classifiers.Evaluation; | ||
6 | +import weka.classifiers.rules.JRip; | ||
7 | +import weka.classifiers.rules.JRip.RipperRule; | ||
8 | +import weka.core.Attribute; | ||
9 | +import weka.core.Instance; | ||
10 | +import weka.core.Instances; | ||
11 | + | ||
12 | +import java.io.*; | ||
13 | +import java.util.*; | ||
14 | + | ||
15 | +public class Trainer { | ||
16 | + | ||
17 | + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); | ||
18 | + | ||
19 | + private static final boolean DO_CV = false; | ||
20 | + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; | ||
21 | + | ||
22 | + private Trainer() { | ||
23 | + } | ||
24 | + | ||
25 | + public static void main(String[] args) { | ||
26 | + | ||
27 | + if (args.length != 2) { | ||
28 | + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() | ||
29 | + + " trainDir targetModelFile"); | ||
30 | + return; | ||
31 | + } | ||
32 | + | ||
33 | + File dataDir = new File(args[0]); | ||
34 | + String targetModelFilePath = args[1]; | ||
35 | + | ||
36 | + if (!dataDir.isDirectory()) { | ||
37 | + logger.error(dataDir + " is not a directory!"); | ||
38 | + return; | ||
39 | + } | ||
40 | + | ||
41 | + Set<String> quasiVerbs = loadQuasiVerbs(); | ||
42 | + | ||
43 | + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); | ||
44 | + Instances instances = InstanceCreator.createInstances(examples, "class"); | ||
45 | + InstanceCreator.fillInstances(examples, instances); | ||
46 | + | ||
47 | + printStats(instances); | ||
48 | + | ||
49 | + try { | ||
50 | + JRip model; | ||
51 | + | ||
52 | + if (DO_CV) { | ||
53 | + logger.info("Crossvalidation..."); | ||
54 | + model = new JRip(); | ||
55 | + Evaluation eval = new Evaluation(instances); | ||
56 | + eval.crossValidateModel(model, instances, 10, new Random(1)); | ||
57 | + logger.info(eval.toSummaryString()); | ||
58 | + logger.info(eval.toMatrixString()); | ||
59 | + logger.info(eval.toClassDetailsString()); | ||
60 | + } | ||
61 | + | ||
62 | + logger.info("Building final classifier..."); | ||
63 | + model = new JRip(); | ||
64 | + model.buildClassifier(instances); | ||
65 | + logger.info(model.getRuleset().size() + " rules generated."); | ||
66 | + for (int i = 0; i < model.getRuleset().size(); i++) { | ||
67 | + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); | ||
68 | + logger.info("\t" + v.toString(instances.classAttribute())); | ||
69 | + } | ||
70 | + | ||
71 | + instances.delete(); | ||
72 | + logger.info("Features stats:"); | ||
73 | + for (int i = 0; i < instances.numAttributes(); i++) { | ||
74 | + Attribute att = instances.attribute(i); | ||
75 | + logger.info(i + ".\t" + att.toString()); | ||
76 | + } | ||
77 | + | ||
78 | + logger.info("Saving classifier..."); | ||
79 | + Model m = new Model(model, instances, quasiVerbs); | ||
80 | + Serializer.saveModel(m, targetModelFilePath); | ||
81 | + logger.info("Done."); | ||
82 | + | ||
83 | + } catch (Exception e) { | ||
84 | + logger.error("Error: " + e); | ||
85 | + } | ||
86 | + } | ||
87 | + | ||
88 | + private static Set<String> loadQuasiVerbs() { | ||
89 | + Set<String> quasiVerbs = new HashSet<>(); | ||
90 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
91 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
92 | + String line; | ||
93 | + while ((line = br.readLine()) != null) { | ||
94 | + quasiVerbs.add(line.trim()); | ||
95 | + } | ||
96 | + } catch (IOException e) { | ||
97 | + logger.error(e.getLocalizedMessage(), e); | ||
98 | + } | ||
99 | + return quasiVerbs; | ||
100 | + } | ||
101 | + | ||
102 | + private static void printStats(Instances instances) { | ||
103 | + int positive = 0; | ||
104 | + int negative = 0; | ||
105 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
106 | + Instance inst = instances.instance(i); | ||
107 | + if (inst.classValue() > 0) | ||
108 | + negative++; | ||
109 | + else | ||
110 | + positive++; | ||
111 | + } | ||
112 | + logger.info(positive + " positive examples"); | ||
113 | + logger.info(negative + " negative examples"); | ||
114 | + logger.info((positive + negative) + " examples total"); | ||
115 | + logger.info((instances.numAttributes() - 1) + " attributes"); | ||
116 | + logger.info(instances.toSummaryString()); | ||
117 | + } | ||
118 | + | ||
119 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
4 | +import pl.waw.ipipan.zil.core.md.detection.Constants; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
6 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; | ||
7 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; | ||
8 | + | ||
9 | +import java.util.*; | ||
10 | + | ||
11 | + | ||
12 | +public class FeatureGeneration { | ||
    // Lemmas (coordinating conjunctions and weak punctuation) that split the
    // sentence into clauses — but only once the current clause already
    // contains a finite verb (see getClauses below).
    final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
            "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

    // Adversative/consecutive conjunctions — a second group of clause
    // splitters, likewise requiring a finite verb in the current clause.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
            "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
            "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

    // Tokens that always split a clause, whether or not a verb was seen.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList(new String[] { "?", "!" }));

    // Paired delimiters: opening lemma -> the closing lemma expected for it.
    final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }

    // Morphosyntactic ctags treated as nominal (nouns, pronouns, gerunds,
    // numerals).
    final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
            "ppron3", "ger", "num", "numcol" }));

    // Ctags of personal pronouns.
    final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

    // Ctags of finite verb forms.
    final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
            "praet", "winien" }));

    // Relative-pronoun lemmas ("jaki", "który" — Polish "which/what"),
    // presumably used when merging relative clauses — see getClauses.
    final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
            "który" }));
40 | + | ||
    /**
     * Fills {@code features} with classifier features describing whether
     * {@code candidate} should belong to the mention headed by {@code head}.
     *
     * @param features  output map, feature name -> value
     * @param valence   Walenty valence dictionaries (nouns' valence is used)
     * @param head      head token of the mention under construction
     * @param candidate token evaluated for inclusion in the mention
     * @param s         sentence containing both tokens
     * @param heads     all head tokens detected in the sentence
     */
    public static void generateFeatures(Map<String, Object> features, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            Token head, Token candidate, Sentence s, List<Token> heads) {

        //addTokenFeatures(features, "head", head, s);
        addTokenFeatures(features, "candidate", candidate, s);

        //features.put("sentLength", s.size()); // last one checked
        features.put("sameWord", sameWord(head, candidate, s));
        features.put("sameNE", sameNE(head, candidate, s));
        features.put("sameNG", sameNG(head, candidate, s));

        // Linear token distance between head and candidate.
        features.put("distance", Math.abs(head.getSentencePosition() - candidate.getSentencePosition()));
        //features.put("headIsFirst", Boolean.valueOf(head.compareTo(candidate) < 0));
        features.put("candidateIsFirst", Boolean.valueOf(head.compareTo(candidate) > 0));

        features.put("sameWalentyConstruction", sameWalentyConstruction(head, candidate, s, valence));
        features.put("sameToken", sameToken(head, candidate));

        // Whether the candidate token is itself a detected mention head.
        features.put("candidateIsAlsoHead", Boolean.valueOf(heads.contains(candidate)));
        features.put("isNextToCandidateColon", isNextColon(candidate, s));

        features.put("candidateStartsWithUpperOrth", Character.isUpperCase(candidate.getOrth().codePointAt(0)));
        features.put("candidateStartsWithUpperBase", Character.isUpperCase(candidate.getBase().codePointAt(0)));
        features.put("isDotNextToHead", isNextDot(head, s));
        features.put("closestNEDistance", closestNEDistance(head, candidate, s));
        features.put("headStartsWithUpperOrth", Character.isUpperCase(head.getOrth().codePointAt(0)));
        features.put("headStartsWithUpperBase", Character.isUpperCase(head.getBase().codePointAt(0))); // the optimal feature set ends here


        // candidate in head in closest NE distance

//        features.put("candidateOrthLength", candidate.getOrth().length());
//        features.put("candidateBaseLength", candidate.getBase().length());
//        features.put("headOrthLength", head.getOrth().length());
//        features.put("headBaseLength", head.getBase().length());

        //features.put("isNextToHeadColon", isNextColon(head, s));
        //features.put("isCandidateColon", Boolean.valueOf(candidate.getOrth().equals(":"))); // just needs a run, not checked yet

/*        features.put("isClauseSplitLemmaStrict", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(candidate.getBase())));
        features.put("isClauseSplitLemma", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS.contains(candidate.getBase())));
        features.put("isClauseSplitLemma2", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(candidate.getBase())));*/

/*        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put("nextIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(next.getBase())));
        } else {
            features.put("nextIsClauseSplitLemmaStrict", "sentEnd");
            features.put("nextIsClauseSplitLemma", "sentEnd");
            features.put("nextIsClauseSplitLemma2", "sentEnd");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put("previousIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(previous.getBase())));
        } else {
            features.put("previousIsClauseSplitLemmaStrict", "sentStart");
            features.put("previousIsClauseSplitLemma", "sentStart");
            features.put("previousIsClauseSplitLemma2", "sentStart");
        }*/


        //features.put("candidateIsClosingBracket", candidateIsClosingBracket(head, candidate, s));
        //features.put("candidateIsQM", candidateIsClosingQM(head, candidate, s));
        //features.put("candidateIsClosingBracket", Boolean.valueOf(candidate.getOrth().equals(")")));

        // TODO (translated from Polish): head position within the mention could
        // be simulated — something is not working; right-hand continuity
        // probably needs re-checking; add head-NG-group length and
        // Walenty-construction group length — may combine well with distance;
        // add is-stop-word for the candidate, and maybe reuse solutions from
        // head detection; also check whether the preceding token is part of the
        // mention; experiment with the separators too; word ctag!!
/*
        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put(String.format("%sCtag", "nextToCandidate"), next.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "nextToCandidate"), next.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "nextToCandidate"), next.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "nextToCandidate"), next.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "nextToCandidate"), "null");
            features.put(String.format("%sNumber", "nextToCandidate"), "null");
            features.put(String.format("%sGender", "nextToCandidate"), "null");
            features.put(String.format("%sPerson", "nextToCandidate"), "null");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put(String.format("%sCtag", "previousToCandidate"), previous.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "previousToCandidate"), previous.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "previousToCandidate"), previous.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "previousToCandidate"), previous.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "previousToCandidate"), "null");
            features.put(String.format("%sNumber", "previousToCandidate"), "null");
            features.put(String.format("%sGender", "previousToCandidate"), "null");
            features.put(String.format("%sPerson", "previousToCandidate"), "null");
        }
        */


    }
148 | + | ||
149 | + private static int closestNEDistance(Token head, Token candidate, Sentence s) { | ||
150 | + int lowestDistance = -1; | ||
151 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
152 | + int distance = ne.getTokens().get(0).getSentencePosition() - head.getSentencePosition(); | ||
153 | + if ( distance >= 0 && ne.getTokens().contains(candidate) && (distance < lowestDistance || lowestDistance < 0)) { | ||
154 | + lowestDistance = distance; | ||
155 | + } | ||
156 | + } | ||
157 | + return lowestDistance; | ||
158 | + } | ||
159 | + | ||
160 | + ///////////////////////////// | ||
161 | + | ||
162 | +/* private static boolean candidateIsClosingBracket(Token head, Token candidate, Sentence s) { | ||
163 | + | ||
164 | + | ||
165 | + | ||
166 | + if (!candidate.getOrth().equals(")")) { | ||
167 | + return Boolean.valueOf(false); | ||
168 | + } | ||
169 | + | ||
170 | + int openedBrackets = 0; | ||
171 | + int closedBrackets = 0; | ||
172 | + for (Token t : s) { | ||
173 | + if (candidate.getSentencePosition() == t.getSentencePosition()) { | ||
174 | + break; | ||
175 | + } | ||
176 | + | ||
177 | + if (t.getSentencePosition() >= head.getSentencePosition()) { | ||
178 | + if (t.getOrth().equals("(")) | ||
179 | + openedBrackets++; | ||
180 | + if (t.getOrth().equals(")")) | ||
181 | + closedBrackets++; | ||
182 | + } | ||
183 | + } | ||
184 | + | ||
185 | + if (openedBrackets - closedBrackets > 0) { | ||
186 | + return Boolean.valueOf(true); | ||
187 | + } | ||
188 | + | ||
189 | + return Boolean.valueOf(false); | ||
190 | + }*/ | ||
191 | + | ||
192 | + private static boolean isNextColon(Token t, Sentence s) { | ||
193 | + int idx = s.indexOf(t) + 1; | ||
194 | + if (idx >= s.size() || idx < 0) | ||
195 | + return Boolean.valueOf(false); | ||
196 | + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); | ||
197 | + } | ||
198 | + | ||
199 | + private static boolean isNextDot(Token t, Sentence s) { | ||
200 | + int idx = s.indexOf(t) + 1; | ||
201 | + if (idx >= s.size() || idx < 0) | ||
202 | + return Boolean.valueOf(false); | ||
203 | + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); | ||
204 | + } | ||
205 | + | ||
206 | + private static boolean candidateIsClosingQM(Token head, Token candidate, Sentence s) { | ||
207 | + | ||
208 | + if (!candidate.getOrth().equals("\"")) { | ||
209 | + return Boolean.valueOf(false); | ||
210 | + } | ||
211 | + | ||
212 | + int start = head.getSentencePosition(); | ||
213 | + int end = candidate.getSentencePosition() - 1; | ||
214 | + if (head.compareTo(candidate) > 0) { | ||
215 | + start = candidate.getSentencePosition() + 1; | ||
216 | + end = head.getSentencePosition(); | ||
217 | + } | ||
218 | + | ||
219 | + int QMs = 0; | ||
220 | + for (Token t : s) { | ||
221 | + if (end == t.getSentencePosition()) { | ||
222 | + break; | ||
223 | + } | ||
224 | + | ||
225 | + if (t.getSentencePosition() >= start) { | ||
226 | + if (t.getOrth().equals("\"")) | ||
227 | + QMs++; | ||
228 | + } | ||
229 | + } | ||
230 | + | ||
231 | + if ((QMs % 2) != 0) { | ||
232 | + return Boolean.valueOf(true); | ||
233 | + } | ||
234 | + | ||
235 | + return Boolean.valueOf(false); | ||
236 | + } | ||
237 | + | ||
238 | + private static boolean sameWord(Token t1, Token t2, Sentence s) { | ||
239 | + | ||
240 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
241 | + if (w.getTokens().contains(t1) && w.getTokens().contains(t2)) { | ||
242 | + return Boolean.valueOf(true); | ||
243 | + } | ||
244 | + } | ||
245 | + return Boolean.valueOf(false); | ||
246 | + } | ||
247 | + | ||
248 | + private static boolean sameNE(Token t1, Token t2, Sentence s) { | ||
249 | + | ||
250 | + for (NamedEntity ne : s.getNamedEntities()) { | ||
251 | + if (ne.getTokens().contains(t1) && ne.getTokens().contains(t2)) { | ||
252 | + return Boolean.valueOf(true); | ||
253 | + } | ||
254 | + } | ||
255 | + return Boolean.valueOf(false); | ||
256 | + } | ||
257 | + | ||
258 | + private static boolean sameNG(Token head, Token candidate, Sentence s) { | ||
259 | + | ||
260 | + for (SyntacticGroup group : s.getGroups()) { | ||
261 | + if (group.getType().startsWith("NG")) { | ||
262 | + if (group.getSemanticHeadTokens().contains(head) && group.getTokens().contains(candidate)) { | ||
263 | + return Boolean.valueOf(true); | ||
264 | + } | ||
265 | + } | ||
266 | + } | ||
267 | + return Boolean.valueOf(false); | ||
268 | + } | ||
269 | + | ||
270 | + private static boolean sameWalentyConstruction(Token head, Token candidate, Sentence s, | ||
271 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | ||
272 | + | ||
273 | + for (SyntacticGroup group : s.getGroups()) { | ||
274 | + if (group.getType().startsWith("NG")) { | ||
275 | + ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>(); | ||
276 | + nestedGroups.add(group); | ||
277 | + | ||
278 | + SyntacticGroup nextGroup = group.getFollowingGroup(); | ||
279 | + while (nextGroup != null) { | ||
280 | + nestedGroups.add(nextGroup); | ||
281 | + nextGroup = nextGroup.getFollowingGroup(); | ||
282 | + } | ||
283 | + | ||
284 | + List<Token> extendedGroupSegments = getExtendedGroupSegments(nestedGroups, valence.get(ValenceDicts.NounsValence)); | ||
285 | + List<Token> extendedGroupHeads = getExtendedGroupHeads(nestedGroups); | ||
286 | + if (extendedGroupHeads.contains(head) && extendedGroupSegments.contains(candidate)) | ||
287 | + return Boolean.valueOf(true); | ||
288 | + } | ||
289 | + } | ||
290 | + return Boolean.valueOf(false); | ||
291 | + } | ||
292 | + | ||
293 | + private static List<Token> getExtendedGroupSegments(ArrayList<SyntacticGroup> nestedGroups, | ||
294 | + Map<String,ArrayList<String>> walentyNouns) { | ||
295 | + | ||
296 | + SyntacticGroup initialGroup = nestedGroups.get(0); | ||
297 | + String initialGroupHead = initialGroup.getSemanticHeadTokens().get(0).getBase(); | ||
298 | + | ||
299 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | ||
300 | + List<Token> segments = new ArrayList<Token>(); | ||
301 | + | ||
302 | + if (!walentyNouns.containsKey(initialGroupHead)) { | ||
303 | + segments.addAll(initialGroup.getTokens()); | ||
304 | + } else { | ||
305 | + | ||
306 | + ArrayList<String> schemata = walentyNouns.get(initialGroupHead); | ||
307 | + ArrayList<ArrayList<String>> groupsRealizations = new ArrayList<ArrayList<String>>(); | ||
308 | + ArrayList<SyntacticGroup> largestMatch = new ArrayList<SyntacticGroup>(); | ||
309 | + largestMatch.add(initialGroup); | ||
310 | + | ||
311 | + for (int i=1; i < nestedGroups.size(); i++) { | ||
312 | + SyntacticGroup group = nestedGroups.get(i); | ||
313 | + ArrayList<String> realizations = group.getWalentyRealizations(); | ||
314 | + groupsRealizations.add(realizations); | ||
315 | + if (realizationsMatch(schemata, groupsRealizations)) { | ||
316 | + largestMatch.add(group); | ||
317 | + } else { | ||
318 | + break; | ||
319 | + } | ||
320 | + } | ||
321 | + | ||
322 | + for (SyntacticGroup group : largestMatch) { | ||
323 | + segments.addAll(group.getTokens()); | ||
324 | + } | ||
325 | + | ||
326 | + } | ||
327 | + return segments; | ||
328 | + } | ||
329 | + | ||
330 | + private static List<Token> getExtendedGroupHeads(ArrayList<SyntacticGroup> nestedGroups) { | ||
331 | + | ||
332 | + SyntacticGroup initialGroup = nestedGroups.get(0); | ||
333 | + | ||
334 | + List<Token> heads = initialGroup.getSemanticHeadTokens(); | ||
335 | + | ||
336 | + return heads; | ||
337 | + } | ||
338 | + | ||
339 | + private static boolean realizationsMatch(ArrayList<String> schemata, | ||
340 | + ArrayList<ArrayList<String>> groupsRealizations) { | ||
341 | + for (String schema : schemata) { | ||
342 | + if (isProperSchema(schema, groupsRealizations)) { | ||
343 | + return true; | ||
344 | + } | ||
345 | + } | ||
346 | + return false; | ||
347 | + } | ||
348 | + | ||
349 | + private static boolean isProperSchema(String schema, | ||
350 | + ArrayList<ArrayList<String>> groupsRealizations) { | ||
351 | + | ||
352 | + ArrayList<ArrayList<String>> matchingPositions = new ArrayList<ArrayList<String>>(); | ||
353 | + for (ArrayList<String> realizations : groupsRealizations) { | ||
354 | + matchingPositions.add(getMatchingPositions(schema, realizations)); | ||
355 | + } | ||
356 | + | ||
357 | + if (matchingPositionsExists(matchingPositions)) { | ||
358 | + return true; | ||
359 | + /*ArrayList<ArrayList<String>> product = cartesianProduct(matchingPositions); | ||
360 | + for (ArrayList<String> combination : product) { | ||
361 | + Set<String> combinationSet = new HashSet<String>(combination); | ||
362 | + if (combinationSet.size() == matchingPositions.size()) { | ||
363 | + return true; | ||
364 | + } | ||
365 | + }*/ | ||
366 | + } | ||
367 | + return false; | ||
368 | + } | ||
369 | + | ||
370 | + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { | ||
371 | + ArrayList<String> positions = new ArrayList<String>(); | ||
372 | + for (String position : schema.split("\\s\\+\\s")) { | ||
373 | + position = position.trim(); | ||
374 | + position = position.substring(1, position.length()-1); | ||
375 | + for (String phrT : position.split(";")) { | ||
376 | + if (phraseRealizations.contains(phrT.trim())) { | ||
377 | + positions.add(position); | ||
378 | + break; | ||
379 | + } | ||
380 | + } | ||
381 | + } | ||
382 | + return positions; | ||
383 | + } | ||
384 | + | ||
385 | + private static boolean matchingPositionsExists(ArrayList<ArrayList<String>> matchingPositions) { | ||
386 | + for (ArrayList<String> positions : matchingPositions) { | ||
387 | + if (positions.isEmpty()) { | ||
388 | + return false; | ||
389 | + } | ||
390 | + } | ||
391 | + return true; | ||
392 | + } | ||
393 | + | ||
394 | + private static boolean sameToken(Token t1, Token t2) { | ||
395 | + if (t1.compareTo(t2) == 0) { | ||
396 | + return Boolean.valueOf(true); | ||
397 | + } | ||
398 | + return Boolean.valueOf(false); | ||
399 | + } | ||
400 | + ////////////////////////////////// | ||
401 | + | ||
    /**
     * Adds morphosyntactic features of token {@code t} to the map under the
     * given {@code label} prefix: the token's ctag/number/gender/person, the
     * ctag of the syntactic word containing it, and the (word-)ctags of its
     * direct neighbours ("None" at sentence boundaries).
     */
    private static void addTokenFeatures(Map<String, Object> features, String label, Token t, Sentence s) {
        features.put(String.format("%sCtag", label), t.getChosenInterpretation().getCtag());
        features.put(String.format("%sNumber", label), t.getChosenInterpretation().getNumber());
        features.put(String.format("%sGender", label), t.getChosenInterpretation().getGender());
        features.put(String.format("%sPerson", label), t.getChosenInterpretation().getPerson());
        features.put(String.format("%sWordCtag", label), wordCtag(t, s));

        features.put(String.format("%sNextCtag", label), getNeighbouringTag(s, t, 1));
        features.put(String.format("%sPrevCtag", label), getNeighbouringTag(s, t, -1));


        Token next = getNeighbouringToken(s, t, 1);
        if (next != null) {
            features.put(String.format("%sNextWordCtag", label), wordCtag(next, s));
        } else {
            features.put(String.format("%sNextWordCtag", label), "None");
        }

        Token previous = getNeighbouringToken(s, t, -1);
        if (previous != null) {
            features.put(String.format("%sPrevWordCtag", label), wordCtag(previous, s));
        } else {
            features.put(String.format("%sPrevWordCtag", label), "None");
        }

//        features.put(String.format("%sNextNextCtag", label), getNeighbouringTag(s, t, 2));
//        features.put(String.format("%sPrevPrevCtag", label), getNeighbouringTag(s, t, -2));

//        features.put(String.format("%sSentPosition", label), t.getSentencePosition());


//        features.put(String.format("%sPrevPraet", label), isPrevPraet(t, s));
//        features.put(String.format("%sPrevComma", label), isPrevComma(t, s));
//        features.put(String.format("%sPrev2Pred", label), isPrev2Pred(t, s));
//        features.put(String.format("%sNextInf", label), isNextInf(t, s));

/*        List<Token> clause = getClause(s, t);
        if (clause != null)
            features.put(String.format("%sClauseLength", label), clause.size());
        else
            features.put(String.format("%sClauseLength", label), 0);*/

        /*addFeatures(features, clause, String.format("%sClause", label), t);
        addFeatures(features, s, String.format("%sSent", label), t);*/
//        for (int i = 1; i < 6; i++) // do this, but in a window from head to candidate
//            addFeatures(features, getWindow(s, t, i, 0), String.format("%sWindow_", label) + i + "_" + 0, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, 0, i), String.format("%sWindow_", label) + 0 + "_" + i, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, i, i), String.format("%sWindow_", label) + i + "_" + i, t);
    }
453 | + | ||
454 | + private static String wordCtag(Token t, Sentence s) { | ||
455 | + for (SyntacticWord w : s.getSyntacticWords()) { | ||
456 | + if (w.getTokens().contains(t)) { | ||
457 | + return w.getCtag(); | ||
458 | + } | ||
459 | + } | ||
460 | + return "None"; | ||
461 | + } | ||
462 | + | ||
463 | + private static boolean isNextInf(Token m, Sentence s) { | ||
464 | + boolean now = false; | ||
465 | + for (Token morph : s) { | ||
466 | + if (now) | ||
467 | + return morph.getChosenInterpretation().getCtag().equals("inf"); | ||
468 | + if (m.equals(morph)) | ||
469 | + now = true; | ||
470 | + } | ||
471 | + return false; | ||
472 | + } | ||
473 | + | ||
474 | + private static boolean isPrev2Pred(Token m, Sentence s) { | ||
475 | + Token prev = null; | ||
476 | + Token prev2 = null; | ||
477 | + for (Token morph : s) { | ||
478 | + if (m.equals(morph)) | ||
479 | + break; | ||
480 | + prev2 = prev; | ||
481 | + prev = morph; | ||
482 | + } | ||
483 | + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) | ||
484 | + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); | ||
485 | + } | ||
486 | + | ||
487 | + private static Object isPrevComma(Token m, Sentence s) { | ||
488 | + Token prev = null; | ||
489 | + for (Token morph : s) { | ||
490 | + if (m.equals(morph)) | ||
491 | + break; | ||
492 | + prev = morph; | ||
493 | + } | ||
494 | + return prev != null && prev.getChosenInterpretation().getBase().equals(","); | ||
495 | + } | ||
496 | + | ||
497 | + private static String getNeighbouringTag(Sentence s, Token m, int i) { | ||
498 | + int idx = s.indexOf(m) + i; | ||
499 | + if (idx >= s.size() || idx < 0) | ||
500 | + return "None"; | ||
501 | + return s.get(idx).getChosenInterpretation().getCtag(); | ||
502 | + } | ||
503 | + | ||
504 | + private static Token getNeighbouringToken(Sentence s, Token m, int i) { | ||
505 | + int idx = s.indexOf(m) + i; | ||
506 | + if (idx >= s.size() || idx < 0) | ||
507 | + return null; | ||
508 | + return s.get(idx); | ||
509 | + } | ||
510 | + | ||
    /**
     * Adds agreement features computed over the noun tokens of {@code clause}
     * relative to token {@code m}: presence of nominative nouns, nouns with
     * agreeing number, nouns with agreeing gender-or-person, their
     * combinations, and "two or more" variants. Tokens whose predecessor is
     * "jak"/"jako" are skipped.
     * NOTE(review): relies on helpers isNoun/isJakJako/isNom/agreedNum/
     * agreedGenderOrPerson defined elsewhere in this class (not visible here)
     * — semantics inferred from their names, verify against their bodies.
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1
        boolean hasNum = false; // 2
        boolean hasPOG = false; // 3

        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // Only nouns count; skip tokens preceded by "jak"/"jako".
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }
584 | + | ||
585 | + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { | ||
586 | + | ||
587 | + int idx = s.indexOf(m); | ||
588 | + int from = Math.max(0, idx - pre); | ||
589 | + int to = Math.min(s.size(), idx + post + 1); | ||
590 | + | ||
591 | + return new ArrayList<>(s.subList(from, to)); | ||
592 | + } | ||
593 | + | ||
594 | + private static boolean isPrevPraet(Token m, Sentence s) { | ||
595 | + Token prev = null; | ||
596 | + for (Token morph : s) { | ||
597 | + if (m.equals(morph)) | ||
598 | + break; | ||
599 | + prev = morph; | ||
600 | + } | ||
601 | + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); | ||
602 | + } | ||
603 | + | ||
604 | + /** | ||
605 | + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, | ||
606 | + * lub (jak przy streszczeniach: w środku musi być czasownik w formie | ||
607 | + * osobowej), | ||
608 | + * | ||
609 | + * @param s | ||
610 | + * sentence | ||
611 | + * @param m2 | ||
612 | + * token | ||
613 | + * @return clause with the token | ||
614 | + */ | ||
615 | + public static List<Token> getClause(Sentence s, Token m2) { | ||
616 | + | ||
617 | + List<List<Token>> sublists = getClauses(s); | ||
618 | + | ||
619 | + for (List<Token> sub : sublists) | ||
620 | + for (Token m : sub) | ||
621 | + if (m.equals(m2)) | ||
622 | + return sub; | ||
623 | + | ||
624 | + return null; | ||
625 | + } | ||
626 | + | ||
/**
 * Splits a sentence into clauses. A split happens at strict delimiters
 * ("?", "!") always, and at weak delimiters (conjunctions / punctuation in
 * CLAUSE_SPLIT_LEMMAS / CLAUSE_SPLIT_LEMMAS2) only once the current clause
 * already contains a verb. Tokens inside syntactic groups/words are never
 * split points. Afterwards, a clause whose first two tokens include a
 * relative pronoun (ZAIMKI_WZGLEDNE_LEMMAS) is merged back into the
 * preceding clause.
 *
 * @param s sentence to segment
 * @return list of clauses, each a list of tokens in sentence order
 */
public static List<List<Token>> getClauses(Sentence s) {

    // Tokens that are NOT the last token of a syntactic group/word must not
    // end a clause, so they are excluded as split points.
    Set<Token> noSplitMorphs = new HashSet<>();
    for (SyntacticGroup g : s.getGroups()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }
    for (SyntacticWord g : s.getSyntacticWords()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }

    LinkedList<List<Token>> sublists = new LinkedList<>();
    List<Token> currentSublist = new ArrayList<>();
    boolean clauseHasVerb = false;
    for (Token m : s) {
        String base = m.getChosenInterpretation().getBase();
        // Strict delimiters always split; weak ones only if a verb was seen.
        if (!noSplitMorphs.contains(m)
                && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
                        .contains(base)) && clauseHasVerb))) {
            sublists.add(currentSublist);
            currentSublist = new ArrayList<>();
            clauseHasVerb = false;
        } else {
            if (isVerb(m))
                clauseHasVerb = true;
        }
        // The delimiter token itself starts the next clause.
        currentSublist.add(m);
    }
    // Trailing tokens: keep as a clause only if they contain a verb,
    // otherwise append them to the last clause.
    // NOTE(review): when sublists is empty AND the tail has no verb, the
    // tail is silently dropped — confirm this is intended.
    if (currentSublist.size() > 0) {
        if (clauseHasVerb)
            sublists.add(currentSublist);
        else if (!sublists.isEmpty())
            sublists.getLast().addAll(currentSublist);
    }

    // merge clause beginning with zaimek wzgl. etc to previous clause
    List<Token> prev = null;
    Iterator<List<Token>> it = sublists.iterator();
    while (it.hasNext()) {
        List<Token> sublist = it.next();
        boolean containsRelPron = false;
        int i = 1;
        // Only the first two tokens of the clause are inspected.
        for (Token m : sublist) {
            if (i > 2)
                break;
            if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
                containsRelPron = true;
                break;
            }
            i++;
        }
        if (prev != null && containsRelPron) {
            prev.addAll(sublist);
            it.remove();
        } else
            prev = sublist;
    }

    return sublists;
}
690 | + | ||
691 | + private static boolean agreedNum(Token candidate, Token keyword) { | ||
692 | + String keywordNum = keyword.getNumber(); | ||
693 | + String wordNum = candidate.getNumber(); | ||
694 | + return keywordNum.equals(wordNum); | ||
695 | + } | ||
696 | + | ||
697 | + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) { | ||
698 | + if (isPraet(keyword)) { | ||
699 | + // praet has number:gender | ||
700 | + String keywordGender = keyword.getGender(); | ||
701 | + String wordGender = candidate.getGender(); | ||
702 | + return keywordGender.equals(wordGender); | ||
703 | + } else { | ||
704 | + // other verbs have number:person | ||
705 | + String keywordPerson = keyword.getPerson(); | ||
706 | + String wordPerson = "ter"; // default | ||
707 | + if (PRONOUN_TAGS.contains(candidate.getCtag())) | ||
708 | + wordPerson = candidate.getPerson(); | ||
709 | + return wordPerson.equals(keywordPerson); | ||
710 | + } | ||
711 | + } | ||
712 | + | ||
713 | + private static boolean isJakJako(Token prev) { | ||
714 | + String base = prev == null ? null : prev.getBase(); | ||
715 | + return prev != null && (base.equals("jak") || base.equals("jako")); | ||
716 | + } | ||
717 | + | ||
718 | + private static boolean isPraet(Token keyword) { | ||
719 | + return keyword.getCtag().equals("praet"); | ||
720 | + } | ||
721 | + | ||
722 | + private static boolean isNom(Token candidate) { | ||
723 | + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow | ||
724 | + // tylko! | ||
725 | + } | ||
726 | + | ||
727 | + public static boolean isNoun(Token m) { | ||
728 | + return NOUN_TAGS.contains(m.getCtag()); | ||
729 | + } | ||
730 | + | ||
731 | + public static boolean isNoun(Mention m) { | ||
732 | + return NOUN_TAGS.contains(m.getHeadSegments().get(0).getCtag()); | ||
733 | + } | ||
734 | + | ||
735 | + public static boolean isVerb(Token morph) { | ||
736 | + return VERB_TAGS.contains(morph.getCtag()); | ||
737 | + } | ||
738 | + | ||
739 | + public static boolean isVerb(Mention m) { | ||
740 | + boolean hasOnlyVerbs = true; | ||
741 | + for (Token morph : m.getSegments()) | ||
742 | + if (!isVerb(morph)) { | ||
743 | + hasOnlyVerbs = false; | ||
744 | + break; | ||
745 | + } | ||
746 | + return hasOnlyVerbs; | ||
747 | + } | ||
748 | + | ||
749 | + public static boolean isVerb(TEIMention m) { | ||
750 | + boolean hasOnlyVerbs = true; | ||
751 | + for (TEIMorph morph : m.getMorphs()) | ||
752 | + if (!isVerb(morph)) { | ||
753 | + hasOnlyVerbs = false; | ||
754 | + break; | ||
755 | + } | ||
756 | + return hasOnlyVerbs; | ||
757 | + } | ||
758 | + | ||
759 | + private static boolean isVerb(TEIMorph morph) { | ||
760 | + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); | ||
761 | + } | ||
762 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | + | ||
6 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
7 | +import pl.waw.ipipan.zil.core.md.entities.*; | ||
8 | +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; | ||
9 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; | ||
10 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; | ||
11 | +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; | ||
12 | +import weka.core.Attribute; | ||
13 | +import weka.core.FastVector; | ||
14 | +import weka.core.Instance; | ||
15 | +import weka.core.Instances; | ||
16 | + | ||
17 | +import java.io.File; | ||
18 | +import java.util.*; | ||
19 | +import java.util.Map.Entry; | ||
20 | + | ||
21 | +public class InstanceCreator { | ||
22 | + | ||
23 | + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); | ||
24 | + private static final TEI_IO teiIO = TEI_IO.getInstance(); | ||
25 | + | ||
26 | + private InstanceCreator() { | ||
27 | + } | ||
28 | + | ||
29 | + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs, | ||
30 | + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { | ||
31 | + int allTexts = 0; | ||
32 | + int exceptions = 0; | ||
33 | + int allSentences = 0; | ||
34 | + | ||
35 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
36 | + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { | ||
37 | + try { | ||
38 | + allTexts++; | ||
39 | + logger.info("Processing text " + textDir); | ||
40 | + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); | ||
41 | + Text text = TeiLoader.loadTextFromTei(ct, textDir); | ||
42 | + | ||
43 | + for (Paragraph p : text) | ||
44 | + for (Sentence s : p) { | ||
45 | + allSentences++; | ||
46 | + loadExamplesFromSentence(quasiVerbs, valence, examples, s); | ||
47 | + } | ||
48 | + | ||
49 | + } catch (Exception e) { | ||
50 | + //logger.error(e.getLocalizedMessage()); | ||
51 | + e.printStackTrace(); | ||
52 | + exceptions++; | ||
53 | + } | ||
54 | + } | ||
55 | + | ||
56 | + logger.info(allTexts + " texts found."); | ||
57 | + if (exceptions != 0) | ||
58 | + logger.error(exceptions + " texts with exceptions."); | ||
59 | + logger.info(allSentences + " sentences found."); | ||
60 | + | ||
61 | + return examples; | ||
62 | + } | ||
63 | + | ||
64 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
65 | + List<TreeMap<String, Object>> examples, Sentence s) { | ||
66 | + | ||
67 | + | ||
68 | + ArrayList<Token> heads = new ArrayList<>(); | ||
69 | + for (Mention m : s.getMentions()) { | ||
70 | + heads.addAll(m.getHeadSegments()); | ||
71 | + } | ||
72 | + | ||
73 | + // collect positive examples | ||
74 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | ||
75 | + for (Mention m : s.getMentions()) { | ||
76 | + if (heads.containsAll(m.getHeadSegments())) { | ||
77 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | ||
78 | + } | ||
79 | + } | ||
80 | + | ||
81 | + for (Token head : s) { | ||
82 | + if (heads.contains(head)) { | ||
83 | + for (Token t : s) { | ||
84 | + //if (head.compareTo(t) != 0) {// && Math.abs(head.getSentencePosition() - t.getSentencePosition()) <= window) { | ||
85 | + TreeMap<String, Object> features = new TreeMap<>(); | ||
86 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | ||
87 | + features.put("class", Boolean.valueOf(true)); | ||
88 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
89 | + | ||
90 | + } else { | ||
91 | + features.put("class", Boolean.valueOf(false)); | ||
92 | + //features.put("candidatePositionInMention", 0); | ||
93 | + } | ||
94 | + | ||
95 | + | ||
96 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | ||
97 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
98 | + addPreviousStates(features, head, t, s); | ||
99 | + | ||
100 | + examples.add(features); | ||
101 | + // } | ||
102 | + } | ||
103 | + } | ||
104 | + } | ||
105 | + } | ||
106 | + | ||
107 | + public static void addPreviousStates(Map<String, Object> features, Token head, Token candidate, Sentence s) { | ||
108 | + int context = 1; | ||
109 | + int candidateLocation = candidate.getSentencePosition(); | ||
110 | + for (int i = 1; i <= context; i++) { | ||
111 | + if (candidateLocation - i < 0) { | ||
112 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | ||
113 | + } else if (sameMention(s.get(candidateLocation - i), head, s) ) { | ||
114 | + features.put(String.format("location-%d", i), Boolean.valueOf(true)); | ||
115 | + } else { | ||
116 | + features.put(String.format("location-%d", i), Boolean.valueOf(false)); | ||
117 | + } | ||
118 | + } | ||
119 | + } | ||
120 | + | ||
121 | + public static int positionInMention(Token head, Token t, Sentence s) { | ||
122 | + | ||
123 | + Token previous = null; | ||
124 | + if (t.getSentencePosition()-1 >= 0) { | ||
125 | + previous = s.get(t.getSentencePosition()-1); | ||
126 | + } else { | ||
127 | + return 0; | ||
128 | + } | ||
129 | + | ||
130 | + for (Mention m : s.getMentions()) { | ||
131 | + if (m.getHeadSegments().contains(head) && m.getSegments().contains(previous)) { | ||
132 | +/* if (m.getSegments().get(0).getSentencePosition() - t.getSentencePosition() <= -1) { | ||
133 | + System.out.println(m.getSegments().get(0)); | ||
134 | + System.out.println(t); | ||
135 | + System.out.println(m.getSegments()); | ||
136 | + }*/ | ||
137 | + return previous.getSentencePosition() - m.getSegments().get(0).getSentencePosition(); | ||
138 | + } | ||
139 | + } | ||
140 | + return 0; | ||
141 | + } | ||
142 | + | ||
143 | + private static boolean sameMention(Token t1, Token t2, Sentence s) { | ||
144 | + for (Mention m : s.getMentions()) { | ||
145 | + if (m.getSegments().contains(t1) && m.getSegments().contains(t2)) { | ||
146 | + return true; | ||
147 | + } | ||
148 | + } | ||
149 | + return false; | ||
150 | + } | ||
151 | + | ||
152 | + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, | ||
153 | + List<TreeMap<String, Object>> examples, Sentence s, List<Token> heads) { | ||
154 | + | ||
155 | + | ||
156 | + if (heads == null || heads.isEmpty()) | ||
157 | + return; | ||
158 | + | ||
159 | + // collect positive examples | ||
160 | + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); | ||
161 | + for (Mention m : s.getMentions()) { | ||
162 | + if (heads.containsAll(m.getHeadSegments())) { | ||
163 | + positives.put(m.getHeadSegments().get(0), m.getSegments()); | ||
164 | + } | ||
165 | + } | ||
166 | + | ||
167 | + for (Token head : s) { | ||
168 | + if (heads.contains(head)) { | ||
169 | + for (Token t : s) { | ||
170 | + TreeMap<String, Object> features = new TreeMap<>(); | ||
171 | + | ||
172 | + if (positives.containsKey(head) && positives.get(head).contains(t)) { | ||
173 | + features.put("class", Boolean.valueOf(true)); | ||
174 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
175 | + | ||
176 | + } else { | ||
177 | + features.put("class", Boolean.valueOf(false)); | ||
178 | + //features.put("candidatePositionInMention", 0); | ||
179 | + } | ||
180 | + | ||
181 | + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); | ||
182 | + //features.put("candidatePositionInMention", positionInMention(head, t, s)); | ||
183 | + addPreviousStates(features, head, t, s); | ||
184 | + examples.add(features); | ||
185 | + } | ||
186 | + } | ||
187 | + } | ||
188 | + } | ||
189 | + | ||
190 | + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { | ||
191 | + | ||
192 | + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); | ||
193 | + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); | ||
194 | + TreeMap<String, Set<String>> att2values = new TreeMap<>(); | ||
195 | + for (TreeMap<String, Object> example : examples) { | ||
196 | + for (Entry<String, Object> e : example.entrySet()) { | ||
197 | + String key = e.getKey(); | ||
198 | + Object val = e.getValue(); | ||
199 | + if (val instanceof Integer || val instanceof Double) { | ||
200 | + doubleAttsOccurred.add(key); | ||
201 | + continue; | ||
202 | + } | ||
203 | + if (val instanceof Boolean) { | ||
204 | + booleanAttsOccurred.add(key); | ||
205 | + continue; | ||
206 | + } | ||
207 | + if (!att2values.containsKey(key)) | ||
208 | + att2values.put(key, new HashSet<>()); | ||
209 | + att2values.get(key).add(val.toString()); | ||
210 | + } | ||
211 | + } | ||
212 | + | ||
213 | + List<Attribute> atts = new ArrayList<>(); | ||
214 | + | ||
215 | + // double attributes | ||
216 | + for (String attName : doubleAttsOccurred) { | ||
217 | + Attribute att = new Attribute(attName); | ||
218 | + atts.add(att); | ||
219 | + } | ||
220 | + | ||
221 | + // boolean attributes (treated as nominal) | ||
222 | + FastVector values = new FastVector(2); | ||
223 | + values.addElement("false"); | ||
224 | + values.addElement("true"); | ||
225 | + for (String attName : booleanAttsOccurred) { | ||
226 | + Attribute att = new Attribute(attName, values); | ||
227 | + atts.add(att); | ||
228 | + } | ||
229 | + | ||
230 | + // nominal attributes | ||
231 | + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { | ||
232 | + FastVector vals = new FastVector(attVals.getValue().size()); | ||
233 | + for (String val : attVals.getValue()) | ||
234 | + vals.addElement(val); | ||
235 | + Attribute att = new Attribute(attVals.getKey(), vals); | ||
236 | + atts.add(att); | ||
237 | + } | ||
238 | + | ||
239 | + FastVector fvWekaAttributes = new FastVector(atts.size()); | ||
240 | + for (Attribute attr : atts) { | ||
241 | + fvWekaAttributes.addElement(attr); | ||
242 | + } | ||
243 | + | ||
244 | + Instances data = new Instances("Nominal", fvWekaAttributes, 10); | ||
245 | + data.setClass(data.attribute(classFeatureName)); | ||
246 | + return data; | ||
247 | + } | ||
248 | + | ||
249 | + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { | ||
250 | + for (TreeMap<String, Object> example : examples) { | ||
251 | + addInstance(example, instances); | ||
252 | + } | ||
253 | + } | ||
254 | + | ||
255 | + public static void addInstance(TreeMap<String, Object> example, Instances instances) { | ||
256 | + Instance instance = new Instance(instances.numAttributes()); | ||
257 | + | ||
258 | + for (Entry<String, Object> e : example.entrySet()) { | ||
259 | + Object val = e.getValue(); | ||
260 | + String name = e.getKey(); | ||
261 | + if (val instanceof Integer) { | ||
262 | + instance.setValue(instances.attribute(name), (int) val); | ||
263 | + } else if (val instanceof Boolean) { | ||
264 | + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); | ||
265 | + } else { | ||
266 | + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); | ||
267 | + if (indexOfValue == -1) { | ||
268 | + logger.debug("Unkown value: " + val.toString() + " of feature: " + name | ||
269 | + + ". Marking as missing value."); | ||
270 | + instance.setMissing(instances.attribute(name)); | ||
271 | + } else | ||
272 | + instance.setValue(instances.attribute(name), indexOfValue); | ||
273 | + } | ||
274 | + } | ||
275 | + | ||
276 | + instance.setDataset(instances); | ||
277 | + instances.add(instance); | ||
278 | + } | ||
279 | + | ||
280 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import org.slf4j.Logger; | ||
4 | +import org.slf4j.LoggerFactory; | ||
5 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
6 | +import weka.classifiers.Classifier; | ||
7 | +import weka.core.Instance; | ||
8 | +import weka.core.Instances; | ||
9 | + | ||
10 | +import java.io.Serializable; | ||
11 | +import java.util.List; | ||
12 | +import java.util.Set; | ||
13 | +import java.util.TreeMap; | ||
14 | + | ||
15 | +public class Model implements Serializable { | ||
16 | + | ||
17 | + private static final long serialVersionUID = 3351727361273283076L; | ||
18 | + private static final Logger logger = LoggerFactory.getLogger(Model.class); | ||
19 | + | ||
20 | + private Classifier classifier; | ||
21 | + private Set<String> quasiVerbs; | ||
22 | + private Instances instances; | ||
23 | + | ||
24 | + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { | ||
25 | + this.classifier = classifier; | ||
26 | + this.instances = instances; | ||
27 | + this.quasiVerbs = quasiVerbs; | ||
28 | + } | ||
29 | + | ||
30 | + public boolean arePartOfSameMention(Instance instance, Sentence sentence) { | ||
31 | + try { | ||
32 | + double response = this.classifier.classifyInstance(instance); | ||
33 | + return response > 0; | ||
34 | + } catch (Exception e) { | ||
35 | + logger.error("Error classyfing verb in sentence: " + sentence, e); | ||
36 | + return false; | ||
37 | + } | ||
38 | + } | ||
39 | + | ||
40 | + public Instances getInstances(List<TreeMap<String, Object>> examples) { | ||
41 | + Instances instances = new Instances(this.instances); | ||
42 | + InstanceCreator.fillInstances(examples, instances); | ||
43 | + return instances; | ||
44 | + } | ||
45 | + | ||
46 | + public Instances getInstances() { | ||
47 | + Instances instances = new Instances(this.instances); | ||
48 | + return instances; | ||
49 | + } | ||
50 | + | ||
51 | + public Set<String> getQuasiVerbs() { | ||
52 | + return quasiVerbs; | ||
53 | + } | ||
54 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import java.io.File; | ||
4 | +import java.io.InputStream; | ||
5 | +import java.util.ArrayList; | ||
6 | +import java.util.HashSet; | ||
7 | +import java.util.List; | ||
8 | +import java.util.Map; | ||
9 | +import java.util.Set; | ||
10 | +import java.util.TreeMap; | ||
11 | +import java.util.Map.Entry; | ||
12 | + | ||
13 | +import org.slf4j.Logger; | ||
14 | +import org.slf4j.LoggerFactory; | ||
15 | + | ||
16 | +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; | ||
17 | +import pl.waw.ipipan.zil.core.md.detection.nominal.FeatureGeneration; | ||
18 | +import pl.waw.ipipan.zil.core.md.detection.nominal.InstanceCreator; | ||
19 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Model; | ||
20 | +import pl.waw.ipipan.zil.core.md.detection.nominal.Serializer; | ||
21 | +import pl.waw.ipipan.zil.core.md.entities.Mention; | ||
22 | +import pl.waw.ipipan.zil.core.md.entities.Sentence; | ||
23 | +import pl.waw.ipipan.zil.core.md.entities.Token; | ||
24 | +import weka.core.Instances; | ||
25 | + | ||
26 | +public class NominalMentionDetector { | ||
27 | + final private static Logger logger = LoggerFactory.getLogger(NominalMentionDetector.class); | ||
28 | + | ||
29 | + private Model model; | ||
30 | + private Set<String> quasiVerbs = new HashSet<>(); | ||
31 | + | ||
32 | + public void addNominalMentions(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, List<Token> heads) { | ||
33 | + List<TreeMap<String, Object>> examples = new ArrayList<>(); | ||
34 | + InstanceCreator.loadExamplesFromSentence(quasiVerbs, valence, examples, sentence, heads); | ||
35 | + if (examples.isEmpty()) | ||
36 | + return; | ||
37 | + | ||
38 | + Instances instances = model.getInstances(); | ||
39 | + | ||
40 | + // label instances | ||
41 | + List<Boolean> areInSameMention = new ArrayList<>(); | ||
42 | + for (int i = 0; i < examples.size(); i++) { | ||
43 | + TreeMap<String, Object> example = examples.get(i); | ||
44 | + if (i - 1 < 0) { | ||
45 | + example.put("location-1", Boolean.valueOf(false)); | ||
46 | + //example.put("candidatePositionInMention", 0); | ||
47 | + } else { | ||
48 | + example.put("location-1", Boolean.valueOf(areInSameMention.get(i-1))); | ||
49 | +// int positionInMention = 1; | ||
50 | +// while (i - positionInMention >= 0 && areInSameMention.get(i-positionInMention)) { | ||
51 | +// positionInMention++; | ||
52 | +// } | ||
53 | +// example.put("candidatePositionInMention", positionInMention-1); | ||
54 | + } | ||
55 | + | ||
56 | + InstanceCreator.addInstance(example, instances); | ||
57 | + boolean inSameMention = model.arePartOfSameMention(instances.instance(i), sentence); | ||
58 | + areInSameMention.add(inSameMention); | ||
59 | + } | ||
60 | + | ||
61 | + int i = 0; | ||
62 | + for (Token head : sentence) { | ||
63 | + if (heads.contains(head)) { | ||
64 | + ArrayList<Token> mSegments = new ArrayList<Token>(); | ||
65 | + ArrayList<Token> mHead = new ArrayList<Token>(); | ||
66 | + mHead.add(head); | ||
67 | + for (Token t : sentence) { | ||
68 | + if (head.compareTo(t) != 0) { | ||
69 | + if (areInSameMention.get(i)) { | ||
70 | + mSegments.add(t); | ||
71 | + } | ||
72 | + } else { | ||
73 | + mSegments.add(t); | ||
74 | + } | ||
75 | + i++; | ||
76 | + } | ||
77 | + | ||
78 | + // cleaning | ||
79 | + if(mSegments.get(mSegments.size()-1).getCtag().equals("prep") || mSegments.get(mSegments.size()-1).getCtag().equals("conj") || | ||
80 | + mSegments.get(mSegments.size()-1).getCtag().equals("comp")) { | ||
81 | + mSegments.remove(mSegments.size()-1); | ||
82 | + } | ||
83 | + if(mSegments.get(0).getCtag().equals("prep") || mSegments.get(0).getCtag().equals("conj") || | ||
84 | + mSegments.get(0).getCtag().equals("comp")) { | ||
85 | + mSegments.remove(0); | ||
86 | + } | ||
87 | + | ||
88 | + sentence.addMention(new Mention(mSegments, mHead)); | ||
89 | + } | ||
90 | + } | ||
91 | + } | ||
92 | + | ||
93 | + public NominalMentionDetector(File zeroSubjectDetectionModel) { | ||
94 | + try { | ||
95 | + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); | ||
96 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
97 | + } catch (Exception e) { | ||
98 | + logger.error("Error loading model:" + e); | ||
99 | + } | ||
100 | + } | ||
101 | + | ||
102 | + public NominalMentionDetector(InputStream zeroSubjectDetectionModelStream) { | ||
103 | + try { | ||
104 | + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); | ||
105 | + this.quasiVerbs = this.model.getQuasiVerbs(); | ||
106 | + } catch (Exception e) { | ||
107 | + logger.error("Error loading model:" + e); | ||
108 | + } | ||
109 | + } | ||
110 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
3 | +import weka.core.SerializationHelper; | ||
4 | + | ||
5 | +import java.io.InputStream; | ||
6 | + | ||
7 | +public class Serializer { | ||
8 | + | ||
9 | + public static void saveModel(Model m, String targetModelFilePath) throws Exception { | ||
10 | + SerializationHelper.write(targetModelFilePath, m); | ||
11 | + } | ||
12 | + | ||
13 | + public static Model loadModel(String path) throws Exception { | ||
14 | + Model m = (Model) SerializationHelper.read(path); | ||
15 | + return m; | ||
16 | + } | ||
17 | + | ||
18 | + public static Model loadModelFromStream(InputStream stream) throws Exception { | ||
19 | + Model m = (Model) SerializationHelper.read(stream); | ||
20 | + return m; | ||
21 | + } | ||
22 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.detection.nominal; | ||
2 | + | ||
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import pl.waw.ipipan.zil.core.md.Main;
import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import weka.classifiers.Evaluation;
import weka.classifiers.rules.JRip;
import weka.classifiers.rules.JRip.RipperRule;
import weka.classifiers.trees.J48;
import weka.core.Attribute;
import weka.core.Instance;
import weka.core.Instances;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.*;
20 | + | ||
21 | +public class Trainer { | ||
22 | + | ||
private static final Logger logger = LoggerFactory.getLogger(Trainer.class);

// Cross-validation is expensive; enabled only for experiments.
private static final boolean DO_CV = false;
// Classpath resources bundled with the trainer.
private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";

// Valence dictionaries keyed by dictionary kind; filled once at class load.
// (was a raw `new EnumMap(...)` — now properly parameterized)
private static Map<ValenceDicts, Map<String, ArrayList<String>>> valence =
        new EnumMap<>(ValenceDicts.class);

static {
    InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
    valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

    InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
    valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
}

private Trainer() {
    // utility class — entry point is main()
}
43 | + | ||
/**
 * Trains the nominal mention detection model.
 * Usage: Trainer trainDir targetModelFile — reads NKJP texts from trainDir,
 * builds a J48 decision tree over token-pair examples and serializes it
 * (together with the dataset header and quasi-verb list) to targetModelFile.
 */
public static void main(String[] args) {

    if (args.length != 2) {
        logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
                + " trainDir targetModelFile");
        return;
    }

    File dataDir = new File(args[0]);
    String targetModelFilePath = args[1];

    if (!dataDir.isDirectory()) {
        logger.error(dataDir + " is not a directory!");
        return;
    }

    Set<String> quasiVerbs = loadQuasiVerbs();

    // NOTE(review): the static initializer already populated `valence` with
    // exactly these entries; this re-read looks redundant — confirm.
    InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
    valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));

    InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
    valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));

    List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs, valence);
    Instances instances = InstanceCreator.createInstances(examples, "class");
    InstanceCreator.fillInstances(examples, instances);

    printStats(instances);

    try {
        J48 model;

        logger.info("Building final classifier...");
        model = new J48();
        model.buildClassifier(instances);
        logger.info("J48 tree:");
        logger.info(model.toString());

        // delete() empties the data rows so only the dataset header
        // (attribute definitions) is serialized with the model below
        instances.delete();
        logger.info("Features stats:");
        for (int i = 0; i < instances.numAttributes(); i++) {
            Attribute att = instances.attribute(i);
            logger.info(i + ".\t" + att.toString());
        }

        logger.info("Saving classifier...");
        Model m = new Model(model, instances, quasiVerbs);
        Serializer.saveModel(m, targetModelFilePath);
        logger.info("Done.");

    } catch (Exception e) {
        logger.error("Error: " + e);
    }

/* try {
    JRip model;

    if (DO_CV) {
        logger.info("Crossvalidation...");
        model = new JRip();
        Evaluation eval = new Evaluation(instances);
        eval.crossValidateModel(model, instances, 10, new Random(1));
        logger.info(eval.toSummaryString());
        logger.info(eval.toMatrixString());
        logger.info(eval.toClassDetailsString());
    }

    logger.info("Building final classifier...");
    model = new JRip();
    model.buildClassifier(instances);
    logger.info(model.getRuleset().size() + " rules generated.");
    for (int i = 0; i < model.getRuleset().size(); i++) {
        RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
        logger.info("\t" + v.toString(instances.classAttribute()));
    }

    instances.delete();
    logger.info("Features stats:");
    for (int i = 0; i < instances.numAttributes(); i++) {
        Attribute att = instances.attribute(i);
        logger.info(i + ".\t" + att.toString());
    }

    logger.info("Saving classifier...");
    Model m = new Model(model, instances, quasiVerbs);
    Serializer.saveModel(m, targetModelFilePath);
    logger.info("Done.");

} catch (Exception e) {
    logger.error("Error: " + e);
}*/
}
137 | + | ||
138 | + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) | ||
139 | + { | ||
140 | + Map<String,ArrayList<String>> map; | ||
141 | + try { | ||
142 | + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); | ||
143 | + map = new HashMap<String,ArrayList<String>>(); | ||
144 | + String line; | ||
145 | + boolean firstLine = true; | ||
146 | + while((line = br.readLine()) != null) { | ||
147 | + if (firstLine) { | ||
148 | + line = line.replace("\uFEFF", ""); // remove BOM character | ||
149 | + firstLine = false; | ||
150 | + } | ||
151 | + | ||
152 | + if (!line.startsWith("%")) { | ||
153 | + String[] lineParts = line.split(":"); | ||
154 | + String lemma = lineParts[0].trim(); | ||
155 | + String schema = lineParts[5].trim(); | ||
156 | + | ||
157 | + if (schema.trim().isEmpty()) { | ||
158 | + continue; | ||
159 | + } | ||
160 | + | ||
161 | + String[] lemmaParts = lemma.split(" "); | ||
162 | + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { | ||
163 | + lemma = lemma + " się"; | ||
164 | + } | ||
165 | + | ||
166 | + ArrayList<String> schemata; | ||
167 | + if (!map.containsKey(lemma)) { | ||
168 | + schemata = new ArrayList<String>(); | ||
169 | + schemata.add(schema); | ||
170 | + map.put(lemma, schemata); | ||
171 | + } else { | ||
172 | + schemata = map.get(lemma); | ||
173 | + schemata.add(schema); | ||
174 | + map.put(lemma, schemata); | ||
175 | + } | ||
176 | + } | ||
177 | + } | ||
178 | + br.close(); | ||
179 | + } catch (IOException ex) { | ||
180 | + ex.printStackTrace(); | ||
181 | + throw new RuntimeException(ex); | ||
182 | + } | ||
183 | + return map; | ||
184 | + } | ||
185 | + | ||
186 | + private static boolean schemaContainsSie(String schema) { | ||
187 | + for (String position : schema.split("\\s\\+\\s")) { | ||
188 | + position = position.trim(); | ||
189 | + position = position.substring(1, position.length()-1); | ||
190 | + for (String phrT : position.split(";")) { | ||
191 | + if (phrT.equals("refl") || phrT.equals("recip")) { | ||
192 | + return true; | ||
193 | + } | ||
194 | + } | ||
195 | + } | ||
196 | + | ||
197 | + return false; | ||
198 | + } | ||
199 | + | ||
200 | + private static Set<String> loadQuasiVerbs() { | ||
201 | + Set<String> quasiVerbs = new HashSet<>(); | ||
202 | + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); | ||
203 | + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { | ||
204 | + String line; | ||
205 | + while ((line = br.readLine()) != null) { | ||
206 | + quasiVerbs.add(line.trim()); | ||
207 | + } | ||
208 | + } catch (IOException e) { | ||
209 | + logger.error(e.getLocalizedMessage(), e); | ||
210 | + } | ||
211 | + return quasiVerbs; | ||
212 | + } | ||
213 | + | ||
214 | + private static void printStats(Instances instances) { | ||
215 | + int positive = 0; | ||
216 | + int negative = 0; | ||
217 | + for (int i = 0; i < instances.numInstances(); i++) { | ||
218 | + Instance inst = instances.instance(i); | ||
219 | + if (inst.classValue() > 0) | ||
220 | + negative++; | ||
221 | + else | ||
222 | + positive++; | ||
223 | + } | ||
224 | + logger.info(positive + " positive examples"); | ||
225 | + logger.info(negative + " negative examples"); | ||
226 | + logger.info((positive + negative) + " examples total"); | ||
227 | + logger.info((instances.numAttributes() - 1) + " attributes"); | ||
228 | + logger.info(instances.toSummaryString()); | ||
229 | + } | ||
230 | + | ||
231 | +} |
src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java
0 → 100644
1 | +package pl.waw.ipipan.zil.core.md.entities; | ||
2 | + | ||
3 | +public class Relation { | ||
4 | + | ||
5 | + private String name; | ||
6 | + private Token target; | ||
7 | + | ||
8 | + public Relation(String name, Token target) { | ||
9 | + this.name = name; | ||
10 | + this.target = target; | ||
11 | + } | ||
12 | + | ||
13 | + public String getName() { | ||
14 | + return name; | ||
15 | + } | ||
16 | + | ||
17 | + public Token getTarget() { | ||
18 | + return target; | ||
19 | + } | ||
20 | + | ||
21 | +} |