Commit 3d23a642e950208184da7ac7d198861326de7415

Authored by Bartłomiej Nitoń
1 parent 2d60e476

Added missing files.

.gitignore
... ... @@ -2,3 +2,4 @@
2 2 .classpath
3 3 .project
4 4 .settings
  5 +/bin/
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import pl.waw.ipipan.zil.core.md.detection.Constants;
  4 +import pl.waw.ipipan.zil.core.md.entities.*;
  5 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
  6 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;
  7 +
  8 +import java.util.*;
  9 +
  10 +public class FeatureGeneration {
  11 + final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
  12 + "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));
  13 +
  14 + final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
  15 + "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
  16 + "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));
  17 +
  18 + final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
  19 + Arrays.asList(new String[] { "?", "!" }));
  20 +
  21 + final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
  22 + static {
  23 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
  24 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
  25 + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
  26 + }
  27 +
  28 + final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
  29 + "ppron3", "ger", "num", "numcol" }));
  30 +
  31 + final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));
  32 +
  33 + final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
  34 + "praet", "winien" }));
  35 +
  36 + final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
  37 + "który" }));
  38 +
  39 + public static void generateFeatures(Map<String, Object> features, Token t, Sentence s, Set<String> quasiVerbs) {
  40 +
  41 + features.put("ctag", t.getChosenInterpretation().getCtag());
  42 + features.put("number", t.getChosenInterpretation().getNumber());
  43 +
  44 + features.put("NGHead", NGHead(t, s));
  45 + features.put("isNextColon", isNextColon(t, s));
  46 + features.put("wordCtag", wordCtag(t, s));
  47 + features.put("isPartOfNE", isPartOfNE(t, s));
  48 + features.put("isFirstInNE", isFirstInNE(t, s));
  49 + features.put("nextCtag", getNeighbouringTag(s, t, 1));
  50 + features.put("prevCtag", getNeighbouringTag(s, t, -1));
  51 + features.put("sentLength", s.size());
  52 +
  53 + features.put("tokenOrthLength", t.getOrth().length());
  54 + features.put("tokenBaseLength", t.getBase().length());
  55 + features.put("isNextDot", isNextDot(t, s));
  56 + features.put("closestNEDistance", closestNEDistance(t, s));
  57 + features.put("startsWithUpperOrth", Character.isUpperCase(t.getOrth().codePointAt(0)));
  58 + features.put("startsWithUpperBase", Character.isUpperCase(t.getBase().codePointAt(0)));
  59 +
  60 +
  61 + //features.put("isPartOfFrazeo", isPartOfFrazeo(t, s));
  62 + //features.put("gender", t.getChosenInterpretation().getGender());
  63 + //features.put("person", t.getChosenInterpretation().getPerson());
  64 + //features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase()));
  65 + //features.put("isPrevPraet", isPrevPraet(t, s));
  66 + //features.put("isPrevComma", isPrevComma(t, s));
  67 + //features.put("isPrev2Pred", isPrev2Pred(t, s));
  68 + //features.put("isNextInf", isNextInf(t, s));
  69 +
  70 +
  71 + //List<Token> clause = getClause(s, m);
  72 +// features.put("clauseLength", clause.size());
  73 +
  74 + //addFeatures(features, clause, "clause", m);
  75 +/* addFeatures(features, s, "sent", t);
  76 + for (int i = 1; i < 6; i++)
  77 + addFeatures(features, getWindow(s, t, i, 0), "window_" + i + "_" + 0, t);
  78 + for (int i = 1; i < 6; i++)
  79 + addFeatures(features, getWindow(s, t, 0, i), "window_" + 0 + "_" + i, t);
  80 + for (int i = 1; i < 6; i++)
  81 + addFeatures(features, getWindow(s, t, i, i), "window_" + i + "_" + i, t);*/
  82 + }
  83 +
  84 + ///////////////////////////////////
  85 +
  86 + private static boolean NGHead(Token t, Sentence s) {
  87 +
  88 + for (SyntacticGroup group : s.getGroups()) {
  89 + if (group.getType().startsWith("NG") && group.getSemanticHeadTokens().contains(t)) {
  90 + return Boolean.valueOf(true);
  91 + }
  92 + }
  93 + return Boolean.valueOf(false);
  94 + }
  95 +
  96 + private static boolean isNextColon(Token t, Sentence s) {
  97 + int idx = s.indexOf(t) + 1;
  98 + if (idx >= s.size() || idx < 0)
  99 + return Boolean.valueOf(false);
  100 + return Boolean.valueOf(s.get(idx).getOrth().equals(":"));
  101 + }
  102 +
  103 + private static boolean isNextDot(Token t, Sentence s) {
  104 + int idx = s.indexOf(t) + 1;
  105 + if (idx >= s.size() || idx < 0)
  106 + return Boolean.valueOf(false);
  107 + return Boolean.valueOf(s.get(idx).getOrth().equals("."));
  108 + }
  109 +
  110 + private static String wordCtag(Token t, Sentence s) {
  111 + for (SyntacticWord w : s.getSyntacticWords()) {
  112 + if (w.getTokens().contains(t)) {
  113 + return w.getCtag();
  114 + }
  115 + }
  116 + return "None";
  117 + }
  118 +
  119 + private static boolean isPartOfNE(Token t, Sentence s) {
  120 + for (NamedEntity ne : s.getNamedEntities()) {
  121 + if (ne.getTokens().contains(t)) {
  122 + return Boolean.valueOf(true);
  123 + }
  124 + }
  125 + return Boolean.valueOf(false);
  126 + }
  127 +
  128 + private static int closestNEDistance(Token t, Sentence s) {
  129 + int lowestDistance = -1;
  130 + for (NamedEntity ne : s.getNamedEntities()) {
  131 + int distance = ne.getTokens().get(0).getSentencePosition() - t.getSentencePosition();
  132 + if ( distance >= 0 && (distance < lowestDistance || lowestDistance < 0)) {
  133 + lowestDistance = distance;
  134 + }
  135 + }
  136 + return lowestDistance;
  137 + }
  138 +
  139 + private static boolean isFirstInNE(Token t, Sentence s) {
  140 + for (NamedEntity ne : s.getNamedEntities()) {
  141 + if (ne.getTokens().get(0).compareTo(t) == 0) {
  142 + return Boolean.valueOf(true);
  143 + }
  144 + }
  145 + return Boolean.valueOf(false);
  146 + }
  147 +
  148 + private static boolean isPartOfFrazeo(Token t, Sentence s) {
  149 + for (SyntacticWord word : s.getSyntacticWords()) {
  150 + if (word.getTokens().contains(t) &&
  151 + Constants.FRAZEO_CTAGS.contains(word.getCtag())) {
  152 + return true;
  153 + }
  154 + }
  155 + return false;
  156 + }
  157 +
  158 + ///////////////////////////////////
  159 +
  160 + private static boolean isNextInf(Token m, Sentence s) {
  161 + boolean now = false;
  162 + for (Token morph : s) {
  163 + if (now)
  164 + return morph.getChosenInterpretation().getCtag().equals("inf");
  165 + if (m.equals(morph))
  166 + now = true;
  167 + }
  168 + return false;
  169 + }
  170 +
  171 + private static boolean isPrev2Pred(Token m, Sentence s) {
  172 + Token prev = null;
  173 + Token prev2 = null;
  174 + for (Token morph : s) {
  175 + if (m.equals(morph))
  176 + break;
  177 + prev2 = prev;
  178 + prev = morph;
  179 + }
  180 + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred"))
  181 + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred"));
  182 + }
  183 +
  184 + private static Object isPrevComma(Token m, Sentence s) {
  185 + Token prev = null;
  186 + for (Token morph : s) {
  187 + if (m.equals(morph))
  188 + break;
  189 + prev = morph;
  190 + }
  191 + return prev != null && prev.getChosenInterpretation().getBase().equals(",");
  192 + }
  193 +
  194 + private static String getNeighbouringTag(Sentence s, Token m, int i) {
  195 + int idx = s.indexOf(m) + i;
  196 + if (idx >= s.size() || idx < 0)
  197 + return "None";
  198 + return s.get(idx).getChosenInterpretation().getCtag();
  199 + }
  200 +
  201 + private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {
  202 +
  203 + boolean hasNom = false; // 1
  204 + boolean hasNum = false; // 2
  205 + boolean hasPOG = false; // 3
  206 +
  207 + boolean hasNomNum = false;
  208 + boolean hasNumPOG = false;
  209 + boolean hasNomPOG = false;
  210 + boolean hasNomNumPOG = false;
  211 +
  212 + boolean has2Nom = false;
  213 + boolean has2NomPOG = false;
  214 + boolean has2POG = false;
  215 +
  216 + Token prev = null;
  217 + for (Token candidate : clause) {
  218 +
  219 + if (!isNoun(candidate) || isJakJako(prev)) {
  220 + prev = candidate;
  221 + continue;
  222 + }
  223 +
  224 + // nom, nom2
  225 + if (isNom(candidate)) {
  226 + if (hasNom)
  227 + has2Nom = true;
  228 + hasNom = true;
  229 + }
  230 + // num
  231 + if (agreedNum(candidate, m)) {
  232 + hasNum = true;
  233 + }
  234 + // pog, pog2
  235 + if (agreedGenderOrPerson(candidate, m)) {
  236 + if (hasPOG)
  237 + has2POG = true;
  238 + hasPOG = true;
  239 + }
  240 +
  241 + // nom num, nom num pog
  242 + if (isNom(candidate) && agreedNum(candidate, m)) {
  243 + if (agreedGenderOrPerson(candidate, m))
  244 + hasNomNumPOG = true;
  245 + hasNomNum = true;
  246 + }
  247 +
  248 + // nom pog, num pog
  249 + if (agreedGenderOrPerson(candidate, m))
  250 + if (isNom(candidate)) {
  251 + if (hasNomPOG)
  252 + has2NomPOG = true;
  253 + hasNomPOG = true;
  254 + } else if (agreedNum(candidate, m))
  255 + hasNumPOG = true;
  256 +
  257 + prev = candidate;
  258 + }
  259 +
  260 + // features.put("conj_" + prefix, hasConj);
  261 + features.put("cand_2_nom_" + prefix, has2Nom);
  262 + features.put("cand_2_POG_" + prefix, has2POG);
  263 + features.put("cand_2_nom+POG_" + prefix, has2NomPOG);
  264 +
  265 + features.put("cand_nom_" + prefix, hasNom);
  266 + features.put("cand_num_" + prefix, hasNum);
  267 + features.put("cand_POG_" + prefix, hasPOG);
  268 +
  269 + features.put("cand_nom+num_" + prefix, hasNomNum);
  270 + features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
  271 + features.put("cand_nom+POG_" + prefix, hasNomPOG);
  272 + features.put("cand_num+POG_" + prefix, hasNumPOG);
  273 + }
  274 +
  275 + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {
  276 +
  277 + int idx = s.indexOf(m);
  278 + int from = Math.max(0, idx - pre);
  279 + int to = Math.min(s.size(), idx + post + 1);
  280 +
  281 + return new ArrayList<>(s.subList(from, to));
  282 + }
  283 +
  284 + private static boolean isPrevPraet(Token m, Sentence s) {
  285 + Token prev = null;
  286 + for (Token morph : s) {
  287 + if (m.equals(morph))
  288 + break;
  289 + prev = morph;
  290 + }
  291 + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
  292 + }
  293 +
  294 + /**
  295 + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo,
  296 + * lub (jak przy streszczeniach: w środku musi być czasownik w formie
  297 + * osobowej),
  298 + *
  299 + * @param s
  300 + * sentence
  301 + * @param m2
  302 + * token
  303 + * @return clause with the token
  304 + */
  305 + public static List<Token> getClause(Sentence s, Token m2) {
  306 +
  307 + List<List<Token>> sublists = getClauses(s);
  308 +
  309 + for (List<Token> sub : sublists)
  310 + for (Token m : sub)
  311 + if (m.equals(m2))
  312 + return sub;
  313 +
  314 + return null;
  315 + }
  316 +
  317 + public static List<List<Token>> getClauses(Sentence s) {
  318 +
  319 + Set<Token> noSplitMorphs = new HashSet<>();
  320 + for (SyntacticGroup g : s.getGroups()) {
  321 + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
  322 + noSplitMorphs.add(m);
  323 + }
  324 + }
  325 + for (SyntacticWord g : s.getSyntacticWords()) {
  326 + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
  327 + noSplitMorphs.add(m);
  328 + }
  329 + }
  330 +
  331 + LinkedList<List<Token>> sublists = new LinkedList<>();
  332 + List<Token> currentSublist = new ArrayList<>();
  333 + boolean clauseHasVerb = false;
  334 + for (Token m : s) {
  335 + String base = m.getChosenInterpretation().getBase();
  336 + if (!noSplitMorphs.contains(m)
  337 + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
  338 + .contains(base)) && clauseHasVerb))) {
  339 + sublists.add(currentSublist);
  340 + currentSublist = new ArrayList<>();
  341 + clauseHasVerb = false;
  342 + } else {
  343 + if (isVerb(m))
  344 + clauseHasVerb = true;
  345 + }
  346 + currentSublist.add(m);
  347 + }
  348 + if (currentSublist.size() > 0) {
  349 + if (clauseHasVerb)
  350 + sublists.add(currentSublist);
  351 + else
  352 + sublists.getLast().addAll(currentSublist);
  353 + }
  354 +
  355 + // merge clause beginning with zaimek wzgl. etc to previous clause
  356 + List<Token> prev = null;
  357 + Iterator<List<Token>> it = sublists.iterator();
  358 + while (it.hasNext()) {
  359 + List<Token> sublist = it.next();
  360 + boolean containsRelPron = false;
  361 + int i = 1;
  362 + for (Token m : sublist) {
  363 + if (i > 2)
  364 + break;
  365 + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
  366 + containsRelPron = true;
  367 + break;
  368 + }
  369 + i++;
  370 + }
  371 + if (prev != null && containsRelPron) {
  372 + prev.addAll(sublist);
  373 + it.remove();
  374 + } else
  375 + prev = sublist;
  376 + }
  377 +
  378 + return sublists;
  379 + }
  380 +
  381 + private static boolean agreedNum(Token candidate, Token keyword) {
  382 + String keywordNum = keyword.getNumber();
  383 + String wordNum = candidate.getNumber();
  384 + return keywordNum.equals(wordNum);
  385 + }
  386 +
  387 + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
  388 + if (isPraet(keyword)) {
  389 + // praet has number:gender
  390 + String keywordGender = keyword.getGender();
  391 + String wordGender = candidate.getGender();
  392 + return keywordGender.equals(wordGender);
  393 + } else {
  394 + // other verbs have number:person
  395 + String keywordPerson = keyword.getPerson();
  396 + String wordPerson = "ter"; // default
  397 + if (PRONOUN_TAGS.contains(candidate.getCtag()))
  398 + wordPerson = candidate.getPerson();
  399 + return wordPerson.equals(keywordPerson);
  400 + }
  401 + }
  402 +
  403 + private static boolean isJakJako(Token prev) {
  404 + String base = prev == null ? null : prev.getBase();
  405 + return prev != null && (base.equals("jak") || base.equals("jako"));
  406 + }
  407 +
  408 + private static boolean isPraet(Token keyword) {
  409 + return keyword.getCtag().equals("praet");
  410 + }
  411 +
  412 + private static boolean isNom(Token candidate) {
  413 + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow
  414 + // tylko!
  415 + }
  416 +
  417 + private static boolean isNoun(Token m) {
  418 + return NOUN_TAGS.contains(m.getCtag());
  419 + }
  420 +
  421 + public static boolean isVerb(Token morph) {
  422 + return VERB_TAGS.contains(morph.getCtag());
  423 + }
  424 +
  425 + public static boolean isVerb(Mention m) {
  426 + boolean hasOnlyVerbs = true;
  427 + for (Token morph : m.getSegments())
  428 + if (!isVerb(morph)) {
  429 + hasOnlyVerbs = false;
  430 + break;
  431 + }
  432 + return hasOnlyVerbs;
  433 + }
  434 +
  435 + public static boolean isVerb(TEIMention m) {
  436 + boolean hasOnlyVerbs = true;
  437 + for (TEIMorph morph : m.getMorphs())
  438 + if (!isVerb(morph)) {
  439 + hasOnlyVerbs = false;
  440 + break;
  441 + }
  442 + return hasOnlyVerbs;
  443 + }
  444 +
  445 + private static boolean isVerb(TEIMorph morph) {
  446 + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag());
  447 + }
  448 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  6 +import pl.waw.ipipan.zil.core.md.entities.Token;
  7 +import weka.core.Instances;
  8 +
  9 +import java.io.File;
  10 +import java.io.InputStream;
  11 +import java.util.*;
  12 +
  13 +public class HeadDetector {
  14 +
  15 + final private static Logger logger = LoggerFactory.getLogger(HeadDetector.class);
  16 +
  17 + private Model model;
  18 + private Set<String> quasiVerbs = new HashSet<>();
  19 +
  20 + public static int detectedHeads = 0;
  21 +
  22 + public List<Token> detectHeads(Sentence sentence) {
  23 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  24 + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence);
  25 + if (examples.isEmpty())
  26 + return null;
  27 +
  28 + Instances instances = model.getInstances(examples);
  29 +
  30 + // label instances
  31 + List<Boolean> areHeads = new ArrayList<>();
  32 + List<Token> heads = new ArrayList<>();
  33 + for (int i = 0; i < instances.numInstances(); i++) {
  34 + boolean isHead = model.isHead(instances.instance(i), sentence);
  35 + areHeads.add(isHead);
  36 + if (isHead)
  37 + detectedHeads++;
  38 + }
  39 +
  40 + int i = 0;
  41 + for (Token m : sentence) {
  42 + if (FeatureGeneration.isVerb(m))
  43 + continue;
  44 + if (areHeads.get(i))
  45 + heads.add(m);
  46 + // sentence.addMention(new Mention(m, false));
  47 + i++;
  48 + }
  49 + return heads;
  50 + }
  51 +
  52 + public HeadDetector(File zeroSubjectDetectionModel) {
  53 + try {
  54 + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
  55 + this.quasiVerbs = this.model.getQuasiVerbs();
  56 + } catch (Exception e) {
  57 + logger.error("Error loading model:" + e);
  58 + }
  59 + }
  60 +
  61 + public HeadDetector(InputStream zeroSubjectDetectionModelStream) {
  62 + try {
  63 + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
  64 + this.quasiVerbs = this.model.getQuasiVerbs();
  65 + } catch (Exception e) {
  66 + logger.error("Error loading model:" + e);
  67 + }
  68 + }
  69 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.*;
  6 +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
  7 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
  8 +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
  9 +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
  10 +import weka.core.Attribute;
  11 +import weka.core.FastVector;
  12 +import weka.core.Instance;
  13 +import weka.core.Instances;
  14 +
  15 +import java.io.File;
  16 +import java.util.*;
  17 +import java.util.Map.Entry;
  18 +
  19 +public class InstanceCreator {
  20 +
  21 + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
  22 + private static final TEI_IO teiIO = TEI_IO.getInstance();
  23 +
  24 + private InstanceCreator() {
  25 + }
  26 +
  27 + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) {
  28 + int allTexts = 0;
  29 + int exceptions = 0;
  30 + int allSentences = 0;
  31 +
  32 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  33 + for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
  34 + try {
  35 + allTexts++;
  36 + logger.info("Processing text " + textDir);
  37 + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
  38 + Text text = TeiLoader.loadTextFromTei(ct, textDir);
  39 +
  40 + for (Paragraph p : text)
  41 + for (Sentence s : p) {
  42 + allSentences++;
  43 + loadExamplesFromSentence(quasiVerbs, examples, s);
  44 + }
  45 +
  46 + } catch (Exception e) {
  47 + logger.error(e.getLocalizedMessage());
  48 + exceptions++;
  49 + }
  50 + }
  51 +
  52 + logger.info(allTexts + " texts found.");
  53 + if (exceptions != 0)
  54 + logger.error(exceptions + " texts with exceptions.");
  55 + logger.info(allSentences + " sentences found.");
  56 +
  57 + return examples;
  58 + }
  59 +
  60 + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples,
  61 + Sentence s) {
  62 +
  63 + // collect positive examples
  64 + Set<Token> positive = new HashSet<>();
  65 + for (Mention m : s.getMentions()) {
  66 + if (!FeatureGeneration.isVerb(m)) {
  67 + positive.addAll(m.getHeadSegments());
  68 + }
  69 + }
  70 +
  71 + for (Token m : s) {
  72 + if (FeatureGeneration.isVerb(m))
  73 + continue;
  74 +
  75 + TreeMap<String, Object> features = new TreeMap<>();
  76 + if (positive.contains(m)) {
  77 + features.put("class", Boolean.valueOf(true));
  78 + } else {
  79 + features.put("class", Boolean.valueOf(false));
  80 + }
  81 +
  82 + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs);
  83 + examples.add(features);
  84 + }
  85 + }
  86 +
  87 + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
  88 +
  89 + TreeSet<String> booleanAttsOccurred = new TreeSet<>();
  90 + TreeSet<String> doubleAttsOccurred = new TreeSet<>();
  91 + TreeMap<String, Set<String>> att2values = new TreeMap<>();
  92 + for (TreeMap<String, Object> example : examples) {
  93 + for (Entry<String, Object> e : example.entrySet()) {
  94 + String key = e.getKey();
  95 + Object val = e.getValue();
  96 + if (val instanceof Integer || val instanceof Double) {
  97 + doubleAttsOccurred.add(key);
  98 + continue;
  99 + }
  100 + if (val instanceof Boolean) {
  101 + booleanAttsOccurred.add(key);
  102 + continue;
  103 + }
  104 + if (!att2values.containsKey(key))
  105 + att2values.put(key, new HashSet<>());
  106 + att2values.get(key).add(val.toString());
  107 + }
  108 + }
  109 +
  110 + List<Attribute> atts = new ArrayList<>();
  111 +
  112 + // double attributes
  113 + for (String attName : doubleAttsOccurred) {
  114 + Attribute att = new Attribute(attName);
  115 + atts.add(att);
  116 + }
  117 +
  118 + // boolean attributes (treated as nominal)
  119 + FastVector values = new FastVector(2);
  120 + values.addElement("false");
  121 + values.addElement("true");
  122 + for (String attName : booleanAttsOccurred) {
  123 + Attribute att = new Attribute(attName, values);
  124 + atts.add(att);
  125 + }
  126 +
  127 + // nominal attributes
  128 + for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
  129 + FastVector vals = new FastVector(attVals.getValue().size());
  130 + for (String val : attVals.getValue())
  131 + vals.addElement(val);
  132 + Attribute att = new Attribute(attVals.getKey(), vals);
  133 + atts.add(att);
  134 + }
  135 +
  136 + FastVector fvWekaAttributes = new FastVector(atts.size());
  137 + for (Attribute attr : atts) {
  138 + fvWekaAttributes.addElement(attr);
  139 + }
  140 +
  141 + Instances data = new Instances("Head", fvWekaAttributes, 10);
  142 + data.setClass(data.attribute(classFeatureName));
  143 + return data;
  144 + }
  145 +
  146 + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
  147 + for (TreeMap<String, Object> example : examples) {
  148 + Instance instance = new Instance(instances.numAttributes());
  149 +
  150 + for (Entry<String, Object> e : example.entrySet()) {
  151 + Object val = e.getValue();
  152 + String name = e.getKey();
  153 + if (val instanceof Integer) {
  154 + instance.setValue(instances.attribute(name), (int) val);
  155 + } else if (val instanceof Boolean) {
  156 + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
  157 + } else {
  158 + int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
  159 + if (indexOfValue == -1) {
  160 + logger.debug("Unkown value: " + val.toString() + " of feature: " + name
  161 + + ". Marking as missing value.");
  162 + instance.setMissing(instances.attribute(name));
  163 + } else
  164 + instance.setValue(instances.attribute(name), indexOfValue);
  165 + }
  166 + }
  167 +
  168 + instance.setDataset(instances);
  169 + instances.add(instance);
  170 + }
  171 + }
  172 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  6 +import weka.classifiers.Classifier;
  7 +import weka.core.Instance;
  8 +import weka.core.Instances;
  9 +
  10 +import java.io.Serializable;
  11 +import java.util.List;
  12 +import java.util.Set;
  13 +import java.util.TreeMap;
  14 +
  15 +public class Model implements Serializable {
  16 +
  17 + private static final long serialVersionUID = 3351727361273283076L;
  18 + private static final Logger logger = LoggerFactory.getLogger(Model.class);
  19 +
  20 + private Classifier classifier;
  21 + private Set<String> quasiVerbs;
  22 + private Instances instances;
  23 +
  24 + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
  25 + this.classifier = classifier;
  26 + this.instances = instances;
  27 + this.quasiVerbs = quasiVerbs;
  28 + }
  29 +
  30 + public boolean isHead(Instance instance, Sentence sentence) {
  31 + try {
  32 + double response = this.classifier.classifyInstance(instance);
  33 + return response > 0;
  34 + } catch (Exception e) {
  35 + logger.error("Error classyfing head in sentence: " + sentence, e);
  36 + return false;
  37 + }
  38 + }
  39 +
  40 + public Instances getInstances(List<TreeMap<String, Object>> examples) {
  41 + Instances instances = new Instances(this.instances);
  42 + InstanceCreator.fillInstances(examples, instances);
  43 + return instances;
  44 + }
  45 +
  46 + public Set<String> getQuasiVerbs() {
  47 + return quasiVerbs;
  48 + }
  49 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import weka.core.SerializationHelper;
  4 +
  5 +import java.io.InputStream;
  6 +
  7 +public class Serializer {
  8 +
  9 + public static void saveModel(Model m, String targetModelFilePath) throws Exception {
  10 + SerializationHelper.write(targetModelFilePath, m);
  11 + }
  12 +
  13 + public static Model loadModel(String path) throws Exception {
  14 + Model m = (Model) SerializationHelper.read(path);
  15 + return m;
  16 + }
  17 +
  18 + public static Model loadModelFromStream(InputStream stream) throws Exception {
  19 + Model m = (Model) SerializationHelper.read(stream);
  20 + return m;
  21 + }
  22 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.head;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import weka.classifiers.Evaluation;
  6 +import weka.classifiers.rules.JRip;
  7 +import weka.classifiers.rules.JRip.RipperRule;
  8 +import weka.core.Attribute;
  9 +import weka.core.Instance;
  10 +import weka.core.Instances;
  11 +
  12 +import java.io.*;
  13 +import java.util.*;
  14 +
  15 +public class Trainer {
  16 +
  17 + private static final Logger logger = LoggerFactory.getLogger(Trainer.class);
  18 +
  19 + private static final boolean DO_CV = false;
  20 + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
  21 +
  22 + private Trainer() {
  23 + }
  24 +
  25 + public static void main(String[] args) {
  26 +
  27 + if (args.length != 2) {
  28 + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
  29 + + " trainDir targetModelFile");
  30 + return;
  31 + }
  32 +
  33 + File dataDir = new File(args[0]);
  34 + String targetModelFilePath = args[1];
  35 +
  36 + if (!dataDir.isDirectory()) {
  37 + logger.error(dataDir + " is not a directory!");
  38 + return;
  39 + }
  40 +
  41 + Set<String> quasiVerbs = loadQuasiVerbs();
  42 +
  43 + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs);
  44 + Instances instances = InstanceCreator.createInstances(examples, "class");
  45 + InstanceCreator.fillInstances(examples, instances);
  46 +
  47 + printStats(instances);
  48 +
  49 + try {
  50 + JRip model;
  51 +
  52 + if (DO_CV) {
  53 + logger.info("Crossvalidation...");
  54 + model = new JRip();
  55 + Evaluation eval = new Evaluation(instances);
  56 + eval.crossValidateModel(model, instances, 10, new Random(1));
  57 + logger.info(eval.toSummaryString());
  58 + logger.info(eval.toMatrixString());
  59 + logger.info(eval.toClassDetailsString());
  60 + }
  61 +
  62 + logger.info("Building final classifier...");
  63 + model = new JRip();
  64 + model.buildClassifier(instances);
  65 + logger.info(model.getRuleset().size() + " rules generated.");
  66 + for (int i = 0; i < model.getRuleset().size(); i++) {
  67 + RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
  68 + logger.info("\t" + v.toString(instances.classAttribute()));
  69 + }
  70 +
  71 + instances.delete();
  72 + logger.info("Features stats:");
  73 + for (int i = 0; i < instances.numAttributes(); i++) {
  74 + Attribute att = instances.attribute(i);
  75 + logger.info(i + ".\t" + att.toString());
  76 + }
  77 +
  78 + logger.info("Saving classifier...");
  79 + Model m = new Model(model, instances, quasiVerbs);
  80 + Serializer.saveModel(m, targetModelFilePath);
  81 + logger.info("Done.");
  82 +
  83 + } catch (Exception e) {
  84 + logger.error("Error: " + e);
  85 + }
  86 + }
  87 +
  88 + private static Set<String> loadQuasiVerbs() {
  89 + Set<String> quasiVerbs = new HashSet<>();
  90 + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
  91 + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
  92 + String line;
  93 + while ((line = br.readLine()) != null) {
  94 + quasiVerbs.add(line.trim());
  95 + }
  96 + } catch (IOException e) {
  97 + logger.error(e.getLocalizedMessage(), e);
  98 + }
  99 + return quasiVerbs;
  100 + }
  101 +
  102 + private static void printStats(Instances instances) {
  103 + int positive = 0;
  104 + int negative = 0;
  105 + for (int i = 0; i < instances.numInstances(); i++) {
  106 + Instance inst = instances.instance(i);
  107 + if (inst.classValue() > 0)
  108 + negative++;
  109 + else
  110 + positive++;
  111 + }
  112 + logger.info(positive + " positive examples");
  113 + logger.info(negative + " negative examples");
  114 + logger.info((positive + negative) + " examples total");
  115 + logger.info((instances.numAttributes() - 1) + " attributes");
  116 + logger.info(instances.toSummaryString());
  117 + }
  118 +
  119 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java 0 → 100644
package pl.waw.ipipan.zil.core.md.detection.nominal;

import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
import pl.waw.ipipan.zil.core.md.detection.Constants;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph;

import java.util.*;


/**
 * Feature generation for nominal mention detection: computes classifier
 * features for (head token, candidate token) pairs within a sentence.
 */
public class FeatureGeneration {
    // Weak clause separators: coordinating conjunctions and punctuation.
    // A split on these additionally requires a verb in the current clause
    // (see getClauses).
    final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo",
            "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" }));

    // Weak clause separators: contrastive/causal conjunctions.
    // NOTE(review): "dlatego" is listed twice; harmless in a set.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale",
            "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie",
            "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" }));

    // Strong clause separators: always split, regardless of verbs.
    final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>(
            Arrays.asList(new String[] { "?", "!" }));

    // Paired delimiters (opening -> closing).
    final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
    static {
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
        CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
    }

    // Ctags treated as nominal (nouns, pronouns, gerunds, numerals).
    final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12",
            "ppron3", "ger", "num", "numcol" }));

    // Personal pronoun ctags (carry grammatical person).
    final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" }));

    // Finite verb ctags.
    final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt",
            "praet", "winien" }));

    // Relative pronoun lemmas ("jaki", "który") used to merge relative
    // clauses back into the preceding clause.
    final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki",
            "który" }));
  40 +
    /**
     * Generates classifier features describing the relation between a mention
     * head token and a candidate token of the same sentence. All features are
     * written into the supplied map, keyed by feature name.
     *
     * @param features  output map: feature name -> feature value
     * @param valence   Walenty valence dictionaries, keyed by dictionary kind
     * @param head      mention head token
     * @param candidate token considered for inclusion in the mention
     * @param s         sentence containing both tokens
     * @param heads     all mention-head tokens of this sentence
     */
    public static void generateFeatures(Map<String, Object> features, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            Token head, Token candidate, Sentence s, List<Token> heads) {

        //addTokenFeatures(features, "head", head, s);
        addTokenFeatures(features, "candidate", candidate, s);

        //features.put("sentLength", s.size()); // last checked
        // Structural relations between head and candidate.
        features.put("sameWord", sameWord(head, candidate, s));
        features.put("sameNE", sameNE(head, candidate, s));
        features.put("sameNG", sameNG(head, candidate, s));

        // Linear distance and relative order in the sentence.
        features.put("distance", Math.abs(head.getSentencePosition() - candidate.getSentencePosition()));
        //features.put("headIsFirst", Boolean.valueOf(head.compareTo(candidate) < 0));
        features.put("candidateIsFirst", Boolean.valueOf(head.compareTo(candidate) > 0));

        features.put("sameWalentyConstruction", sameWalentyConstruction(head, candidate, s, valence));
        features.put("sameToken", sameToken(head, candidate));

        features.put("candidateIsAlsoHead", Boolean.valueOf(heads.contains(candidate)));
        features.put("isNextToCandidateColon", isNextColon(candidate, s));

        // Orthographic/casing cues and named-entity proximity.
        features.put("candidateStartsWithUpperOrth", Character.isUpperCase(candidate.getOrth().codePointAt(0)));
        features.put("candidateStartsWithUpperBase", Character.isUpperCase(candidate.getBase().codePointAt(0)));
        features.put("isDotNextToHead", isNextDot(head, s));
        features.put("closestNEDistance", closestNEDistance(head, candidate, s));
        features.put("headStartsWithUpperOrth", Character.isUpperCase(head.getOrth().codePointAt(0)));
        features.put("headStartsWithUpperBase", Character.isUpperCase(head.getBase().codePointAt(0))); // the optimal feature set ends here


        // candidate in head in closest NE distance

//        features.put("candidateOrthLength", candidate.getOrth().length());
//        features.put("candidateBaseLength", candidate.getBase().length());
//        features.put("headOrthLength", head.getOrth().length());
//        features.put("headBaseLength", head.getBase().length());

        //features.put("isNextToHeadColon", isNextColon(head, s));
        //features.put("isCandidateColon", Boolean.valueOf(candidate.getOrth().equals(":"))); // just needs a run, not checked yet

/*        features.put("isClauseSplitLemmaStrict", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(candidate.getBase())));
        features.put("isClauseSplitLemma", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS.contains(candidate.getBase())));
        features.put("isClauseSplitLemma2", Boolean.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(candidate.getBase())));*/

/*        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put("nextIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(next.getBase())));
            features.put("nextIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(next.getBase())));
        } else {
            features.put("nextIsClauseSplitLemmaStrict", "sentEnd");
            features.put("nextIsClauseSplitLemma", "sentEnd");
            features.put("nextIsClauseSplitLemma2", "sentEnd");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put("previousIsClauseSplitLemmaStrict", String.valueOf(CLAUSE_SPLIT_LEMMAS_STRICT.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma", String.valueOf(CLAUSE_SPLIT_LEMMAS.contains(previous.getBase())));
            features.put("previousIsClauseSplitLemma2", String.valueOf(CLAUSE_SPLIT_LEMMAS2.contains(previous.getBase())));
        } else {
            features.put("previousIsClauseSplitLemmaStrict", "sentStart");
            features.put("previousIsClauseSplitLemma", "sentStart");
            features.put("previousIsClauseSplitLemma2", "sentStart");
        }*/


        //features.put("candidateIsClosingBracket", candidateIsClosingBracket(head, candidate, s));
        //features.put("candidateIsQM", candidateIsClosingQM(head, candidate, s));
        //features.put("candidateIsClosingBracket", Boolean.valueOf(candidate.getOrth().equals(")")));

        // TODO notes (translated from Polish):
        // - head position within the mention could be simulated!! something is off
        // - unfortunately right-side continuity needs re-checking
        // - add head NG group length and Walenty construction group length;
        //   may work well together with distance
        // - add is-stop-word for the candidate, and maybe some solutions from
        //   head detection
        // - also check whether the preceding token is part of the mention
        // - experiment more with the separators
        // - word ctag !!
/*
        Token next = getNeighbouringToken(s, candidate, 1);
        if (next != null) {
            features.put(String.format("%sCtag", "nextToCandidate"), next.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "nextToCandidate"), next.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "nextToCandidate"), next.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "nextToCandidate"), next.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "nextToCandidate"), "null");
            features.put(String.format("%sNumber", "nextToCandidate"), "null");
            features.put(String.format("%sGender", "nextToCandidate"), "null");
            features.put(String.format("%sPerson", "nextToCandidate"), "null");
        }

        Token previous = getNeighbouringToken(s, candidate, -1);
        if (previous != null) {
            features.put(String.format("%sCtag", "previousToCandidate"), previous.getChosenInterpretation().getCtag());
            features.put(String.format("%sNumber", "previousToCandidate"), previous.getChosenInterpretation().getNumber());
            features.put(String.format("%sGender", "previousToCandidate"), previous.getChosenInterpretation().getGender());
            features.put(String.format("%sPerson", "previousToCandidate"), previous.getChosenInterpretation().getPerson());
        } else {
            features.put(String.format("%sCtag", "previousToCandidate"), "null");
            features.put(String.format("%sNumber", "previousToCandidate"), "null");
            features.put(String.format("%sGender", "previousToCandidate"), "null");
            features.put(String.format("%sPerson", "previousToCandidate"), "null");
        }
        */


    }
  148 +
  149 + private static int closestNEDistance(Token head, Token candidate, Sentence s) {
  150 + int lowestDistance = -1;
  151 + for (NamedEntity ne : s.getNamedEntities()) {
  152 + int distance = ne.getTokens().get(0).getSentencePosition() - head.getSentencePosition();
  153 + if ( distance >= 0 && ne.getTokens().contains(candidate) && (distance < lowestDistance || lowestDistance < 0)) {
  154 + lowestDistance = distance;
  155 + }
  156 + }
  157 + return lowestDistance;
  158 + }
  159 +
  160 + /////////////////////////////
  161 +
  162 +/* private static boolean candidateIsClosingBracket(Token head, Token candidate, Sentence s) {
  163 +
  164 +
  165 +
  166 + if (!candidate.getOrth().equals(")")) {
  167 + return Boolean.valueOf(false);
  168 + }
  169 +
  170 + int openedBrackets = 0;
  171 + int closedBrackets = 0;
  172 + for (Token t : s) {
  173 + if (candidate.getSentencePosition() == t.getSentencePosition()) {
  174 + break;
  175 + }
  176 +
  177 + if (t.getSentencePosition() >= head.getSentencePosition()) {
  178 + if (t.getOrth().equals("("))
  179 + openedBrackets++;
  180 + if (t.getOrth().equals(")"))
  181 + closedBrackets++;
  182 + }
  183 + }
  184 +
  185 + if (openedBrackets - closedBrackets > 0) {
  186 + return Boolean.valueOf(true);
  187 + }
  188 +
  189 + return Boolean.valueOf(false);
  190 + }*/
  191 +
  192 + private static boolean isNextColon(Token t, Sentence s) {
  193 + int idx = s.indexOf(t) + 1;
  194 + if (idx >= s.size() || idx < 0)
  195 + return Boolean.valueOf(false);
  196 + return Boolean.valueOf(s.get(idx).getOrth().equals(":"));
  197 + }
  198 +
  199 + private static boolean isNextDot(Token t, Sentence s) {
  200 + int idx = s.indexOf(t) + 1;
  201 + if (idx >= s.size() || idx < 0)
  202 + return Boolean.valueOf(false);
  203 + return Boolean.valueOf(s.get(idx).getOrth().equals("."));
  204 + }
  205 +
  206 + private static boolean candidateIsClosingQM(Token head, Token candidate, Sentence s) {
  207 +
  208 + if (!candidate.getOrth().equals("\"")) {
  209 + return Boolean.valueOf(false);
  210 + }
  211 +
  212 + int start = head.getSentencePosition();
  213 + int end = candidate.getSentencePosition() - 1;
  214 + if (head.compareTo(candidate) > 0) {
  215 + start = candidate.getSentencePosition() + 1;
  216 + end = head.getSentencePosition();
  217 + }
  218 +
  219 + int QMs = 0;
  220 + for (Token t : s) {
  221 + if (end == t.getSentencePosition()) {
  222 + break;
  223 + }
  224 +
  225 + if (t.getSentencePosition() >= start) {
  226 + if (t.getOrth().equals("\""))
  227 + QMs++;
  228 + }
  229 + }
  230 +
  231 + if ((QMs % 2) != 0) {
  232 + return Boolean.valueOf(true);
  233 + }
  234 +
  235 + return Boolean.valueOf(false);
  236 + }
  237 +
  238 + private static boolean sameWord(Token t1, Token t2, Sentence s) {
  239 +
  240 + for (SyntacticWord w : s.getSyntacticWords()) {
  241 + if (w.getTokens().contains(t1) && w.getTokens().contains(t2)) {
  242 + return Boolean.valueOf(true);
  243 + }
  244 + }
  245 + return Boolean.valueOf(false);
  246 + }
  247 +
  248 + private static boolean sameNE(Token t1, Token t2, Sentence s) {
  249 +
  250 + for (NamedEntity ne : s.getNamedEntities()) {
  251 + if (ne.getTokens().contains(t1) && ne.getTokens().contains(t2)) {
  252 + return Boolean.valueOf(true);
  253 + }
  254 + }
  255 + return Boolean.valueOf(false);
  256 + }
  257 +
  258 + private static boolean sameNG(Token head, Token candidate, Sentence s) {
  259 +
  260 + for (SyntacticGroup group : s.getGroups()) {
  261 + if (group.getType().startsWith("NG")) {
  262 + if (group.getSemanticHeadTokens().contains(head) && group.getTokens().contains(candidate)) {
  263 + return Boolean.valueOf(true);
  264 + }
  265 + }
  266 + }
  267 + return Boolean.valueOf(false);
  268 + }
  269 +
  270 + private static boolean sameWalentyConstruction(Token head, Token candidate, Sentence s,
  271 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
  272 +
  273 + for (SyntacticGroup group : s.getGroups()) {
  274 + if (group.getType().startsWith("NG")) {
  275 + ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>();
  276 + nestedGroups.add(group);
  277 +
  278 + SyntacticGroup nextGroup = group.getFollowingGroup();
  279 + while (nextGroup != null) {
  280 + nestedGroups.add(nextGroup);
  281 + nextGroup = nextGroup.getFollowingGroup();
  282 + }
  283 +
  284 + List<Token> extendedGroupSegments = getExtendedGroupSegments(nestedGroups, valence.get(ValenceDicts.NounsValence));
  285 + List<Token> extendedGroupHeads = getExtendedGroupHeads(nestedGroups);
  286 + if (extendedGroupHeads.contains(head) && extendedGroupSegments.contains(candidate))
  287 + return Boolean.valueOf(true);
  288 + }
  289 + }
  290 + return Boolean.valueOf(false);
  291 + }
  292 +
  293 + private static List<Token> getExtendedGroupSegments(ArrayList<SyntacticGroup> nestedGroups,
  294 + Map<String,ArrayList<String>> walentyNouns) {
  295 +
  296 + SyntacticGroup initialGroup = nestedGroups.get(0);
  297 + String initialGroupHead = initialGroup.getSemanticHeadTokens().get(0).getBase();
  298 +
  299 + List<Token> heads = initialGroup.getSemanticHeadTokens();
  300 + List<Token> segments = new ArrayList<Token>();
  301 +
  302 + if (!walentyNouns.containsKey(initialGroupHead)) {
  303 + segments.addAll(initialGroup.getTokens());
  304 + } else {
  305 +
  306 + ArrayList<String> schemata = walentyNouns.get(initialGroupHead);
  307 + ArrayList<ArrayList<String>> groupsRealizations = new ArrayList<ArrayList<String>>();
  308 + ArrayList<SyntacticGroup> largestMatch = new ArrayList<SyntacticGroup>();
  309 + largestMatch.add(initialGroup);
  310 +
  311 + for (int i=1; i < nestedGroups.size(); i++) {
  312 + SyntacticGroup group = nestedGroups.get(i);
  313 + ArrayList<String> realizations = group.getWalentyRealizations();
  314 + groupsRealizations.add(realizations);
  315 + if (realizationsMatch(schemata, groupsRealizations)) {
  316 + largestMatch.add(group);
  317 + } else {
  318 + break;
  319 + }
  320 + }
  321 +
  322 + for (SyntacticGroup group : largestMatch) {
  323 + segments.addAll(group.getTokens());
  324 + }
  325 +
  326 + }
  327 + return segments;
  328 + }
  329 +
  330 + private static List<Token> getExtendedGroupHeads(ArrayList<SyntacticGroup> nestedGroups) {
  331 +
  332 + SyntacticGroup initialGroup = nestedGroups.get(0);
  333 +
  334 + List<Token> heads = initialGroup.getSemanticHeadTokens();
  335 +
  336 + return heads;
  337 + }
  338 +
  339 + private static boolean realizationsMatch(ArrayList<String> schemata,
  340 + ArrayList<ArrayList<String>> groupsRealizations) {
  341 + for (String schema : schemata) {
  342 + if (isProperSchema(schema, groupsRealizations)) {
  343 + return true;
  344 + }
  345 + }
  346 + return false;
  347 + }
  348 +
  349 + private static boolean isProperSchema(String schema,
  350 + ArrayList<ArrayList<String>> groupsRealizations) {
  351 +
  352 + ArrayList<ArrayList<String>> matchingPositions = new ArrayList<ArrayList<String>>();
  353 + for (ArrayList<String> realizations : groupsRealizations) {
  354 + matchingPositions.add(getMatchingPositions(schema, realizations));
  355 + }
  356 +
  357 + if (matchingPositionsExists(matchingPositions)) {
  358 + return true;
  359 + /*ArrayList<ArrayList<String>> product = cartesianProduct(matchingPositions);
  360 + for (ArrayList<String> combination : product) {
  361 + Set<String> combinationSet = new HashSet<String>(combination);
  362 + if (combinationSet.size() == matchingPositions.size()) {
  363 + return true;
  364 + }
  365 + }*/
  366 + }
  367 + return false;
  368 + }
  369 +
  370 + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) {
  371 + ArrayList<String> positions = new ArrayList<String>();
  372 + for (String position : schema.split("\\s\\+\\s")) {
  373 + position = position.trim();
  374 + position = position.substring(1, position.length()-1);
  375 + for (String phrT : position.split(";")) {
  376 + if (phraseRealizations.contains(phrT.trim())) {
  377 + positions.add(position);
  378 + break;
  379 + }
  380 + }
  381 + }
  382 + return positions;
  383 + }
  384 +
  385 + private static boolean matchingPositionsExists(ArrayList<ArrayList<String>> matchingPositions) {
  386 + for (ArrayList<String> positions : matchingPositions) {
  387 + if (positions.isEmpty()) {
  388 + return false;
  389 + }
  390 + }
  391 + return true;
  392 + }
  393 +
  394 + private static boolean sameToken(Token t1, Token t2) {
  395 + if (t1.compareTo(t2) == 0) {
  396 + return Boolean.valueOf(true);
  397 + }
  398 + return Boolean.valueOf(false);
  399 + }
  400 + //////////////////////////////////
  401 +
    /**
     * Adds morphosyntactic features of a single token under the given label
     * prefix: its own ctag/number/gender/person, the ctag of the syntactic
     * word containing it, and the (word-)ctags of its immediate neighbours.
     *
     * @param features output feature map
     * @param label    prefix for the feature names (e.g. "candidate")
     * @param t        token to describe
     * @param s        sentence containing the token
     */
    private static void addTokenFeatures(Map<String, Object> features, String label, Token t, Sentence s) {
        features.put(String.format("%sCtag", label), t.getChosenInterpretation().getCtag());
        features.put(String.format("%sNumber", label), t.getChosenInterpretation().getNumber());
        features.put(String.format("%sGender", label), t.getChosenInterpretation().getGender());
        features.put(String.format("%sPerson", label), t.getChosenInterpretation().getPerson());
        features.put(String.format("%sWordCtag", label), wordCtag(t, s));

        features.put(String.format("%sNextCtag", label), getNeighbouringTag(s, t, 1));
        features.put(String.format("%sPrevCtag", label), getNeighbouringTag(s, t, -1));


        Token next = getNeighbouringToken(s, t, 1);
        if (next != null) {
            features.put(String.format("%sNextWordCtag", label), wordCtag(next, s));
        } else {
            features.put(String.format("%sNextWordCtag", label), "None");
        }

        Token previous = getNeighbouringToken(s, t, -1);
        if (previous != null) {
            features.put(String.format("%sPrevWordCtag", label), wordCtag(previous, s));
        } else {
            features.put(String.format("%sPrevWordCtag", label), "None");
        }

//        features.put(String.format("%sNextNextCtag", label), getNeighbouringTag(s, t, 2));
//        features.put(String.format("%sPrevPrevCtag", label), getNeighbouringTag(s, t, -2));

//        features.put(String.format("%sSentPosition", label), t.getSentencePosition());


//        features.put(String.format("%sPrevPraet", label), isPrevPraet(t, s));
//        features.put(String.format("%sPrevComma", label), isPrevComma(t, s));
//        features.put(String.format("%sPrev2Pred", label), isPrev2Pred(t, s));
//        features.put(String.format("%sNextInf", label), isNextInf(t, s));

/*        List<Token> clause = getClause(s, t);
        if (clause != null)
            features.put(String.format("%sClauseLength", label), clause.size());
        else
            features.put(String.format("%sClauseLength", label), 0);*/

        /*addFeatures(features, clause, String.format("%sClause", label), t);
        addFeatures(features, s, String.format("%sSent", label), t);*/
//        for (int i = 1; i < 6; i++) // do this, but within a window from head to candidate
//            addFeatures(features, getWindow(s, t, i, 0), String.format("%sWindow_", label) + i + "_" + 0, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, 0, i), String.format("%sWindow_", label) + 0 + "_" + i, t);
//        for (int i = 1; i < 6; i++)
//            addFeatures(features, getWindow(s, t, i, i), String.format("%sWindow_", label) + i + "_" + i, t);
    }
  453 +
  454 + private static String wordCtag(Token t, Sentence s) {
  455 + for (SyntacticWord w : s.getSyntacticWords()) {
  456 + if (w.getTokens().contains(t)) {
  457 + return w.getCtag();
  458 + }
  459 + }
  460 + return "None";
  461 + }
  462 +
  463 + private static boolean isNextInf(Token m, Sentence s) {
  464 + boolean now = false;
  465 + for (Token morph : s) {
  466 + if (now)
  467 + return morph.getChosenInterpretation().getCtag().equals("inf");
  468 + if (m.equals(morph))
  469 + now = true;
  470 + }
  471 + return false;
  472 + }
  473 +
  474 + private static boolean isPrev2Pred(Token m, Sentence s) {
  475 + Token prev = null;
  476 + Token prev2 = null;
  477 + for (Token morph : s) {
  478 + if (m.equals(morph))
  479 + break;
  480 + prev2 = prev;
  481 + prev = morph;
  482 + }
  483 + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred"))
  484 + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred"));
  485 + }
  486 +
  487 + private static Object isPrevComma(Token m, Sentence s) {
  488 + Token prev = null;
  489 + for (Token morph : s) {
  490 + if (m.equals(morph))
  491 + break;
  492 + prev = morph;
  493 + }
  494 + return prev != null && prev.getChosenInterpretation().getBase().equals(",");
  495 + }
  496 +
  497 + private static String getNeighbouringTag(Sentence s, Token m, int i) {
  498 + int idx = s.indexOf(m) + i;
  499 + if (idx >= s.size() || idx < 0)
  500 + return "None";
  501 + return s.get(idx).getChosenInterpretation().getCtag();
  502 + }
  503 +
  504 + private static Token getNeighbouringToken(Sentence s, Token m, int i) {
  505 + int idx = s.indexOf(m) + i;
  506 + if (idx >= s.size() || idx < 0)
  507 + return null;
  508 + return s.get(idx);
  509 + }
  510 +
    /**
     * Adds agreement features computed over the noun tokens of the given
     * token span (clause, sentence or window): presence of nominative case,
     * number agreement and gender-or-person agreement with token {@code m},
     * plus combined and "at least two occurrences" variants. Nouns directly
     * preceded by "jak"/"jako" (comparative markers) are skipped. The order
     * of checks inside the loop matters: the has2* flags must be tested
     * before the corresponding has* flag is set.
     *
     * @param features output feature map
     * @param clause   token span to scan
     * @param prefix   feature-name suffix identifying the span
     * @param m        reference token the agreement is computed against
     */
    private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

        boolean hasNom = false; // 1
        boolean hasNum = false; // 2
        boolean hasPOG = false; // 3

        boolean hasNomNum = false;
        boolean hasNumPOG = false;
        boolean hasNomPOG = false;
        boolean hasNomNumPOG = false;

        boolean has2Nom = false;
        boolean has2NomPOG = false;
        boolean has2POG = false;

        Token prev = null;
        for (Token candidate : clause) {

            // Only nouns count; skip nouns inside jak/jako comparisons.
            if (!isNoun(candidate) || isJakJako(prev)) {
                prev = candidate;
                continue;
            }

            // nom, nom2
            if (isNom(candidate)) {
                if (hasNom)
                    has2Nom = true;
                hasNom = true;
            }
            // num
            if (agreedNum(candidate, m)) {
                hasNum = true;
            }
            // pog, pog2
            if (agreedGenderOrPerson(candidate, m)) {
                if (hasPOG)
                    has2POG = true;
                hasPOG = true;
            }

            // nom num, nom num pog
            if (isNom(candidate) && agreedNum(candidate, m)) {
                if (agreedGenderOrPerson(candidate, m))
                    hasNomNumPOG = true;
                hasNomNum = true;
            }

            // nom pog, num pog
            if (agreedGenderOrPerson(candidate, m))
                if (isNom(candidate)) {
                    if (hasNomPOG)
                        has2NomPOG = true;
                    hasNomPOG = true;
                } else if (agreedNum(candidate, m))
                    hasNumPOG = true;

            prev = candidate;
        }

        // features.put("conj_" + prefix, hasConj);
        features.put("cand_2_nom_" + prefix, has2Nom);
        features.put("cand_2_POG_" + prefix, has2POG);
        features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

        features.put("cand_nom_" + prefix, hasNom);
        features.put("cand_num_" + prefix, hasNum);
        features.put("cand_POG_" + prefix, hasPOG);

        features.put("cand_nom+num_" + prefix, hasNomNum);
        features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
        features.put("cand_nom+POG_" + prefix, hasNomPOG);
        features.put("cand_num+POG_" + prefix, hasNumPOG);
    }
  584 +
  585 + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {
  586 +
  587 + int idx = s.indexOf(m);
  588 + int from = Math.max(0, idx - pre);
  589 + int to = Math.min(s.size(), idx + post + 1);
  590 +
  591 + return new ArrayList<>(s.subList(from, to));
  592 + }
  593 +
  594 + private static boolean isPrevPraet(Token m, Sentence s) {
  595 + Token prev = null;
  596 + for (Token morph : s) {
  597 + if (m.equals(morph))
  598 + break;
  599 + prev = morph;
  600 + }
  601 + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
  602 + }
  603 +
  604 + /**
  605 + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo,
  606 + * lub (jak przy streszczeniach: w środku musi być czasownik w formie
  607 + * osobowej),
  608 + *
  609 + * @param s
  610 + * sentence
  611 + * @param m2
  612 + * token
  613 + * @return clause with the token
  614 + */
  615 + public static List<Token> getClause(Sentence s, Token m2) {
  616 +
  617 + List<List<Token>> sublists = getClauses(s);
  618 +
  619 + for (List<Token> sub : sublists)
  620 + for (Token m : sub)
  621 + if (m.equals(m2))
  622 + return sub;
  623 +
  624 + return null;
  625 + }
  626 +
    /**
     * Splits a sentence into clauses. A token is a split point when it is a
     * strong separator ("?", "!"), or a weak separator (conjunction or
     * punctuation from CLAUSE_SPLIT_LEMMAS / CLAUSE_SPLIT_LEMMAS2) and the
     * current clause already contains a finite verb. Tokens inside syntactic
     * groups or syntactic words are never used as split points. A trailing
     * verbless fragment is merged into the previous clause, and a clause
     * whose first two tokens include a relative pronoun (jaki/który) is
     * merged into its predecessor.
     *
     * @param s sentence to segment
     * @return list of clauses, each a list of tokens in sentence order
     */
    public static List<List<Token>> getClauses(Sentence s) {

        // Tokens that must not act as split points: every token of a
        // syntactic group or word except its last one.
        Set<Token> noSplitMorphs = new HashSet<>();
        for (SyntacticGroup g : s.getGroups()) {
            for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
                noSplitMorphs.add(m);
            }
        }
        for (SyntacticWord g : s.getSyntacticWords()) {
            for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
                noSplitMorphs.add(m);
            }
        }

        LinkedList<List<Token>> sublists = new LinkedList<>();
        List<Token> currentSublist = new ArrayList<>();
        boolean clauseHasVerb = false;
        for (Token m : s) {
            String base = m.getChosenInterpretation().getBase();
            if (!noSplitMorphs.contains(m)
                    && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
                            .contains(base)) && clauseHasVerb))) {
                // Split BEFORE the separator; the separator token itself
                // starts the next clause (added below).
                sublists.add(currentSublist);
                currentSublist = new ArrayList<>();
                clauseHasVerb = false;
            } else {
                if (isVerb(m))
                    clauseHasVerb = true;
            }
            currentSublist.add(m);
        }
        if (currentSublist.size() > 0) {
            // A final fragment without a verb is not a clause of its own:
            // append it to the last clause instead.
            if (clauseHasVerb)
                sublists.add(currentSublist);
            else if (!sublists.isEmpty())
                sublists.getLast().addAll(currentSublist);
        }

        // merge clause beginning with a relative pronoun etc. to previous clause
        List<Token> prev = null;
        Iterator<List<Token>> it = sublists.iterator();
        while (it.hasNext()) {
            List<Token> sublist = it.next();
            boolean containsRelPron = false;
            int i = 1;
            // Only the first two tokens of the clause are inspected.
            for (Token m : sublist) {
                if (i > 2)
                    break;
                if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
                    containsRelPron = true;
                    break;
                }
                i++;
            }
            if (prev != null && containsRelPron) {
                prev.addAll(sublist);
                it.remove();
            } else
                prev = sublist;
        }

        return sublists;
    }
  690 +
  691 + private static boolean agreedNum(Token candidate, Token keyword) {
  692 + String keywordNum = keyword.getNumber();
  693 + String wordNum = candidate.getNumber();
  694 + return keywordNum.equals(wordNum);
  695 + }
  696 +
  697 + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) {
  698 + if (isPraet(keyword)) {
  699 + // praet has number:gender
  700 + String keywordGender = keyword.getGender();
  701 + String wordGender = candidate.getGender();
  702 + return keywordGender.equals(wordGender);
  703 + } else {
  704 + // other verbs have number:person
  705 + String keywordPerson = keyword.getPerson();
  706 + String wordPerson = "ter"; // default
  707 + if (PRONOUN_TAGS.contains(candidate.getCtag()))
  708 + wordPerson = candidate.getPerson();
  709 + return wordPerson.equals(keywordPerson);
  710 + }
  711 + }
  712 +
  713 + private static boolean isJakJako(Token prev) {
  714 + String base = prev == null ? null : prev.getBase();
  715 + return prev != null && (base.equals("jak") || base.equals("jako"));
  716 + }
  717 +
  718 + private static boolean isPraet(Token keyword) {
  719 + return keyword.getCtag().equals("praet");
  720 + }
  721 +
  722 + private static boolean isNom(Token candidate) {
  723 + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow
  724 + // tylko!
  725 + }
  726 +
  727 + public static boolean isNoun(Token m) {
  728 + return NOUN_TAGS.contains(m.getCtag());
  729 + }
  730 +
  731 + public static boolean isNoun(Mention m) {
  732 + return NOUN_TAGS.contains(m.getHeadSegments().get(0).getCtag());
  733 + }
  734 +
  735 + public static boolean isVerb(Token morph) {
  736 + return VERB_TAGS.contains(morph.getCtag());
  737 + }
  738 +
  739 + public static boolean isVerb(Mention m) {
  740 + boolean hasOnlyVerbs = true;
  741 + for (Token morph : m.getSegments())
  742 + if (!isVerb(morph)) {
  743 + hasOnlyVerbs = false;
  744 + break;
  745 + }
  746 + return hasOnlyVerbs;
  747 + }
  748 +
  749 + public static boolean isVerb(TEIMention m) {
  750 + boolean hasOnlyVerbs = true;
  751 + for (TEIMorph morph : m.getMorphs())
  752 + if (!isVerb(morph)) {
  753 + hasOnlyVerbs = false;
  754 + break;
  755 + }
  756 + return hasOnlyVerbs;
  757 + }
  758 +
  759 + private static boolean isVerb(TEIMorph morph) {
  760 + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag());
  761 + }
  762 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.nominal;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +
  6 +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
  7 +import pl.waw.ipipan.zil.core.md.entities.*;
  8 +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader;
  9 +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText;
  10 +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils;
  11 +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
  12 +import weka.core.Attribute;
  13 +import weka.core.FastVector;
  14 +import weka.core.Instance;
  15 +import weka.core.Instances;
  16 +
  17 +import java.io.File;
  18 +import java.util.*;
  19 +import java.util.Map.Entry;
  20 +
/**
 * Builds training examples (feature maps) for nominal mention detection from
 * NKJP/TEI corpora: one example per (mention head, sentence token) pair.
 */
public class InstanceCreator {

    private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class);
    // Shared TEI reader singleton.
    private static final TEI_IO teiIO = TEI_IO.getInstance();

    // Utility class - not instantiable.
    private InstanceCreator() {
    }
  28 +
  29 + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs,
  30 + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) {
  31 + int allTexts = 0;
  32 + int exceptions = 0;
  33 + int allSentences = 0;
  34 +
  35 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  36 + for (File textDir : IOUtils.getNKJPDirs(dataDir)) {
  37 + try {
  38 + allTexts++;
  39 + logger.info("Processing text " + textDir);
  40 + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir);
  41 + Text text = TeiLoader.loadTextFromTei(ct, textDir);
  42 +
  43 + for (Paragraph p : text)
  44 + for (Sentence s : p) {
  45 + allSentences++;
  46 + loadExamplesFromSentence(quasiVerbs, valence, examples, s);
  47 + }
  48 +
  49 + } catch (Exception e) {
  50 + //logger.error(e.getLocalizedMessage());
  51 + e.printStackTrace();
  52 + exceptions++;
  53 + }
  54 + }
  55 +
  56 + logger.info(allTexts + " texts found.");
  57 + if (exceptions != 0)
  58 + logger.error(exceptions + " texts with exceptions.");
  59 + logger.info(allSentences + " sentences found.");
  60 +
  61 + return examples;
  62 + }
  63 +
    /**
     * Creates one training example per (mention head token, sentence token)
     * pair. An example is positive when the token belongs to the segments of
     * a mention headed by that head token.
     *
     * @param quasiVerbs quasi-verb lemmas (not referenced in this method;
     *                   kept for interface symmetry with other detectors)
     * @param valence    Walenty valence dictionaries
     * @param examples   output list the generated feature maps are added to
     * @param s          sentence to generate examples from
     */
    public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
            List<TreeMap<String, Object>> examples, Sentence s) {


        // All tokens that act as a head of some mention in this sentence.
        ArrayList<Token> heads = new ArrayList<>();
        for (Mention m : s.getMentions()) {
            heads.addAll(m.getHeadSegments());
        }

        // collect positive examples: first head segment -> mention segments
        HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>();
        for (Mention m : s.getMentions()) {
            if (heads.containsAll(m.getHeadSegments())) {
                positives.put(m.getHeadSegments().get(0), m.getSegments());
            }
        }

        for (Token head : s) {
            if (heads.contains(head)) {
                for (Token t : s) {
                    //if (head.compareTo(t) != 0) {// && Math.abs(head.getSentencePosition() - t.getSentencePosition()) <= window) {
                    TreeMap<String, Object> features = new TreeMap<>();
                    if (positives.containsKey(head) && positives.get(head).contains(t)) {
                        features.put("class", Boolean.valueOf(true));
                        //features.put("candidatePositionInMention", positionInMention(head, t, s));

                    } else {
                        features.put("class", Boolean.valueOf(false));
                        //features.put("candidatePositionInMention", 0);
                    }


                    FeatureGeneration.generateFeatures(features, valence, head, t, s, heads);
                    //features.put("candidatePositionInMention", positionInMention(head, t, s));
                    addPreviousStates(features, head, t, s);

                    examples.add(features);
                    // }
                }
            }
        }
    }
  106 +
  107 + public static void addPreviousStates(Map<String, Object> features, Token head, Token candidate, Sentence s) {
  108 + int context = 1;
  109 + int candidateLocation = candidate.getSentencePosition();
  110 + for (int i = 1; i <= context; i++) {
  111 + if (candidateLocation - i < 0) {
  112 + features.put(String.format("location-%d", i), Boolean.valueOf(false));
  113 + } else if (sameMention(s.get(candidateLocation - i), head, s) ) {
  114 + features.put(String.format("location-%d", i), Boolean.valueOf(true));
  115 + } else {
  116 + features.put(String.format("location-%d", i), Boolean.valueOf(false));
  117 + }
  118 + }
  119 + }
  120 +
  121 + public static int positionInMention(Token head, Token t, Sentence s) {
  122 +
  123 + Token previous = null;
  124 + if (t.getSentencePosition()-1 >= 0) {
  125 + previous = s.get(t.getSentencePosition()-1);
  126 + } else {
  127 + return 0;
  128 + }
  129 +
  130 + for (Mention m : s.getMentions()) {
  131 + if (m.getHeadSegments().contains(head) && m.getSegments().contains(previous)) {
  132 +/* if (m.getSegments().get(0).getSentencePosition() - t.getSentencePosition() <= -1) {
  133 + System.out.println(m.getSegments().get(0));
  134 + System.out.println(t);
  135 + System.out.println(m.getSegments());
  136 + }*/
  137 + return previous.getSentencePosition() - m.getSegments().get(0).getSentencePosition();
  138 + }
  139 + }
  140 + return 0;
  141 + }
  142 +
  143 + private static boolean sameMention(Token t1, Token t2, Sentence s) {
  144 + for (Mention m : s.getMentions()) {
  145 + if (m.getSegments().contains(t1) && m.getSegments().contains(t2)) {
  146 + return true;
  147 + }
  148 + }
  149 + return false;
  150 + }
  151 +
  152 + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence,
  153 + List<TreeMap<String, Object>> examples, Sentence s, List<Token> heads) {
  154 +
  155 +
  156 + if (heads == null || heads.isEmpty())
  157 + return;
  158 +
  159 + // collect positive examples
  160 + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>();
  161 + for (Mention m : s.getMentions()) {
  162 + if (heads.containsAll(m.getHeadSegments())) {
  163 + positives.put(m.getHeadSegments().get(0), m.getSegments());
  164 + }
  165 + }
  166 +
  167 + for (Token head : s) {
  168 + if (heads.contains(head)) {
  169 + for (Token t : s) {
  170 + TreeMap<String, Object> features = new TreeMap<>();
  171 +
  172 + if (positives.containsKey(head) && positives.get(head).contains(t)) {
  173 + features.put("class", Boolean.valueOf(true));
  174 + //features.put("candidatePositionInMention", positionInMention(head, t, s));
  175 +
  176 + } else {
  177 + features.put("class", Boolean.valueOf(false));
  178 + //features.put("candidatePositionInMention", 0);
  179 + }
  180 +
  181 + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads);
  182 + //features.put("candidatePositionInMention", positionInMention(head, t, s));
  183 + addPreviousStates(features, head, t, s);
  184 + examples.add(features);
  185 + }
  186 + }
  187 + }
  188 + }
  189 +
  190 + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) {
  191 +
  192 + TreeSet<String> booleanAttsOccurred = new TreeSet<>();
  193 + TreeSet<String> doubleAttsOccurred = new TreeSet<>();
  194 + TreeMap<String, Set<String>> att2values = new TreeMap<>();
  195 + for (TreeMap<String, Object> example : examples) {
  196 + for (Entry<String, Object> e : example.entrySet()) {
  197 + String key = e.getKey();
  198 + Object val = e.getValue();
  199 + if (val instanceof Integer || val instanceof Double) {
  200 + doubleAttsOccurred.add(key);
  201 + continue;
  202 + }
  203 + if (val instanceof Boolean) {
  204 + booleanAttsOccurred.add(key);
  205 + continue;
  206 + }
  207 + if (!att2values.containsKey(key))
  208 + att2values.put(key, new HashSet<>());
  209 + att2values.get(key).add(val.toString());
  210 + }
  211 + }
  212 +
  213 + List<Attribute> atts = new ArrayList<>();
  214 +
  215 + // double attributes
  216 + for (String attName : doubleAttsOccurred) {
  217 + Attribute att = new Attribute(attName);
  218 + atts.add(att);
  219 + }
  220 +
  221 + // boolean attributes (treated as nominal)
  222 + FastVector values = new FastVector(2);
  223 + values.addElement("false");
  224 + values.addElement("true");
  225 + for (String attName : booleanAttsOccurred) {
  226 + Attribute att = new Attribute(attName, values);
  227 + atts.add(att);
  228 + }
  229 +
  230 + // nominal attributes
  231 + for (Entry<String, Set<String>> attVals : att2values.entrySet()) {
  232 + FastVector vals = new FastVector(attVals.getValue().size());
  233 + for (String val : attVals.getValue())
  234 + vals.addElement(val);
  235 + Attribute att = new Attribute(attVals.getKey(), vals);
  236 + atts.add(att);
  237 + }
  238 +
  239 + FastVector fvWekaAttributes = new FastVector(atts.size());
  240 + for (Attribute attr : atts) {
  241 + fvWekaAttributes.addElement(attr);
  242 + }
  243 +
  244 + Instances data = new Instances("Nominal", fvWekaAttributes, 10);
  245 + data.setClass(data.attribute(classFeatureName));
  246 + return data;
  247 + }
  248 +
  249 + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) {
  250 + for (TreeMap<String, Object> example : examples) {
  251 + addInstance(example, instances);
  252 + }
  253 + }
  254 +
  255 + public static void addInstance(TreeMap<String, Object> example, Instances instances) {
  256 + Instance instance = new Instance(instances.numAttributes());
  257 +
  258 + for (Entry<String, Object> e : example.entrySet()) {
  259 + Object val = e.getValue();
  260 + String name = e.getKey();
  261 + if (val instanceof Integer) {
  262 + instance.setValue(instances.attribute(name), (int) val);
  263 + } else if (val instanceof Boolean) {
  264 + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false");
  265 + } else {
  266 + int indexOfValue = instances.attribute(name).indexOfValue(val.toString());
  267 + if (indexOfValue == -1) {
  268 + logger.debug("Unkown value: " + val.toString() + " of feature: " + name
  269 + + ". Marking as missing value.");
  270 + instance.setMissing(instances.attribute(name));
  271 + } else
  272 + instance.setValue(instances.attribute(name), indexOfValue);
  273 + }
  274 + }
  275 +
  276 + instance.setDataset(instances);
  277 + instances.add(instance);
  278 + }
  279 +
  280 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.nominal;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  6 +import weka.classifiers.Classifier;
  7 +import weka.core.Instance;
  8 +import weka.core.Instances;
  9 +
  10 +import java.io.Serializable;
  11 +import java.util.List;
  12 +import java.util.Set;
  13 +import java.util.TreeMap;
  14 +
  15 +public class Model implements Serializable {
  16 +
  17 + private static final long serialVersionUID = 3351727361273283076L;
  18 + private static final Logger logger = LoggerFactory.getLogger(Model.class);
  19 +
  20 + private Classifier classifier;
  21 + private Set<String> quasiVerbs;
  22 + private Instances instances;
  23 +
  24 + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) {
  25 + this.classifier = classifier;
  26 + this.instances = instances;
  27 + this.quasiVerbs = quasiVerbs;
  28 + }
  29 +
  30 + public boolean arePartOfSameMention(Instance instance, Sentence sentence) {
  31 + try {
  32 + double response = this.classifier.classifyInstance(instance);
  33 + return response > 0;
  34 + } catch (Exception e) {
  35 + logger.error("Error classyfing verb in sentence: " + sentence, e);
  36 + return false;
  37 + }
  38 + }
  39 +
  40 + public Instances getInstances(List<TreeMap<String, Object>> examples) {
  41 + Instances instances = new Instances(this.instances);
  42 + InstanceCreator.fillInstances(examples, instances);
  43 + return instances;
  44 + }
  45 +
  46 + public Instances getInstances() {
  47 + Instances instances = new Instances(this.instances);
  48 + return instances;
  49 + }
  50 +
  51 + public Set<String> getQuasiVerbs() {
  52 + return quasiVerbs;
  53 + }
  54 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.nominal;
  2 +
  3 +import java.io.File;
  4 +import java.io.InputStream;
  5 +import java.util.ArrayList;
  6 +import java.util.HashSet;
  7 +import java.util.List;
  8 +import java.util.Map;
  9 +import java.util.Set;
  10 +import java.util.TreeMap;
  11 +import java.util.Map.Entry;
  12 +
  13 +import org.slf4j.Logger;
  14 +import org.slf4j.LoggerFactory;
  15 +
  16 +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
  17 +import pl.waw.ipipan.zil.core.md.detection.nominal.FeatureGeneration;
  18 +import pl.waw.ipipan.zil.core.md.detection.nominal.InstanceCreator;
  19 +import pl.waw.ipipan.zil.core.md.detection.nominal.Model;
  20 +import pl.waw.ipipan.zil.core.md.detection.nominal.Serializer;
  21 +import pl.waw.ipipan.zil.core.md.entities.Mention;
  22 +import pl.waw.ipipan.zil.core.md.entities.Sentence;
  23 +import pl.waw.ipipan.zil.core.md.entities.Token;
  24 +import weka.core.Instances;
  25 +
  26 +public class NominalMentionDetector {
  27 + final private static Logger logger = LoggerFactory.getLogger(NominalMentionDetector.class);
  28 +
  29 + private Model model;
  30 + private Set<String> quasiVerbs = new HashSet<>();
  31 +
  32 + public void addNominalMentions(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, List<Token> heads) {
  33 + List<TreeMap<String, Object>> examples = new ArrayList<>();
  34 + InstanceCreator.loadExamplesFromSentence(quasiVerbs, valence, examples, sentence, heads);
  35 + if (examples.isEmpty())
  36 + return;
  37 +
  38 + Instances instances = model.getInstances();
  39 +
  40 + // label instances
  41 + List<Boolean> areInSameMention = new ArrayList<>();
  42 + for (int i = 0; i < examples.size(); i++) {
  43 + TreeMap<String, Object> example = examples.get(i);
  44 + if (i - 1 < 0) {
  45 + example.put("location-1", Boolean.valueOf(false));
  46 + //example.put("candidatePositionInMention", 0);
  47 + } else {
  48 + example.put("location-1", Boolean.valueOf(areInSameMention.get(i-1)));
  49 +// int positionInMention = 1;
  50 +// while (i - positionInMention >= 0 && areInSameMention.get(i-positionInMention)) {
  51 +// positionInMention++;
  52 +// }
  53 +// example.put("candidatePositionInMention", positionInMention-1);
  54 + }
  55 +
  56 + InstanceCreator.addInstance(example, instances);
  57 + boolean inSameMention = model.arePartOfSameMention(instances.instance(i), sentence);
  58 + areInSameMention.add(inSameMention);
  59 + }
  60 +
  61 + int i = 0;
  62 + for (Token head : sentence) {
  63 + if (heads.contains(head)) {
  64 + ArrayList<Token> mSegments = new ArrayList<Token>();
  65 + ArrayList<Token> mHead = new ArrayList<Token>();
  66 + mHead.add(head);
  67 + for (Token t : sentence) {
  68 + if (head.compareTo(t) != 0) {
  69 + if (areInSameMention.get(i)) {
  70 + mSegments.add(t);
  71 + }
  72 + } else {
  73 + mSegments.add(t);
  74 + }
  75 + i++;
  76 + }
  77 +
  78 + // cleaning
  79 + if(mSegments.get(mSegments.size()-1).getCtag().equals("prep") || mSegments.get(mSegments.size()-1).getCtag().equals("conj") ||
  80 + mSegments.get(mSegments.size()-1).getCtag().equals("comp")) {
  81 + mSegments.remove(mSegments.size()-1);
  82 + }
  83 + if(mSegments.get(0).getCtag().equals("prep") || mSegments.get(0).getCtag().equals("conj") ||
  84 + mSegments.get(0).getCtag().equals("comp")) {
  85 + mSegments.remove(0);
  86 + }
  87 +
  88 + sentence.addMention(new Mention(mSegments, mHead));
  89 + }
  90 + }
  91 + }
  92 +
  93 + public NominalMentionDetector(File zeroSubjectDetectionModel) {
  94 + try {
  95 + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath());
  96 + this.quasiVerbs = this.model.getQuasiVerbs();
  97 + } catch (Exception e) {
  98 + logger.error("Error loading model:" + e);
  99 + }
  100 + }
  101 +
  102 + public NominalMentionDetector(InputStream zeroSubjectDetectionModelStream) {
  103 + try {
  104 + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream);
  105 + this.quasiVerbs = this.model.getQuasiVerbs();
  106 + } catch (Exception e) {
  107 + logger.error("Error loading model:" + e);
  108 + }
  109 + }
  110 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.nominal;
  2 +
  3 +import weka.core.SerializationHelper;
  4 +
  5 +import java.io.InputStream;
  6 +
  7 +public class Serializer {
  8 +
  9 + public static void saveModel(Model m, String targetModelFilePath) throws Exception {
  10 + SerializationHelper.write(targetModelFilePath, m);
  11 + }
  12 +
  13 + public static Model loadModel(String path) throws Exception {
  14 + Model m = (Model) SerializationHelper.read(path);
  15 + return m;
  16 + }
  17 +
  18 + public static Model loadModelFromStream(InputStream stream) throws Exception {
  19 + Model m = (Model) SerializationHelper.read(stream);
  20 + return m;
  21 + }
  22 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.detection.nominal;
  2 +
  3 +import org.slf4j.Logger;
  4 +import org.slf4j.LoggerFactory;
  5 +
  6 +import pl.waw.ipipan.zil.core.md.Main;
  7 +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts;
  8 +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector;
  9 +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
  10 +import weka.classifiers.Evaluation;
  11 +import weka.classifiers.rules.JRip;
  12 +import weka.classifiers.rules.JRip.RipperRule;
  13 +import weka.classifiers.trees.J48;
  14 +import weka.core.Attribute;
  15 +import weka.core.Instance;
  16 +import weka.core.Instances;
  17 +
  18 +import java.io.*;
  19 +import java.util.*;
  20 +
  21 +public class Trainer {
  22 +
  23 + private static final Logger logger = LoggerFactory.getLogger(Trainer.class);
  24 +
  25 + private static final boolean DO_CV = false;
  26 + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt";
  27 + private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt";
  28 + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt";
  29 +
  30 + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence =
  31 + new EnumMap(ValenceDicts.class);
  32 +
  33 + static {
  34 + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
  35 + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));
  36 +
  37 + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
  38 + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
  39 + }
  40 +
  41 + private Trainer() {
  42 + }
  43 +
  44 + public static void main(String[] args) {
  45 +
  46 + if (args.length != 2) {
  47 + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName()
  48 + + " trainDir targetModelFile");
  49 + return;
  50 + }
  51 +
  52 + File dataDir = new File(args[0]);
  53 + String targetModelFilePath = args[1];
  54 +
  55 + if (!dataDir.isDirectory()) {
  56 + logger.error(dataDir + " is not a directory!");
  57 + return;
  58 + }
  59 +
  60 + Set<String> quasiVerbs = loadQuasiVerbs();
  61 +
  62 + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE);
  63 + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream));
  64 +
  65 + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE);
  66 + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream));
  67 +
  68 + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs, valence);
  69 + Instances instances = InstanceCreator.createInstances(examples, "class");
  70 + InstanceCreator.fillInstances(examples, instances);
  71 +
  72 + printStats(instances);
  73 +
  74 + try {
  75 + J48 model;
  76 +
  77 + logger.info("Building final classifier...");
  78 + model = new J48();
  79 + model.buildClassifier(instances);
  80 + logger.info("J48 tree:");
  81 + logger.info(model.toString());
  82 +
  83 + instances.delete();
  84 + logger.info("Features stats:");
  85 + for (int i = 0; i < instances.numAttributes(); i++) {
  86 + Attribute att = instances.attribute(i);
  87 + logger.info(i + ".\t" + att.toString());
  88 + }
  89 +
  90 + logger.info("Saving classifier...");
  91 + Model m = new Model(model, instances, quasiVerbs);
  92 + Serializer.saveModel(m, targetModelFilePath);
  93 + logger.info("Done.");
  94 +
  95 + } catch (Exception e) {
  96 + logger.error("Error: " + e);
  97 + }
  98 +
  99 +/* try {
  100 + JRip model;
  101 +
  102 + if (DO_CV) {
  103 + logger.info("Crossvalidation...");
  104 + model = new JRip();
  105 + Evaluation eval = new Evaluation(instances);
  106 + eval.crossValidateModel(model, instances, 10, new Random(1));
  107 + logger.info(eval.toSummaryString());
  108 + logger.info(eval.toMatrixString());
  109 + logger.info(eval.toClassDetailsString());
  110 + }
  111 +
  112 + logger.info("Building final classifier...");
  113 + model = new JRip();
  114 + model.buildClassifier(instances);
  115 + logger.info(model.getRuleset().size() + " rules generated.");
  116 + for (int i = 0; i < model.getRuleset().size(); i++) {
  117 + RipperRule v = (RipperRule) model.getRuleset().elementAt(i);
  118 + logger.info("\t" + v.toString(instances.classAttribute()));
  119 + }
  120 +
  121 + instances.delete();
  122 + logger.info("Features stats:");
  123 + for (int i = 0; i < instances.numAttributes(); i++) {
  124 + Attribute att = instances.attribute(i);
  125 + logger.info(i + ".\t" + att.toString());
  126 + }
  127 +
  128 + logger.info("Saving classifier...");
  129 + Model m = new Model(model, instances, quasiVerbs);
  130 + Serializer.saveModel(m, targetModelFilePath);
  131 + logger.info("Done.");
  132 +
  133 + } catch (Exception e) {
  134 + logger.error("Error: " + e);
  135 + }*/
  136 + }
  137 +
  138 + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream)
  139 + {
  140 + Map<String,ArrayList<String>> map;
  141 + try {
  142 + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream));
  143 + map = new HashMap<String,ArrayList<String>>();
  144 + String line;
  145 + boolean firstLine = true;
  146 + while((line = br.readLine()) != null) {
  147 + if (firstLine) {
  148 + line = line.replace("\uFEFF", ""); // remove BOM character
  149 + firstLine = false;
  150 + }
  151 +
  152 + if (!line.startsWith("%")) {
  153 + String[] lineParts = line.split(":");
  154 + String lemma = lineParts[0].trim();
  155 + String schema = lineParts[5].trim();
  156 +
  157 + if (schema.trim().isEmpty()) {
  158 + continue;
  159 + }
  160 +
  161 + String[] lemmaParts = lemma.split(" ");
  162 + if(lemmaParts.length == 1 && schemaContainsSie(schema)) {
  163 + lemma = lemma + " się";
  164 + }
  165 +
  166 + ArrayList<String> schemata;
  167 + if (!map.containsKey(lemma)) {
  168 + schemata = new ArrayList<String>();
  169 + schemata.add(schema);
  170 + map.put(lemma, schemata);
  171 + } else {
  172 + schemata = map.get(lemma);
  173 + schemata.add(schema);
  174 + map.put(lemma, schemata);
  175 + }
  176 + }
  177 + }
  178 + br.close();
  179 + } catch (IOException ex) {
  180 + ex.printStackTrace();
  181 + throw new RuntimeException(ex);
  182 + }
  183 + return map;
  184 + }
  185 +
  186 + private static boolean schemaContainsSie(String schema) {
  187 + for (String position : schema.split("\\s\\+\\s")) {
  188 + position = position.trim();
  189 + position = position.substring(1, position.length()-1);
  190 + for (String phrT : position.split(";")) {
  191 + if (phrT.equals("refl") || phrT.equals("recip")) {
  192 + return true;
  193 + }
  194 + }
  195 + }
  196 +
  197 + return false;
  198 + }
  199 +
  200 + private static Set<String> loadQuasiVerbs() {
  201 + Set<String> quasiVerbs = new HashSet<>();
  202 + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH);
  203 + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) {
  204 + String line;
  205 + while ((line = br.readLine()) != null) {
  206 + quasiVerbs.add(line.trim());
  207 + }
  208 + } catch (IOException e) {
  209 + logger.error(e.getLocalizedMessage(), e);
  210 + }
  211 + return quasiVerbs;
  212 + }
  213 +
  214 + private static void printStats(Instances instances) {
  215 + int positive = 0;
  216 + int negative = 0;
  217 + for (int i = 0; i < instances.numInstances(); i++) {
  218 + Instance inst = instances.instance(i);
  219 + if (inst.classValue() > 0)
  220 + negative++;
  221 + else
  222 + positive++;
  223 + }
  224 + logger.info(positive + " positive examples");
  225 + logger.info(negative + " negative examples");
  226 + logger.info((positive + negative) + " examples total");
  227 + logger.info((instances.numAttributes() - 1) + " attributes");
  228 + logger.info(instances.toSummaryString());
  229 + }
  230 +
  231 +}
... ...
src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java 0 → 100644
  1 +package pl.waw.ipipan.zil.core.md.entities;
  2 +
  3 +public class Relation {
  4 +
  5 + private String name;
  6 + private Token target;
  7 +
  8 + public Relation(String name, Token target) {
  9 + this.name = name;
  10 + this.target = target;
  11 + }
  12 +
  13 + public String getName() {
  14 + return name;
  15 + }
  16 +
  17 + public Token getTarget() {
  18 + return target;
  19 + }
  20 +
  21 +}
... ...