diff --git a/.gitignore b/.gitignore index cd578f0..c4ffafe 100644 --- a/.gitignore +++ b/.gitignore @@ -2,3 +2,4 @@ .classpath .project .settings +/bin/ diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java new file mode 100644 index 0000000..fa20749 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/FeatureGeneration.java @@ -0,0 +1,448 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import pl.waw.ipipan.zil.core.md.detection.Constants; +import pl.waw.ipipan.zil.core.md.entities.*; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; + +import java.util.*; + +public class FeatureGeneration { + final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo", + "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" })); + + final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale", + "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie", + "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" })); + + final private static Set<String> CLAUSE_SPLIT_LEMMAS_STRICT = new HashSet<>( + Arrays.asList(new String[] { "?", "!" 
})); + + final private static Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>(); + static { + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")"); + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\""); + CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'"); + } + + final private static Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "subst", "depr", "ppron12", + "ppron3", "ger", "num", "numcol" })); + + final private static Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList(new String[] { "ppron12", "ppron3" })); + + final private static Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(new String[] { "fin", "bedzie", "aglt", + "praet", "winien" })); + + final private static Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "jaki", + "który" })); + + public static void generateFeatures(Map<String, Object> features, Token t, Sentence s, Set<String> quasiVerbs) { + + features.put("ctag", t.getChosenInterpretation().getCtag()); + features.put("number", t.getChosenInterpretation().getNumber()); + + features.put("NGHead", NGHead(t, s)); + features.put("isNextColon", isNextColon(t, s)); + features.put("wordCtag", wordCtag(t, s)); + features.put("isPartOfNE", isPartOfNE(t, s)); + features.put("isFirstInNE", isFirstInNE(t, s)); + features.put("nextCtag", getNeighbouringTag(s, t, 1)); + features.put("prevCtag", getNeighbouringTag(s, t, -1)); + features.put("sentLength", s.size()); + + features.put("tokenOrthLength", t.getOrth().length()); + features.put("tokenBaseLength", t.getBase().length()); + features.put("isNextDot", isNextDot(t, s)); + features.put("closestNEDistance", closestNEDistance(t, s)); + features.put("startsWithUpperOrth", Character.isUpperCase(t.getOrth().codePointAt(0))); + features.put("startsWithUpperBase", Character.isUpperCase(t.getBase().codePointAt(0))); + + + //features.put("isPartOfFrazeo", isPartOfFrazeo(t, s)); + //features.put("gender", t.getChosenInterpretation().getGender()); + 
//features.put("person", t.getChosenInterpretation().getPerson()); + //features.put("quasi", quasiVerbs.contains(m.getChosenInterpretation().getBase())); + //features.put("isPrevPraet", isPrevPraet(t, s)); + //features.put("isPrevComma", isPrevComma(t, s)); + //features.put("isPrev2Pred", isPrev2Pred(t, s)); + //features.put("isNextInf", isNextInf(t, s)); + + + //List<Token> clause = getClause(s, m); +// features.put("clauseLength", clause.size()); + + //addFeatures(features, clause, "clause", m); +/* addFeatures(features, s, "sent", t); + for (int i = 1; i < 6; i++) + addFeatures(features, getWindow(s, t, i, 0), "window_" + i + "_" + 0, t); + for (int i = 1; i < 6; i++) + addFeatures(features, getWindow(s, t, 0, i), "window_" + 0 + "_" + i, t); + for (int i = 1; i < 6; i++) + addFeatures(features, getWindow(s, t, i, i), "window_" + i + "_" + i, t);*/ + } + + /////////////////////////////////// + + private static boolean NGHead(Token t, Sentence s) { + + for (SyntacticGroup group : s.getGroups()) { + if (group.getType().startsWith("NG") && group.getSemanticHeadTokens().contains(t)) { + return Boolean.valueOf(true); + } + } + return Boolean.valueOf(false); + } + + private static boolean isNextColon(Token t, Sentence s) { + int idx = s.indexOf(t) + 1; + if (idx >= s.size() || idx < 0) + return Boolean.valueOf(false); + return Boolean.valueOf(s.get(idx).getOrth().equals(":")); + } + + private static boolean isNextDot(Token t, Sentence s) { + int idx = s.indexOf(t) + 1; + if (idx >= s.size() || idx < 0) + return Boolean.valueOf(false); + return Boolean.valueOf(s.get(idx).getOrth().equals(".")); + } + + private static String wordCtag(Token t, Sentence s) { + for (SyntacticWord w : s.getSyntacticWords()) { + if (w.getTokens().contains(t)) { + return w.getCtag(); + } + } + return "None"; + } + + private static boolean isPartOfNE(Token t, Sentence s) { + for (NamedEntity ne : s.getNamedEntities()) { + if (ne.getTokens().contains(t)) { + return Boolean.valueOf(true); + } + 
} + return Boolean.valueOf(false); + } + + private static int closestNEDistance(Token t, Sentence s) { + int lowestDistance = -1; + for (NamedEntity ne : s.getNamedEntities()) { + int distance = ne.getTokens().get(0).getSentencePosition() - t.getSentencePosition(); + if ( distance >= 0 && (distance < lowestDistance || lowestDistance < 0)) { + lowestDistance = distance; + } + } + return lowestDistance; + } + + private static boolean isFirstInNE(Token t, Sentence s) { + for (NamedEntity ne : s.getNamedEntities()) { + if (ne.getTokens().get(0).compareTo(t) == 0) { + return Boolean.valueOf(true); + } + } + return Boolean.valueOf(false); + } + + private static boolean isPartOfFrazeo(Token t, Sentence s) { + for (SyntacticWord word : s.getSyntacticWords()) { + if (word.getTokens().contains(t) && + Constants.FRAZEO_CTAGS.contains(word.getCtag())) { + return true; + } + } + return false; + } + + /////////////////////////////////// + + private static boolean isNextInf(Token m, Sentence s) { + boolean now = false; + for (Token morph : s) { + if (now) + return morph.getChosenInterpretation().getCtag().equals("inf"); + if (m.equals(morph)) + now = true; + } + return false; + } + + private static boolean isPrev2Pred(Token m, Sentence s) { + Token prev = null; + Token prev2 = null; + for (Token morph : s) { + if (m.equals(morph)) + break; + prev2 = prev; + prev = morph; + } + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); + } + + private static Object isPrevComma(Token m, Sentence s) { + Token prev = null; + for (Token morph : s) { + if (m.equals(morph)) + break; + prev = morph; + } + return prev != null && prev.getChosenInterpretation().getBase().equals(","); + } + + private static String getNeighbouringTag(Sentence s, Token m, int i) { + int idx = s.indexOf(m) + i; + if (idx >= s.size() || idx < 0) + return "None"; + return 
s.get(idx).getChosenInterpretation().getCtag(); + } + + private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) { + + boolean hasNom = false; // 1 + boolean hasNum = false; // 2 + boolean hasPOG = false; // 3 + + boolean hasNomNum = false; + boolean hasNumPOG = false; + boolean hasNomPOG = false; + boolean hasNomNumPOG = false; + + boolean has2Nom = false; + boolean has2NomPOG = false; + boolean has2POG = false; + + Token prev = null; + for (Token candidate : clause) { + + if (!isNoun(candidate) || isJakJako(prev)) { + prev = candidate; + continue; + } + + // nom, nom2 + if (isNom(candidate)) { + if (hasNom) + has2Nom = true; + hasNom = true; + } + // num + if (agreedNum(candidate, m)) { + hasNum = true; + } + // pog, pog2 + if (agreedGenderOrPerson(candidate, m)) { + if (hasPOG) + has2POG = true; + hasPOG = true; + } + + // nom num, nom num pog + if (isNom(candidate) && agreedNum(candidate, m)) { + if (agreedGenderOrPerson(candidate, m)) + hasNomNumPOG = true; + hasNomNum = true; + } + + // nom pog, num pog + if (agreedGenderOrPerson(candidate, m)) + if (isNom(candidate)) { + if (hasNomPOG) + has2NomPOG = true; + hasNomPOG = true; + } else if (agreedNum(candidate, m)) + hasNumPOG = true; + + prev = candidate; + } + + // features.put("conj_" + prefix, hasConj); + features.put("cand_2_nom_" + prefix, has2Nom); + features.put("cand_2_POG_" + prefix, has2POG); + features.put("cand_2_nom+POG_" + prefix, has2NomPOG); + + features.put("cand_nom_" + prefix, hasNom); + features.put("cand_num_" + prefix, hasNum); + features.put("cand_POG_" + prefix, hasPOG); + + features.put("cand_nom+num_" + prefix, hasNomNum); + features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG); + features.put("cand_nom+POG_" + prefix, hasNomPOG); + features.put("cand_num+POG_" + prefix, hasNumPOG); + } + + private static List<Token> getWindow(Sentence s, Token m, int pre, int post) { + + int idx = s.indexOf(m); + int from = Math.max(0, idx - pre); 
+ int to = Math.min(s.size(), idx + post + 1); + + return new ArrayList<>(s.subList(from, to)); + } + + private static boolean isPrevPraet(Token m, Sentence s) { + Token prev = null; + for (Token morph : s) { + if (m.equals(morph)) + break; + prev = morph; + } + return prev != null && prev.getChosenInterpretation().getCtag().equals("praet"); + } + + /** + * - podział na klauzule: przecinki oraz spójniki bezprzecinkowe: i, albo, + * lub (jak przy streszczeniach: w środku musi być czasownik w formie + * osobowej), + * + * @param s + * sentence + * @param m2 + * token + * @return clause with the token + */ + public static List<Token> getClause(Sentence s, Token m2) { + + List<List<Token>> sublists = getClauses(s); + + for (List<Token> sub : sublists) + for (Token m : sub) + if (m.equals(m2)) + return sub; + + return null; + } + + public static List<List<Token>> getClauses(Sentence s) { + + Set<Token> noSplitMorphs = new HashSet<>(); + for (SyntacticGroup g : s.getGroups()) { + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { + noSplitMorphs.add(m); + } + } + for (SyntacticWord g : s.getSyntacticWords()) { + for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) { + noSplitMorphs.add(m); + } + } + + LinkedList<List<Token>> sublists = new LinkedList<>(); + List<Token> currentSublist = new ArrayList<>(); + boolean clauseHasVerb = false; + for (Token m : s) { + String base = m.getChosenInterpretation().getBase(); + if (!noSplitMorphs.contains(m) + && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2 + .contains(base)) && clauseHasVerb))) { + sublists.add(currentSublist); + currentSublist = new ArrayList<>(); + clauseHasVerb = false; + } else { + if (isVerb(m)) + clauseHasVerb = true; + } + currentSublist.add(m); + } + if (currentSublist.size() > 0) { + if (clauseHasVerb) + sublists.add(currentSublist); + else + sublists.getLast().addAll(currentSublist); + } + + // merge clause 
beginning with zaimek wzgl. etc to previous clause + List<Token> prev = null; + Iterator<List<Token>> it = sublists.iterator(); + while (it.hasNext()) { + List<Token> sublist = it.next(); + boolean containsRelPron = false; + int i = 1; + for (Token m : sublist) { + if (i > 2) + break; + if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) { + containsRelPron = true; + break; + } + i++; + } + if (prev != null && containsRelPron) { + prev.addAll(sublist); + it.remove(); + } else + prev = sublist; + } + + return sublists; + } + + private static boolean agreedNum(Token candidate, Token keyword) { + String keywordNum = keyword.getNumber(); + String wordNum = candidate.getNumber(); + return keywordNum.equals(wordNum); + } + + private static boolean agreedGenderOrPerson(Token candidate, Token keyword) { + if (isPraet(keyword)) { + // praet has number:gender + String keywordGender = keyword.getGender(); + String wordGender = candidate.getGender(); + return keywordGender.equals(wordGender); + } else { + // other verbs have number:person + String keywordPerson = keyword.getPerson(); + String wordPerson = "ter"; // default + if (PRONOUN_TAGS.contains(candidate.getCtag())) + wordPerson = candidate.getPerson(); + return wordPerson.equals(keywordPerson); + } + } + + private static boolean isJakJako(Token prev) { + String base = prev == null ? null : prev.getBase(); + return prev != null && (base.equals("jak") || base.equals("jako")); + } + + private static boolean isPraet(Token keyword) { + return keyword.getCtag().equals("praet"); + } + + private static boolean isNom(Token candidate) { + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow + // tylko! 
+ } + + private static boolean isNoun(Token m) { + return NOUN_TAGS.contains(m.getCtag()); + } + + public static boolean isVerb(Token morph) { + return VERB_TAGS.contains(morph.getCtag()); + } + + public static boolean isVerb(Mention m) { + boolean hasOnlyVerbs = true; + for (Token morph : m.getSegments()) + if (!isVerb(morph)) { + hasOnlyVerbs = false; + break; + } + return hasOnlyVerbs; + } + + public static boolean isVerb(TEIMention m) { + boolean hasOnlyVerbs = true; + for (TEIMorph morph : m.getMorphs()) + if (!isVerb(morph)) { + hasOnlyVerbs = false; + break; + } + return hasOnlyVerbs; + } + + private static boolean isVerb(TEIMorph morph) { + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java new file mode 100644 index 0000000..34b15d1 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/HeadDetector.java @@ -0,0 +1,69 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.core.md.entities.Sentence; +import pl.waw.ipipan.zil.core.md.entities.Token; +import weka.core.Instances; + +import java.io.File; +import java.io.InputStream; +import java.util.*; + +public class HeadDetector { + + final private static Logger logger = LoggerFactory.getLogger(HeadDetector.class); + + private Model model; + private Set<String> quasiVerbs = new HashSet<>(); + + public static int detectedHeads = 0; + + public List<Token> detectHeads(Sentence sentence) { + List<TreeMap<String, Object>> examples = new ArrayList<>(); + InstanceCreator.loadExamplesFromSentence(quasiVerbs, examples, sentence); + if (examples.isEmpty()) + return null; + + Instances instances = model.getInstances(examples); + + // label instances + List<Boolean> areHeads = new ArrayList<>(); + List<Token> heads = new 
ArrayList<>(); + for (int i = 0; i < instances.numInstances(); i++) { + boolean isHead = model.isHead(instances.instance(i), sentence); + areHeads.add(isHead); + if (isHead) + detectedHeads++; + } + + int i = 0; + for (Token m : sentence) { + if (FeatureGeneration.isVerb(m)) + continue; + if (areHeads.get(i)) + heads.add(m); + // sentence.addMention(new Mention(m, false)); + i++; + } + return heads; + } + + public HeadDetector(File zeroSubjectDetectionModel) { + try { + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); + this.quasiVerbs = this.model.getQuasiVerbs(); + } catch (Exception e) { + logger.error("Error loading model:" + e); + } + } + + public HeadDetector(InputStream zeroSubjectDetectionModelStream) { + try { + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); + this.quasiVerbs = this.model.getQuasiVerbs(); + } catch (Exception e) { + logger.error("Error loading model:" + e); + } + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java new file mode 100644 index 0000000..aa94e4b --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/InstanceCreator.java @@ -0,0 +1,172 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.core.md.entities.*; +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; +import weka.core.Attribute; +import weka.core.FastVector; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.File; +import java.util.*; +import java.util.Map.Entry; + +public class InstanceCreator { + + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); + private 
static final TEI_IO teiIO = TEI_IO.getInstance(); + + private InstanceCreator() { + } + + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs) { + int allTexts = 0; + int exceptions = 0; + int allSentences = 0; + + List<TreeMap<String, Object>> examples = new ArrayList<>(); + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { + try { + allTexts++; + logger.info("Processing text " + textDir); + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); + Text text = TeiLoader.loadTextFromTei(ct, textDir); + + for (Paragraph p : text) + for (Sentence s : p) { + allSentences++; + loadExamplesFromSentence(quasiVerbs, examples, s); + } + + } catch (Exception e) { + logger.error(e.getLocalizedMessage()); + exceptions++; + } + } + + logger.info(allTexts + " texts found."); + if (exceptions != 0) + logger.error(exceptions + " texts with exceptions."); + logger.info(allSentences + " sentences found."); + + return examples; + } + + public static void loadExamplesFromSentence(Set<String> quasiVerbs, List<TreeMap<String, Object>> examples, + Sentence s) { + + // collect positive examples + Set<Token> positive = new HashSet<>(); + for (Mention m : s.getMentions()) { + if (!FeatureGeneration.isVerb(m)) { + positive.addAll(m.getHeadSegments()); + } + } + + for (Token m : s) { + if (FeatureGeneration.isVerb(m)) + continue; + + TreeMap<String, Object> features = new TreeMap<>(); + if (positive.contains(m)) { + features.put("class", Boolean.valueOf(true)); + } else { + features.put("class", Boolean.valueOf(false)); + } + + FeatureGeneration.generateFeatures(features, m, s, quasiVerbs); + examples.add(features); + } + } + + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { + + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); + TreeSet<String> doubleAttsOccurred = new TreeSet<>(); + TreeMap<String, Set<String>> att2values = new TreeMap<>(); + for (TreeMap<String, Object> example : 
examples) { + for (Entry<String, Object> e : example.entrySet()) { + String key = e.getKey(); + Object val = e.getValue(); + if (val instanceof Integer || val instanceof Double) { + doubleAttsOccurred.add(key); + continue; + } + if (val instanceof Boolean) { + booleanAttsOccurred.add(key); + continue; + } + if (!att2values.containsKey(key)) + att2values.put(key, new HashSet<>()); + att2values.get(key).add(val.toString()); + } + } + + List<Attribute> atts = new ArrayList<>(); + + // double attributes + for (String attName : doubleAttsOccurred) { + Attribute att = new Attribute(attName); + atts.add(att); + } + + // boolean attributes (treated as nominal) + FastVector values = new FastVector(2); + values.addElement("false"); + values.addElement("true"); + for (String attName : booleanAttsOccurred) { + Attribute att = new Attribute(attName, values); + atts.add(att); + } + + // nominal attributes + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { + FastVector vals = new FastVector(attVals.getValue().size()); + for (String val : attVals.getValue()) + vals.addElement(val); + Attribute att = new Attribute(attVals.getKey(), vals); + atts.add(att); + } + + FastVector fvWekaAttributes = new FastVector(atts.size()); + for (Attribute attr : atts) { + fvWekaAttributes.addElement(attr); + } + + Instances data = new Instances("Head", fvWekaAttributes, 10); + data.setClass(data.attribute(classFeatureName)); + return data; + } + + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { + for (TreeMap<String, Object> example : examples) { + Instance instance = new Instance(instances.numAttributes()); + + for (Entry<String, Object> e : example.entrySet()) { + Object val = e.getValue(); + String name = e.getKey(); + if (val instanceof Integer) { + instance.setValue(instances.attribute(name), (int) val); + } else if (val instanceof Boolean) { + instance.setValue(instances.attribute(name), ((Boolean) val) ? 
"true" : "false"); + } else { + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); + if (indexOfValue == -1) { + logger.debug("Unkown value: " + val.toString() + " of feature: " + name + + ". Marking as missing value."); + instance.setMissing(instances.attribute(name)); + } else + instance.setValue(instances.attribute(name), indexOfValue); + } + } + + instance.setDataset(instances); + instances.add(instance); + } + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java new file mode 100644 index 0000000..1af12a6 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Model.java @@ -0,0 +1,49 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.core.md.entities.Sentence; +import weka.classifiers.Classifier; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.Serializable; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; + +public class Model implements Serializable { + + private static final long serialVersionUID = 3351727361273283076L; + private static final Logger logger = LoggerFactory.getLogger(Model.class); + + private Classifier classifier; + private Set<String> quasiVerbs; + private Instances instances; + + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { + this.classifier = classifier; + this.instances = instances; + this.quasiVerbs = quasiVerbs; + } + + public boolean isHead(Instance instance, Sentence sentence) { + try { + double response = this.classifier.classifyInstance(instance); + return response > 0; + } catch (Exception e) { + logger.error("Error classyfing head in sentence: " + sentence, e); + return false; + } + } + + public Instances getInstances(List<TreeMap<String, Object>> examples) { + Instances instances = new Instances(this.instances); 
+ InstanceCreator.fillInstances(examples, instances); + return instances; + } + + public Set<String> getQuasiVerbs() { + return quasiVerbs; + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java new file mode 100644 index 0000000..3c8df90 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Serializer.java @@ -0,0 +1,22 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import weka.core.SerializationHelper; + +import java.io.InputStream; + +public class Serializer { + + public static void saveModel(Model m, String targetModelFilePath) throws Exception { + SerializationHelper.write(targetModelFilePath, m); + } + + public static Model loadModel(String path) throws Exception { + Model m = (Model) SerializationHelper.read(path); + return m; + } + + public static Model loadModelFromStream(InputStream stream) throws Exception { + Model m = (Model) SerializationHelper.read(stream); + return m; + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java new file mode 100644 index 0000000..e1fc04f --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/head/Trainer.java @@ -0,0 +1,119 @@ +package pl.waw.ipipan.zil.core.md.detection.head; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import weka.classifiers.Evaluation; +import weka.classifiers.rules.JRip; +import weka.classifiers.rules.JRip.RipperRule; +import weka.core.Attribute; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.*; +import java.util.*; + +public class Trainer { + + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); + + private static final boolean DO_CV = false; + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; + + private Trainer() { + } + + public static void main(String[] 
args) { + + if (args.length != 2) { + logger.error("Wrong number of arguments! Should be: " + Trainer.class.getSimpleName() + + " trainDir targetModelFile"); + return; + } + + File dataDir = new File(args[0]); + String targetModelFilePath = args[1]; + + if (!dataDir.isDirectory()) { + logger.error(dataDir + " is not a directory!"); + return; + } + + Set<String> quasiVerbs = loadQuasiVerbs(); + + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs); + Instances instances = InstanceCreator.createInstances(examples, "class"); + InstanceCreator.fillInstances(examples, instances); + + printStats(instances); + + try { + JRip model; + + if (DO_CV) { + logger.info("Crossvalidation..."); + model = new JRip(); + Evaluation eval = new Evaluation(instances); + eval.crossValidateModel(model, instances, 10, new Random(1)); + logger.info(eval.toSummaryString()); + logger.info(eval.toMatrixString()); + logger.info(eval.toClassDetailsString()); + } + + logger.info("Building final classifier..."); + model = new JRip(); + model.buildClassifier(instances); + logger.info(model.getRuleset().size() + " rules generated."); + for (int i = 0; i < model.getRuleset().size(); i++) { + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); + logger.info("\t" + v.toString(instances.classAttribute())); + } + + instances.delete(); + logger.info("Features stats:"); + for (int i = 0; i < instances.numAttributes(); i++) { + Attribute att = instances.attribute(i); + logger.info(i + ".\t" + att.toString()); + } + + logger.info("Saving classifier..."); + Model m = new Model(model, instances, quasiVerbs); + Serializer.saveModel(m, targetModelFilePath); + logger.info("Done."); + + } catch (Exception e) { + logger.error("Error: " + e); + } + } + + private static Set<String> loadQuasiVerbs() { + Set<String> quasiVerbs = new HashSet<>(); + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); + try (BufferedReader br = new BufferedReader(new 
InputStreamReader(stream))) { + String line; + while ((line = br.readLine()) != null) { + quasiVerbs.add(line.trim()); + } + } catch (IOException e) { + logger.error(e.getLocalizedMessage(), e); + } + return quasiVerbs; + } + + private static void printStats(Instances instances) { + int positive = 0; + int negative = 0; + for (int i = 0; i < instances.numInstances(); i++) { + Instance inst = instances.instance(i); + if (inst.classValue() > 0) + negative++; + else + positive++; + } + logger.info(positive + " positive examples"); + logger.info(negative + " negative examples"); + logger.info((positive + negative) + " examples total"); + logger.info((instances.numAttributes() - 1) + " attributes"); + logger.info(instances.toSummaryString()); + } + +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java new file mode 100644 index 0000000..b3e406e --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/FeatureGeneration.java @@ -0,0 +1,762 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; +import pl.waw.ipipan.zil.core.md.detection.Constants; +import pl.waw.ipipan.zil.core.md.entities.*; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMention; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEIMorph; + +import java.util.*; + + +public class FeatureGeneration { + final private static Set<String> CLAUSE_SPLIT_LEMMAS = new HashSet<>(Arrays.asList(new String[] { "i", "albo", + "lub", "oraz", "bądź", "ani", "czy", "niż", "tudzież", ",", ";", "-", "–", ":" })); + + final private static Set<String> CLAUSE_SPLIT_LEMMAS2 = new HashSet<>(Arrays.asList(new String[] { "a", "ale", + "lecz", "jednak", "jednakże", "zaś", "wszakże", "owszem", "natomiast", "tylko", "dlatego", "jedynie", + "przecież", "tymczasem", "ponieważ", "więc", "dlatego", "toteż", "zatem" })); + + final 
/** Strict clause delimiters: always end a clause, regardless of verb presence. */
private static final Set<String> CLAUSE_SPLIT_LEMMAS_STRICT =
        new HashSet<>(Arrays.asList("?", "!"));

/** Paired delimiters: opening lemma mapped to its expected closing lemma. */
private static final Map<String, String> CLAUSE_SPLIT_LEMMAS_PAIRWISE = new HashMap<>();
static {
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("(", ")");
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("\"", "\"");
    CLAUSE_SPLIT_LEMMAS_PAIRWISE.put("'", "'");
}

/** Ctags of nominal tokens (mention-head candidates). */
private static final Set<String> NOUN_TAGS = new HashSet<>(Arrays.asList(
        "subst", "depr", "ppron12", "ppron3", "ger", "num", "numcol"));

/** Ctags of personal pronouns. */
private static final Set<String> PRONOUN_TAGS = new HashSet<>(Arrays.asList("ppron12", "ppron3"));

/** Ctags of finite verb forms. */
private static final Set<String> VERB_TAGS = new HashSet<>(Arrays.asList(
        "fin", "bedzie", "aglt", "praet", "winien"));

/** Lemmas of Polish relative pronouns. */
private static final Set<String> ZAIMKI_WZGLEDNE_LEMMAS = new HashSet<>(Arrays.asList("jaki", "który"));

/**
 * Generates classifier features describing the relation between a mention
 * head and a candidate mention-segment token within one sentence. Results
 * are written in place into the supplied feature map.
 *
 * @param features  target map, filled in place (feature name -> value)
 * @param valence   Walenty valence dictionaries (the noun valence dict is used)
 * @param head      mention head token
 * @param candidate candidate mention-segment token
 * @param s         sentence containing both tokens
 * @param heads     all head tokens detected in the sentence
 */
public static void generateFeatures(Map<String, Object> features, Map<ValenceDicts, Map<String, ArrayList<String>>> valence,
        Token head, Token candidate, Sentence s, List<Token> heads) {

    // Morphosyntactic description of the candidate and its neighbourhood.
    addTokenFeatures(features, "candidate", candidate, s);

    // Structural relations between head and candidate.
    features.put("sameWord", sameWord(head, candidate, s));
    features.put("sameNE", sameNE(head, candidate, s));
    features.put("sameNG", sameNG(head, candidate, s));

    features.put("distance", Math.abs(head.getSentencePosition() - candidate.getSentencePosition()));
    features.put("candidateIsFirst", head.compareTo(candidate) > 0);

    features.put("sameWalentyConstruction", sameWalentyConstruction(head, candidate, s, valence));
    features.put("sameToken", sameToken(head, candidate));

    features.put("candidateIsAlsoHead", heads.contains(candidate));
    features.put("isNextToCandidateColon", isNextColon(candidate, s));

    // Orthographic and punctuation cues.
    features.put("candidateStartsWithUpperOrth", Character.isUpperCase(candidate.getOrth().codePointAt(0)));
    features.put("candidateStartsWithUpperBase", Character.isUpperCase(candidate.getBase().codePointAt(0)));
    features.put("isDotNextToHead", isNextDot(head, s));
    features.put("closestNEDistance", closestNEDistance(head, candidate, s));
    features.put("headStartsWithUpperOrth", Character.isUpperCase(head.getOrth().codePointAt(0)));
    features.put("headStartsWithUpperBase", Character.isUpperCase(head.getBase().codePointAt(0)));
}

/**
 * Distance (in tokens) from the head to the start of the closest named
 * entity that starts at or after the head and contains the candidate;
 * -1 when no such entity exists.
 */
private static int closestNEDistance(Token head, Token candidate, Sentence s) {
    int lowestDistance = -1;
    for (NamedEntity ne : s.getNamedEntities()) {
        int distance = ne.getTokens().get(0).getSentencePosition() - head.getSentencePosition();
        if (distance >= 0 && ne.getTokens().contains(candidate)
                && (distance < lowestDistance || lowestDistance < 0)) {
            lowestDistance = distance;
        }
    }
    return lowestDistance;
}

/** True iff the token directly after t in s has the given orth. */
private static boolean isNextOrth(Token t, Sentence s, String orth) {
    int pos = s.indexOf(t);
    // pos < 0 means t does not belong to s; the previous code computed
    // indexOf(t) + 1 first and so erroneously inspected the sentence's
    // first token in that case.
    if (pos < 0 || pos + 1 >= s.size())
        return false;
    return s.get(pos + 1).getOrth().equals(orth);
}

/** True iff the token directly after t is a colon. */
private static boolean isNextColon(Token t, Sentence s) {
    return isNextOrth(t, s, ":");
}

/** True iff the token directly after t is a dot. */
private static boolean isNextDot(Token t, Sentence s) {
    return isNextOrth(t, s, ".");
}

/**
 * True iff the candidate is a quotation mark closing a quote opened
 * between the head and the candidate (odd number of marks in between).
 */
private static boolean candidateIsClosingQM(Token head, Token candidate, Sentence s) {
    if (!candidate.getOrth().equals("\""))
        return false;

    // Normalise the scanned span so it always runs left-to-right.
    int start = head.getSentencePosition();
    int end = candidate.getSentencePosition() - 1;
    if (head.compareTo(candidate) > 0) {
        start = candidate.getSentencePosition() + 1;
        end = head.getSentencePosition();
    }

    int quoteMarks = 0;
    for (Token t : s) {
        if (end == t.getSentencePosition())
            break;
        if (t.getSentencePosition() >= start && t.getOrth().equals("\""))
            quoteMarks++;
    }
    return (quoteMarks % 2) != 0;
}

/** True iff both tokens belong to the same syntactic word. */
private static boolean sameWord(Token t1, Token t2, Sentence s) {
    for (SyntacticWord w : s.getSyntacticWords()) {
        if (w.getTokens().contains(t1) && w.getTokens().contains(t2))
            return true;
    }
    return false;
}
s.getSyntacticWords()) { + if (w.getTokens().contains(t1) && w.getTokens().contains(t2)) { + return Boolean.valueOf(true); + } + } + return Boolean.valueOf(false); + } + + private static boolean sameNE(Token t1, Token t2, Sentence s) { + + for (NamedEntity ne : s.getNamedEntities()) { + if (ne.getTokens().contains(t1) && ne.getTokens().contains(t2)) { + return Boolean.valueOf(true); + } + } + return Boolean.valueOf(false); + } + + private static boolean sameNG(Token head, Token candidate, Sentence s) { + + for (SyntacticGroup group : s.getGroups()) { + if (group.getType().startsWith("NG")) { + if (group.getSemanticHeadTokens().contains(head) && group.getTokens().contains(candidate)) { + return Boolean.valueOf(true); + } + } + } + return Boolean.valueOf(false); + } + + private static boolean sameWalentyConstruction(Token head, Token candidate, Sentence s, + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { + + for (SyntacticGroup group : s.getGroups()) { + if (group.getType().startsWith("NG")) { + ArrayList<SyntacticGroup> nestedGroups = new ArrayList<SyntacticGroup>(); + nestedGroups.add(group); + + SyntacticGroup nextGroup = group.getFollowingGroup(); + while (nextGroup != null) { + nestedGroups.add(nextGroup); + nextGroup = nextGroup.getFollowingGroup(); + } + + List<Token> extendedGroupSegments = getExtendedGroupSegments(nestedGroups, valence.get(ValenceDicts.NounsValence)); + List<Token> extendedGroupHeads = getExtendedGroupHeads(nestedGroups); + if (extendedGroupHeads.contains(head) && extendedGroupSegments.contains(candidate)) + return Boolean.valueOf(true); + } + } + return Boolean.valueOf(false); + } + + private static List<Token> getExtendedGroupSegments(ArrayList<SyntacticGroup> nestedGroups, + Map<String,ArrayList<String>> walentyNouns) { + + SyntacticGroup initialGroup = nestedGroups.get(0); + String initialGroupHead = initialGroup.getSemanticHeadTokens().get(0).getBase(); + + List<Token> heads = initialGroup.getSemanticHeadTokens(); + 
List<Token> segments = new ArrayList<Token>(); + + if (!walentyNouns.containsKey(initialGroupHead)) { + segments.addAll(initialGroup.getTokens()); + } else { + + ArrayList<String> schemata = walentyNouns.get(initialGroupHead); + ArrayList<ArrayList<String>> groupsRealizations = new ArrayList<ArrayList<String>>(); + ArrayList<SyntacticGroup> largestMatch = new ArrayList<SyntacticGroup>(); + largestMatch.add(initialGroup); + + for (int i=1; i < nestedGroups.size(); i++) { + SyntacticGroup group = nestedGroups.get(i); + ArrayList<String> realizations = group.getWalentyRealizations(); + groupsRealizations.add(realizations); + if (realizationsMatch(schemata, groupsRealizations)) { + largestMatch.add(group); + } else { + break; + } + } + + for (SyntacticGroup group : largestMatch) { + segments.addAll(group.getTokens()); + } + + } + return segments; + } + + private static List<Token> getExtendedGroupHeads(ArrayList<SyntacticGroup> nestedGroups) { + + SyntacticGroup initialGroup = nestedGroups.get(0); + + List<Token> heads = initialGroup.getSemanticHeadTokens(); + + return heads; + } + + private static boolean realizationsMatch(ArrayList<String> schemata, + ArrayList<ArrayList<String>> groupsRealizations) { + for (String schema : schemata) { + if (isProperSchema(schema, groupsRealizations)) { + return true; + } + } + return false; + } + + private static boolean isProperSchema(String schema, + ArrayList<ArrayList<String>> groupsRealizations) { + + ArrayList<ArrayList<String>> matchingPositions = new ArrayList<ArrayList<String>>(); + for (ArrayList<String> realizations : groupsRealizations) { + matchingPositions.add(getMatchingPositions(schema, realizations)); + } + + if (matchingPositionsExists(matchingPositions)) { + return true; + /*ArrayList<ArrayList<String>> product = cartesianProduct(matchingPositions); + for (ArrayList<String> combination : product) { + Set<String> combinationSet = new HashSet<String>(combination); + if (combinationSet.size() == 
matchingPositions.size()) { + return true; + } + }*/ + } + return false; + } + + private static ArrayList<String> getMatchingPositions(String schema, ArrayList<String> phraseRealizations) { + ArrayList<String> positions = new ArrayList<String>(); + for (String position : schema.split("\\s\\+\\s")) { + position = position.trim(); + position = position.substring(1, position.length()-1); + for (String phrT : position.split(";")) { + if (phraseRealizations.contains(phrT.trim())) { + positions.add(position); + break; + } + } + } + return positions; + } + + private static boolean matchingPositionsExists(ArrayList<ArrayList<String>> matchingPositions) { + for (ArrayList<String> positions : matchingPositions) { + if (positions.isEmpty()) { + return false; + } + } + return true; + } + + private static boolean sameToken(Token t1, Token t2) { + if (t1.compareTo(t2) == 0) { + return Boolean.valueOf(true); + } + return Boolean.valueOf(false); + } + ////////////////////////////////// + + private static void addTokenFeatures(Map<String, Object> features, String label, Token t, Sentence s) { + features.put(String.format("%sCtag", label), t.getChosenInterpretation().getCtag()); + features.put(String.format("%sNumber", label), t.getChosenInterpretation().getNumber()); + features.put(String.format("%sGender", label), t.getChosenInterpretation().getGender()); + features.put(String.format("%sPerson", label), t.getChosenInterpretation().getPerson()); + features.put(String.format("%sWordCtag", label), wordCtag(t, s)); + + features.put(String.format("%sNextCtag", label), getNeighbouringTag(s, t, 1)); + features.put(String.format("%sPrevCtag", label), getNeighbouringTag(s, t, -1)); + + + Token next = getNeighbouringToken(s, t, 1); + if (next != null) { + features.put(String.format("%sNextWordCtag", label), wordCtag(next, s)); + } else { + features.put(String.format("%sNextWordCtag", label), "None"); + } + + Token previous = getNeighbouringToken(s, t, -1); + if (previous != null) { + 
features.put(String.format("%sPrevWordCtag", label), wordCtag(previous, s)); + } else { + features.put(String.format("%sPrevWordCtag", label), "None"); + } + +// features.put(String.format("%sNextNextCtag", label), getNeighbouringTag(s, t, 2)); +// features.put(String.format("%sPrevPrevCtag", label), getNeighbouringTag(s, t, -2)); + +// features.put(String.format("%sSentPosition", label), t.getSentencePosition()); + + +// features.put(String.format("%sPrevPraet", label), isPrevPraet(t, s)); +// features.put(String.format("%sPrevComma", label), isPrevComma(t, s)); +// features.put(String.format("%sPrev2Pred", label), isPrev2Pred(t, s)); +// features.put(String.format("%sNextInf", label), isNextInf(t, s)); + +/* List<Token> clause = getClause(s, t); + if (clause != null) + features.put(String.format("%sClauseLength", label), clause.size()); + else + features.put(String.format("%sClauseLength", label), 0);*/ + + /*addFeatures(features, clause, String.format("%sClause", label), t); + addFeatures(features, s, String.format("%sSent", label), t);*/ +// for (int i = 1; i < 6; i++) // zrobic to ale w oknie od head do candidate +// addFeatures(features, getWindow(s, t, i, 0), String.format("%sWindow_", label) + i + "_" + 0, t); +// for (int i = 1; i < 6; i++) +// addFeatures(features, getWindow(s, t, 0, i), String.format("%sWindow_", label) + 0 + "_" + i, t); +// for (int i = 1; i < 6; i++) +// addFeatures(features, getWindow(s, t, i, i), String.format("%sWindow_", label) + i + "_" + i, t); + } + + private static String wordCtag(Token t, Sentence s) { + for (SyntacticWord w : s.getSyntacticWords()) { + if (w.getTokens().contains(t)) { + return w.getCtag(); + } + } + return "None"; + } + + private static boolean isNextInf(Token m, Sentence s) { + boolean now = false; + for (Token morph : s) { + if (now) + return morph.getChosenInterpretation().getCtag().equals("inf"); + if (m.equals(morph)) + now = true; + } + return false; + } + + private static boolean isPrev2Pred(Token m, 
Sentence s) { + Token prev = null; + Token prev2 = null; + for (Token morph : s) { + if (m.equals(morph)) + break; + prev2 = prev; + prev = morph; + } + return (prev != null && prev.getChosenInterpretation().getCtag().equals("pred")) + || (prev2 != null && prev2.getChosenInterpretation().getCtag().equals("pred")); + } + + private static Object isPrevComma(Token m, Sentence s) { + Token prev = null; + for (Token morph : s) { + if (m.equals(morph)) + break; + prev = morph; + } + return prev != null && prev.getChosenInterpretation().getBase().equals(","); + } + + private static String getNeighbouringTag(Sentence s, Token m, int i) { + int idx = s.indexOf(m) + i; + if (idx >= s.size() || idx < 0) + return "None"; + return s.get(idx).getChosenInterpretation().getCtag(); + } + + private static Token getNeighbouringToken(Sentence s, Token m, int i) { + int idx = s.indexOf(m) + i; + if (idx >= s.size() || idx < 0) + return null; + return s.get(idx); + } + + private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) { + + boolean hasNom = false; // 1 + boolean hasNum = false; // 2 + boolean hasPOG = false; // 3 + + boolean hasNomNum = false; + boolean hasNumPOG = false; + boolean hasNomPOG = false; + boolean hasNomNumPOG = false; + + boolean has2Nom = false; + boolean has2NomPOG = false; + boolean has2POG = false; + + Token prev = null; + for (Token candidate : clause) { + + if (!isNoun(candidate) || isJakJako(prev)) { + prev = candidate; + continue; + } + + // nom, nom2 + if (isNom(candidate)) { + if (hasNom) + has2Nom = true; + hasNom = true; + } + // num + if (agreedNum(candidate, m)) { + hasNum = true; + } + // pog, pog2 + if (agreedGenderOrPerson(candidate, m)) { + if (hasPOG) + has2POG = true; + hasPOG = true; + } + + // nom num, nom num pog + if (isNom(candidate) && agreedNum(candidate, m)) { + if (agreedGenderOrPerson(candidate, m)) + hasNomNumPOG = true; + hasNomNum = true; + } + + // nom pog, num pog + if 
/**
 * Adds agreement features computed over the given token span: presence of
 * nominative nouns (nom), number agreement with keyword m (num) and
 * person-or-gender agreement with m (POG), plus their combinations and
 * second occurrences. Tokens preceded by "jak"/"jako" are skipped as
 * comparative phrases; non-noun tokens are skipped entirely.
 */
private static void addFeatures(Map<String, Object> features, List<Token> clause, String prefix, Token m) {

    boolean hasNom = false; // 1
    boolean hasNum = false; // 2
    boolean hasPOG = false; // 3

    boolean hasNomNum = false;
    boolean hasNumPOG = false;
    boolean hasNomPOG = false;
    boolean hasNomNumPOG = false;

    boolean has2Nom = false;
    boolean has2NomPOG = false;
    boolean has2POG = false;

    Token prev = null;
    for (Token candidate : clause) {

        // Only nouns are considered; "jak"/"jako" introduces a comparison,
        // so the following noun is not a real candidate.
        if (!isNoun(candidate) || isJakJako(prev)) {
            prev = candidate;
            continue;
        }

        // nom, nom2
        if (isNom(candidate)) {
            if (hasNom)
                has2Nom = true;
            hasNom = true;
        }
        // num
        if (agreedNum(candidate, m)) {
            hasNum = true;
        }
        // pog, pog2
        if (agreedGenderOrPerson(candidate, m)) {
            if (hasPOG)
                has2POG = true;
            hasPOG = true;
        }

        // nom num, nom num pog
        if (isNom(candidate) && agreedNum(candidate, m)) {
            if (agreedGenderOrPerson(candidate, m))
                hasNomNumPOG = true;
            hasNomNum = true;
        }

        // nom pog, num pog
        if (agreedGenderOrPerson(candidate, m))
            if (isNom(candidate)) {
                if (hasNomPOG)
                    has2NomPOG = true;
                hasNomPOG = true;
            } else if (agreedNum(candidate, m))
                hasNumPOG = true;

        prev = candidate;
    }

    // features.put("conj_" + prefix, hasConj);
    features.put("cand_2_nom_" + prefix, has2Nom);
    features.put("cand_2_POG_" + prefix, has2POG);
    features.put("cand_2_nom+POG_" + prefix, has2NomPOG);

    features.put("cand_nom_" + prefix, hasNom);
    features.put("cand_num_" + prefix, hasNum);
    features.put("cand_POG_" + prefix, hasPOG);

    features.put("cand_nom+num_" + prefix, hasNomNum);
    features.put("cand_nom+num+POG_" + prefix, hasNomNumPOG);
    features.put("cand_nom+POG_" + prefix, hasNomPOG);
    features.put("cand_num+POG_" + prefix, hasNumPOG);
}

/**
 * Returns the tokens within a window of pre tokens before and post tokens
 * after m (m itself included), clipped to the sentence boundaries.
 */
private static List<Token> getWindow(Sentence s, Token m, int pre, int post) {

    int idx = s.indexOf(m);
    int from = Math.max(0, idx - pre);
    int to = Math.min(s.size(), idx + post + 1);

    return new ArrayList<>(s.subList(from, to));
}

/** True iff the token directly preceding m is an l-participle ("praet"). */
private static boolean isPrevPraet(Token m, Sentence s) {
    Token prev = null;
    for (Token morph : s) {
        if (m.equals(morph))
            break;
        prev = morph;
    }
    return prev != null && prev.getChosenInterpretation().getCtag().equals("praet");
}

/**
 * Clause segmentation: sentences are split on commas and on comma-less
 * conjunctions (i, albo, lub); as in summarization, a clause must contain
 * a finite verb form.
 *
 * @param s
 *            sentence
 * @param m2
 *            token
 * @return the clause containing the token, or null when none does
 */
public static List<Token> getClause(Sentence s, Token m2) {

    List<List<Token>> sublists = getClauses(s);

    for (List<Token> sub : sublists)
        for (Token m : sub)
            if (m.equals(m2))
                return sub;

    return null;
}
/**
 * Splits the sentence into clauses. A split happens on a strict delimiter
 * ("?", "!") or on a soft delimiter (conjunction/punctuation from the
 * CLAUSE_SPLIT_LEMMAS sets) once the current clause already contains a
 * finite verb. Tokens inside syntactic groups/words are never split.
 * A trailing verbless fragment is merged into the previous clause, and a
 * clause starting with a relative pronoun is merged into its predecessor.
 */
public static List<List<Token>> getClauses(Sentence s) {

    // Tokens that are not the last token of their group/word must not be
    // followed by a clause boundary.
    Set<Token> noSplitMorphs = new HashSet<>();
    for (SyntacticGroup g : s.getGroups()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }
    for (SyntacticWord g : s.getSyntacticWords()) {
        for (Token m : g.getTokens().subList(0, g.getTokens().size() - 1)) {
            noSplitMorphs.add(m);
        }
    }

    LinkedList<List<Token>> sublists = new LinkedList<>();
    List<Token> currentSublist = new ArrayList<>();
    boolean clauseHasVerb = false;
    for (Token m : s) {
        String base = m.getChosenInterpretation().getBase();
        if (!noSplitMorphs.contains(m)
                && (CLAUSE_SPLIT_LEMMAS_STRICT.contains(base) || ((CLAUSE_SPLIT_LEMMAS.contains(base) || CLAUSE_SPLIT_LEMMAS2
                        .contains(base)) && clauseHasVerb))) {
            sublists.add(currentSublist);
            currentSublist = new ArrayList<>();
            clauseHasVerb = false;
        } else {
            if (isVerb(m))
                clauseHasVerb = true;
        }
        // The delimiter token itself starts the next clause.
        currentSublist.add(m);
    }
    if (currentSublist.size() > 0) {
        if (clauseHasVerb)
            sublists.add(currentSublist);
        else if (!sublists.isEmpty())
            sublists.getLast().addAll(currentSublist);
    }

    // Merge a clause beginning with a relative pronoun (within its first
    // two tokens) into the previous clause.
    List<Token> prev = null;
    Iterator<List<Token>> it = sublists.iterator();
    while (it.hasNext()) {
        List<Token> sublist = it.next();
        boolean containsRelPron = false;
        int i = 1;
        for (Token m : sublist) {
            if (i > 2)
                break;
            if (ZAIMKI_WZGLEDNE_LEMMAS.contains(m.getChosenInterpretation().getBase())) {
                containsRelPron = true;
                break;
            }
            i++;
        }
        if (prev != null && containsRelPron) {
            prev.addAll(sublist);
            it.remove();
        } else
            prev = sublist;
    }

    return sublists;
}

/**
 * True iff candidate and keyword agree in grammatical number.
 * NOTE(review): assumes the keyword carries a non-null number attribute --
 * NPE otherwise; confirm against the tagset guarantees.
 */
private static boolean agreedNum(Token candidate, Token keyword) {
    String keywordNum = keyword.getNumber();
    String wordNum = candidate.getNumber();
    return keywordNum.equals(wordNum);
}
verbs have number:person + String keywordPerson = keyword.getPerson(); + String wordPerson = "ter"; // default + if (PRONOUN_TAGS.contains(candidate.getCtag())) + wordPerson = candidate.getPerson(); + return wordPerson.equals(keywordPerson); + } + } + + private static boolean isJakJako(Token prev) { + String base = prev == null ? null : prev.getBase(); + return prev != null && (base.equals("jak") || base.equals("jako")); + } + + private static boolean isPraet(Token keyword) { + return keyword.getCtag().equals("praet"); + } + + private static boolean isNom(Token candidate) { + return "nom".equals(candidate.getCase()); // dziala dla rzeczownikow + // tylko! + } + + public static boolean isNoun(Token m) { + return NOUN_TAGS.contains(m.getCtag()); + } + + public static boolean isNoun(Mention m) { + return NOUN_TAGS.contains(m.getHeadSegments().get(0).getCtag()); + } + + public static boolean isVerb(Token morph) { + return VERB_TAGS.contains(morph.getCtag()); + } + + public static boolean isVerb(Mention m) { + boolean hasOnlyVerbs = true; + for (Token morph : m.getSegments()) + if (!isVerb(morph)) { + hasOnlyVerbs = false; + break; + } + return hasOnlyVerbs; + } + + public static boolean isVerb(TEIMention m) { + boolean hasOnlyVerbs = true; + for (TEIMorph morph : m.getMorphs()) + if (!isVerb(morph)) { + hasOnlyVerbs = false; + break; + } + return hasOnlyVerbs; + } + + private static boolean isVerb(TEIMorph morph) { + return VERB_TAGS.contains(morph.getChosenInterpretation().getCtag()); + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java new file mode 100644 index 0000000..69c23ee --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/InstanceCreator.java @@ -0,0 +1,280 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
pl.waw.ipipan.zil.core.md.Main.ValenceDicts; +import pl.waw.ipipan.zil.core.md.entities.*; +import pl.waw.ipipan.zil.core.md.io.tei.TeiLoader; +import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.TEICorpusText; +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.IOUtils; +import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO; +import weka.core.Attribute; +import weka.core.FastVector; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.File; +import java.util.*; +import java.util.Map.Entry; + +public class InstanceCreator { + + private static final Logger logger = LoggerFactory.getLogger(InstanceCreator.class); + private static final TEI_IO teiIO = TEI_IO.getInstance(); + + private InstanceCreator() { + } + + public static List<TreeMap<String, Object>> loadExamples(File dataDir, Set<String> quasiVerbs, + Map<ValenceDicts,Map<String,ArrayList<String>>> valence) { + int allTexts = 0; + int exceptions = 0; + int allSentences = 0; + + List<TreeMap<String, Object>> examples = new ArrayList<>(); + for (File textDir : IOUtils.getNKJPDirs(dataDir)) { + try { + allTexts++; + logger.info("Processing text " + textDir); + TEICorpusText ct = teiIO.readFromNKJPDirectory(textDir); + Text text = TeiLoader.loadTextFromTei(ct, textDir); + + for (Paragraph p : text) + for (Sentence s : p) { + allSentences++; + loadExamplesFromSentence(quasiVerbs, valence, examples, s); + } + + } catch (Exception e) { + //logger.error(e.getLocalizedMessage()); + e.printStackTrace(); + exceptions++; + } + } + + logger.info(allTexts + " texts found."); + if (exceptions != 0) + logger.error(exceptions + " texts with exceptions."); + logger.info(allSentences + " sentences found."); + + return examples; + } + + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, + List<TreeMap<String, Object>> examples, Sentence s) { + + + ArrayList<Token> heads = new ArrayList<>(); + for (Mention m : s.getMentions()) { + 
heads.addAll(m.getHeadSegments()); + } + + // collect positive examples + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); + for (Mention m : s.getMentions()) { + if (heads.containsAll(m.getHeadSegments())) { + positives.put(m.getHeadSegments().get(0), m.getSegments()); + } + } + + for (Token head : s) { + if (heads.contains(head)) { + for (Token t : s) { + //if (head.compareTo(t) != 0) {// && Math.abs(head.getSentencePosition() - t.getSentencePosition()) <= window) { + TreeMap<String, Object> features = new TreeMap<>(); + if (positives.containsKey(head) && positives.get(head).contains(t)) { + features.put("class", Boolean.valueOf(true)); + //features.put("candidatePositionInMention", positionInMention(head, t, s)); + + } else { + features.put("class", Boolean.valueOf(false)); + //features.put("candidatePositionInMention", 0); + } + + + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); + //features.put("candidatePositionInMention", positionInMention(head, t, s)); + addPreviousStates(features, head, t, s); + + examples.add(features); + // } + } + } + } + } + + public static void addPreviousStates(Map<String, Object> features, Token head, Token candidate, Sentence s) { + int context = 1; + int candidateLocation = candidate.getSentencePosition(); + for (int i = 1; i <= context; i++) { + if (candidateLocation - i < 0) { + features.put(String.format("location-%d", i), Boolean.valueOf(false)); + } else if (sameMention(s.get(candidateLocation - i), head, s) ) { + features.put(String.format("location-%d", i), Boolean.valueOf(true)); + } else { + features.put(String.format("location-%d", i), Boolean.valueOf(false)); + } + } + } + + public static int positionInMention(Token head, Token t, Sentence s) { + + Token previous = null; + if (t.getSentencePosition()-1 >= 0) { + previous = s.get(t.getSentencePosition()-1); + } else { + return 0; + } + + for (Mention m : s.getMentions()) { + if (m.getHeadSegments().contains(head) && 
m.getSegments().contains(previous)) { +/* if (m.getSegments().get(0).getSentencePosition() - t.getSentencePosition() <= -1) { + System.out.println(m.getSegments().get(0)); + System.out.println(t); + System.out.println(m.getSegments()); + }*/ + return previous.getSentencePosition() - m.getSegments().get(0).getSentencePosition(); + } + } + return 0; + } + + private static boolean sameMention(Token t1, Token t2, Sentence s) { + for (Mention m : s.getMentions()) { + if (m.getSegments().contains(t1) && m.getSegments().contains(t2)) { + return true; + } + } + return false; + } + + public static void loadExamplesFromSentence(Set<String> quasiVerbs, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, + List<TreeMap<String, Object>> examples, Sentence s, List<Token> heads) { + + + if (heads == null || heads.isEmpty()) + return; + + // collect positive examples + HashMap<Token, List<Token>> positives = new HashMap<Token, List<Token>>(); + for (Mention m : s.getMentions()) { + if (heads.containsAll(m.getHeadSegments())) { + positives.put(m.getHeadSegments().get(0), m.getSegments()); + } + } + + for (Token head : s) { + if (heads.contains(head)) { + for (Token t : s) { + TreeMap<String, Object> features = new TreeMap<>(); + + if (positives.containsKey(head) && positives.get(head).contains(t)) { + features.put("class", Boolean.valueOf(true)); + //features.put("candidatePositionInMention", positionInMention(head, t, s)); + + } else { + features.put("class", Boolean.valueOf(false)); + //features.put("candidatePositionInMention", 0); + } + + FeatureGeneration.generateFeatures(features, valence, head, t, s, heads); + //features.put("candidatePositionInMention", positionInMention(head, t, s)); + addPreviousStates(features, head, t, s); + examples.add(features); + } + } + } + } + + public static Instances createInstances(List<TreeMap<String, Object>> examples, String classFeatureName) { + + TreeSet<String> booleanAttsOccurred = new TreeSet<>(); + TreeSet<String> 
doubleAttsOccurred = new TreeSet<>(); + TreeMap<String, Set<String>> att2values = new TreeMap<>(); + for (TreeMap<String, Object> example : examples) { + for (Entry<String, Object> e : example.entrySet()) { + String key = e.getKey(); + Object val = e.getValue(); + if (val instanceof Integer || val instanceof Double) { + doubleAttsOccurred.add(key); + continue; + } + if (val instanceof Boolean) { + booleanAttsOccurred.add(key); + continue; + } + if (!att2values.containsKey(key)) + att2values.put(key, new HashSet<>()); + att2values.get(key).add(val.toString()); + } + } + + List<Attribute> atts = new ArrayList<>(); + + // double attributes + for (String attName : doubleAttsOccurred) { + Attribute att = new Attribute(attName); + atts.add(att); + } + + // boolean attributes (treated as nominal) + FastVector values = new FastVector(2); + values.addElement("false"); + values.addElement("true"); + for (String attName : booleanAttsOccurred) { + Attribute att = new Attribute(attName, values); + atts.add(att); + } + + // nominal attributes + for (Entry<String, Set<String>> attVals : att2values.entrySet()) { + FastVector vals = new FastVector(attVals.getValue().size()); + for (String val : attVals.getValue()) + vals.addElement(val); + Attribute att = new Attribute(attVals.getKey(), vals); + atts.add(att); + } + + FastVector fvWekaAttributes = new FastVector(atts.size()); + for (Attribute attr : atts) { + fvWekaAttributes.addElement(attr); + } + + Instances data = new Instances("Nominal", fvWekaAttributes, 10); + data.setClass(data.attribute(classFeatureName)); + return data; + } + + public static void fillInstances(List<TreeMap<String, Object>> examples, Instances instances) { + for (TreeMap<String, Object> example : examples) { + addInstance(example, instances); + } + } + + public static void addInstance(TreeMap<String, Object> example, Instances instances) { + Instance instance = new Instance(instances.numAttributes()); + + for (Entry<String, Object> e : example.entrySet()) 
{ + Object val = e.getValue(); + String name = e.getKey(); + if (val instanceof Integer) { + instance.setValue(instances.attribute(name), (int) val); + } else if (val instanceof Boolean) { + instance.setValue(instances.attribute(name), ((Boolean) val) ? "true" : "false"); + } else { + int indexOfValue = instances.attribute(name).indexOfValue(val.toString()); + if (indexOfValue == -1) { + logger.debug("Unknown value: " + val.toString() + " of feature: " + name + + ". Marking as missing value."); + instance.setMissing(instances.attribute(name)); + } else + instance.setValue(instances.attribute(name), indexOfValue); + } + } + + instance.setDataset(instances); + instances.add(instance); + } + +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java new file mode 100644 index 0000000..6b9eba7 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Model.java @@ -0,0 +1,54 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import pl.waw.ipipan.zil.core.md.entities.Sentence; +import weka.classifiers.Classifier; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.Serializable; +import java.util.List; +import java.util.Set; +import java.util.TreeMap; + +public class Model implements Serializable { + + private static final long serialVersionUID = 3351727361273283076L; + private static final Logger logger = LoggerFactory.getLogger(Model.class); + + private Classifier classifier; + private Set<String> quasiVerbs; + private Instances instances; + + public Model(Classifier classifier, Instances instances, Set<String> quasiVerbs) { + this.classifier = classifier; + this.instances = instances; + this.quasiVerbs = quasiVerbs; + } + + public boolean arePartOfSameMention(Instance instance, Sentence sentence) { + try { + double response =
this.classifier.classifyInstance(instance); + return response > 0; + } catch (Exception e) { + logger.error("Error classifying verb in sentence: " + sentence, e); + return false; + } + } + + public Instances getInstances(List<TreeMap<String, Object>> examples) { + Instances instances = new Instances(this.instances); + InstanceCreator.fillInstances(examples, instances); + return instances; + } + + public Instances getInstances() { + Instances instances = new Instances(this.instances); + return instances; + } + + public Set<String> getQuasiVerbs() { + return quasiVerbs; + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java new file mode 100644 index 0000000..4fe1d0c --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/NominalMentionDetector.java @@ -0,0 +1,110 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import java.io.File; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; +import java.util.Map.Entry; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import pl.waw.ipipan.zil.core.md.Main.ValenceDicts; +import pl.waw.ipipan.zil.core.md.detection.nominal.FeatureGeneration; +import pl.waw.ipipan.zil.core.md.detection.nominal.InstanceCreator; +import pl.waw.ipipan.zil.core.md.detection.nominal.Model; +import pl.waw.ipipan.zil.core.md.detection.nominal.Serializer; +import pl.waw.ipipan.zil.core.md.entities.Mention; +import pl.waw.ipipan.zil.core.md.entities.Sentence; +import pl.waw.ipipan.zil.core.md.entities.Token; +import weka.core.Instances; + +public class NominalMentionDetector { + final private static Logger logger = LoggerFactory.getLogger(NominalMentionDetector.class); + + private Model model; + private Set<String> quasiVerbs = new HashSet<>();
+ + public void addNominalMentions(Sentence sentence, Map<ValenceDicts,Map<String,ArrayList<String>>> valence, List<Token> heads) { + List<TreeMap<String, Object>> examples = new ArrayList<>(); + InstanceCreator.loadExamplesFromSentence(quasiVerbs, valence, examples, sentence, heads); + if (examples.isEmpty()) + return; + + Instances instances = model.getInstances(); + + // label instances + List<Boolean> areInSameMention = new ArrayList<>(); + for (int i = 0; i < examples.size(); i++) { + TreeMap<String, Object> example = examples.get(i); + if (i - 1 < 0) { + example.put("location-1", Boolean.valueOf(false)); + //example.put("candidatePositionInMention", 0); + } else { + example.put("location-1", Boolean.valueOf(areInSameMention.get(i-1))); +// int positionInMention = 1; +// while (i - positionInMention >= 0 && areInSameMention.get(i-positionInMention)) { +// positionInMention++; +// } +// example.put("candidatePositionInMention", positionInMention-1); + } + + InstanceCreator.addInstance(example, instances); + boolean inSameMention = model.arePartOfSameMention(instances.instance(i), sentence); + areInSameMention.add(inSameMention); + } + + int i = 0; + for (Token head : sentence) { + if (heads.contains(head)) { + ArrayList<Token> mSegments = new ArrayList<Token>(); + ArrayList<Token> mHead = new ArrayList<Token>(); + mHead.add(head); + for (Token t : sentence) { + if (head.compareTo(t) != 0) { + if (areInSameMention.get(i)) { + mSegments.add(t); + } + } else { + mSegments.add(t); + } + i++; + } + + // cleaning + if(mSegments.get(mSegments.size()-1).getCtag().equals("prep") || mSegments.get(mSegments.size()-1).getCtag().equals("conj") || + mSegments.get(mSegments.size()-1).getCtag().equals("comp")) { + mSegments.remove(mSegments.size()-1); + } + if(mSegments.get(0).getCtag().equals("prep") || mSegments.get(0).getCtag().equals("conj") || + mSegments.get(0).getCtag().equals("comp")) { + mSegments.remove(0); + } + + sentence.addMention(new Mention(mSegments, mHead)); 
+ } + } + } + + public NominalMentionDetector(File zeroSubjectDetectionModel) { + try { + this.model = Serializer.loadModel(zeroSubjectDetectionModel.getAbsolutePath()); + this.quasiVerbs = this.model.getQuasiVerbs(); + } catch (Exception e) { + logger.error("Error loading model:" + e); + } + } + + public NominalMentionDetector(InputStream zeroSubjectDetectionModelStream) { + try { + this.model = Serializer.loadModelFromStream(zeroSubjectDetectionModelStream); + this.quasiVerbs = this.model.getQuasiVerbs(); + } catch (Exception e) { + logger.error("Error loading model:" + e); + } + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java new file mode 100644 index 0000000..df03f73 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Serializer.java @@ -0,0 +1,22 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import weka.core.SerializationHelper; + +import java.io.InputStream; + +public class Serializer { + + public static void saveModel(Model m, String targetModelFilePath) throws Exception { + SerializationHelper.write(targetModelFilePath, m); + } + + public static Model loadModel(String path) throws Exception { + Model m = (Model) SerializationHelper.read(path); + return m; + } + + public static Model loadModelFromStream(InputStream stream) throws Exception { + Model m = (Model) SerializationHelper.read(stream); + return m; + } +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java new file mode 100644 index 0000000..59f3293 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/detection/nominal/Trainer.java @@ -0,0 +1,231 @@ +package pl.waw.ipipan.zil.core.md.detection.nominal; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import pl.waw.ipipan.zil.core.md.Main; +import 
pl.waw.ipipan.zil.core.md.Main.ValenceDicts; +import pl.waw.ipipan.zil.core.md.detection.head.HeadDetector; +import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector; +import weka.classifiers.Evaluation; +import weka.classifiers.rules.JRip; +import weka.classifiers.rules.JRip.RipperRule; +import weka.classifiers.trees.J48; +import weka.core.Attribute; +import weka.core.Instance; +import weka.core.Instances; + +import java.io.*; +import java.util.*; + +public class Trainer { + + private static final Logger logger = LoggerFactory.getLogger(Trainer.class); + + private static final boolean DO_CV = false; + private static final String QUASI_LIST_PATH = "/quasi_verbs.txt"; + private static final String DEFAULT_VERBS_VALENCE = "/walenty_verbs.txt"; + private static final String DEFAULT_NOUNS_VALENCE = "/walenty_nouns.txt"; + + private static Map<ValenceDicts,Map<String,ArrayList<String>>> valence = + new EnumMap<>(ValenceDicts.class); // diamond: avoids raw-type unchecked assignment + + static { + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE); + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream)); + + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); + } + + private Trainer() { + } + + public static void main(String[] args) { + + if (args.length != 2) { + logger.error("Wrong number of arguments!
Should be: " + Trainer.class.getSimpleName() + + " trainDir targetModelFile"); + return; + } + + File dataDir = new File(args[0]); + String targetModelFilePath = args[1]; + + if (!dataDir.isDirectory()) { + logger.error(dataDir + " is not a directory!"); + return; + } + + Set<String> quasiVerbs = loadQuasiVerbs(); + + InputStream walentyVerbsStream = Main.class.getResourceAsStream(DEFAULT_VERBS_VALENCE); + valence.put(ValenceDicts.VerbsValence, readWalenty(walentyVerbsStream)); + + InputStream walentyNounsStream = Main.class.getResourceAsStream(DEFAULT_NOUNS_VALENCE); + valence.put(ValenceDicts.NounsValence, readWalenty(walentyNounsStream)); + + List<TreeMap<String, Object>> examples = InstanceCreator.loadExamples(dataDir, quasiVerbs, valence); + Instances instances = InstanceCreator.createInstances(examples, "class"); + InstanceCreator.fillInstances(examples, instances); + + printStats(instances); + + try { + J48 model; + + logger.info("Building final classifier..."); + model = new J48(); + model.buildClassifier(instances); + logger.info("J48 tree:"); + logger.info(model.toString()); + + instances.delete(); + logger.info("Features stats:"); + for (int i = 0; i < instances.numAttributes(); i++) { + Attribute att = instances.attribute(i); + logger.info(i + ".\t" + att.toString()); + } + + logger.info("Saving classifier..."); + Model m = new Model(model, instances, quasiVerbs); + Serializer.saveModel(m, targetModelFilePath); + logger.info("Done."); + + } catch (Exception e) { + logger.error("Error: " + e); + } + +/* try { + JRip model; + + if (DO_CV) { + logger.info("Crossvalidation..."); + model = new JRip(); + Evaluation eval = new Evaluation(instances); + eval.crossValidateModel(model, instances, 10, new Random(1)); + logger.info(eval.toSummaryString()); + logger.info(eval.toMatrixString()); + logger.info(eval.toClassDetailsString()); + } + + logger.info("Building final classifier..."); + model = new JRip(); + model.buildClassifier(instances); + 
logger.info(model.getRuleset().size() + " rules generated."); + for (int i = 0; i < model.getRuleset().size(); i++) { + RipperRule v = (RipperRule) model.getRuleset().elementAt(i); + logger.info("\t" + v.toString(instances.classAttribute())); + } + + instances.delete(); + logger.info("Features stats:"); + for (int i = 0; i < instances.numAttributes(); i++) { + Attribute att = instances.attribute(i); + logger.info(i + ".\t" + att.toString()); + } + + logger.info("Saving classifier..."); + Model m = new Model(model, instances, quasiVerbs); + Serializer.saveModel(m, targetModelFilePath); + logger.info("Done."); + + } catch (Exception e) { + logger.error("Error: " + e); + }*/ + } + + public static Map<String,ArrayList<String>> readWalenty(InputStream walentySchemataStream) + { + Map<String,ArrayList<String>> map; + try { + BufferedReader br=new BufferedReader(new InputStreamReader(walentySchemataStream)); + map = new HashMap<String,ArrayList<String>>(); + String line; + boolean firstLine = true; + while((line = br.readLine()) != null) { + if (firstLine) { + line = line.replace("\uFEFF", ""); // remove BOM character + firstLine = false; + } + + if (!line.startsWith("%")) { + String[] lineParts = line.split(":"); + String lemma = lineParts[0].trim(); + String schema = lineParts[5].trim(); + + if (schema.trim().isEmpty()) { + continue; + } + + String[] lemmaParts = lemma.split(" "); + if(lemmaParts.length == 1 && schemaContainsSie(schema)) { + lemma = lemma + " się"; + } + + ArrayList<String> schemata; + if (!map.containsKey(lemma)) { + schemata = new ArrayList<String>(); + schemata.add(schema); + map.put(lemma, schemata); + } else { + schemata = map.get(lemma); + schemata.add(schema); + map.put(lemma, schemata); + } + } + } + br.close(); + } catch (IOException ex) { + ex.printStackTrace(); + throw new RuntimeException(ex); + } + return map; + } + + private static boolean schemaContainsSie(String schema) { + for (String position : schema.split("\\s\\+\\s")) { + position = 
position.trim(); + position = position.substring(1, position.length()-1); + for (String phrT : position.split(";")) { + if (phrT.equals("refl") || phrT.equals("recip")) { + return true; + } + } + } + + return false; + } + + private static Set<String> loadQuasiVerbs() { + Set<String> quasiVerbs = new HashSet<>(); + InputStream stream = Trainer.class.getResourceAsStream(QUASI_LIST_PATH); + try (BufferedReader br = new BufferedReader(new InputStreamReader(stream))) { + String line; + while ((line = br.readLine()) != null) { + quasiVerbs.add(line.trim()); + } + } catch (IOException e) { + logger.error(e.getLocalizedMessage(), e); + } + return quasiVerbs; + } + + private static void printStats(Instances instances) { + int positive = 0; + int negative = 0; + for (int i = 0; i < instances.numInstances(); i++) { + Instance inst = instances.instance(i); + if (inst.classValue() > 0) // nominal index 1 == "true" (see InstanceCreator boolean attrs) -> positive + positive++; + else + negative++; + } + logger.info(positive + " positive examples"); + logger.info(negative + " negative examples"); + logger.info((positive + negative) + " examples total"); + logger.info((instances.numAttributes() - 1) + " attributes"); + logger.info(instances.toSummaryString()); + } + +} diff --git a/src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java new file mode 100644 index 0000000..aa16374 --- /dev/null +++ b/src/main/java/pl/waw/ipipan/zil/core/md/entities/Relation.java @@ -0,0 +1,21 @@ +package pl.waw.ipipan.zil.core.md.entities; + +public class Relation { + + private String name; + private Token target; + + public Relation(String name, Token target) { + this.name = name; + this.target = target; + } + + public String getName() { + return name; + } + + public Token getTarget() { + return target; + } + +}