// Detector.java 5.06 KB
package pl.waw.ipipan.zil.core.md.detection;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.detection.zero.ZeroSubjectDetector;
import pl.waw.ipipan.zil.core.md.entities.*;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;

/**
 * Static entry point for mention detection.
 *
 * <p>For every sentence of a text, mention candidates are collected from several
 * annotation layers (tokens, syntactic words, named entities, syntactic groups,
 * a speaker-label heuristic and the zero-subject model), then pruned, and finally
 * every mention left without a head gets its first segment as head.
 *
 * <p>The class holds no state and cannot be instantiated.
 */
public final class Detector {

    private static final Logger logger = LoggerFactory.getLogger(Detector.class);

    // The ctag regexes are compiled once here instead of on every String.matches call.

    /** Compiled {@code Constants.MORPHO_CTAGS}; tokens whose ctag matches become mentions. */
    private static final Pattern MORPHO_CTAGS = Pattern.compile(Constants.MORPHO_CTAGS);

    /** Compiled {@code Constants.MORPHO_NOUN_CTAGS}; used to require a noun inside a named entity. */
    private static final Pattern MORPHO_NOUN_CTAGS = Pattern.compile(Constants.MORPHO_NOUN_CTAGS);

    /** Compiled {@code Constants.WORDS_CTAGS}; syntactic words whose ctag matches become mentions. */
    private static final Pattern WORDS_CTAGS = Pattern.compile(Constants.WORDS_CTAGS);

    private Detector() {
    }

    /**
     * Clears the mentions currently stored in {@code text} and detects them anew,
     * sentence by sentence.
     *
     * @param text             text to annotate; it is modified in place
     * @param zeroSubjectModel detector used to add zero-subject mentions to each sentence
     */
    public static void findMentionsInText(Text text,
                                          ZeroSubjectDetector zeroSubjectModel) {
        text.clearMentions();
        // Parameterized form: the message is only built when debug logging is enabled.
        logger.debug("Detecting mentions in text {}", text.getId());
        for (Paragraph paragraph : text) {
            for (Sentence sentence : paragraph) {
                detectMentionsInSentence(sentence, zeroSubjectModel);
            }
        }
    }

    /**
     * Runs the whole detection pipeline on a single sentence. The order of the steps
     * is kept exactly as is: candidates are added first, then zero subjects, then
     * pruning, and heads are filled in last.
     *
     * @param sentence         sentence to annotate; it is modified in place
     * @param zeroSubjectModel detector used to add zero-subject mentions
     */
    private static void detectMentionsInSentence(Sentence sentence,
                                                 ZeroSubjectDetector zeroSubjectModel) {
        // adding mentions
        addMentionsByTokenCtag(sentence);
        addMentionsBySyntacticWordsCtag(sentence);
        addMentionsByNamedEntities(sentence);
        addMentionsByGroups(sentence);
        addSpeakerMentionsInSpoken(sentence);

        // zero subject detection
        zeroSubjectModel.addZeroSubjectMentions(sentence);

        // removing mentions
        removeTo(sentence);
        Cleaner.cleanUnnecessarySentenceMentions(sentence);

        // updating mention heads
        updateMentionHeads(sentence);
    }

    /**
     * Heuristic: when a mention has no head segments, its first segment becomes the head.
     *
     * @param sentence sentence whose mentions are updated
     */
    private static void updateMentionHeads(Sentence sentence) {
        for (Mention mention : sentence.getMentions()) {
            if (mention.getHeadSegments().isEmpty()) {
                mention.addHeadSegment(mention.getFirstSegment());
            }
        }
    }

    /**
     * Heuristic for "to" in a sentence containing "jeśli", "jeżeli" or "skoro":
     * in such a sentence every single-segment mention whose segment has the base
     * form "to" is removed.
     *
     * @param sentence sentence whose mentions are pruned
     */
    private static void removeTo(Sentence sentence) {
        if (!containsConditionalConjunction(sentence)) {
            return;
        }

        // Collect first and remove afterwards: removing from the sentence while
        // iterating over its mentions could break the iteration.
        List<Mention> mentionsToRemove = new ArrayList<>();
        for (Mention mention : sentence.getMentions()) {
            List<Token> segments = mention.getSegments();
            if (segments.size() == 1 && "to".equals(segments.get(0).getBase())) {
                mentionsToRemove.add(mention);
            }
        }
        for (Mention mention : mentionsToRemove) {
            sentence.removeMention(mention);
        }
    }

    /**
     * Tells whether any token of the sentence has the orthographic form
     * "jeśli", "jeżeli" or "skoro" (exact, case-sensitive comparison).
     *
     * @param sentence sentence to scan
     * @return {@code true} if at least one such token is present
     */
    private static boolean containsConditionalConjunction(Sentence sentence) {
        for (Token token : sentence) {
            String orth = token.getOrth();
            if ("jeśli".equals(orth) || "jeżeli".equals(orth) || "skoro".equals(orth)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Heuristic for speaker labels such as "sp1:", "sp2:" or "MarszałekJAkistam:":
     * when the sentence has more than two tokens and the second one is ":",
     * the first token is added as a mention.
     *
     * @param sentence sentence to inspect and annotate
     */
    private static void addSpeakerMentionsInSpoken(Sentence sentence) {
        if (sentence.size() > 2) {
            Token first = sentence.get(0);
            Token second = sentence.get(1);
            if (":".equals(second.getOrth())) {
                sentence.addMention(new Mention(first));
            }
        }
    }

    /**
     * Finds and marks all NG* groups: every syntactic group whose type starts with
     * "NG" becomes a mention built from the group's tokens, with the group's
     * semantic head tokens as heads.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByGroups(Sentence sentence) {
        for (SyntacticGroup group : sentence.getGroups()) {
            if (group.getType().startsWith("NG")) {
                List<Token> segments = group.getTokens();
                List<Token> heads = group.getSemanticHeadTokens();

                sentence.addMention(new Mention(segments, heads));
            }
        }
    }

    /**
     * Finds and marks named entities: every named entity with at least one token
     * whose ctag matches {@code Constants.MORPHO_NOUN_CTAGS} becomes a mention.
     * The mention is created with an empty head list.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByNamedEntities(Sentence sentence) {
        for (NamedEntity ne : sentence.getNamedEntities()) {
            List<Token> tokens = ne.getTokens();
            if (!containsNoun(tokens)) {
                continue;
            }

            List<Token> headTokens = new ArrayList<>();
            sentence.addMention(new Mention(tokens, headTokens));
        }
    }

    /**
     * Tells whether any of the given tokens has a ctag matching
     * {@code Constants.MORPHO_NOUN_CTAGS}.
     *
     * @param tokens tokens to check
     * @return {@code true} if at least one token matches
     */
    private static boolean containsNoun(List<Token> tokens) {
        for (Token token : tokens) {
            if (MORPHO_NOUN_CTAGS.matcher(token.getCtag()).matches()) {
                return true;
            }
        }
        return false;
    }

    /**
     * Adds a mention for every syntactic word whose ctag matches
     * {@code Constants.WORDS_CTAGS}. A one-token word yields a single-token mention;
     * a longer word yields a mention over all its tokens with an empty head list.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsBySyntacticWordsCtag(Sentence sentence) {
        for (SyntacticWord word : sentence.getSyntacticWords()) {
            if (!WORDS_CTAGS.matcher(word.getCtag()).matches()) {
                continue;
            }
            List<Token> tokens = word.getTokens();
            if (tokens.size() == 1) {
                sentence.addMention(new Mention(tokens.get(0)));
            } else {
                List<Token> heads = new ArrayList<>();
                sentence.addMention(new Mention(tokens, heads));
            }
        }
    }

    /**
     * Adds a single-token mention for every token whose ctag matches
     * {@code Constants.MORPHO_CTAGS}.
     *
     * <p>NOTE(review): the original comment said that the syntactic-word level is used
     * instead of morphosyntax when it is available; this method itself always works on
     * tokens, so confirm whether that remark still applies.
     *
     * @param sentence sentence to annotate
     */
    private static void addMentionsByTokenCtag(Sentence sentence) {
        for (Token token : sentence) {
            if (MORPHO_CTAGS.matcher(token.getCtag()).matches()) {
                sentence.addMention(new Mention(token));
            }
        }
    }
}