// Cleaner.java 4.66 KB
package pl.waw.ipipan.zil.core.md.detection;

import pl.waw.ipipan.zil.core.md.entities.Mention;
import pl.waw.ipipan.zil.core.md.entities.Sentence;
import pl.waw.ipipan.zil.core.md.entities.Token;

import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;

/**
 * Prunes redundant mentions from a {@link Sentence}.
 *
 * <p>Cleaning runs in two passes. The first compares every pair of mentions
 * in the sentence and discards the "less important" one (see
 * {@link #getLessImportantMention(Mention, Mention)}) whenever the two
 * duplicate or partially overlap each other. The second is a heuristic for
 * structures such as {@code [[Ernest][Kwiecien]]}: mentions nested inside a
 * mention whose every token changes when lower-cased are discarded.
 */
public class Cleaner {

    /** Ctag value that marks a token as a conjunction in the same-head rule. */
    private static final String CONJ_CTAG = "conj";

    /**
     * Removes unnecessary mentions from the given sentence, in place.
     *
     * <p>Mentions are only collected while the mention list is iterated and
     * are removed afterwards via {@link Sentence#removeMention}, so the list
     * is never modified during iteration. The second pass reads the mention
     * list again after the first pass has removed its findings.
     *
     * @param sentence sentence whose mentions should be cleaned
     */
    public static void cleanUnnecessarySentenceMentions(Sentence sentence) {
        removeMentions(sentence, findRedundantInPairs(sentence.getMentions()));
        removeMentions(sentence, findNestedInCapitalized(sentence.getMentions()));
    }

    /** Removes every mention of {@code toRemove} from {@code sentence}. */
    private static void removeMentions(Sentence sentence, Collection<Mention> toRemove) {
        for (Mention mention : toRemove) {
            sentence.removeMention(mention);
        }
    }

    /**
     * First pass: for every unordered pair of mentions that is redundant
     * according to {@link #isRedundantPair}, collects the less important one.
     */
    private static Collection<Mention> findRedundantInPairs(List<Mention> mentions) {
        Collection<Mention> redundant = new HashSet<>();
        for (int i = 0; i < mentions.size(); i++) {
            Mention m1 = mentions.get(i);
            for (int j = i + 1; j < mentions.size(); j++) {
                Mention m2 = mentions.get(j);

                Mention lessImportant = getLessImportantMention(m1, m2);
                Mention moreImportant = m1 == lessImportant ? m2 : m1;

                if (isRedundantPair(m1, m2, moreImportant)) {
                    redundant.add(lessImportant);
                }
            }
        }
        return redundant;
    }

    /**
     * Tells whether one mention of the pair is redundant. The rules are
     * evaluated in a fixed order and evaluation stops at the first match.
     */
    private static boolean isRedundantPair(Mention m1, Mention m2, Mention moreImportant) {
        return m1.getSegments().equals(m2.getSegments()) // same mention borders
                || haveSameHeadWithoutConjunction(m1, m2, moreImportant)
                || isWholeMentionHeadOf(m1, m2)
                || isWholeMentionHeadOf(m2, m1)
                || overlapPartially(m1, m2);
    }

    /**
     * Same-head rule: both mentions have a non-empty, equal head and the more
     * important mention contains no conjunction token.
     */
    private static boolean haveSameHeadWithoutConjunction(Mention m1, Mention m2,
            Mention moreImportant) {
        if (m1.getHeadSegments().isEmpty() || m2.getHeadSegments().isEmpty()) {
            return false;
        }
        if (!m1.getHeadSegments().equals(m2.getHeadSegments())) {
            return false;
        }
        return !containsConjunction(moreImportant);
    }

    /** Tells whether any token of the mention has the ctag {@code "conj"}. */
    private static boolean containsConjunction(Mention mention) {
        for (Token seg : mention.getSegments()) {
            // Constant-first comparison: a null ctag means "not a conjunction"
            // instead of raising a NullPointerException.
            if (CONJ_CTAG.equals(seg.getChosenInterpretation().getCtag())) {
                return true;
            }
        }
        return false;
    }

    /**
     * Tells whether {@code headless} has no head of its own while its complete
     * segment list equals the (non-empty) head of {@code headed}.
     */
    private static boolean isWholeMentionHeadOf(Mention headless, Mention headed) {
        return headless.getHeadSegments().isEmpty()
                && !headed.getHeadSegments().isEmpty()
                && headed.getHeadSegments().equals(headless.getSegments());
    }

    /**
     * Tells whether the two mentions share tokens without one containing the
     * other, i.e. each of them also has tokens the other one lacks.
     */
    private static boolean overlapPartially(Mention m1, Mention m2) {
        Set<Token> onlyInM2 = new HashSet<>(m2.getSegments());
        onlyInM2.removeAll(m1.getSegments());

        Set<Token> onlyInM1 = new HashSet<>(m1.getSegments());
        onlyInM1.removeAll(m2.getSegments());

        // Something was dropped from either difference set, so the mentions
        // have at least one token in common.
        boolean shareTokens = onlyInM2.size() < m2.getSegments().size()
                || onlyInM1.size() < m1.getSegments().size();

        return shareTokens && !onlyInM2.isEmpty() && !onlyInM1.isEmpty();
    }

    /**
     * Second pass, a heuristic for removing things like
     * {@code [[Ernest][Kwiecien]]}: for every mention accepted by
     * {@link #isFullyCapitalized}, collects all other mentions attached to its
     * tokens whose segments are completely contained in that mention.
     */
    private static Collection<Mention> findNestedInCapitalized(List<Mention> mentions) {
        Collection<Mention> redundant = new HashSet<>();
        for (Mention outer : mentions) {
            if (!isFullyCapitalized(outer)) {
                continue;
            }

            // only for children of fully capitalized mentions
            Set<Mention> nested = new HashSet<>();
            for (Token seg : outer.getSegments()) {
                for (Mention candidate : seg.getMentions()) {
                    if (outer.getSegments().containsAll(candidate.getSegments())) {
                        nested.add(candidate);
                    }
                }
            }
            // The outer mention trivially contains itself; keep it.
            nested.remove(outer);

            redundant.addAll(nested);
        }
        return redundant;
    }

    /**
     * Tells whether every token of the mention has an orthographic form that
     * differs from its lower-cased form. A single token that is unchanged by
     * lower-casing disqualifies the mention.
     */
    private static boolean isFullyCapitalized(Mention mention) {
        for (Token seg : mention.getSegments()) {
            // Locale.ROOT keeps the check independent of the JVM default locale.
            if (seg.getOrth().toLowerCase(Locale.ROOT).equals(seg.getOrth())) {
                return false;
            }
        }
        return true;
    }

    /**
     * Picks the mention to discard from a redundant pair.
     *
     * @return {@code m2} when {@code m1} has strictly more segments,
     *         otherwise {@code m1} (so ties resolve to {@code m1})
     */
    private static Mention getLessImportantMention(Mention m1, Mention m2) {
        if (m1.getSegments().size() > m2.getSegments().size()) {
            return m2;
        }
        return m1;
    }
}