// TeiSaver.java (4.1 KB)
package pl.waw.ipipan.zil.core.md.io.tei;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod;

import java.io.File;
import java.util.*;

/**
 * Static helpers that copy mention annotations from the in-memory {@link Text} model into a
 * {@link TEICorpusText} and write the TEI text to disk.
 *
 * <p>The class is not instantiable; all functionality is exposed through static methods.
 */
public class TeiSaver {

    private static final Logger logger = LoggerFactory.getLogger(TeiSaver.class);
    private static final TEI_IO teiAPI = TEI_IO.getInstance();
    private static final EntitiesFactory ef = EntitiesFactory.getInstance();

    private TeiSaver() {
    }

    /**
     * Writes the given TEI text into {@code targetDir} via the TEI API.
     *
     * @param teiText   text to write
     * @param targetDir directory passed to {@code writeToNKJPDirectory}
     * @param gzip      {@code true} to use {@link CompressionMethod#GZIP}, {@code false} for
     *                  {@link CompressionMethod#NONE}
     * @throws TEIException propagated from the TEI API
     */
    public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
        logger.debug("Saving text in {}", targetDir);
        CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
        teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
    }

    /**
     * Replaces the mentions stored in {@code teiText} with the mentions found in {@code t}.
     *
     * <p>Paragraphs, sentences and tokens of both structures are walked in parallel, so they are
     * expected to line up one-to-one. New mentions receive ids of the form {@code mention_N},
     * where {@code N} starts at 0 and keeps increasing across the whole text. Afterwards the
     * mentions annotation layer is registered and the coreference layer is removed.
     *
     * <p>Sentences visited before a mismatch is detected have already been modified when the
     * exception is thrown.
     *
     * @param t       source of the mentions
     * @param teiText TEI text to update in place
     * @throws TEIException if the two structures differ in the number of paragraphs, sentences
     *                      or tokens, or if a mention refers to a token that has no TEI morph
     */
    public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
        Iterator<Paragraph> pIt = t.iterator();
        Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();
        int mentionId = 0;
        while (pIt.hasNext() && pItTei.hasNext()) {
            mentionId = updateTeiParagraph(mentionId, pIt.next(), pItTei.next());
        }
        checkIterators(pIt, pItTei, "paragraph");

        teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, ef.createHeader(AnnotationLayer.MENTIONS));

        // Existing coreference data refers to the old mentions, which were just replaced,
        // so it is dropped together with its annotation layer.
        teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
        teiText.setCoreferences(new ArrayList<TEICoreference>());

        logger.debug("{} mentions added", mentionId);
    }

    /**
     * Updates every sentence of one paragraph.
     *
     * @param mentionId next free mention number
     * @param p         source paragraph
     * @param pTei      TEI paragraph to update
     * @return next free mention number after processing the paragraph
     * @throws TEIException if the sentence counts differ or a sentence cannot be updated
     */
    private static int updateTeiParagraph(int mentionId, Paragraph p, TEIParagraph pTei) throws TEIException {
        Iterator<Sentence> sIt = p.iterator();
        Iterator<TEISentence> sItTei = pTei.getSentences().iterator();

        while (sIt.hasNext() && sItTei.hasNext()) {
            mentionId = updateTeiSentence(mentionId, sIt.next(), sItTei.next());
        }
        checkIterators(sIt, sItTei, "sentence");
        return mentionId;
    }

    /**
     * Clears the mentions of one TEI sentence and recreates them from the source sentence.
     *
     * @param mentionId next free mention number
     * @param s         source sentence
     * @param sTei      TEI sentence to update
     * @return next free mention number after processing the sentence
     * @throws TEIException if the token counts differ or a mention token has no TEI morph
     */
    private static int updateTeiSentence(int mentionId, Sentence s, TEISentence sTei) throws TEIException {
        sTei.getAllMentions().clear();

        // Pair tokens with TEI morphs by position so mention tokens can be translated below.
        Map<Token, TEIMorph> seg2morph = new HashMap<>();
        Iterator<Token> segIt = s.iterator();
        Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
        while (segIt.hasNext() && segItTei.hasNext()) {
            seg2morph.put(segIt.next(), segItTei.next());
        }
        checkIterators(segIt, segItTei, "token");

        List<TEIMention> mentions = new ArrayList<>();
        for (Mention m : s.getMentions()) {
            List<TEIMorph> morphs = new ArrayList<>();
            for (Token seg : m.getSegments()) {
                morphs.add(morphFor(seg2morph, seg));
            }

            List<TEIMorph> heads = new ArrayList<>();
            for (Token seg : m.getHeadSegments()) {
                heads.add(morphFor(seg2morph, seg));
            }

            mentions.add(ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()));
        }
        sTei.setMentions(mentions);
        return mentionId;
    }

    /**
     * Looks up the TEI morph paired with a token, failing loudly instead of returning
     * {@code null} so that no {@code null} entry ends up in a mention's morph list.
     *
     * @param seg2morph token-to-morph mapping of the current sentence
     * @param token     token to translate
     * @return the morph mapped to {@code token}
     * @throws TEIException if {@code token} has no entry in {@code seg2morph}
     */
    private static TEIMorph morphFor(Map<Token, TEIMorph> seg2morph, Token token) throws TEIException {
        TEIMorph morph = seg2morph.get(token);
        if (morph == null) {
            throw new TEIException("Mention token has no corresponding TEI morph: " + token);
        }
        return morph;
    }

    /**
     * Verifies that two iterators walked in parallel were exhausted together.
     *
     * <p>NOTE(review): the message mentions "thrift" although this class maps onto TEI; it may
     * have been copied from elsewhere. Left unchanged here; confirm before rewording.
     *
     * @param one   first iterator
     * @param other second iterator
     * @param level name of the structural level, included in the error message
     * @throws TEIException if either iterator still has elements
     */
    private static void checkIterators(Iterator<?> one, Iterator<?> other, String level)
            throws TEIException {
        if (one.hasNext() || other.hasNext()) {
            throw new TEIException("Problem mapping tei to thrift for level " + level);
        }
    }

}