// ThriftLoader.java 6.98 KB
package pl.waw.ipipan.zil.core.md.io.thrift;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.multiservice.thrift.types.*;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Converts a text in the multiservice Thrift representation ({@link TText})
 * into the internal entity model ({@link Text}, {@link Paragraph},
 * {@link Sentence}, {@link Token}, ...).
 *
 * <p>All methods are static; the class holds no state besides its logger.
 */
public class ThriftLoader {

    private static final Logger logger = LoggerFactory.getLogger(ThriftLoader.class);

    /**
     * Builds a {@link Text} from the given Thrift text.
     *
     * <p>The text id is taken from the Thrift text header; when the header is
     * missing, the literal string {@code "null"} is used as the id.
     *
     * @param thriftText Thrift text to convert
     * @return the converted text with all paragraphs, sentences and tokens loaded
     * @throws MultiserviceException if some token has neither a usable chosen
     *                               interpretation nor a usable first candidate
     *                               interpretation
     */
    public static Text loadTextFromThrift(TText thriftText)
            throws MultiserviceException {
        String textId = thriftText.getTextHeader() == null ? "null"
                : thriftText.getTextHeader().getId();
        Text text = new Text(textId);

        logger.debug("Loading text {} from thrift format...", text.getId());
        for (TParagraph thriftParagraph : thriftText.getParagraphs()) {
            loadParagraph(text, thriftParagraph);
        }
        logger.debug("Thrift text loaded.");

        return text;
    }

    /**
     * Appends a new paragraph to {@code text} and loads all sentences of the
     * Thrift paragraph into it.
     */
    private static void loadParagraph(Text text, TParagraph thriftParagraph)
            throws MultiserviceException {
        Paragraph paragraph = new Paragraph();
        text.add(paragraph);

        for (TSentence thriftSentence : thriftParagraph.getSentences()) {
            loadSentence(paragraph, thriftSentence);
        }
    }

    /**
     * Appends a new sentence to {@code p} and fills it with tokens and, when
     * set on the Thrift sentence, named entities, syntactic words and
     * syntactic groups (in that order).
     */
    private static void loadSentence(Paragraph p, TSentence thriftSent)
            throws MultiserviceException {
        Sentence sentence = new Sentence();
        p.add(sentence);

        // Thrift id -> Thrift object, used to resolve child ids recursively.
        Map<String, Object> thriftId2Entity = getThriftId2EntityMap(thriftSent);

        // Thrift token id -> converted token.
        Map<String, Token> thriftTokenId2Token = new HashMap<>();
        for (TToken thriftToken : thriftSent.getTokens()) {
            Token token = loadToken(sentence, thriftToken);
            thriftTokenId2Token.put(thriftToken.getId(), token);
        }

        if (thriftSent.isSetNames()) {
            for (TNamedEntity ne : thriftSent.getNames()) {
                loadNE(sentence, ne, thriftId2Entity, thriftTokenId2Token);
            }
        }
        if (thriftSent.isSetWords()) {
            for (TSyntacticWord word : thriftSent.getWords()) {
                loadSyntacticWord(sentence, word, thriftId2Entity, thriftTokenId2Token);
            }
        }
        if (thriftSent.isSetGroups()) {
            for (TSyntacticGroup group : thriftSent.getGroups()) {
                loadSyntacticGroup(sentence, group, thriftId2Entity, thriftTokenId2Token);
            }
        }
    }

    /**
     * Adds a {@link SyntacticGroup} to the sentence, built from the group's
     * type, all tokens underlying the group and the tokens underlying its
     * semantic head.
     */
    private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
                                           Map<String, Object> thriftId2Entity,
                                           Map<String, Token> thriftTokenId2Token) {
        String type = g.getType();
        List<Token> tokens = getUnderlyingSegments(g, thriftId2Entity,
                thriftTokenId2Token, false);
        List<Token> headTokens = getUnderlyingSegments(g, thriftId2Entity,
                thriftTokenId2Token, true);
        s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
    }

    /**
     * Adds a {@link SyntacticWord} to the sentence, built from the ctag of the
     * word's chosen interpretation and the tokens underlying the word.
     */
    private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
                                          Map<String, Object> thriftId2Entity,
                                          Map<String, Token> thriftTokenId2Token) {
        String ctag = w.getChosenInterpretation().getCtag();
        List<Token> tokens = getUnderlyingSegments(w, thriftId2Entity,
                thriftTokenId2Token, false);
        s.addSyntacticWord(new SyntacticWord(ctag, tokens));
    }

    /**
     * Adds a {@link NamedEntity} to the sentence, built from the tokens
     * underlying the Thrift named entity.
     */
    private static void loadNE(Sentence s, TNamedEntity ne,
                               Map<String, Object> thriftId2Entity,
                               Map<String, Token> thriftTokenId2Token) {
        List<Token> tokens = getUnderlyingSegments(ne, thriftId2Entity,
                thriftTokenId2Token, false);
        s.addNamedEntity(new NamedEntity(tokens));
    }

    /**
     * Indexes every token and, when set, every syntactic word, named entity
     * and syntactic group of the sentence by its Thrift id.
     *
     * @return map from Thrift id to the Thrift object carrying that id
     */
    private static Map<String, Object> getThriftId2EntityMap(
            TSentence thriftSent) {
        Map<String, Object> idToEntity = new HashMap<>();
        for (TToken tok : thriftSent.getTokens()) {
            idToEntity.put(tok.getId(), tok);
        }
        if (thriftSent.isSetWords()) {
            for (TSyntacticWord w : thriftSent.getWords()) {
                idToEntity.put(w.getId(), w);
            }
        }
        if (thriftSent.isSetNames()) {
            for (TNamedEntity ne : thriftSent.getNames()) {
                idToEntity.put(ne.getId(), ne);
            }
        }
        if (thriftSent.isSetGroups()) {
            for (TSyntacticGroup group : thriftSent.getGroups()) {
                idToEntity.put(group.getId(), group);
            }
        }
        return idToEntity;
    }

    /**
     * Creates a {@link Token} from a Thrift token, appends it to the sentence
     * and copies its orth, chosen interpretation and all interpretations.
     *
     * @throws MultiserviceException if no usable chosen or candidate
     *                               interpretation exists for the token
     */
    private static Token loadToken(Sentence s, TToken thriftToken)
            throws MultiserviceException {
        Token seg = new Token();
        s.add(seg);

        seg.setOrth(thriftToken.getOrth());

        TInterpretation chosen = getTokenChosenInt(thriftToken);
        Interpretation chosenInterpretation = new Interpretation(
                chosen.getCtag(), chosen.getMsd(), chosen.getBase());
        seg.addChosenInterpretation(chosenInterpretation);

        for (TInterpretation candidate : thriftToken.getInterpretations()) {
            // Each interpretation carries its own base form; previously the
            // chosen interpretation's base was used for all of them.
            Interpretation inter = new Interpretation(candidate.getCtag(),
                    candidate.getMsd(), candidate.getBase());
            seg.addInterpretation(inter);
        }
        return seg;
    }

    /**
     * Returns the interpretation to treat as chosen for the token: the chosen
     * interpretation when it has a non-empty base, otherwise the first
     * candidate interpretation when that one has a non-empty base.
     *
     * @throws MultiserviceException if neither of the two is usable
     */
    private static TInterpretation getTokenChosenInt(TToken token)
            throws MultiserviceException {
        TInterpretation chosen = token.getChosenInterpretation();
        boolean chosenUsable = chosen != null && chosen.getBase() != null
                && !"".equals(chosen.getBase());
        if (chosenUsable) {
            return chosen;
        }

        List<TInterpretation> candidates = token.getCandidateInterpretations();
        if (candidates == null
                || candidates.isEmpty()
                || candidates.get(0).getBase() == null
                || "".equals(candidates.get(0).getBase())) {
            throw new MultiserviceException(
                    "No proper chosen or candidate interpretation for segment: "
                            + token.getId());
        }
        return candidates.get(0);
    }

    /**
     * Recursively resolves a Thrift entity to the converted tokens it spans.
     *
     * <p>A token resolves to its entry in {@code tokenId2Segment} (which is
     * {@code null} if the id is not mapped). Syntactic words and named
     * entities resolve through all their child ids. A syntactic group resolves
     * through its semantic head id when {@code headsOnly} is {@code true} and
     * through all its child ids otherwise. Any other object, including
     * {@code null}, yields an empty list.
     *
     * @param entity          Thrift object to resolve
     * @param idToEntity      map from Thrift id to Thrift object
     * @param tokenId2Segment map from Thrift token id to converted token
     * @param headsOnly       whether groups are followed only through their
     *                        semantic head; passed unchanged to recursive calls
     * @return the tokens reached from {@code entity}, in traversal order
     */
    private static List<Token> getUnderlyingSegments(Object entity,
                                                     Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
                                                     boolean headsOnly) {
        List<Token> result = new ArrayList<>();

        if (entity instanceof TToken) {
            result.add(tokenId2Segment.get(((TToken) entity).getId()));
            return result;
        }

        List<String> childIds;
        if (entity instanceof TSyntacticWord) {
            childIds = ((TSyntacticWord) entity).getChildIds();
        } else if (entity instanceof TNamedEntity) {
            childIds = ((TNamedEntity) entity).getChildIds();
        } else if (entity instanceof TSyntacticGroup) {
            TSyntacticGroup group = (TSyntacticGroup) entity;
            if (headsOnly) {
                childIds = new ArrayList<>();
                childIds.add(group.getSemanticHeadId());
            } else {
                childIds = group.getChildIds();
            }
        } else {
            // Unknown or unresolved (null) entity: nothing to descend into.
            childIds = new ArrayList<>();
        }

        for (String id : childIds) {
            result.addAll(getUnderlyingSegments(idToEntity.get(id), idToEntity,
                    tokenId2Segment, headsOnly));
        }

        return result;
    }
}