// GroupsReader.java 3.33 KB
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.TEIGroup;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.entities.TEISentence;
import ipipan.clarin.tei.api.entities.TEISyntacticEntity;
import ipipan.clarin.tei.api.entities.TEIWord;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.impl.io.IdValuePair;

import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import javax.xml.stream.XMLStreamException;

/**
 *
 * @author mlenart
 */
public class GroupsReader extends BodyReader {

	GroupsReader(InWrapper in) {
		super(in);
	}

	@Override
	protected void readNextParagraph(TEIParagraph par) throws TEIException {
		try {
			while (!in.isStartParagraph()) {
				in.next();
			}
			String parId = in.getXmlId();
			for (TEISentence sent : par.getSentences()) {
				in.nextTag();
				in.requireStart("s");
				readNextSent(sent);
			}
			in.nextTag();
			in.requireEnd(); // p
			par.setId(AnnotationLayer.GROUPS, parId);
		} catch (Exception ex) {
			throw new TEIException("Error in groups: " + ex.getMessage(), ex);
		}
	}

	private TEISentence readNextSent(TEISentence sent)
			throws XMLStreamException, TEIException {
		List<GroupBuilder> builders = new LinkedList<GroupBuilder>();
		String sentId = in.getXmlId();
		in.nextTag();
		while (!in.isEnd()) {
			in.requireStart("seg");
			builders.add(readGroupBuilder());
			in.nextTag();
		}
		in.requireEnd(); // s
		sent.setSyntacticGroups(getGroupsFromBuilders(builders, sent));
		sent.setId(AnnotationLayer.GROUPS, sentId);
		return sent;
	}

	private List<TEIGroup> getGroupsFromBuilders(List<GroupBuilder> builders,
			TEISentence sent) throws TEIException {
		LinkedList<TEIGroup> res = new LinkedList<TEIGroup>();
		Map<String, TEISyntacticEntity> ptr2Child = new LinkedHashMap<String, TEISyntacticEntity>();
		for (TEIWord word : sent.getAllWords()) {
			ptr2Child.put(word.getId(), word);
		}
		Collections.reverse(builders);
		for (GroupBuilder b : builders) {
			TEIGroup group = b.getGroup(ptr2Child);
			res.addFirst(group);
			ptr2Child.put(group.getId(), group);
		}

		filterNEs(res);
		return res;
	}

	private void filterNEs(LinkedList<TEIGroup> nes) {
		Set<TEISyntacticEntity> nonRootNEs = new LinkedHashSet<TEISyntacticEntity>();
		for (TEIGroup group : nes) {
			nonRootNEs.addAll(group.getChildren());
		}
		nes.removeAll(nonRootNEs);
	}

	private GroupBuilder readGroupBuilder() throws XMLStreamException {
		in.requireStart("seg");
		GroupBuilder b = new GroupBuilder(in.getXmlId());
		in.nextTag();
		in.requireStartFS("group");
		in.nextTag();

		while (!in.isEndFS()) {
			if (in.isStartF("orth")) {
				b.setOrth(in.readStringF().getValue());
			} else if (in.isStartF("type")) {
				b.setType(in.readSymbolF().getValue());
			} else if (in.isStartF("semh")) {
				b.setSemHead(in.readFValue());
			} else if (in.isStartF("synh")) {
				b.setSynHead(in.readFValue());
			}
			in.nextTag();
		}

		in.nextTag();
		List<String> ptrs = new LinkedList<String>();
		for (IdValuePair ptr : PtrHelper.readPtrsWithTypes(in)) {
			String target = ptr.getId();
			ptrs.add(target);
		}
		b.setPtrs(ptrs);
		in.requireEnd(); // seg

		return b;
	}
}