// PackageReader.java
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.TEICorpusText;
import ipipan.clarin.tei.api.entities.TEIHeader;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.impl.entities.TEICorpusTextImpl;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;

import javax.xml.stream.XMLStreamException;

/**
 * Reads a {@link TEICorpusText} from a single character stream.
 * <p>
 * The stream is consumed in this order: a first {@code teiHeader} element
 * (read as the corpus header), the paragraphs delivered by a
 * {@link TextStructureReader}, and then any number of further
 * {@code teiHeader} elements, each followed by a {@code body} element holding
 * one annotation layer.
 *
 * @author mlenart
 */
public class PackageReader {

	/** Wrapper around the underlying input; shared with all helper readers. */
	private final InWrapper in;

	/** Reads the corpus header and the per-layer headers from {@link #in}. */
	private final PackageHeaderReader headerReader;

	/**
	 * Creates a package reader over the given character stream.
	 *
	 * @param reader
	 *            source of the package content
	 * @return a new reader instance
	 * @throws TEIException
	 *             if the input wrapper cannot be created
	 */
	public static PackageReader getInstance(Reader reader) throws TEIException {
		return new PackageReader(reader);
	}

	/**
	 * Wraps the given stream and prepares the header reader.
	 *
	 * @param reader
	 *            source of the package content
	 * @throws TEIException
	 *             wrapping the {@link XMLStreamException} or
	 *             {@link IOException} raised while creating the input wrapper
	 */
	private PackageReader(Reader reader) throws TEIException {
		try {
			// NOTE(review): the second InWrapper argument is passed as null;
			// confirm its meaning against InWrapper.
			in = new InWrapper(reader, null);
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		} catch (IOException ex) {
			throw new TEIException(ex);
		}
		headerReader = new PackageHeaderReader(in);
	}

	/**
	 * Reads the whole corpus text from the input.
	 *
	 * @return the corpus text with its corpus header, annotation layers and
	 *         (once available) paragraphs
	 * @throws TEIException
	 *             if the package header or a layer body is missing, if a
	 *             helper reader fails, or wrapping any
	 *             {@link XMLStreamException} raised while reading
	 */
	public TEICorpusText readCorpusText() throws TEIException {
		try {
			return doReadCorpusText();
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	/**
	 * Closes the underlying input wrapper.
	 *
	 * @throws TEIException
	 *             wrapping any {@link XMLStreamException} raised on close
	 */
	public void close() throws TEIException {
		try {
			in.close();
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	/**
	 * Performs the actual read: corpus header, text structure paragraphs,
	 * then every annotation layer that follows.
	 *
	 * @return the populated corpus text
	 * @throws XMLStreamException
	 *             if the underlying input fails
	 * @throws TEIException
	 *             if no {@code teiHeader} is found at all, if a layer header
	 *             is not followed by a {@code body}, or if a helper reader
	 *             fails
	 */
	private TEICorpusText doReadCorpusText() throws XMLStreamException,
			TEIException {
		TEICorpusTextImpl tei = new TEICorpusTextImpl();
		if (!hasNextHeader()) {
			throw new TEIException("Missing package header");
		}
		tei.setCorpusHeader(headerReader.readCorpusHeader());

		List<TEIParagraph> pars = new ArrayList<TEIParagraph>();
		TextStructureReader tsr = new TextStructureReader(in);
		while (tsr.hasNextParagraph()) {
			pars.add(tsr.getNextParagraph());
		}

		// Paragraphs are attached to the text exactly once: after the first
		// layer whose reading leaves the first paragraph with a non-null
		// sentence list.
		boolean parsAdded = false;
		while (hasNextHeader()) {
			TEIHeader header = headerReader.readHeader();
			tei.addAnnotationLayer(header.getLayer(), header);
			readLayer(pars, header.getLayer(), tei);

			if (!parsAdded && !pars.isEmpty()
					&& pars.get(0).getSentences() != null) {
				for (TEIParagraph par : pars) {
					tei.addParagraph(par);
				}
				parsAdded = true;
			}
		}

		return tei;
	}

	/**
	 * Reads one annotation layer: skips forward to the next {@code body}
	 * start element, lets a {@link BodyReader} process every paragraph in
	 * order, and finally hands the text to {@code BodyReader.readBody}.
	 *
	 * @param pars
	 *            paragraphs to be passed, one by one, to the body reader
	 * @param annotationLayer
	 *            layer announced by the preceding header
	 * @param text
	 *            corpus text passed to {@code BodyReader.readBody}
	 * @throws TEIException
	 *             if the input ends before a {@code body} start element is
	 *             found, or if the body reader fails
	 * @throws XMLStreamException
	 *             if the underlying input fails
	 */
	private void readLayer(List<TEIParagraph> pars,
			AnnotationLayer annotationLayer, TEICorpusText text)
			throws TEIException, XMLStreamException {
		while (!in.isStart("body")) {
			// Guard against running past the end of the input when the
			// layer has a header but no body.
			if (!in.hasNext()) {
				throw new TEIException(
						"Missing body element for annotation layer "
								+ annotationLayer);
			}
			in.next();
		}
		BodyReader bodyReader = BodyReader.create(in, annotationLayer);
		for (TEIParagraph par : pars) {
			bodyReader.readNextParagraph(par);
		}
		bodyReader.readBody(text);
	}

	/**
	 * Advances the input until it is positioned on a {@code teiHeader} start
	 * element or no further input is available.
	 *
	 * @return {@code true} if the input is now positioned on a
	 *         {@code teiHeader} start element
	 * @throws XMLStreamException
	 *             if the underlying input fails
	 */
	private boolean hasNextHeader() throws XMLStreamException {
		while (in.hasNext() && !in.isStart("teiHeader")) {
			in.next();
		}
		return in.isStart("teiHeader");
	}
}