// PackageHeaderReader.java (3.89 KB)
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.EntitiesFactory;
import ipipan.clarin.tei.api.entities.TEIHeader;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.impl.io.Constants;
import ipipan.clarin.tei.impl.utils.TEIConst;

import java.text.DateFormat;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Calendar;

import javax.xml.stream.XMLStreamException;

import org.apache.log4j.Logger;

/**
 * Reads a {@code <teiHeader>} element from the wrapped XML input and turns it
 * into a {@link TEIHeader} entity.
 * <p>
 * The reader walks the stream sequentially and expects the header content in
 * this order: {@code fileDesc} / {@code titleStmt} / {@code title}, then an
 * optional {@code publicationStmt} (optional {@code distributor}, optional
 * {@code date} with an optional nested {@code time}), then an optional
 * {@code sourceDesc} whose first child is a {@code p} element.
 * <p>
 * NOTE(review): the exact semantics of {@link InWrapper} (end-of-document
 * handling, what {@code getText()} returns on non-text events) are defined
 * outside this file - confirm there before relying on edge-case behaviour.
 * 
 * @author mlenart
 */
class PackageHeaderReader {

	private static final Logger logger = Logger
			.getLogger(PackageHeaderReader.class);
	private static final EntitiesFactory ef = EntitiesFactory.getInstance();

	/** File name looked for in an XInclude when reading the corpus header. */
	private static final String CORPUS_HEADER_FILE = "NKJP_1M_header.xml";
	/** File name looked for in an XInclude when reading a regular header. */
	private static final String HEADER_FILE = "header.xml";
	/** Pattern used to parse the combined "date time" string. */
	private static final String DATE_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss";
	/** Time of day assumed when the date element has no nested time element. */
	private static final String DEFAULT_TIME = "00:00:00";
	/** Prefix of the sourceDesc paragraph that announces a ptr reference. */
	private static final String RETRIEVED_FROM_PREFIX = "Text retrieved from";

	private final InWrapper in;

	/**
	 * @param in
	 *            wrapper around the XML stream the header is read from
	 */
	public PackageHeaderReader(InWrapper in) {
		this.in = in;
	}

	/**
	 * Advances the input until either a {@code teiHeader} start tag or an
	 * XInclude of the expected header file is reached, and parses the header
	 * if it is present inline.
	 * 
	 * @param isCorpusHeader
	 *            {@code true} to read the corpus-level header, {@code false}
	 *            for a regular (per-text) header
	 * @return the parsed header; {@code null} when the input stopped on an
	 *         XInclude instead of an inline {@code teiHeader}, or when the
	 *         {@code teiHeader} does not start with a {@code fileDesc}
	 * @throws XMLStreamException
	 *             propagated from the underlying stream
	 * @throws TEIException
	 *             when the date/time of the header cannot be parsed, or
	 *             propagated from the input wrapper
	 */
	private TEIHeader readHeader(boolean isCorpusHeader)
			throws XMLStreamException, TEIException {
		String includedFile = isCorpusHeader ? CORPUS_HEADER_FILE
				: HEADER_FILE;
		while (!in.isStart("teiHeader") && !in.isXInclude(includedFile)) {
			in.next();
		}
		if (!in.isStart("teiHeader")) {
			// Stopped on the XInclude: the header is not inline here.
			return null;
		}

		String type = in.getAttr("type");
		AnnotationLayer layer = Constants.attr2LayerMap.get(type);
		if (logger.isDebugEnabled()) {
			logger.debug("Reading teiHeader: type=" + type + ", layer="
					+ layer);
		}

		in.requireStart("teiHeader");
		in.nextTag();
		TEIHeader res = null;
		if (in.isStart("fileDesc")) {
			res = readFileDesc(isCorpusHeader, layer);
		}
		in.requireEnd("teiHeader");
		return res;
	}

	/**
	 * Parses the {@code fileDesc} element the input is currently positioned
	 * on and leaves the input on the tag following its end tag.
	 * 
	 * @param isCorpusHeader
	 *            selects which kind of header entity is created
	 * @param layer
	 *            layer resolved from the {@code type} attribute, or
	 *            {@code null} to look it up by the header title instead
	 * @return the newly created and populated header
	 */
	private TEIHeader readFileDesc(boolean isCorpusHeader,
			AnnotationLayer layer) throws XMLStreamException, TEIException {
		in.requireStart("fileDesc");
		in.nextTag();
		in.requireStart("titleStmt");
		in.nextTag();
		in.requireStart("title");
		in.next();
		String title = in.getText();

		AnnotationLayer effectiveLayer = layer;
		if (effectiveLayer == null) {
			// No usable "type" attribute: fall back to the title mapping.
			effectiveLayer = TEIConst.title2Layer.get(title);
		}
		TEIHeader res = isCorpusHeader ? ef.createCorpusHeader() : ef
				.createHeader(effectiveLayer);
		res.setTitle(title);

		// Skip whatever else the titleStmt contains.
		while (!in.isEnd("titleStmt")) {
			in.next();
		}
		// Skip ahead to the first section this reader understands.
		while (!in.isStart("publicationStmt") && !in.isStart("sourceDesc")
				&& !in.isEnd("fileDesc")) {
			in.next();
		}

		if (in.isStart("publicationStmt")) {
			readPublicationStmt(res);
		}
		if (in.isStart("sourceDesc")) {
			readSourceDesc(res);
		}
		in.requireEnd("fileDesc");
		in.nextTag();
		return res;
	}

	/**
	 * Parses the {@code publicationStmt} element (optional distributor and
	 * date) into {@code res} and moves past its end tag.
	 */
	private void readPublicationStmt(TEIHeader res)
			throws XMLStreamException, TEIException {
		in.nextTag();
		if (in.isStart("distributor")) {
			in.next();
			res.setDistributor(in.getText());
			in.nextTag();
			in.requireEnd("distributor");
			in.nextTag();
		}
		if (in.isStart("date")) {
			readPublicationDate(res);
		}
		in.requireEnd("publicationStmt");
		in.nextTag();
	}

	/**
	 * Parses the {@code date} element, including an optional nested
	 * {@code time} element that supplies the time of day and a duration,
	 * stores the result in {@code res} and moves past the date's end tag.
	 */
	private void readPublicationDate(TEIHeader res)
			throws XMLStreamException, TEIException {
		String dateStr = in.getAttr("when");
		String timeStr = DEFAULT_TIME;
		in.nextTag();
		if (in.isStart("time")) {
			timeStr = in.getAttr("when");
			res.setDuration(in.getAttr("dur"));
			in.nextTag();
			in.requireEnd("time");
			in.nextTag();
		}

		res.setTime(parseTimestamp(dateStr, timeStr));
		in.requireEnd("date");
		in.nextTag();
	}

	/**
	 * Combines a date and a time string and parses them with
	 * {@link #DATE_TIME_PATTERN}.
	 * 
	 * @throws TEIException
	 *             when the combined string does not match the pattern (this
	 *             includes a missing, i.e. {@code null}, date or time); the
	 *             message carries the full string so that a malformed time is
	 *             not reported as a malformed date
	 */
	private static Calendar parseTimestamp(String dateStr, String timeStr)
			throws TEIException {
		String timestamp = dateStr + " " + timeStr;
		// SimpleDateFormat is not thread-safe, so a fresh instance is used.
		DateFormat format = new SimpleDateFormat(DATE_TIME_PATTERN);
		Calendar cal = Calendar.getInstance();
		try {
			cal.setTime(format.parse(timestamp));
		} catch (ParseException ex) {
			throw new TEIException("Invalid date string: " + timestamp, ex);
		}
		return cal;
	}

	/**
	 * Parses the {@code sourceDesc} element into {@code res} and moves past
	 * its end tag. A paragraph starting with
	 * {@link #RETRIEVED_FROM_PREFIX} is expected to be followed by a
	 * {@code ptr} element whose {@code target} is stored as the retrieval
	 * source; any other paragraph text is stored verbatim.
	 */
	private void readSourceDesc(TEIHeader res) throws XMLStreamException,
			TEIException {
		in.nextTag();
		in.requireStart("p");
		in.next();
		String source = in.getText();
		// Null guard is defensive: whether getText() can return null is not
		// visible from this file.
		if (source != null && source.startsWith(RETRIEVED_FROM_PREFIX)) {
			in.nextTag();
			in.requireStart("ptr");
			res.setRetrievedFrom(in.getAttr("target"));
		} else {
			res.setSourceDescText(source);
		}
		// Ignore the remainder of the sourceDesc.
		while (!in.isEnd("sourceDesc")) {
			in.next();
		}
		in.nextTag();
	}

	/**
	 * Reads the corpus-level header.
	 * 
	 * @return the parsed header, or {@code null} if it is not present inline
	 *         (see {@link #readHeader(boolean)})
	 */
	TEIHeader readCorpusHeader() throws TEIException, XMLStreamException {
		return this.readHeader(true);
	}

	/**
	 * Reads a regular (non-corpus) header.
	 * 
	 * @return the parsed header, or {@code null} if it is not present inline
	 *         (see {@link #readHeader(boolean)})
	 */
	TEIHeader readHeader() throws TEIException, XMLStreamException {
		return this.readHeader(false);
	}
}