// NKJPWypluwkaReaderImpl.java (7.17 KB)
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.EntitiesFactory;
import ipipan.clarin.tei.api.entities.TEICorpusText;
import ipipan.clarin.tei.api.entities.TEIHeader;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.api.io.NKJPWypluwkaReader;
import ipipan.clarin.tei.impl.io.Constants;
import ipipan.clarin.tei.impl.utils.Preconditions;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.EnumMap;
import java.util.EnumSet;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.zip.GZIPInputStream;

import javax.xml.stream.XMLStreamException;

import org.apache.log4j.Logger;

/**
 *
 * @author mlenart
 */
public class NKJPWypluwkaReaderImpl extends NKJPWypluwkaReader {
	private final static Logger logger = Logger
			.getLogger(NKJPWypluwkaReaderImpl.class);
	private final static EntitiesFactory ef = EntitiesFactory.getInstance();
	private final File directory;
	private final AnnotationLayer baseLayer;

	private final TextStructureReader textStructureReader;
	private final MorphosyntaxReader0 morphReader;

	private final EnumMap<AnnotationLayer, NKJPWypluwkaFileReader> layer2ReaderMap = new EnumMap<AnnotationLayer, NKJPWypluwkaFileReader>(
			AnnotationLayer.class);
	private final Set<InputStream> openStreams = new LinkedHashSet<InputStream>();

	public NKJPWypluwkaReaderImpl(File directory) throws TEIException {
		this(directory, getAvailableLayers(directory));
	}

	public NKJPWypluwkaReaderImpl(File directory,
			EnumSet<AnnotationLayer> layers) throws TEIException {
		this.directory = directory;

		try {
			if (layers.contains(AnnotationLayer.TEXT)) {
				this.textStructureReader = new TextStructureReader(
						getInput(AnnotationLayer.TEXT));
				morphReader = null;
				baseLayer = AnnotationLayer.TEXT;
			} else {
				this.textStructureReader = null;
				if (!layers.contains(AnnotationLayer.MORPHOSYNTAX))
					throw new TEIException(
							"Must have at least one of layers: [TEXT, MORPHOSYNTAX]");
				morphReader = new MorphosyntaxReader0(
						getInput(AnnotationLayer.MORPHOSYNTAX));
				baseLayer = AnnotationLayer.MORPHOSYNTAX;
			}
			// logger.info("got textStructureReader");
			for (AnnotationLayer layer : layers) {
				if (layer != baseLayer) {
					layer2ReaderMap.put(layer, getTEIReader(layer));
					// logger.info("got reader for "+layer);
				}
			}
		} catch (IOException ex) {
			throw new TEIException(ex);
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	@Override
	public TEIHeader readCorpusHeader() {
		TEIHeader res = ef.createCorpusHeader();
		return res;
	}

	@Override
	public TEIHeader readTextHeader() {
		// TODO
		return ef.createHeader(AnnotationLayer.TEXT);
	}

	@Override
	public boolean hasNextParagraph() throws TEIException {
		try {
			switch (baseLayer) {
			case TEXT:
				return textStructureReader.hasNextParagraph();
			case MORPHOSYNTAX:
				return morphReader.hasNextParagraph();
			default:
				throw new AssertionError();
			}
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	@Override
	public TEIParagraph readNextParagraph() throws TEIException {
		try {
			TEIParagraph par;
			switch (baseLayer) {
			case TEXT:
				par = textStructureReader.getNextParagraph();
				break;
			case MORPHOSYNTAX:
				par = morphReader.getNextParagraph();
				break;
			default:
				throw new AssertionError();
			}
			for (NKJPWypluwkaFileReader reader : layer2ReaderMap.values()) {
				reader.readNextParagraph(par);
			}
			return par;
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	@Override
	public void close() throws TEIException {
		try {
			switch (baseLayer) {
			case TEXT:
				textStructureReader.close();
				break;
			case MORPHOSYNTAX:
				morphReader.close();
				break;
			default:
				throw new AssertionError();
			}
			for (NKJPWypluwkaFileReader reader : layer2ReaderMap.values()) {
				reader.close();
			}
			for (InputStream in : openStreams)
				in.close();
		} catch (IOException ex) {
			throw new TEIException(ex);
		}
	}

	private static EnumSet<AnnotationLayer> getAvailableLayers(File directory)
			throws TEIException {
		List<AnnotationLayer> res = new LinkedList<AnnotationLayer>();

		Preconditions
				.require(
						"text.xml, text_structure.xml and ann_morphosyntax.xml unavailable in "
								+ directory.getPath(),
						isAvailable(directory, AnnotationLayer.TEXT)
								|| isAvailable(directory,
										AnnotationLayer.MORPHOSYNTAX));
		// logger.info("checking layers");

		for (AnnotationLayer layer : Constants.layer2FilenameMap.keySet()) {
			// logger.info("checking layer "+layer);
			if (isAvailable(directory, layer)) {
				res.add(layer);
				// logger.info("layer available "+layer);
			}
		}
		if (!isAvailable(directory, AnnotationLayer.TEXT))
			res.remove(AnnotationLayer.SEGMENTATION);

		return EnumSet.copyOf(res);
	}

	private static boolean isAvailable(File directory, AnnotationLayer layer) {
		if (layer == AnnotationLayer.TEXT)
			return containsOneOf(directory, "text.xml", "text_structure.xml");
		else {
			String fname = Constants.layer2FilenameMap.get(layer);
			return containsOneOf(directory, fname, fname + ".gz");
		}
	}

	private static boolean containsOneOf(File directory, String... fnames) {
		for (String fname : fnames)
			if (new File(directory, fname).isFile())
				return true;
		return false;
	}

	// private static boolean isAvailable(File directory, String file) {
	// return new File(directory, file).isFile();
	// }

	private InWrapper getInput(AnnotationLayer layer)
			throws FileNotFoundException, XMLStreamException, IOException {
		File file;
		boolean gzipped = false;
		if (layer == AnnotationLayer.TEXT) {
			file = new File(directory, "text.xml");
			if (!file.isFile())
				file = new File(directory, "text_structure.xml");
		} else {
			file = new File(directory, Constants.layer2FilenameMap.get(layer));
			if (!file.isFile()) {
				file = new File(directory,
						Constants.layer2FilenameMap.get(layer) + ".gz");
				gzipped = true;
			}
		}
		FileInputStream fileIn = new FileInputStream(file);
		InputStream inStream = gzipped ? new GZIPInputStream(fileIn) : fileIn;
		openStreams.add(inStream);
		logger.debug("reading " + file.getName());
		InWrapper in = new InWrapper(new InputStreamReader(inStream),
				file.getName());
		// in.nextTag();
		while (!in.isStart("TEI")) {
			// in.debug();
			in.next();
		}

		return in;
	}

	private NKJPWypluwkaFileReader getTEIReader(AnnotationLayer layer)
			throws FileNotFoundException, XMLStreamException, TEIException,
			IOException {
		InWrapper in = getInput(layer);
		return new NKJPWypluwkaFileReader(in, layer);
	}

	@Override
	public EnumSet<AnnotationLayer> getAvailableLayers() throws TEIException {
		return EnumSet.copyOf(getAvailableLayers(directory));
	}

	@Override
	public void readBody(TEICorpusText text) throws TEIException {
		for (NKJPWypluwkaFileReader reader : layer2ReaderMap.values()) {
			reader.readBody(text);
		}
	}
}