// TextStructureReader.java 3.26 KB
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.EntitiesFactory;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.entities.TEIParagraph.ParagraphType;
import ipipan.clarin.tei.api.exceptions.TEIException;

import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import javax.xml.stream.XMLStreamException;

/**
 * Reads paragraphs of the text layer one at a time from an {@link InWrapper}.
 * <p>
 * A paragraph is either a leaf (id, type, {@code n} attribute and text) or a
 * container holding a list of nested sub-paragraphs. All stream-level
 * problems are reported to callers as {@link TEIException}s whose message
 * starts with {@code "Error in text layer: "}.
 *
 * @author mlenart
 */
class TextStructureReader {

	/** Prefix added exactly once to every error reported by this reader. */
	private static final String ERROR_PREFIX = "Error in text layer: ";

	private final EntitiesFactory ef = EntitiesFactory.getInstance();
	private final InWrapper in;

	/**
	 * Creates a reader that pulls its events from the given input.
	 *
	 * @param in
	 *            input wrapper to read from; closed by {@link #close()}
	 */
	TextStructureReader(InWrapper in) {
		this.in = in;
	}

	/**
	 * Advances the input until it is positioned on a paragraph start or on
	 * the end of the {@code body} element, whichever comes first.
	 *
	 * @return {@code true} if the input is now positioned on a paragraph
	 *         start
	 * @throws TEIException
	 *             if the underlying stream reports an error
	 */
	public boolean hasNextParagraph() throws TEIException {
		try {
			return skipToParagraph();
		} catch (XMLStreamException ex) {
			throw new TEIException(ERROR_PREFIX + ex.getMessage(), ex);
		}
	}

	/**
	 * Reads the next paragraph, including all of its nested sub-paragraphs.
	 * <p>
	 * The actual parsing (and the recursion into sub-paragraphs) happens in
	 * a private helper, so an error is wrapped with the "Error in text
	 * layer" prefix only once no matter how deeply nested the failing
	 * paragraph is.
	 *
	 * @return the paragraph that was read
	 * @throws XMLStreamException
	 *             kept in the signature for backward compatibility; stream
	 *             errors raised while reading are wrapped into
	 *             {@link TEIException}
	 * @throws TEIException
	 *             if no paragraph is available, the content is not what this
	 *             reader expects, or the stream reports an error
	 */
	public TEIParagraph getNextParagraph() throws XMLStreamException,
			TEIException {
		try {
			return readParagraph();
		} catch (XMLStreamException ex) {
			throw new TEIException(ERROR_PREFIX + ex.getMessage(), ex);
		} catch (TEIException ex) {
			throw new TEIException(ERROR_PREFIX + ex.getMessage(), ex);
		}
	}

	/**
	 * Closes the underlying input.
	 *
	 * @throws TEIException
	 *             if closing the stream fails
	 */
	public void close() throws TEIException {
		try {
			in.close();
		} catch (XMLStreamException ex) {
			throw new TEIException(ex);
		}
	}

	/**
	 * Moves forward until the input is on a paragraph start or on the end tag
	 * of {@code body}. Does nothing if it is already on one of those.
	 *
	 * @return {@code true} if the input is positioned on a paragraph start
	 */
	private boolean skipToParagraph() throws XMLStreamException, TEIException {
		while (!(in.isEnd() && "body".equals(in.getName()))
				&& !in.isStartParagraph()) {
			in.next();
		}
		return in.isStartParagraph();
	}

	/**
	 * Parses one paragraph without decorating exceptions; used both by
	 * {@link #getNextParagraph()} and recursively for sub-paragraphs.
	 */
	private TEIParagraph readParagraph() throws XMLStreamException,
			TEIException {
		if (!skipToParagraph()) {
			in.fail("new paragraph");
		}

		String id = in.getXmlId();
		// Locale.ROOT keeps the enum lookup independent of the default
		// locale (e.g. Turkish would upper-case "i" to a dotted capital).
		ParagraphType type = ParagraphType.valueOf(in.getName().toUpperCase(
				Locale.ROOT));
		String n = in.getAttr("n");

		in.next();

		while (true) {
			if (in.isCharacters() || in.isStart("hi")) {
				// Leaf paragraph: everything up to the end tag is its text.
				String text = readAllText();
				assert (in.isEnd());
				return ef.createParagraph(id, type, n, text);
			} else if (in.isStartParagraph()) {
				// Container paragraph: collect consecutive sub-paragraphs.
				List<TEIParagraph> subparagraphs = new ArrayList<TEIParagraph>();
				while (in.isStartParagraph()) {
					subparagraphs.add(readParagraph());
					in.nextTag();
				}
				// One trailing non-paragraph element is tolerated and
				// skipped before the closing tag is required.
				if (in.isStart()) {
					in.readUntilEnd();
					in.nextTag();
				}
				in.requireEnd();
				return ef.createParagraph(id, type, subparagraphs);
			} else if (in.isStart()) {
				// Some other leading element: skip it and look again.
				in.readUntilEnd();
			} else {
				in.fail("paragraph text, subparagraph start or <header> tag");
			}
			in.nextTag();
		}
	}

	/**
	 * Concatenates the character data found between the current position and
	 * the next end event seen at this level.
	 * <p>
	 * Text directly inside a {@code hi} element is included; other elements
	 * are skipped via {@code readUntilEnd()}. Events that are neither
	 * characters nor element starts are stepped over so the loop always
	 * makes progress.
	 *
	 * @return the collected text, possibly empty
	 */
	private String readAllText() throws XMLStreamException {
		StringBuilder sb = new StringBuilder();
		while (!in.isEnd()) {
			if (in.isCharacters()) {
				sb.append(in.getText());
				in.next();
			} else if (in.isStart("hi")) {
				in.next();
				if (in.isCharacters()) {
					sb.append(in.getText());
					in.next();
				}
				// <hi> may only contain a single run of text.
				in.requireEnd();
				in.next();
			} else if (in.isStart()) {
				in.readUntilEnd();
				in.next();
			} else {
				// Any other event (e.g. a comment): without advancing here
				// the loop would never terminate.
				in.next();
			}
		}
		return sb.toString();
	}
}