// SegmentationReader.java 4.4 KB
package ipipan.clarin.tei.impl.io.read;

import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.entities.TEISegment;
import ipipan.clarin.tei.api.entities.TEISentence;
import ipipan.clarin.tei.api.exceptions.TEIException;

import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.xml.stream.XMLStreamException;

/**
 * Reads the segmentation layer of a document: for every paragraph it collects
 * the {@code s} (sentence) elements, whose children are either {@code seg}
 * elements or {@code choice} elements holding alternative segmentations.
 * <p>
 * Each segment points into the text through a {@code corresp} attribute of
 * the form {@code string-range(paragraphId,offset,length)}.
 *
 * @author mlenart
 */
class SegmentationReader extends BodyReader {

	/**
	 * Shape of a segment's {@code corresp} attribute. Group 1 is the id of the
	 * referenced paragraph, group 2 the offset (one or two comma-separated
	 * digit runs) and group 3 the strictly positive length. The pattern is
	 * applied with {@link Matcher#find()}, so it may match anywhere inside the
	 * attribute value.
	 */
	private static final Pattern CORRESP_PATTERN = Pattern
			.compile("(?:text(?:_structure)?\\.xml\\#)?string-range\\(([a-zA-Z_0-9\\.\\-]+),((?:[0-9]+,)?[0-9]+),([1-9][0-9]*)\\)");

	SegmentationReader(InWrapper in) {
		super(in);
	}

	/**
	 * Skips forward to the next paragraph start, reads all of its sentences
	 * and stores them, together with the paragraph's segmentation-layer id, in
	 * {@code par}. {@code par} is only modified once the whole paragraph has
	 * been read successfully.
	 *
	 * @param par paragraph to fill with the sentences that were read
	 * @throws TEIException on any failure while reading; the original
	 *             exception is kept as the cause
	 */
	@Override
	protected void readNextParagraph(TEIParagraph par) throws TEIException {
		try {
			List<TEISentence> sents = new ArrayList<TEISentence>();
			while (!in.isStartParagraph()) {
				in.next();
			}
			String parId = in.getXmlId();
			in.nextTag();
			// -1 can never equal a parsed offset, so the first segment of the
			// paragraph is never flagged as adjacent to a predecessor.
			int lastOffset = -1;
			while (!in.isEnd()) {
				in.requireStart("s");
				sents.add(readNextSent(par, lastOffset));
				lastOffset = endOffset(getLastSegment(sents));
				in.nextTag();
			}
			par.setSentences(sents);
			par.setId(AnnotationLayer.SEGMENTATION, parId);
		} catch (Exception ex) {
			throw new TEIException("Error in segmentation: " + ex.getMessage(),
					ex);
		}
	}

	/**
	 * Reads one sentence; the reader must be positioned on its start tag.
	 *
	 * @param par        paragraph currently being read
	 * @param lastOffset end offset of the segment read before this sentence
	 * @return the sentence built from all segments read, including every
	 *         alternative found inside {@code choice} elements
	 * @throws TEIException if the sentence opens with a {@code choice} that
	 *             yields no segments, or if a segment is invalid
	 */
	private TEISentence readNextSent(TEIParagraph par, int lastOffset)
			throws XMLStreamException, TEIException {
		String sentId = in.getXmlId();
		List<TEISegment> segs = new ArrayList<TEISegment>();
		in.nextTag();
		int currLastOffset = lastOffset;
		while (!in.isEnd()) {
			if (in.isStart("seg")) {
				segs.add(readNextSegment(par, currLastOffset));
			} else if (in.isStart("choice")) {
				segs.addAll(readChoice(par, currLastOffset));
			} else {
				in.fail("seg or choice tag start");
			}
			if (segs.isEmpty()) {
				// Previously surfaced as an IndexOutOfBoundsException for
				// index -1; report what is actually wrong instead.
				throw new TEIException(String.format(
						"No segments read for sentence '%s' at %s", sentId,
						currentLocation()));
			}
			currLastOffset = endOffset(segs.get(segs.size() - 1));
			in.nextTag();
		}
		return ef.createSentence(sentId, segs);
	}

	/**
	 * Reads a {@code choice} element. Every alternative is read against the
	 * same {@code lastOffset}. Children are numbered from 0 in document order;
	 * that number is recorded only on segments that come from a {@code paren}
	 * child, not on direct {@code seg} children.
	 *
	 * @param par        paragraph currently being read
	 * @param lastOffset end offset of the segment preceding the choice
	 * @return segments of all alternatives, in document order
	 */
	private List<TEISegment> readChoice(TEIParagraph par, int lastOffset)
			throws XMLStreamException, TEIException {
		List<TEISegment> res = new ArrayList<TEISegment>();
		int choiceNum = 0;
		in.nextTag();
		while (!in.isEnd()) {
			if (in.isStart("seg")) {
				res.add(readNextSegment(par, lastOffset));
			} else if (in.isStart("paren")) {
				res.addAll(readParen(par, choiceNum, lastOffset));
			} else {
				in.fail("seg or nkjp:paren start tag");
			}
			choiceNum++;
			in.nextTag();
		}

		return res;
	}

	/**
	 * Reads a {@code paren} element: a run of consecutive segments that
	 * together form one alternative of a choice.
	 *
	 * @param par        paragraph currently being read
	 * @param choiceNum  number of this alternative, stored on every segment
	 * @param lastOffset end offset of the segment preceding the choice
	 * @return the segments of this alternative, in document order
	 */
	private List<TEISegment> readParen(TEIParagraph par, int choiceNum,
			int lastOffset) throws XMLStreamException, TEIException {
		List<TEISegment> res = new ArrayList<TEISegment>();
		in.nextTag();
		int currLastOffset = lastOffset;
		while (!in.isEnd()) {
			in.requireStart("seg");
			TEISegment seg = readNextSegment(par, currLastOffset);
			seg.setChoiceNum(choiceNum);
			res.add(seg);
			currLastOffset = endOffset(seg);
			in.nextTag();
		}

		return res;
	}

	/**
	 * Reads a single {@code seg} element and consumes its end tag.
	 *
	 * @param par        paragraph currently being read; used directly when
	 *                   its id equals the id named in {@code corresp},
	 *                   otherwise the subparagraph with that id is looked up
	 * @param lastOffset end offset of the preceding segment
	 * @return the created segment; its {@code nps} flag is set when it starts
	 *         exactly at {@code lastOffset}
	 * @throws TEIException if the {@code corresp} attribute does not contain
	 *             a valid {@code string-range(...)} expression
	 */
	private TEISegment readNextSegment(TEIParagraph par, int lastOffset)
			throws XMLStreamException, TEIException {
		String id = in.getXmlId();
		boolean rejected = in.getBoolNKJPAttr("rejected");
		in.requireAttr("corresp");
		String corresp = in.getAttr("corresp");
		Matcher m = CORRESP_PATTERN.matcher(corresp);
		if (!m.find()) {
			throw new TEIException(String.format(
					"Invalid corresp: '%s' at %s", corresp, currentLocation()));
		}
		String parId = m.group(1);
		// NOTE(review): the result of getSubparagraph is used unchecked;
		// confirm how it reports an unknown id.
		TEIParagraph correspPar = par.getId().equals(parId) ? par : par
				.getSubparagraph(parId);
		int offset = parseOffset(m.group(2));
		int length = Integer.parseInt(m.group(3));
		// The flag is derived from adjacency to the previous segment rather
		// than read from an attribute.
		boolean nps = offset == lastOffset;

		in.nextTag();
		in.requireEnd();

		TEISegment seg = ef.createSegment(correspPar, id, offset, length, nps);
		seg.setRejected(rejected);
		return seg;
	}

	/**
	 * Parses the offset part of a {@code string-range} expression after
	 * removing any commas from it.
	 * <p>
	 * NOTE(review): a two-part offset such as {@code "1,234"} is therefore
	 * read as 1234 - confirm that this is the intended interpretation.
	 *
	 * @param offsetStr digits, optionally containing a comma
	 * @return the parsed offset
	 */
	private int parseOffset(String offsetStr) {
		return Integer.parseInt(offsetStr.replace(",", ""));
	}

	/**
	 * Returns the last chosen segment of the last sentence in {@code sents}.
	 *
	 * @param sents non-empty list of the sentences read so far
	 * @return the final element of the last sentence's chosen segments
	 * @throws TEIException if that sentence has no chosen segments
	 */
	private TEISegment getLastSegment(List<TEISentence> sents)
			throws TEIException {
		TEISentence lastSent = sents.get(sents.size() - 1);
		int count = lastSent.getChosenSegments().size();
		if (count == 0) {
			throw new TEIException(String.format(
					"Sentence without chosen segments at %s",
					currentLocation()));
		}
		return lastSent.getChosenSegments().get(count - 1);
	}

	/** Returns the offset just past the end of {@code seg}. */
	private static int endOffset(TEISegment seg) {
		return seg.getOffset() + seg.getLength();
	}

	/** Formats the reader's current position as {@code line:column}. */
	private String currentLocation() {
		return in.getLocation().getLineNumber() + ":"
				+ in.getLocation().getColumnNumber();
	}
}