PackageReader.java
3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
package ipipan.clarin.tei.impl.io.read;
import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.TEICorpusText;
import ipipan.clarin.tei.api.entities.TEIHeader;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.impl.entities.TEICorpusTextImpl;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.List;
import javax.xml.stream.XMLStreamException;
/**
 * Reads a complete TEI corpus text from a single character stream.
 *
 * <p>The input is consumed strictly in order: first a {@code teiHeader} that is
 * read as the corpus header, then the paragraph structure, and finally any
 * number of annotation layers, each introduced by its own {@code teiHeader}
 * and followed by a {@code body} element.
 *
 * <p>Instances are created with {@link #getInstance(Reader)} and should be
 * released with {@link #close()}.
 *
 * @author mlenart
 */
public class PackageReader {

    /** Name of the element that starts the corpus header and every layer header. */
    private static final String HEADER_ELEMENT = "teiHeader";

    /** Name of the element that starts the content of an annotation layer. */
    private static final String BODY_ELEMENT = "body";

    /** Shared cursor over the input; also handed to all delegate readers. */
    private final InWrapper in;

    /** Reads the corpus header and the per-layer headers from {@link #in}. */
    private final PackageHeaderReader headerReader;

    /**
     * Creates a reader over the given character stream.
     *
     * @param reader source of the package XML
     * @return a new reader positioned at the start of the input
     * @throws TEIException if the underlying input wrapper cannot be created
     */
    public static PackageReader getInstance(Reader reader) throws TEIException {
        return new PackageReader(reader);
    }

    /**
     * Wraps {@code reader} in an {@link InWrapper} and prepares the header
     * reader that shares it.
     *
     * @param reader source of the package XML
     * @throws TEIException wrapping any {@link XMLStreamException} or
     *         {@link IOException} raised while opening the input
     */
    private PackageReader(Reader reader) throws TEIException {
        try {
            // NOTE(review): the meaning of the second InWrapper argument is not
            // visible from this class; it is deliberately passed as null here.
            in = new InWrapper(reader, null);
        } catch (XMLStreamException ex) {
            throw new TEIException(ex);
        } catch (IOException ex) {
            throw new TEIException(ex);
        }
        headerReader = new PackageHeaderReader(in);
    }

    /**
     * Reads the whole corpus text from the input.
     *
     * @return the corpus text with its corpus header, annotation layers and
     *         (when available) paragraphs
     * @throws TEIException if the package header or a layer body is missing,
     *         if a delegate reader fails, or if the XML stream reports an
     *         error (the {@link XMLStreamException} is wrapped)
     */
    public TEICorpusText readCorpusText() throws TEIException {
        try {
            return doReadCorpusText();
        } catch (XMLStreamException ex) {
            throw new TEIException(ex);
        }
    }

    /**
     * Closes the underlying input wrapper.
     *
     * @throws TEIException wrapping any {@link XMLStreamException} raised on close
     */
    public void close() throws TEIException {
        try {
            in.close();
        } catch (XMLStreamException ex) {
            throw new TEIException(ex);
        }
    }

    /**
     * Performs the actual read: corpus header, paragraph structure, then every
     * annotation layer found in the remaining input.
     *
     * @return the populated corpus text
     * @throws XMLStreamException if the XML stream reports an error
     * @throws TEIException if the package header or a layer body is missing,
     *         or a delegate reader fails
     */
    private TEICorpusText doReadCorpusText() throws XMLStreamException,
            TEIException {
        TEICorpusTextImpl tei = new TEICorpusTextImpl();

        if (!hasNextHeader()) {
            throw new TEIException("Missing package header");
        }
        tei.setCorpusHeader(headerReader.readCorpusHeader());

        // Collect the paragraph skeleton first; each layer read below is
        // given these same paragraph objects.
        List<TEIParagraph> pars = new ArrayList<TEIParagraph>();
        TextStructureReader tsr = new TextStructureReader(in);
        while (tsr.hasNextParagraph()) {
            pars.add(tsr.getNextParagraph());
        }

        boolean parsAdded = false;
        while (hasNextHeader()) {
            TEIHeader header = headerReader.readHeader();
            tei.addAnnotationLayer(header.getLayer(), header);
            readLayer(pars, header.getLayer(), tei);

            // Paragraphs are attached to the text exactly once, and only after
            // a layer has left sentences on the first paragraph. If no layer
            // does so, the text ends up without paragraphs.
            if (!parsAdded && !pars.isEmpty()
                    && pars.get(0).getSentences() != null) {
                for (TEIParagraph par : pars) {
                    tei.addParagraph(par);
                }
                parsAdded = true;
            }
        }
        return tei;
    }

    /**
     * Reads the body of one annotation layer.
     *
     * <p>Advances to the next {@code body} start element, passes every
     * paragraph in order to the layer's {@link BodyReader}, and then lets that
     * reader process the body against {@code text}.
     *
     * @param pars paragraphs of the text, in document order
     * @param annotationLayer layer announced by the preceding header
     * @param text corpus text being assembled
     * @throws TEIException if the input ends before a {@code body} element is
     *         found, or the body reader fails
     * @throws XMLStreamException if the XML stream reports an error
     */
    private void readLayer(List<TEIParagraph> pars,
            AnnotationLayer annotationLayer, TEICorpusText text)
            throws TEIException, XMLStreamException {
        // Bound the scan the same way hasNextHeader() does, so that a header
        // without a following body is reported explicitly instead of calling
        // next() past the end of the input.
        while (in.hasNext() && !in.isStart(BODY_ELEMENT)) {
            in.next();
        }
        if (!in.isStart(BODY_ELEMENT)) {
            throw new TEIException(
                    "Missing body for annotation layer " + annotationLayer);
        }

        BodyReader bodyReader = BodyReader.create(in, annotationLayer);
        for (TEIParagraph par : pars) {
            bodyReader.readNextParagraph(par);
        }
        bodyReader.readBody(text);
    }

    /**
     * Advances the input until it is positioned on a {@code teiHeader} start
     * element or no further events are available.
     *
     * @return {@code true} if the input is now on a {@code teiHeader} start
     *         element, {@code false} otherwise
     * @throws XMLStreamException if the XML stream reports an error
     */
    private boolean hasNextHeader() throws XMLStreamException {
        while (in.hasNext() && !in.isStart(HEADER_ELEMENT)) {
            in.next();
        }
        return in.isStart(HEADER_ELEMENT);
    }
}