GroupsReader.java
3.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package ipipan.clarin.tei.impl.io.read;
import ipipan.clarin.tei.api.entities.AnnotationLayer;
import ipipan.clarin.tei.api.entities.TEIGroup;
import ipipan.clarin.tei.api.entities.TEIParagraph;
import ipipan.clarin.tei.api.entities.TEISentence;
import ipipan.clarin.tei.api.entities.TEISyntacticEntity;
import ipipan.clarin.tei.api.entities.TEIWord;
import ipipan.clarin.tei.api.exceptions.TEIException;
import ipipan.clarin.tei.impl.io.IdValuePair;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.stream.XMLStreamException;
/**
 * Reader for the syntactic groups annotation layer
 * ({@link AnnotationLayer#GROUPS}).
 *
 * <p>For every paragraph handed to it, the reader walks the paragraph's
 * already existing sentences in order, parses the {@code seg} elements of
 * the corresponding {@code s} element and stores the resulting root
 * {@link TEIGroup}s on the sentence.
 *
 * @author mlenart
 */
public class GroupsReader extends BodyReader {

    GroupsReader(InWrapper in) {
        super(in);
    }

    /**
     * Advances the input to the next paragraph start and reads one
     * {@code s} element for each sentence of {@code par}.
     *
     * <p>On success the paragraph receives the XML id found on the
     * paragraph element as its id for the groups layer.
     *
     * @param par paragraph whose sentences are to be filled with groups
     * @throws TEIException wrapping any exception raised while reading;
     *         the original exception is kept as the cause
     */
    @Override
    protected void readNextParagraph(TEIParagraph par) throws TEIException {
        try {
            // Skip everything that precedes the paragraph start.
            while (!in.isStartParagraph()) {
                in.next();
            }
            final String paragraphId = in.getXmlId();

            // One <s> element is expected per sentence, in the same order.
            for (TEISentence sentence : par.getSentences()) {
                in.nextTag();
                in.requireStart("s");
                readNextSent(sentence);
            }

            in.nextTag();
            in.requireEnd(); // p

            // The id is assigned only after the whole paragraph was read.
            par.setId(AnnotationLayer.GROUPS, paragraphId);
        } catch (Exception ex) {
            throw new TEIException("Error in groups: " + ex.getMessage(), ex);
        }
    }

    /**
     * Reads the {@code seg} children of the current {@code s} element and
     * attaches the groups built from them to {@code sent}.
     *
     * @param sent sentence that receives the groups and the groups-layer id
     * @return the same {@code sent} instance
     * @throws XMLStreamException if the underlying input reports an error
     * @throws TEIException if the groups cannot be built from what was read
     */
    private TEISentence readNextSent(TEISentence sent)
            throws XMLStreamException, TEIException {
        final String sentenceId = in.getXmlId();
        final List<GroupBuilder> pending = new LinkedList<GroupBuilder>();

        // Step onto the first child, then onto whatever follows each <seg>,
        // until the end tag of the sentence is reached.
        for (in.nextTag(); !in.isEnd(); in.nextTag()) {
            in.requireStart("seg");
            pending.add(readGroupBuilder());
        }
        in.requireEnd(); // s

        final List<TEIGroup> rootGroups = getGroupsFromBuilders(pending, sent);
        sent.setSyntacticGroups(rootGroups);
        sent.setId(AnnotationLayer.GROUPS, sentenceId);
        return sent;
    }

    /**
     * Turns the collected builders into groups and returns only those
     * groups that are not a child of another group.
     *
     * <p>The lookup table handed to each builder starts out with all words
     * of the sentence and grows by every group built so far. Builders are
     * processed in reverse order (the list is reversed in place), while the
     * result keeps the original order.
     * NOTE(review): this presumably relies on a group referring only to
     * groups that appear after it in the sentence - confirm against the
     * data format.
     *
     * @param builders builders in reading order; reversed in place
     * @param sent sentence providing the words the groups may point to
     * @return root groups in reading order
     * @throws TEIException if a builder fails to produce its group
     */
    private List<TEIGroup> getGroupsFromBuilders(List<GroupBuilder> builders,
            TEISentence sent) throws TEIException {
        final Map<String, TEISyntacticEntity> entitiesById =
                new LinkedHashMap<String, TEISyntacticEntity>();
        for (TEIWord word : sent.getAllWords()) {
            entitiesById.put(word.getId(), word);
        }

        final List<TEIGroup> groups = new LinkedList<TEIGroup>();
        Collections.reverse(builders);
        for (GroupBuilder builder : builders) {
            final TEIGroup built = builder.getGroup(entitiesById);
            // Prepending undoes the reversal, restoring reading order.
            groups.add(0, built);
            entitiesById.put(built.getId(), built);
        }

        retainRootGroups(groups);
        return groups;
    }

    /**
     * Removes from {@code groups} every element that is listed as a child
     * of any group in the list.
     *
     * @param groups list to prune in place
     */
    private void retainRootGroups(List<TEIGroup> groups) {
        final Set<TEISyntacticEntity> nested =
                new LinkedHashSet<TEISyntacticEntity>();
        for (TEIGroup candidateParent : groups) {
            nested.addAll(candidateParent.getChildren());
        }
        groups.removeAll(nested);
    }

    /**
     * Reads one {@code seg} element: its {@code group} feature structure
     * followed by its pointers.
     *
     * @return builder holding the id, features and pointer targets read
     * @throws XMLStreamException if the underlying input reports an error
     */
    private GroupBuilder readGroupBuilder() throws XMLStreamException {
        in.requireStart("seg");
        final GroupBuilder builder = new GroupBuilder(in.getXmlId());

        in.nextTag();
        in.requireStartFS("group");
        // Visit the children of the feature structure one tag at a time.
        for (in.nextTag(); !in.isEndFS(); in.nextTag()) {
            readFeatureInto(builder);
        }

        // Move past the end of the feature structure before reading ptrs.
        in.nextTag();
        builder.setPtrs(readGroupPtrTargets());
        in.requireEnd(); // seg
        return builder;
    }

    /**
     * Stores the feature at the current position in {@code builder} if it
     * is one of {@code orth}, {@code type}, {@code semh} or {@code synh}.
     * Any other position is left unread by this method.
     *
     * @param builder builder receiving the feature value
     * @throws XMLStreamException if the underlying input reports an error
     */
    private void readFeatureInto(GroupBuilder builder)
            throws XMLStreamException {
        if (in.isStartF("orth")) {
            builder.setOrth(in.readStringF().getValue());
            return;
        }
        if (in.isStartF("type")) {
            builder.setType(in.readSymbolF().getValue());
            return;
        }
        if (in.isStartF("semh")) {
            builder.setSemHead(in.readFValue());
            return;
        }
        if (in.isStartF("synh")) {
            builder.setSynHead(in.readFValue());
        }
    }

    /**
     * Reads the pointers at the current position and returns the id of
     * each one, in reading order.
     *
     * @return ids of the pointers read
     * @throws XMLStreamException if the underlying input reports an error
     */
    private List<String> readGroupPtrTargets() throws XMLStreamException {
        final List<String> targets = new LinkedList<String>();
        for (IdValuePair pointer : PtrHelper.readPtrsWithTypes(in)) {
            targets.add(pointer.getId());
        }
        return targets;
    }
}