ThriftLoader.java
6.98 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
package pl.waw.ipipan.zil.core.md.io.thrift;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.multiservice.thrift.types.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Converts the Thrift representation of a text ({@link TText}) into the
 * internal entity model ({@link Text}, {@link Paragraph}, {@link Sentence},
 * {@link Token}, ...).
 *
 * <p>All methods are static; the only public entry point is
 * {@link #loadTextFromThrift(TText)}.
 */
public class ThriftLoader {

    private static final Logger logger = LoggerFactory.getLogger(ThriftLoader.class);

    /**
     * Builds a {@link Text} from the given Thrift text.
     *
     * <p>The text id is read from the Thrift text header; when the header is
     * {@code null} the literal string {@code "null"} is used as the id.
     *
     * @param thriftText Thrift text to convert
     * @return the converted text, with one paragraph per Thrift paragraph
     * @throws MultiserviceException if some token has neither a usable chosen
     *         interpretation nor a usable first candidate interpretation
     */
    public static Text loadTextFromThrift(TText thriftText)
            throws MultiserviceException {
        Text text = new Text(thriftText.getTextHeader() == null ? "null"
                : thriftText.getTextHeader().getId());

        // Parameterized logging: the message is only formatted when debug is enabled.
        logger.debug("Loading text {} from thrift format...", text.getId());
        for (TParagraph thriftParagraph : thriftText.getParagraphs()) {
            loadParagraph(text, thriftParagraph);
        }
        logger.debug("Thrift text loaded.");

        return text;
    }

    /**
     * Appends a new {@link Paragraph} to {@code text} and fills it with the
     * sentences of the given Thrift paragraph.
     */
    private static void loadParagraph(Text text, TParagraph teiP)
            throws MultiserviceException {
        Paragraph paragraph = new Paragraph();
        text.add(paragraph);

        for (TSentence thriftSentence : teiP.getSentences()) {
            loadSentence(paragraph, thriftSentence);
        }
    }

    /**
     * Appends a new {@link Sentence} to {@code p} and loads, in this order,
     * its tokens, named entities, syntactic words and syntactic groups.
     * Names, words and groups are only read when the corresponding Thrift
     * field is set.
     */
    private static void loadSentence(Paragraph p, TSentence thriftSent)
            throws MultiserviceException {
        Sentence sentence = new Sentence();
        p.add(sentence);

        Map<String, Object> thriftId2Entity = getThriftId2EntityMap(thriftSent);

        // Remember which internal Token was created for each Thrift token id,
        // so that higher-level structures can be resolved down to tokens.
        Map<String, Token> thriftTokenId2Token = new HashMap<>();
        for (TToken thriftToken : thriftSent.getTokens()) {
            Token token = loadToken(sentence, thriftToken);
            thriftTokenId2Token.put(thriftToken.getId(), token);
        }

        if (thriftSent.isSetNames()) {
            for (TNamedEntity ne : thriftSent.getNames()) {
                loadNE(sentence, ne, thriftId2Entity, thriftTokenId2Token);
            }
        }
        if (thriftSent.isSetWords()) {
            for (TSyntacticWord word : thriftSent.getWords()) {
                loadSyntacticWord(sentence, word, thriftId2Entity, thriftTokenId2Token);
            }
        }
        if (thriftSent.isSetGroups()) {
            for (TSyntacticGroup group : thriftSent.getGroups()) {
                loadSyntacticGroup(sentence, group, thriftId2Entity, thriftTokenId2Token);
            }
        }
    }

    /**
     * Adds a {@link SyntacticGroup} to the sentence, built from the group's
     * type, all tokens it spans and the tokens reached through its semantic
     * head.
     */
    private static void loadSyntacticGroup(Sentence s, TSyntacticGroup g,
            Map<String, Object> thriftId2Entity,
            Map<String, Token> thriftTokenId2Token) {
        String type = g.getType();
        List<Token> tokens = getUnderlyingSegments(g, thriftId2Entity,
                thriftTokenId2Token, false);
        List<Token> headTokens = getUnderlyingSegments(g, thriftId2Entity,
                thriftTokenId2Token, true);
        s.addSyntacticGroup(new SyntacticGroup(type, tokens, headTokens));
    }

    /**
     * Adds a {@link SyntacticWord} to the sentence, built from the ctag of
     * the word's chosen interpretation and the tokens it spans.
     */
    private static void loadSyntacticWord(Sentence s, TSyntacticWord w,
            Map<String, Object> thriftId2Entity,
            Map<String, Token> thriftTokenId2Token) {
        String ctag = w.getChosenInterpretation().getCtag();
        List<Token> tokens = getUnderlyingSegments(w, thriftId2Entity,
                thriftTokenId2Token, false);
        s.addSyntacticWord(new SyntacticWord(ctag, tokens));
    }

    /**
     * Adds a {@link NamedEntity} to the sentence, built from the tokens the
     * Thrift named entity spans.
     */
    private static void loadNE(Sentence s, TNamedEntity ne,
            Map<String, Object> thriftId2Entity,
            Map<String, Token> thriftTokenId2Token) {
        List<Token> tokens = getUnderlyingSegments(ne, thriftId2Entity,
                thriftTokenId2Token, false);
        s.addNamedEntity(new NamedEntity(tokens));
    }

    /**
     * Indexes the Thrift objects of a sentence by their id: tokens first,
     * then (when set) syntactic words, named entities and syntactic groups.
     * On an id collision the entry inserted later replaces the earlier one.
     *
     * @return map from Thrift id to the Thrift object carrying that id
     */
    private static Map<String, Object> getThriftId2EntityMap(
            TSentence thriftSent) {
        Map<String, Object> idToEntity = new HashMap<>();

        for (TToken token : thriftSent.getTokens()) {
            idToEntity.put(token.getId(), token);
        }
        if (thriftSent.isSetWords()) {
            for (TSyntacticWord word : thriftSent.getWords()) {
                idToEntity.put(word.getId(), word);
            }
        }
        if (thriftSent.isSetNames()) {
            for (TNamedEntity ne : thriftSent.getNames()) {
                idToEntity.put(ne.getId(), ne);
            }
        }
        if (thriftSent.isSetGroups()) {
            for (TSyntacticGroup group : thriftSent.getGroups()) {
                idToEntity.put(group.getId(), group);
            }
        }

        return idToEntity;
    }

    /**
     * Creates a {@link Token} for the given Thrift token, appends it to the
     * sentence and copies its orth, its chosen interpretation and all of its
     * interpretations.
     *
     * @throws MultiserviceException if no usable chosen or candidate
     *         interpretation exists for the token
     */
    private static Token loadToken(Sentence s, TToken teiM)
            throws MultiserviceException {
        Token token = new Token();
        // The token is attached to the sentence before interpretations are
        // resolved, so it is already present if resolution fails below.
        s.add(token);

        token.setOrth(teiM.getOrth());

        TInterpretation chosen = getTokenChosenInt(teiM);
        token.addChosenInterpretation(new Interpretation(
                chosen.getCtag(), chosen.getMsd(), chosen.getBase()));

        // The list may be null (presumably when the field is unset in the
        // Thrift struct); treat that as "no interpretations".
        if (teiM.getInterpretations() != null) {
            for (TInterpretation thriftInterp : teiM.getInterpretations()) {
                // Each interpretation keeps its OWN base form; previously the
                // base of the chosen interpretation was copied into all of them.
                token.addInterpretation(new Interpretation(
                        thriftInterp.getCtag(), thriftInterp.getMsd(),
                        thriftInterp.getBase()));
            }
        }

        return token;
    }

    /**
     * Returns the interpretation to treat as chosen for the token: the
     * token's chosen interpretation when it has a non-empty base, otherwise
     * the first candidate interpretation when that one has a non-empty base.
     *
     * @throws MultiserviceException if neither of the two is usable
     */
    private static TInterpretation getTokenChosenInt(TToken token)
            throws MultiserviceException {
        TInterpretation chosen = token.getChosenInterpretation();
        if (hasBase(chosen)) {
            return chosen;
        }

        List<TInterpretation> candidates = token.getCandidateInterpretations();
        if (candidates == null || candidates.isEmpty()
                || !hasBase(candidates.get(0))) {
            throw new MultiserviceException(
                    "No proper chosen or candidate interpretation for segment: "
                            + token.getId());
        }
        return candidates.get(0);
    }

    /** Tells whether the interpretation is non-null and has a non-empty base. */
    private static boolean hasBase(TInterpretation interp) {
        return interp != null && interp.getBase() != null
                && !"".equals(interp.getBase());
    }

    /**
     * Recursively resolves a Thrift entity down to the internal tokens it
     * consists of.
     *
     * <ul>
     * <li>A {@link TToken} resolves to the token registered for its id.</li>
     * <li>A {@link TSyntacticWord} or {@link TNamedEntity} resolves to the
     * tokens of all its children.</li>
     * <li>A {@link TSyntacticGroup} resolves to the tokens of its semantic
     * head when {@code headsOnly} is set, otherwise to the tokens of all its
     * children.</li>
     * <li>Anything else (including {@code null}, e.g. an id missing from
     * {@code idToEntity}) resolves to an empty list.</li>
     * </ul>
     *
     * @param entity          Thrift object to resolve, may be {@code null}
     * @param idToEntity      Thrift id to Thrift object index of the sentence
     * @param tokenId2Segment Thrift token id to internal token index
     * @param headsOnly       whether groups are followed through their
     *                        semantic head only
     * @return tokens in child order; never {@code null}
     */
    private static List<Token> getUnderlyingSegments(Object entity,
            Map<String, Object> idToEntity, Map<String, Token> tokenId2Segment,
            boolean headsOnly) {
        List<Token> result = new ArrayList<>();

        if (entity instanceof TToken) {
            result.add(tokenId2Segment.get(((TToken) entity).getId()));
            return result;
        }

        List<String> childIds = null;
        if (entity instanceof TSyntacticWord) {
            childIds = ((TSyntacticWord) entity).getChildIds();
        } else if (entity instanceof TNamedEntity) {
            childIds = ((TNamedEntity) entity).getChildIds();
        } else if (entity instanceof TSyntacticGroup) {
            TSyntacticGroup group = (TSyntacticGroup) entity;
            if (headsOnly) {
                childIds = new ArrayList<>();
                childIds.add(group.getSemanticHeadId());
            } else {
                childIds = group.getChildIds();
            }
        }

        // Unknown entity type or a missing child list: nothing to descend into.
        if (childIds == null) {
            return result;
        }

        for (String childId : childIds) {
            result.addAll(getUnderlyingSegments(idToEntity.get(childId),
                    idToEntity, tokenId2Segment, headsOnly));
        }
        return result;
    }
}