TeiSaver.java
4.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
package pl.waw.ipipan.zil.core.md.io.tei;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import pl.waw.ipipan.zil.core.md.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.entities.*;
import pl.waw.ipipan.zil.nkjp.teiapi.api.exceptions.TEIException;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO;
import pl.waw.ipipan.zil.nkjp.teiapi.api.io.TEI_IO.CompressionMethod;
import java.io.File;
import java.util.*;
/**
 * Static helpers that copy mention annotations from the internal {@link Text} model
 * into a {@link TEICorpusText} and write the TEI text to disk.
 *
 * <p>The class is a pure utility holder and cannot be instantiated.
 */
public final class TeiSaver {

    private static final Logger logger = LoggerFactory.getLogger(TeiSaver.class);
    private static final TEI_IO teiAPI = TEI_IO.getInstance();
    private static final EntitiesFactory ef = EntitiesFactory.getInstance();

    private TeiSaver() {
    }

    /**
     * Writes the given TEI text into {@code targetDir} through the TEI API.
     *
     * @param teiText   the TEI text to write
     * @param targetDir the directory handed to {@code writeToNKJPDirectory}
     * @param gzip      {@code true} selects {@code CompressionMethod.GZIP},
     *                  {@code false} selects {@code CompressionMethod.NONE}
     * @throws TEIException propagated from the TEI API
     */
    public static void saveTeiText(TEICorpusText teiText, File targetDir, boolean gzip) throws TEIException {
        logger.debug("Saving text in {}", targetDir);
        CompressionMethod cm = gzip ? CompressionMethod.GZIP : CompressionMethod.NONE;
        teiAPI.writeToNKJPDirectory(teiText, targetDir, cm);
    }

    /**
     * Replaces the mentions of {@code teiText} with the mentions found in {@code t}.
     *
     * <p>Paragraphs, sentences and tokens of both texts are walked in parallel and
     * matched purely by position. Mentions receive ids of the form
     * {@code mention_<n>}, where {@code n} starts at 0 and keeps increasing across
     * the whole text. Afterwards the mentions annotation layer is registered and any
     * existing coreference annotation is dropped.
     *
     * @param t       source text holding the mentions to export
     * @param teiText TEI text that is modified in place
     * @throws TEIException if the two texts differ in the number of paragraphs,
     *                      sentences or tokens at any level
     */
    public static void updateTeiText(Text t, TEICorpusText teiText) throws TEIException {
        Iterator<Paragraph> pIt = t.iterator();
        Iterator<TEIParagraph> pItTei = teiText.getParagraphs().iterator();

        int mentionId = 0;
        while (pIt.hasNext() && pItTei.hasNext()) {
            mentionId = updateTeiParagraph(mentionId, pIt.next(), pItTei.next());
        }
        checkIterators(pIt, pItTei, "paragraph");

        teiText.addAnnotationLayer(AnnotationLayer.MENTIONS, ef.createHeader(AnnotationLayer.MENTIONS));

        // The mentions were just replaced, so any existing coreference annotation
        // refers to mentions that no longer exist; drop it.
        teiText.getAnnotationLayers().remove(AnnotationLayer.COREFERENCE);
        teiText.setCoreferences(new ArrayList<TEICoreference>());

        logger.debug("{} mentions added", mentionId);
    }

    /**
     * Updates every sentence of one TEI paragraph from the matching source paragraph.
     *
     * @param mentionId next free mention number
     * @param p         source paragraph
     * @param pTei      TEI paragraph at the same position
     * @return the next free mention number after this paragraph
     * @throws TEIException if the sentence counts differ, or from a nested level
     */
    private static int updateTeiParagraph(int mentionId, Paragraph p, TEIParagraph pTei) throws TEIException {
        Iterator<Sentence> sIt = p.iterator();
        Iterator<TEISentence> sItTei = pTei.getSentences().iterator();

        while (sIt.hasNext() && sItTei.hasNext()) {
            mentionId = updateTeiSentence(mentionId, sIt.next(), sItTei.next());
        }
        checkIterators(sIt, sItTei, "sentence");

        return mentionId;
    }

    /**
     * Clears the mentions of one TEI sentence and rebuilds them from the source sentence.
     *
     * @param mentionId next free mention number
     * @param s         source sentence
     * @param sTei      TEI sentence at the same position
     * @return the next free mention number after this sentence
     * @throws TEIException if the token counts differ
     */
    private static int updateTeiSentence(int mentionId, Sentence s, TEISentence sTei) throws TEIException {
        sTei.getAllMentions().clear();

        // Pair each source token with the TEI morph at the same position.
        Map<Token, TEIMorph> seg2morph = new HashMap<>();
        Iterator<Token> segIt = s.iterator();
        Iterator<TEIMorph> segItTei = sTei.getMorphs().iterator();
        while (segIt.hasNext() && segItTei.hasNext()) {
            seg2morph.put(segIt.next(), segItTei.next());
        }
        checkIterators(segIt, segItTei, "token");

        List<TEIMention> mentions = new ArrayList<>();
        for (Mention m : s.getMentions()) {
            // NOTE(review): seg2morph.get(...) yields null for a token that is not part
            // of this sentence; confirm whether the factory tolerates null morphs.
            List<TEIMorph> morphs = new ArrayList<>();
            for (Token seg : m.getSegments()) {
                morphs.add(seg2morph.get(seg));
            }

            List<TEIMorph> heads = new ArrayList<>();
            for (Token seg : m.getHeadSegments()) {
                heads.add(seg2morph.get(seg));
            }

            mentions.add(ef.createMention("mention_" + mentionId++, morphs, heads, m.isZeroSubject()));
        }
        sTei.setMentions(mentions);

        return mentionId;
    }

    /**
     * Verifies that two iterators walked in parallel were exhausted together.
     *
     * @param one   first iterator
     * @param other second iterator
     * @param level name of the structural level, used in the error message
     * @throws TEIException if either iterator still has elements left
     */
    private static void checkIterators(Iterator<?> one, Iterator<?> other, String level)
            throws TEIException {
        if (one.hasNext() || other.hasNext()) {
            throw new TEIException("Problem mapping tei to thrift for level " + level);
        }
    }
}