#!/usr/bin/env python
# BC2GMtoStandoff.py
# Converts the BioCreative 2 Gene Mention task data into brat-flavored
# standoff format.
from __future__ import with_statement
import sys
import re
import os
def char_offsets(text, start, end, ttext):
    """Convert space-ignoring BC2 offsets into character-based offsets.

    BC2 data gives offsets that ignore whitespace and are inclusive of
    the last character. The returned offsets count every character of
    ``text`` and are exclusive of the last character (ala BioNLP
    ST/brat), so they can be used directly as a slice.

    Args:
        text: the sentence text.
        start: offset of the first tagged character, ignoring space.
        end: offset of the last tagged character (inclusive), ignoring
            space.
        ttext: the tagged text, used as a reference for the sanity
            check.

    Returns:
        A ``(char_start, char_end)`` tuple for which
        ``text[char_start:char_end] == ttext``.

    Raises:
        AssertionError: if ``start`` lies beyond the non-space
            characters of ``text``, or if the recovered span does not
            match ``ttext``.
    """
    text_len = len(text)

    # scan to start offset
    idx, nospcidx = 0, 0
    while True:
        while idx < text_len and text[idx].isspace():
            idx += 1
        # Raised explicitly instead of using an assert statement so the
        # check is not stripped when running under "python -O".
        if idx >= text_len:
            raise AssertionError("Error in data")
        if nospcidx == start:
            break
        nospcidx += 1
        idx += 1

    char_start = idx

    # scan to end offset; idx ends on the last tagged character
    while nospcidx < end:
        nospcidx += 1
        idx += 1
        while idx < text_len and text[idx].isspace():
            idx += 1

    # input end offset is inclusive, returned end offset is exclusive
    char_end = idx + 1

    # special case allowing for slight adjustment for known error in
    # BC2 data
    if (text[char_start:char_end] == '/translation upstream factor' and
            ttext == 'translation upstream factor'):
        # sys.stderr.write works on both Python 2 and Python 3, unlike
        # the "print >> stream" statement form.
        sys.stderr.write("NOTE: applying special-case fix ...\n")
        char_start += 1

    # sanity
    ref_text = text[char_start:char_end]
    if ref_text != ttext:
        raise AssertionError(
            "Mismatch: '%s' vs '%s' [%d:%d] (%s %d-%d)" % (ttext, ref_text, char_start, char_end, text, start, end))

    return char_start, char_end
def _open_for_reading(path):
    """Open ``path`` for text reading with universal newlines."""
    # Python 3 text mode already translates newlines, and the explicit
    # 'U' flag was removed in Python 3.11; Python 2 still needs it.
    if sys.version_info[0] >= 3:
        return open(path, 'r')
    return open(path, 'rU')


def _read_tags(tagfn):
    """Read the BC2 tag file into {sentence ID: [(start, end, text), ...]}."""
    tags = {}
    with _open_for_reading(tagfn) as tagf:
        for line in tagf:
            line = line.rstrip('\n')
            m = re.match(r'^([^\|]+)\|(\d+) (\d+)\|(.*)$', line)
            # Explicit raise (not an assert statement) so malformed
            # input is still rejected under "python -O".
            if not m:
                raise AssertionError("Format error in %s: %s" % (tagfn, line))
            sid, start, end, text = m.groups()
            tags.setdefault(sid, []).append((int(start), int(end), text))
    return tags


def _read_sentences(textfn):
    """Read the BC2 sentence file into {sentence ID: sentence text}."""
    texts = {}
    with _open_for_reading(textfn) as textf:
        for line in textf:
            line = line.rstrip('\n')
            m = re.match(r'(\S+) (.*)$', line)
            if not m:
                raise AssertionError("Format error in %s: %s" % (textfn, line))
            sid, text = m.groups()
            if sid in texts:
                raise AssertionError("Error: duplicate ID %s" % sid)
            texts[sid] = text
    return texts


def main(argv):
    """Convert BC2 Gene Mention data into per-sentence standoff files.

    Args:
        argv: command-line style list of exactly four items:
            program name, BC2 text file, BC2 tag file and output
            directory.

    Returns:
        1 if the argument count is wrong (after writing a usage message
        to stderr); otherwise None once all files have been written.

    Raises:
        AssertionError: on a malformed line in either input file, on a
            duplicate sentence ID in the text file, or when a tag does
            not match its sentence (raised by char_offsets).
    """
    if len(argv) != 4:
        sys.stderr.write("Usage: %s BC2TEXT BC2TAGS OUTPUT-DIR\n" % argv[0])
        return 1

    textfn, tagfn, outdir = argv[1:]

    # read in tags and sentences, both stored by sentence ID
    tags = _read_tags(tagfn)
    texts = _read_sentences(textfn)

    # combine tags with sentences, converting offsets into
    # character-based ones. (BC2 data offsets ignore space)
    offsets = {}
    for sid in texts:
        offsets[sid] = []
        for start, end, ttext in tags.get(sid, []):
            soff, eoff = char_offsets(texts[sid], start, end, ttext)
            offsets[sid].append((soff, eoff))

    # output one .txt and one .ann file per sentence
    for sid in texts:
        sentence = texts[sid]
        with open(os.path.join(outdir, sid + ".txt"), 'w') as txtf:
            txtf.write(sentence + "\n")
        with open(os.path.join(outdir, sid + ".ann"), 'w') as annf:
            for i, (soff, eoff) in enumerate(offsets[sid]):
                # annotation IDs are numbered from T1
                annf.write("T%d\tGENE %d %d\t%s\n" % (i + 1, soff, eoff, sentence[soff:eoff]))
# Script entry point: run the conversion with the process arguments and
# hand main()'s return value to sys.exit as the exit status.
if __name__ == "__main__":
    exit_status = main(sys.argv)
    sys.exit(exit_status)