biocontext2standoff.py
4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
import sys
import re
import os
# Parsed command-line options. Assigned in main() and read as a module
# global by convert_line(), output() and process_().
options = None
# Input file used when -f/--file is not given on the command line.
DEFAULT_INPUT = 'entities-anatomy.csv'
# Document ID format in BioContext data
# Matches one or more repetitions of either a run of digits or
# "PMC<digits>.<digits>.<digit>".
# NOTE(review): the final component is a single digit and the "+" applies
# to the whole group, so multi-digit final components only match through
# group repetition; presumably "...\.[0-9]+)$" was intended -- confirm
# against the BioContext ID format before changing.
BIOCONTEXT_ID_RE = re.compile(r'^([0-9]+|PMC[0-9]+\.[0-9]+\.[0-9])+$')
def argparser():
    """Build and return the command-line argument parser.

    The parser accepts the output directory, annotation type, input file,
    normalization switch, output suffix and verbosity flag as options, and
    one or more positional document IDs (or names of files listing IDs).
    """
    import argparse

    parser = argparse.ArgumentParser(
        description='Convert BioContext data into brat-flavored standoff.')

    # (flags, keyword arguments) in the order they appear in --help.
    specs = [
        (('-d', '--directory'),
         dict(default=None,
              help='Output directory (default output to STDOUT)')),
        (('-e', '--entitytype'),
         dict(default='Anatomical_entity',
              help='Type to assign to annotations.')),
        (('-f', '--file'),
         dict(default=DEFAULT_INPUT,
              help='BioContext data (default "' + DEFAULT_INPUT + '")')),
        (('-n', '--no-norm'),
         dict(default=False, action='store_true',
              help='Do not output normalization annotations')),
        (('-o', '--outsuffix'),
         dict(default='ann',
              help='Suffix to add to output files (default "ann")')),
        (('-v', '--verbose'),
         dict(default=False, action='store_true',
              help='Verbose output')),
        (('id',),
         dict(metavar='ID/FILE', nargs='+',
              help='IDs of documents for which to extract annotations.')),
    ]
    for flags, kwargs in specs:
        parser.add_argument(*flags, **kwargs)
    return parser
def read_ids(fn):
    """Read document IDs from a file containing one ID per line.

    Blank lines are ignored. An ID that does not match BIOCONTEXT_ID_RE
    triggers a warning on STDERR but is still included in the result.

    Args:
        fn: name of the file to read.

    Returns:
        The set of IDs found in the file.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    ids = set()
    # Plain 'r' rather than 'rU': the 'U' flag was removed in Python 3.11.
    # Line endings are stripped explicitly so CRLF input still works.
    with open(fn, 'r') as f:
        for line in f:
            doc_id = line.rstrip('\r\n')
            if not doc_id:
                # An empty line is not an ID; skipping it avoids a bogus
                # "" entry that could never be found in the data.
                continue
            if not BIOCONTEXT_ID_RE.match(doc_id):
                # write() instead of the Python 2-only "print >>" statement.
                sys.stderr.write(
                    'Warning: ID %s not in expected format\n' % doc_id)
            ids.add(doc_id)
    return ids
def get_ids(items):
    """Collect document IDs from a mix of literal IDs and ID-list files.

    Each item that matches the BioContext ID format is taken as an ID;
    any other item is treated as the name of a file with one ID per line.
    Returns the union of all IDs as a set.
    """
    combined = set()
    for item in items:
        if not BIOCONTEXT_ID_RE.match(item):
            # not ID-shaped: assume name of file containing IDs
            combined.update(read_ids(item))
            continue
        combined.add(item)
    return combined
def convert_line(l, converted):
    """Convert one BioContext data line into standoff annotation lines.

    The line must hold seven tab-separated fields: doc_id, id, eid, start,
    end, text and group. A textbound annotation
    ("T<id>\\t<type> <start> <end>\\t<text>") is appended to `converted`,
    followed by a normalization annotation
    ("N<id>\\tReference T<id> <eid>") unless options.no_norm is set.

    Args:
        l: one data line, without the trailing newline.
        converted: list to which the annotation strings are appended.

    Returns:
        0 if the line's id field is 'NULL' (nothing is appended),
        otherwise None.

    Raises:
        ValueError: if the line does not have exactly seven fields or the
            start/end offsets are not integers; the offending line is
            reported on STDERR before the exception propagates.
    """
    try:
        doc_id, id_, eid, start, end, text, group = l.split('\t')
        # Checked before the offsets are parsed, so a NULL row is skipped
        # without attempting to convert its start/end fields.
        if id_ == 'NULL':
            return 0
        start, end = int(start), int(end)
    except ValueError:
        # Only ValueError indicates a malformed line; a bare except would
        # also label unrelated exceptions (e.g. KeyboardInterrupt) as
        # format errors.
        sys.stderr.write('Format error: %s\n' % l)
        raise
    # textbound annotation
    converted.append('T%s\t%s %d %d\t%s' % (id_, options.entitytype,
                                            start, end, text))
    # normalization (grounding) annotation
    if not options.no_norm:
        converted.append('N%s\tReference T%s %s' % (id_, id_, eid))
def output_(out, ann):
    """Write each annotation in `ann` to `out`, one per line.

    Args:
        out: writable file-like object.
        ann: iterable of annotation values; each is formatted with %s and
            terminated with a newline.
    """
    for a in ann:
        # write() works on both Python 2 and 3, unlike the Python 2-only
        # "print >> out" statement.
        out.write('%s\n' % a)
def output(id_, ann, append):
    """Write the annotations of one document to its destination.

    Without options.directory the annotations go to STDOUT. Otherwise
    they are written to "<directory>/<id_>.<outsuffix>".

    Args:
        id_: document ID, used to build the output file name.
        ann: list of annotation strings to write.
        append: if true, append to an existing output file instead of
            overwriting it (only relevant when writing to a directory).
    """
    if not options.directory:
        # Must delegate to output_(): calling output() here would recurse
        # with the wrong number of arguments and raise TypeError.
        output_(sys.stdout, ann)
    else:
        fn = os.path.join(options.directory, id_ + '.' + options.outsuffix)
        with open(fn, 'a' if append else 'w') as f:
            output_(f, ann)
def process_(f, ids):
    """Convert and output annotations for the requested documents.

    Reads data lines from `f`, where the first tab-separated field of each
    line is the document ID. Consecutive lines with the same ID are treated
    as one document; whenever the ID changes, the annotations collected for
    the previous document (if it was requested) are written with output().
    A requested document that reappears later in the input is appended to
    its earlier output, with a warning when options.verbose is set.

    Args:
        f: iterable of data lines (header already consumed).
        ids: set of document IDs to extract.

    After the input is exhausted, or once every requested document has
    been written, a warning is printed to STDERR for each requested ID
    that was never seen.
    """
    ann, current, processed = [], None, set()
    for l in f:
        l = l.strip()
        id_ = l.split('\t')[0]
        if id_ != current:
            # new document: flush the one just finished
            if current in ids:
                output(current, ann, current in processed)
                ann = []
                processed.add(current)
            if id_ in ids and id_ in processed and options.verbose:
                sys.stderr.write('Warning: %s split\n' % id_)
            current = id_
            # Short-circuit after processing last, but never while a
            # requested document is being collected: stopping there would
            # drop the rest of its lines.
            if id_ not in ids and ids == processed:
                break
        if id_ in ids:
            convert_line(l, ann)
    # Flush the document in progress when the input ended. It is flushed
    # the same way as inside the loop and recorded as processed, so it is
    # not reported as missing below.
    if current in ids:
        output(current, ann, current in processed)
        processed.add(current)
    for id_ in ids - processed:
        sys.stderr.write('Warning: id %s not found\n' % id_)
def process(fn, ids):
    """Open the BioContext data file and extract the requested documents.

    The first line of the file is always consumed as the header; a warning
    is printed to STDERR if it does not start with "doc_id<TAB>id". The
    remaining lines are handed to process_().

    Args:
        fn: name of the BioContext data file.
        ids: set of document IDs to extract.

    Any IOError raised while opening, reading or processing is reported on
    STDERR (with a hint to use -f) rather than propagated.
    """
    try:
        # Plain 'r' rather than 'rU': the 'U' flag was removed in
        # Python 3.11.
        with open(fn, 'r') as f:
            # first line should be header; skip and confirm
            header = f.readline()
            if not header.startswith('doc_id\tid'):
                sys.stderr.write('Warning: %s missing header\n' % fn)
            process_(f, ids)
    except IOError as e:
        # "except IOError as e" is valid on Python 2.6+ and Python 3; the
        # older "except IOError, e" form is a syntax error on Python 3.
        sys.stderr.write('%s (try -f argument?)\n' % e)
def main(argv=None):
    """Script entry point: parse arguments, resolve IDs, run conversion.

    `argv` is the full argument vector including the program name; when
    it is None, sys.argv is used. The parsed options are stored in the
    module-level `options` for the other functions to read.
    """
    global options
    args = sys.argv if argv is None else argv
    options = argparser().parse_args(args[1:])
    wanted = get_ids(options.id)
    process(options.file, wanted)


if __name__ == '__main__':
    sys.exit(main(sys.argv))