MetaMaptoStandoff.py
2.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env python
# Script to convert MetaMap "fielded" ("-N" argument) output into
# standoff with reference to the original text.
import sys
import re
import os
import codecs
# Regex for the "signature" of a MetaMap "fielded" output line: one or
# more digits at the very start of the line, immediately followed by a
# pipe ("|"). MetaMap_lines_to_standoff() skips lines that do not match.
FIELDED_OUTPUT_RE = re.compile(r'^\d+\|')
class taggedEntity:
    """One typed text span, renderable as a standoff annotation line.

    Attributes:
        startOff: offset at which the span starts.
        endOff: offset at which the span ends.
        eType: type label written into the standoff line.
        idNum: integer used to build the "T<n>" identifier.
    """

    def __init__(self, startOff, endOff, eType, idNum):
        """Store the span boundaries, its type label and its id number."""
        self.idNum = idNum
        self.eType = eType
        self.startOff, self.endOff = startOff, endOff

    def __str__(self):
        """Return the span as "T<id><TAB><type> <start> <end>"."""
        parts = (self.idNum, self.eType, self.startOff, self.endOff)
        return "T%d\t%s %d %d" % parts
def MetaMap_lines_to_standoff(metamap_lines, reftext=None):
    """Convert MetaMap "fielded" ("-N") output lines into standoff entities.

    Lines that do not begin with the fielded-output signature (digits
    followed by "|") are skipped silently. Lines that carry the signature
    but have fewer than nine fields, or whose position field is not of the
    form "start:length[,start:length...]", are skipped with a note written
    to stderr instead of aborting the whole conversion.

    Args:
        metamap_lines: iterable of MetaMap output lines (strings), with or
            without trailing line terminators.
        reftext: accepted for interface compatibility; not used by this
            function.

    Returns:
        A list of taggedEntity objects, one per successfully parsed line,
        in input order and numbered consecutively from 1.
    """
    # Compiled once, outside the loop. Matches one or more comma-separated
    # "start:length" pairs and captures only the last pair.
    offset_re = re.compile(r'^(?:\d+:\d+,)*(\d+):(\d+)$')

    tagged = []
    for line in metamap_lines:
        # Also drop a trailing "\r" so CRLF input does not leak into the
        # last field.
        line = line.rstrip('\r\n')

        # silently skip lines that don't match the expected format
        if not FIELDED_OUTPUT_RE.match(line):
            continue

        # format is pipe-separated ("|") fields, the ones of interest
        # are in the following indices:
        #     3: preferred text form
        #     4: CUI
        #     5: semantic type (MetaMap code)
        #     8: start offset and length of match
        # Only fields 5 and 8 are needed to build the standoff entity.
        fields = line.split('|')
        if len(fields) < 9:
            sys.stderr.write(
                "Note: skipping unparseable MetaMap output line: %s\n" % line)
            continue
        semtype, offset = fields[5], fields[8]

        # strip surrounding brackets from semantic type
        semtype = semtype.replace('[', '').replace(']', '')

        # parse start and length; note that this will only pick the last
        # of multiple discontinuous spans if they occur (simple heuristic
        # for the head)
        m = offset_re.match(offset)
        if m is None:
            # An unexpected position format must not crash the run; report
            # the line and move on, as for other unparseable lines.
            sys.stderr.write(
                "Note: skipping MetaMap output line with unparseable "
                "offset: %s\n" % line)
            continue
        start, length = int(m.group(1)), int(m.group(2))

        # Ids stay consecutive because skipped lines never reach this point.
        tagged.append(
            taggedEntity(start, start + length, semtype, len(tagged) + 1))

    sys.stderr.write(
        "MetaMaptoStandoff: returning %s tagged spans\n" % len(tagged))
    return tagged
if __name__ == "__main__":
    # Script mode: read all MetaMap fielded output from stdin, convert it,
    # and write one standoff line per tagged entity to stdout.
    input_lines = sys.stdin.readlines()
    for entity in MetaMap_lines_to_standoff(input_lines):
        print(entity)