conllXtostandoff.py
#!/usr/bin/env python
# Script to convert a CoNLL X (2006) tabbed dependency tree format
# file into BioNLP ST-flavored standoff and a reconstruction of the
# original text.
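#
# Usage ("sample.conll" is a hypothetical input file name):
#
#   python conllXtostandoff.py -o outdir/ sample.conll
#
# With "-o", each document is written as outdir/sample.conll-doc-N.txt
# plus a matching .ann; without it, everything goes to stdout.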

from __future__ import with_statement

import sys
import re
import os
import codecs

# maximum number of sentences to include in single output document
# (if None, doesn't split into documents)
MAX_DOC_SENTENCES = 10

# whether to output an explicit root node
OUTPUT_ROOT = True
# the string to use to represent the root node
ROOT_STR = 'ROOT'

INPUT_ENCODING = "UTF-8"
OUTPUT_ENCODING = "UTF-8"

output_directory = None

# rewrites for characters appearing in CoNLL-X types that cannot be
# directly used in identifiers in brat-flavored standoff
charmap = {
    '<' : '_lt_',
    '>' : '_gt_',
    '+' : '_plus_',
    '?' : '_question_',
    '&' : '_amp_',
    ':' : '_colon_',
    '.' : '_period_',
    '!' : '_exclamation_',
}

def maptype(s):
    return "".join([charmap.get(c, c) for c in s])

def tokstr(start, end, ttype, idnum, text):
    # sanity checks
    assert '\n' not in text, "ERROR: newline in entity '%s'" % (text)
    assert text == text.strip(), "ERROR: tagged span contains extra whitespace: '%s'" % (text)
    return "T%d\t%s %d %d\t%s" % (idnum, maptype(ttype), start, end, text)

def depstr(depid, headid, rel, idnum):
    return "R%d\t%s Arg1:T%d Arg2:T%d" % (idnum, maptype(rel), headid, depid)

def output(infn, docnum, sentences):
    global output_directory

    if output_directory is None:
        txtout = sys.stdout
        soout = sys.stdout
    else:
        # add doc numbering if there is a sentence count limit,
        # implying multiple outputs per input
        if MAX_DOC_SENTENCES:
            outfnbase = os.path.basename(infn)+'-doc-'+str(docnum)
        else:
            outfnbase = os.path.basename(infn)

        outfn = os.path.join(output_directory, outfnbase)
        txtout = codecs.open(outfn+'.txt', 'wt', encoding=OUTPUT_ENCODING)
        soout = codecs.open(outfn+'.ann', 'wt', encoding=OUTPUT_ENCODING)

    offset, idnum, ridnum = 0, 1, 1

    doctext = ""

    for si, sentence in enumerate(sentences):
        tokens, deps = sentence

        # store mapping from per-sentence token sequence IDs to
        # document-unique token IDs
        idmap = {}

        # output tokens
        prev_form = None

        if OUTPUT_ROOT:
            # add an explicit root node with seq ID 0 (zero)
            tokens = [('0', ROOT_STR, ROOT_STR)] + tokens

        for ID, form, POS in tokens:
            if prev_form is not None:
                doctext = doctext + ' '
                offset += 1

            # output a token annotation
            print >> soout, tokstr(offset, offset+len(form), POS, idnum, form)
            assert ID not in idmap, "Error in data: dup ID"
            idmap[ID] = idnum
            idnum += 1

            doctext = doctext + form
            offset += len(form)

            prev_form = form

        # output dependencies
        for dep, head, rel in deps:
            # if root is not added, skip deps to the root (idx 0)
            if not OUTPUT_ROOT and head == '0':
                continue

            print >> soout, depstr(idmap[dep], idmap[head], rel, ridnum)
            ridnum += 1

        if si+1 != len(sentences):
            doctext = doctext + '\n'
            offset += 1

    print >> txtout, doctext

def process(fn):
    docnum = 1
    sentences = []

    with codecs.open(fn, encoding=INPUT_ENCODING) as f:

        tokens, deps = [], []

        lines = f.readlines()

        for ln, l in enumerate(lines):
            l = l.strip()

            # ignore lines starting with "#" as comments
            if len(l) > 0 and l[0] == "#":
                continue

            if re.match(r'^\s*$', l):
                # blank lines separate sentences
                if len(tokens) > 0:
                    sentences.append((tokens, deps))
                tokens, deps = [], []

                # limit sentences per output "document"
                if MAX_DOC_SENTENCES and len(sentences) >= MAX_DOC_SENTENCES:
                    output(fn, docnum, sentences)
                    sentences = []
                    docnum += 1

                continue

            # Assume it's a normal line. The format is tab-separated,
            # with ten fields, of which the following are used here
            # (from http://ilk.uvt.nl/conll/):
            # 1 ID      Token counter, starting at 1 for each new sentence.
            # 2 FORM    Word form or punctuation symbol.
            # 5 POSTAG  Fine-grained part-of-speech tag.
            # 7 HEAD    Head of the current token.
            # 8 DEPREL  Dependency relation to the HEAD.
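            #
            # An illustrative line (tab-separated; tabs shown as spaces here):
            #   1  Dogs  dog  NOUN  NNS  _  2  nsubj  _  _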
            fields = l.split('\t')

            assert len(fields) == 10, "Format error on line %d in %s: %s" % (ln, fn, l)

            ID, form, POS = fields[0], fields[1], fields[4]
            head, rel = fields[6], fields[7]

            tokens.append((ID, form, POS))
            # allow value "_" for HEAD to indicate no dependency
            if head != "_":
                deps.append((ID, head, rel))

        # process leftovers, if any
        if len(tokens) > 0:
            sentences.append((tokens, deps))
        if len(sentences) > 0:
            output(fn, docnum, sentences)

def main(argv):
    global output_directory

    # Take an optional "-o" arg specifying an output directory for the results
    output_directory = None
    filenames = argv[1:]
    if len(argv) > 2 and argv[1] == "-o":
        output_directory = argv[2]
        print >> sys.stderr, "Writing output to %s" % output_directory
        filenames = argv[3:]

    fail_count = 0
    for fn in filenames:
        try:
            process(fn)
        except Exception, e:
            print >> sys.stderr, "Error processing %s: %s" % (fn, e)
            fail_count += 1

    if fail_count > 0:
        print >> sys.stderr, """
##############################################################################
#
# WARNING: error in processing %d/%d files, output is incomplete!
#
##############################################################################
""" % (fail_count, len(filenames))

    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))