tag.py
6.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python
# -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
# vim:set ft=python ts=4 sw=4 sts=4 autoindent:
'''
Functionality for invoking tagging services.
Author: Pontus Stenetorp
Version: 2011-04-22
'''
from __future__ import with_statement
from httplib import HTTPConnection
from os.path import join as path_join
from socket import error as SocketError
from urlparse import urlparse
from annotation import TextAnnotations, TextBoundAnnotationWithText
from annotator import _json_from_ann, ModificationTracker
from common import ProtocolError
from document import real_directory
from jsonwrap import loads
from message import Messager
from projectconfig import ProjectConfiguration
### Constants
# NOTE(review): presumably the timeout, in seconds, for queries sent to a
# tagger service -- no code visible in this file references it; confirm
# whether it is still meant to be applied to the HTTP connection.
QUERY_TIMEOUT = 30
###
class UnknownTaggerError(ProtocolError):
    '''
    Protocol error for a tagging request that names an unknown tagger.
    '''
    def __init__(self, tagger):
        # Keep the requested tagger name so it can be shown in the message
        self.tagger = tagger

    def __str__(self):
        template = 'Tagging request received for an unknown tagger "%s"'
        return template % self.tagger

    def json(self, json_dic):
        # Record this error's identifier in the given dictionary, in place
        json_dic['exception'] = 'unknownTaggerError'
class InvalidConnectionSchemeError(ProtocolError):
    '''
    Protocol error for a tagger whose service URL uses an unsupported scheme.

    Arguments:
    tagger -- name of the tagger whose URL was rejected
    scheme -- the URL scheme that was found
    '''
    def __init__(self, tagger, scheme):
        self.tagger = tagger
        self.scheme = scheme

    def __str__(self):
        # Exactly one placeholder per supplied value; a surplus "%s" makes
        # the % operator raise TypeError instead of producing the message
        return ('The tagger "%s" uses the unsupported scheme "%s"'
                % (self.tagger, self.scheme, ))

    def json(self, json_dic):
        # Record this error's identifier in the given dictionary, in place
        # NOTE(review): this reuses the 'unknownTaggerError' identifier;
        # confirm whether a dedicated identifier is expected by the client
        json_dic['exception'] = 'unknownTaggerError'
class InvalidTaggerResponseError(ProtocolError):
    '''
    Protocol error for a tagger response that is not valid JSON.
    '''
    def __init__(self, tagger, response):
        # Tagger name and the raw response, both reported in the message
        self.tagger = tagger
        self.response = response

    def __str__(self):
        details = (self.tagger, self.response)
        # NOTE: the message text is kept verbatim, spelling included
        return ('The tagger "%s" returned an invalid JSON response, '
                'please contact the tagger service mantainer. '
                'Response: "%s"') % details

    def json(self, json_dic):
        # Record this error's identifier in the given dictionary, in place
        json_dic['exception'] = 'unknownTaggerError'
class TaggerConnectionError(ProtocolError):
    '''
    Protocol error for a failed exchange with a tagger service.
    '''
    def __init__(self, tagger, error):
        # Tagger name and the error (exception or status text) to report
        self.tagger = tagger
        self.error = error

    def __str__(self):
        values = (self.tagger, self.error)
        return 'Tagger service %s returned the error: "%s"' % values

    def json(self, json_dic):
        # Record this error's identifier in the given dictionary, in place
        json_dic['exception'] = 'taggerConnectionError'
def tag(collection, document, tagger):
pconf = ProjectConfiguration(real_directory(collection))
for tagger_token, _, _, tagger_service_url in pconf.get_annotator_config():
if tagger == tagger_token:
break
else:
raise UnknownTaggerError(tagger)
doc_path = path_join(real_directory(collection), document)
with TextAnnotations(path_join(real_directory(collection),
document)) as ann_obj:
url_soup = urlparse(tagger_service_url)
if url_soup.scheme == 'http':
Connection = HTTPConnection
elif url_soup.scheme == 'https':
# Delayed HTTPS import since it relies on SSL which is commonly
# missing if you roll your own Python, for once we should not
# fail early since tagging is currently an edge case and we
# can't allow it to bring down the whole server.
from httplib import HTTPSConnection
Connection = HTTPSConnection
else:
raise InvalidConnectionSchemeError(tagger_token, url_soup.scheme)
conn = None
try:
conn = Connection(url_soup.netloc)
req_headers = {
'Content-type': 'text/plain; charset=utf-8',
'Accept': 'application/json',
}
# Build a new service URL since the request method doesn't accept
# a parameters argument
service_url = url_soup.path + (
'?' + url_soup.query if url_soup.query else '')
try:
data = ann_obj.get_document_text().encode('utf-8')
req_headers['Content-length'] = len(data)
# Note: Trout slapping for anyone sending Unicode objects here
conn.request('POST',
# As per: http://bugs.python.org/issue11898
# Force the url to be an ascii string
str(url_soup.path),
data,
headers=req_headers)
except SocketError, e:
raise TaggerConnectionError(tagger_token, e)
resp = conn.getresponse()
# Did the request succeed?
if resp.status != 200:
raise TaggerConnectionError(tagger_token,
'%s %s' % (resp.status, resp.reason))
# Finally, we can read the response data
resp_data = resp.read()
finally:
if conn is not None:
conn.close()
try:
json_resp = loads(resp_data)
except ValueError:
raise InvalidTaggerResponseError(tagger_token, resp_data)
mods = ModificationTracker()
for ann_data in json_resp.itervalues():
assert 'offsets' in ann_data, 'Tagger response lacks offsets'
offsets = ann_data['offsets']
assert 'type' in ann_data, 'Tagger response lacks type'
_type = ann_data['type']
assert 'texts' in ann_data, 'Tagger response lacks texts'
texts = ann_data['texts']
# sanity
assert len(offsets) != 0, 'Tagger response has empty offsets'
assert len(texts) == len(offsets), 'Tagger response has different numbers of offsets and texts'
# Note: We do not support discontinuous spans at this point
assert len(offsets) < 2, 'Tagger response has multiple offsets (discontinuous spans not supported)'
start, end = offsets[0]
text = texts[0]
_id = ann_obj.get_new_id('T')
tb = TextBoundAnnotationWithText(((start, end),), _id, _type, text)
mods.addition(tb)
ann_obj.add_annotation(tb)
mod_resp = mods.json_response()
mod_resp['annotations'] = _json_from_ann(ann_obj)
return mod_resp
if __name__ == '__main__':
    # Minimal manual smoke test: run the 'random' tagger on one document
    _collection = '/BioNLP-ST_2011_ID_devel'
    _document = 'PMC1874608-01-INTRODUCTION'
    tag(_collection, _document, 'random')