convert.py
2.83 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
'''
Conversion services; we may want to move these out later on.
Author: Pontus Stenetorp <pontus stenetorp>
Version: 2012-06-26
'''
from __future__ import with_statement
from os.path import join as path_join
from shutil import rmtree
from tempfile import mkdtemp
from annotation import open_textfile, Annotations
from common import ProtocolError
from document import _document_json_dict
from stanford import (
basic_dep as stanford_basic_dep,
collapsed_ccproc_dep as stanford_collapsed_ccproc_dep,
collapsed_dep as stanford_collapsed_dep,
coref as stanford_coref,
ner as stanford_ner,
pos as stanford_pos,
text as stanford_text,
token_offsets as stanford_token_offsets,
sentence_offsets as stanford_sentence_offsets
)
### Constants
# Maps each supported source format identifier to a pair of converters:
# (text converter, annotation converter). All formats share the same text
# converter and differ only in which annotation converter they use.
# Note: Built with dict() over a generator expression rather than a dict
#   comprehension, so no newer syntax than the rest of the module is needed.
CONV_BY_SRC = dict(
    (src_format, (stanford_text, annotation_converter))
    for src_format, annotation_converter in (
        ('stanford-pos', stanford_pos),
        ('stanford-ner', stanford_ner),
        ('stanford-coref', stanford_coref),
        ('stanford-basic_dep', stanford_basic_dep),
        ('stanford-collapsed_dep', stanford_collapsed_dep),
        ('stanford-collapsed_ccproc_dep', stanford_collapsed_ccproc_dep),
    )
)
###
class InvalidSrcFormat(ProtocolError):
    '''
    Raised by convert when no converter is registered for the requested
    source format.
    '''

    def json(self, json_dic):
        '''
        Tag the given JSON dictionary with this exception's name.

        Arguments:
        json_dic -- dictionary to annotate, it is modified in place

        Returns the same dictionary with its 'exception' entry set.
        '''
        json_dic['exception'] = 'InvalidSrcFormat'
        return json_dic
def convert(data, src):
    '''
    Convert data given in the source format src into a document JSON
    dictionary.

    The converted text and annotations are written to a temporary directory,
    read back using _document_json_dict and the directory is removed again
    before returning, regardless of whether the conversion succeeded.

    Arguments:
    data -- the input handed as-is to the converters registered for src
    src -- source format identifier, must be a key of CONV_BY_SRC

    Returns the dictionary produced by _document_json_dict with 'comments'
    emptied and, for sources starting with 'stanford-', 'token_offsets' and
    'sentence_offsets' replaced by the ones derived from data.

    Raises InvalidSrcFormat if no converter is registered for src.
    '''
    # Bail out before touching the disk if the format is unknown
    try:
        text_converter, ann_converter = CONV_BY_SRC[src]
    except KeyError:
        raise InvalidSrcFormat

    # Note: Due to a lack of refactoring we need to write to disk to read
    #   annotations, once this is fixed, the below code needs some clean-up
    work_dir = mkdtemp()
    try:
        base_path = path_join(work_dir, 'tmp')
        txt_path = base_path + '.txt'
        ann_path = base_path + '.ann'

        with open_textfile(txt_path, 'w') as txt_file:
            txt_file.write(text_converter(data))

        # Create an empty annotation file next to the text file before
        #   opening the document through Annotations
        open(ann_path, 'w').close()

        with Annotations(base_path) as annotations:
            for annotation in ann_converter(data):
                annotations.add_annotation(annotation)

        # Read the document back only after the Annotations context has
        #   been exited
        doc_json = _document_json_dict(base_path)

        # Note: Blank the comments, they rarely do anything good but whine
        #   about configuration when we use the tool solely for visualisation
        #   purposes
        doc_json['comments'] = []

        # Note: This is an ugly hack... we want to ride along with the
        #   Stanford tokenisation and sentence splits when returning their
        #   output rather than relying on the ones generated by brat.
        if src.startswith('stanford-'):
            doc_json['token_offsets'] = stanford_token_offsets(data)
            doc_json['sentence_offsets'] = stanford_sentence_offsets(data)

        return doc_json
    finally:
        # The directory exists whenever we get here since mkdtemp succeeded
        #   before the try block was entered
        rmtree(work_dir)