annotator.py
#!/usr/bin/env python
# -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
# vim:set ft=python ts=4 sw=4 sts=4 autoindent:
'''
Annotator functionality, editing and retrieving status.
Author: Pontus Stenetorp
Version: 2011-04-22
'''
# XXX: This module is messy; refactoring remains to be done
from __future__ import with_statement
from os.path import join as path_join
from os.path import split as path_split
from re import compile as re_compile
from annotation import (OnelineCommentAnnotation, TEXT_FILE_SUFFIX,
TextAnnotations, DependingAnnotationDeleteError, TextBoundAnnotation,
EventAnnotation, EquivAnnotation, open_textfile,
AnnotationsIsReadOnlyError, AttributeAnnotation,
NormalizationAnnotation, SpanOffsetOverlapError, DISCONT_SEP)
from common import ProtocolError, ProtocolArgumentError
try:
from config import DEBUG
except ImportError:
DEBUG = False
from document import real_directory
from jsonwrap import loads as json_loads, dumps as json_dumps
from message import Messager
from projectconfig import ProjectConfiguration, ENTITY_CATEGORY, EVENT_CATEGORY, RELATION_CATEGORY, UNKNOWN_CATEGORY
### Constants
MUL_NL_REGEX = re_compile(r'\n+')
###
#TODO: Couldn't we incorporate this nicely into the Annotations class?
#TODO: Yes; it is limited even compared to what it should do standalone. This
# has been a long-pending refactoring goal.
class ModificationTracker(object):
def __init__(self):
self.__added = []
self.__changed = []
self.__deleted = []
def __len__(self):
return len(self.__added) + len(self.__changed) + len(self.__deleted)
def addition(self, added):
self.__added.append(added)
def deletion(self, deleted):
self.__deleted.append(deleted)
def change(self, before, after):
self.__changed.append((before, after))
def json_response(self, response=None):
if response is None:
response = {}
# debugging
if DEBUG:
msg_str = ''
if self.__added:
msg_str += ('Added the following line(s):\n'
+ '\n'.join([unicode(a).rstrip() for a in self.__added]))
if self.__changed:
changed_strs = []
for before, after in self.__changed:
changed_strs.append('\t%s\n\tInto:\n\t%s' % (unicode(before).rstrip(), unicode(after).rstrip()))
msg_str += ('Changed the following line(s):\n'
+ '\n'.join([unicode(a).rstrip() for a in changed_strs]))
if self.__deleted:
msg_str += ('Deleted the following line(s):\n'
+ '\n'.join([unicode(a).rstrip() for a in self.__deleted]))
if msg_str:
Messager.info(msg_str, duration=3*len(self))
else:
Messager.info('No changes made')
# highlighting
response['edited'] = []
# TODO: implement cleanly, e.g. add a highlightid() method to Annotation classes
for a in self.__added:
try:
response['edited'].append(a.reference_id())
except AttributeError:
pass # not all implement reference_id()
for b,a in self.__changed:
# can't mark "before" since it's stopped existing
try:
response['edited'].append(a.reference_id())
except AttributeError:
pass # not all implement reference_id()
# unique, preserve order
seen = set()
uniqued = []
for i in response['edited']:
s = str(i)
if s not in seen:
uniqued.append(i)
seen.add(s)
response['edited'] = uniqued
return response
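# Illustrative usage sketch (not part of the module API): an edit handler in
# this file typically creates a ModificationTracker, records each addition,
# change and deletion while mutating the TextAnnotations object, and finally
# calls json_response() to obtain the 'edited' id list used for client-side
# highlighting, e.g.:
#   mods = ModificationTracker()
#   mods.addition(new_ann)
#   response = mods.json_response()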
# TODO: revive the "unconfirmed annotation" functionality;
# the following currently unused bit may help
# def confirm_span(docdir, docname, span_id):
# document = path_join(docdir, docname)
# txt_file_path = document + '.' + TEXT_FILE_SUFFIX
# with TextAnnotations(document) as ann_obj:
# mods = ModificationTracker()
# # find AnnotationUnconfirmed comments that refer
# # to the span and remove them
# # TODO: error checking
# for ann in ann_obj.get_oneline_comments():
# if ann.type == "AnnotationUnconfirmed" and ann.target == span_id:
# ann_obj.del_annotation(ann, mods)
# mods_json = mods.json_response()
# # save a roundtrip and send the annotations also
# j_dic = _json_from_ann(ann_obj)
# mods_json['annotations'] = j_dic
# add_messages_to_json(mods_json)
# print dumps(mods_json)
def _json_from_ann(ann_obj):
# Returns json with ann_obj contents and the relevant text. Used
# for saving a round-trip when modifying annotations by attaching
# the latest annotation data into the response to the edit
# request.
j_dic = {}
txt_file_path = ann_obj.get_document() + '.' + TEXT_FILE_SUFFIX
from document import (_enrich_json_with_data, _enrich_json_with_base,
_enrich_json_with_text)
_enrich_json_with_base(j_dic)
# avoid reading text file if the given ann_obj already holds it
try:
doctext = ann_obj.get_document_text()
except AttributeError:
# no such luck
doctext = None
_enrich_json_with_text(j_dic, txt_file_path, doctext)
_enrich_json_with_data(j_dic, ann_obj)
return j_dic
from logging import info as log_info
from annotation import TextBoundAnnotation, TextBoundAnnotationWithText
from copy import deepcopy
def _offsets_equal(o1, o2):
"""
Given two lists of (start, end) integer offset sets, returns
whether they identify the same sets of characters.
"""
# TODO: full implementation; current doesn't check for special
# cases such as dup or overlapping (start, end) pairs in a single
# set.
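# Illustrative example: [(0, 5), (10, 15)] and [(10, 15), (0, 5)] identify
# the same characters, so they compare equal after sorting.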
# short-circuit (expected to be the most common case)
if o1 == o2:
return True
return sorted(o1) == sorted(o2)
def _text_for_offsets(text, offsets):
"""
Given a text and a list of (start, end) integer offsets, returns
the (catenated) text corresponding to those offsets, joined
appropriately for use in a TextBoundAnnotation(WithText).
"""
try:
return DISCONT_SEP.join(text[s:e] for s,e in offsets)
except Exception:
Messager.error('_text_for_offsets: failed to get text for given offsets (%s)' % str(offsets))
raise ProtocolArgumentError
def _edit_span(ann_obj, mods, id, offsets, projectconf, attributes, type,
undo_resp={}):
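# Modifies an existing annotation in place: adjusts the textbound (or, for an
# event, its trigger) to the new offsets and type, reusing or copying triggers
# shared with other events as needed, and records everything in 'mods' and
# 'undo_resp'.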
#TODO: Handle failure to find!
ann = ann_obj.get_ann_by_id(id)
if isinstance(ann, EventAnnotation):
# We should actually modify the trigger
tb_ann = ann_obj.get_ann_by_id(ann.trigger)
e_ann = ann
undo_resp['id'] = e_ann.id
ann_category = EVENT_CATEGORY
else:
tb_ann = ann
e_ann = None
undo_resp['id'] = tb_ann.id
ann_category = ENTITY_CATEGORY
# Store away what we need to restore the old annotation
undo_resp['action'] = 'mod_tb'
undo_resp['offsets'] = tb_ann.spans[:]
undo_resp['type'] = tb_ann.type
if not _offsets_equal(tb_ann.spans, offsets):
if not isinstance(tb_ann, TextBoundAnnotation):
# TODO XXX: the following comment is no longer valid
# (possibly related code also) since the introduction of
# TextBoundAnnotationWithText. Check.
# This scenario has been discussed and changing the span inevitably
# leads to the text span being out of sync since we can't for sure
# determine where in the data format the text (if at all) it is
# stored. For now we will fail loudly here.
error_msg = ('unable to change the span of an existing annotation '
'(annotation: %s)' % repr(tb_ann))
Messager.error(error_msg)
# Not sure if we only get an internal server error or the data
# will actually reach the client to be displayed.
assert False, error_msg
else:
# TODO: Log modification too?
before = unicode(tb_ann)
#log_info('Will alter span of: "%s"' % str(to_edit_span).rstrip('\n'))
tb_ann.spans = offsets[:]
tb_ann.text = _text_for_offsets(ann_obj._document_text, tb_ann.spans)
#log_info('Span altered')
mods.change(before, tb_ann)
if ann.type != type:
if ann_category != projectconf.type_category(type):
# Can't convert event to entity etc. (The client should protect
# against this in any case.)
# TODO: Raise some sort of protocol error
Messager.error("Cannot convert %s (%s) into %s (%s)"
% (ann.type, projectconf.type_category(ann.type),
type, projectconf.type_category(type)),
duration=10)
pass
else:
before = unicode(ann)
ann.type = type
# Try to propagate the type change
try:
#XXX: We don't take into consideration other anns with the
# same trigger here!
ann_trig = ann_obj.get_ann_by_id(ann.trigger)
if ann_trig.type != ann.type:
# At this stage we need to determine if someone else
# is using the same trigger
if any((event_ann
for event_ann in ann_obj.get_events()
if (event_ann.trigger == ann.trigger
and event_ann != ann))):
# Someone else is using it, create a new one
from copy import copy
# A shallow copy should be enough
new_ann_trig = copy(ann_trig)
# It needs a new id
new_ann_trig.id = ann_obj.get_new_id('T')
# And we will change the type
new_ann_trig.type = ann.type
# Update the old annotation to use this trigger
ann.trigger = unicode(new_ann_trig.id)
ann_obj.add_annotation(new_ann_trig)
mods.addition(new_ann_trig)
else:
# Okay, we own the current trigger, but does a trigger
# identical to the one we seek already exist?
found = None
for tb_ann in ann_obj.get_textbounds():
if (_offsets_equal(tb_ann.spans, ann_trig.spans) and
tb_ann.type == ann.type):
found = tb_ann
break
if found is None:
# Just change the trigger type since we are the
# only users
before = unicode(ann_trig)
ann_trig.type = ann.type
mods.change(before, ann_trig)
else:
# Attach the new trigger THEN delete the old one,
# or the dependency check will reject the delete
ann.trigger = unicode(found.id)
ann_obj.del_annotation(ann_trig)
mods.deletion(ann_trig)
except AttributeError:
# It was most likely a TextBound entity
pass
# Finally remember the change
mods.change(before, ann)
return tb_ann, e_ann
def __create_span(ann_obj, mods, type, offsets, txt_file_path,
projectconf, attributes):
# For event types, reuse trigger if a matching one exists.
found = None
if projectconf.is_event_type(type):
for tb_ann in ann_obj.get_textbounds():
try:
if (_offsets_equal(tb_ann.spans, offsets)
and tb_ann.type == type):
found = tb_ann
break
except AttributeError:
# Not a trigger then
pass
if found is None:
# Get a new ID
new_id = ann_obj.get_new_id('T') #XXX: Cons
# Get the text span
with open_textfile(txt_file_path, 'r') as txt_file:
# TODO discont: use offsets instead (note need for int conversion)
text = _text_for_offsets(txt_file.read(), offsets)
# The below code resolves cases where there are newlines in the
# offsets by creating discontinuous annotations for each span
# separated by newlines. For most cases it preserves the offsets.
seg_offsets = []
for o_start, o_end in offsets:
pos = o_start
for text_seg in text.split('\n'):
if not text_seg and o_start != o_end:
# Double new-line, skip ahead
pos += 1
continue
end = pos + len(text_seg)
seg_offsets.append((pos, end))
# Our current position is after the newline
pos = end + 1
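# Illustrative example (made-up text): for offsets [(0, 11)] over the text
# "foo\nbar\nbaz", the loop above produces seg_offsets
# [(0, 3), (4, 7), (8, 11)], one span per newline-separated segment.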
ann = TextBoundAnnotationWithText(seg_offsets, new_id, type,
# Replace any newlines with the discontinuous separator
MUL_NL_REGEX.sub(DISCONT_SEP, text))
ann_obj.add_annotation(ann)
mods.addition(ann)
else:
ann = found
if ann is not None:
if projectconf.is_physical_entity_type(type):
# TODO: alert that negation / speculation are ignored if set
event = None
else:
# Create the event also
new_event_id = ann_obj.get_new_id('E') #XXX: Cons
event = EventAnnotation(ann.id, [], unicode(new_event_id), type, '')
ann_obj.add_annotation(event)
mods.addition(event)
else:
# We got a newline in the span, don't take any action
event = None
return ann, event
def _set_attributes(ann_obj, ann, attributes, mods, undo_resp={}):
# Find existing attributes (if any)
existing_attr_anns = set((a for a in ann_obj.get_attributes()
if a.target == ann.id))
#log_info('ATTR: %s' %(existing_attr_anns, ))
# Note the existing annotations for undo
undo_resp['attributes'] = json_dumps(dict([(e.type, e.value)
for e in existing_attr_anns]))
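# Illustrative example: if the target already carries a Speculation
# attribute and the client sends attributes {'Negation': True}, the
# Speculation annotation is deleted below and a new Negation attribute
# annotation is created.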
for existing_attr_ann in existing_attr_anns:
if existing_attr_ann.type not in attributes:
# Delete attributes that were un-set but existed previously
ann_obj.del_annotation(existing_attr_ann)
mods.deletion(existing_attr_ann)
else:
# If the value of the attribute is different, alter it
new_value = attributes[existing_attr_ann.type]
#log_info('ATTR: "%s" "%s"' % (new_value, existing_attr_ann.value))
if existing_attr_ann.value != new_value:
before = unicode(existing_attr_ann)
existing_attr_ann.value = new_value
mods.change(before, existing_attr_ann)
# The remaining annotations are new and should be created
for attr_type, attr_val in attributes.iteritems():
if attr_type not in set((a.type for a in existing_attr_anns)):
new_attr = AttributeAnnotation(ann.id, ann_obj.get_new_id('A'),
attr_type, '', attr_val)
ann_obj.add_annotation(new_attr)
mods.addition(new_attr)
def _json_offsets_to_list(offsets):
try:
offsets = json_loads(offsets)
except Exception:
Messager.error('create_span: protocol argument error: expected offsets as JSON, but failed to parse "%s"' % str(offsets))
raise ProtocolArgumentError
try:
offsets = [(int(s),int(e)) for s,e in offsets]
except Exception:
Messager.error('create_span: protocol argument error: expected offsets as list of int pairs, received "%s"' % str(offsets))
raise ProtocolArgumentError
return offsets
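# Illustrative example: _json_offsets_to_list('[[0, 7], [12, 15]]')
# returns [(0, 7), (12, 15)].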
#TODO: unshadow Python internals like "type" and "id"
def create_span(collection, document, offsets, type, attributes=None,
normalizations=None, id=None, comment=None):
# offsets should be a JSON string corresponding to a list of (start,
# end) pairs; convert once at this interface
offsets = _json_offsets_to_list(offsets)
return _create_span(collection, document, offsets, type, attributes,
normalizations, id, comment)
def _set_normalizations(ann_obj, ann, normalizations, mods, undo_resp={}):
# Find existing normalizations (if any)
existing_norm_anns = set((a for a in ann_obj.get_normalizations()
if a.target == ann.id))
# Note the existing annotations for undo
undo_resp['normalizations'] = json_dumps([(n.refdb, n.refid, n.reftext)
for n in existing_norm_anns])
# Organize into dictionaries for easier access
old_norms = dict([((n.refdb,n.refid),n) for n in existing_norm_anns])
new_norms = dict([((n[0],n[1]), n[2]) for n in normalizations])
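# Illustrative example (made-up identifiers): normalizations
# [['Wikipedia', '534366', 'Barack Obama']] yields
# new_norms {('Wikipedia', '534366'): 'Barack Obama'}.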
#Messager.info("Old norms: "+str(old_norms))
#Messager.info("New norms: "+str(new_norms))
# sanity check
for refdb, refid, refstr in normalizations:
# TODO: less aggressive failure
assert refdb is not None and refdb.strip() != '', "Error: client sent empty norm DB"
assert refid is not None and refid.strip() != '', "Error: client sent empty norm ID"
# (the reference string is allowed to be empty)
# Process deletions and updates of existing normalizations
for old_norm_id, old_norm in old_norms.items():
if old_norm_id not in new_norms:
# Delete IDs that were referenced previously but not anymore
ann_obj.del_annotation(old_norm)
mods.deletion(old_norm)
else:
# If the text value of the normalizations is different, update
# (this shouldn't happen on a stable norm DB, but anyway)
new_reftext = new_norms[old_norm_id]
if old_norm.reftext != new_reftext:
old = unicode(old_norm)
old_norm.reftext = new_reftext
mods.change(old, old_norm)
# Process new normalizations
for new_norm_id, new_reftext in new_norms.items():
if new_norm_id not in old_norms:
new_id = ann_obj.get_new_id('N')
# TODO: avoid magic string value
norm_type = u'Reference'
new_norm = NormalizationAnnotation(new_id, norm_type,
ann.id, new_norm_id[0],
new_norm_id[1],
u'\t'+new_reftext)
ann_obj.add_annotation(new_norm)
mods.addition(new_norm)
# helper for _create methods
def _parse_attributes(attributes):
if attributes is None:
_attributes = {}
else:
try:
_attributes = json_loads(attributes)
except ValueError:
# Failed to parse, warn the client
Messager.warning((u'Unable to parse attributes string "%s" for '
u'"createSpan", ignoring attributes for request and '
u'assuming no attributes set') % (attributes, ))
_attributes = {}
### XXX: Hack since the client is sending back False and True as values...
# These are __not__ to be sent, they violate the protocol
for _del in [k for k, v in _attributes.items() if v == False]:
del _attributes[_del]
# These are to be old-style modifiers without values
for _revalue in [k for k, v in _attributes.items() if v == True]:
_attributes[_revalue] = True
###
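# Illustrative example: the client string '{"Negation": true, "Speculation": false}'
# parses to {u'Negation': True}; the false-valued Speculation key is dropped
# above since unset attributes are not to be sent at all.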
return _attributes
# helper for _create_span
def _parse_span_normalizations(normalizations):
if normalizations is None:
_normalizations = {}
else:
try:
_normalizations = json_loads(normalizations)
except ValueError:
# Failed to parse, warn the client
Messager.warning((u'Unable to parse normalizations string "%s" for '
u'"createSpan", ignoring normalizations for request and '
u'assuming no normalizations set') % (normalizations, ))
_normalizations = {}
return _normalizations
# Helper for _create functions
def _set_comments(ann_obj, ann, comment, mods, undo_resp={}):
# We are only interested in id'ed comments
try:
ann.id
except AttributeError:
return None
# Check if there is already an annotation comment
for com_ann in ann_obj.get_oneline_comments():
if (com_ann.type == 'AnnotatorNotes'
and com_ann.target == ann.id):
found = com_ann
# Note the comment in the undo
undo_resp['comment'] = found.tail[1:]
break
else:
found = None
if comment:
if found is not None:
# Change the comment
# XXX: Note the ugly tab, it is for parsing the tail
before = unicode(found)
found.tail = u'\t' + comment
mods.change(before, found)
else:
# Create a new comment
new_comment = OnelineCommentAnnotation(
ann.id, ann_obj.get_new_id('#'),
# XXX: Note the ugly tab
u'AnnotatorNotes', u'\t' + comment)
ann_obj.add_annotation(new_comment)
mods.addition(new_comment)
else:
# We are to erase the annotation
if found is not None:
ann_obj.del_annotation(found)
mods.deletion(found)
# Sanity check, a span can't overlap itself
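# Illustrative examples: [(0, 5), (3, 8)] overlaps and is rejected, while
# [(0, 5), (5, 10)] does not (adjacent spans are allowed).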
def _offset_overlaps(offsets):
for i in xrange(len(offsets)):
i_start, i_end = offsets[i]
for j in xrange(i + 1, len(offsets)):
j_start, j_end = offsets[j]
if (
# i overlapping or in j
(j_start <= i_start < j_end) or (j_start < i_end < j_end)
or
# j overlapping or in i
(i_start <= j_start < i_end) or (i_start < j_end < i_end)
):
return True
# No overlap detected
return False
#TODO: ONLY determine what action to take! Delegate to Annotations!
def _create_span(collection, document, offsets, _type, attributes=None,
normalizations=None, _id=None, comment=None):
if _offset_overlaps(offsets):
raise SpanOffsetOverlapError(offsets)
directory = collection
undo_resp = {}
_attributes = _parse_attributes(attributes)
_normalizations = _parse_span_normalizations(normalizations)
#log_info('ATTR: %s' %(_attributes, ))
real_dir = real_directory(directory)
document = path_join(real_dir, document)
projectconf = ProjectConfiguration(real_dir)
txt_file_path = document + '.' + TEXT_FILE_SUFFIX
working_directory = path_split(document)[0]
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
mods = ModificationTracker()
if _id is not None:
# We are to edit an existing annotation
tb_ann, e_ann = _edit_span(ann_obj, mods, _id, offsets, projectconf,
_attributes, _type, undo_resp=undo_resp)
else:
# We are to create a new annotation
tb_ann, e_ann = __create_span(ann_obj, mods, _type, offsets, txt_file_path,
projectconf, _attributes)
undo_resp['action'] = 'add_tb'
if e_ann is not None:
undo_resp['id'] = e_ann.id
else:
undo_resp['id'] = tb_ann.id
# Determine which annotation attributes, normalizations,
# comments etc. should be attached to. If there's an event,
# attach to that; otherwise attach to the textbound.
if e_ann is not None:
# Assign to the event, not the trigger
target_ann = e_ann
else:
target_ann = tb_ann
# Set attributes
_set_attributes(ann_obj, target_ann, _attributes, mods,
undo_resp=undo_resp)
# Set normalizations
_set_normalizations(ann_obj, target_ann, _normalizations, mods,
undo_resp=undo_resp)
# Set comments
if tb_ann is not None:
_set_comments(ann_obj, target_ann, comment, mods,
undo_resp=undo_resp)
if tb_ann is not None:
mods_json = mods.json_response()
else:
# Hack, probably we had a new-line in the span
mods_json = {}
Messager.error('Text span contained new-line, rejected', duration=3)
if undo_resp:
mods_json['undo'] = json_dumps(undo_resp)
mods_json['annotations'] = _json_from_ann(ann_obj)
return mods_json
from annotation import BinaryRelationAnnotation
def _create_equiv(ann_obj, projectconf, mods, origin, target, type, attributes,
old_type, old_target):
# due to legacy representation choices for Equivs (i.e. no
# unique ID), support for attributes for Equivs would need
# some extra work. Getting the easy non-Equiv case first.
if attributes is not None:
Messager.warning('_create_equiv: attributes for Equiv annotation not supported yet, please tell the devs if you need this feature (mention "issue #799").')
attributes = None
ann = None
if old_type is None:
# new annotation
# sanity
assert old_target is None, '_create_equiv: incoherent args: old_type is None, old_target is not None (client/protocol error?)'
ann = EquivAnnotation(type, [unicode(origin.id),
unicode(target.id)], '')
ann_obj.add_annotation(ann)
mods.addition(ann)
# TODO: attributes
assert attributes is None, "INTERNAL ERROR" # see above
else:
# change to existing Equiv annotation. Other than the no-op
# case, this remains TODO.
assert projectconf.is_equiv_type(old_type), 'attempting to change equiv relation to non-equiv relation, operation not supported'
# sanity
assert old_target is not None, '_create_equiv: incoherent args: old_type is not None, old_target is None (client/protocol error?)'
if old_type != type:
Messager.warning('_create_equiv: equiv type change not supported yet, please tell the devs if you need this feature (mention "issue #798").')
if old_target != target.id:
Messager.warning('_create_equiv: equiv reselect not supported yet, please tell the devs if you need this feature (mention "issue #797").')
# TODO: attributes
assert attributes is None, "INTERNAL ERROR" # see above
return ann
def _create_relation(ann_obj, projectconf, mods, origin, target, type,
attributes, old_type, old_target, undo_resp={}):
attributes = _parse_attributes(attributes)
if old_type is not None or old_target is not None:
assert type in projectconf.get_relation_types(), (
('attempting to convert relation to non-relation "%s" ' % (target.type, )) +
('(legit types: %s)' % (unicode(projectconf.get_relation_types()), )))
sought_target = (old_target
if old_target is not None else target.id)
sought_type = (old_type
if old_type is not None else type)
sought_origin = origin.id
# We are to change the type, target, and/or attributes
found = None
for ann in ann_obj.get_relations():
if (ann.arg1 == sought_origin and ann.arg2 == sought_target and
ann.type == sought_type):
found = ann
break
if found is None:
# TODO: better response
Messager.error('_create_relation: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))
elif found.arg2 == target.id and found.type == type:
# no changes to type or target
pass
else:
# type and/or target changed, mark.
before = unicode(found)
found.arg2 = target.id
found.type = type
mods.change(before, found)
target_ann = found
else:
# Create a new annotation
new_id = ann_obj.get_new_id('R')
# TODO: do we need to support different relation arg labels
# depending on participant types? This doesn't.
rels = projectconf.get_relations_by_type(type)
rel = rels[0] if rels else None
assert rel is not None and len(rel.arg_list) == 2
a1l, a2l = rel.arg_list
ann = BinaryRelationAnnotation(new_id, type, a1l, origin.id, a2l, target.id, '\t')
mods.addition(ann)
ann_obj.add_annotation(ann)
target_ann = ann
# process attributes
if target_ann is not None:
_set_attributes(ann_obj, target_ann, attributes, mods, undo_resp)
elif attributes is not None:
Messager.error('_create_relation: cannot set arguments: failed to identify target relation (type %s, target %s) (deleted?)' % (str(old_type), str(old_target)))
return target_ann
def _create_argument(ann_obj, projectconf, mods, origin, target, type,
attributes, old_type, old_target):
try:
arg_tup = (type, unicode(target.id))
# Is this an addition or an update?
if old_type is None and old_target is None:
if arg_tup not in origin.args:
before = unicode(origin)
origin.add_argument(type, unicode(target.id))
mods.change(before, origin)
else:
# It already existed as an arg, we were called to do nothing...
pass
else:
# Construct what the old arg tuple would have looked like
old_arg_tup = (type if old_type is None else old_type,
target if old_target is None else old_target)
if old_arg_tup in origin.args and arg_tup not in origin.args:
before = unicode(origin)
origin.args.remove(old_arg_tup)
origin.add_argument(type, unicode(target.id))
mods.change(before, origin)
else:
# Collision etc. don't do anything
pass
except AttributeError:
# The annotation did not have args; it was most likely an entity,
# so we need to create a new Event...
new_id = ann_obj.get_new_id('E')
ann = EventAnnotation(
origin.id,
[arg_tup],
new_id,
origin.type,
''
)
ann_obj.add_annotation(ann)
mods.addition(ann)
# No addressing mechanism for arguments at the moment
return None
def reverse_arc(collection, document, origin, target, type, attributes=None):
directory = collection
#undo_resp = {} # TODO
real_dir = real_directory(directory)
#mods = ModificationTracker() # TODO
projectconf = ProjectConfiguration(real_dir)
document = path_join(real_dir, document)
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
if projectconf.is_equiv_type(type):
Messager.warning('Cannot reverse Equiv arc')
elif not projectconf.is_relation_type(type):
Messager.warning('Can only reverse configured binary relations')
else:
# OK to reverse
found = None
# TODO: more sensible lookup
for ann in ann_obj.get_relations():
if (ann.arg1 == origin and ann.arg2 == target and
ann.type == type):
found = ann
break
if found is None:
Messager.error('reverse_arc: failed to identify target relation (from %s to %s, type %s) (deleted?)' % (str(origin), str(target), str(type)))
else:
# found it; just adjust this
found.arg1, found.arg2 = found.arg2, found.arg1
# TODO: modification tracker
json_response = {}
json_response['annotations'] = _json_from_ann(ann_obj)
return json_response
# TODO: undo support
def create_arc(collection, document, origin, target, type, attributes=None,
old_type=None, old_target=None, comment=None):
directory = collection
undo_resp = {}
real_dir = real_directory(directory)
mods = ModificationTracker()
projectconf = ProjectConfiguration(real_dir)
document = path_join(real_dir, document)
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
# TODO: make consistent across the different editing
# functions, integrate ann_obj initialization and checks
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
origin = ann_obj.get_ann_by_id(origin)
target = ann_obj.get_ann_by_id(target)
# if there is a previous annotation and the arcs aren't in
# the same category (e.g. relation vs. event arg), process
# as delete + create instead of update.
if old_type is not None and (
projectconf.is_relation_type(old_type) !=
projectconf.is_relation_type(type) or
projectconf.is_equiv_type(old_type) !=
projectconf.is_equiv_type(type)):
_delete_arc_with_ann(origin.id, old_target, old_type, mods,
ann_obj, projectconf)
old_target, old_type = None, None
if projectconf.is_equiv_type(type):
ann = _create_equiv(ann_obj, projectconf, mods, origin, target,
type, attributes, old_type, old_target)
elif projectconf.is_relation_type(type):
ann = _create_relation(ann_obj, projectconf, mods, origin, target,
type, attributes, old_type, old_target)
else:
ann = _create_argument(ann_obj, projectconf, mods, origin, target,
type, attributes, old_type, old_target)
# process comments
if ann is not None:
_set_comments(ann_obj, ann, comment, mods,
undo_resp=undo_resp)
elif comment is not None:
Messager.warning('create_arc: non-empty comment for None annotation (unsupported type for comment?)')
mods_json = mods.json_response()
mods_json['annotations'] = _json_from_ann(ann_obj)
return mods_json
# helper for delete_arc
def _delete_arc_equiv(origin, target, type_, mods, ann_obj):
# TODO: this is slow, we should have a better accessor
for eq_ann in ann_obj.get_equivs():
# We don't assume that the ids only occur in one Equiv, we
# keep on going since the data "could" be corrupted
if (unicode(origin) in eq_ann.entities and
unicode(target) in eq_ann.entities and
type_ == eq_ann.type):
before = unicode(eq_ann)
eq_ann.entities.remove(unicode(origin))
eq_ann.entities.remove(unicode(target))
mods.change(before, eq_ann)
if len(eq_ann.entities) < 2:
# We need to delete this one
try:
ann_obj.del_annotation(eq_ann)
mods.deletion(eq_ann)
except DependingAnnotationDeleteError, e:
#TODO: This should never happen, dep on equiv
raise
# TODO: warn on failure to delete?
# helper for delete_arc
def _delete_arc_nonequiv_rel(origin, target, type_, mods, ann_obj):
# TODO: this is slow, we should have a better accessor
for ann in ann_obj.get_relations():
if ann.type == type_ and ann.arg1 == origin and ann.arg2 == target:
ann_obj.del_annotation(ann)
mods.deletion(ann)
# TODO: warn on failure to delete?
# helper for delete_arc
def _delete_arc_event_arg(origin, target, type_, mods, ann_obj):
event_ann = ann_obj.get_ann_by_id(origin)
# Try if it is an event
arg_tup = (type_, unicode(target))
if arg_tup in event_ann.args:
before = unicode(event_ann)
event_ann.args.remove(arg_tup)
mods.change(before, event_ann)
else:
# What we were to remove did not even exist in the first place
# TODO: warn on failure to delete?
pass
def _delete_arc_with_ann(origin, target, type_, mods, ann_obj, projectconf):
origin_ann = ann_obj.get_ann_by_id(origin)
# specifics of delete determined by arc type (equiv relation,
# other relation, event argument)
if projectconf.is_relation_type(type_):
if projectconf.is_equiv_type(type_):
_delete_arc_equiv(origin, target, type_, mods, ann_obj)
else:
_delete_arc_nonequiv_rel(origin, target, type_, mods, ann_obj)
elif projectconf.is_event_type(origin_ann.type):
_delete_arc_event_arg(origin, target, type_, mods, ann_obj)
else:
Messager.error('Unknown annotation types for delete')
def delete_arc(collection, document, origin, target, type):
directory = collection
real_dir = real_directory(directory)
mods = ModificationTracker()
projectconf = ProjectConfiguration(real_dir)
document = path_join(real_dir, document)
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
_delete_arc_with_ann(origin, target, type, mods, ann_obj, projectconf)
mods_json = mods.json_response()
mods_json['annotations'] = _json_from_ann(ann_obj)
return mods_json
# TODO: error handling?
#TODO: ONLY determine what action to take! Delegate to Annotations!
def delete_span(collection, document, id):
directory = collection
real_dir = real_directory(directory)
document = path_join(real_dir, document)
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
mods = ModificationTracker()
#TODO: Handle a failure to find it
#XXX: Slow, O(2N)
ann = ann_obj.get_ann_by_id(id)
try:
# Note: need to pass the tracker to del_annotation to track
# recursive deletes. TODO: make usage consistent.
ann_obj.del_annotation(ann, mods)
try:
trig = ann_obj.get_ann_by_id(ann.trigger)
try:
ann_obj.del_annotation(trig, mods)
except DependingAnnotationDeleteError:
# Someone else depended on that trigger
pass
except AttributeError:
pass
except DependingAnnotationDeleteError, e:
Messager.error(e.html_error_str())
return {
'exception': True,
}
mods_json = mods.json_response()
mods_json['annotations'] = _json_from_ann(ann_obj)
return mods_json
class AnnotationSplitError(ProtocolError):
def __init__(self, message):
self.message = message
def __str__(self):
return self.message
def json(self, json_dic):
json_dic['exception'] = 'annotationSplitError'
Messager.error(self.message)
return json_dic
def split_span(collection, document, args, id):
directory = collection
real_dir = real_directory(directory)
document = path_join(real_dir, document)
# TODO don't know how to pass an array directly, so doing extra catenate and split
tosplit_args = json_loads(args)
with TextAnnotations(document) as ann_obj:
# bail as quick as possible if read-only
if ann_obj._read_only:
raise AnnotationsIsReadOnlyError(ann_obj.get_document())
mods = ModificationTracker()
ann = ann_obj.get_ann_by_id(id)
# currently only allowing splits for events
if not isinstance(ann, EventAnnotation):
raise AnnotationSplitError("Cannot split an annotation of type %s" % ann.type)
# group event arguments into ones that will be split on and
# ones that will not, placing the former into a dict keyed by
# the argument without trailing numbers (e.g. "Theme1" ->
# "Theme") and the latter in a straight list.
split_args = {}
nonsplit_args = []
import re
for arg, aid in ann.args:
m = re.match(r'^(.*?)\d*$', arg)
if m:
arg = m.group(1)
if arg in tosplit_args:
if arg not in split_args:
split_args[arg] = []
split_args[arg].append(aid)
else:
nonsplit_args.append((arg, aid))
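# Illustrative example (made-up ids): for args
# [('Theme1', 'T1'), ('Theme2', 'T2'), ('Cause', 'T3')] with
# tosplit_args ['Theme'], this yields split_args {'Theme': ['T1', 'T2']}
# and nonsplit_args [('Cause', 'T3')].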
# verify that split is possible
for a in tosplit_args:
acount = len(split_args.get(a,[]))
if acount < 2:
raise AnnotationSplitError("Cannot split %s on %s: only %d %s arguments (need two or more)" % (ann.id, a, acount, a))
# create all combinations of the args on which to split
argument_combos = [[]]
for a in tosplit_args:
new_combos = []
for aid in split_args[a]:
for c in argument_combos:
new_combos.append(c + [(a, aid)])
argument_combos = new_combos
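# Continuing the illustrative example above: argument_combos becomes
# [[('Theme', 'T1')], [('Theme', 'T2')]], so the original event keeps
# Theme T1 (plus the Cause) and a new event is created for Theme T2.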
# create the new events (first combo will use the existing event)
from copy import deepcopy
new_events = []
for i, arg_combo in enumerate(argument_combos):
# tweak args
if i == 0:
ann.args = nonsplit_args[:] + arg_combo
else:
newann = deepcopy(ann)
newann.id = ann_obj.get_new_id("E") # TODO: avoid hard-coding ID prefix
newann.args = nonsplit_args[:] + arg_combo
ann_obj.add_annotation(newann)
new_events.append(newann)
mods.addition(newann)
# then, go through all the annotations referencing the original
# event, and create appropriate copies
for a in ann_obj:
soft_deps, hard_deps = a.get_deps()
refs = soft_deps | hard_deps
if ann.id in refs:
# Referenced; make duplicates appropriately
if isinstance(a, EventAnnotation):
# go through args and make copies for referencing
new_args = []
for arg, aid in a.args:
if aid == ann.id:
for newe in new_events:
new_args.append((arg, newe.id))
a.args.extend(new_args)
elif isinstance(a, AttributeAnnotation):
for newe in new_events:
newmod = deepcopy(a)
newmod.target = newe.id
newmod.id = ann_obj.get_new_id("A") # TODO: avoid hard-coding ID prefix
ann_obj.add_annotation(newmod)
mods.addition(newmod)
elif isinstance(a, BinaryRelationAnnotation):
# TODO
raise AnnotationSplitError("Cannot adjust annotation referencing split: not implemented for relations! (WARNING: annotations may be in inconsistent state, please reload!) (Please complain to the developers to fix this!)")
elif isinstance(a, OnelineCommentAnnotation):
for newe in new_events:
newcomm = deepcopy(a)
newcomm.target = newe.id
newcomm.id = ann_obj.get_new_id("#") # TODO: avoid hard-coding ID prefix
ann_obj.add_annotation(newcomm)
mods.addition(newcomm)
elif isinstance(a, NormalizationAnnotation):
for newe in new_events:
newnorm = deepcopy(a)
newnorm.target = newe.id
newnorm.id = ann_obj.get_new_id("N") # TODO: avoid hard-coding ID prefix
ann_obj.add_annotation(newnorm)
mods.addition(newnorm)
else:
raise AnnotationSplitError("Cannot adjust annotation referencing split: not implemented for %s! (Please complain to the lazy developers to fix this!)" % a.__class__)
mods_json = mods.json_response()
mods_json['annotations'] = _json_from_ann(ann_obj)
return mods_json
def set_status(directory, document, status=None):
real_dir = real_directory(directory)
with TextAnnotations(path_join(real_dir, document)) as ann:
# Erase all old status annotations
for old_status in ann.get_statuses():
ann.del_annotation(old_status)
if status is not None:
# XXX: This could work, not sure if it can induce an id collision
new_status_id = ann.get_new_id('#')
ann.add_annotation(OnelineCommentAnnotation(
status, new_status_id, 'STATUS', ''
))
json_dic = {
'status': status
}
return json_dic
def get_status(directory, document):
with TextAnnotations(path_join(real_directory(directory), document),
read_only=True) as ann:
# XXX: Assume the last one is correct if we have more
# than one (which is a violation of protocol anyway)
statuses = [c for c in ann.get_statuses()]
if statuses:
status = statuses[-1].target
else:
status = None
json_dic = {
'status': status
}
return json_dic