#!/usr/bin/env python
# Script to revise the whitespace content of a PMC NXML file for text
# content extraction.
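#
# Example invocations (illustrative only; the option names come from
# argparser() at the end of this file, and the file names are made up):
#
#     python respace.py -d out/ article.nxml
#     python respace.py -s article.nxml > article.respaced.nxml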
from __future__ import with_statement
import sys
import os
import re
import codecs
# TODO: switch to lxml
try:
    import xml.etree.ElementTree as ET
except ImportError:
    import cElementTree as ET
# TODO: the model of "space wrap" is unnecessarily crude in many
# cases. For example, for <issue> we would ideally want to have
# "<issue>1</issue><title>Foo</title>" spaced as "1 Foo" but
# "<issue>1</issue>: <page>100</page>" spaced as "1: 100". This could
# be addressed by differentiating between things that should be
# wrapped by space in all cases and ones where it's only needed
# at non-word-boundaries (\b).
# tag to use for inserted elements
INSERTED_ELEMENT_TAG = "n2t-spc"
INPUT_ENCODING="UTF-8"
OUTPUT_ENCODING="UTF-8"
# command-line options
options = None
newline_wrap_element = set([
    "CURRENT_TITLE",
    "CURRENT_AUTHORLIST",
    "ABSTRACT",
    "P",
    "TABLE",
    "FIGURE",
    "HEADER",
    "REFERENCE",
    "article-title",
    "abstract",
    "title",
    "sec",
    "p",
    "contrib",   # contributor (author list)
    "aff",       # affiliation
    "pub-date",  # publication date
    "copyright-statement",
    "table",
    "table-wrap",
    "figure",
    "fig",       # figure (alternate)
    "tr",        # table row
    "kwd-group", # keyword group
])

space_wrap_element = set([
    "AUTHOR",
    "SURNAME",
    "CURRENT_AUTHOR",
    "CURRENT_SURNAME",
    "TITLE",
    "JOURNAL",
    "YEAR",
    # author lists
    "surname",
    "given-names",
    "email",
    # citation details
    "volume",
    "issue",
    "year",
    "month",
    "day",
    "fpage",
    "lpage",
    "pub-id",
    "copyright-year",
    # journal meta
    "journal-id",
    "journal-title",
    "issn",
    "publisher-name",
    # article meta
    "article-id",
    "kwd", # keyword
    # miscellaneous
    "label",
    "th",
    "td",
])
# strip anything that we're wrapping; this is a bit unnecessarily
# aggressive in some cases but guarantees normalization
strip_element = newline_wrap_element | space_wrap_element
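
# Illustrative sketch of the intended effect (the fragment is made up,
# the element names come from the sets above): in
# "<contrib><surname>Smith</surname><given-names>J</given-names></contrib>"
# the space-wrapped name parts should come out as "Smith J" in the
# extracted text, while newline-wrapped elements such as <p> and <title>
# should each end up on their own line.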

class Standoff:
    def __init__(self, element, start, end):
        self.element = element
        self.start = start
        self.end = end

def txt(s):
    return s if s is not None else ""

def text_and_standoffs(e):
    strings, standoffs = [], []
    _text_and_standoffs(e, 0, strings, standoffs)
    text = "".join(strings)
    return text, standoffs
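
# Rough example of the flattening performed by text_and_standoffs()
# (the fragment is made up): for
#     <p>Hello <b>world</b>!</p>
# it should yield the text "Hello world!" together with standoffs
# equivalent to Standoff(<p>, 0, 12) and Standoff(<b>, 6, 11), i.e. one
# Standoff per element with character offsets into the concatenated text.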

def _text_and_standoffs(e, curroff, strings, standoffs):
    startoff = curroff

    # to keep standoffs in element occurrence order, append
    # a placeholder before recursing
    so = Standoff(e, 0, 0)
    standoffs.append(so)

    if e.text is not None and e.text != "":
        strings.append(e.text)
        curroff += len(e.text)

    curroff = _subelem_text_and_standoffs(e, curroff, strings, standoffs)

    so.start = startoff
    so.end = curroff

    return curroff

def _subelem_text_and_standoffs(e, curroff, strings, standoffs):
    startoff = curroff
    for s in e:
        curroff = _text_and_standoffs(s, curroff, strings, standoffs)
        if s.tail is not None and s.tail != "":
            strings.append(s.tail)
            curroff += len(s.tail)
    return curroff

def preceding_space(pos, text, rewritten={}):
    while pos > 0:
        pos -= 1
        if pos not in rewritten:
            # no rewrite, check normally
            return text[pos].isspace()
        elif rewritten[pos] is not None:
            # refer to rewritten instead of original
            return rewritten[pos].isspace()
        else:
            # character deleted, ignore position
            pass
    # accept start of text
    return True

def following_space(pos, text, rewritten={}):
    while pos < len(text):
        if pos not in rewritten:
            # no rewrite, check normally
            return text[pos].isspace()
        elif rewritten[pos] is not None:
            # refer to rewritten instead of original
            return rewritten[pos].isspace()
        else:
            # character deleted, ignore position
            pass
        pos += 1
    # accept end of text
    return True

def preceding_linebreak(pos, text, rewritten={}):
    if pos >= len(text):
        return True
    while pos > 0:
        pos -= 1
        c = rewritten.get(pos, text[pos])
        if c == "\n":
            return True
        elif c is not None and not c.isspace():
            return False
        else:
            # space or deleted, check further
            pass
    return True

def following_linebreak(pos, text, rewritten={}):
    while pos < len(text):
        c = rewritten.get(pos, text[pos])
        if c == "\n":
            return True
        elif c is not None and not c.isspace():
            return False
        else:
            # space or deleted, check further
            pass
        pos += 1
    return True
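
# Note on the `rewritten` maps used by the four helpers above: they map
# character offsets in the flattened text to pending edits, where None
# marks a character that has been deleted and a one-character string
# (" " or "\n") marks a character to be treated as rewritten to that
# value; offsets not in the map are read from the original text as-is.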

def index_in_parent(e, p):
    """
    Returns the index of the given element e in its parent element p.
    """
    index = None
    for i in range(len(p)):
        if p[i] == e:
            index = i
            break
    assert index is not None, "index_in_parent: error: not parent and child"
    return index

def space_normalize(root, text=None, standoffs=None):
    """
    Eliminates multiple consecutive spaces and normalizes newlines
    (and other space) into regular space.
    """
    if text is None or standoffs is None:
        text, standoffs = text_and_standoffs(root)

    # TODO: this is crude and destructive; improve!
    for so in standoffs:
        e = so.element
        if e.text is not None and e.text != "":
            e.text = re.sub(r'\s+', ' ', e.text)
        if e.tail is not None and e.tail != "":
            e.tail = re.sub(r'\s+', ' ', e.tail)

def strip_elements(root, elements_to_strip=set(), text=None, standoffs=None):
    """
    Removes initial and terminal space from elements that either have
    surrounding space or belong to the given set of elements to strip.
    """
    if text is None or standoffs is None:
        text, standoffs = text_and_standoffs(root)

    # during processing, keep note at which offsets spaces have
    # been eliminated.
    rewritten = {}

    for so in standoffs:
        e = so.element

        # don't remove expressly inserted space
        if e.tag == INSERTED_ELEMENT_TAG:
            continue

        # if the element contains initial space and is either marked
        # for space stripping or preceded by space, remove the initial
        # space.
        if ((e.text is not None and e.text != "" and e.text[0].isspace()) and
            (element_in_set(e, elements_to_strip) or
             preceding_space(so.start, text, rewritten))):
            l = 0
            while l < len(e.text) and e.text[l].isspace():
                l += 1
            space, end = e.text[:l], e.text[l:]
            for i in range(l):
                assert so.start+i not in rewritten, "ERROR: dup remove at %d" % (so.start+i)
                rewritten[so.start+i] = None
            e.text = end

        # element-final space is in e.text only if the element has no
        # children; if it does, the element-final space is found in
        # the tail of the last child.
        if len(e) == 0:
            if ((e.text is not None and e.text != "" and e.text[-1].isspace()) and
                (element_in_set(e, elements_to_strip) or
                 following_space(so.end, text, rewritten))):
                l = 0
                while l < len(e.text) and e.text[-l-1].isspace():
                    l += 1
                start, space = e.text[:-l], e.text[-l:]
                for i in range(l):
                    o = so.end-i-1
                    assert o not in rewritten, "ERROR: dup remove"
                    rewritten[o] = None
                e.text = start
        else:
            c = e[-1]
            if ((c.tail is not None and c.tail != "" and c.tail[-1].isspace()) and
                (element_in_set(e, elements_to_strip) or
                 following_space(so.end, text, rewritten))):
                l = 0
                while l < len(c.tail) and c.tail[-l-1].isspace():
                    l += 1
                start, space = c.tail[:-l], c.tail[-l:]
                for i in range(l):
                    o = so.end-i-1
                    assert o not in rewritten, "ERROR: dup remove"
                    rewritten[o] = None
                c.tail = start
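
# Small illustration of strip_elements() (made-up fragment): with "p" in
# elements_to_strip, the element "<p> foo </p>" has both its leading and
# trailing space removed, and the deleted offsets are recorded in
# `rewritten` so that later surrounding-space checks see the stripped text.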

def trim_tails(root):
    """
    Trims the beginning of the tail of elements where it is preceded
    by space.
    """
    # This function is primarily necessary to cover the special case
    # of empty elements preceded and followed by space, as the
    # consecutive spaces created by such elements are not accessible
    # to the normal text content-stripping functionality.

    # work with standoffs for reference
    text, standoffs = text_and_standoffs(root)

    for so in standoffs:
        e = so.element

        if (e.tail is not None and e.tail != "" and e.tail[0].isspace() and
            preceding_space(so.end, text)):
            l = 0
            while l < len(e.tail) and e.tail[l].isspace():
                l += 1
            space, end = e.tail[:l], e.tail[l:]
            e.tail = end

def reduce_space(root, elements_to_strip=set()):
    """
    Performs space-removing normalizations.
    """
    # convert tree into text and standoffs for reference
    text, standoffs = text_and_standoffs(root)

    strip_elements(root, elements_to_strip, text, standoffs)
    trim_tails(root)
    space_normalize(root, text, standoffs)

def element_in_set(e, s):
    # strip namespaces for lookup
    if e.tag[0] == "{":
        tag = re.sub(r'\{.*?\}', '', e.tag)
    else:
        tag = e.tag
    return tag in s
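
# For example, a namespaced tag such as "{http://example.org/ns}p" (the
# namespace URI here is made up) is looked up in the set simply as "p".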

def process(fn):
    global strip_element
    global options

    # ugly hack for testing: allow "-" for "/dev/stdin"
    if fn == "-":
        fn = "/dev/stdin"

    try:
        tree = ET.parse(fn)
    except:
        print >> sys.stderr, "Error parsing %s" % fn
        raise
    root = tree.getroot()

    ########## space normalization and stripping ##########

    reduce_space(root, strip_element)

    ########## additional space ##########

    # convert tree into text and standoffs
    text, standoffs = text_and_standoffs(root)
    # traverse standoffs and mark each position before which a space
    # or a newline should be assured. Values are (char, early), where
    # char is the whitespace character to insert and early determines
    # whether to select the first or the last among multiple
    # alternative tags before/after which to place the break.
    respace = {}
    for so in standoffs:
        e = so.element
        if element_in_set(e, newline_wrap_element):
            # "late" newline gets priority
            if not (so.start in respace and (respace[so.start][0] == "\n" and
                                             respace[so.start][1] == False)):
                respace[so.start] = ("\n", True)
            respace[so.end] = ("\n", False)
        elif element_in_set(e, space_wrap_element):
            # newlines and "late" get priority
            if not (so.start in respace and (respace[so.start][0] == "\n" or
                                             respace[so.start][1] == False)):
                respace[so.start] = (" ", True)
            if not (so.end in respace and respace[so.end][0] == "\n"):
                respace[so.end] = (" ", False)

    # next, filter respace to remove markers where the necessary space
    # is already present in the text.

    # to allow the filter to take into account linebreaks that will be
    # introduced as part of the processing, maintain rewritten
    # positions separately. (live updating of the text would be too
    # expensive computationally.) As the processing is left-to-right,
    # it's enough to use this for preceding positions and to mark
    # inserts as appearing "before" the place where space is required.
    rewritten = {}

    filtered = {}
    for pos in sorted(respace.keys()):
        if respace[pos][0] == " ":
            # unnecessary if initial, terminal, or preceded/followed
            # by a space
            if not (preceding_space(pos, text, rewritten) or
                    following_space(pos, text, rewritten)):
                filtered[pos] = respace[pos]
                rewritten[pos-1] = " "
        else:
            assert respace[pos][0] == "\n", "INTERNAL ERROR"
            # unnecessary if there's either a preceding or following
            # newline connected by space
            if not (preceding_linebreak(pos, text, rewritten) or
                    following_linebreak(pos, text, rewritten)):
                filtered[pos] = respace[pos]
                rewritten[pos-1] = "\n"
    respace = filtered
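
    # At this point respace might look roughly like
    #     {7: ("\n", False), 42: (" ", True), ...}
    # (the offsets here are invented): a map from offsets in the
    # flattened text to the whitespace still to be inserted there.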

    # for reference, create a map from elements to their parents in the tree.
    parent_map = {}
    for parent in root.getiterator():
        for child in parent:
            parent_map[child] = parent

    # for reference, create a map from positions to standoffs ending
    # at each.
    # TODO: avoid indexing everything; this is only required for
    # standoffs ending at respace positions
    end_map = {}
    for so in standoffs:
        if so.end not in end_map:
            end_map[so.end] = []
        end_map[so.end].append(so)

    # traverse standoffs again, adding the new elements as needed.
    for so in standoffs:

        if so.start in respace and respace[so.start][1] == True:
            # Early space needed here. The current node can be assumed
            # to be the first to "discover" this, so it's appropriate
            # to add space before the current node. We can further
            # assume the current node has a parent (adding space
            # before the root is meaningless), so we can add the space
            # node as the preceding child in the parent.

            e = so.element
            assert e in parent_map, "INTERNAL ERROR: add space before root?"
            p = parent_map[e]
            i = index_in_parent(e, p)

            rse = ET.Element(INSERTED_ELEMENT_TAG)
            rse.text = respace[so.start][0]
            p.insert(i, rse)

            # done, clear
            del respace[so.start]

        if so.end in respace and respace[so.end][1] == False:
            # Late space needed here. Add after the current node iff
            # it's the first of the nodes with the longest span ending
            # here (i.e. the outermost).
            maxlen = max([s.end-s.start for s in end_map[so.end]])
            if so.end-so.start != maxlen:
                continue
            longest = [s for s in end_map[so.end] if s.end-s.start == maxlen]
            if so != longest[0]:
                continue

            # OK to add.
            e = so.element
            assert e in parent_map, "INTERNAL ERROR: add space after root?"
            p = parent_map[e]
            i = index_in_parent(e, p)

            rse = ET.Element(INSERTED_ELEMENT_TAG)
            rse.text = respace[so.end][0]
            p.insert(i+1, rse)
            # need to relocate tail
            rse.tail = e.tail
            e.tail = ""

            # done, clear
            del respace[so.end]

    assert len(respace) == 0, "INTERNAL ERROR: failed to insert %s" % str(respace)

    # re-process to clear out consecutive space potentially introduced
    # in previous processing.
    strip_elements(root)
    trim_tails(root)

    # all done, output

    if options.stdout:
        tree.write(sys.stdout, encoding=OUTPUT_ENCODING)
        return True

    if options is not None and options.directory is not None:
        output_dir = options.directory
    else:
        output_dir = ""

    output_fn = os.path.join(output_dir, os.path.basename(fn))

    # TODO: better checking of path identity to protect against
    # clobbering.
    if output_fn == fn and not options.overwrite:
        print >> sys.stderr, 'respace: skipping output for %s: file would overwrite input (consider -d and -o options)' % fn
    else:
        # OK to write output_fn
        try:
            with open(output_fn, 'w') as of:
                tree.write(of, encoding=OUTPUT_ENCODING)
        except IOError, ex:
            print >> sys.stderr, 'respace: failed write: %s' % ex

    return True
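
# Rough sketch of what process() produces for a made-up fragment: given
#     <sec><title>Results</title><p>We found ...</p></sec>
# a single <n2t-spc> element containing a newline would be inserted
# between </title> and <p>, so that extracted text places the title and
# the paragraph body on separate lines.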

def argparser():
    import argparse
    ap = argparse.ArgumentParser(description='Revise whitespace content of a PMC NXML file for text extraction.')
    ap.add_argument('-d', '--directory', default=None, metavar='DIR', help='output directory')
    ap.add_argument('-o', '--overwrite', default=False, action='store_true', help='allow output to overwrite input files')
    ap.add_argument('-s', '--stdout', default=False, action='store_true', help='output to stdout')
    ap.add_argument('file', nargs='+', help='input PubMed Central NXML file')
    return ap

def main(argv):
    global options
    options = argparser().parse_args(argv[1:])
    for fn in options.file:
        process(fn)
    return 0

if __name__ == "__main__":
    sys.exit(main(sys.argv))