sentencesplit.py
2.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#!/usr/bin/env python
'''
Basic sentence splitter using brat segmentation to add newlines to
input text at likely sentence boundaries.
'''
import sys
from os.path import join as path_join
from os.path import dirname
# Assuming this script is found in the brat tools/ directory ...
from sys import path as sys_path
sys_path.append(path_join(dirname(__file__), '../server/src'))
# import brat sentence boundary generator
from ssplit import regex_sentence_boundary_gen
def _text_by_offsets_gen(text, offsets):
for start, end in offsets:
yield text[start:end]
def _normspace(s):
import re
return re.sub(r'\s', ' ', s)
def sentencebreaks_to_newlines(text):
offsets = [o for o in regex_sentence_boundary_gen(text)]
# break into sentences
sentences = [s for s in _text_by_offsets_gen(text, offsets)]
# join up, adding a newline for space where possible
orig_parts = []
new_parts = []
sentnum = len(sentences)
for i in range(sentnum):
sent = sentences[i]
orig_parts.append(sent)
new_parts.append(sent)
if i < sentnum-1:
orig_parts.append(text[offsets[i][1]:offsets[i+1][0]])
if (offsets[i][1] < offsets[i+1][0] and
text[offsets[i][1]].isspace()):
# intervening space; can add newline
new_parts.append('\n'+text[offsets[i][1]+1:offsets[i+1][0]])
else:
new_parts.append(text[offsets[i][1]:offsets[i+1][0]])
if len(offsets) and offsets[-1][1] < len(text):
orig_parts.append(text[offsets[-1][1]:])
new_parts.append(text[offsets[-1][1]:])
# sanity check
assert text == ''.join(orig_parts), "INTERNAL ERROR:\n '%s'\nvs\n '%s'" % (text, ''.join(orig_parts))
splittext = ''.join(new_parts)
# sanity
assert len(text) == len(splittext), "INTERNAL ERROR"
assert _normspace(text) == _normspace(splittext), "INTERNAL ERROR:\n '%s'\nvs\n '%s'" % (_normspace(text), _normspace(splittext))
return splittext
def main(argv):
while True:
text = sys.stdin.readline()
if len(text) == 0:
break
sys.stdout.write(sentencebreaks_to_newlines(text))
if __name__ == "__main__":
sys.exit(main(sys.argv))