merge.py
2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env python
# -*- Mode: Python; tab-width: 4; indent-tabs-mode: nil; coding: utf-8; -*-
# vim:set ft=python ts=4 sw=4 sts=4 autoindent:
from __future__ import with_statement
'''
Merge BioNLP Shared Task annotation format into a single annotation file.
find data -name '*.a1' -o -name '*.a2' -o -name '*.rel' -o -name '*.co' \
| ./merge.py
Author: Pontus Stenetorp
Version: 2011-01-17
'''
from collections import defaultdict
from os.path import join as join_path
from os.path import split as split_path
from shlex import split as shlex_split
from sys import stderr, stdin
from subprocess import Popen, PIPE
try:
from argparse import ArgumentParser
except ImportError:
from os.path import basename
from sys import path as sys_path
# We are most likely on an old Python and need to use our internal version
sys_path.append(join_path(basename(__file__), '../server/lib'))
from argparse import ArgumentParser
### Constants
#TODO: Add to options?
UNMERGED_SUFFIXES=['a1', 'a2', 'co', 'rel']
#TODO: Add to options?
MERGED_SUFFIX='ann'
ARGPARSER = ArgumentParser(description=("Merge BioNLP'11 ST annotations "
'into a single file, reads paths from stdin'))
ARGPARSER.add_argument('-w', '--no-warn', action='store_true',
help='suppress warnings')
#ARGPARSER.add_argument('-d', '--debug', action='store_true',
# help='activate additional debug output')
###
def keynat(string):
'''
http://code.activestate.com/recipes/285264-natural-string-sorting/
'''
it = type(1)
r = []
for c in string:
if c.isdigit():
d = int(c)
if r and type( r[-1] ) == it:
r[-1] = r[-1] * 10 + d
else:
r.append(d)
else:
r.append(c.lower())
return r
def main(args):
argp = ARGPARSER.parse_args(args[1:])
# ID is the stem of a file
id_to_ann_files = defaultdict(list)
# Index all ID;s before we merge so that we can do a little magic
for file_path in (l.strip() for l in stdin):
if not any((file_path.endswith(suff) for suff in UNMERGED_SUFFIXES)):
if not argp.no_warn:
import sys
print >> sys.stderr, (
'WARNING: invalid file suffix for %s, ignoring'
) % (file_path, )
continue
dirname, basename = split_path(file_path)
id = join_path(dirname, basename.split('.')[0])
id_to_ann_files[id].append(file_path)
for id, ann_files in id_to_ann_files.iteritems():
#XXX: Check if output file exists
lines = []
for ann_file_path in ann_files:
with open(ann_file_path, 'r') as ann_file:
for line in ann_file:
lines.append(line)
with open(id + '.' + MERGED_SUFFIX, 'w') as merged_ann_file:
for line in lines:
merged_ann_file.write(line)
if __name__ == '__main__':
from sys import argv
exit(main(argv))