extract_files.py
3.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser
# TODO: find a way to import the "dfs" package that does not rely on this
# hard-coded path relative to the script location.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))
from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config
def extract_file(file, exts, target_dir):
    """Copy every file that shares the prefix ``file`` into ``target_dir``.

    For each suffix in ``exts`` the source path is built as ``file + ext`` and
    copied with ``shutil.copy``. Progress is printed to stdout.

    Args:
        file: Path prefix of the files to copy (the path without the suffix).
        exts: Iterable of suffixes appended to ``file``, e.g. ``".mmax"``.
        target_dir: Destination handed to ``shutil.copy``; it should be an
            existing directory, otherwise ``shutil.copy`` treats it as the
            name of the destination file.

    Raises:
        IOError/OSError: Propagated from ``shutil.copy`` when a source file
            is missing or the destination is not writable.
    """
    # A single-argument print(...) with %-formatting produces the same output
    # under Python 2 (print statement) and Python 3 (print function).
    print("Extracting file %s" % (file,))
    for ext in exts:
        filename = file + ext
        print("\t- %s" % (os.path.basename(filename),))
        shutil.copy(filename, target_dir)
def _to_bytes(value):
    """Return ``value`` as a byte string, encoding text as UTF-8.

    Byte strings are passed through untouched, so the helper accepts both
    byte and text strings under Python 2 and Python 3.
    """
    if isinstance(value, bytes):
        return value
    return value.encode("utf-8")


def _collect_filepaths(db, wc, file_type):
    """Select the upload paths of all non-rejected files matching ``file_type``.

    Args:
        db: Database object; ``file_index``, ``rejected``, ``finished``,
            ``returned`` and ``get_reason`` are used.
        wc: Repo object; ``upload_path`` and ``upload_prim_path`` are used.
        file_type: One of ``"finished"``, ``"annotated"`` or ``"returned"``.

    Returns:
        A dict mapping each selected path to the reason obtained from
        ``db.get_reason`` (for ``"returned"``) or to ``None`` (otherwise).
    """
    filepaths = {}
    # NOTE(review): assumes db.file_index is dict-like and offers items();
    # the original called iteritems(), which only exists on Python 2.
    for filename, file_elem in db.file_index.items():
        if db.rejected(file_elem):
            continue
        if file_type == "finished":
            sann = file_elem.find("s_ann")
            if sann is not None and db.finished(sann):
                filepaths[wc.upload_prim_path(filename)] = None
        elif file_type == "annotated":
            # The position of the "ann" element selects the upload path.
            for idx, ann in enumerate(file_elem.findall("ann")):
                if db.finished(ann):
                    filepaths[wc.upload_path(filename, idx)] = None
        elif file_type == "returned":
            # Only the first returned "ann" element is considered.
            reason = None
            for idx, ann in enumerate(file_elem.findall("ann")):
                if db.returned(ann):
                    reason = db.get_reason(ann)
                    break
            if reason is not None:
                filepaths[wc.upload_path(filename, idx)] = reason
                continue
            # No "ann" element supplied a reason: fall back to the first
            # returned "s_ann" element.
            for sann in file_elem.findall("s_ann"):
                if db.returned(sann):
                    reason = db.get_reason(sann)
                    break
            if reason is not None:
                filepaths[wc.upload_prim_path(filename)] = reason
    return filepaths


def _write_reasons(filepaths, target_dir):
    """Write ``reasons.txt`` into ``target_dir``.

    The file gets one ``<basename> - <reason>`` line per entry of
    ``filepaths``, ordered by path.
    """
    reasons_path = os.path.join(target_dir, "reasons.txt")
    # The file is written as bytes so the same code runs on Python 2 and 3;
    # os.linesep keeps the platform line ending that text mode produced.
    newline = _to_bytes(os.linesep)
    with open(reasons_path, "wb") as f:
        for path, reason in sorted(filepaths.items()):
            name = os.path.basename(path)
            f.write(_to_bytes(name) + b" - " + _to_bytes(reason) + newline)


def main():
    """Parse the command line and copy the selected files into TARGET_DIR.

    Expects three positional arguments: CONFIG (configuration path), TYPE
    (``finished``, ``returned`` or ``annotated``) and TARGET_DIR. For TYPE
    ``returned`` a ``reasons.txt`` file is written next to the copied files.
    """
    optparser = OptionParser(usage="""usage: %prog [options] CONFIG TYPE TARGET_DIR""")
    optparser.add_option("--extensions", dest="exts", default=".mmax,_mentions.xml,_words.xml",
                         help="List of comma-separated file extensions")
    (options, args) = optparser.parse_args()
    if len(args) != 3:
        optparser.print_help()
        sys.exit(0)

    conf_path, file_type, target_dir = args
    cfg = Config(conf_path)
    anno_per_file = int(cfg["anno_per_file"])
    exts = options.exts.split(",")

    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), anno_per_file)

    poss_types = ["finished", "returned", "annotated"]
    if file_type not in poss_types:
        print("Possible types:  %s" % (poss_types,))
        sys.exit(0)

    # shutil.copy would otherwise create a *file* named target_dir and
    # overwrite it with every copied file.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    filepaths = _collect_filepaths(db, wc, file_type)

    # Sorted so that the extraction order (and the log) is deterministic.
    for path in sorted(filepaths):
        extract_file(path, exts, target_dir)

    if file_type == "returned":
        _write_reasons(filepaths, target_dir)


if __name__ == "__main__":
    main()