# extract_files.py (3.23 KB)
#!/usr/bin/env python

import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser

# TODO: find a way to locate the dfs package without hard-coding this relative path.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config

def extract_file(file, exts, target_dir):
    """Copy every ``file + ext`` variant of *file* into *target_dir*.

    Args:
        file: Path prefix of the files to copy, without extension.
        exts: Iterable of suffixes appended to *file* to form each source path.
        target_dir: Destination directory; created (with parents) if missing.

    Raises:
        OSError: If *target_dir* cannot be created, e.g. because a regular
            file with that name already exists.
        IOError: Propagated from ``shutil.copy`` when a source is unreadable.
    """
    # shutil.copy() treats a non-existent destination as a *file* name, so
    # without this every copy would overwrite the previous one in a single
    # file called ``target_dir``.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    # Single-argument parenthesised print: same output as the old print
    # statement on Python 2, and valid syntax on Python 3.
    print("Extracting file %s" % (file,))
    for ext in exts:
        filename = file + ext
        print("\t- %s" % (os.path.basename(filename),))
        shutil.copy(filename, target_dir)

# Values accepted for the TYPE command-line argument.
POSS_TYPES = ("finished", "returned", "annotated")


def _collect_filepaths(db, wc, kind):
    """Map each upload path selected by *kind* to its return reason.

    Entries of ``db.file_index`` for which ``db.rejected`` is true are skipped.
    For "finished" and "annotated" the mapped value is always ``None``; for
    "returned" it is the value of ``db.get_reason`` for the matching element.

    Args:
        db: Database object providing ``file_index``, ``rejected``,
            ``finished``, ``returned`` and ``get_reason``.
        wc: Repo object providing ``upload_path`` and ``upload_prim_path``.
        kind: One of ``POSS_TYPES``.

    Returns:
        dict mapping path prefix -> reason (or ``None``).
    """
    filepaths = {}
    for filename, file_elem in db.file_index.items():
        if db.rejected(file_elem):
            continue

        if kind == "finished":
            sann = file_elem.find("s_ann")
            if sann is not None and db.finished(sann):
                filepaths[wc.upload_prim_path(filename)] = None

        elif kind == "annotated":
            for idx, ann in enumerate(file_elem.findall("ann")):
                if db.finished(ann):
                    filepaths[wc.upload_path(filename, idx)] = None

        elif kind == "returned":
            # Only the first returned "ann" is consulted; its position in the
            # list selects the upload path.
            reason = None
            for idx, ann in enumerate(file_elem.findall("ann")):
                if db.returned(ann):
                    reason = db.get_reason(ann)
                    break
            if reason is not None:
                filepaths[wc.upload_path(filename, idx)] = reason
                continue

            # No "ann" yielded a reason: fall back to the first returned
            # "s_ann" element.
            for sann in file_elem.findall("s_ann"):
                if db.returned(sann):
                    reason = db.get_reason(sann)
                    break
            if reason is not None:
                filepaths[wc.upload_prim_path(filename)] = reason

    return filepaths


def _write_reasons(target_dir, filepaths):
    """Write ``reasons.txt`` in *target_dir*: "<basename> - <reason>" per path, sorted by path."""
    filename = os.path.join(target_dir, "reasons.txt")
    with open(filename, 'w') as f:
        for path, reason in sorted(filepaths.items()):
            f.write(os.path.basename(path))
            f.write(" - ")
            # NOTE(review): writing encoded bytes to a text-mode file is
            # Python 2 behaviour; a Python 3 port needs an explicit encoding
            # on open() instead.
            f.write(reason.encode("utf-8"))
            f.write("\n")


def main(argv=None):
    """Command-line entry point: ``[options] CONFIG TYPE TARGET_DIR``.

    Copies the files selected by TYPE into TARGET_DIR and, for the
    "returned" type, also writes ``reasons.txt`` there.

    Args:
        argv: Argument list without the program name; ``None`` means
            ``sys.argv[1:]``.

    Raises:
        SystemExit: With status 2 (via ``OptionParser.error``) when the
            argument count is wrong or TYPE is not one of ``POSS_TYPES``.
    """
    optparser = OptionParser(usage="""usage: %prog [options] CONFIG TYPE TARGET_DIR""")
    optparser.add_option("--extensions", dest="exts", default=".mmax,_mentions.xml,_words.xml",
            help="List of comma-separated file extensions")
    (options, args) = optparser.parse_args(argv)

    # Validate everything that needs no external resource first, and fail
    # with a non-zero status so calling scripts can detect the error.
    if len(args) != 3:
        optparser.error("expected exactly 3 arguments: CONFIG TYPE TARGET_DIR")
    conf_path, kind, target_dir = args
    if kind not in POSS_TYPES:
        optparser.error("invalid TYPE %r; possible types: %s"
                        % (kind, ", ".join(POSS_TYPES)))

    cfg = Config(conf_path)
    anno_per_file = int(cfg["anno_per_file"])
    exts = options.exts.split(",")
    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), anno_per_file)

    filepaths = _collect_filepaths(db, wc, kind)

    # Ensure the destination exists before copying and before reasons.txt is
    # opened inside it.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)

    for path in filepaths:
        extract_file(path, exts, target_dir)

    if kind == "returned":
        _write_reasons(target_dir, filepaths)


if __name__ == "__main__":
    main()