fix_files.py 3.46 KB
#!/usr/bin/env python

import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser

# Add the parent of the directory containing this script (as given by
# sys.argv[0]) to the module search path; presumably this is what lets the
# `dfs` imports below resolve when the script is run from a checkout.
# TODO: a solution that does not hard-code the relative path would be welcome.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config
            
def fix_file(wc, db, file_id, contents, anno_per_file):
    file_elem = db.file_index[file_id]
    if file_elem is None:    
        print "File", file_id, "not found in database." 
        return False
    
    ann_elem = None
    idx = -1
    for ann in file_elem.findall("ann") + file_elem.findall("s_ann"):
        idx = idx + 1
        if db.returned(ann):
            ann_elem = ann
            break
                
    if ann_elem is None:
        print "File", file_id, "not returned in database." 
        return False
    
    print "Fixing file " + file_id
                
    try:
        db.fix(ann_elem)
        
        if ann_elem.tag == "ann":            
            wc.upload(file_id, idx, contents)
        elif ann_elem.tag == "s_ann":
            wc.upload_prim(file_id, contents)
                      
    except Exception as ex:
        print "\t error: " + str(ex)
        wc.revert()
        return False
    
    return True          
                        
def match_ext(path, exts):
    """Return the first entry of ``exts`` that ``path`` ends with.

    Entries are tried in iteration order, so when several match the
    earliest one wins.  Returns None when nothing matches.
    """
    return next((candidate for candidate in exts if path.endswith(candidate)), None)

def path_id(path, ext):
    """Return the file name of ``path`` with the trailing ``ext`` removed.

    ``ext`` is treated as a literal suffix, not as a regular expression, so
    characters such as "." or "(" in an extension only match themselves.

    path -- file path; only its final component is used.
    ext  -- suffix to strip (e.g. ".mmax" or "_words.xml").

    Returns the bare file name unchanged when ``ext`` is empty or the name
    does not end with it.
    """
    filename = os.path.basename(path)
    # Guard against an empty suffix: filename[:-0] would be the empty string.
    if ext and filename.endswith(ext):
        return filename[:-len(ext)]
    return filename

def group_paths(paths, exts):
    """Read the files in ``paths`` and group their contents by file id.

    A path is considered only when ``match_ext`` finds one of ``exts`` at
    its end; its file id is then computed by ``path_id``.

    paths -- iterable of file paths to inspect.
    exts  -- iterable of file-name suffixes to recognise.

    Returns a defaultdict mapping file id -> {extension: file contents}.
    Paths matching no extension are skipped without being opened.  Errors
    raised while opening or reading a matching file propagate to the caller.
    """
    # The values are dicts, so use dict as the default factory.
    result = defaultdict(dict)
    for path in paths:
        ext = match_ext(path, exts)
        if ext is None:
            continue
        with open(path, "r") as f:
            data = f.read()
        result[path_id(path, ext)][ext] = data
    return result

def get_rec_paths(paths):
    """Expand directories in ``paths`` into the files found beneath them.

    Each entry that is a directory is walked with ``os.walk`` and replaced
    by the paths of all files under it; every other entry is kept as given,
    without checking that it exists.

    Returns a new list in the order the entries were processed.
    """
    collected = []
    for entry in paths:
        if not os.path.isdir(entry):
            collected.append(entry)
            continue
        for root, _subdirs, names in os.walk(entry):
            collected.extend(os.path.join(root, name) for name in names)
    return collected

if __name__ == "__main__":
    # Command-line entry point.  Reads the configuration file named by the
    # first positional argument, gathers the files named by the remaining
    # arguments (directories are expanded recursively), runs fix_file on
    # each group and finally reports which file ids succeeded or failed.
    optparser = OptionParser(usage="""usage: %prog [options] CONFIG FILES""")
    optparser.add_option("--extensions", dest="exts", default=".mmax,_mentions.xml,_words.xml",
            help="List of comma-separated file extensions")
    (options, args) = optparser.parse_args()
    if len(args) < 2:
        optparser.print_help()
        sys.exit(0)

    cfg = Config(args[0])
    anno_per_file = int(cfg["anno_per_file"])

    # Drop blank entries (e.g. from a trailing comma): an empty extension
    # would match every path, because any string ends with "".
    exts = [ext.strip() for ext in options.exts.split(",") if ext.strip()]
    files = group_paths(get_rec_paths(args[1:]), exts)

    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), anno_per_file)

    success = []
    fail = []
    for file_id, contents in files.items():
        if fix_file(wc, db, file_id, contents, anno_per_file):
            success.append(file_id)
        else:
            fail.append(file_id)

    # Only save and commit when at least one file was actually fixed;
    # otherwise there is nothing to record and the commit message would
    # read "Fixed files: []".
    if success:
        db.save()
        wc.commit("Fixed files: " + str(success))

    print("")
    if success:
        print("Fixed files: " + str(success))
    if fail:
        print("Failed to fix files: " + str(fail))