#!/usr/bin/env python
import os
import re
import sys
from collections import defaultdict
from optparse import OptionParser

# Make the project root importable; a solution without this path hack would be welcome
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))
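
# (An alternative that does not rely on sys.argv[0] would be
# os.path.dirname(os.path.abspath(__file__)), which also resolves correctly
# when the script is imported as a module rather than executed directly.)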

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config

def fill(text):
    return fill_custom(text, 20, " ")

def fill_custom(text, width, sym):
    # Pad UTF-8 encoded `text` with `sym` up to `width` characters; the width
    # is measured on the decoded string so multi-byte characters count as one.
    text = text.decode("utf-8")
    to_add = max(0, width - len(text))
    return (text + sym * to_add).encode("utf-8")
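
# Illustrative behaviour of the padding helper (not executed by the script):
#   fill_custom("abc", 6, ".")    ->  "abc..."
#   fill_custom("abcdef", 3, ".") ->  "abcdef"   (text is never truncated)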

def get_span_size(span, words):
    # A span is a comma-separated list of fragments, each either a single
    # word id or a "first..last" range, e.g. "word_1..word_3,word_7".
    # The size is the total number of words covered by all fragments.
    size = 0
    for fragment in span.split(","):
        f = fragment.split("..")
        id1 = words.index(f[0])
        id2 = words.index(f[-1])
        size = size + id2 - id1 + 1
    return size
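
# `words.index` rescans the whole word list for every fragment, so counting
# spans in long documents gets quadratic. A sketch of a faster variant
# (hypothetical, not called anywhere in this script) that takes a precomputed
# id -> position map instead of the raw list:
def get_span_size_fast(span, word_pos):
    # word_pos is assumed to be dict((w, i) for i, w in enumerate(words)).
    size = 0
    for fragment in span.split(","):
        f = fragment.split("..")
        size = size + word_pos[f[-1]] - word_pos[f[0]] + 1
    return size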
        
def count_file_stats(path):
    # Collect word ids in document order from the *_words.xml layer.
    words = []
    word_id = re.compile("<word.* id=\"(.*?)\".*")
    with open(path + "_words.xml", "r") as f:
        for line in f:
            groups = word_id.findall(line)
            if len(groups) == 1:
                words.append(groups[0])
        
    # Attribute extractors for the *_mentions.xml layer.
    sp = re.compile("<markable.*span=\"(.*?)\".*")
    mg = re.compile(".*mention_group=\"(.*?)\".*")
    ni = re.compile(".*near_identity=\"(.*?)\".*")

    sets = defaultdict(int)           # mention group id -> mention count
    mention_sizes = defaultdict(int)  # mention length in words -> count
    near_id = 0                       # mentions with a near-identity link
    with open(path + "_mentions.xml", "r") as f:
        for line in f:
            groups = sp.findall(line)
            if len(groups) == 1:
                span = groups[0]
                mention_sizes[get_span_size(span, words)] += 1

                group = mg.findall(line)[0]
                near = ni.findall(line)[0]

                if near != "empty" and near != "":
                    near_id = near_id + 1

                if group != "empty" and group != "":
                    sets[group] += 1
    
    # Histogram of cluster sizes: mention count -> number of clusters.
    mg_sizes = defaultdict(int)
    for val in sets.itervalues():
        mg_sizes[val] += 1

    return len(words), mention_sizes, mg_sizes, near_id
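
# The regexes in count_file_stats assume one XML element per line, roughly of
# the shape below (illustrative, reconstructed from the patterns themselves):
#   <word id="word_1">...</word>
#   <markable ... span="word_1..word_3" mention_group="set_5"
#             near_identity="empty" ... />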

def merge_dicts(dict1, dict2):
    # Add the counts from dict2 into dict1, mutating dict1 in place.
    for key, val in dict2.iteritems():
        dict1[key] = dict1.get(key, 0) + val
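
# Example: after merge_dicts(d, {"a": 2, "b": 3}) with d = {"a": 1}, d holds
# {"a": 3, "b": 3}; the function returns None.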

def get_text_types(used_path, mapping_path):
    # Build text id -> list of text numbers from the semicolon-separated
    # used list; a line is assumed to look like "<nr>;<id>;...".
    id2nrs = {}
    with open(used_path, "r") as f:
        for line in f:
            spl = line.split(";")
            text_id = spl[1]
            nr = spl[0]
            if text_id not in id2nrs:
                id2nrs[text_id] = []
            id2nrs[text_id].append(nr)

    # Resolve each text number to its text type via the mapping file; a line
    # is assumed to look like "<type>;...;<id>;...".
    types = {}
    with open(mapping_path, "r") as f:
        for line in f:
            spl = line.strip().split(";")
            text_id = spl[2]
            if text_id in id2nrs:
                for nr in id2nrs[text_id]:
                    types[nr] = spl[0].decode("utf-8")

    return types
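
# Illustrative (hypothetical) input lines:
#   used list:  "17;doc-0042;..."           -> nr "17", id "doc-0042"
#   mapping:    "press;...;doc-0042;..."    -> type "press" for text 17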
        
def print_stats(db, ann, types):
    # Note: relies on the module-level Repo instance `wc` created in __main__.
    gms = {}        # text type -> mention length histogram
    gmgs = {}       # text type -> cluster size histogram
    wbalance = {}   # text type -> word count
    tbalance = {}   # text type -> text count
    nibalance = {}  # text type -> near-identity link count

    def accumulate(path, text_type):
        # Merge one file's statistics into the per-type accumulators above.
        wcnt, ms, mgs, nidcnt = count_file_stats(path)

        merge_dicts(wbalance, {text_type: wcnt})
        merge_dicts(tbalance, {text_type: 1})
        merge_dicts(nibalance, {text_type: nidcnt})

        if text_type not in gms:
            gms[text_type] = {}
        if text_type not in gmgs:
            gmgs[text_type] = {}

        merge_dicts(gms[text_type], ms)
        merge_dicts(gmgs[text_type], mgs)

    for filename, entry in db.file_index.iteritems():
        if ann == "rejected":
            if db.rejected(entry):
                accumulate(os.path.join(wc.new_path(), filename), types[filename])
        else:
            if db.rejected(entry):
                continue

            idx = 0
            for annotation in entry.findall(ann):
                if db.finished(annotation):
                    if ann == "ann":
                        path = wc.upload_path(filename, idx)
                    else:
                        path = wc.upload_prim_path(filename)
                    accumulate(path, types[filename])
                idx = idx + 1
    
    
    gwcnt = sum(wbalance.values())
    tc = sum(tbalance.values())
    print
    print "######## Quantitative statistics"
    print fill_custom("Text type", 60, " "), fill("Number of texts"), fill("Number of words"), fill("Percent of words")
    print fill_custom("", 120, "-")
    for text_type in sorted(set(wbalance.keys()) | set(tbalance.keys())):
        percent = round(1.0 * wbalance[text_type] / gwcnt * 100, 2)
        print fill_custom(text_type.encode("utf-8"), 60, " "), fill(str(tbalance[text_type])), fill(str(wbalance[text_type])), fill(str(percent) + "%")
    print fill_custom("", 120, "-")
    print fill_custom("all", 60, " "), fill(str(tc)), fill(str(gwcnt)), fill("100.0%")
    
    if ann != "rejected":
        print
        print "######## Statystyki wystapien"
        print fill_custom("Typ tekstow", 60, " "), fill("Wystapien/tekst"), fill("Dl. wystapienia")
        print fill_custom("", 100, "-")
        tmc = 0
        tsc = 0
        for type in sorted(set(gms.keys())):            
            mention_count = reduce((lambda l, a : l + a), gms[type].values(), 0)
            segment_count = reduce((lambda a, (k, v) : k * v + a), gms[type].iteritems(), 0)       
            avg_ms = round(1.0 * segment_count / mention_count, 2)
            avg_mpt = round(1.0 * mention_count / tbalance[type], 2)
            print fill_custom(type.encode("utf-8"), 60, " "), fill(str(avg_mpt)), fill(str(avg_ms))
            tmc = tmc + mention_count
            tsc = tsc + segment_count        
        print fill_custom("", 100, "-")
        
        if tmc == 0:
            tavg_ms = 0
        else:
            tavg_ms = round(1.0 * tsc / tmc, 2)
            
        if tc == 0:
            tavg_mpt = 0
        else:
            tavg_mpt = round(1.0 * tmc / tc, 2)
            
        print fill_custom("dowolny", 60, " "), fill(str(tavg_mpt)), fill(str(tavg_ms))
        
        print
        print "######## Cluster and link statistics"
        print fill_custom("Text type", 60, " "), fill("Clusters/text"), fill("Cluster size"), fill("Links/text")
        print fill_custom("", 120, "-")
        tmc = 0
        tsc = 0
        tnic = 0
        for text_type in sorted(gmgs.keys()):
            # gmgs[text_type] maps cluster size -> number of clusters.
            group_count = sum(gmgs[text_type].values())
            mention_count = sum(k * v for k, v in gmgs[text_type].iteritems())
            avg_ms = round(1.0 * mention_count / group_count, 2)
            avg_mpt = round(1.0 * group_count / tbalance[text_type], 2)
            avg_ni = round(1.0 * nibalance[text_type] / tbalance[text_type], 2)
            print fill_custom(text_type.encode("utf-8"), 60, " "), fill(str(avg_mpt)), fill(str(avg_ms)), fill(str(avg_ni))
            tmc = tmc + group_count
            tsc = tsc + mention_count
            tnic = tnic + nibalance[text_type]
        print fill_custom("", 120, "-")

        if tmc == 0:
            tavg_ms = 0
        else:
            tavg_ms = round(1.0 * tsc / tmc, 2)

        if tc == 0:
            tavg_mpt = 0
            tavg_ni = 0
        else:
            tavg_mpt = round(1.0 * tmc / tc, 2)
            tavg_ni = round(1.0 * tnic / tc, 2)

        print fill_custom("all", 60, " "), fill(str(tavg_mpt)), fill(str(tavg_ms)), fill(str(tavg_ni))


if __name__ == "__main__":
    optparser = OptionParser(usage="""usage: %prog CONFIG USED_LIST MAPPING""")
    (options, args) = optparser.parse_args()
    if len(args) < 3:
        optparser.print_help()
        sys.exit(1)
        
    conf_path = args[0]
    used_path = args[1]
    mapping_path = args[2]
    cfg = Config(conf_path)
    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), int(cfg["anno_per_file"]))    
    
    types = get_text_types(used_path, mapping_path)
    
    print "################ Statystyki tekstow anotowanych #############################"
    print_stats(db, "ann", types)
    print
    print "################ Statystyki tekstow superanotowanych ########################"
    print_stats(db, "s_ann", types)
    print
#    print "################ Statystyki tekstow odrzuconych #############################"
#    print_stats(db, "rejected", types)