db_detailed_stats.py 8.53 KB

Edit Raw Blame History

# -*- coding: utf-8 -*-

#!/usr/bin/env python
import re
import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser

# A solution that avoids this hard-coded relative path would be welcome.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config

def get_desired():
    """Return the target share (in percent) for every text type.

    Keys are the unicode text-type labels; values are the percentages
    printed by print_stats in the "Docelowy procent" column.
    """
    return {
        u"Dzienniki": 25.5,
        u"Pozostałe periodyki": 23.5,
        u"Książki publicystyczne": 1.0,
        u"Literatura piękna": 16.0,
        u"Literatura faktu": 5.5,
        u"Typ informacyjno-poradnikowy": 5.5,
        u"Typ naukowo-dydaktyczny": 2.0,
        u"Internetowe interaktywne (blogi, fora, usenet)": 3.5,
        u"Internetowe nieinteraktywne (statyczne strony, Wikipedia)": 3.5,
        u"Quasi-mówione (protokoły sesji parlamentu)": 2.5,
        u"Mówione medialne": 2.5,
        u"Mówione konwersacyjne": 5.0,
        u"Inne teksty pisane": 3.0,
        u"Książka niebeletrystyczna nieklasyfikowana": 1.0,
    }


def fill(text):
    """Right-pad ``text`` with spaces to the default width of 20 characters.

    Thin convenience wrapper around fill_custom; ``text`` must be a UTF-8
    encoded byte string, as required by that function.
    """
    return fill_custom(text, l=20, sym=" ")

def fill_custom(text, l, sym):
    """Right-pad a UTF-8 encoded string with ``sym`` and re-encode it.

    The input is decoded first so that the width is measured in characters
    rather than bytes.

    text -- UTF-8 encoded byte string to pad
    l    -- minimum width in characters; longer text is not truncated
    sym  -- string appended once for every missing character

    Returns the padded text encoded as UTF-8.
    """
    decoded = text.decode("utf-8")
    missing = max(0, l - len(decoded))
    padding = "".join([sym] * missing)
    return (decoded + padding).encode("utf-8")

def get_span_size(span, words):
    """Return the number of words covered by a markable span.

    span  -- comma-separated fragments, each either a single word id or a
             range written as "first_id..last_id"
    words -- list of word ids in document order

    Each fragment contributes the distance between the positions of its
    first and last id in ``words``, plus one. list.index raises ValueError
    when an id is not present in ``words``.
    """
    def fragment_length(fragment):
        bounds = fragment.split("..")
        start = words.index(bounds[0])
        end = words.index(bounds[-1])
        return end - start + 1

    return sum(fragment_length(piece) for piece in span.split(","))


def count_file_stats(path):
    """Collect word, mention and cluster statistics for one annotated text.

    Reads ``path + "_words.xml"`` and ``path + "_mentions.xml"`` line by
    line, extracting attributes with regular expressions.

    Returns a 4-tuple:
      * number of word ids found in the words file,
      * dict mapping mention size (in words) -> number of mentions,
      * dict mapping cluster size (mentions sharing one mention_group
        value) -> number of such clusters,
      * number of mentions whose near_identity value is neither "empty"
        nor "".

    A markable line without a mention_group or near_identity attribute is
    treated as if the attribute were empty instead of raising IndexError.
    get_span_size raises ValueError when a span refers to an unknown word.
    """
    word_id_re = re.compile("<word.* id=\"(.*?)\".*")
    words = []
    with open(path + "_words.xml", "r") as f:
        for line in f:
            found = word_id_re.findall(line)
            if len(found) == 1:
                words.append(found[0])

    sp = re.compile("<markable.*span=\"(.*?)\".*")
    mg = re.compile(".*mention_group=\"(.*?)\".*")
    ni = re.compile(".*near_identity=\"(.*?)\".*")

    sets = {}           # mention_group value -> number of mentions in it
    near_id = 0
    mention_sizes = {}  # mention size -> number of mentions
    with open(path + "_mentions.xml", "r") as f:
        for line in f:
            spans = sp.findall(line)
            if len(spans) != 1:
                continue  # not a markable line

            mention_size = get_span_size(spans[0], words)
            mention_sizes[mention_size] = mention_sizes.get(mention_size, 0) + 1

            # Missing attributes are handled like the explicit "empty" value.
            group_found = mg.findall(line)
            near_found = ni.findall(line)
            group = group_found[0] if group_found else ""
            near = near_found[0] if near_found else ""

            if near not in ("empty", ""):
                near_id += 1

            if group not in ("empty", ""):
                sets[group] = sets.get(group, 0) + 1

    # Histogram of cluster sizes: size -> how many clusters have that size.
    mg_sizes = {}
    for cluster_size in sets.values():
        mg_sizes[cluster_size] = mg_sizes.get(cluster_size, 0) + 1

    return len(words), mention_sizes, mg_sizes, near_id


def merge_dicts(dict1, dict2):
    """Add the values of ``dict2`` into ``dict1`` in place.

    For a key present in both, the values are combined with ``+``; a key
    only present in ``dict2`` is copied over. ``dict2`` is not modified
    and nothing is returned.

    ``items()`` is used instead of ``iteritems()`` so any mapping works,
    on both Python 2 and Python 3.
    """
    for key, val in dict2.items():
        if key in dict1:
            dict1[key] = dict1[key] + val
        else:
            dict1[key] = val


def get_text_types(db, used_path, mapping_path):
    """Map file numbers to text-type labels using two ';'-separated files.

    db           -- unused; kept so existing callers keep working. The
                    previous implementation only used it to build a list
                    of non-rejected file names that was never read.
    used_path    -- file whose lines start with "nr;id": field 0 is the
                    file number, field 1 the text identifier
    mapping_path -- file whose lines hold the type label in field 0 and
                    the text identifier in field 2

    Returns a dict mapping each file number to the (unicode) type label of
    its text identifier. Numbers whose identifier does not occur in the
    mapping file are absent. Lines with too few fields are skipped.
    """
    # Group file numbers by text identifier.
    id2nrs = {}
    with open(used_path, "r") as f:
        for line in f:
            # Drop the line ending so an identifier in the last field
            # still matches the (stripped) identifier in the mapping file.
            fields = line.rstrip("\r\n").split(";")
            if len(fields) < 2:
                continue
            nr, text_id = fields[0], fields[1]
            id2nrs.setdefault(text_id, []).append(nr)

    types = {}
    with open(mapping_path, "r") as f:
        for line in f:
            fields = line.strip().split(";")
            if len(fields) < 3:
                continue
            label = fields[0]
            if isinstance(label, bytes):
                label = label.decode("utf-8")
            for nr in id2nrs.get(fields[2], ()):
                types[nr] = label

    return types


def print_stats(db, ann, types):
    gnidcnt = 0
    gwcnt = 0
    gms = {}
    gmgs = {}
    wbalance = {}
    tbalance = {}

    tc = 0

    for filename, file in db.file_index.iteritems():

        type = types[filename]
        idx = 0

        if ann == "rejected":
            if db.rejected(file):
                path = os.path.join(wc.new_path(), filename)
                wcnt, ms, mgs, nidcnt = count_file_stats(path)

                merge_dicts(wbalance, {type : wcnt})
                merge_dicts(tbalance, {type : 1})

                gwcnt = gwcnt + wcnt
                gnidcnt = gnidcnt + nidcnt
                merge_dicts(gms, ms)
                merge_dicts(gmgs, mgs)

                tc = tc + 1

        else:
            if db.rejected(file):
                continue

            for annotation in file.findall(ann):
                if db.finished(annotation):
                    if ann == "ann":
                        path = wc.upload_path(filename, idx)
                    else:
                        path = wc.upload_prim_path(filename)

                    wcnt, ms, mgs, nidcnt = count_file_stats(path)

                    merge_dicts(wbalance, {type : wcnt})
                    merge_dicts(tbalance, {type : 1})

                    gwcnt = gwcnt + wcnt
                    gnidcnt = gnidcnt + nidcnt
                    merge_dicts(gms, ms)
                    merge_dicts(gmgs, mgs)

                    tc = tc + 1

                idx = idx + 1

    desired = get_desired()
    print
    print fill_custom("Typ tekstow", 60, " "), fill("Liczba tekstow"), fill("Liczba slow"), fill("Procent slow"), fill("Docelowy procent")
    print fill_custom("", 150, "-")
    for type in sorted(set(wbalance.keys()) | set(tbalance.keys())):
        percent = round(1.0 * wbalance[type] / gwcnt * 100, 2)
        print fill_custom(type.encode("utf-8"), 60, " "), fill(str(tbalance[type])), fill(str(wbalance[type])), fill(str(percent) + "%"), fill(str(desired[type])+"%")
    print fill_custom("", 150, "-")
    print fill_custom("dowolny", 60, " "), fill(str(tc)), fill(str(gwcnt)), fill("100.0%"), fill("100.0%")

    print

    if ann != "rejected":
        print "Wielkosci wystapien:"
        suma = 0
        for key, val in sorted(gms.iteritems()):
            print "  ", val, "wystapien o wielkosci", key
            suma = suma + val
        print "  ", "--------------------------"
        print "  ", suma, "wystapien o dowolnej wielkosci"

        print

        print "Wielkosci klastrow:"
        suma = 0
        for key, val in sorted(gmgs.iteritems()):
            print "  ", val, "klastrow o wielkosci", key
            suma = suma + val
        print "  ", "--------------------------"
        print "  ", suma, "klastrow o dowolnej wielkosci"

        print

        print "Liczba linkow:", gnidcnt


if __name__ == "__main__":
    optparser = OptionParser(usage="""usage: %prog CONFIG USED_LIST MAPPING""")
    (options, args) = optparser.parse_args()
    if len(args) < 3:
        optparser.print_help()
        sys.exit(0)

    conf_path = args[0]
    used_path = args[1]
    mapping_path = args[2]
    cfg = Config(conf_path)
    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), int(cfg["anno_per_file"]))

    types = get_text_types(db, used_path, mapping_path)

    print "################ Statystyki anotacji #########################"
    print_stats(db, "ann", types)
    print
    print "################ Statystyki superanotacji ####################"
    print_stats(db, "s_ann", types)
    print
#    print "################ rejected files stats####################"
#    print_stats(db, "rejected", types)