clusters.py 4.6 KB
#!/usr/bin/env python
import re
import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser

# TODO: find a way to import the dfs package without patching sys.path relative to this script's location.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config

def fill(text):
    """Right-pad ``text`` with spaces up to a width of 20 characters.

    Delegates to ``fill_custom``; text that is already 20 characters or
    longer comes back unchanged.
    """
    width, pad = 20, " "
    return fill_custom(text, width, pad)

def fill_custom(text, l, sym):
    """Return ``text`` followed by enough copies of ``sym`` to reach length ``l``.

    text -- the string to pad
    l    -- target length, counted in copies of ``sym`` appended
    sym  -- the padding string appended ``max(0, l - len(text))`` times

    Text that is already at least ``l`` characters long is returned with
    nothing appended; it is never truncated.
    """
    # String repetition replaces the former one-character-at-a-time
    # concatenation loop.
    return text + sym * max(0, l - len(text))

def parse_span(span, words_list):
    """Expand a span expression into the list of word ids it covers.

    ``span`` is a comma-separated list of fragments. Each fragment is either
    a single word id or a range written as ``first..last``. For every
    fragment the ids found in ``words_list`` between the first and the last
    endpoint (both inclusive) are appended to the result, in order.

    Raises ValueError (from ``list.index``) when an endpoint is not present
    in ``words_list``.
    """
    covered = []
    for piece in span.split(","):
        endpoints = piece.split("..")
        # A fragment without ".." has one element, so both endpoints coincide.
        start = words_list.index(endpoints[0])
        stop = words_list.index(endpoints[-1])
        covered.extend(words_list[start:stop + 1])
    return covered

def get_context(ids, words_list, size):
    """Return the words around the region of ``words_list`` covered by ``ids``.

    ids        -- container of word ids to look for (tested with ``in``)
    words_list -- ordered list of word ids
    size       -- number of extra words to include on each side

    The result is the slice of ``words_list`` that runs from ``size`` words
    before the first element found in ``ids`` to ``size`` words after the
    last one, clipped to the bounds of the list. When no element of
    ``words_list`` is in ``ids`` an empty list is returned.
    """
    hits = [pos for pos, word in enumerate(words_list) if word in ids]
    if not hits:
        # Without this guard the old -1 sentinels leaked into the slice
        # arithmetic and produced an unrelated prefix of the list.
        return []

    first = max(0, hits[0] - size)
    # +1 because the slice end is exclusive: the last hit itself plus
    # ``size`` following words must be included.
    last = min(len(words_list), hits[-1] + size + 1)
    return words_list[first:last]
        
def print_file(path):
    """Collect the mention clusters described by one pair of annotation files.

    Reads ``path + "_words.xml"`` and ``path + "_mentions.xml"`` line by line
    with regular expressions (no XML parser is involved).

    Returns a dict that maps a cluster size (number of mentions) to the list
    of clusters of that size. Each cluster is a list of strings, one per
    mention, of the form ``"[orth orth ...]"``. Despite the name, nothing is
    printed here.
    """
    word_pattern = re.compile("<word.* id=\"(.*?)\".*>(.*?)</word>.*")

    # Word id -> surface form, plus the ids in file order.
    orth_by_word_id = {}
    word_ids = []
    with open(path + "_words.xml", "r") as words_file:
        for line in words_file:
            found = word_pattern.findall(line)
            if len(found) != 1:
                continue
            word_id, orth = found[0]
            orth_by_word_id[word_id] = orth
            word_ids.append(word_id)

    markable_pattern = re.compile("<markable.*id=\"(.*?)\".*")
    span_pattern = re.compile(".*span=\"(.*?)\".*")
    comment_pattern = re.compile(".*comment=\"(.*?)\".*")
    group_pattern = re.compile(".*mention_group=\"(.*?)\".*")

    span_by_mention = {}
    # Comments are collected but not consulted anywhere below.
    comment_by_mention = {}
    members_by_cluster = {}
    with open(path + "_mentions.xml", "r") as mentions_file:
        for line in mentions_file:
            id_hits = markable_pattern.findall(line)
            span_hits = span_pattern.findall(line)
            comment_hits = comment_pattern.findall(line)

            # Only lines carrying both a markable id and a span are mentions.
            if len(id_hits) != 1 or len(span_hits) != 1:
                continue

            mention_id = id_hits[0]
            span_by_mention[mention_id] = parse_span(span_hits[0], word_ids)

            if len(comment_hits) == 1:
                comment_by_mention[mention_id] = comment_hits[0]

            # A mention joins a cluster only when its mention_group is
            # present and is neither "empty" nor the empty string.
            group_hits = group_pattern.findall(line)
            if len(group_hits) == 1 and group_hits[0] not in ("empty", ""):
                members_by_cluster.setdefault(group_hits[0], []).append(mention_id)

    clusters_by_size = {}
    for member_ids in members_by_cluster.values():
        rendered = []
        for mention_id in member_ids:
            text = " ".join(orth_by_word_id[wid] for wid in span_by_mention[mention_id])
            rendered.append("[" + text + "]")
        clusters_by_size.setdefault(len(rendered), []).append(rendered)

    return clusters_by_size

def merge_dicts(dict1, dict2):
    """Merge the list-valued dict ``dict2`` into ``dict1`` in place.

    For every key of ``dict2`` its items are appended to the list stored
    under the same key in ``dict1``; a missing key gets a fresh list first.
    ``dict2`` and its lists are never modified, and nothing is returned.
    """
    # items() works on both Python 2 and Python 3 dicts.
    for key, val in dict2.items():
        # Extending a list owned by dict1 (instead of storing ``val`` itself)
        # keeps later merges from silently growing dict2's lists.
        dict1.setdefault(key, []).extend(val)
                    
def print_clusters(db, min_size):
    """Print the clusters of all finished files, grouped by cluster size.

    Walks ``db.file_index``. Entries for which ``db.rejected`` is true are
    skipped; for the others, when ``find("s_ann")`` yields something other
    than None that ``db.finished`` accepts, the files located through
    ``wc.upload_prim_path(filename)`` are parsed with ``print_file`` and the
    result is accumulated with ``merge_dicts``.

    Afterwards every cluster size that is at least ``min_size`` is printed in
    ascending order: a header line followed by one comma-separated line per
    cluster, each preceded by a blank line.

    NOTE: reads the module-level name ``wc``, which this file only assigns in
    its ``__main__`` section.
    """
    by_size = {}
    for filename, entry in db.file_index.iteritems():
        if db.rejected(entry):
            continue

        sann = entry.find("s_ann")
        if sann is None or not db.finished(sann):
            continue

        merge_dicts(by_size, print_file(wc.upload_prim_path(filename)))

    # Single-argument print(...) emits the same bytes as the Python 2
    # print statement this script was written with.
    for size in sorted(by_size):
        if size < min_size:
            continue
        print("")
        # "Dlugosc klastra" is Polish for "cluster length".
        print(" ".join(["#######", "Dlugosc klastra:", str(size), "#######"]))
        for cluster in by_size[size]:
            print("")
            print(", ".join(cluster))
        
if __name__ == "__main__":
    # Usage: CONFIG (path passed to Config) and MINCLUSTERSIZE (an integer).
    optparser = OptionParser(usage="usage: %prog CONFIG MINCLUSTERSIZE")
    options, args = optparser.parse_args()

    if len(args) >= 2:
        conf_path, min_size = args[0], int(args[1])
    else:
        # Too few positional arguments: show the usage text and stop.
        optparser.print_help()
        sys.exit(0)

    cfg = Config(conf_path)
    # `wc` has to remain a module-level name: print_clusters() reads it as a
    # global rather than receiving it as an argument.
    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), int(cfg["anno_per_file"]))

    print_clusters(db, min_size)