# quasi.py
#!/usr/bin/env python
import re
import sys
import os
import re
import shutil
from collections import defaultdict
from optparse import OptionParser

# TODO: locate the project root without relying on sys.argv[0];
# a solution with no hard-coded relative path would be welcome.
sys.path.append(os.path.join(os.path.dirname(os.path.abspath(sys.argv[0])), ".."))

from dfs.database import Database
from dfs.repo import Repo
from dfs.config import Config

def fill(text):
    """Right-pad `text` with spaces to a minimum width of 20 characters."""
    return text + " " * max(0, 20 - len(text))

def fill_custom(text, l, sym):
    """Right-pad `text` with copies of `sym` up to width `l`.

    text -- string to pad
    l    -- target minimum length (assuming `sym` is a single character)
    sym  -- padding string, normally a single character

    Text already at least `l` characters long is returned unchanged.
    """
    # Sequence repetition replaces the original character-by-character
    # append loop (which was quadratic in the amount of padding).
    return text + sym * max(0, l - len(text))

def parse_span(span, words_list):
    """Expand a markable span expression into the list of word ids it covers.

    span       -- comma-separated fragments, each either a single word id
                  or a "start..end" range, e.g. "word_1..word_3,word_7"
                  (presumably MMAX-style markup -- confirm against the
                  annotation tool's format)
    words_list -- ordered list of all word ids in the document

    Raises ValueError when a referenced id is absent from words_list.
    """
    expanded = []
    for fragment in span.split(","):
        endpoints = fragment.split("..")
        # A lone id yields a one-element list, so [0] and [-1] coincide.
        start = words_list.index(endpoints[0])
        stop = words_list.index(endpoints[-1])
        expanded.extend(words_list[start:stop + 1])
    return expanded

def get_context(ids, words_list, size):
    """Return a window of words_list covering every word in `ids`,
    extended by up to `size` words of context on each side.

    ids        -- collection of word ids to locate (membership-tested)
    words_list -- ordered list of word ids for the whole text
    size       -- number of extra context words requested on each side

    Returns [] when none of the ids occurs in words_list.
    """
    first = -1
    last = -1
    for i, w in enumerate(words_list):
        if w in ids:
            if first == -1:
                first = i
            last = i

    # No occurrence at all: return an empty window instead of the
    # meaningless slice the old code produced from first == last == -1.
    if first == -1:
        return []

    # Widen the window, clamping to the list bounds.  The +1 makes the
    # right-hand context symmetric with the left: the original slice end
    # of `last + size` excluded the final context word (off-by-one).
    first = max(0, first - size)
    last = min(len(words_list), last + size + 1)

    return words_list[first:last]
        
def print_file(path):
    """Print every near-identity link found in one annotated text.

    Reads two sibling XML files derived from `path`:
      <path>_words.xml    -- lines with <word id="...">orth</word> elements
      <path>_mentions.xml -- lines with <markable ...> elements carrying a
                             span= attribute and optionally comment= and
                             near_identity= attributes

    For every mention pair connected by a non-empty near_identity
    attribute, prints the two mentions in text order, any attached
    comments, and a short context window around them.  The user-facing
    output strings are intentionally Polish ("Tekst", "Komentarz", ...).

    Returns the number of links printed.

    NOTE(review): Python 2 only -- uses `print` statements and the
    builtin `reduce`.  Parsing is line-by-line regex matching, which
    assumes each XML element sits on its own line; a real XML parser
    would be more robust -- confirm the input files guarantee this.
    """
    # word id -> surface form, plus the ids in document order.
    word_id_2_orth = {}
    words_list = []
    w = re.compile("<word.* id=\"(.*?)\".*>(.*?)</word>.*")
    with open(path + "_words.xml", "r") as f:
        for line in f.readlines():
            groups = w.findall(line)
            # Only lines matching exactly one <word> element are used.
            if len(groups) == 1:
                group = groups[0]
                id = group[0]
                orth = group[1]
                word_id_2_orth[id] = orth
                words_list.append(id)

    # Attribute extractors for <markable> lines.
    me = re.compile("<markable.*id=\"(.*?)\".*")
    sp = re.compile(".*span=\"(.*?)\".*")
    co = re.compile(".*comment=\"(.*?)\".*")
    ni = re.compile(".*near_identity=\"(.*?)\".*")

    mention_id_2_span = {}     # mention id -> list of word ids it covers
    mention_id_2_comment = {}  # mention id -> annotator comment text
    near_links = []            # (mention id, near_identity target id) pairs
    with open(path + "_mentions.xml", "r") as f:
        for line in f.readlines():
            groups1 = me.findall(line)
            groups2 = sp.findall(line)
            groups3 = co.findall(line)

            # A markable needs both an id and a span to be usable.
            if len(groups1) == 1 and len(groups2) == 1:
                id = groups1[0]
                span = groups2[0]
                mention_id_2_span[id] = parse_span(span, words_list)

                if len(groups3) == 1:
                    mention_id_2_comment[id] = groups3[0]

                # "empty" and "" both mean no near-identity link.
                near = ni.findall(line)
                if len(near) == 1 and near[0] != "empty" and near[0] != "":
                    near_links.append((id, near[0]))

    # Header only for texts that actually contain links.
    if len(near_links) > 0:
        print "###", "Tekst", path.split("/")[-1], "###"
        print

    c = 0  # links printed so far; doubles as the running link number
    for m1, m2 in near_links:
        # Collect comments from both ends of the link.
        comments = []
        if m1 in mention_id_2_comment:
            comments.append(mention_id_2_comment[m1])
        if m2 in mention_id_2_comment:
            comments.append(mention_id_2_comment[m2])

        # A link target may point at a markable that failed to parse.
        if m1 not in mention_id_2_span or m2 not in mention_id_2_span:
            print "ERROR", m1, m2
            continue

        span1 = mention_id_2_span[m1]
        span2 = mention_id_2_span[m2]

        # Context window around the union of both mentions' words.
        spans = set(span1)
        spans = spans.union(set(span2))
        ctx = get_context(spans, words_list, 3)

        # Render the context, bracketing each mention's first/last word.
        # Overlapping mentions simply produce doubled brackets.
        result = ""
        for wid in ctx:
            result = result + " "
            if wid == span1[0]:
                result = result + "["
            if wid == span2[0]:
                result = result + "["

            result = result + word_id_2_orth[wid]

            if wid == span1[-1]:
                result = result + "]"
            if wid == span2[-1]:
                result = result + "]"

        # Bracketed surface form of each full mention (Py2 builtin reduce).
        m1orth = "[" + reduce(lambda a, i: a + " " + i, map(lambda i : word_id_2_orth[i], span1)) + "]"
        m2orth = "[" + reduce(lambda a, i: a + " " + i, map(lambda i : word_id_2_orth[i], span2)) + "]"

        # Print the pair in document order, not annotation order.
        if words_list.index(span1[0]) <= words_list.index(span2[0]):
            print str(c)+".", m1orth, "<-->", m2orth
        else:
            print str(c)+".", m2orth, "<-->", m1orth

        for comm in comments:
            print "Komentarz:", comm
        print "...", result, "..."
        print

        c = c + 1

    return c
        
def print_quasi(db):
    """Print near-identity links for every finished, non-rejected file.

    db -- dfs Database; its file_index maps filename -> file record.

    Returns the total number of links printed across all files.

    NOTE(review): reads the module-global `wc` (Repo) created in the
    __main__ block, so this function only works when the script is run
    directly, not when imported.  Py2-only `iteritems` is used.
    """
    links = 0
    for filename, file in db.file_index.iteritems():

        # Skip files rejected in the annotation workflow.
        if db.rejected(file):
            continue

        # presumably "s_ann" is the super-annotation stage -- confirm
        # against dfs.database; only finished stages are processed.
        sann = file.find("s_ann")

        if sann is not None and db.finished(sann):
            path = wc.upload_prim_path(filename)
            links = links + print_file(path)

    return links

if __name__ == "__main__":
    # Usage: quasi.py CONFIG -- CONFIG is a dfs Config file providing
    # svn credentials, the repository URL and annotation settings.
    optparser = OptionParser(usage="""usage: %prog CONFIG """)
    (options, args) = optparser.parse_args()
    if len(args) < 1:
        optparser.print_help()
        sys.exit(0)

    conf_path = args[0]
    cfg = Config(conf_path)
    # `wc` is deliberately a module-level name: print_quasi() reads it
    # as a global.
    wc = Repo(cfg["svn.repository"], cfg["svn.login"], cfg["svn.passwd"])
    db = Database(wc.db_path(), int(cfg["anno_per_file"]))

    # Print all links, then a Polish summary line with the total count.
    l = print_quasi(db)
    print "##################"
    print
    print "Wszystkich linkow:", l