ajax_vocabulary_management.py 20 KB

Edit Raw Blame History

# -*- coding: utf-8 -*-

import codecs
import datetime
import HTMLParser
import os
from collections import Counter
from tempfile import mkdtemp, mkstemp

from django.contrib.auth.models import User
from django.db.models import Count, Max, Sum, Q
from django.http import HttpResponse
from django.template.loader import render_to_string
from django.utils.encoding import smart_str

from common.decorators import ajax, AjaxError, render
from dictionary.forms import ManageVocabPermForm
from dictionary.models import Frame_Opinion_Value, Lemma, Lemma_Status, \
                              POS, Vocabulary, VocabularyFormat, WalentyStat
from dictionary.teixml import createteixml

# Header template written at the top of the plain-text export (see
# create_text_walenty / create_copyrights_str). Every line starts with '%'.
# The angle-bracket placeholders (<date>, <vocabularies>, <opinions>,
# <statuses>, <owners>, <part of speech>, <opinions added>) are substituted
# with str.replace() in create_copyrights_str().
TEXT_VOCABULARY_CLAUSE = u"""
% The Polish Valence Dictionary (Walenty)
% <date>
%
% The Polish Valence Dictionary (Walenty) is an adaptation of
% the Syntactic Dictionary of Polish Verbs by Marek Świdziński
% in its electronic version provided by Łukasz Dębowski and
% Elżbieta Hajnicz and further expanded by Witold Kieraś to
% include the most frequent verbs in the 1 million sample of
% NKJP (National Corpus of Polish).
%
% The presented resource results from an automatic conversion
% of the aforementioned dictionary, manually reviewed by Filip
% Skwarski to include correct information about a number of new
% features, including sentential subjects, passivisation, and
% control relations.
%
% The format of the new dictionary has been established by Filip
% Skwarski, Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski,
% Marek Świdziński, and Marcin Woliński.
%
% The dictionary has been edited and compiled using a tool
% created by Bartłomiej Nitoń.
%
% The original Syntactic Dictionary of Polish Verbs derives from:
%
% Marek Świdziński
% Institute of Polish
% Warsaw University
% Warsaw, Poland
%
% © Copyright 1998,2012 by Marek Świdziński
%
% This work is distributed under a CC BY-SA license:
% http://creativecommons.org/licenses/by-sa/2.0/
%
% Parameters:
%     Dictionaries:   <vocabularies>
%     Schema opinions: <opinions>
%     Lemma statuses: <statuses>
%     Owners:         <owners>
%     Part of speech: <part of speech>
%     Opinions added: <opinions added>
%
"""

# English month names, indexed by ``date.month - 1`` in create_copyrights_str()
# to build the <date> line of the export header.
# NOTE(review): presumably used instead of strftime('%B') so the header stays
# English regardless of the process locale -- confirm.
EN_MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
             'July', 'August', 'September', 'October', 'November', 'December']

@ajax(method='post')
def create_vocabulary(request, form_data):
    """Export the lemmas selected in the download form to a temporary file.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items that
    is flattened into a dict. Non-old lemmas are filtered by the submitted
    vocabularies, lemma statuses, owners and parts of speech; an empty value
    for any of those means "do not filter on it".

    Returns:
        ``{'file_name': path}`` where ``path`` is the file produced by the
        exporter matching the selected format.

    Raises:
        AjaxError: if no format was selected, or if the selected format is
            not one of the formats handled here (previously the latter case
            failed with an UnboundLocalError on ``file_name``).
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)

    if not form_dict['format']:
        raise AjaxError('format not selected')
    voc_format_obj = VocabularyFormat.objects.get(pk=form_dict['format'])

    lemmas = Lemma.objects.filter(old=False).order_by('entry')
    if form_dict['vocabularies']:
        lemmas = lemmas.filter(vocabulary__in=form_dict['vocabularies'])
    if form_dict['lemma_statuses']:
        lemmas = lemmas.filter(status__in=form_dict['lemma_statuses'])
    if form_dict['owners']:
        lemmas = lemmas.filter(owner__in=form_dict['owners'])
    if form_dict['poss']:
        lemmas = lemmas.filter(entry_obj__pos__in=form_dict['poss'])
    # The filters above may join across relations, so drop duplicate rows.
    lemmas = lemmas.distinct()

    if voc_format_obj.format == u'Tekstowy':
        tmp_folder = mkdtemp()
        # NOTE(review): chdir changes the working directory of the whole
        # process; kept for parity with the other exporters -- confirm
        # whether anything still relies on it.
        os.chdir(tmp_folder)
        tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
        # Only the path is needed; the exporter reopens the file itself.
        os.close(tmpfile)
        file_name = create_text_walenty(tmpfilename, lemmas,
                                        form_dict['vocabularies'], form_dict['frame_opinions'],
                                        form_dict['lemma_statuses'], form_dict['owners'],
                                        form_dict['poss'], form_dict['addframeopinion'])
    elif voc_format_obj.format == u'TEX':
        file_name = create_tex_walenty(lemmas, form_dict)
    elif voc_format_obj.format == u'TEI':
        file_name = create_tei_walenty(lemmas, form_dict)
    else:
        raise AjaxError('unsupported format')
    return {'file_name': file_name}

def create_tei_walenty(lemmas, form_dict):
    """Write ``lemmas`` as TEI XML into a fresh temporary file.

    A new temporary directory is created (and made the current working
    directory), an empty temporary file is reserved inside it and handed to
    ``createteixml`` together with the frame opinion values selected in
    ``form_dict['frame_opinions']`` (an empty queryset when nothing was
    selected). The file is then renamed to carry an ``.xml`` suffix.

    Returns the path of the renamed ``.xml`` file.
    """
    work_dir = mkdtemp()
    os.chdir(work_dir)
    handle, raw_path = mkstemp(dir=work_dir)
    # Only the reserved path is needed from here on.
    os.close(handle)

    selected_pks = form_dict['frame_opinions']
    if selected_pks:
        opinion_values = Frame_Opinion_Value.objects.filter(pk__in=selected_pks)
    else:
        opinion_values = Frame_Opinion_Value.objects.none()

    createteixml(raw_path, lemmas, opinion_values)

    xml_path = raw_path + '.xml'
    os.rename(raw_path, xml_path)
    return xml_path

def create_text_walenty(file_name, lemmas, vocabularies, frame_opinions,
                          lemma_statuses, owners, poss, add_frame_opinions):
    """Write the plain-text export of ``lemmas`` and return the final path.

    The file is written as UTF-8 with a BOM ('utf-8-sig'). It starts with the
    header produced by ``create_copyrights_str`` and is followed by one line
    per exported frame: the lemma entry, then the frame's text representation
    with every ':' followed by a space.

    Args:
        file_name: path of an existing temporary file to write to.
        lemmas: iterable of lemma objects to export.
        vocabularies, lemma_statuses, owners, poss: selections forwarded to
            ``create_copyrights_str`` for the header.
        frame_opinions: selected opinion values; when non-empty, only frames
            having an opinion with one of these values are written. Also
            forwarded to the header.
        add_frame_opinions: when truthy, the frame's opinion is inserted into
            each line via ``add_frame_opinion_and_return_text_rep``.

    Returns:
        ``file_name + '.txt'``, the path the file was renamed to.

    Errors raised while writing now propagate to the caller. The previous
    ``return`` inside ``finally`` silently discarded them and returned the
    path of a partially written, un-renamed file.
    """
    with codecs.open(file_name, 'w+', 'utf-8-sig') as f:
        f.write(create_copyrights_str(vocabularies, frame_opinions,
                                      lemma_statuses, owners, poss, add_frame_opinions))
        for lemma in lemmas:
            founded_frame_opinions = lemma.frame_opinions.filter(value__in=frame_opinions)
            # Walk every combination of the lemma's existing characteristic
            # values so frames are emitted grouped by sub-entry.
            for reflex_val in lemma.get_existing_frame_char_values(u'ZWROTNOŚĆ'):
                for neg_val in lemma.get_existing_frame_char_values(u'NEGATYWNOŚĆ'):
                    for pred_val in lemma.get_existing_frame_char_values(u'PREDYKATYWNOŚĆ'):
                        for aspect_val in lemma.get_existing_frame_char_values(u'ASPEKT'):
                            matching_frames = lemma.get_frames_by_char_values(reflex_val=reflex_val,
                                                                              neg_val=neg_val,
                                                                              pred_val=pred_val,
                                                                              aspect_val=aspect_val).order_by('text_rep')
                            for frame in matching_frames:
                                # Phraseologic frames are exported only once
                                # the lemma reports phraseology as ready.
                                if not lemma.phraseology_ready() and frame.phraseologic:
                                    continue
                                # No opinion selection means "export all".
                                if frame_opinions and not founded_frame_opinions.filter(frame=frame).exists():
                                    continue
                                text_rep = frame.get_position_spaced_text_rep()
                                if add_frame_opinions:
                                    text_rep = add_frame_opinion_and_return_text_rep(text_rep, lemma, frame)
                                # Reflexive ('się') frames get a separating
                                # space between the entry and the text_rep.
                                if frame.characteristics.filter(type=u'ZWROTNOŚĆ', value__value=u'się').exists():
                                    text_rep = ' ' + text_rep
                                f.write(lemma.entry+text_rep.replace(':',': ')+'\n')
    # Rename only after the file has been flushed and closed.
    txt_file_name = file_name + '.txt'
    os.rename(file_name, txt_file_name)
    return txt_file_name

def create_copyrights_str(vocabularies_pks, frame_opinions_pks,
                            lemma_statuses_pks, owners_pks, poss_pks,
                            add_frame_opinions):
    """Fill in TEXT_VOCABULARY_CLAUSE for the current export and return it.

    Each ``*_pks`` argument selects rows by primary key. When a selection
    matches nothing, a fallback set is listed instead: all vocabularies, all
    opinion values, all lemma statuses, every user owning a non-old lemma,
    and every part of speech except the one tagged 'unk'.

    The <date> placeholder becomes today's date written with an English
    month name; <opinions added> becomes 'True' or 'False'.
    """
    now = datetime.datetime.now()
    date_text = now.strftime(EN_MONTHS[now.month - 1] + ' %d, %Y')

    chosen_vocabs = Vocabulary.objects.filter(pk__in=vocabularies_pks).order_by('name')
    if not chosen_vocabs.exists():
        chosen_vocabs = Vocabulary.objects.order_by('name')

    chosen_opinions = Frame_Opinion_Value.objects.filter(pk__in=frame_opinions_pks).order_by('priority')
    if not chosen_opinions.exists():
        chosen_opinions = Frame_Opinion_Value.objects.order_by('priority')

    chosen_statuses = Lemma_Status.objects.filter(pk__in=lemma_statuses_pks).order_by('priority')
    if not chosen_statuses.exists():
        chosen_statuses = Lemma_Status.objects.order_by('priority')

    chosen_owners = User.objects.filter(pk__in=owners_pks).order_by('username')
    if not chosen_owners.exists():
        chosen_owners = User.objects.filter(lemmas__old=False).distinct().order_by('username')

    chosen_poss = POS.objects.filter(pk__in=poss_pks).order_by('priority')
    if not chosen_poss.exists():
        chosen_poss = POS.objects.exclude(tag=u'unk').order_by('priority')

    # Applied in this exact order, one placeholder at a time.
    substitutions = (
        ('<date>', date_text),
        ('<vocabularies>', ', '.join([item.name for item in chosen_vocabs])),
        ('<opinions>', ', '.join([item.value for item in chosen_opinions])),
        ('<statuses>', ', '.join([item.status for item in chosen_statuses])),
        ('<owners>', ', '.join([item.username for item in chosen_owners])),
        ('<part of speech>', ', '.join([item.name for item in chosen_poss])),
        ('<opinions added>', 'True' if add_frame_opinions else 'False'),
    )
    header = TEXT_VOCABULARY_CLAUSE
    for placeholder, replacement in substitutions:
        header = header.replace(placeholder, replacement)
    return header

def create_tex_walenty(lemmas, form_dict):
    """Render the TeX export of ``lemmas`` into a temporary ``.tex`` file.

    The 'tex/slowal.tex' template is rendered with the lemmas, one
    ``Q(value__pk=...)`` filter per selected frame opinion, and the whole
    form dict (as ``download_dict``). HTML entities in the rendered output
    are unescaped before it is written.

    Returns the path of the generated ``.tex`` file.

    The temporary file is written through a file object in a ``with`` block,
    so its descriptor is closed even when rendering fails, and the whole
    payload is written rather than relying on a single ``os.write`` call.
    """
    # An empty selection simply yields no filters.
    q_frame_opinions = [Q(value__pk=pk) for pk in (form_dict['frame_opinions'] or [])]
    tmp_folder = mkdtemp()
    os.chdir(tmp_folder)
    tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
    with os.fdopen(tmpfile, 'wb') as tex_file:
        h = HTMLParser.HTMLParser()
        # Pass the TeX template through Django templating engine and into the temp file
        rendered = render_to_string('tex/slowal.tex', {'lemmas': lemmas,
                                                       'q_frame_opinions': q_frame_opinions,
                                                       'download_dict'   : form_dict})
        tex_file.write(smart_str(h.unescape(rendered)))
    file_name = tmpfilename + '.tex'
    os.rename(tmpfilename, file_name)
    return file_name

def add_frame_opinion_and_return_text_rep(text_rep, lemma, frame):
    """Insert the frame's opinion as the second ':'-separated field.

    The opinion is the ``value.value`` of the first entry in
    ``lemma.frame_opinions`` filtered by the frame's pk. If it cannot be
    obtained (for example no opinion exists for the frame), 'unk' is used.

    Only ``Exception`` subclasses trigger the 'unk' fallback; the previous
    bare ``except:`` also swallowed KeyboardInterrupt and SystemExit.

    Returns ``text_rep`` with the opinion inserted after its first field.
    """
    try:
        frame_op = lemma.frame_opinions.filter(frame__pk=frame.pk).all()[0].value.value
    except Exception:
        # Best effort: a missing or unreadable opinion is exported as unknown.
        frame_op = 'unk'
    frame_form_list = text_rep.split(':')
    frame_form_list.insert(1, frame_op)
    return ':'.join(frame_form_list)

def _resolve_download_path(file_name):
    """Return the real path of a generated export file, or None if rejected.

    ``file_name`` comes from the request, so it is accepted only when the
    resolved path is an existing regular file with one of the export
    extensions, located in a directory that is a direct child of the system
    temporary directory (where ``mkdtemp()`` creates the export folders by
    default).
    """
    # Local import: the module only imports mkdtemp/mkstemp by name.
    import tempfile

    fullpath = os.path.realpath('/' + file_name)
    temp_root = os.path.realpath(tempfile.gettempdir())
    if os.path.dirname(os.path.dirname(fullpath)) != temp_root:
        return None
    if os.path.splitext(fullpath)[1] not in ('.txt', '.tex', '.xml'):
        return None
    if not os.path.isfile(fullpath):
        return None
    return fullpath

def download_vocabulary(request, file_name):
    """Send a previously generated export file and delete it afterwards.

    Args:
        request: the current request (unused here).
        file_name: path of the export file without its leading '/'.

    Returns:
        An HttpResponse carrying the file content, offered as an attachment
        named ``walenty_<YYYYMMDD>`` plus the file's extension.

    Raises:
        Http404: if ``file_name`` does not resolve to an export file inside a
            temporary export folder. Without this check any path on the
            server could be read and then removed through this view.
    """
    # Local import: only HttpResponse is imported at module level.
    from django.http import Http404

    fullpath = _resolve_download_path(file_name)
    if fullpath is None:
        raise Http404
    download_file_name = '%s_%s' % ('walenty', datetime.datetime.now().strftime('%Y%m%d'))
    with open(fullpath, "r") as f:
        data = f.read()
    response = HttpResponse(data, mimetype='text/txt')
    extension = os.path.splitext(fullpath)[1]
    response['Content-Disposition'] = 'attachment; filename=%s%s' % (download_file_name, extension)
    # The export is single-use: remove the file and its temporary folder.
    os.remove(fullpath)
    os.rmdir(os.path.dirname(fullpath))
    return response

@render('vocab_perm_manage_form.html')
@ajax(method='get', encode_result=False)
def vocab_perm_manage_form(request, vocabulary_name):
    """Build the permission form for the vocabulary named ``vocabulary_name``.

    The form is initialised with the vocabulary's current editors and
    viewers and returned under the 'form' key of the result dict.
    """
    vocab = Vocabulary.objects.get(name=vocabulary_name)
    perm_form = ManageVocabPermForm(editors=vocab.editors,
                                    viewers=vocab.viewers)
    return {'form': perm_form}

@ajax(method='post')
def vocab_perm_manage_form_submit(request, form_data):
    """Replace a vocabulary's editors and viewers with the submitted ones.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items. The
    vocabulary is looked up by the submitted 'vocabulary_name'; its editor
    and viewer sets are cleared and rebuilt from the 'editors' and 'viewers'
    primary-key lists. Always returns an empty dict.
    """
    submitted = {}
    for field in form_data:
        submitted[field['name']] = field['value']

    target = Vocabulary.objects.get(name=submitted['vocabulary_name'])
    target.editors.clear()
    target.viewers.clear()

    # Every editor is also registered as a viewer.
    for editor_pk in submitted['editors']:
        editor = User.objects.get(pk=editor_pk)
        target.editors.add(editor)
        target.viewers.add(editor)

    for viewer_pk in submitted['viewers']:
        viewer = User.objects.get(pk=viewer_pk)
        target.viewers.add(viewer)

    return {}

@render('vocabulary_stats.html')
@ajax(method='get', encode_result=False)
def get_vocabulary_stats(request, vocabulary_name):
    """Build the lemma/schema statistics table for one or all vocabularies.

    When ``vocabulary_name`` is empty, the non-old lemmas of every vocabulary
    are counted; otherwise only those of the named vocabulary.

    Returns ``{'vocabulary_stats': rows}`` where ``rows`` is a list made of
    the header row, a row covering all statuses (labelled 'wszystkie'), and
    one row per lemma status in priority order.
    """
    if vocabulary_name:
        voc = Vocabulary.objects.get(name=vocabulary_name)
        lemmas = voc.lemmas.filter(old=False)
    else:
        lemmas = Lemma.objects.filter(old=False)
    lemma_statuses = Lemma_Status.objects.order_by('priority')
    poss = POS.objects.exclude(tag=u'unk').order_by('priority')
    vocabulary_stats_tab_lines = [create_top_labels(poss),
                                  create_status_stats_line(lemmas, poss, 'wszystkie')]
    for lemma_status in lemma_statuses:
        stat_lemmas = lemmas.filter(status__status=lemma_status.status)
        vocabulary_stats_tab_lines.append(
            create_status_stats_line(stat_lemmas, poss, lemma_status.status))
    return {'vocabulary_stats': vocabulary_stats_tab_lines}

def create_top_labels(poss):
    """Return the header row of the statistics table.

    The row starts with the two fixed labels and continues with the ``name``
    of every item in ``poss``, in iteration order.
    """
    return ['Hasła / Schematy', 'wszystkie'] + [part.name for part in poss]

def create_status_stats_line(lemmas, poss, status):
    """Return one row of the statistics table for the given lemma set.

    The row is ``[status, totals, per_pos_1, per_pos_2, ...]`` where every
    entry after the label is a dict with the number of 'lemmas' and the
    number of 'schemas' (see ``count_schemas``), first for all ``lemmas`` and
    then for the subset belonging to each part of speech in ``poss``.
    """
    def tally(subset):
        # Count lemmas first, then their schemas.
        return {'lemmas': subset.count(),
                'schemas': count_schemas(subset)}

    row = [status, tally(lemmas)]
    for part in poss:
        row.append(tally(lemmas.filter(entry_obj__pos=part)))
    return row

def count_schemas(lemmas):
    """Return the total number of frames attached to ``lemmas``.

    Each lemma is annotated with its frame count and the counts are summed
    in the database; a missing or zero sum is reported as 0.
    """
    per_lemma = lemmas.annotate(num_frames=Count('frames'))
    total = per_lemma.aggregate(Sum('num_frames'))['num_frames__sum']
    return total or 0

@render('other_stats.html')
@ajax(method='get', encode_result=False)
def get_other_stats(request):
    """Return all stored WalentyStat rows, ordered by label, under 'stats'."""
    ordered_stats = WalentyStat.objects.order_by('label')
    return {'stats': ordered_stats}

def get_stats(statuses, pos):
    """Collect dictionary statistics for one part of speech.

    Args:
        statuses: lemma statuses to include (used as ``status__in``).
        pos: part-of-speech tag the lemmas' entries must have.

    Returns:
        A Counter with the totals accumulated over all matching non-old
        lemmas: lemmas, sub-entries, schemata (overall and per opinion
        value), semantic frames, positions, phrases, and the number of
        schemata/lemmas with coordination or phraseology. It also carries the
        per-sub-entry keys produced by ``get_sub_entries_dict``.
    """
    stats_dict = Counter({u'phrases': 0,
                          u'poss': 0,
                          u'lemmas': 0,
                          u'sub_lemmas': 0,
                          u'schemata': 0,
                          u'frames': 0,
                          u'cer_schemata': 0,
                          u'uncer_schemata': 0,
                          u'bad_schemata': 0,
                          u'arch_schemata': 0,
                          u'col_schemata': 0,
                          u'vul_schemata': 0,
                          u'coor_schemata': 0,
                          u'lex_schemata': 0,
                          u'coor_lemmas': 0,
                          u'lex_lemmas': 0})

    # Statistic key -> frame opinion value counted under it.
    opinion_counters = ((u'cer_schemata', u'pewny'),
                        (u'uncer_schemata', u'wątpliwy'),
                        (u'bad_schemata', u'zły'),
                        (u'arch_schemata', u'archaiczny'),
                        (u'col_schemata', u'potoczny'),
                        (u'vul_schemata', u'wulgarny'))

    lemmas = Lemma.objects.filter(old=False,
                                  entry_obj__pos__tag=pos).filter(status__in=statuses).distinct()
    stats_dict[u'lemmas'] = lemmas.count()
    for lemma in lemmas.order_by('entry').all():
        for stat_key, opinion_value in opinion_counters:
            stats_dict[stat_key] += lemma.frame_opinions.filter(value__value=opinion_value).count()
        stats_dict[u'schemata'] += lemma.frames.count()
        stats_dict[u'frames'] += lemma.entry_obj.actual_frames().count()

        # Counter addition keeps only entries with a positive count, so
        # zero-valued keys may be absent from the result; indexing a Counter
        # with a missing key still yields 0.
        stats_dict = stats_dict + Counter(get_sub_entries_dict(lemma))

        has_phraseology = False
        has_coordination = False
        for frame in lemma.frames.all():
            stats_dict[u'poss'] += frame.positions.count()
            max_args = frame.positions.annotate(num_args=Count('arguments')).aggregate(Max('num_args'))['num_args__max']
            # The aggregate is None when there is nothing to aggregate over;
            # a position holding more than one argument marks coordination.
            if max_args is not None and max_args > 1:
                stats_dict[u'coor_schemata'] += 1
                has_coordination = True
            # Named 'position' so the ``pos`` parameter is not shadowed.
            for position in frame.positions.all():
                stats_dict[u'phrases'] += position.arguments.count()
            if frame.phraseologic:
                stats_dict[u'lex_schemata'] += 1
                has_phraseology = True

        if has_phraseology:
            stats_dict[u'lex_lemmas'] += 1
        if has_coordination:
            stats_dict[u'coor_lemmas'] += 1

    return stats_dict

def get_sub_entries_dict(lemma):
    """Count the sub-entries of ``lemma``.

    A sub-entry is a combination of existing reflexivity, negativity,
    predicativity and aspect values for which the lemma has at least one
    matching frame. The result maps 'sub_lemmas' to the number of such
    combinations and, for each of them, a descriptive key built from the four
    ``value`` attributes to the number of times that key occurred.
    """
    counts = {'sub_lemmas': 0}
    for reflex_item in lemma.get_existing_frame_char_values(u'ZWROTNOŚĆ'):
        for neg_item in lemma.get_existing_frame_char_values(u'NEGATYWNOŚĆ'):
            for pred_item in lemma.get_existing_frame_char_values(u'PREDYKATYWNOŚĆ'):
                for aspect_item in lemma.get_existing_frame_char_values(u'ASPEKT'):
                    found = lemma.get_frames_by_char_values(reflex_val=reflex_item,
                                                            neg_val=neg_item,
                                                            pred_val=pred_item,
                                                            aspect_val=aspect_item)
                    if not found.exists():
                        continue
                    counts[u'sub_lemmas'] += 1
                    values = (reflex_item.value, neg_item.value,
                              pred_item.value, aspect_item.value)
                    label = u'Liczba podhaseł postaci: (%s,%s,%s,%s)' % values
                    counts[label] = counts.get(label, 0) + 1
    return counts

def write_stats(stats_path, stats):
    """Write the statistics report to ``stats_path`` as UTF-8 text.

    Args:
        stats_path: path of the file to create or overwrite.
        stats: mapping with the counters read below ('lemmas', 'poss',
            'phrases', 'sub_lemmas', the '*_schemata' keys, 'coor_lemmas',
            'lex_lemmas', 'frames') plus the per-sub-entry keys written by
            ``write_subschemas_stats``.

    The file is opened in a ``with`` block, so a failure to open it is
    reported as such instead of being masked by a NameError raised from the
    cleanup code. The mode is 'w': codecs opens the underlying file in binary
    mode whenever an encoding is given, which makes a text flag contradictory.
    """
    with codecs.open(stats_path, 'w', 'utf-8') as outfile:
        outfile.write(u'Łączna liczba haseł:\t%d\n\n' % stats['lemmas'])
        outfile.write(u'Łączna liczba pozycji w schematach:\t%d\n' % stats['poss'])
        outfile.write(u'Łączna liczba realizacji w schematach:\t%d\n\n' % stats['phrases'])

        outfile.write(u'Łączna liczba podhaseł:\t%d\n' % stats['sub_lemmas'])
        outfile.write(u'Liczba podhaseł postaci (ZWROTNOŚĆ, NEGATYWNOŚĆ, PREDYKATYWNOŚĆ, ASPEKT)\n')
        write_subschemas_stats(outfile, stats)

        outfile.write(u'Łączna liczba schematów:\t%d\n' % stats['schemata'])
        outfile.write(u'Liczba schematów pewnych:\t%d\n' % stats['cer_schemata'])
        outfile.write(u'Liczba schematów wątpliwych:\t%d\n' % stats['uncer_schemata'])
        outfile.write(u'Liczba schematów złych:\t%d\n' % stats['bad_schemata'])
        outfile.write(u'Liczba schematów archaicznych:\t%d\n' % stats['arch_schemata'])
        outfile.write(u'Liczba schematów potocznych:\t%d\n' % stats['col_schemata'])
        outfile.write(u'Liczba schematów wulgarnych:\t%d\n\n' % stats['vul_schemata'])

        outfile.write(u'Łączna liczba schematów z koordynacją:\t%d\n' % stats['coor_schemata'])
        outfile.write(u'Łączna liczba schematów zleksykalizowanych:\t%d\n\n' % stats['lex_schemata'])

        outfile.write(u'Łączna liczba haseł zawierających pozycje z koordynacją:\t%d\n' % stats['coor_lemmas'])
        outfile.write(u'Łączna liczba haseł zawierających schematy zleksykalizowane:\t%d\n\n' % stats['lex_lemmas'])

        outfile.write(u'Łączna liczba ram semantycznych:\t%d\n\n' % stats['frames'])

def write_subschemas_stats(stats_file, stats):
    """Write the per-sub-entry counters of ``stats`` to ``stats_file``.

    Only keys starting with the sub-entry prefix are written, one
    "<key>:<TAB><count>" line each, sorted as text, followed by a blank line.
    """
    prefix = u'Liczba podhaseł postaci:'
    lines = []
    for key, count in stats.iteritems():
        if key.startswith(prefix):
            lines.append('%s:\t%d\n' % (key, count))
    for line in sorted(lines):
        stats_file.write(line)
    stats_file.write('\n')

def update_walenty_stats(stats):
    """Replace all stored WalentyStat rows with values taken from ``stats``.

    Every existing row is deleted, then one row per label below is saved with
    the matching counter from ``stats`` converted to a string.
    """
    labelled_keys = (
        (u'Łączna liczba haseł', 'lemmas'),
        (u'Łączna liczba pozycji w schematach', 'poss'),
        (u'Łączna liczba realizacji w schematach', 'phrases'),
        (u'Łączna liczba schematów', 'schemata'),
        (u'Łączna liczba schematów z koordynacją', 'coor_schemata'),
        (u'Łączna liczba schematów zleksykalizowanych', 'lex_schemata'),
        (u'Łączna liczba ram semantycznych', 'frames'),
    )
    WalentyStat.objects.all().delete()
    for stat_label, stats_key in labelled_keys:
        row = WalentyStat(label=stat_label, value=str(stats[stats_key]))
        row.save()