# ajax_vocabulary_management.py
# -*- coding: utf-8 -*-

#Copyright (c) 2015, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import datetime
import HTMLParser
import os
from tempfile import mkdtemp, mkstemp

from django.contrib.auth.models import User
from django.db.models import Count, Sum, Q
from django.http import HttpResponse
from django.template.loader import render_to_string
from django.utils.encoding import smart_str

from common.decorators import ajax, AjaxError, render
from dictionary.forms import ManageVocabPermForm
from dictionary.models import Frame_Opinion_Value, Lemma, Lemma_Status, \
                              POS, Vocabulary, VocabularyFormat, \
                              sorted_frame_char_values_dict
from dictionary.teixml import createteixml

# Header template for the plain-text dictionary export. Every line starts
# with '%'. The angle-bracket placeholders (<date>, <vocabularies>,
# <opinions>, <statuses>, <owners>, <part of speech>, <opinions added>) are
# substituted by create_copyrights_str() before the header is written.
TEXT_VOCABULARY_CLAUSE = u"""
% The Polish Valence Dictionary (Walenty)
% <date>
%
% The Polish Valence Dictionary (Walenty) is an adaptation of
% the Syntactic Dictionary of Polish Verbs by Marek Świdziński
% in its electronic version provided by Łukasz Dębowski and
% Elżbieta Hajnicz and further expanded by Witold Kieraś to
% include the most frequent verbs in the 1 million sample of
% NKJP (National Corpus of Polish).
%
% The presented resource results from an automatic conversion
% of the aforementioned dictionary, manually reviewed by Filip
% Skwarski to include correct information about a number of new
% features, including sentential subjects, passivisation, and
% control relations.
%
% The format of the new dictionary has been established by Filip
% Skwarski, Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski,
% Marek Świdziński, and Marcin Woliński.
%
% The dictionary has been edited and compiled using a tool
% created by Bartłomiej Nitoń.
%
% The original Syntactic Dictionary of Polish Verbs derives from:
%
% Marek Świdziński
% Institute of Polish
% Warsaw University
% Warsaw, Poland
%
% © Copyright 1998,2012 by Marek Świdziński
%
% This work is distributed under a CC BY-SA license:
% http://creativecommons.org/licenses/by-sa/2.0/
%
% Parameters:
%     Dictionaries:   <vocabularies>
%     Schema opinions: <opinions>
%     Lemma statuses: <statuses>
%     Owners:         <owners>
%     Part of speech: <part of speech>
%     Opinions added: <opinions added>
%
"""

# English month names indexed by (month number - 1); used to format the
# export date independently of the process locale.
EN_MONTHS = [
    'January', 'February', 'March',
    'April', 'May', 'June',
    'July', 'August', 'September',
    'October', 'November', 'December',
]

@ajax(method='post')
def create_vocabulary(request, form_data):
    """Export the lemmas selected in the download form to a temporary file.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items that
    is flattened into a dict keyed by field name.  Non-obsolete lemmas
    (``old=False``) are narrowed by every filter field that is non-empty and
    then written in the format chosen through the ``format`` field.

    Returns ``{'file_name': <path of the generated file>}``.

    Raises ``AjaxError`` when no format was selected or when the stored
    format name is not one of the formats handled here.
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)

    if not form_dict['format']:
        raise AjaxError('format not selected')
    voc_format_obj = VocabularyFormat.objects.get(pk=form_dict['format'])

    # (form field, queryset lookup) pairs; a filter is applied only when the
    # corresponding field carries a non-empty value.
    optional_filters = (
        ('vocabularies', 'vocabulary__in'),
        ('lemma_statuses', 'status__in'),
        ('owners', 'owner__in'),
        ('poss', 'entry_obj__pos__in'),
    )
    lemmas = Lemma.objects.filter(old=False).order_by('entry')
    for field, lookup in optional_filters:
        if form_dict[field]:
            lemmas = lemmas.filter(**{lookup: form_dict[field]})
    lemmas = lemmas.distinct()

    if voc_format_obj.format == u'Tekstowy':
        tmp_folder = mkdtemp()
        # NOTE(review): chdir is process-wide; kept to preserve existing
        # behaviour, the export itself only uses the absolute temp path.
        os.chdir(tmp_folder)
        tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
        os.close(tmpfile)
        file_name = create_text_walenty(tmpfilename, lemmas,
                                        form_dict['vocabularies'], form_dict['frame_opinions'],
                                        form_dict['lemma_statuses'], form_dict['owners'],
                                        form_dict['poss'], form_dict['addframeopinion'])
    elif voc_format_obj.format == u'TEX':
        file_name = create_tex_walenty(lemmas, form_dict)
    elif voc_format_obj.format == u'TEI':
        file_name = create_tei_walenty(lemmas, form_dict)
    else:
        # Previously this fell through and failed with UnboundLocalError on
        # the return statement below.
        raise AjaxError('unsupported format')
    return {'file_name': file_name}

def create_tei_walenty(lemmas, form_dict):
    """Write ``lemmas`` as TEI XML into a fresh temporary directory.

    A temporary directory and file are created, ``createteixml`` is asked to
    fill the file, and the file is then renamed to carry an ``.xml`` suffix.

    Returns the path of the renamed file.
    """
    export_dir = mkdtemp()
    os.chdir(export_dir)
    handle, raw_path = mkstemp(dir=export_dir)
    # Only the reserved path is needed; createteixml receives the file name.
    os.close(handle)

    char_values = sorted_frame_char_values_dict()
    # One Q object per selected opinion pk; empty when nothing was selected.
    opinion_filters = [Q(value__pk=pk)
                       for pk in (form_dict['frame_opinions'] or [])]
    tei_options = {
        'sort_reflex_vals': char_values['sorted_reflex_vals'],
        'sort_aspect_vals': char_values['sorted_aspect_vals'],
        'sort_neg_vals': char_values['sorted_neg_vals'],
        'sort_pred_vals': char_values['sorted_pred_vals'],
        'frame_char_models': [],
        'form_dict': form_dict,
        'q_frame_opinions': opinion_filters,
    }
    createteixml(raw_path, lemmas, **tei_options)

    xml_path = raw_path + '.xml'
    os.rename(raw_path, xml_path)
    return xml_path

def create_text_walenty(file_name, lemmas, vocabularies, frame_opinions,
                          lemma_statuses, owners, poss, add_frame_opinions):
    """Write the plain-text export of ``lemmas`` and return its final path.

    The file at ``file_name`` is (re)written as UTF-8 with a BOM: first the
    header built by ``create_copyrights_str`` from the filter arguments, then
    one line per matching frame of every lemma.  Frames are visited per
    combination of reflex/neg/pred/aspect values (in that nesting order) and
    ordered by ``text_rep`` within a combination.  When ``frame_opinions`` is
    non-empty only frames having one of those opinions are written; when
    ``add_frame_opinions`` is true the opinion is embedded into each line.

    After the file has been closed it is renamed to ``file_name + '.txt'``
    and that new path is returned.

    Any error raised while writing propagates to the caller.  (The previous
    implementation returned from a ``finally`` clause, which silently
    discarded the error and handed back a partially written file.)
    """
    from itertools import product

    # The characteristic values do not depend on the lemma, so fetch them
    # once instead of once per lemma.
    frame_chars_dict = sorted_frame_char_values_dict()
    char_value_combinations = list(product(frame_chars_dict['sorted_reflex_vals'],
                                           frame_chars_dict['sorted_neg_vals'],
                                           frame_chars_dict['sorted_pred_vals'],
                                           frame_chars_dict['sorted_aspect_vals']))

    with codecs.open(file_name, 'w+', 'utf-8-sig') as f:
        f.write(create_copyrights_str(vocabularies, frame_opinions,
                                      lemma_statuses, owners, poss, add_frame_opinions))
        for lemma in lemmas:
            # The opinion filter is only consulted when opinions were selected.
            if frame_opinions:
                founded_frame_opinions = lemma.frame_opinions.filter(value__in=frame_opinions)
            else:
                founded_frame_opinions = None
            for reflex_val, neg_val, pred_val, aspect_val in char_value_combinations:
                matching_frames = lemma.get_frames_by_char_values(reflex_val=reflex_val,
                                                                  neg_val=neg_val,
                                                                  pred_val=pred_val,
                                                                  aspect_val=aspect_val).order_by('text_rep')
                for frame in matching_frames:
                    if (founded_frame_opinions is not None
                            and not founded_frame_opinions.filter(frame=frame).exists()):
                        continue
                    text_rep = frame.get_position_spaced_text_rep()
                    if add_frame_opinions:
                        text_rep = add_frame_opinion_and_return_text_rep(text_rep, lemma, frame)
                    # Reflexive frames get a space between the entry and the
                    # representation.
                    if frame.characteristics.filter(type=u'ZWROTNOŚĆ', value__value=u'się').exists():
                        text_rep = ' ' + text_rep
                    f.write(lemma.entry + text_rep.replace(':', ': ') + '\n')

    # Rename only after the file is closed and fully flushed.
    txt_file_name = file_name + '.txt'
    os.rename(file_name, txt_file_name)
    return txt_file_name

def create_copyrights_str(vocabularies_pks, frame_opinions_pks,
                            lemma_statuses_pks, owners_pks, poss_pks,
                            add_frame_opinions):
    """Fill in TEXT_VOCABULARY_CLAUSE for the given export parameters.

    Each ``*_pks`` argument selects rows of the corresponding model.  When a
    selection matches nothing, the header lists a fallback set instead: all
    vocabularies, all opinion values, all lemma statuses, every user owning a
    non-obsolete lemma, and every part of speech except the ``unk`` tag.

    Returns the header text with all placeholders substituted.
    """
    now = datetime.datetime.now()
    date_text = now.strftime(EN_MONTHS[now.month - 1] + ' %d, %Y')

    chosen_vocabs = Vocabulary.objects.filter(pk__in=vocabularies_pks).order_by('name')
    if not chosen_vocabs.exists():
        chosen_vocabs = Vocabulary.objects.order_by('name')

    chosen_opinions = Frame_Opinion_Value.objects.filter(pk__in=frame_opinions_pks).order_by('priority')
    if not chosen_opinions.exists():
        chosen_opinions = Frame_Opinion_Value.objects.order_by('priority')

    chosen_statuses = Lemma_Status.objects.filter(pk__in=lemma_statuses_pks).order_by('priority')
    if not chosen_statuses.exists():
        chosen_statuses = Lemma_Status.objects.order_by('priority')

    chosen_owners = User.objects.filter(pk__in=owners_pks).order_by('username')
    if not chosen_owners.exists():
        chosen_owners = User.objects.filter(lemmas__old=False).distinct().order_by('username')

    chosen_poss = POS.objects.filter(pk__in=poss_pks).order_by('priority')
    if not chosen_poss.exists():
        chosen_poss = POS.objects.exclude(tag=u'unk').order_by('priority')

    # Applied in this order, one placeholder at a time.
    substitutions = (
        ('<date>', date_text),
        ('<vocabularies>', ', '.join([item.name for item in chosen_vocabs])),
        ('<opinions>', ', '.join([item.value for item in chosen_opinions])),
        ('<statuses>', ', '.join([item.status for item in chosen_statuses])),
        ('<owners>', ', '.join([item.username for item in chosen_owners])),
        ('<part of speech>', ', '.join([item.name for item in chosen_poss])),
        ('<opinions added>', str(bool(add_frame_opinions))),
    )
    header = TEXT_VOCABULARY_CLAUSE
    for placeholder, replacement in substitutions:
        header = header.replace(placeholder, replacement)
    return header

def create_tex_walenty(lemmas, form_dict):
    """Render ``lemmas`` through the ``tex/slowal.tex`` template into a file.

    A temporary directory and file are created, the rendered template is
    HTML-unescaped and written to the file, and the file is renamed to carry
    a ``.tex`` suffix.

    Returns the path of the renamed file.  Errors raised while rendering or
    writing propagate, but the file descriptor is always closed first.
    """
    frame_chars_dict = sorted_frame_char_values_dict()
    # One Q object per selected opinion pk; empty when nothing was selected.
    q_frame_opinions = [Q(value__pk=pk)
                        for pk in (form_dict['frame_opinions'] or [])]
    tmp_folder = mkdtemp()
    # NOTE(review): chdir is process-wide; kept to preserve existing behaviour.
    os.chdir(tmp_folder)
    tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
    try:
        # Pass the TeX template through Django templating engine and into the temp file
        context = {'lemmas': lemmas,
                   'q_frame_opinions': q_frame_opinions,
                   'sort_reflex_vals': frame_chars_dict['sorted_reflex_vals'],
                   'sort_aspect_vals': frame_chars_dict['sorted_aspect_vals'],
                   'sort_neg_vals': frame_chars_dict['sorted_neg_vals'],
                   'sort_pred_vals': frame_chars_dict['sorted_pred_vals'],
                   'download_dict': form_dict}
        rendered = render_to_string('tex/slowal.tex', context)
        # HTML entities in the rendered output are turned back into plain
        # characters before the text is written.
        h = HTMLParser.HTMLParser()
        os.write(tmpfile, smart_str(h.unescape(rendered)))
    finally:
        # Previously the descriptor leaked when rendering or writing failed.
        os.close(tmpfile)
    file_name = tmpfilename + '.tex'
    os.rename(tmpfilename, file_name)
    return file_name

def add_frame_opinion_and_return_text_rep(text_rep, lemma, frame):
    """Insert the frame's opinion after the first ':'-separated field.

    The opinion is the value of the first entry of ``lemma.frame_opinions``
    that refers to ``frame``.  If it cannot be determined (no such entry, or
    the lookup fails), ``'unk'`` is inserted instead.

    Returns the new text representation, e.g. ``'a:b'`` -> ``'a:<opinion>:b'``;
    a representation without any ':' gets the opinion appended.
    """
    try:
        frame_op = lemma.frame_opinions.filter(frame__pk=frame.pk)[0].value.value
    except Exception:
        # Best-effort lookup: fall back to 'unk' on any ordinary error, but do
        # not swallow KeyboardInterrupt/SystemExit as a bare except would.
        frame_op = 'unk'
    # Splitting once is enough: the opinion always goes right after the
    # first field and the remainder is kept untouched.
    fields = text_rep.split(':', 1)
    fields.insert(1, frame_op)
    return ':'.join(fields)

def download_vocabulary(request, file_name):
    """Send a previously generated export file and delete it afterwards.

    ``file_name`` is the absolute path of the export without its leading
    slash.  Because it comes from the request, it is validated before any
    file is touched: the resolved path must be an existing regular file with
    a ``.txt``, ``.tex`` or ``.xml`` suffix located in a directory directly
    below the system temporary directory (the layout produced by ``mkdtemp``
    followed by ``mkstemp(dir=...)``).  Without this check the view would
    read, and then delete, any file the server process can access.

    Returns an ``HttpResponse`` carrying the file as an attachment named
    ``walenty_<YYYYMMDD><suffix>``.  The file and its directory are removed
    once the content has been read.

    Raises ``Http404`` when the path fails validation.
    """
    from tempfile import gettempdir
    from django.http import Http404

    fullpath = os.path.realpath('/' + file_name)
    extension = os.path.splitext(fullpath)[1]
    export_dir = os.path.dirname(fullpath)
    temp_root = os.path.realpath(gettempdir())
    if (extension not in ('.txt', '.tex', '.xml')
            or os.path.dirname(export_dir) != temp_root
            or not os.path.isfile(fullpath)):
        raise Http404('vocabulary export not found')

    download_file_name = '%s_%s' % ('walenty', datetime.datetime.now().strftime('%Y%m%d'))
    # Binary mode: the bytes are passed through exactly as they were written.
    with open(fullpath, 'rb') as f:
        data = f.read()
    # content_type is accepted by every Django release, unlike the older
    # mimetype keyword.
    response = HttpResponse(data, content_type='text/txt')
    response['Content-Disposition'] = 'attachment; filename=%s%s' % (download_file_name,
                                                                      extension)
    os.remove(fullpath)
    os.rmdir(export_dir)
    return response

@render('vocab_perm_manage_form.html')
@ajax(method='get', encode_result=False)
def vocab_perm_manage_form(request, vocabulary_name):
    """Build the permission form for the vocabulary named ``vocabulary_name``.

    The form is created from the vocabulary's current ``editors`` and
    ``viewers`` and returned as ``{'form': form}``.
    """
    vocab = Vocabulary.objects.get(name=vocabulary_name)
    return {'form': ManageVocabPermForm(editors=vocab.editors,
                                        viewers=vocab.viewers)}

@ajax(method='post')
def vocab_perm_manage_form_submit(request, form_data):
    """Replace the editors and viewers of a vocabulary with the submitted ones.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items; the
    ``vocabulary_name``, ``editors`` and ``viewers`` fields are read.  Every
    submitted editor is also added as a viewer.

    All users are looked up before the existing permissions are cleared, so a
    missing field or an unknown user pk raises without wiping the current
    permissions first.

    Returns an empty dict.
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)
    vocabulary = Vocabulary.objects.get(name=form_dict['vocabulary_name'])

    # Resolve everything up front; these lookups raise on unknown pks.
    editors = [User.objects.get(pk=user_pk) for user_pk in form_dict['editors']]
    viewers = [User.objects.get(pk=user_pk) for user_pk in form_dict['viewers']]

    vocabulary.editors.clear()
    vocabulary.viewers.clear()
    if editors:
        vocabulary.editors.add(*editors)
    if editors or viewers:
        vocabulary.viewers.add(*(editors + viewers))
    return {}

@render('vocabulary_stats.html')
@ajax(method='get', encode_result=False)
def get_vocabulary_stats(request, vocabulary_name):
    """Collect lemma/schema counts for one vocabulary or for all of them.

    When ``vocabulary_name`` is non-empty only the non-obsolete lemmas of that
    vocabulary are counted, otherwise all non-obsolete lemmas are.

    Returns ``{'vocabulary_stats': rows}`` where ``rows`` is: the label row,
    a row for all statuses together ('wszystkie'), then one row per lemma
    status ordered by priority.
    """
    if vocabulary_name:
        voc = Vocabulary.objects.get(name=vocabulary_name)
        lemmas = voc.lemmas.filter(old=False)
    else:
        lemmas = Lemma.objects.filter(old=False)
    lemma_statuses = Lemma_Status.objects.order_by('priority')
    poss = POS.objects.exclude(tag=u'unk').order_by('priority')

    vocabulary_stats_tab_lines = [create_top_labels(poss),
                                  create_status_stats_line(lemmas, poss, 'wszystkie')]
    for lemma_status in lemma_statuses:
        stat_lemmas = lemmas.filter(status__status=lemma_status.status)
        vocabulary_stats_tab_lines.append(
            create_status_stats_line(stat_lemmas, poss, lemma_status.status))
    return {'vocabulary_stats': vocabulary_stats_tab_lines}

def create_top_labels(poss):
    """Return the label row of the stats table.

    The row starts with two fixed labels and continues with the ``name`` of
    every item in ``poss``, in iteration order.
    """
    return ['Hasła / Schematy', 'wszystkie'] + [pos.name for pos in poss]

def create_status_stats_line(lemmas, poss, status):
    """Build one row of the stats table for the given ``lemmas``.

    The row is ``[status, totals, per_pos_1, per_pos_2, ...]`` where every
    entry after ``status`` is a dict with the keys ``'lemmas'`` (number of
    lemmas) and ``'schemas'`` (result of ``count_schemas``).  The per-POS
    entries follow the iteration order of ``poss``.
    """
    def counts_for(selection):
        # Lemma count first, then schema count, for one set of lemmas.
        return {'lemmas': selection.count(),
                'schemas': count_schemas(selection)}

    row = [status, counts_for(lemmas)]
    row.extend(counts_for(lemmas.filter(entry_obj__pos=pos)) for pos in poss)
    return row

def count_schemas(lemmas):
    """Return the summed number of ``frames`` over ``lemmas``.

    Each lemma is annotated with its frame count and the counts are summed in
    a single aggregate; a missing or zero sum is reported as ``0``.
    """
    per_lemma = lemmas.annotate(num_frames=Count('frames'))
    total = per_lemma.aggregate(Sum('num_frames'))['num_frames__sum']
    return total or 0