ajax_vocabulary_management.py 21.9 KB

Edit Raw Blame History

# -*- coding: utf-8 -*-

#Copyright (c) 2015, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import datetime
import HTMLParser
import os
from collections import Counter
from tempfile import mkdtemp, mkstemp

from django.contrib.auth.models import User
from django.db.models import Count, Max, Sum, Q
from django.http import HttpResponse
from django.template.loader import render_to_string
from django.utils.encoding import smart_str

from common.decorators import ajax, AjaxError, render
from dictionary.forms import ManageVocabPermForm
from dictionary.models import Frame_Opinion_Value, Lemma, Lemma_Status, \
                              POS, Vocabulary, VocabularyFormat, WalentyStat,\
                              sorted_frame_char_values_dict
from dictionary.teixml import createteixml

# Argument types that get_stats() treats as lexicalised: a frame having any
# position with an argument of one of these types is counted in 'lex_schemata'.
LEX_TYPES = ['lex', 'fixed', 'comprepnp']

# Header written at the top of the plain-text export.  The <date>,
# <vocabularies>, <opinions>, <statuses>, <owners>, <part of speech> and
# <opinions added> placeholders are substituted by create_copyrights_str().
TEXT_VOCABULARY_CLAUSE = u"""
% The Polish Valence Dictionary (Walenty)
% <date>
%
% The Polish Valence Dictionary (Walenty) is an adaptation of
% the Syntactic Dictionary of Polish Verbs by Marek Świdziński
% in its electronic version provided by Łukasz Dębowski and
% Elżbieta Hajnicz and further expanded by Witold Kieraś to
% include the most frequent verbs in the 1 million sample of
% NKJP (National Corpus of Polish).
%
% The presented resource results from an automatic conversion
% of the aforementioned dictionary, manually reviewed by Filip
% Skwarski to include correct information about a number of new
% features, including sentential subjects, passivisation, and
% control relations.
%
% The format of the new dictionary has been established by Filip
% Skwarski, Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski,
% Marek Świdziński, and Marcin Woliński.
%
% The dictionary has been edited and compiled using a tool
% created by Bartłomiej Nitoń.
%
% The original Syntactic Dictionary of Polish Verbs derives from:
%
% Marek Świdziński
% Institute of Polish
% Warsaw University
% Warsaw, Poland
%
% © Copyright 1998,2012 by Marek Świdziński
%
% This work is distributed under a CC BY-SA license:
% http://creativecommons.org/licenses/by-sa/2.0/
%
% Parameters:
%     Dictionaries:   <vocabularies>
%     Schema opinions: <opinions>
%     Lemma statuses: <statuses>
%     Owners:         <owners>
%     Part of speech: <part of speech>
%     Opinions added: <opinions added>
%
"""

# English month names indexed by (month number - 1); create_copyrights_str()
# uses them to build the <date> value of the export header.
EN_MONTHS = ['January', 'February', 'March', 'April', 'May', 'June',
             'July', 'August', 'September', 'October', 'November', 'December']

@ajax(method='post')
def create_vocabulary(request, form_data):
    """Export the lemmas selected in the download form to a temporary file.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items that
    is flattened into a dict.  The keys read here are ``format``,
    ``vocabularies``, ``lemma_statuses``, ``owners``, ``poss``,
    ``frame_opinions`` and ``addframeopinion``.  A falsy filter value means
    that no filtering is applied on that field.

    Returns ``{'file_name': <path of the generated file>}``.

    Raises AjaxError when no format was selected or when the selected
    format is not one of the formats handled below.
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)

    if not form_dict['format']:
        raise AjaxError('format not selected')
    voc_format_obj = VocabularyFormat.objects.get(pk=form_dict['format'])

    # Only current (non-archived) lemmas, narrowed by every filter provided.
    lemmas = Lemma.objects.filter(old=False).order_by('entry')
    if form_dict['vocabularies']:
        lemmas = lemmas.filter(vocabulary__in=form_dict['vocabularies'])
    if form_dict['lemma_statuses']:
        lemmas = lemmas.filter(status__in=form_dict['lemma_statuses'])
    if form_dict['owners']:
        lemmas = lemmas.filter(owner__in=form_dict['owners'])
    if form_dict['poss']:
        lemmas = lemmas.filter(entry_obj__pos__in=form_dict['poss'])
    lemmas = lemmas.distinct()

    if voc_format_obj.format == u'Tekstowy':
        # mkdtemp()/mkstemp(dir=...) already yield absolute paths, so the
        # process-wide working directory is deliberately left untouched.
        tmp_folder = mkdtemp()
        tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
        os.close(tmpfile)
        file_name = create_text_walenty(tmpfilename, lemmas,
                                        form_dict['vocabularies'], form_dict['frame_opinions'],
                                        form_dict['lemma_statuses'], form_dict['owners'],
                                        form_dict['poss'], form_dict['addframeopinion'])
    elif voc_format_obj.format == u'TEX':
        file_name = create_tex_walenty(lemmas, form_dict)
    elif voc_format_obj.format == u'TEI':
        file_name = create_tei_walenty(lemmas, form_dict)
    else:
        # Previously this fell through and failed with UnboundLocalError.
        raise AjaxError('unsupported format')
    return {'file_name': file_name}

def create_tei_walenty(lemmas, form_dict):
    """Write the given lemmas as TEI XML into a fresh temporary directory.

    One ``Q(value__pk=...)`` filter is built for every primary key found in
    ``form_dict['frame_opinions']`` and handed to ``createteixml`` together
    with the sorted frame characteristic values.

    Returns the path of the generated file (temporary name + ``.xml``).
    """
    export_dir = mkdtemp()
    os.chdir(export_dir)
    handle, raw_path = mkstemp(dir=export_dir)
    os.close(handle)

    char_values = sorted_frame_char_values_dict()
    opinion_filters = [Q(value__pk=opinion_pk)
                       for opinion_pk in (form_dict['frame_opinions'] or [])]

    createteixml(raw_path,
                 lemmas,
                 sort_reflex_vals=char_values['sorted_reflex_vals'],
                 sort_aspect_vals=char_values['sorted_aspect_vals'],
                 sort_neg_vals=char_values['sorted_neg_vals'],
                 sort_pred_vals=char_values['sorted_pred_vals'],
                 frame_char_models=[],
                 form_dict=form_dict,
                 q_frame_opinions=opinion_filters)

    xml_path = raw_path + '.xml'
    os.rename(raw_path, xml_path)
    return xml_path

def create_text_walenty(file_name, lemmas, vocabularies, frame_opinions,
                          lemma_statuses, owners, poss, add_frame_opinions):
    """Write the plain-text export of ``lemmas`` and return the final path.

    The file starts with the header built by ``create_copyrights_str`` and
    then holds one line per exported frame: the lemma entry followed by the
    frame's spaced text representation (with ``:`` expanded to ``: ``).
    Frames are visited grouped by every combination of the sorted reflex,
    negativity, predicativity and aspect values, ordered by ``text_rep``.

    Parameters:
        file_name: path of an existing temporary file to write into.
        lemmas: iterable of lemmas to export.
        vocabularies, lemma_statuses, owners, poss: selected primary keys,
            only used for the header.
        frame_opinions: selected opinion primary keys; when falsy every
            frame is exported, otherwise only frames having one of these
            opinions for the lemma.
        add_frame_opinions: when truthy the frame opinion is inserted into
            each line.

    Returns ``file_name + '.txt'``, the name the file is renamed to once it
    has been written and closed.

    Errors are no longer swallowed: any exception raised while writing
    propagates to the caller instead of being hidden by a ``return`` placed
    in a ``finally`` clause.
    """
    # The helper takes no arguments, so one call serves every lemma.
    frame_chars_dict = sorted_frame_char_values_dict()

    with codecs.open(file_name, 'w+', 'utf-8-sig') as f:
        f.write(create_copyrights_str(vocabularies, frame_opinions,
                                      lemma_statuses, owners, poss, add_frame_opinions))
        for lemma in lemmas:
            founded_frame_opinions = lemma.frame_opinions.filter(value__in=frame_opinions)
            for reflex_val in frame_chars_dict['sorted_reflex_vals']:
                for neg_val in frame_chars_dict['sorted_neg_vals']:
                    for pred_val in frame_chars_dict['sorted_pred_vals']:
                        for aspect_val in frame_chars_dict['sorted_aspect_vals']:
                            matching_frames = lemma.get_frames_by_char_values(reflex_val=reflex_val,
                                                                              neg_val=neg_val,
                                                                              pred_val=pred_val,
                                                                              aspect_val=aspect_val).order_by('text_rep')
                            for frame in matching_frames:
                                # With an opinion filter, skip frames lacking a selected opinion.
                                if frame_opinions and not founded_frame_opinions.filter(frame=frame).exists():
                                    continue
                                text_rep = frame.get_position_spaced_text_rep()
                                if add_frame_opinions:
                                    text_rep = add_frame_opinion_and_return_text_rep(text_rep, lemma, frame)
                                # Reflexive ("się") frames get a space between entry and representation.
                                if frame.characteristics.filter(type=u'ZWROTNOŚĆ', value__value=u'się').exists():
                                    text_rep = ' ' + text_rep
                                f.write(lemma.entry+text_rep.replace(':',': ')+'\n')

    # Rename only after the file has been flushed and closed.
    final_name = file_name + '.txt'
    os.rename(file_name, final_name)
    return final_name

def create_copyrights_str(vocabularies_pks, frame_opinions_pks,
                            lemma_statuses_pks, owners_pks, poss_pks,
                            add_frame_opinions):
    """Build the header of the text export from TEXT_VOCABULARY_CLAUSE.

    Each ``*_pks`` argument selects objects by primary key.  When a selection
    matches nothing, the header lists the default set instead: all
    vocabularies, all opinion values, all lemma statuses, users owning at
    least one non-old lemma, and every part of speech except ``unk``.

    Returns the template with all placeholders substituted.
    """
    def chosen_or_default(chosen, default):
        # Fall back to the default queryset when the selection is empty.
        return chosen if chosen.exists() else default

    now = datetime.datetime.now()
    month_name = EN_MONTHS[now.month - 1]

    vocabularies = chosen_or_default(
        Vocabulary.objects.filter(pk__in=vocabularies_pks).order_by('name'),
        Vocabulary.objects.order_by('name'))
    frame_opinions = chosen_or_default(
        Frame_Opinion_Value.objects.filter(pk__in=frame_opinions_pks).order_by('priority'),
        Frame_Opinion_Value.objects.order_by('priority'))
    lemma_statuses = chosen_or_default(
        Lemma_Status.objects.filter(pk__in=lemma_statuses_pks).order_by('priority'),
        Lemma_Status.objects.order_by('priority'))
    owners = chosen_or_default(
        User.objects.filter(pk__in=owners_pks).order_by('username'),
        User.objects.filter(lemmas__old=False).distinct().order_by('username'))
    poss = chosen_or_default(
        POS.objects.filter(pk__in=poss_pks).order_by('priority'),
        POS.objects.exclude(tag=u'unk').order_by('priority'))

    # Applied in this order, exactly one placeholder at a time.
    substitutions = [
        ('<date>', now.strftime(month_name + ' %d, %Y')),
        ('<vocabularies>', ', '.join([vocab.name for vocab in vocabularies])),
        ('<opinions>', ', '.join([opinion.value for opinion in frame_opinions])),
        ('<statuses>', ', '.join([status.status for status in lemma_statuses])),
        ('<owners>', ', '.join([owner.username for owner in owners])),
        ('<part of speech>', ', '.join([pos.name for pos in poss])),
        ('<opinions added>', 'True' if add_frame_opinions else 'False'),
    ]
    header = TEXT_VOCABULARY_CLAUSE
    for placeholder, replacement in substitutions:
        header = header.replace(placeholder, replacement)
    return header

def create_tex_walenty(lemmas, form_dict):
    """Render the ``tex/slowal.tex`` template for ``lemmas`` into a temp file.

    One ``Q(value__pk=...)`` filter is built for every primary key found in
    ``form_dict['frame_opinions']``.  The rendered template is HTML-unescaped
    and converted with ``smart_str`` before being written.

    Returns the path of the generated file (temporary name + ``.tex``).
    """
    frame_chars_dict = sorted_frame_char_values_dict()
    q_frame_opinions = [Q(value__pk=pk)
                        for pk in (form_dict['frame_opinions'] or [])]

    # mkdtemp()/mkstemp(dir=...) return absolute paths, so the process-wide
    # working directory is deliberately not changed.
    tmp_folder = mkdtemp()
    tmpfile, tmpfilename = mkstemp(dir=tmp_folder)

    # The file object takes ownership of the descriptor: it is closed even
    # when rendering raises, and write() never leaves a short write behind.
    with os.fdopen(tmpfile, 'wb') as tex_file:
        # Pass the TeX template through Django templating engine and into the temp file
        rendered = render_to_string('tex/slowal.tex', {'lemmas': lemmas,
                                                       'q_frame_opinions': q_frame_opinions,
                                                       'sort_reflex_vals': frame_chars_dict['sorted_reflex_vals'],
                                                       'sort_aspect_vals': frame_chars_dict['sorted_aspect_vals'],
                                                       'sort_neg_vals'   : frame_chars_dict['sorted_neg_vals'],
                                                       'sort_pred_vals'  : frame_chars_dict['sorted_pred_vals'],
                                                       'download_dict'   : form_dict})
        tex_file.write(smart_str(HTMLParser.HTMLParser().unescape(rendered)))

    file_name = tmpfilename + '.tex'
    os.rename(tmpfilename, file_name)
    return file_name

def add_frame_opinion_and_return_text_rep(text_rep, lemma, frame):
    """Insert the lemma's opinion about ``frame`` into ``text_rep``.

    ``text_rep`` is split on ``:`` and the opinion becomes the second field.
    The opinion is the value of the first ``lemma.frame_opinions`` entry
    matching the frame, or ``'unk'`` when there is no such entry or it
    carries no value.

    Returns the re-joined text representation.
    """
    try:
        frame_op = lemma.frame_opinions.filter(frame__pk=frame.pk).all()[0].value.value
    except (IndexError, AttributeError):
        # No opinion recorded for this frame (or the opinion has no value).
        # Other errors are no longer hidden by a bare ``except``.
        frame_op = 'unk'
    frame_form_list = text_rep.split(':')
    frame_form_list.insert(1, frame_op)
    return ':'.join(frame_form_list)

def download_vocabulary(request, file_name):
    """Send a previously generated export as an attachment, then delete it.

    ``file_name`` is the absolute path of the export without its leading
    ``/``.  Because it comes from the request, it is only honoured when it
    resolves to ``<system temp dir>/<one directory>/<file>`` with a
    ``.txt``, ``.tex`` or ``.xml`` extension, which is the layout produced
    by ``mkdtemp()`` + ``mkstemp(dir=...)`` in the export functions.
    Anything else gets an empty 403 response; without this check the view
    would read and then delete an arbitrary file of the server.

    On success the file and its (then empty) directory are removed and the
    response carrying the file content is returned.
    """
    import tempfile  # local: the module header only imports mkdtemp/mkstemp

    allowed_extensions = ('.txt', '.tex', '.xml')
    tmp_root = os.path.realpath(tempfile.gettempdir())
    fullpath = os.path.realpath('/' + file_name)
    export_dir = os.path.dirname(fullpath)
    extension = os.path.splitext(fullpath)[1]
    if (extension not in allowed_extensions
            or os.path.dirname(export_dir) != tmp_root):
        return HttpResponse(status=403)

    download_file_name = '%s_%s' % ('walenty', datetime.datetime.now().strftime('%Y%m%d'))
    # Binary mode: the bytes are passed through unchanged.
    with open(fullpath, "rb") as f:
        data = f.read()
    # ``content_type`` replaces the older ``mimetype`` keyword.
    response = HttpResponse(data, content_type='text/txt')
    response['Content-Disposition'] = 'attachment; filename=%s%s' % (download_file_name,
                                                                     extension)
    os.remove(fullpath)
    os.rmdir(export_dir)
    return response

@render('vocab_perm_manage_form.html')
@ajax(method='get', encode_result=False)
def vocab_perm_manage_form(request, vocabulary_name):
    """Build the permission form for the vocabulary named ``vocabulary_name``.

    The form is initialised with the vocabulary's current editors and
    viewers and returned as ``{'form': form}``.
    """
    vocabulary = Vocabulary.objects.get(name=vocabulary_name)
    permission_form = ManageVocabPermForm(editors=vocabulary.editors,
                                          viewers=vocabulary.viewers)
    return {'form': permission_form}

@ajax(method='post')
def vocab_perm_manage_form_submit(request, form_data):
    """Replace the editors and viewers of a vocabulary with the submitted ones.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items; the
    keys read are ``vocabulary_name``, ``editors`` and ``viewers`` (the last
    two hold user primary keys).  Every editor is also added as a viewer.

    Returns an empty dict.
    """
    submitted = {field['name']: field['value'] for field in form_data}
    vocabulary = Vocabulary.objects.get(name=submitted['vocabulary_name'])

    # Start from a clean slate, then re-add the submitted users.
    vocabulary.editors.clear()
    vocabulary.viewers.clear()

    for editor_pk in submitted['editors']:
        editor = User.objects.get(pk=editor_pk)
        vocabulary.editors.add(editor)
        vocabulary.viewers.add(editor)

    for viewer_pk in submitted['viewers']:
        viewer = User.objects.get(pk=viewer_pk)
        vocabulary.viewers.add(viewer)

    return {}

@render('vocabulary_stats.html')
@ajax(method='get', encode_result=False)
def get_vocabulary_stats(request, vocabulary_name):
    """Build the lemma/schema statistics table for one or all vocabularies.

    When ``vocabulary_name`` is truthy only the non-old lemmas of that
    vocabulary are counted, otherwise all non-old lemmas.

    Returns ``{'vocabulary_stats': rows}`` where ``rows`` holds the label
    row, a row for all statuses together (``'wszystkie'``) and then one row
    per lemma status ordered by priority.
    """
    if vocabulary_name:
        voc = Vocabulary.objects.get(name=vocabulary_name)
        lemmas = voc.lemmas.filter(old=False)
    else:
        lemmas = Lemma.objects.filter(old=False)
    lemma_statuses = Lemma_Status.objects.order_by('priority')
    poss = POS.objects.exclude(tag=u'unk').order_by('priority')

    top_labels = create_top_labels(poss)
    all_statuses_line = create_status_stats_line(lemmas, poss, 'wszystkie')
    vocabulary_stats_tab_lines = [top_labels, all_statuses_line]
    for lemma_status in lemma_statuses:
        stat_lemmas = lemmas.filter(status__status=lemma_status.status)
        vocabulary_stats_tab_lines.append(
            create_status_stats_line(stat_lemmas, poss, lemma_status.status))
    return {'vocabulary_stats': vocabulary_stats_tab_lines}

def create_top_labels(poss):
    """Return the header row of the stats table.

    The row holds the two fixed labels followed by the ``name`` of every
    item of ``poss``, in iteration order.
    """
    fixed_labels = ['Hasła / Schematy', 'wszystkie']
    return fixed_labels + [part_of_speech.name for part_of_speech in poss]

def create_status_stats_line(lemmas, poss, status):
    """Return one row of the stats table for the given lemma queryset.

    The row is ``[status, overall, per_pos_1, per_pos_2, ...]`` where every
    entry after the label is a ``{'lemmas': ..., 'schemas': ...}`` dict: the
    first for all ``lemmas``, the rest for the lemmas of each item of
    ``poss``.
    """
    def counts_for(lemma_set):
        # Lemma count first, then the total number of their schemas.
        return {'lemmas': lemma_set.count(),
                'schemas': count_schemas(lemma_set)}

    row = [status, counts_for(lemmas)]
    row.extend(counts_for(lemmas.filter(entry_obj__pos=part_of_speech))
               for part_of_speech in poss)
    return row

def count_schemas(lemmas):
    """Return the total number of frames attached to ``lemmas``.

    Every lemma is annotated with its frame count and the counts are summed
    in one aggregate; a falsy result (e.g. nothing to sum) yields 0.
    """
    with_frame_counts = lemmas.annotate(num_frames=Count('frames'))
    totals = with_frame_counts.aggregate(Sum('num_frames'))
    return totals['num_frames__sum'] or 0

@render('other_stats.html')
@ajax(method='get', encode_result=False)
def get_other_stats(request):
    """Return all stored WalentyStat entries ordered by their label."""
    stats_by_label = WalentyStat.objects.order_by('label')
    return stats_by_label

def get_stats(statuses, pos):
    """Collect dictionary statistics for one part of speech.

    Parameters:
        statuses: lemma statuses to include (used with ``status__in``).
        pos: part-of-speech tag matched against ``entry_obj__pos__tag``.

    Returns a ``Counter`` with the totals of lemmas, sub-lemmas, schemata
    (overall and per opinion value), positions, phrases, coordinated and
    lexicalised schemata/lemmas, plus the per-shape sub-entry counters
    produced by ``get_sub_entries_dict``.
    """
    stats_dict = Counter({u'phrases': 0,
                          u'poss': 0,
                          u'lemmas': 0,
                          u'sub_lemmas': 0,
                          u'schemata': 0,
                          u'cer_schemata': 0,
                          u'uncer_schemata': 0,
                          u'bad_schemata': 0,
                          u'arch_schemata': 0,
                          u'col_schemata': 0,
                          u'vul_schemata': 0,
                          u'coor_schemata': 0,
                          u'lex_schemata': 0,
                          u'coor_lemmas': 0,
                          u'lex_lemmas': 0})

    # Statistic key -> frame opinion value counted under that key.
    opinion_counters = ((u'cer_schemata', u'pewny'),
                        (u'uncer_schemata', u'wątpliwy'),
                        (u'bad_schemata', u'zły'),
                        (u'arch_schemata', u'archaiczny'),
                        (u'col_schemata', u'potoczny'),
                        (u'vul_schemata', u'wulgarny'))

    lemmas = Lemma.objects.filter(old=False,
                                  entry_obj__pos__tag=pos).filter(status__in=statuses).distinct()
    stats_dict[u'lemmas'] = lemmas.count()
    for lemma in lemmas.order_by('entry').all():
        for stat_key, opinion_value in opinion_counters:
            stats_dict[stat_key] += lemma.frame_opinions.filter(value__value=opinion_value).count()
        stats_dict[u'schemata'] += lemma.frames.count()

        # Counter addition keeps only positive counts; dropped keys still
        # read back as 0 because Counter defaults missing keys to 0.
        stats_dict = stats_dict + Counter(get_sub_entries_dict(lemma))

        has_phraseology = False
        has_coordination = False
        for frame in lemma.frames.all():
            stats_dict[u'poss'] += frame.positions.count()
            # Largest number of arguments on a single position of this frame;
            # None when the frame has no positions.
            max_args = frame.positions.annotate(num_args=Count('arguments')).aggregate(Max('num_args'))['num_args__max']
            if max_args is not None and max_args > 1:
                stats_dict[u'coor_schemata'] += 1
                has_coordination = True
            # Named ``position`` so the ``pos`` parameter is not shadowed.
            for position in frame.positions.all():
                stats_dict[u'phrases'] += position.arguments.count()
            if frame.positions.filter(arguments__type__in=LEX_TYPES).exists():
                stats_dict[u'lex_schemata'] += 1
                has_phraseology = True

        if has_phraseology:
            stats_dict[u'lex_lemmas'] += 1
        if has_coordination:
            stats_dict[u'coor_lemmas'] += 1

    return stats_dict

def get_sub_entries_dict(lemma):
    """Count the sub-entries of ``lemma``.

    A sub-entry is a combination of reflex, negativity, predicativity and
    aspect values for which the lemma has at least one frame.

    Returns a dict with the total under ``'sub_lemmas'`` and one counter per
    found combination, keyed by a label built from the four values.
    """
    counts = {'sub_lemmas': 0}
    char_values = sorted_frame_char_values_dict()
    for reflex in char_values['sorted_reflex_vals']:
        for neg in char_values['sorted_neg_vals']:
            for pred in char_values['sorted_pred_vals']:
                for aspect in char_values['sorted_aspect_vals']:
                    frames = lemma.get_frames_by_char_values(reflex_val=reflex,
                                                             neg_val=neg,
                                                             pred_val=pred,
                                                             aspect_val=aspect)
                    if not frames.exists():
                        continue
                    counts[u'sub_lemmas'] += 1
                    label = u'Liczba podhaseł postaci: (%s,%s,%s,%s)' % (reflex.value, neg.value,
                                                                        pred.value, aspect.value)
                    counts[label] = counts.get(label, 0) + 1
    return counts

def write_stats(stats_path, stats):
    """Write the statistics report to ``stats_path`` as UTF-8 text.

    Parameters:
        stats_path: path of the file to create or overwrite.
        stats: mapping holding the counters read below (``lemmas``,
            ``poss``, ``phrases``, ``sub_lemmas``, the ``*_schemata`` and
            ``*_lemmas`` keys) plus the per-shape sub-entry counters that
            ``write_subschemas_stats`` writes.

    Each statistic is written as ``<label>\\t<number>``.  The file is closed
    even when writing fails, and a failure to open it is reported as the
    original I/O error.
    """
    with codecs.open(stats_path, 'wt', 'utf-8') as outfile:
        outfile.write(u'Łączna liczba haseł:\t%d\n\n' % stats['lemmas'])
        outfile.write(u'Łączna liczba pozycji w schematach:\t%d\n' % stats['poss'])
        outfile.write(u'Łączna liczba realizacji w schematach:\t%d\n\n' % stats['phrases'])

        outfile.write(u'Łączna liczba podhaseł:\t%d\n' % stats['sub_lemmas'])
        outfile.write(u'Liczba podhaseł postaci (ZWROTNOŚĆ, NEGATYWNOŚĆ, PREDYKATYWNOŚĆ, ASPEKT)\n')
        write_subschemas_stats(outfile, stats)

        outfile.write(u'Łączna liczba schematów:\t%d\n' % stats['schemata'])
        outfile.write(u'Liczba schematów pewnych:\t%d\n' % stats['cer_schemata'])
        outfile.write(u'Liczba schematów wątpliwych:\t%d\n' % stats['uncer_schemata'])
        outfile.write(u'Liczba schematów złych:\t%d\n' % stats['bad_schemata'])
        outfile.write(u'Liczba schematów archaicznych:\t%d\n' % stats['arch_schemata'])
        outfile.write(u'Liczba schematów potocznych:\t%d\n' % stats['col_schemata'])
        outfile.write(u'Liczba schematów wulgarnych:\t%d\n\n' % stats['vul_schemata'])

        outfile.write(u'Łączna liczba schematów z koordynacją:\t%d\n' % stats['coor_schemata'])
        outfile.write(u'Łączna liczba schematów zleksykalizowanych:\t%d\n\n' % stats['lex_schemata'])

        outfile.write(u'Łączna liczba haseł zawierających pozycje z koordynacją:\t%d\n' % stats['coor_lemmas'])
        outfile.write(u'Łączna liczba haseł zawierających schematy zleksykalizowane:\t%d\n\n' % stats['lex_lemmas'])

def write_subschemas_stats(stats_file, stats):
    """Write the per-shape sub-entry counters to ``stats_file``.

    Only the entries of ``stats`` whose key starts with the sub-entry label
    prefix are written, one ``<key>:\\t<count>`` line each, sorted as text
    and followed by an empty line.
    """
    prefix = u'Liczba podhaseł postaci:'
    lines = sorted('%s:\t%d\n' % (label, count)
                   for label, count in stats.iteritems()
                   if label.startswith(prefix))
    for line in lines:
        stats_file.write(line)
    stats_file.write('\n')

def update_walenty_stats(stats):
    """Replace the stored WalentyStat rows with values taken from ``stats``.

    All existing WalentyStat objects are deleted first; then one row is
    saved per statistic, with the value converted by ``str()``.
    NOTE(review): the visible part of the file ends here, so further rows
    may be saved below - confirm against the full file.
    """
    # Drop the previous snapshot before writing the new one.
    WalentyStat.objects.all().delete()
    WalentyStat(label=u'Łączna liczba haseł', value=str(stats['lemmas'])).save()
    WalentyStat(label=u'Łączna liczba pozycji w schematach', value=str(stats['poss'])).save()
    WalentyStat(label=u'Łączna liczba realizacji w schematach', value=str(stats['phrases'])).save()
    WalentyStat(label=u'Łączna liczba schematów', value=str(stats['schemata'])).save()
    WalentyStat(label=u'Łączna liczba schematów z koordynacją', value=str(stats['coor_schemata'])).save()
    WalentyStat(label=u'Łączna liczba schematów zleksykalizowanych', value=str(stats['lex_schemata'])).save()