# ajax_vocabulary_management.py
# -*- coding: utf-8 -*-

#Copyright (c) 2015, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided 
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and 
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions 
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED 
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A 
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED 
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import datetime
import HTMLParser
import os
from collections import Counter
from tempfile import mkdtemp, mkstemp

from django.contrib.auth.models import User
from django.db.models import Count, Max, Sum, Q
from django.http import HttpResponse
from django.template.loader import render_to_string
from django.utils.encoding import smart_str

from common.decorators import ajax, AjaxError, render
from dictionary.forms import ManageVocabPermForm
from dictionary.models import Frame_Opinion_Value, Lemma, Lemma_Status, \
                              POS, Vocabulary, VocabularyFormat, WalentyStat,\
                              sorted_frame_char_values_dict
from dictionary.teixml import createteixml

# Argument type names that get_stats() looks for when counting
# 'lex_schemata' and 'lex_lemmas'.
LEX_TYPES = ['lex', 'fixed', 'comprepnp']

# Header template for the plain-text export. The <date>, <vocabularies>,
# <opinions>, <statuses>, <owners>, <part of speech> and <opinions added>
# placeholders are substituted by create_copyrights_str().
TEXT_VOCABULARY_CLAUSE = u"""
% The Polish Valence Dictionary (Walenty)
% <date>
%
% The Polish Valence Dictionary (Walenty) is an adaptation of
% the Syntactic Dictionary of Polish Verbs by Marek Świdziński
% in its electronic version provided by Łukasz Dębowski and
% Elżbieta Hajnicz and further expanded by Witold Kieraś to
% include the most frequent verbs in the 1 million sample of
% NKJP (National Corpus of Polish).
%
% The presented resource results from an automatic conversion
% of the aforementioned dictionary, manually reviewed by Filip
% Skwarski to include correct information about a number of new
% features, including sentential subjects, passivisation, and
% control relations.
%
% The format of the new dictionary has been established by Filip
% Skwarski, Elżbieta Hajnicz, Agnieszka Patejuk, Adam Przepiórkowski,
% Marek Świdziński, and Marcin Woliński.
%
% The dictionary has been edited and compiled using a tool
% created by Bartłomiej Nitoń.
%
% The original Syntactic Dictionary of Polish Verbs derives from:
%
% Marek Świdziński
% Institute of Polish
% Warsaw University
% Warsaw, Poland
%
% © Copyright 1998,2012 by Marek Świdziński
%
% This work is distributed under a CC BY-SA license:
% http://creativecommons.org/licenses/by-sa/2.0/
%
% Parameters:
%     Dictionaries:   <vocabularies>
%     Schema opinions: <opinions>
%     Lemma statuses: <statuses>
%     Owners:         <owners>
%     Part of speech: <part of speech>
%     Opinions added: <opinions added>
%     
"""

# English month names, indexed by (month number - 1); create_copyrights_str()
# uses them to build the date in the export header.
# NOTE(review): presumably used instead of strftime('%B') so the header stays
# English regardless of the process locale -- confirm.
EN_MONTHS = ['January', 'February', 'March', 'April', 'May', 'June', 
             'July', 'August', 'September', 'October', 'November', 'December']

@ajax(method='post')
def create_vocabulary(request, form_data):
    """Export the lemmas selected in the download form to a temporary file.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items that
    is flattened into a dict keyed by field name.  The fields read here are
    ``format``, ``vocabularies``, ``lemma_statuses``, ``owners``, ``poss``,
    ``frame_opinions`` and ``addframeopinion``.  A filter field whose value
    is empty is simply not applied.

    Returns ``{'file_name': <path of the generated file>}``.

    Raises ``AjaxError`` when no format was selected, or when the selected
    format is not one of 'Tekstowy', 'TEX' or 'TEI' (previously the latter
    case crashed with an UnboundLocalError on ``file_name``).
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)

    if not form_dict['format']:
        raise AjaxError('format not selected')
    voc_format_obj = VocabularyFormat.objects.get(pk=form_dict['format'])

    # (form field, queryset lookup) pairs, applied in this order and only
    # when the form field carries a non-empty selection.
    optional_filters = (
        ('vocabularies', 'vocabulary__in'),
        ('lemma_statuses', 'status__in'),
        ('owners', 'owner__in'),
        ('poss', 'entry_obj__pos__in'),
    )
    lemmas = Lemma.objects.filter(old=False).order_by('entry')
    for field_name, lookup in optional_filters:
        if form_dict[field_name]:
            lemmas = lemmas.filter(**{lookup: form_dict[field_name]})
    lemmas = lemmas.distinct()

    if voc_format_obj.format == u'Tekstowy':
        tmp_folder = mkdtemp()
        # NOTE(review): os.chdir changes the working directory of the whole
        # process; kept as in the original -- confirm whether it is needed.
        os.chdir(tmp_folder)
        tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
        os.close(tmpfile)
        file_name = create_text_walenty(tmpfilename, lemmas,
                                        form_dict['vocabularies'], form_dict['frame_opinions'],
                                        form_dict['lemma_statuses'], form_dict['owners'],
                                        form_dict['poss'], form_dict['addframeopinion'])
    elif voc_format_obj.format == u'TEX':
        file_name = create_tex_walenty(lemmas, form_dict)
    elif voc_format_obj.format == u'TEI':
        file_name = create_tei_walenty(lemmas, form_dict)
    else:
        # Without this branch file_name would be unbound below.
        raise AjaxError('unsupported format')
    return {'file_name': file_name}

def create_tei_walenty(lemmas, form_dict):
    """Write ``lemmas`` as TEI XML into a fresh temporary directory.

    The file is produced by ``createteixml`` under a name obtained from
    ``mkstemp`` and then renamed to carry an ``.xml`` suffix.  One
    ``Q(value__pk=...)`` object is built per entry of
    ``form_dict['frame_opinions']`` (none when that value is empty).

    Returns the path of the renamed ``.xml`` file.
    """
    work_dir = mkdtemp()
    os.chdir(work_dir)
    handle, raw_path = mkstemp(dir=work_dir)
    os.close(handle)

    char_values = sorted_frame_char_values_dict()
    opinion_filters = [Q(value__pk=pk)
                       for pk in (form_dict['frame_opinions'] or [])]

    createteixml(raw_path,
                 lemmas,
                 sort_reflex_vals=char_values['sorted_reflex_vals'],
                 sort_aspect_vals=char_values['sorted_aspect_vals'],
                 sort_neg_vals=char_values['sorted_neg_vals'],
                 sort_pred_vals=char_values['sorted_pred_vals'],
                 frame_char_models=[],
                 form_dict=form_dict,
                 q_frame_opinions=opinion_filters)

    xml_path = raw_path + '.xml'
    os.rename(raw_path, xml_path)
    return xml_path

def create_text_walenty(file_name, lemmas, vocabularies, frame_opinions,
                          lemma_statuses, owners, poss, add_frame_opinions):
    """Write the plain-text export of ``lemmas`` and return its final path.

    The file at ``file_name`` is written as UTF-8 with a BOM: first the
    header from ``create_copyrights_str`` and then one line per exported
    frame, in the order of the sorted reflex / neg / pred / aspect
    characteristic values.  When ``frame_opinions`` is non-empty only frames
    that have an opinion with one of those values are written.  When
    ``add_frame_opinions`` is true the opinion is inserted into each line by
    ``add_frame_opinion_and_return_text_rep``.

    After a successful write the file is renamed to ``file_name + '.txt'``
    and that new path is returned.

    Any exception raised while writing now propagates to the caller.  The
    original ``return`` inside ``finally`` swallowed it and handed back the
    path of a partially written file; it also hit a NameError when the file
    could not be opened at all.
    """
    from itertools import product

    # sorted_frame_char_values_dict() takes no arguments, so it is called once
    # here instead of once per lemma.
    frame_chars_dict = sorted_frame_char_values_dict()
    char_value_combinations = list(product(frame_chars_dict['sorted_reflex_vals'],
                                           frame_chars_dict['sorted_neg_vals'],
                                           frame_chars_dict['sorted_pred_vals'],
                                           frame_chars_dict['sorted_aspect_vals']))

    with codecs.open(file_name, 'w+', 'utf-8-sig') as f:
        f.write(create_copyrights_str(vocabularies, frame_opinions,
                                      lemma_statuses, owners, poss, add_frame_opinions))
        for lemma in lemmas:
            founded_frame_opinions = lemma.frame_opinions.filter(value__in=frame_opinions)
            for reflex_val, neg_val, pred_val, aspect_val in char_value_combinations:
                matching_frames = lemma.get_frames_by_char_values(reflex_val=reflex_val,
                                                                  neg_val=neg_val,
                                                                  pred_val=pred_val,
                                                                  aspect_val=aspect_val).order_by('text_rep')
                for frame in matching_frames:
                    # An opinion filter was given and this frame has no
                    # matching opinion: leave it out of the export.
                    if frame_opinions and not founded_frame_opinions.filter(frame=frame).exists():
                        continue
                    text_rep = frame.get_position_spaced_text_rep()
                    if add_frame_opinions:
                        text_rep = add_frame_opinion_and_return_text_rep(text_rep, lemma, frame)
                    if frame.characteristics.filter(type=u'ZWROTNOŚĆ', value__value=u'się').exists():
                        text_rep = ' ' + text_rep
                    f.write(lemma.entry + text_rep.replace(':', ': ') + '\n')

    # Rename only after the file has been closed.
    txt_file_name = file_name + '.txt'
    os.rename(file_name, txt_file_name)
    return txt_file_name
    
def create_copyrights_str(vocabularies_pks, frame_opinions_pks,
                            lemma_statuses_pks, owners_pks, poss_pks,
                            add_frame_opinions):
    """Fill the placeholders of TEXT_VOCABULARY_CLAUSE and return the text.

    For each ``*_pks`` argument the matching objects are looked up; when the
    lookup finds nothing, a fallback set is listed instead (all vocabularies,
    all opinion values, all lemma statuses, every user owning a non-old
    lemma, every POS except the one tagged 'unk').  The date placeholder is
    filled with the current date using an English month name, and
    ``<opinions added>`` becomes 'True' or 'False'.
    """
    now = datetime.datetime.now()
    month_name = EN_MONTHS[now.month - 1]

    chosen_vocabs = Vocabulary.objects.filter(pk__in=vocabularies_pks).order_by('name')
    vocabs = chosen_vocabs if chosen_vocabs.exists() else Vocabulary.objects.order_by('name')

    chosen_opinions = Frame_Opinion_Value.objects.filter(pk__in=frame_opinions_pks).order_by('priority')
    opinions = (chosen_opinions if chosen_opinions.exists()
                else Frame_Opinion_Value.objects.order_by('priority'))

    chosen_statuses = Lemma_Status.objects.filter(pk__in=lemma_statuses_pks).order_by('priority')
    statuses = (chosen_statuses if chosen_statuses.exists()
                else Lemma_Status.objects.order_by('priority'))

    chosen_owners = User.objects.filter(pk__in=owners_pks).order_by('username')
    users = (chosen_owners if chosen_owners.exists()
             else User.objects.filter(lemmas__old=False).distinct().order_by('username'))

    chosen_poss = POS.objects.filter(pk__in=poss_pks).order_by('priority')
    parts_of_speech = (chosen_poss if chosen_poss.exists()
                       else POS.objects.exclude(tag=u'unk').order_by('priority'))

    if add_frame_opinions:
        opinions_added = 'True'
    else:
        opinions_added = 'False'

    # Applied in this order, one placeholder at a time.
    substitutions = [
        ('<date>', now.strftime(month_name + ' %d, %Y')),
        ('<vocabularies>', ', '.join([v.name for v in vocabs])),
        ('<opinions>', ', '.join([o.value for o in opinions])),
        ('<statuses>', ', '.join([s.status for s in statuses])),
        ('<owners>', ', '.join([u.username for u in users])),
        ('<part of speech>', ', '.join([p.name for p in parts_of_speech])),
        ('<opinions added>', opinions_added),
    ]
    text = TEXT_VOCABULARY_CLAUSE
    for placeholder, replacement in substitutions:
        text = text.replace(placeholder, replacement)
    return text

def create_tex_walenty(lemmas, form_dict):
    """Render ``lemmas`` through the 'tex/slowal.tex' template into a file.

    The rendered template is HTML-unescaped, converted with ``smart_str``
    and written to a temporary file in a fresh temporary directory; the
    file is then renamed to carry a ``.tex`` suffix.  One
    ``Q(value__pk=...)`` object is built per entry of
    ``form_dict['frame_opinions']`` (none when that value is empty).

    Returns the path of the renamed ``.tex`` file.
    """
    frame_chars_dict = sorted_frame_char_values_dict()
    q_frame_opinions = [Q(value__pk=pk)
                        for pk in (form_dict['frame_opinions'] or [])]
    tmp_folder = mkdtemp()
    # NOTE(review): os.chdir changes the working directory of the whole
    # process; kept as in the original -- confirm whether it is needed.
    os.chdir(tmp_folder)
    tmpfile, tmpfilename = mkstemp(dir=tmp_folder)
    try:
        h = HTMLParser.HTMLParser()
        # Pass the TeX template through Django templating engine and into the temp file
        rendered = render_to_string('tex/slowal.tex', {'lemmas': lemmas,
                                                       'q_frame_opinions': q_frame_opinions,
                                                       'sort_reflex_vals': frame_chars_dict['sorted_reflex_vals'],
                                                       'sort_aspect_vals': frame_chars_dict['sorted_aspect_vals'],
                                                       'sort_neg_vals'   : frame_chars_dict['sorted_neg_vals'],
                                                       'sort_pred_vals'  : frame_chars_dict['sorted_pred_vals'],
                                                       'download_dict'   : form_dict})
        os.write(tmpfile, smart_str(h.unescape(rendered)))
    finally:
        # Close the descriptor even when rendering or writing fails, so it
        # is not leaked.
        os.close(tmpfile)
    file_name = tmpfilename + '.tex'
    os.rename(tmpfilename, file_name)
    return file_name

def add_frame_opinion_and_return_text_rep(text_rep, lemma, frame):
    """Insert the frame's opinion as the second ':'-separated field.

    The opinion is the ``value.value`` of the first of ``lemma``'s frame
    opinions whose frame has the same pk as ``frame``.  When it cannot be
    obtained, 'unk' is inserted instead.

    Returns ``text_rep`` with the opinion placed after its first field; if
    ``text_rep`` contains no ':' the opinion is appended as a new field.
    """
    try:
        frame_op = lemma.frame_opinions.filter(frame__pk=frame.pk)[0].value.value
    except Exception:
        # Still best-effort, but no longer traps SystemExit or
        # KeyboardInterrupt as the former bare ``except:`` did.
        frame_op = 'unk'
    frame_form_list = text_rep.split(':')
    frame_form_list.insert(1, frame_op)
    return ':'.join(frame_form_list)

def download_vocabulary(request, file_name):
    """Send a previously generated export file and delete it afterwards.

    ``file_name`` is the absolute path of the export without its leading
    '/'.  The response offers the content as an attachment named
    ``walenty_<YYYYMMDD>`` plus the extension of the export.  After the file
    has been read, the file and its directory are removed.

    Security: ``file_name`` comes from the request, and the file it names is
    read and then deleted.  The path is therefore only accepted when it has
    one of the export extensions and lies in a directory located directly
    under the system temporary directory, which is where ``mkdtemp()`` /
    ``mkstemp(dir=...)`` place the exports.

    Raises ``Http404`` for any other path.
    """
    from tempfile import gettempdir
    from django.http import Http404

    allowed_extensions = ('.txt', '.tex', '.xml')

    # realpath() collapses '..' segments and symlinks before the check.
    fullpath = os.path.realpath('/' + file_name)
    export_dir = os.path.split(fullpath)[0]
    extension = os.path.splitext(fullpath)[1]
    tmp_root = os.path.realpath(gettempdir())
    if os.path.dirname(export_dir) != tmp_root or extension not in allowed_extensions:
        raise Http404

    download_file_name = '%s_%s' % ('walenty', datetime.datetime.now().strftime('%Y%m%d'))
    with open(fullpath, "rb") as f:
        data = f.read()
    response = HttpResponse(data, mimetype='text/txt')
    response['Content-Disposition'] = 'attachment; filename=%s%s' % (download_file_name, extension)
    os.remove(fullpath)
    os.rmdir(export_dir)
    return response

@render('vocab_perm_manage_form.html')
@ajax(method='get', encode_result=False)
def vocab_perm_manage_form(request, vocabulary_name):
    """Build the permission form for the vocabulary called ``vocabulary_name``.

    The form is created from the vocabulary's ``editors`` and ``viewers``
    and returned as ``{'form': form}``.
    """
    vocabulary = Vocabulary.objects.get(name=vocabulary_name)
    members = {'editors': vocabulary.editors,
               'viewers': vocabulary.viewers}
    return {'form': ManageVocabPermForm(**members)}

@ajax(method='post')
def vocab_perm_manage_form_submit(request, form_data):
    """Replace a vocabulary's editors and viewers with the submitted users.

    ``form_data`` is a sequence of ``{'name': ..., 'value': ...}`` items; the
    fields read are ``vocabulary_name``, ``editors`` and ``viewers`` (the
    latter two hold user pks).  Every editor is also added as a viewer.

    All users are resolved before the existing permissions are cleared, so a
    missing field or an unknown pk fails before anything is modified.
    Previously such a failure left the vocabulary with its permissions
    already wiped.

    Returns an empty dict.
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)
    vocabulary = Vocabulary.objects.get(name=form_dict['vocabulary_name'])

    # Look everything up first; nothing has been changed yet if this raises.
    new_editors = [User.objects.get(pk=user_pk) for user_pk in form_dict['editors']]
    new_viewers = [User.objects.get(pk=user_pk) for user_pk in form_dict['viewers']]

    vocabulary.editors.clear()
    vocabulary.viewers.clear()
    for selected_user in new_editors:
        vocabulary.editors.add(selected_user)
        vocabulary.viewers.add(selected_user)
    for selected_user in new_viewers:
        vocabulary.viewers.add(selected_user)
    return {}
  
@render('vocabulary_stats.html')
@ajax(method='get', encode_result=False)
def get_vocabulary_stats(request, vocabulary_name):
    """Build the lemma/schema count table for one vocabulary or for all.

    When ``vocabulary_name`` is empty the non-old lemmas of every vocabulary
    are counted; otherwise only those of the named vocabulary.

    Returns ``{'vocabulary_stats': rows}`` where ``rows`` holds the label
    row, a row for all statuses ('wszystkie'), and one row per lemma status
    in priority order.
    """
    if vocabulary_name:
        voc = Vocabulary.objects.get(name=vocabulary_name)
        lemmas = voc.lemmas.filter(old=False)
    else:
        lemmas = Lemma.objects.filter(old=False)
    lemma_statuses = Lemma_Status.objects.order_by('priority')
    poss = POS.objects.exclude(tag=u'unk').order_by('priority')

    vocabulary_stats_tab_lines = [create_top_labels(poss),
                                  create_status_stats_line(lemmas, poss, 'wszystkie')]
    # The former ``stats_by_pos`` list was filled here but never read, so it
    # has been dropped.
    for lemma_status in lemma_statuses:
        stat_lemmas = lemmas.filter(status__status=lemma_status.status)
        vocabulary_stats_tab_lines.append(
            create_status_stats_line(stat_lemmas, poss, lemma_status.status))
    return {'vocabulary_stats': vocabulary_stats_tab_lines}
  
def create_top_labels(poss):
    """Return the header row: two fixed labels followed by each POS name."""
    fixed_labels = ['Hasła / Schematy', 'wszystkie']
    return fixed_labels + [part_of_speech.name for part_of_speech in poss]

def create_status_stats_line(lemmas, poss, status):
    """Return one table row: ``status``, the overall counts, then per-POS counts.

    Each counts entry is a dict with the number of lemmas ('lemmas') and the
    number of their schemas ('schemas'); the per-POS entries follow the
    order of ``poss``.
    """
    def tally(queryset):
        # Lemma count first, schema count second.
        return {'lemmas': queryset.count(),
                'schemas': count_schemas(queryset)}

    row = [status, tally(lemmas)]
    for part_of_speech in poss:
        row.append(tally(lemmas.filter(entry_obj__pos=part_of_speech)))
    return row

def count_schemas(lemmas):
    """Return the total number of frames over ``lemmas``.

    Returns 0 when the aggregate yields no (or a zero) sum.
    """
    with_frame_counts = lemmas.annotate(num_frames=Count('frames'))
    totals = with_frame_counts.aggregate(Sum('num_frames'))
    return totals['num_frames__sum'] or 0

@render('other_stats.html')
@ajax(method='get', encode_result=False)
def get_other_stats(request):
    """Return all stored WalentyStat rows ordered by their label."""
    stats_by_label = WalentyStat.objects.order_by('label')
    return stats_by_label

def get_stats(statuses, pos):
    """Collect dictionary statistics for one part of speech.

    Only non-old lemmas whose entry has the POS tag ``pos`` and whose status
    is in ``statuses`` are considered.  Each processed lemma is echoed to
    stdout.

    Returns a ``Counter`` holding the fixed keys initialised below plus the
    keys produced by ``get_sub_entries_dict`` for every lemma.

    Changes from the previous version: the inner loop variable no longer
    reuses the name of the ``pos`` parameter; a frame with no positions
    (maximum of ``None``) is skipped explicitly instead of relying on
    ``None > 1``; the sub-entry counts are merged in place, so keys whose
    count is 0 stay in the result instead of being dropped by ``Counter +``.
    """
    stats_dict = Counter({u'phrases': 0,
                          u'poss': 0,
                          u'lemmas': 0,
                          u'sub_lemmas': 0,
                          u'schemata': 0,
                          u'cer_schemata': 0,
                          u'uncer_schemata': 0,
                          u'bad_schemata': 0,
                          u'arch_schemata': 0,
                          u'col_schemata': 0,
                          u'vul_schemata': 0,
                          u'coor_schemata': 0,
                          u'lex_schemata': 0,
                          u'coor_lemmas': 0,
                          u'lex_lemmas': 0})

    # (statistics key, frame opinion value) pairs counted for every lemma.
    opinion_counters = ((u'cer_schemata', u'pewny'),
                        (u'uncer_schemata', u'wątpliwy'),
                        (u'bad_schemata', u'zły'),
                        (u'arch_schemata', u'archaiczny'),
                        (u'col_schemata', u'potoczny'),
                        (u'vul_schemata', u'wulgarny'))

    lemmas = Lemma.objects.filter(old=False,
                                  entry_obj__pos__tag=pos).filter(status__in=statuses).distinct()
    stats_dict[u'lemmas'] = lemmas.count()
    for lemma in lemmas.order_by('entry'):
        # Same output as the former ``print lemma`` statement.
        print(lemma)
        for stat_key, opinion_value in opinion_counters:
            stats_dict[stat_key] += lemma.frame_opinions.filter(value__value=opinion_value).count()
        stats_dict[u'schemata'] += lemma.frames.count()

        # Counter.update() adds the counts to the existing ones.
        stats_dict.update(get_sub_entries_dict(lemma))

        has_phraseology = False
        has_coordination = False
        for frame in lemma.frames.all():
            stats_dict[u'poss'] += frame.positions.count()
            max_args = frame.positions.annotate(num_args=Count('arguments')).aggregate(Max('num_args'))['num_args__max']
            # A position holding more than one argument is counted as
            # coordination.
            if max_args is not None and max_args > 1:
                stats_dict[u'coor_schemata'] += 1
                has_coordination = True
            for position in frame.positions.all():
                stats_dict[u'phrases'] += position.arguments.count()
            if frame.positions.filter(arguments__type__in=LEX_TYPES).exists():
                stats_dict[u'lex_schemata'] += 1
                has_phraseology = True

        if has_phraseology:
            stats_dict[u'lex_lemmas'] += 1
        if has_coordination:
            stats_dict[u'coor_lemmas'] += 1

    return stats_dict

def get_sub_entries_dict(lemma):
    """Count the sub-entries of ``lemma``.

    Every combination of sorted reflex / neg / pred / aspect values for
    which the lemma has at least one frame is one sub-entry.  The result
    maps 'sub_lemmas' to the number of such combinations and, for each of
    them, a label containing the four values to its count.
    """
    counts = {'sub_lemmas': 0}
    char_values = sorted_frame_char_values_dict()
    for reflex in char_values['sorted_reflex_vals']:
        for neg in char_values['sorted_neg_vals']:
            for pred in char_values['sorted_pred_vals']:
                for aspect in char_values['sorted_aspect_vals']:
                    frames = lemma.get_frames_by_char_values(reflex_val=reflex,
                                                             neg_val=neg,
                                                             pred_val=pred,
                                                             aspect_val=aspect)
                    if not frames.exists():
                        continue
                    counts[u'sub_lemmas'] += 1
                    label = u'Liczba podhaseł postaci: (%s,%s,%s,%s)' % (reflex.value, neg.value,
                                                                         pred.value, aspect.value)
                    counts[label] = counts.get(label, 0) + 1
    return counts

def write_stats(stats_path, stats):
    """Write the statistics in ``stats`` to ``stats_path`` as UTF-8 text.

    ``stats`` is indexed by the keys 'lemmas', 'poss', 'phrases',
    'sub_lemmas', 'schemata', 'cer_schemata', 'uncer_schemata',
    'bad_schemata', 'arch_schemata', 'col_schemata', 'vul_schemata',
    'coor_schemata', 'lex_schemata', 'coor_lemmas' and 'lex_lemmas'; the
    per-sub-entry lines are written by ``write_subschemas_stats``.

    The file is opened with a context manager, so a failure to open it now
    surfaces as the real error.  Previously ``outfile.close()`` in
    ``finally`` raised a NameError that masked it.
    """
    with codecs.open(stats_path, 'wt', 'utf-8') as outfile:
        outfile.write(u'Łączna liczba haseł:\t%d\n\n' % stats['lemmas'])
        outfile.write(u'Łączna liczba pozycji w schematach:\t%d\n' % stats['poss'])
        outfile.write(u'Łączna liczba realizacji w schematach:\t%d\n\n' % stats['phrases'])

        outfile.write(u'Łączna liczba podhaseł:\t%d\n' % stats['sub_lemmas'])
        outfile.write(u'Liczba podhaseł postaci (ZWROTNOŚĆ, NEGATYWNOŚĆ, PREDYKATYWNOŚĆ, ASPEKT)\n')
        write_subschemas_stats(outfile, stats)

        outfile.write(u'Łączna liczba schematów:\t%d\n' % stats['schemata'])
        outfile.write(u'Liczba schematów pewnych:\t%d\n' % stats['cer_schemata'])
        outfile.write(u'Liczba schematów wątpliwych:\t%d\n' % stats['uncer_schemata'])
        outfile.write(u'Liczba schematów złych:\t%d\n' % stats['bad_schemata'])
        outfile.write(u'Liczba schematów archaicznych:\t%d\n' % stats['arch_schemata'])
        outfile.write(u'Liczba schematów potocznych:\t%d\n' % stats['col_schemata'])
        outfile.write(u'Liczba schematów wulgarnych:\t%d\n\n' % stats['vul_schemata'])

        outfile.write(u'Łączna liczba schematów z koordynacją:\t%d\n' % stats['coor_schemata'])
        outfile.write(u'Łączna liczba schematów zleksykalizowanych:\t%d\n\n' % stats['lex_schemata'])

        outfile.write(u'Łączna liczba haseł zawierających pozycje z koordynacją:\t%d\n' % stats['coor_lemmas'])
        outfile.write(u'Łączna liczba haseł zawierających schematy zleksykalizowane:\t%d\n\n' % stats['lex_lemmas'])
        
def write_subschemas_stats(stats_file, stats):
    """Write the per-sub-entry counts from ``stats`` to ``stats_file``.

    Only items whose key starts with the sub-entry label prefix are used;
    each becomes a '<key>:<TAB><count>' line, the lines are written in
    sorted order, and a blank line follows.
    """
    prefix = u'Liczba podhaseł postaci:'
    lines = sorted('%s:\t%d\n' % (label, count)
                   for label, count in stats.iteritems()
                   if label.startswith(prefix))
    for line in lines:
        stats_file.write(line)
    stats_file.write('\n')
    
def update_walenty_stats(stats):
    """Replace all stored WalentyStat rows with values taken from ``stats``.

    Every existing row is deleted, then one row is saved per label below,
    holding ``str()`` of the corresponding ``stats`` entry.
    """
    labelled_keys = (
        (u'Łączna liczba haseł', 'lemmas'),
        (u'Łączna liczba pozycji w schematach', 'poss'),
        (u'Łączna liczba realizacji w schematach', 'phrases'),
        (u'Łączna liczba schematów', 'schemata'),
        (u'Łączna liczba schematów z koordynacją', 'coor_schemata'),
        (u'Łączna liczba schematów zleksykalizowanych', 'lex_schemata'),
    )
    WalentyStat.objects.all().delete()
    for label, stats_key in labelled_keys:
        WalentyStat(label=label, value=str(stats[stats_key])).save()