# ajax_argument_stats.py

# -*- coding: utf-8 -*-

#Copyright (c) 2012, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""
This module is responsible for server side handling of argument statistics
interface.
"""

import os
import re
import codecs

from settings import PROJECT_PATH
from common.decorators import ajax, AjaxError
from dictionary.models import Lemma, Argument

# All data files used by this module live in <PROJECT_PATH>/data.
_DATA_DIR = os.path.join(PROJECT_PATH, 'data')

# Raw verb list with counts; input of create_list_file().
FULL_LIST_PATH = os.path.join(_DATA_DIR, 'susp-1-verbs-300M-counts.txt')
# Path prefix of the generated ranked lists; the bucket size is appended to it.
PREP_LIST_PATH = os.path.join(_DATA_DIR, 'checked_freq_ranked_')
TEST_PATH = os.path.join(_DATA_DIR, 'test_freq_ranked.txt')

# Matches one line of the raw list: an optional '**' marker, the lemma and a
# number.  The pattern is ASCII-only, so a plain raw string behaves like the
# former ur'' literal on Python 2 while also being valid syntax on Python 3.
_FREQ_LINE_PATTERN = re.compile(r'^(\*\*)?[\s]*([^\s]+)[\s]*([\d]+).*$')


def _iter_checked_lemmas():
    """Yield, in file order, the lemmas of FULL_LIST_PATH that are to be ranked.

    A line qualifies when it matches _FREQ_LINE_PATTERN, does not carry the
    leading '**' marker, and at least one Lemma with that entry, old=False and
    status u'sprawdzone' exists in the database.
    """
    with codecs.open(FULL_LIST_PATH, 'rt', 'utf-8') as infile:
        for line in infile:
            m = _FREQ_LINE_PATTERN.match(line.strip())
            if not m or m.group(1):
                continue
            lemma_str = m.group(2).strip()
            if Lemma.objects.filter(entry=lemma_str,
                                    old=False,
                                    status__status=u'sprawdzone').exists():
                yield lemma_str


def create_list_file(path, bucket_size):
    """Write a ranked verb list to ``path``.

    Every qualifying lemma of FULL_LIST_PATH (see _iter_checked_lemmas) is
    written as one ``<lemma>\\t<rank>`` line, in the order of the source file.
    Ranks start at 1 and advance after ``bucket_size`` verbs.  The
    ``count % bucket_size`` leftover verbs are not given a rank of their own:
    at a bucket boundary up to ``to_add`` of them are appended to the bucket
    that has just been filled, before the rank advances.

    Arguments:
        path -- destination file; created or truncated, written as UTF-8.
        bucket_size -- number of verbs per rank; callers are expected to pass
            an integer >= 1 (0 raises ZeroDivisionError).
    """
    # Collect the qualifying lemmas once.  The list is needed both for the
    # total count and for writing, and each entry costs a database query.
    lemmas = list(_iter_checked_lemmas())
    lemmas_count = len(lemmas)

    rest = lemmas_count % bucket_size
    buckets = lemmas_count // bucket_size
    if buckets:
        to_add = rest // buckets + 1
    else:
        # Fewer verbs than one full bucket: no boundary is ever reached, so
        # the value is unused.  It only must not divide by zero.
        to_add = 1
    to_add_count = to_add
    added_rest = False
    verbs_count = 0
    rank = 1

    # The context manager closes the output even when writing fails.
    with codecs.open(path, 'wt', 'utf-8') as outfile:
        for index, lemma_str in enumerate(lemmas):
            verbs_count += 1
            # True for the verb that follows a completely filled bucket.
            # Written as (n - 1) % size == 0 rather than n % size == 1 so
            # that it also fires for bucket_size == 1.
            if index > 0 and (verbs_count - 1) % bucket_size == 0:
                if rest > 0 and not added_rest:
                    # Absorb one leftover verb into the bucket just filled;
                    # it does not count towards the next bucket.
                    rest -= 1
                    to_add_count -= 1
                    verbs_count -= 1
                    if to_add_count == 0:
                        added_rest = True
                        to_add_count = to_add
                else:
                    added_rest = False
                    rank += 1
            outfile.write(lemma_str + '\t' + str(rank) + '\n')

@ajax(method='post')
def gen_list(request, form_data):
    """Validate the submitted bucket size and (re)generate its ranked list file.

    ``form_data`` is a sequence of {'name': ..., 'value': ...} items.  The
    'bucket_size' value must be a non-empty digit string in the range 1-100,
    otherwise AjaxError is raised.  The list is written to PREP_LIST_PATH with
    the submitted bucket size string appended.  Returns an empty dict.
    """
    fields = {}
    for item in form_data:
        fields[item['name']] = item['value']

    raw_size = fields['bucket_size']
    if not raw_size:
        raise AjaxError('bucket size not selected')
    if not raw_size.isdigit():
        raise AjaxError('bucket size not digit')

    size = int(raw_size)
    if not 1 <= size <= 100:
        raise AjaxError('wrong bucket size range')

    target_path = PREP_LIST_PATH + raw_size
    create_list_file(target_path, size)
    return {}

@ajax(method='post')
def prepare_graph_data(request, form_data):
    """Collect per-bucket argument statistics for the graph.

    ``form_data`` is a sequence of {'name': ..., 'value': ...} items providing
    'arguments' (iterated as Argument primary keys), 'arg_types' (an Argument
    type; all arguments of that type are added) and 'bucket_size' (digit
    string, >= 1).  AjaxError is raised when neither arguments nor an argument
    type is given or when the bucket size is invalid.

    The ranked list for the bucket size is generated with create_list_file()
    if its file does not exist yet, then read line by line.  For every
    selected argument the result holds:
        'text_rep' -- text representation of the argument,
        'freq'     -- per rank, the summed count of Argument rows with that
                      text_rep linked to the lemmas of the rank,
        'own'      -- per rank, the number of lemmas with a non-zero count,
        'sum_freq' -- sum of 'freq' over all ranks.

    Returns {'graph_data': <entries sorted by 'sum_freq', descending>,
             'xmax': <number of checked lemmas // bucket_size>}.
    """
    form_dict = dict((x['name'], x['value']) for x in form_data)
    if not form_dict['arguments'] and not form_dict['arg_types']:
        raise AjaxError('select arguments')
    if not form_dict['bucket_size']:
        raise AjaxError('bucket size not selected')
    if not form_dict['bucket_size'].isdigit():
        raise AjaxError('bucket size not digit')
    bucket_size = int(form_dict['bucket_size'])
    if bucket_size < 1:
        raise AjaxError('wrong bucket size range')

    lemmas_count = Lemma.objects.filter(old=False,
                                        status__status=u'sprawdzone').count()
    # Floor division keeps xmax an integer on Python 2 and Python 3 alike.
    buckets = lemmas_count // bucket_size

    # Explicitly selected arguments first, then every argument of the type.
    text_reps = [Argument.objects.get(pk=arg_id).text_rep
                 for arg_id in form_dict['arguments']]
    if form_dict['arg_types']:
        for arg_obj in Argument.objects.filter(type=form_dict['arg_types']):
            text_reps.append(arg_obj.text_rep)
    args_data = [{'text_rep' : text_rep,
                  'freq'     : [],
                  'own'      : [],
                  'sum_freq' : 0} for text_rep in text_reps]

    ranked_list_path = PREP_LIST_PATH+form_dict['bucket_size']
    if not os.path.isfile(ranked_list_path):
        create_list_file(ranked_list_path, bucket_size)

    # Compiled once instead of once per line.  The pattern is ASCII-only, so
    # the plain raw string matches like the former ur'' literal on Python 2
    # and is also valid syntax on Python 3.
    rank_line_pattern = re.compile(r'^([^\s]+)[\s]*([\d]+).*$')

    with codecs.open(ranked_list_path, 'rt', 'utf-8') as infile:
        act_rank = 0
        for line_no, line in enumerate(infile):
            m = rank_line_pattern.match(line.strip())
            if not m:
                continue
            lemma_str = m.group(1).strip()
            rank = int(m.group(2).strip())
            # A new slot is opened for the very first line of the file and
            # whenever the rank moves past the last one seen.
            new_rank = line_no == 0
            if rank-1 > act_rank:
                act_rank = rank - 1
                new_rank = True
            if not args_data:
                # Nothing to count; avoid a pointless lemma lookup.
                continue
            # The lemma does not depend on the argument, so it is fetched
            # once per line rather than once per argument.
            lemma = Lemma.objects.get(entry=lemma_str, old=False)
            for arg in args_data:
                if new_rank:
                    arg['freq'].append(0)
                    arg['own'].append(0)
                lemma_args = Argument.objects.filter(text_rep=arg['text_rep'],
                                                     positions__frames__lemmas__pk=lemma.pk).count()
                arg['freq'][-1] += lemma_args
                arg['sum_freq'] += lemma_args
                if lemma_args != 0:
                    arg['own'][-1] += 1

    args_data.sort(key=lambda x:x['sum_freq'], reverse=True)

    return {'graph_data': args_data,
            'xmax'      : buckets}