# frames_freq.py 4.41 KB
#-*- coding:utf-8 -*-
# author: B.Niton

import codecs

import operator

from django.core.management.base import BaseCommand
from optparse import make_option

from dictionary.models import *

# Output file path used as the default value of the --out option.
DEFAULT_SAVE_PATH = 'tmp/frames_freq.txt'

class Command(BaseCommand):
    """Management command producing a frames frequency list.

    Lemmas are restricted to non-old ones whose status priority is at
    least that of the status chosen with --status.  The verbs are then
    taken either from a file (--filepath) or from the given dictionaries
    (--dicts), and the resulting list is written to --out.
    """
    help = 'Gets frames frequency list.'
    option_list = BaseCommand.option_list + (
        make_option('--status',
            action='store',
            default='checked',
            help='Minimal lemma status. Values: all, progress, done, checked.'),
        make_option('--filepath',
            action='store',
            default=None,
            help='Path to file with verbs list.'),
        make_option('--dicts',
            action='store',
            default='all',
            help='List of dicts to select verbs from.'),
        make_option('--out',
            action='store',
            default=DEFAULT_SAVE_PATH,
            help='Path to output file.'),
    )

    # Maps each accepted --status value to the stored name of the lowest
    # lemma status that is still included.
    _MIN_STATUS_NAMES = {
        'all': u'do obróbki',
        'progress': u'w obróbce',
        'done': u'gotowe',
        'checked': u'sprawdzone',
    }

    def handle(self, *args, **options):
        """Select lemmas by status and dispatch to the chosen verb source."""
        print(options['status'])
        min_status = self._MIN_STATUS_NAMES.get(options['status'])
        if min_status is None:
            # Stop here: continuing would look up a status equal to None
            # and fail with an unrelated DoesNotExist error.
            print('Select proper status name.')
            return

        sel_min_status = Lemma_Status.objects.get(status=min_status)
        # The minimal status itself always qualifies, so the list of
        # conditions below is never empty.
        sel_statuses = [lemma_status
                        for lemma_status in Lemma_Status.objects.all()
                        if lemma_status.priority >= sel_min_status.priority]
        q_sel_statuses = [Q(status=status) for status in sel_statuses]
        lemmas = Lemma.objects.filter(old=False).filter(
            reduce(operator.or_, q_sel_statuses))

        if options['filepath']:
            get_frames_freq_file(lemmas, options['filepath'], options['out'])
        elif options['dicts']:
            # Dictionary names are separated by whitespace.
            dicts_list = options['dicts'].split()
            get_frames_freq_dicts(lemmas, dicts_list, options['out'])
        else:
            print('No verbs input selected.')

def get_frames_freq_file(lemmas, verbs_path, out):
    """Gets frames frequency list for lemmas in selected file.

    lemmas -- lemma collection queried with ``get(entry_obj__name=...)``
    verbs_path -- path to a UTF-8 text file, one entry name per line
    out -- output file path, handed over to write_frame_freq

    Lines whose stripped content matches no lemma are skipped.
    """
    lemmas_ls = []
    # Plain 'r' is used because codecs.open appends 'b' on its own when an
    # encoding is given; 'rt' would become the non-portable mode 'rtb'.
    with codecs.open(verbs_path, 'r', 'utf8') as infile:
        for line in infile:
            pos_lemma = line.strip()
            try:
                lemmas_ls.append(lemmas.get(entry_obj__name=pos_lemma))
            except Lemma.DoesNotExist:
                continue
    # The input file is already closed when the output gets written.
    write_frame_freq(lemmas_ls, out)

def get_frames_freq_dicts(lemmas, dicts_list, out):
    """Gets frames frequency list for verbs in selected dicts.

    lemmas -- lemma queryset to narrow down by vocabulary name
    dicts_list -- vocabulary names; the entry 'all' adds the name of
                  every Vocabulary object (the list is not modified)
    out -- output file path, handed over to write_frame_freq

    An empty dicts_list selects no lemmas.
    """
    # Work on a copy so the caller's list does not grow as a side effect.
    vocab_names = list(dicts_list)
    if 'all' in vocab_names:
        vocab_names.extend(vocab.name for vocab in Vocabulary.objects.all())
    # A single "in" lookup matches the same rows as OR-ing one equality
    # condition per name, and also copes with an empty list of names.
    lemmas = lemmas.filter(vocabulary__name__in=vocab_names)
    write_frame_freq(lemmas.all(), out)

def write_frame_freq(lemmas, out):
    """Writes frames frequency list for given lemmas to given file.

    Every frame's ``text_rep`` is cut down to its first and third
    colon-separated fields, and equal shortened forms are counted
    together.  Entries are written most frequent first (equal counts keep
    first-seen order), one per line as "<freq>   <frame>", with spacing
    added around '+' and after ':' and ';'.

    lemmas -- iterable of objects exposing ``frames.all()``, each frame
              having a ``text_rep`` with at least three ':'-separated
              fields (IndexError otherwise)
    out -- path of the UTF-8 output file; it is opened only after
           counting has finished, so a failure while counting does not
           truncate an existing file
    """
    frames_freq_ls = []
    # Shortened text_rep -> its entry in frames_freq_ls, so each frame is
    # counted with a dict lookup instead of a scan of the whole list.
    entries_by_rep = {}
    for lemma in lemmas:
        for frame in lemma.frames.all():
            text_rep_frg = frame.text_rep.split(":")
            text_rep = text_rep_frg[0] + ':' + text_rep_frg[2]
            entry = entries_by_rep.get(text_rep)
            if entry is None:
                entry = {'text_rep': text_rep, 'freq': 0}
                entries_by_rep[text_rep] = entry
                frames_freq_ls.append(entry)
            entry['freq'] += 1

    print(frames_freq_ls)
    # list.sort is stable, also with reverse=True, so ties stay in
    # first-seen order.
    frames_freq_ls.sort(key=operator.itemgetter('freq'), reverse=True)
    print(frames_freq_ls)

    # Plain 'w' is used because codecs.open appends 'b' on its own when an
    # encoding is given; the context manager closes the file on any exit.
    with codecs.open(out, 'w', 'utf-8') as outfile:
        for frame in frames_freq_ls:
            print(frame)
            formatted_rep = (frame['text_rep'].strip()
                             .replace('+', ' + ')
                             .replace(':', ': ')
                             .replace(';', '; '))
            outfile.write(str(frame['freq']) + '   ' + formatted_rep + '\n')