frames_freq.py
4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#-*- coding:utf-8 -*-
# author: B.Niton
import codecs
from django.db.models import Q
import operator
from django.core.management.base import BaseCommand
from optparse import make_option
from dictionary.models import *
# Output file used when the --out option is not given.
DEFAULT_SAVE_PATH = 'tmp/frames_freq.txt'
class Command(BaseCommand):
    """Management command that writes a frames frequency list to a file.

    Only lemmas with ``old=False`` whose status priority is at least the
    priority of the status chosen with ``--status`` are considered. The verbs
    come either from a file (``--filepath``) or from the dictionaries named in
    ``--dicts``; ``--filepath`` wins when both are given.
    """
    help = 'Gets frames frequency list.'

    option_list = BaseCommand.option_list + (
        make_option('--status',
                    action='store',
                    default='checked',
                    help='Minimal lemma status. Values: all, progress, done, checked.'),
        make_option('--filepath',
                    action='store',
                    default=None,
                    help='Path to file with verbs list.'),
        make_option('--dicts',
                    action='store',
                    default='all',
                    help='List of dicts to select verbs from.'),
        make_option('--out',
                    action='store',
                    default=DEFAULT_SAVE_PATH,
                    help='Path to output file.'),
    )

    def handle(self, *args, **options):
        """Select the lemmas matching the options and write their frame
        frequencies to ``options['out']``.

        Prints a message and returns without writing anything when the
        ``--status`` value is not one of the supported names or when no verb
        source is selected.
        """
        # ``reduce`` is a builtin only on Python 2; functools offers it on
        # both Python 2.6+ and Python 3.
        from functools import reduce

        print(options['status'])

        # Command-line status names mapped to the status values stored in
        # Lemma_Status.status.
        status_names = {
            'all': u'do obróbki',
            'progress': u'w obróbce',
            'done': u'gotowe',
            'checked': u'sprawdzone',
        }
        min_status = status_names.get(options['status'])
        if min_status is None:
            print('Select proper status name.')
            # Without a valid status the lookup below cannot succeed, so stop
            # here instead of failing on Lemma_Status.objects.get(status=None).
            return

        sel_min_status = Lemma_Status.objects.get(status=min_status)
        sel_statuses = [lemma_status
                        for lemma_status in Lemma_Status.objects.all()
                        if lemma_status.priority >= sel_min_status.priority]

        # sel_statuses always contains sel_min_status itself, so the list
        # handed to reduce() is never empty.
        status_query = reduce(operator.or_,
                              [Q(status=status) for status in sel_statuses])
        lemmas = Lemma.objects.filter(old=False).filter(status_query)

        if options['filepath']:
            get_frames_freq_file(lemmas, options['filepath'], options['out'])
        elif options['dicts']:
            dicts_list = options['dicts'].split()
            get_frames_freq_dicts(lemmas, dicts_list, options['out'])
        else:
            print('No verbs input selected.')
def get_frames_freq_file(lemmas, verbs_path, out):
    """Gets frames frequency list for verbs in selected file.

    ``lemmas`` must support ``.get(entry=...)`` (the caller in this file
    passes a Lemma queryset). ``verbs_path`` is read as UTF-8 text with one
    lemma entry per line, and the resulting list is written to ``out``.
    Entries for which no lemma is found are skipped.
    """
    lemmas_ls = []
    # codecs.open() always opens the underlying file in binary mode, so the
    # mode must not contain 't': Python 3 rejects the resulting 'rtb'.
    with codecs.open(verbs_path, 'r', 'utf8') as infile:
        for line in infile:
            pos_lemma = line.strip()
            if not pos_lemma:
                # A blank line names no verb; skip the lookup entirely.
                continue
            try:
                lemmas_ls.append(lemmas.get(entry=pos_lemma))
            except Lemma.DoesNotExist:
                # Best effort: verbs missing from the selection are ignored.
                continue
    write_frame_freq(lemmas_ls, out)
def get_frames_freq_dicts(lemmas, dicts_list, out):
    """Gets frames frequency list for verbs in selected dicts.

    ``lemmas`` is narrowed to the lemmas whose vocabulary name is one of
    ``dicts_list``; the special name ``'all'`` additionally selects every
    existing Vocabulary. The result is written to ``out``.
    """
    # Work on a copy so the caller's list is not modified.
    vocab_names = list(dicts_list)
    if 'all' in vocab_names:
        vocab_names.extend(vocab.name for vocab in Vocabulary.objects.all())
    # One ``__in`` lookup selects the same rows as OR-ing a Q object per
    # name, and an empty list yields an empty result instead of an error.
    selected = lemmas.filter(vocabulary__name__in=vocab_names)
    write_frame_freq(selected.all(), out)
def write_frame_freq(lemmas, out):
    """Writes frames frequency list for given lemmas to given file.

    Every frame of every lemma (``lemma.frames.all()``) is reduced to a key
    built from the first and third ``:``-separated fields of its
    ``text_rep``. For each distinct key one line ``<count> <key>`` is written
    to ``out`` as UTF-8, with ``+``, ``:`` and ``;`` in the key followed (and,
    for ``+``, also preceded) by a space. Lines are ordered by descending
    count; keys with equal counts keep the order in which they were first
    seen. The intermediate list and each written entry are also printed.

    Raises IndexError if a ``text_rep`` has fewer than three fields, and
    IOError/OSError if ``out`` cannot be opened for writing.
    """
    frames_freq_ls = []
    # Maps a key to its entry in frames_freq_ls, so counting does not need a
    # linear search through the list for every frame.
    entry_by_rep = {}
    for lemma in lemmas:
        for frame in lemma.frames.all():
            text_rep_frg = frame.text_rep.split(':')
            text_rep = text_rep_frg[0] + ':' + text_rep_frg[2]
            entry = entry_by_rep.get(text_rep)
            if entry is None:
                entry = {'text_rep': text_rep, 'freq': 0}
                entry_by_rep[text_rep] = entry
                frames_freq_ls.append(entry)
            entry['freq'] += 1

    print(frames_freq_ls)
    # list.sort() is stable even with reverse=True, so ties stay in
    # first-seen order.
    frames_freq_ls.sort(key=operator.itemgetter('freq'), reverse=True)
    print(frames_freq_ls)

    # The file is opened only after counting succeeded, and ``with`` makes
    # sure a failing open() surfaces as the real I/O error. codecs.open()
    # forces binary mode, so the mode must not contain 't'.
    with codecs.open(out, 'w', 'utf-8') as outfile:
        for frame in frames_freq_ls:
            print(frame)
            formatted_rep = (frame['text_rep'].strip()
                             .replace('+', ' + ')
                             .replace(':', ': ')
                             .replace(';', '; '))
            outfile.write(str(frame['freq']) + ' ' + formatted_rep + '\n')