create_TEI_walenty.py
# -*- coding: utf-8 -*-
import datetime
import os
import tarfile

from django.core.management.base import BaseCommand
from optparse import make_option

from dictionary.models import Lemma, Frame_Opinion_Value, get_statuses
from dictionary.teixml import createteixml, write_phrase_types_expansions_in_TEI
from settings import WALENTY_PATH
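
# Exports Walenty as TEI XML via a Django management command. A sketch of an
# example invocation, assuming a standard manage.py project layout (the command
# name follows this module's name; the positional vocabulary names below are
# placeholders, not names taken from this file):
#
#     python manage.py create_TEI_walenty --min_status=ready \
#         --start_date=2015-06-01 vocab1 vocab2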


class Command(BaseCommand):
    args = '<dict dict ...>'
    help = 'Get Walenty in TEI format.'

    option_list = BaseCommand.option_list + (
        make_option('--min_status',
                    action='store',
                    type='string',
                    dest='min_status_type',
                    default='ready',
                    help='Minimum lemma status.'),
        make_option('--start_date',
                    action='store',
                    type='string',
                    dest='start_date',
                    default='all',
                    help='Status change start date (format: YYYY-MM-DD).'),
    )
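
    # handle() drives the whole export: it selects lemmas by vocabulary, status
    # and (optionally) status-change date, writes them to a TEI XML file, adds
    # the phrase-type expansions file, and packs both into a .tar.gz archive.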
    def handle(self, *args, **options):
        archive = None
        outpath = None
        phrase_types_expand_path = None
        try:
            now = datetime.datetime.now().strftime('%Y%m%d')
            vocab_names = sorted(args)
            filename_base = self.create_filename_base(vocab_names, options, now)
            base_path = os.path.join(WALENTY_PATH, filename_base)
            outpath = base_path + '.xml'
            statuses = get_statuses(options['min_status_type'])

            lemmas = Lemma.objects.filter(old=False)
            if vocab_names:
                lemmas = lemmas.filter(vocabulary__name__in=vocab_names)
            lemmas = lemmas.filter(status__in=statuses)
            if options['start_date'] != 'all':
                lemmas = self.filter_lemmas_by_status_change(lemmas, statuses,
                                                             options['start_date'])
            lemmas = lemmas.order_by('entry_obj__name')
            self.print_statistics(lemmas)

            frame_opinion_values = Frame_Opinion_Value.objects.all()
            createteixml(outpath, lemmas, frame_opinion_values)

            archive = tarfile.open(base_path + '-TEI.tar.gz', 'w:gz')
            phrase_types_expand_path = os.path.join(
                WALENTY_PATH, '%s_%s.xml' % ('phrase_types_expand', now))
            write_phrase_types_expansions_in_TEI(phrase_types_expand_path)
            os.chdir(WALENTY_PATH)
            archive.add(os.path.basename(outpath))
            archive.add(os.path.basename(phrase_types_expand_path))
        finally:
            # Only clean up what was actually created; without these guards an
            # early exception would raise a NameError here and hide its cause.
            if archive is not None:
                archive.close()
            if outpath is not None and os.path.exists(outpath):
                os.remove(outpath)
            if phrase_types_expand_path is not None and os.path.exists(phrase_types_expand_path):
                os.remove(phrase_types_expand_path)
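
    # Builds the output file name from the minimum status, the vocabulary list
    # and the start date, e.g. 'walenty-awaiting-vocab1+vocab2-20150601_20150615'
    # (vocabulary names and dates here are illustrative only); with the default
    # options the result is just 'walenty_<YYYYMMDD>'.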
    def create_filename_base(self, vocab_names, options, now):
        start_date = ''
        if options['start_date'] != 'all':
            start_date = '-' + options['start_date'].replace('-', '')

        vocab_names_str = ''
        if vocab_names:
            vocab_names_str = '-' + '+'.join(vocab_names)

        min_status = ''
        if options['min_status_type'] != 'ready':
            min_status = '-' + options['min_status_type']

        filename_base = 'walenty%s%s%s_%s' % (min_status, vocab_names_str,
                                              start_date, now)
        return filename_base
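
    # Keeps only lemmas whose status history shows they reached the minimum
    # status (assumed to be the first element returned by get_statuses) on or
    # after the given start date.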
    def filter_lemmas_by_status_change(self, lemmas, statuses, start_date_str):
        start_date = self.parse_date(start_date_str)
        filtered_lemmas_pks = []
        for lemma in lemmas:
            if lemma.status_history.filter(status=statuses[0],
                                           date__gte=start_date).exists():
                filtered_lemmas_pks.append(lemma.pk)
        return lemmas.filter(pk__in=filtered_lemmas_pks)
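
    # Parses a YYYY-MM-DD string into a datetime at midnight; int() already
    # copes with leading zeros, so no extra stripping is needed.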
    def parse_date(self, date_str):
        year, month, day = [int(part) for part in date_str.split('-')]
        return datetime.datetime(year, month, day, 0, 0)
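
    # Prints simple export statistics (lemma, frame and argument counts) to
    # standard output before the XML is written.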
    def print_statistics(self, lemmas):
        count = {'frames': 0,
                 'arguments': 0}
        for lemma in lemmas:
            frames = lemma.entry_obj.actual_frames()
            count['frames'] += frames.count()
            for frame in frames.all():
                count['arguments'] += frame.complements.count()

        print (u'Lemmas:\t%d' % lemmas.count())
        print (u'Frames:\t%d' % count['frames'])
        print (u'Arguments:\t%d' % count['arguments'])
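
# The command leaves a single archive in WALENTY_PATH, e.g.
# walenty_<YYYYMMDD>-TEI.tar.gz, containing the Walenty TEI XML file and the
# phrase_types_expand_<YYYYMMDD>.xml expansions file; the intermediate XML
# files themselves are removed once the archive is written.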