export_as_xml.py
4.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
# -*- coding:utf-8 -*-
import sys
from optparse import make_option
from lxml import etree
from django.core.management.base import BaseCommand
from webapp.models import Entry
from normalization import normalize
class Command(BaseCommand):
    """Management command that exports the entry database to an XML file.

    Positional arguments are forwarded to ``write_xml`` as the list of
    sources; the options select the output path, whether protected data is
    included, and whether normalized forms are generated.
    """

    args = '<source source ...>'
    help = 'Get database as xml file.'

    option_list = BaseCommand.option_list + (
        # Destination file; handle() refuses to run when it is empty.
        make_option('-o', '--output',
                    action='store',
                    dest='output',
                    type='str',
                    default='',
                    help='output path'),
        # Passed through to write_xml() as its ``authorized`` argument.
        make_option('-a', '--authorized',
                    action='store_true',
                    dest='authorized',
                    default=False,
                    help='add protected data'),
        # Passed through to write_xml() as its ``normalize_descrs`` argument.
        make_option('-n', '--normalize',
                    action='store_true',
                    dest='normalize',
                    default=False,
                    help='use normalization'),
    )

    def handle(self, *args, **options):
        """Run the export, or complain on stderr when no output path is set.

        :param args: positional command-line arguments, used as the sources.
        :param options: parsed option values; reads ``output``,
            ``authorized`` and ``normalize``.
        """
        output_path = options['output']
        if output_path:
            write_xml(list(args),
                      output_path,
                      options['authorized'],
                      options['normalize'])
        else:
            print >> sys.stderr, 'Output must be selected!'
def write_xml(sources, outpath, authorized, normalize_descrs):
    """Build the ``<entries>`` document and write it to ``outpath``.

    :param sources: list of sources handed on to ``write_entries``; an
        empty list means no source filtering there.
    :param outpath: path of the file to (over)write.
    :param authorized: forwarded to ``write_entries``.
    :param normalize_descrs: forwarded to ``write_entries``.

    The file is written even when ``write_entries`` raises, so a partial
    export is kept; the exception still propagates afterwards.
    """
    # Create the root before entering ``try`` so that the ``finally`` clause
    # can always reference it, and a failure here is not masked by a
    # NameError raised while trying to serialize an unbound name.
    root = etree.Element('entries')
    try:
        write_entries(sources, root, authorized, normalize_descrs)
    finally:
        # ``tostring`` with an explicit encoding returns an encoded byte
        # string, so the file is opened in binary mode.
        with open(outpath, 'wb') as output_file:
            output_file.write(
                etree.tostring(root, pretty_print=True, encoding='UTF-8'))
def write_entries(sources, root, authorized, normalize_descrs):
    """Append a ``<meaning>`` node to ``root`` for every exportable meaning.

    Entries are visited in ``name`` order and their meanings in ``id`` order.
    A meaning is exported only when more than one expression remains after
    filtering. Each visited entry and the final totals are printed to stdout.

    :param sources: list of source keys; when non-empty, only expressions
        whose link source key is in the list are kept.
    :param root: XML element that receives the ``<meaning>`` children.
    :param authorized: when false, only entries with ``protected=False`` are
        exported; the flag is also passed to ``meaning.valid_expressions``.
    :param normalize_descrs: forwarded to ``write_meaning``.
    """
    meanings_count = 0
    expressions_count = 0

    entries = Entry.objects
    if not authorized:
        entries = entries.filter(protected=False)

    for entry in entries.order_by('name'):
        # Progress output; the single-argument parenthesized form prints the
        # same text under the print statement and the print function.
        print(entry)
        for meaning in entry.meanings.order_by('id'):
            expressions = meaning.valid_expressions(authorized)
            if sources:
                expressions = expressions.filter(
                    link__source__key__in=sources).distinct()

            # Count once and reuse the value instead of issuing the same
            # COUNT query a second time for the running total.
            found = expressions.count()
            if found <= 1:
                continue

            write_meaning(meaning,
                          expressions.order_by('-is_catchword', 'text'),
                          root,
                          normalize_descrs)
            meanings_count += 1
            expressions_count += found

    # The space after the tab reproduces the separator that the original
    # two-argument print statement inserted between its arguments.
    print('Meanings:\t %s' % meanings_count)
    print('Expressions:\t %s' % expressions_count)
def _blank_head_attributes(desc):
    """Set the four head-segment attributes of ``desc`` to empty strings."""
    desc.attrib['head_orth'] = ''
    desc.attrib['head_base'] = ''
    desc.attrib['ctag'] = ''
    desc.attrib['msd'] = ''


def write_meaning(meaning, expressions, root, normalize_descrs):
    """Append one ``<meaning>`` node, with its ``<desc>`` children, to ``root``.

    The node's ``categories`` attribute is the ``;``-joined names of the
    meaning's domains, ordered by name. One ``<desc>`` is written per distinct
    ``orth_text`` among ``expressions``. When ``normalize_descrs`` is true,
    forms produced by ``normalize.generate_forms`` are added as further
    ``<desc>`` nodes with source ``AUTO``, skipping any text already written.

    :param meaning: meaning object providing ``domains``.
    :param expressions: iterable of expression objects; iterated twice when
        ``normalize_descrs`` is true.
    :param root: XML element that receives the new ``<meaning>`` child.
    :param normalize_descrs: whether to add generated normalized forms.
    """
    meaning_node = etree.SubElement(root, 'meaning')
    categories = [domain.name for domain in meaning.domains.order_by('name')]
    meaning_node.attrib['categories'] = ';'.join(categories)

    # Texts already emitted for this meaning; used to skip duplicates.
    seen_orths = set()

    for expr in expressions:
        if expr.orth_text in seen_orths:
            continue
        seen_orths.add(expr.orth_text)

        desc = etree.SubElement(meaning_node, 'desc')
        desc.attrib['catchword'] = 'true' if expr.is_catchword else 'false'
        desc.attrib['entrylink'] = expr.link.exact_link
        desc.attrib['source'] = expr.link.source.key
        desc.attrib['base'] = expr.base_text
        try:
            # Fetch the head segment once instead of once per attribute.
            head = expr.segments.get(is_head=True)
            desc.attrib['head_orth'] = head.orth
            desc.attrib['head_base'] = head.base
            desc.attrib['ctag'] = head.ctag
            desc.attrib['msd'] = head.msd
        except Exception:
            # Best effort: any failure to look up or store the head data
            # leaves all four attributes empty. ``Exception`` (rather than a
            # bare ``except``) lets KeyboardInterrupt/SystemExit through.
            _blank_head_attributes(desc)
        desc.text = expr.orth_text

    if not normalize_descrs:
        return

    for expr in expressions:
        for form in normalize.generate_forms(expr, 'both'):
            base_text, orth_text = (
                normalize.get_normalized_expr_text_in_both_forms(expr, form))
            if orth_text in seen_orths:
                continue
            # Record generated texts too, so the same form produced by
            # several expressions/forms is emitted only once.
            seen_orths.add(orth_text)

            desc = etree.SubElement(meaning_node, 'desc')
            desc.attrib['catchword'] = 'false'
            desc.attrib['entrylink'] = ''
            desc.attrib['source'] = 'AUTO'
            desc.attrib['base'] = base_text
            try:
                head = expr.segments.get(is_head=True)
                head_position = head.position_in_expr
                # orth/base come from the generated form at the head's
                # position; ctag/msd are copied from the stored head segment.
                desc.attrib['head_orth'] = form[head_position]['orth']
                desc.attrib['head_base'] = form[head_position]['base']
                desc.attrib['ctag'] = head.ctag
                desc.attrib['msd'] = head.msd
            except Exception:
                _blank_head_attributes(desc)
            desc.text = orth_text