# export_as_xml.py 5.05 KB
# -*- coding:utf-8 -*-

import sys

from optparse import make_option

from lxml import etree

from django.core.management.base import BaseCommand

from webapp.models import Entry
from normalization import normalize


class Command(BaseCommand):
    """Management command that exports the database to an XML file.

    Positional arguments are passed on to ``write_xml`` as the list of
    sources; the flags below select the output path and toggle protected
    data and normalization.
    """

    args = '<source source ...>'
    help = 'Get database as xml file.'

    option_list = BaseCommand.option_list + (
        make_option('-o', '--output',
                    dest='output', action='store', type='str', default='',
                    help='output path'),
        make_option('-a', '--authorized',
                    dest='authorized', action='store_true', default=False,
                    help='add protected data'),
        make_option('-n', '--normalize',
                    dest='normalize', action='store_true', default=False,
                    help='use normalization'),
    )

    def handle(self, *args, **options):
        """Run the export, or complain on stderr when no output path is given."""
        outpath = options['output']

        if outpath:
            write_xml(list(args), outpath, options['authorized'], options['normalize'])
        else:
            # Nothing is exported without a destination file.
            print >> sys.stderr, 'Output must be selected!'


def write_xml(sources, outpath, authorized, normalize_descrs):
    """Build the ``<entries>`` tree and serialize it to *outpath*.

    The file is written even when ``write_entries`` raises, so whatever part
    of the tree was built before the failure is still saved; the exception
    then propagates to the caller.

    :param sources: list of source keys forwarded to ``write_entries``.
    :param outpath: path of the XML file to create (overwritten if present).
    :param authorized: forwarded to ``write_entries``.
    :param normalize_descrs: forwarded to ``write_entries``.
    """
    # Bind the root before entering the try block so the finally clause
    # always has a tree to serialize and can never fail with a NameError.
    root = etree.Element('entries')
    try:
        write_entries(sources, root, authorized, normalize_descrs)
    finally:
        # tostring() with an explicit encoding yields an encoded byte string,
        # so the file is opened in binary mode to write it unmodified.
        with open(outpath, 'wb') as output_file:
            output_file.write(etree.tostring(root, pretty_print=True, encoding='UTF-8'))


def write_entries(sources, root, authorized, normalize_descrs):
    meanings_count = 0
    expressions_count = 0

    entries = Entry.objects

    if not authorized:
        entries = entries.filter(protected=False)

    for entry in entries.order_by('name'):
        print entry
        for meaning in entry.meanings.order_by('id'):
            expressions = meaning.valid_expressions(authorized)
            if sources:
                expressions = expressions.filter(link__source__key__in=sources).distinct()

            if expressions.count() > 1:
                write_meaning(meaning, expressions.order_by('-is_catchword', 'text'), root, normalize_descrs)
                meanings_count += 1
                expressions_count += expressions.count()
    print 'Meanings:\t', str(meanings_count)
    print 'Expressions:\t', str(expressions_count)


def _set_head_attribs(desc, expr, orth_form=None, base_form=None):
    """Fill the head-related attributes of *desc* from the head segment of *expr*.

    Without forms, ``head_orth``/``head_base`` come from the head segment
    itself; with forms, they are taken from *orth_form*/*base_form* at the
    head segment's ``position_in_expr``. Any failure (no head segment, several
    head segments, bad index, value rejected by lxml) leaves all four
    attributes as empty strings.
    """
    try:
        # Single lookup instead of one query per attribute.
        head = expr.segments.get(is_head=True)
        if orth_form is None:
            head_orth, head_base = head.orth, head.base
        else:
            position = head.position_in_expr
            head_orth, head_base = orth_form[position], base_form[position]
        desc.attrib['head_orth'] = head_orth
        desc.attrib['head_base'] = head_base
        desc.attrib['ctag'] = head.ctag
        desc.attrib['msd'] = head.msd
    except Exception:
        # Best effort: keep the export going, but unlike a bare ``except``
        # let KeyboardInterrupt / SystemExit through.
        for name in ('head_orth', 'head_base', 'ctag', 'msd'):
            desc.attrib[name] = ''


def write_meaning(meaning, expressions, root, normalize_descrs):
    """Append one ``<meaning>`` node, with a ``<desc>`` child per distinct text.

    The node's ``categories`` attribute is the ';'-joined list of the
    meaning's domain names ordered by name. Each expression whose
    ``orth_text`` has not been seen yet yields a ``<desc>`` carrying its
    catchword flag, link, source key, base text and head-segment data.

    When *normalize_descrs* is true, the forms produced by
    ``normalize.generate_forms`` are added as extra ``<desc>`` nodes with
    source ``AUTO``, again only for texts not emitted before. Generated texts
    are recorded as seen too, so the same generated form is written once.

    :param meaning: object exposing ``domains`` for the categories attribute.
    :param expressions: iterable of expressions belonging to the meaning.
    :param root: lxml element that receives the ``<meaning>`` node.
    :param normalize_descrs: whether to add automatically generated forms.
    """
    meaning_node = etree.SubElement(root, 'meaning')
    categories = [domain.name for domain in meaning.domains.order_by('name')]
    meaning_node.attrib['categories'] = ';'.join(categories)

    # Texts already written under this meaning; only membership matters.
    seen_orths = set()

    for expr in expressions:
        if expr.orth_text in seen_orths:
            continue
        seen_orths.add(expr.orth_text)

        desc = etree.SubElement(meaning_node, 'desc')
        desc.attrib['catchword'] = 'true' if expr.is_catchword else 'false'
        desc.attrib['entrylink'] = expr.link.exact_link
        desc.attrib['source'] = expr.link.source.key
        desc.attrib['base'] = expr.base_text
        _set_head_attribs(desc, expr)
        desc.text = expr.orth_text

    if not normalize_descrs:
        return

    for expr in expressions:
        form_pairs = zip(normalize.generate_forms(expr, 'orth'),
                         normalize.generate_forms(expr, 'base'))

        for orth_form, base_form in form_pairs:
            base_text = normalize.get_normalized_expr_text(expr, base_form)
            orth_text = normalize.get_normalized_expr_text(expr, orth_form)

            if orth_text in seen_orths:
                continue
            # Remember generated texts as well, otherwise the same form
            # produced twice would be exported as duplicate nodes.
            seen_orths.add(orth_text)

            desc = etree.SubElement(meaning_node, 'desc')
            desc.attrib['catchword'] = 'false'
            desc.attrib['entrylink'] = ''
            desc.attrib['source'] = 'AUTO'
            desc.attrib['base'] = base_text
            _set_head_attribs(desc, expr, orth_form, base_form)
            desc.text = orth_text