# mtas.py 9.6 KB
import importlib
import json
import os

from lxml import etree
from natsort import natsorted

from writers import tei


# Namespace prefix map; the 're' entry is declared on the <named> and
# <utterances> wrapper elements created in this module.
NSMAP = {'re': 'http://exslt.org/regular-expressions'}


def write(document):
    """Export ``document`` in MTAS format.

    ``_write_doc`` and ``tei.write_header`` always run.  The segmentation
    and the MTAS TEI file are written only when ``document.annotated()``
    is truthy.
    """
    print('Writing %s in MTAS format.' % document.id)
    _write_doc(document)
    tei.write_header(document)

    # Nothing more to export for a document without annotation.
    if not document.annotated():
        return

    tei.write_segmentation(document)
    _write_mtas(document)


def _write_doc(document):
    """Dump the document's metadata to ``doc.json`` inside ``document.path``.

    The file contains a one-element JSON list holding a single metadata
    object.  That object is built from:

    * fixed document attributes (publisher, title, lang, ...),
    * optional URLs and publication date,
    * author and committee participant names (ordered by ``order``),
    * every row of ``document.metadata`` (ordered by ``sequence``).  Rows
      whose name is flagged ``multivalue`` in the project's ``META_TYPES``
      are collected into lists, other rows overwrite any earlier value.
      A row with a ``target`` additionally contributes a
      ``<name>-<participant type>`` list entry.
    """
    doc_path = os.path.join(document.path, 'doc.json')

    metadata = {'publisher': document.publisher,
                'title': document.title,
                'lang': document.lang,
                'text_type': document.type,
                'text_data': os.path.relpath(os.path.join(document.path, 'mtas_tei.xml'),
                                             document.pipeline.project.path),
                'type': 'tei',
                'id': document.id}

    if document.meta_url:
        metadata['meta_url'] = document.meta_url

    # Prefer the file URL, fall back to the source URL.
    url = document.file_url or document.source_url
    if url:
        metadata['url'] = url

    if document.publication_date:
        metadata['date'] = document.publication_date.strftime('%Y-%m-%d')
        metadata['date_int'] = document.publication_date.year
    elif document.metadata.filter(name='dateStr').exists():
        metadata['date'] = document.metadata.get(name='dateStr').value

    # Evaluate the query once instead of exists() followed by a second fetch.
    authors = [author.name for author in document.participants.filter(role='author').order_by('order')]
    if authors:
        metadata['author'] = authors

    project_mappings = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name)
    for meta in document.metadata.order_by('sequence'):
        if _meta_multivalue(project_mappings.META_TYPES, meta.name):
            metadata.setdefault(meta.name, []).append(meta.value)
        else:
            metadata[meta.name] = meta.value

        if meta.target:
            # The target is a '#'-prefixed participant abbreviation; look the
            # participant up once and reuse it for both the key and the value.
            participant = document.participants.get(abbrev=meta.target.lstrip('#'))
            meta_name = '%s-%s' % (meta.name, participant.type)
            meta_value = '%s-%s' % (meta.value, participant.name)
            metadata.setdefault(meta_name, []).append(meta_value)

    committees = [committee.name for committee in
                  document.participants.filter(role='committee').order_by('order')]
    if committees:
        metadata['committee'] = committees

    with open(doc_path, 'w', encoding='utf-8') as f:
        json.dump([metadata], f, ensure_ascii=False)


def _meta_multivalue(meta_mapping, pl_name):
    for meta_type in meta_mapping:
        if meta_type['pl'] == pl_name:
            return meta_type['multivalue']
    return False


def _write_mtas(document):
    """Build the ``teiCorpus`` tree for ``document`` and save it as ``mtas_tei.xml``.

    The tree includes the project header and the document header via
    ``xi:include``, then a ``text`` element filled by ``_add_named``,
    ``_add_utterances`` and (inside ``body``) ``_add_morphosyntax``.
    """
    target_path = os.path.join(document.path, 'mtas_tei.xml')

    namespaces = {'xi': tei.NSMAP['xi'], 'nkjp': tei.NSMAP['nkjp'], None: tei.NSMAP['tei']}
    corpus = etree.Element('teiCorpus', nsmap=namespaces)

    # The project-wide header file is named after the upper-cased project name.
    project_header = '%s_header.xml' % document.pipeline.project.name.upper()
    tei.add_element(corpus, 'xi:include', {'href': project_header})

    tei_root = tei.add_element(corpus, 'TEI')
    tei.add_element(tei_root, 'xi:include', {'href': 'header.xml'})

    text_elem = tei.add_element(tei_root, 'text')
    _add_named(text_elem, document)
    _add_utterances(text_elem, document)

    body_elem = tei.add_element(text_elem, 'body')
    _add_morphosyntax(body_elem, document)

    tei.save(target_path, corpus)


def _add_named(parent, document):
    """Append a ``named`` element with the named entities of every sentence.

    Chunks are visited in ``sequence`` order.  A chunk that has utterances
    contributes the sentences of each utterance (ids ``u-<chunk>.<utt>.<sent>-s``),
    any other chunk contributes its own sentences (ids ``div-<chunk>.<sent>-s``).
    For each sentence the token ids are mapped to ``morph_...-seg`` ids using
    a segment counter that keeps running across the whole document, and the
    entities touching that sentence are written with ``_write_ne``.
    """
    named_elem = etree.Element('named', nsmap={'re': NSMAP['re']})
    parent.append(named_elem)

    def emit_entities(sentence_id, names, tokens, seg_ids):
        # Select the entities of this sentence, then write them one by one.
        entities = _get_local_nes(names, tokens)
        entity_ids = tei.map_nes_ids(entities)
        for entity in entities:
            _write_ne(named_elem, sentence_id, entity, entity_ids, seg_ids)

    next_seg = 1  # never reset: segment numbers run through the whole document
    for chunk_no, chunk in enumerate(document.chunks.order_by('sequence'), 1):
        if not chunk.utterances.exists():
            for sent_no, sentence in enumerate(chunk.anno['chunks'][0]['sentences'], 1):
                sentence_id = 'div-%d.%d-s' % (chunk_no, sent_no)

                token_ids = [token['id'] for token in sentence['tokens']]
                seg_ids = {token_id: 'morph_div-%d.%d-seg' % (chunk_no, seg_no)
                           for seg_no, token_id in enumerate(token_ids, next_seg)}
                next_seg += len(token_ids)

                emit_entities(sentence_id, chunk.anno['names'], sentence['tokens'], seg_ids)
            continue

        # Utterance numbering starts at 0, unlike chunks and sentences.
        for utt_no, utt in enumerate(chunk.utterances.order_by('sequence')):
            for sent_no, sentence in enumerate(utt.anno['chunks'][0]['sentences'], 1):
                sentence_id = 'u-%d.%d.%d-s' % (chunk_no, utt_no, sent_no)

                token_ids = [token['id'] for token in sentence['tokens']]
                seg_ids = {token_id: 'morph_u-%d.%d.%d-seg' % (chunk_no, utt_no, seg_no)
                           for seg_no, token_id in enumerate(token_ids, next_seg)}
                next_seg += len(token_ids)

                emit_entities(sentence_id, utt.anno['names'], sentence['tokens'], seg_ids)


def _get_local_nes(nes, tokens):
    """Return the entities from ``nes`` that contain at least one of ``tokens``.

    An entity is kept when the ``id`` of any token appears in the entity's
    ``tokens``; the input order of ``nes`` is preserved.  The selection is
    passed to ``tei.map_ne_types`` before being returned (its return value
    is ignored here, so it presumably works in place; verify in ``tei``).
    """
    local_nes = [ne for ne in nes
                 if any(tok['id'] in ne['tokens'] for tok in tokens)]
    tei.map_ne_types(local_nes)
    return local_nes


def _write_ne(parent, sent_id, ne, nes_map, morphs_map):
    """Append one ``ne`` element, with a ``wref`` child per token, to ``parent``.

    The element id is ``<sent_id>_<mapped id>``, where the mapped id comes
    from ``nes_map`` with every ``a`` replaced by ``n``.  The type is
    ``<type>.<subtype>`` when a subtype is present, otherwise just the type.
    Tokens are referenced in natural sort order through ``morphs_map``.
    """
    if ne['subtype'] is None:
        full_type = ne['type']
    else:
        full_type = '%s.%s' % (ne['type'], ne['subtype'])

    local_id = nes_map[ne['id']].replace('a', 'n')
    attributes = {'id': '%s_%s' % (sent_id, local_id), 'type': full_type}
    ne_elem = tei.add_element(parent, 'ne', attributes)

    # Resolve every reference first, then emit the children.
    wref_ids = [morphs_map[token_id] for token_id in natsorted(ne['tokens'])]
    for wref_id in wref_ids:
        tei.add_element(ne_elem, 'wref', {'id': wref_id})


def _add_morphosyntax(parent, document):
    """Append the ``p``/``s``/``seg`` structure with morphology to ``parent``.

    Every utterance (for chunks that have utterances) or every chunk
    (otherwise) becomes one ``p`` element holding an ``s`` per sentence and
    a ``seg`` per token.  The first token of each ``p`` is marked with
    ``u_begin`` or ``p_begin`` respectively.  ``tei.write_morph`` receives a
    flag telling whether the previous token of the same ``p`` had a truthy
    ``ns`` value.

    Three counters run across the whole document and are never reset: the
    segment number used in ``seg`` ids, and the paragraph and sentence
    totals used in morph ids.
    """
    next_seg = 1
    sentences_seen = 0
    paragraphs_seen = 0

    for chunk_no, chunk in enumerate(document.chunks.order_by('sequence'), 1):
        if not chunk.utterances.exists():
            paragraphs_seen += 1
            last_tok = None
            par = tei.add_element(parent, 'p')

            for sent_no, sentence in enumerate(chunk.anno['chunks'][0]['sentences'], 1):
                sentences_seen += 1
                sent_attrs = {'xml:id': 'div-%d.%d-s' % (chunk_no, sent_no),
                              'corresp': 'ann_segmentation.xml#segm_div-%d.%d-s' % (chunk_no, sent_no)}
                sent_elem = tei.add_element(par, 's', sent_attrs)

                for tok_no, token in enumerate(sentence['tokens'], 1):
                    follows_ns = bool(last_tok and last_tok['ns'])

                    base_id = '%d.%d' % (chunk_no, next_seg)
                    attrs = {'corresp': 'ann_segmentation.xml#segm_div-%s-seg' % base_id,
                             'xml:id': 'morph_div-%s-seg' % base_id}
                    if (sent_no, tok_no) == (1, 1):
                        attrs['p_begin'] = 'true'
                    seg_elem = tei.add_element(sent_elem, 'seg', attrs)

                    morph_id = '%d.%d.%d' % (paragraphs_seen, sentences_seen, tok_no)
                    tei.write_morph(seg_elem, morph_id, token, follows_ns)

                    next_seg += 1
                    last_tok = token
            continue

        # Utterance numbering starts at 0, unlike chunks, sentences and tokens.
        for utt_no, utt in enumerate(chunk.utterances.order_by('sequence')):
            paragraphs_seen += 1
            last_tok = None
            par = tei.add_element(parent, 'p')

            for sent_no, sentence in enumerate(utt.anno['chunks'][0]['sentences'], 1):
                sentences_seen += 1
                sent_attrs = {'xml:id': 'u-%d.%d.%d-s' % (chunk_no, utt_no, sent_no),
                              'corresp': 'ann_segmentation.xml#segm_u-%d.%d.%d-s' % (chunk_no, utt_no, sent_no)}
                sent_elem = tei.add_element(par, 's', sent_attrs)

                for tok_no, token in enumerate(sentence['tokens'], 1):
                    follows_ns = bool(last_tok and last_tok['ns'])

                    base_id = '%d.%d.%d' % (chunk_no, utt_no, next_seg)
                    attrs = {'corresp': 'ann_segmentation.xml#segm_u-%s-seg' % base_id,
                             'xml:id': 'morph_u-%s-seg' % base_id}
                    if (sent_no, tok_no) == (1, 1):
                        attrs['u_begin'] = 'true'
                    seg_elem = tei.add_element(sent_elem, 'seg', attrs)

                    morph_id = '%d.%d.%d' % (paragraphs_seen, sentences_seen, tok_no)
                    tei.write_morph(seg_elem, morph_id, token, follows_ns)

                    next_seg += 1
                    last_tok = token


def _add_utterances(parent, document):
    """Append an ``utterances`` element with one ``u`` per utterance.

    Each ``u`` carries the id ``u-<chunk>.<utt>``, the speaker name in
    ``who``, and two ``wref`` children pointing at the first and the last
    ``morph_u-...-seg`` segment of the utterance.

    The segment counter must follow the numbering used by
    ``_add_morphosyntax`` and ``_add_named``, which count the tokens of
    *every* chunk.  Chunks without utterances therefore produce no output
    here but still advance the counter; otherwise the references of any
    utterance following such a chunk would point at the wrong segments.
    """
    utterances = etree.Element('utterances', nsmap={'re': NSMAP['re']})
    parent.append(utterances)

    seg_count = 1
    for pi, chunk in enumerate(document.chunks.order_by('sequence'), 1):
        if not chunk.utterances.exists():
            # Nothing to emit, but these tokens consume segment numbers.
            seg_count += sum(len(sent['tokens'])
                             for sent in chunk.anno['chunks'][0]['sentences'])
            continue

        # Utterance numbering starts at 0, matching the ids written elsewhere
        # in this module.
        for ui, utt in enumerate(chunk.utterances.order_by('sequence')):
            token_total = sum(len(sent['tokens'])
                              for sent in utt.anno['chunks'][0]['sentences'])

            # NOTE(review): an utterance without tokens yields None for both
            # ids, as before; how tei.add_element treats that is not visible
            # from this module.
            start = end = None
            if token_total:
                start = 'morph_u-%d.%d.%d-seg' % (pi, ui, seg_count)
                end = 'morph_u-%d.%d.%d-seg' % (pi, ui, seg_count + token_total - 1)
            seg_count += token_total

            u = tei.add_element(utterances, 'u', {'xml:id': 'u-%d.%d' % (pi, ui), 'who': utt.speaker.name})
            tei.add_element(u, 'wref', {'id': start})
            tei.add_element(u, 'wref', {'id': end})