# ppc_tei.py
import importlib
import multiprocessing
import os

from django import db
from django.utils.dateparse import parse_date
from lxml import etree

from collector import settings
from pipeline.models import Pipeline
from projects.ppc import utils
from projects.ppc.models import Utterance
from storage.models import Chunk, Document, Metadata, Participant


TEI_NS = 'http://www.tei-c.org/ns/1.0'
XML_NS = 'http://www.w3.org/XML/1998/namespace'


def load_documents(pipeline):
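    """Walk the pipeline's source directory and load every TEI document found, using a pool of worker processes."""
    # Close the inherited DB connection so each forked worker opens its own connection.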
    db.connection.close()
    loading_map = []
    for root, dirs, files in os.walk(pipeline.source.path):
        if 'header.xml' in files and 'text_structure.xml' in files:
            loading_map.append((pipeline.name, root))
    pool = multiprocessing.Pool(settings.LOADER_CORES)
    pool.map(_multi_loader_wrapper, loading_map)
    pool.close()
    pool.join()


def _multi_loader_wrapper(args):
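    # Pool.map passes a single tuple; unpack it into load_document's positional arguments.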
    return load_document(*args)


def load_document(pipeline_name, document_path):
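    """Load a single TEI document (header.xml + text_structure.xml) into the database, unless it already exists."""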
    doc_id = os.path.basename(document_path)
    if not Document.objects.filter(id=doc_id).exists():
        print('Loading document %s.' % doc_id)
        pipeline = Pipeline.objects.get(name=pipeline_name)
        header_path = os.path.join(document_path, 'header.xml')
        header_data = _get_metadata(header_path)
        doc = _create_document(pipeline, document_path, doc_id, header_data)
        text_structure_path = os.path.join(document_path, 'text_structure.xml')
        _add_text_content(doc, text_structure_path)


def _get_metadata(header_path):
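    """Parse a TEI header.xml file and return the document metadata and person/organisation participants extracted from it."""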
    data = {'date': None,
            'metadata': [],
            'pers_participants': [],
            'org_participants': []}

    parser = etree.XMLParser(remove_blank_text=True)
    header_tree = etree.parse(header_path, parser)

    data['source_id'] = header_tree.getroot().attrib['{%s}id' % XML_NS]

    for meta in header_tree.find('.//ns:sourceDesc/ns:bibl', namespaces={'ns': TEI_NS}):
        if meta.tag == '{%s}note' % TEI_NS and meta.attrib['type'] == 'type':
            data['type'] = meta.text
        elif meta.tag == '{%s}note' % TEI_NS and 'target' in meta.attrib:
            data['metadata'].append({'type': meta.attrib['type'],
                                     'value': meta.text,
                                     'target': meta.attrib['target']})
        elif meta.tag == '{%s}note' % TEI_NS:
            data['metadata'].append({'type': meta.attrib['type'],
                                     'value': meta.text})
        elif meta.tag == '{%s}author' % TEI_NS:
            data['pers_participants'].extend(_get_authors(meta.text))
        elif meta.tag == '{%s}title' % TEI_NS:
            data['title'] = meta.text
        elif meta.tag == '{%s}publisher' % TEI_NS:
            data['publisher'] = meta.text
        elif meta.tag == '{%s}date' % TEI_NS:
            data['date'] = parse_date(meta.text)
            if data['date'] is None:
                data['metadata'].append({'type': 'dateStr',
                                         'value': meta.text})

    participants = header_tree.find('.//ns:profileDesc/ns:particDesc', namespaces={'ns': TEI_NS})
    if participants is not None:
        for partic in participants:
            if partic.tag == '{%s}person' % TEI_NS:
                data['pers_participants'].append({'name': partic.find('ns:persName', namespaces={'ns': TEI_NS}).text,
                                                  'id': partic.attrib['{%s}id' % XML_NS],
                                                  'role': partic.attrib['role']})
            elif partic.tag == '{%s}org' % TEI_NS:
                data['org_participants'].append({'name': partic.find('ns:orgName', namespaces={'ns': TEI_NS}).text,
                                                 'id': partic.attrib['{%s}id' % XML_NS],
                                                 'role': 'committee'})

    return data


def _get_authors(authors):
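    """Split a comma-separated author string into participant dicts with generated ids and the 'author' role."""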
    pers_participants = []
    for name in authors.split(','):
        name = name.strip()
        pers_participants.append({'name': name,
                                  'id': utils.create_person_id(name),
                                  'role': 'author'})
    return pers_participants


def _create_document(pipeline, doc_path, doc_id, header_data):
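    """Create a Document together with its Metadata and Participant records from the parsed header data."""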
    document = Document.objects.create(id=doc_id,
                                       source_id=header_data['source_id'],
                                       lang='pl',
                                       pipeline=pipeline,
                                       publisher=header_data['publisher'],
                                       publication_date=header_data['date'],
                                       meta_url='',
                                       source_url='',
                                       path=doc_path,
                                       title=header_data['title'],
                                       type=header_data['type'],
                                       status='opublikowany')

    for sequence, meta in enumerate(header_data['metadata']):
        if 'target' in meta:
            Metadata.objects.create(document=document, name=meta['type'], value=meta['value'],
                                    target=meta['target'], sequence=sequence)
        else:
            Metadata.objects.create(document=document, name=meta['type'], value=meta['value'], sequence=sequence)

    for order, person in enumerate(header_data['pers_participants']):
        Participant.objects.create(abbrev=person['id'], document=document, name=person['name'], order=order,
                                   role=person['role'], type='person')

    for order, org in enumerate(header_data['org_participants']):
        Participant.objects.create(abbrev=org['id'], document=document, name=org['name'], order=order, role=org['role'],
                                   type='org')

    return document


def _add_text_content(document, text_structure_path):
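    """Parse text_structure.xml and store the document's chunks and utterances."""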
    parser = etree.XMLParser(remove_blank_text=True)
    text_structure_tree = etree.parse(text_structure_path, parser)

    mappings = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name)

    for cseq, chunk in enumerate(text_structure_tree.find('.//ns:body', namespaces={'ns': TEI_NS})):
        if len(chunk) == 0:
            # Chunk without child elements: store its text as a plain chunk.
            if chunk.text is not None:
                Chunk.objects.create(document=document, sequence=cseq, text=chunk.text)
        else:
            # Chunk with child elements: create an empty chunk and attach its utterances.
            chunk_obj = Chunk.objects.create(document=document, sequence=cseq, text='')
            for useq, utterance in enumerate(chunk):
                if utterance.text is not None:
                    speaker_abbrev = utterance.attrib['who'].lstrip('#')
                    speaker = document.participants.filter(abbrev=speaker_abbrev, type='person').first()
                    if speaker is None:
                        # Speaker not declared in the header: fall back to the project's
                        # mapping of known missing participants.
                        speaker = Participant.objects.create(abbrev=speaker_abbrev,
                                                             document=document,
                                                             name=mappings.MISSING_PARTICIPANTS[speaker_abbrev],
                                                             order=document.participants.filter(type='person').count(),
                                                             role='speaker', type='person')
                    Utterance.objects.create(chunk=chunk_obj, sequence=useq, speaker=speaker, text=utterance.text)
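

# Minimal usage sketch (assumptions: a Pipeline row exists and this module lives under
# projects.ppc; the pipeline name 'ppc' and the import path are illustrative only):
#
#     from pipeline.models import Pipeline
#     from projects.ppc import ppc_tei
#
#     pipeline = Pipeline.objects.get(name='ppc')
#     ppc_tei.load_documents(pipeline)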