# marcell_rest.py 3.54 KB
import importlib
import json
import os
from datetime import datetime

from storage.models import Document, Keyword, Metadata


def load_document(pipeline, doc_id, metadata, text_file):
    """Return the pipeline's document for ``doc_id``, creating it when absent.

    A leading ``'pl-'`` is removed from ``doc_id`` before the lookup. If the
    pipeline already holds a document with the resulting id, that document is
    returned unchanged; otherwise a new one is built from ``metadata`` and
    ``text_file`` via ``_create_document``. A progress line is printed in
    either case.
    """
    prefix = 'pl-'
    if doc_id.startswith(prefix):
        doc_id = doc_id[len(prefix):]

    already_stored = pipeline.documents.filter(id=doc_id).exists()
    if not already_stored:
        print('Loading document %s.' % doc_id)
        return _create_document(pipeline, doc_id, metadata, text_file)

    print('Document %s exists.' % doc_id)
    return pipeline.documents.get(id=doc_id)


def _create_document(pipeline, doc_id, metadata, text_file):
    path = os.path.join(pipeline.project.path, str(metadata['year']), doc_id)

    os.makedirs(path, exist_ok=True)

    meta_path = os.path.join(path, 'meta.json')
    with open(meta_path, 'w') as f:
        json.dump(metadata, f)

    source_path = None
    if metadata['content-type'] == 'application/pdf':
        source_path = os.path.join(path, 'source.pdf')
    elif metadata['content-type'] == 'text/plain':
        source_path = os.path.join(path, 'source.txt')
    elif metadata['content-type'] == 'text/html':
        source_path = os.path.join(path, 'source.html')

    with open(source_path, 'wb') as f:
        f.write(text_file.read())

    document = Document.objects.create(id=doc_id,
                                       source_id='',
                                       lang=metadata['language'],
                                       pipeline=pipeline,
                                       publisher=metadata['publisher'],
                                       publication_date=datetime.strptime(metadata['date'], '%Y-%m-%d').date(),
                                       meta_url=metadata['meta_url'],
                                       source_url=metadata['source_url'],
                                       file_url=metadata['file_url'],
                                       path=path,
                                       title=metadata['file_url'],
                                       type=metadata['type'],
                                       status=metadata['status'],
                                       in_effect=metadata['in_effect'])

    project_mappings = importlib.import_module('projects.%s.mappings' % pipeline.project.name)

    Metadata.objects.create(document=document, name='Rocznik', value=metadata['year'], sequence=0)
    Metadata.objects.create(document=document, name='Pozycja', value=metadata['position'], sequence=1)

    metadata_sequence = 2
    for name, value in metadata.items():
        if name not in ['content-type', 'pipeline', 'language', 'publisher', 'year', 'position', 'date', 'title',
                        'status', 'in_effect', 'type', 'keywords', 'source_url', 'meta_url', 'file_url']:

            if type(value) == list:
                value = ';'.join(value)

            translated_name = _pl(project_mappings.META_TYPES, name)
            if translated_name is not None:
                name = translated_name

            Metadata.objects.create(document=document, name=name, value=value,
                                    sequence=metadata_sequence)
            metadata_sequence += 1

    for label in metadata['keywords']:
        keyword_obj, _ = Keyword.objects.get_or_create(label=label)
        document.keywords.add(keyword_obj)

    return document


def _pl(translations, en_name):
    for translation in translations:
        if translation['en'] == en_name:
            if translation['pl'] is None:
                return en_name
            else:
                return '%s' % translation['pl']
    return None