marcell_rest.py
3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import importlib
import json
import os
from datetime import datetime
from storage.models import Document, Keyword, Metadata
def load_document(pipeline, doc_id, metadata, text_file):
    """Return the pipeline's document for *doc_id*, creating it if needed.

    A leading ``'pl-'`` prefix is stripped from *doc_id* before the lookup, so
    ``'pl-123'`` and ``'123'`` refer to the same document.

    Args:
        pipeline: Object exposing a ``documents`` manager (queried with
            ``filter(id=...)``); also handed to ``_create_document``.
        doc_id: Document identifier, with or without the ``'pl-'`` prefix.
        metadata: Metadata mapping forwarded to ``_create_document`` when the
            document does not exist yet.
        text_file: Readable binary file object with the document source,
            forwarded to ``_create_document``.

    Returns:
        The already stored document, or the newly created one.
    """
    prefix = 'pl-'
    if doc_id.startswith(prefix):
        doc_id = doc_id[len(prefix):]

    # One lookup instead of exists() followed by get(): saves a query and
    # cannot fail if the row disappears between the two calls.
    existing = pipeline.documents.filter(id=doc_id).first()
    if existing is not None:
        print('Document %s exists.' % doc_id)
        return existing

    print('Loading document %s.' % doc_id)
    return _create_document(pipeline, doc_id, metadata, text_file)
def _create_document(pipeline, doc_id, metadata, text_file):
path = os.path.join(pipeline.project.path, str(metadata['year']), doc_id)
os.makedirs(path, exist_ok=True)
meta_path = os.path.join(path, 'meta.json')
with open(meta_path, 'w') as f:
json.dump(metadata, f)
source_path = None
if metadata['content-type'] == 'application/pdf':
source_path = os.path.join(path, 'source.pdf')
elif metadata['content-type'] == 'text/plain':
source_path = os.path.join(path, 'source.txt')
elif metadata['content-type'] == 'text/html':
source_path = os.path.join(path, 'source.html')
with open(source_path, 'wb') as f:
f.write(text_file.read())
document = Document.objects.create(id=doc_id,
source_id='',
lang=metadata['language'],
pipeline=pipeline,
publisher=metadata['publisher'],
publication_date=datetime.strptime(metadata['date'], '%Y-%m-%d').date(),
meta_url=metadata['meta_url'],
source_url=metadata['source_url'],
file_url=metadata['file_url'],
path=path,
title=metadata['file_url'],
type=metadata['type'],
status=metadata['status'],
in_effect=metadata['in_effect'])
project_mappings = importlib.import_module('projects.%s.mappings' % pipeline.project.name)
Metadata.objects.create(document=document, name='Rocznik', value=metadata['year'], sequence=0)
Metadata.objects.create(document=document, name='Pozycja', value=metadata['position'], sequence=1)
metadata_sequence = 2
for name, value in metadata.items():
if name not in ['content-type', 'pipeline', 'language', 'publisher', 'year', 'position', 'date', 'title',
'status', 'in_effect', 'type', 'keywords', 'source_url', 'meta_url', 'file_url']:
if type(value) == list:
value = ';'.join(value)
translated_name = _pl(project_mappings.META_TYPES, name)
if translated_name is not None:
name = translated_name
Metadata.objects.create(document=document, name=name, value=value,
sequence=metadata_sequence)
metadata_sequence += 1
for label in metadata['keywords']:
keyword_obj, _ = Keyword.objects.get_or_create(label=label)
document.keywords.add(keyword_obj)
return document
def _pl(translations, en_name):
for translation in translations:
if translation['en'] == en_name:
if translation['pl'] is None:
return en_name
else:
return '%s' % translation['pl']
return None