txt_plus.py
2.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import importlib
import json
import os
from writers import txt
def write(document):
    """Write the plain-text output for *document*, plus a JSON metadata file.

    The text output is delegated to ``txt.write``. A ``meta.json`` file is
    additionally written into ``document.path``, but only when the document
    has chunks and is neither flagged as an image nor as a broken source.
    """
    txt.write(document)

    # Guard clauses are checked in the same order as the original combined
    # condition, so later attributes are only read when the earlier checks pass.
    if not document.chunks.exists():
        return
    if document.image:
        return
    if document.broken_source:
        return

    meta_file = os.path.join(document.path, 'meta.json')
    _write_metadata(document, meta_file)
def _write_metadata(document, meta_path):
    """Serialize the metadata of *document* to a JSON file at *meta_path*.

    The output dictionary starts from a fixed set of document attributes and
    is then extended with every entry of ``document.metadata`` (ordered by
    ``sequence``). Entry names are translated through the ``META_TYPES``
    table of the ``projects.<project name>.mappings`` module; the same table
    decides whether an entry is collected into a list (multivalue) and
    whether its raw value is split on a separator first.

    Args:
        document: Document object providing the attributes read below
            (presumably an ORM model instance -- confirm against callers).
        meta_path: Path of the JSON file to create or overwrite (UTF-8).

    Raises:
        ImportError: If the project's mappings module cannot be imported.
    """
    print('Writing %s metadata in JSON format.' % document.id)
    project_mappings = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name)
    metadata = {
        'pipeline': document.pipeline.project.name,
        'content-type': 'text/plain',
        'language': document.lang,
        'publisher': document.publisher,
        'date': document.publication_date.strftime('%Y-%m-%d'),
        'title': document.title,
        'status': document.status,
        'in_effect': document.in_effect,
        'type': document.type,
        'keywords': [keyword.label for keyword in document.keywords.all()],
        'source_url': document.source_url,
        'meta_url': document.meta_url,
        'file_url': document.file_url,
    }

    for meta in document.metadata.order_by('sequence'):
        meta_multivalue, meta_separator = _meta_multivalue(project_mappings.META_TYPES, meta.name)
        translated_meta_name = _en(project_mappings.META_TYPES, meta.name)

        if meta_multivalue:
            # A separator means one stored value carries several items;
            # without one the whole value is a single list item.
            if meta_separator:
                values = meta.value.split(meta_separator)
            else:
                values = [meta.value]
            # Repeated entries with the same name accumulate into one list.
            metadata.setdefault(translated_meta_name, []).extend(values)
        elif meta.value.isdecimal():
            # isdecimal() rather than isdigit(): isdigit() also accepts
            # characters such as superscript digits ('²') that int() rejects
            # with ValueError, which would abort the whole metadata export.
            metadata[translated_meta_name] = int(meta.value)
        else:
            metadata[translated_meta_name] = meta.value

    with open(meta_path, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps non-ASCII characters readable in the file.
        json.dump(metadata, f, ensure_ascii=False)
def _meta_multivalue(meta_mapping, pl_name):
for meta_type in meta_mapping:
if meta_type['pl'] == pl_name:
return meta_type['multivalue'], meta_type['separator']
return False, None
def _en(translations, pl_name):
for translation in translations:
if translation['pl'] == pl_name:
if translation['en'] is None:
return pl_name
else:
return '%s' % translation['en']
return pl_name