Commit afc31f2685b53c3b39d1511ff53452c2da76842d
1 parent
ebf6cd35
Prevent creating a new document when one with the same name already exists
Showing
1 changed file
with
23 additions
and
21 deletions
collector/loaders/korpushumanistyczny_tei.py
... | ... | @@ -12,7 +12,6 @@ from projects.ppc import utils |
12 | 12 | from projects.ppc.models import Utterance |
13 | 13 | from storage.models import Chunk, Document, Metadata, Participant |
14 | 14 | |
15 | - | |
16 | 15 | TEI_NS = 'http://www.tei-c.org/ns/1.0' |
17 | 16 | XML_NS = 'http://www.w3.org/XML/1998/namespace' |
18 | 17 | |
... | ... | @@ -108,29 +107,32 @@ def _get_authors(authors): |
108 | 107 | def _create_document(pipeline, doc_path, doc_name, header_data): |
109 | 108 | metadata_path = doc_path.split(pipeline.source.path)[1] |
110 | 109 | metadata = metadata_path.split('/')[1:] # all except empty string at the beginning of the list |
111 | - document = Document.objects.create(name=metadata_path[1:], # without "/" at the beginning | |
112 | - source_id='', | |
113 | - lang='pl', | |
114 | - original_lang='pl', | |
115 | - pipeline=pipeline, | |
116 | - publisher='', | |
117 | - publication_date=None, | |
118 | - publication_place='', | |
119 | - number='', | |
120 | - meta_url='', | |
121 | - source_url='', | |
122 | - path=doc_path, | |
123 | - title='', | |
124 | - channel='inne', | |
125 | - type='publ', | |
126 | - text_origin='korpus_humanistyczny', | |
127 | - status='ND', | |
128 | - processing_status=ProcessingStatus.objects.get(key='to_correct')) | |
129 | - | |
110 | + name = metadata_path[1:] | |
111 | + document, created = Document.objects.get_or_create(name=name, # without "/" at the beginning | |
112 | + source_id='', | |
113 | + lang='pl', | |
114 | + original_lang='pl', | |
115 | + pipeline=pipeline, | |
116 | + publisher='', | |
117 | + publication_date=None, | |
118 | + publication_place='', | |
119 | + number='', | |
120 | + meta_url='', | |
121 | + source_url='', | |
122 | + path=doc_path, | |
123 | + title='', | |
124 | + channel='inne', | |
125 | + type='publ', | |
126 | + text_origin='korpus_humanistyczny', | |
127 | + status='ND', | |
128 | + processing_status=ProcessingStatus.objects.get(key='to_correct')) | |
129 | + if created is False: | |
130 | + def_seq = Document._meta.get_field('sequence').get_default() | |
131 | + print(f'Document with name "{name}" and sequence "{def_seq}" already exists.') | |
130 | 132 | for sequence, value in enumerate(metadata): |
131 | 133 | Metadata.objects.create(document=document, |
132 | 134 | name='nieznane', |
133 | - sequence=sequence+1, | |
135 | + sequence=sequence + 1, | |
134 | 136 | value=value) |
135 | 137 | |
136 | 138 | # for sequence, meta in enumerate(header_data['metadata']): |
... | ... |