Prevent creating a new document when one with the same name already exists

Marcel Kawski
1 parent ebf6cd35
Showing 1 changed file with 23 additions and 21 deletions
collector/loaders/korpushumanistyczny_tei.py
@@ -12,7 +12,6 @@ from projects.ppc import utils
 from projects.ppc.models import Utterance
 from storage.models import Chunk, Document, Metadata, Participant
  
-
 TEI_NS = 'http://www.tei-c.org/ns/1.0'
 XML_NS = 'http://www.w3.org/XML/1998/namespace'
  
@@ -108,29 +107,32 @@ def _get_authors(authors):
 def _create_document(pipeline, doc_path, doc_name, header_data):
     metadata_path = doc_path.split(pipeline.source.path)[1]
     metadata = metadata_path.split('/')[1:]  # all except empty string at the beginning of the list
-    document = Document.objects.create(name=metadata_path[1:],  # without "/" at the beginning
-                                       source_id='',
-                                       lang='pl',
-                                       original_lang='pl',
-                                       pipeline=pipeline,
-                                       publisher='',
-                                       publication_date=None,
-                                       publication_place='',
-                                       number='',
-                                       meta_url='',
-                                       source_url='',
-                                       path=doc_path,
-                                       title='',
-                                       channel='inne',
-                                       type='publ',
-                                       text_origin='korpus_humanistyczny',
-                                       status='ND',
-                                       processing_status=ProcessingStatus.objects.get(key='to_correct'))
-
+    name = metadata_path[1:]
+    document, created = Document.objects.get_or_create(name=name,  # without "/" at the beginning
+                                                       source_id='',
+                                                       lang='pl',
+                                                       original_lang='pl',
+                                                       pipeline=pipeline,
+                                                       publisher='',
+                                                       publication_date=None,
+                                                       publication_place='',
+                                                       number='',
+                                                       meta_url='',
+                                                       source_url='',
+                                                       path=doc_path,
+                                                       title='',
+                                                       channel='inne',
+                                                       type='publ',
+                                                       text_origin='korpus_humanistyczny',
+                                                       status='ND',
+                                                       processing_status=ProcessingStatus.objects.get(key='to_correct'))
+    if created is False:
+        def_seq = Document._meta.get_field('sequence').get_default()
+        print(f'Document with name "{name}" and sequence "{def_seq}" already exists.')
     for sequence, value in enumerate(metadata):
         Metadata.objects.create(document=document,
                                 name='nieznane',
-                                sequence=sequence+1,
+                                sequence=sequence + 1,
                                 value=value)
  
     # for sequence, meta in enumerate(header_data['metadata']):