Commit afc31f2685b53c3b39d1511ff53452c2da76842d

Authored by Marcel Kawski
1 parent ebf6cd35

Prevent creating a new document when one with the same name already exists

collector/loaders/korpushumanistyczny_tei.py
... ... @@ -12,7 +12,6 @@ from projects.ppc import utils
12 12 from projects.ppc.models import Utterance
13 13 from storage.models import Chunk, Document, Metadata, Participant
14 14  
15   -
16 15 TEI_NS = 'http://www.tei-c.org/ns/1.0'
17 16 XML_NS = 'http://www.w3.org/XML/1998/namespace'
18 17  
... ... @@ -108,29 +107,32 @@ def _get_authors(authors):
108 107 def _create_document(pipeline, doc_path, doc_name, header_data):
109 108 metadata_path = doc_path.split(pipeline.source.path)[1]
110 109 metadata = metadata_path.split('/')[1:] # all except empty string at the beginning of the list
111   - document = Document.objects.create(name=metadata_path[1:], # without "/" at the beginning
112   - source_id='',
113   - lang='pl',
114   - original_lang='pl',
115   - pipeline=pipeline,
116   - publisher='',
117   - publication_date=None,
118   - publication_place='',
119   - number='',
120   - meta_url='',
121   - source_url='',
122   - path=doc_path,
123   - title='',
124   - channel='inne',
125   - type='publ',
126   - text_origin='korpus_humanistyczny',
127   - status='ND',
128   - processing_status=ProcessingStatus.objects.get(key='to_correct'))
129   -
  110 + name = metadata_path[1:]
  111 + document, created = Document.objects.get_or_create(name=name, # without "/" at the beginning
  112 + source_id='',
  113 + lang='pl',
  114 + original_lang='pl',
  115 + pipeline=pipeline,
  116 + publisher='',
  117 + publication_date=None,
  118 + publication_place='',
  119 + number='',
  120 + meta_url='',
  121 + source_url='',
  122 + path=doc_path,
  123 + title='',
  124 + channel='inne',
  125 + type='publ',
  126 + text_origin='korpus_humanistyczny',
  127 + status='ND',
  128 + processing_status=ProcessingStatus.objects.get(key='to_correct'))
  129 + if created is False:
  130 + def_seq = Document._meta.get_field('sequence').get_default()
  131 + print(f'Document with name "{name}" and sequence "{def_seq}" already exists.')
130 132 for sequence, value in enumerate(metadata):
131 133 Metadata.objects.create(document=document,
132 134 name='nieznane',
133   - sequence=sequence+1,
  135 + sequence=sequence + 1,
134 136 value=value)
135 137  
136 138 # for sequence, meta in enumerate(header_data['metadata']):
... ...