Commit a82b248aee82add38120d340f6c7fab1f8690749

Authored by Marcel Kawski
1 parent 70470a9d

Prevent adding text again for already existing documents

collector/loaders/korpushumanistyczny_tei.py
... ... @@ -41,9 +41,10 @@ def load_document(pipeline_name, document_path):
41 41 # header_path = os.path.join(document_path, 'header.xml')
42 42 # header_data = _get_metadata(header_path)
43 43 print(document_path)
44   - doc = _create_document(pipeline, document_path, doc_name, [])
45   - text_structure_path = os.path.join(document_path, 'text_structure.xml')
46   - _add_text_content(doc, text_structure_path)
  44 + doc, created = _create_document(pipeline, document_path, doc_name, [])
  45 + if created is True:
  46 + text_structure_path = os.path.join(document_path, 'text_structure.xml')
  47 + _add_text_content(doc, text_structure_path)
47 48  
48 49  
49 50 def _get_metadata(header_path):
... ... @@ -129,11 +130,12 @@ def _create_document(pipeline, doc_path, doc_name, header_data):
129 130 if created is False:
130 131 def_seq = Document._meta.get_field('sequence').get_default()
131 132 print(f'Document with name "{name}" and sequence "{def_seq}" already exists.')
132   - for sequence, value in enumerate(metadata):
133   - Metadata.objects.create(document=document,
134   - name='nieznane',
135   - sequence=sequence + 1,
136   - value=value)
  133 + else:
  134 + for sequence, value in enumerate(metadata):
  135 + Metadata.objects.create(document=document,
  136 + name='nieznane',
  137 + sequence=sequence + 1,
  138 + value=value)
137 139  
138 140 # for sequence, meta in enumerate(header_data['metadata']):
139 141 # if 'target' in meta:
... ... @@ -142,7 +144,7 @@ def _create_document(pipeline, doc_path, doc_name, header_data):
142 144 # else:
143 145 # Metadata.objects.create(document=document, name=meta['type'], value=meta['value'], sequence=sequence)
144 146  
145   - return document
  147 + return document, created
146 148  
147 149  
148 150 def _add_text_content(document, text_structure_path):
... ...