Commit a82b248aee82add38120d340f6c7fab1f8690749
1 parent: 70470a9d
Prevent adding text content and metadata again for already existing documents
Showing 1 changed file with 11 additions and 9 deletions
collector/loaders/korpushumanistyczny_tei.py
... | ... | @@ -41,9 +41,10 @@ def load_document(pipeline_name, document_path): |
41 | 41 | # header_path = os.path.join(document_path, 'header.xml') |
42 | 42 | # header_data = _get_metadata(header_path) |
43 | 43 | print(document_path) |
44 | - doc = _create_document(pipeline, document_path, doc_name, []) | |
45 | - text_structure_path = os.path.join(document_path, 'text_structure.xml') | |
46 | - _add_text_content(doc, text_structure_path) | |
44 | + doc, created = _create_document(pipeline, document_path, doc_name, []) | |
45 | + if created is True: | |
46 | + text_structure_path = os.path.join(document_path, 'text_structure.xml') | |
47 | + _add_text_content(doc, text_structure_path) | |
47 | 48 | |
48 | 49 | |
49 | 50 | def _get_metadata(header_path): |
... | ... | @@ -129,11 +130,12 @@ def _create_document(pipeline, doc_path, doc_name, header_data): |
129 | 130 | if created is False: |
130 | 131 | def_seq = Document._meta.get_field('sequence').get_default() |
131 | 132 | print(f'Document with name "{name}" and sequence "{def_seq}" already exists.') |
132 | - for sequence, value in enumerate(metadata): | |
133 | - Metadata.objects.create(document=document, | |
134 | - name='nieznane', | |
135 | - sequence=sequence + 1, | |
136 | - value=value) | |
133 | + else: | |
134 | + for sequence, value in enumerate(metadata): | |
135 | + Metadata.objects.create(document=document, | |
136 | + name='nieznane', | |
137 | + sequence=sequence + 1, | |
138 | + value=value) | |
137 | 139 | |
138 | 140 | # for sequence, meta in enumerate(header_data['metadata']): |
139 | 141 | # if 'target' in meta: |
... | ... | @@ -142,7 +144,7 @@ def _create_document(pipeline, doc_path, doc_name, header_data): |
142 | 144 | # else: |
143 | 145 | # Metadata.objects.create(document=document, name=meta['type'], value=meta['value'], sequence=sequence) |
144 | 146 | |
145 | - return document | |
147 | + return document, created | |
146 | 148 | |
147 | 149 | |
148 | 150 | def _add_text_content(document, text_structure_path): |
... | ... |