From 85f97aaca15c0c8d03a6034f46318665d6b62394 Mon Sep 17 00:00:00 2001 From: Bartłomiej Nitoń <bartek.niton@gmail.com> Date: Mon, 21 Feb 2022 11:56:34 +0100 Subject: [PATCH] Change conllu writer to match CURLICAT project needs. --- collector/projects/curlicat/management/commands/export_curlicat.py | 9 ++++----- collector/projects/curlicat/mappings.py | 4 +++- collector/projects/marcell/mappings.py | 2 ++ collector/projects/ppc/mappings.py | 2 ++ collector/projects/ppc/models.py | 7 +++++++ collector/storage/models.py | 18 ++++++++++++++++++ collector/terminology/management/commands/match_iate_terms.py | 4 ++-- collector/terminology/models.py | 7 +++++++ collector/writers/conllu.py | 54 ++++++++++++++++++++++++++++++------------------------ 9 files changed, 75 insertions(+), 32 deletions(-) diff --git a/collector/projects/curlicat/management/commands/export_curlicat.py b/collector/projects/curlicat/management/commands/export_curlicat.py index f58fd16..0353629 100644 --- a/collector/projects/curlicat/management/commands/export_curlicat.py +++ b/collector/projects/curlicat/management/commands/export_curlicat.py @@ -52,8 +52,7 @@ class Command(BaseCommand): os.makedirs(options['output'], exist_ok=True) for doc in documents: conllu_path = os.path.join(doc.path, 'text.conllup') - if not os.path.isfile(conllu_path) or not doc.annotated(): - continue - print('Exporting %s.' % doc.name) - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) - shutil.copyfile(conllu_path, dest_path) + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()): + print('Exporting %s.' % doc.name) + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) + shutil.copyfile(conllu_path, dest_path) diff --git a/collector/projects/curlicat/mappings.py b/collector/projects/curlicat/mappings.py index 4a6f840..13aaa1f 100644 --- a/collector/projects/curlicat/mappings.py +++ b/collector/projects/curlicat/mappings.py @@ -1,7 +1,7 @@ from urllib.parse import urljoin CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', - 'CURLICAT:NE', 'CURLICAT:NP'] + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE'] DOC_TYPES = [] @@ -35,6 +35,8 @@ PUBLISHERS = {} PUBLISHERS2ABBREV = {} +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21 + def get_abstract_anno(document): for meta in document.metadata.all(): diff --git a/collector/projects/marcell/mappings.py b/collector/projects/marcell/mappings.py index 3d21c7c..3e9a383 100755 --- a/collector/projects/marcell/mappings.py +++ b/collector/projects/marcell/mappings.py @@ -74,6 +74,8 @@ PUBLISHERS = {'WDU': 'Dziennik Ustaw', PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU', 'Monitor Polski': 'WMP'} +MIN_SEGMENTS_BY_CHUNK = 0 + def get_lang(document): return document.lang diff --git a/collector/projects/ppc/mappings.py b/collector/projects/ppc/mappings.py index 897880f..587c568 100755 --- a/collector/projects/ppc/mappings.py +++ b/collector/projects/ppc/mappings.py @@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = { 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' } +MIN_SEGMENTS_BY_CHUNK = 0 + def get_lang(document): return document.lang diff --git a/collector/projects/ppc/models.py b/collector/projects/ppc/models.py index 2436a86..7c7088b 100755 --- a/collector/projects/ppc/models.py +++ b/collector/projects/ppc/models.py @@ -15,6 +15,13 @@ class Utterance(models.Model): def words_count(self): return len(self.text.split()) + def segments_count(self): + segments_count = 0 + for chunk in self.anno['chunks']: + for sent in chunk['sentences']: + segments_count += len(sent['tokens']) + return segments_count + class Meta: db_table = 'utterance' ordering = ['sequence'] diff --git a/collector/storage/models.py b/collector/storage/models.py index e3105c6..47de9ac 100755 --- a/collector/storage/models.py +++ b/collector/storage/models.py @@ -167,6 +167,15 @@ class Document(models.Model): words_count += chunk.words_count() return words_count + def pl_chunks_longer_than_min(self): + min_segments = importlib.import_module('projects.%s.mappings' % + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK + longer = [] + for chunk in self.chunks.filter(lang='pl'): + if chunk.anno and chunk.segments_count() > min_segments: + longer.append(chunk) + return longer + class Meta: db_table = 'document' ordering = ['id'] @@ -189,6 +198,15 @@ class Chunk(models.Model): words_count += utt.words_count() return words_count + def segments_count(self): + segments_count = 0 + for chunk in self.anno['chunks']: + for sent in chunk['sentences']: + segments_count += len(sent['tokens']) + for utt in self.utterances.all(): + segments_count += utt.segments_count() + return segments_count + class Meta: db_table = 'chunk' ordering = ['sequence'] diff --git a/collector/terminology/management/commands/match_iate_terms.py b/collector/terminology/management/commands/match_iate_terms.py index 8ce8bde..5544af2 100755 --- a/collector/terminology/management/commands/match_iate_terms.py +++ b/collector/terminology/management/commands/match_iate_terms.py @@ -29,9 +29,9 @@ class Command(BaseCommand): return if not options['prefix']: - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name') + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name') else: - docs = Document.objects.filter(pipeline__project__name=options['pipeline'], + docs = Document.objects.filter(pipeline__name=options['pipeline'], name__startswith=options['prefix']).order_by('name') iate.annotate(docs) diff --git a/collector/terminology/models.py b/collector/terminology/models.py index 33b1077..bbe2cfb 100755 --- a/collector/terminology/models.py +++ b/collector/terminology/models.py @@ -68,6 +68,13 @@ class IATETerm(models.Model): db_table = 'iate_term' ordering = ['tid'] + def eurovoc_terms(self): + eurovoc_ids = [] + for subject in self.subject_field.split(';'): + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False): + eurovoc_ids.append(evlabel.term.tid) + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus']) + def __str__(self): return ' | '.join([str(label) for label in self.labels.all()]) diff --git a/collector/writers/conllu.py b/collector/writers/conllu.py index 3735745..8887fcc 100755 --- a/collector/writers/conllu.py +++ b/collector/writers/conllu.py @@ -4,14 +4,14 @@ import os from natsort import natsorted -from projects.marcell.models import IATETerm, EuroVocTerm +from terminology.models import IATETerm, EuroVocTerm DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] def write(document): - if document.annotated(): + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): print('Writing %s in CONLLU format.' % document.name) cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS @@ -26,7 +26,7 @@ def write(document): def write_to_dir(document, export_dir_path): - if document.annotated(): + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): print('Writing %s in CONLLU format.' % document.name) cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS @@ -73,6 +73,10 @@ def _write_metadata(document, text_file): metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) metadata['url'] = document.get_source_url() + metadata['content_type'] = 'abstract' + if document.pl_chunks_longer_than_min(): + metadata['content_type'] = 'full_text' + if document.issue: metadata['publishing_company'] = document.issue.journal.publishing_company.name metadata['journal'] = document.issue.journal.title @@ -120,22 +124,7 @@ def _en(translations, pl_name): def _write_paragraphs(document, text_file, cols): ci = 1 - - title_anno = document.get_title_anno() - if title_anno: - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) - text_file.write('# newpar id = %s\n' % par_id) - _write_sentences(par_id, title_anno, text_file, cols) - ci += 1 - - abstract_anno = document.get_abstract_anno() - if abstract_anno: - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) - text_file.write('# newpar id = %s\n' % par_id) - _write_sentences(par_id, abstract_anno, text_file, cols) - ci += 1 - - if document.chunks_annotated(): + if document.pl_chunks_longer_than_min(): for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): if chunk.utterances.exists(): for ui, utt in enumerate(chunk.utterances.order_by('sequence')): @@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols): par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) text_file.write('# newpar id = %s\n' % par_id) _write_sentences(par_id, chunk.anno, text_file, cols) + else: + title_anno = document.get_title_anno() + if title_anno: + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) + text_file.write('# newpar id = %s\n' % par_id) + _write_sentences(par_id, title_anno, text_file, cols) + ci += 1 + + abstract_anno = document.get_abstract_anno() + if abstract_anno: + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) + text_file.write('# newpar id = %s\n' % par_id) + _write_sentences(par_id, abstract_anno, text_file, cols) + ci += 1 def _write_sentences(par_id, anno, text_file, cols): @@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate): def _get_local_longest_iate_terms(tokens, iate): local_terms = [] for term in iate: - for tok in tokens: - if tok['id'] in term['tokens']: - local_terms.append(term) - break + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \ + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts', + 'preferredTerm-admn-sts']: + for tok in tokens: + if tok['id'] in term['tokens']: + local_terms.append(term) + break return _get_longest_terms(local_terms) @@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate): ordered_term_tokens = natsorted(term['tokens']) if tok['id'] == ordered_term_tokens[0]: term_obj = IATETerm.objects.get(tid=term['id']) - domains = term_obj.eurovoc_terms.order_by('tid') + domains = term_obj.eurovoc_terms().order_by('tid') if domains.exists(): iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) else: -- libgit2 0.22.2