# collector/projects/nkjp/management/commands/count_prasa_freq.py
#
# Provenance: added by commit 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3
# (Bartłomiej Nitoń <bartek.niton@gmail.com>, Wed, 19 Oct 2022 12:09:36 +0200,
# "Add base forms counting to count_prasa_freq command.").  That commit also
# deleted the orth-only command count_prasa_orth_freq.py; running this command
# with ``--form orth`` counts the same tokens that command counted.
import operator

from django.core.management.base import BaseCommand

from pipeline.models import Project


class Command(BaseCommand):
    """Write a token-form frequency list for the 'nkjp' project.

    Only indexed documents whose 'channel' metadata value starts with
    'prasa_' are counted.  The counted token field is chosen with ``--form``
    ('base' or 'orth') and the result is written to the ``--output`` path as
    tab-separated ``form<TAB>frequency`` lines, most frequent first.
    """

    def add_arguments(self, parser):
        """Register the ``--form`` and ``--output`` command-line options."""
        parser.add_argument('-f',
                            '--form',
                            action='store',
                            dest='form',
                            type=str,
                            help='form: base or orth')

        parser.add_argument('-o',
                            '--output',
                            action='store',
                            dest='output',
                            type=str,
                            help='output path')

    def handle(self, *args, **options):
        """Validate the options, count the frequencies and write the list.

        When an option is missing or ``--form`` is not 'base'/'orth', an
        error message is printed and the command returns without counting.
        """
        if not options['output']:
            print('Error: Output must be selected!')
            return

        if not options['form']:
            print('Error: Form must be selected!')
            return

        if options['form'] not in ['base', 'orth']:
            print('Error: Selected form must be orth or base!')
            return

        forms_freqs = {}
        project = Project.objects.get(name='nkjp')
        self._count_frequency(project, options['form'], forms_freqs)
        self._write_freq_list(forms_freqs, options['output'])

    def _count_frequency(self, project, form, forms_freqs):
        """Accumulate token counts for every matching document of *project*.

        :param project: object exposing ``name`` and ``pipelines``.
        :param form: token key to count ('base' or 'orth').
        :param forms_freqs: dict updated in place, mapping form -> count.
        """
        print('Counting stats for %s:' % project.name)
        for pipeline in project.pipelines.all():
            print('-- counting stats for %s:' % pipeline.name)
            for doc in pipeline.documents.filter(indexed=True):
                # Documents without 'channel' metadata are skipped.
                if not doc.metadata.filter(name='channel').exists():
                    continue
                # Fetch the channel once; it is needed both for the prefix
                # test and for the progress message.
                channel = doc.metadata.get(name='channel').value
                if not channel.startswith('prasa_'):
                    continue
                print('---- counting segments in %s (%s)' % (doc.id, channel))
                self._count_doc_segments(doc, form, forms_freqs)

    def _count_doc_segments(self, doc, form, forms_freqs):
        """Count the tokens of one document into *forms_freqs*.

        Chunks are visited in ``sequence`` order.  A chunk that has
        utterances is counted through its utterances' annotations (also in
        ``sequence`` order); otherwise the chunk's own annotation is used.
        Empty/missing annotations are skipped.
        """
        for chunk in doc.chunks.order_by('sequence'):
            if chunk.utterances.exists():
                for utt in chunk.utterances.order_by('sequence'):
                    if utt.anno:
                        self._count_anno_tokens(utt.anno, form, forms_freqs)
            elif chunk.anno:
                self._count_anno_tokens(chunk.anno, form, forms_freqs)

    @staticmethod
    def _count_anno_tokens(anno, form, forms_freqs):
        """Add every token's *form* value from one annotation to *forms_freqs*.

        Only the first entry of ``anno['chunks']`` is read: its 'sentences'
        list, and each sentence's 'tokens' list.
        """
        for sent in anno['chunks'][0]['sentences']:
            for tok in sent['tokens']:
                value = tok[form]
                forms_freqs[value] = forms_freqs.get(value, 0) + 1

    def _write_freq_list(self, forms_freqs, freq_path):
        """Write *forms_freqs* to *freq_path*, highest frequency first.

        Each line is ``form<TAB>frequency``.
        """
        by_freq_desc = sorted(forms_freqs.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        # Token forms are arbitrary text: pin the encoding instead of relying
        # on the platform default, which can raise UnicodeEncodeError (or
        # produce a non-UTF-8 file) on non-UTF-8 locales.
        with open(freq_path, 'w', encoding='utf-8') as freqfile:
            for form, freq in by_freq_desc:
                freqfile.write(f'{form}\t{freq}\n')