# Patch summary (commentary only; text before the first "diff --git" header is
# not part of any hunk).
#
# The header names different old and new paths:
#   a/.../commands/count_prasa_orth_freq.py -> b/.../commands/count_prasa_freq.py
#
# Changes visible in the hunks:
#   * add_arguments: adds a '-f' / '--form' option (dest='form', type=str,
#     help 'form: base or orth') ahead of the existing '-o' / '--output' option.
#   * Code after the "Output must be selected" check (its enclosing def line is
#     outside the hunk): prints an error and returns when options['form'] is
#     falsy or is not one of 'base' / 'orth'; the accumulator dict is renamed
#     from `orths` to `forms_freqs`, and options['form'] is passed on to
#     _count_frequency.
#   * _count_frequency / _count_doc_segments: gain a `form` parameter; tokens
#     are now counted by tok[form] instead of the fixed tok['orth'] key.
#   * _write_freq_list: parameter renamed from `forms` to `forms_freqs`; it
#     still writes one "<form>\t<freq>" line per entry, sorted by frequency in
#     descending order.
#
# NOTE(review): the physical line breaks of this patch appear to have been
#   collapsed, so the hunk line counts (@@ -9,6 +9,13 @@ etc.) cannot be checked
#   from this view - confirm against the original patch before applying.
# NOTE(review): the two added validation branches could presumably be replaced
#   by argparse's `required=True` and `choices=['base', 'orth']` on '--form' -
#   confirm that exiting with a usage error is acceptable for this command.
# NOTE(review): the utterance branch and the chunk branch of
#   _count_doc_segments contain the same token-counting loop; a shared helper
#   or collections.Counter would remove the duplication.
# NOTE(review): validation failures are printed and the command returns
#   normally - confirm whether raising an error (non-zero exit) is preferred.
diff --git a/collector/projects/nkjp/management/commands/count_prasa_orth_freq.py b/collector/projects/nkjp/management/commands/count_prasa_freq.py index e91e023..93e7489 100644 --- a/collector/projects/nkjp/management/commands/count_prasa_orth_freq.py +++ b/collector/projects/nkjp/management/commands/count_prasa_freq.py @@ -9,6 +9,13 @@ class Command(BaseCommand): def add_arguments(self, parser): + parser.add_argument('-f', + '--form', + action='store', + dest='form', + type=str, + help='form: base or orth') + parser.add_argument('-o', '--output', action='store', @@ -22,12 +29,20 @@ class Command(BaseCommand): print('Error: Output must be selected!') return - orths = {} + if not options['form']: + print('Error: Form must be selected!') + return + + if options['form'] not in ['base', 'orth']: + print('Error: Selected form must be orth or base!') + return + + forms_freqs = {} project = Project.objects.get(name='nkjp') - self._count_frequency(project, orths) - self._write_freq_list(orths, options['output']) + self._count_frequency(project, options['form'], forms_freqs) + self._write_freq_list(forms_freqs, options['output']) - def _count_frequency(self, project, orths): + def _count_frequency(self, project, form, forms_freqs): print('Counting stats for %s:' % project.name) for pipeline in project.pipelines.all(): print('-- counting stats for %s:' % pipeline.name) @@ -35,31 +50,31 @@ class Command(BaseCommand): if doc.metadata.filter(name='channel').exists() and \ doc.metadata.get(name='channel').value.startswith('prasa_'): print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value)) - self._count_doc_segments(doc, orths) + self._count_doc_segments(doc, form, forms_freqs) - def _count_doc_segments(self, doc, orths): + def _count_doc_segments(self, doc, form, forms_freqs): for chunk in doc.chunks.order_by('sequence'): if chunk.utterances.exists(): for utt in chunk.utterances.order_by('sequence'): if utt.anno: for sent in 
utt.anno['chunks'][0]['sentences']: for tok in sent['tokens']: - orth = tok['orth'] - if orth in orths: - orths[orth] += 1 + f = tok[form] + if f in forms_freqs: + forms_freqs[f] += 1 else: - orths[orth] = 1 + forms_freqs[f] = 1 else: if chunk.anno: for sent in chunk.anno['chunks'][0]['sentences']: for tok in sent['tokens']: - orth = tok['orth'] - if orth in orths: - orths[orth] += 1 + f = tok[form] + if f in forms_freqs: + forms_freqs[f] += 1 else: - orths[orth] = 1 + forms_freqs[f] = 1 - def _write_freq_list(self, forms, freq_path): + def _write_freq_list(self, forms_freqs, freq_path): with open(freq_path, 'w') as freqfile: - for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True): - freqfile.write(f'{orth}\t{freq}\n') + for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True): + freqfile.write(f'{form}\t{freq}\n')