Commit 9a0f7741b70fd57fd3206c898cd5aacece5d69d5

Authored by Bartłomiej Nitoń
1 parent db9a9ecc

Add count_prasa_orth_freq command to nkjp project.

collector/projects/nkjp/management/commands/count_prasa_orth_freq.py 0 → 100644
  1 +import operator
  2 +
  3 +from django.core.management.base import BaseCommand
  4 +
  5 +from pipeline.models import Project
  6 +
  7 +
class Command(BaseCommand):
    """Build an orth frequency list for selected documents of the 'nkjp' project.

    Walks every pipeline of the project named 'nkjp', keeps documents that are
    neither images nor marked as having a broken source and whose 'channel'
    metadata value starts with 'prasa_', counts every token's ``orth`` value
    found in their annotations, and writes the counts to the file given with
    ``-o/--output`` as ``<orth>\\t<count>`` lines.
    """

    help = ("Count token orth frequencies in 'nkjp' documents whose channel "
            "starts with 'prasa_' and write them to a tab-separated file.")

    def add_arguments(self, parser):
        """Register the ``-o/--output`` option that holds the result file path."""
        parser.add_argument('-o',
                            '--output',
                            action='store',
                            dest='output',
                            type=str,
                            help='output path')

    def handle(self, *args, **options):
        """Run the command: count orths in the 'nkjp' project and save them.

        Reports an error and returns without doing any work when no output
        path was supplied.
        """
        output_path = options['output']
        if not output_path:
            # Use the command's own stream so callers that redirect stderr
            # (e.g. via call_command) receive the message.
            self.stderr.write('Error: Output must be selected!')
            return

        orths = {}
        project = Project.objects.get(name='nkjp')
        self._count_frequency(project, orths)
        self._write_freq_list(orths, output_path)

    def _count_frequency(self, project, orths):
        """Accumulate orth counts from all matching documents of ``project``.

        ``orths`` is a dict mapping orth -> count and is updated in place.
        """
        self.stdout.write(f'Counting stats for {project.name}:')
        for pipeline in project.pipelines.all():
            self.stdout.write(f'-- counting stats for {pipeline.name}:')
            for doc in pipeline.documents.filter(image=False, broken_source=False):
                # One query instead of exists() + get() + get(). If a document
                # carries several 'channel' rows, the first one is used rather
                # than raising MultipleObjectsReturned.
                channel = doc.metadata.filter(name='channel').first()
                if channel is None or not channel.value.startswith('prasa_'):
                    continue
                self.stdout.write(
                    f'---- counting segments in {doc.id} ({channel.value})')
                self._count_doc_segments(doc, orths)

    def _count_doc_segments(self, doc, orths):
        """Add the orth of every annotated token in ``doc`` to ``orths``.

        For each chunk (in ``sequence`` order) the annotations of its
        utterances are used when the chunk has any utterances; otherwise the
        chunk's own annotation is used. Empty annotations are skipped. Only
        the first entry of an annotation's ``'chunks'`` list is read.
        """
        for chunk in doc.chunks.order_by('sequence'):
            # Fetch utterances once; the list doubles as the existence check.
            utterances = list(chunk.utterances.order_by('sequence'))
            if utterances:
                annotations = [utt.anno for utt in utterances]
            else:
                annotations = [chunk.anno]

            for anno in annotations:
                if not anno:
                    continue
                for sent in anno['chunks'][0]['sentences']:
                    for tok in sent['tokens']:
                        orth = tok['orth']
                        orths[orth] = orths.get(orth, 0) + 1

    def _write_freq_list(self, forms, freq_path):
        """Write ``forms`` (orth -> count) to ``freq_path``, most frequent first.

        Each line is ``<orth>\\t<count>``. Entries with equal counts keep the
        order in which they appear in ``forms`` (the sort is stable).
        """
        ranked = sorted(forms.items(), key=operator.itemgetter(1), reverse=True)
        # Explicit UTF-8: orths are arbitrary text, and the platform default
        # encoding may be unable to represent them.
        with open(freq_path, 'w', encoding='utf-8') as freqfile:
            freqfile.writelines(f'{orth}\t{freq}\n' for orth, freq in ranked)
... ...