Commit 9a0f7741b70fd57fd3206c898cd5aacece5d69d5
1 parent
db9a9ecc
Add count_prasa_orth_freq command to nkjp project.
Showing
1 changed file
with
65 additions
and
0 deletions
collector/projects/nkjp/management/commands/count_prasa_orth_freq.py
0 → 100644
1 | +import operator | |
2 | + | |
3 | +from django.core.management.base import BaseCommand | |
4 | + | |
5 | +from pipeline.models import Project | |
6 | + | |
7 | + | |
class Command(BaseCommand):
    """Write a frequency list of token orths for the 'nkjp' project.

    Only documents whose 'channel' metadata value starts with 'prasa_' are
    counted. The result is written to the path given with ``-o/--output`` as
    tab-separated ``orth<TAB>frequency`` lines, most frequent first.
    """

    help = ("Count orth frequencies in 'prasa_*' documents of the nkjp "
            "project and write them to a file.")

    def add_arguments(self, parser):
        """Register the ``-o/--output`` option holding the result file path."""
        parser.add_argument('-o',
                            '--output',
                            action='store',
                            dest='output',
                            type=str,
                            help='output path')

    def handle(self, *args, **options):
        """Count orth frequencies and write them to ``options['output']``.

        Raises:
            CommandError: if no output path was given.
        """
        if not options['output']:
            # Local import: the name is needed only on this error path.
            from django.core.management.base import CommandError

            # Raising (instead of printing and returning) makes the command
            # exit with a failure status when it cannot do its job.
            raise CommandError('Output must be selected!')

        orths = {}
        project = Project.objects.get(name='nkjp')
        self._count_frequency(project, orths)
        self._write_freq_list(orths, options['output'])

    def _count_frequency(self, project, orths):
        """Accumulate orth counts from every matching document of *project*.

        Walks all pipelines of the project and, within each, the documents
        with ``image=False`` and ``broken_source=False``. A document is
        counted only when it has a 'channel' metadata entry whose value
        starts with 'prasa_'. Counts are added to *orths* in place.
        """
        self.stdout.write('Counting stats for %s:' % project.name)
        for pipeline in project.pipelines.all():
            self.stdout.write('-- counting stats for %s:' % pipeline.name)
            for doc in pipeline.documents.filter(image=False, broken_source=False):
                # One lookup per document instead of exists() + get() + get().
                # NOTE(review): with several 'channel' entries this takes the
                # first one rather than raising MultipleObjectsReturned.
                channel = doc.metadata.filter(name='channel').first()
                if channel is None or not channel.value.startswith('prasa_'):
                    continue
                self.stdout.write('---- counting segments in %s (%s)'
                                  % (doc.id, channel.value))
                self._count_doc_segments(doc, orths)

    def _count_doc_segments(self, doc, orths):
        """Add the orths of all tokens in *doc* to the *orths* counts.

        Chunks are visited in 'sequence' order. When a chunk has utterances,
        the annotation of each utterance is counted (also in 'sequence'
        order); otherwise the chunk's own annotation is counted.
        """
        for chunk in doc.chunks.order_by('sequence'):
            # Materialise once so emptiness check and iteration share a query.
            utterances = list(chunk.utterances.order_by('sequence'))
            if utterances:
                for utt in utterances:
                    self._count_anno_tokens(utt.anno, orths)
            else:
                self._count_anno_tokens(chunk.anno, orths)

    @staticmethod
    def _count_anno_tokens(anno, orths):
        """Increment *orths* for every token orth found in *anno*.

        A falsy *anno* (missing annotation) is skipped.
        """
        if not anno:
            return
        # NOTE(review): only the first entry of anno['chunks'] is read --
        # confirm annotations never hold more than one chunk.
        for sent in anno['chunks'][0]['sentences']:
            for tok in sent['tokens']:
                orth = tok['orth']
                orths[orth] = orths.get(orth, 0) + 1

    def _write_freq_list(self, forms, freq_path):
        """Write *forms* to *freq_path* as ``orth<TAB>freq`` lines.

        Lines are sorted by descending frequency; entries with equal
        frequency keep the order they have in *forms* (the sort is stable).
        """
        ranked = sorted(forms.items(), key=operator.itemgetter(1), reverse=True)
        # Explicit UTF-8 so non-ASCII orths are written the same way
        # regardless of the platform's default encoding.
        with open(freq_path, 'w', encoding='utf-8') as freqfile:
            freqfile.writelines(f'{orth}\t{freq}\n' for orth, freq in ranked)
... | ... |