Commit 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3
1 parent: 95e06a45
Add base forms counting to count_prasa_freq command.
Showing 1 changed file with 32 additions and 17 deletions.
collector/projects/nkjp/management/commands/count_prasa_orth_freq.py renamed to collector/projects/nkjp/management/commands/count_prasa_freq.py
... | ... | @@ -9,6 +9,13 @@ class Command(BaseCommand): |
9 | 9 | |
10 | 10 | def add_arguments(self, parser): |
11 | 11 | |
12 | + parser.add_argument('-f', | |
13 | + '--form', | |
14 | + action='store', | |
15 | + dest='form', | |
16 | + type=str, | |
17 | + help='form: base or orth') | |
18 | + | |
12 | 19 | parser.add_argument('-o', |
13 | 20 | '--output', |
14 | 21 | action='store', |
... | ... | @@ -22,12 +29,20 @@ class Command(BaseCommand): |
22 | 29 | print('Error: Output must be selected!') |
23 | 30 | return |
24 | 31 | |
25 | - orths = {} | |
32 | + if not options['form']: | |
33 | + print('Error: Form must be selected!') | |
34 | + return | |
35 | + | |
36 | + if options['form'] not in ['base', 'orth']: | |
37 | + print('Error: Selected form must be orth or base!') | |
38 | + return | |
39 | + | |
40 | + forms_freqs = {} | |
26 | 41 | project = Project.objects.get(name='nkjp') |
27 | - self._count_frequency(project, orths) | |
28 | - self._write_freq_list(orths, options['output']) | |
42 | + self._count_frequency(project, options['form'], forms_freqs) | |
43 | + self._write_freq_list(forms_freqs, options['output']) | |
29 | 44 | |
30 | - def _count_frequency(self, project, orths): | |
45 | + def _count_frequency(self, project, form, forms_freqs): | |
31 | 46 | print('Counting stats for %s:' % project.name) |
32 | 47 | for pipeline in project.pipelines.all(): |
33 | 48 | print('-- counting stats for %s:' % pipeline.name) |
... | ... | @@ -35,31 +50,31 @@ class Command(BaseCommand): |
35 | 50 | if doc.metadata.filter(name='channel').exists() and \ |
36 | 51 | doc.metadata.get(name='channel').value.startswith('prasa_'): |
37 | 52 | print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value)) |
38 | - self._count_doc_segments(doc, orths) | |
53 | + self._count_doc_segments(doc, form, forms_freqs) | |
39 | 54 | |
40 | - def _count_doc_segments(self, doc, orths): | |
55 | + def _count_doc_segments(self, doc, form, forms_freqs): | |
41 | 56 | for chunk in doc.chunks.order_by('sequence'): |
42 | 57 | if chunk.utterances.exists(): |
43 | 58 | for utt in chunk.utterances.order_by('sequence'): |
44 | 59 | if utt.anno: |
45 | 60 | for sent in utt.anno['chunks'][0]['sentences']: |
46 | 61 | for tok in sent['tokens']: |
47 | - orth = tok['orth'] | |
48 | - if orth in orths: | |
49 | - orths[orth] += 1 | |
62 | + f = tok[form] | |
63 | + if f in forms_freqs: | |
64 | + forms_freqs[f] += 1 | |
50 | 65 | else: |
51 | - orths[orth] = 1 | |
66 | + forms_freqs[f] = 1 | |
52 | 67 | else: |
53 | 68 | if chunk.anno: |
54 | 69 | for sent in chunk.anno['chunks'][0]['sentences']: |
55 | 70 | for tok in sent['tokens']: |
56 | - orth = tok['orth'] | |
57 | - if orth in orths: | |
58 | - orths[orth] += 1 | |
71 | + f = tok[form] | |
72 | + if f in forms_freqs: | |
73 | + forms_freqs[f] += 1 | |
59 | 74 | else: |
60 | - orths[orth] = 1 | |
75 | + forms_freqs[f] = 1 | |
61 | 76 | |
62 | - def _write_freq_list(self, forms, freq_path): | |
77 | + def _write_freq_list(self, forms_freqs, freq_path): | |
63 | 78 | with open(freq_path, 'w') as freqfile: |
64 | - for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True): | |
65 | - freqfile.write(f'{orth}\t{freq}\n') | |
79 | + for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True): | |
80 | + freqfile.write(f'{form}\t{freq}\n') | |
... | ... |