Commit 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3
1 parent: 95e06a45
Add base forms counting to count_prasa_freq command.
Showing 1 changed file with 32 additions and 17 deletions
collector/projects/nkjp/management/commands/count_prasa_orth_freq.py renamed to collector/projects/nkjp/management/commands/count_prasa_freq.py
@@ -9,6 +9,13 @@ class Command(BaseCommand): | @@ -9,6 +9,13 @@ class Command(BaseCommand): | ||
9 | 9 | ||
10 | def add_arguments(self, parser): | 10 | def add_arguments(self, parser): |
11 | 11 | ||
12 | + parser.add_argument('-f', | ||
13 | + '--form', | ||
14 | + action='store', | ||
15 | + dest='form', | ||
16 | + type=str, | ||
17 | + help='form: base or orth') | ||
18 | + | ||
12 | parser.add_argument('-o', | 19 | parser.add_argument('-o', |
13 | '--output', | 20 | '--output', |
14 | action='store', | 21 | action='store', |
@@ -22,12 +29,20 @@ class Command(BaseCommand): | @@ -22,12 +29,20 @@ class Command(BaseCommand): | ||
22 | print('Error: Output must be selected!') | 29 | print('Error: Output must be selected!') |
23 | return | 30 | return |
24 | 31 | ||
25 | - orths = {} | 32 | + if not options['form']: |
33 | + print('Error: Form must be selected!') | ||
34 | + return | ||
35 | + | ||
36 | + if options['form'] not in ['base', 'orth']: | ||
37 | + print('Error: Selected form must be orth or base!') | ||
38 | + return | ||
39 | + | ||
40 | + forms_freqs = {} | ||
26 | project = Project.objects.get(name='nkjp') | 41 | project = Project.objects.get(name='nkjp') |
27 | - self._count_frequency(project, orths) | ||
28 | - self._write_freq_list(orths, options['output']) | 42 | + self._count_frequency(project, options['form'], forms_freqs) |
43 | + self._write_freq_list(forms_freqs, options['output']) | ||
29 | 44 | ||
30 | - def _count_frequency(self, project, orths): | 45 | + def _count_frequency(self, project, form, forms_freqs): |
31 | print('Counting stats for %s:' % project.name) | 46 | print('Counting stats for %s:' % project.name) |
32 | for pipeline in project.pipelines.all(): | 47 | for pipeline in project.pipelines.all(): |
33 | print('-- counting stats for %s:' % pipeline.name) | 48 | print('-- counting stats for %s:' % pipeline.name) |
@@ -35,31 +50,31 @@ class Command(BaseCommand): | @@ -35,31 +50,31 @@ class Command(BaseCommand): | ||
35 | if doc.metadata.filter(name='channel').exists() and \ | 50 | if doc.metadata.filter(name='channel').exists() and \ |
36 | doc.metadata.get(name='channel').value.startswith('prasa_'): | 51 | doc.metadata.get(name='channel').value.startswith('prasa_'): |
37 | print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value)) | 52 | print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value)) |
38 | - self._count_doc_segments(doc, orths) | 53 | + self._count_doc_segments(doc, form, forms_freqs) |
39 | 54 | ||
40 | - def _count_doc_segments(self, doc, orths): | 55 | + def _count_doc_segments(self, doc, form, forms_freqs): |
41 | for chunk in doc.chunks.order_by('sequence'): | 56 | for chunk in doc.chunks.order_by('sequence'): |
42 | if chunk.utterances.exists(): | 57 | if chunk.utterances.exists(): |
43 | for utt in chunk.utterances.order_by('sequence'): | 58 | for utt in chunk.utterances.order_by('sequence'): |
44 | if utt.anno: | 59 | if utt.anno: |
45 | for sent in utt.anno['chunks'][0]['sentences']: | 60 | for sent in utt.anno['chunks'][0]['sentences']: |
46 | for tok in sent['tokens']: | 61 | for tok in sent['tokens']: |
47 | - orth = tok['orth'] | ||
48 | - if orth in orths: | ||
49 | - orths[orth] += 1 | 62 | + f = tok[form] |
63 | + if f in forms_freqs: | ||
64 | + forms_freqs[f] += 1 | ||
50 | else: | 65 | else: |
51 | - orths[orth] = 1 | 66 | + forms_freqs[f] = 1 |
52 | else: | 67 | else: |
53 | if chunk.anno: | 68 | if chunk.anno: |
54 | for sent in chunk.anno['chunks'][0]['sentences']: | 69 | for sent in chunk.anno['chunks'][0]['sentences']: |
55 | for tok in sent['tokens']: | 70 | for tok in sent['tokens']: |
56 | - orth = tok['orth'] | ||
57 | - if orth in orths: | ||
58 | - orths[orth] += 1 | 71 | + f = tok[form] |
72 | + if f in forms_freqs: | ||
73 | + forms_freqs[f] += 1 | ||
59 | else: | 74 | else: |
60 | - orths[orth] = 1 | 75 | + forms_freqs[f] = 1 |
61 | 76 | ||
62 | - def _write_freq_list(self, forms, freq_path): | 77 | + def _write_freq_list(self, forms_freqs, freq_path): |
63 | with open(freq_path, 'w') as freqfile: | 78 | with open(freq_path, 'w') as freqfile: |
64 | - for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True): | ||
65 | - freqfile.write(f'{orth}\t{freq}\n') | 79 | + for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True): |
80 | + freqfile.write(f'{form}\t{freq}\n') |