Commit 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3

Authored by Bartłomiej Nitoń
1 parent 95e06a45

Add base-form counting to the count_prasa_freq command.

collector/projects/nkjp/management/commands/count_prasa_orth_freq.py renamed to collector/projects/nkjp/management/commands/count_prasa_freq.py
... ... @@ -9,6 +9,13 @@ class Command(BaseCommand):
9 9  
10 10 def add_arguments(self, parser):
11 11  
  12 + parser.add_argument('-f',
  13 + '--form',
  14 + action='store',
  15 + dest='form',
  16 + type=str,
  17 + help='form: base or orth')
  18 +
12 19 parser.add_argument('-o',
13 20 '--output',
14 21 action='store',
... ... @@ -22,12 +29,20 @@ class Command(BaseCommand):
22 29 print('Error: Output must be selected!')
23 30 return
24 31  
25   - orths = {}
  32 + if not options['form']:
  33 + print('Error: Form must be selected!')
  34 + return
  35 +
  36 + if options['form'] not in ['base', 'orth']:
  37 + print('Error: Selected form must be orth or base!')
  38 + return
  39 +
  40 + forms_freqs = {}
26 41 project = Project.objects.get(name='nkjp')
27   - self._count_frequency(project, orths)
28   - self._write_freq_list(orths, options['output'])
  42 + self._count_frequency(project, options['form'], forms_freqs)
  43 + self._write_freq_list(forms_freqs, options['output'])
29 44  
30   - def _count_frequency(self, project, orths):
  45 + def _count_frequency(self, project, form, forms_freqs):
31 46 print('Counting stats for %s:' % project.name)
32 47 for pipeline in project.pipelines.all():
33 48 print('-- counting stats for %s:' % pipeline.name)
... ... @@ -35,31 +50,31 @@ class Command(BaseCommand):
35 50 if doc.metadata.filter(name='channel').exists() and \
36 51 doc.metadata.get(name='channel').value.startswith('prasa_'):
37 52 print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value))
38   - self._count_doc_segments(doc, orths)
  53 + self._count_doc_segments(doc, form, forms_freqs)
39 54  
40   - def _count_doc_segments(self, doc, orths):
  55 + def _count_doc_segments(self, doc, form, forms_freqs):
41 56 for chunk in doc.chunks.order_by('sequence'):
42 57 if chunk.utterances.exists():
43 58 for utt in chunk.utterances.order_by('sequence'):
44 59 if utt.anno:
45 60 for sent in utt.anno['chunks'][0]['sentences']:
46 61 for tok in sent['tokens']:
47   - orth = tok['orth']
48   - if orth in orths:
49   - orths[orth] += 1
  62 + f = tok[form]
  63 + if f in forms_freqs:
  64 + forms_freqs[f] += 1
50 65 else:
51   - orths[orth] = 1
  66 + forms_freqs[f] = 1
52 67 else:
53 68 if chunk.anno:
54 69 for sent in chunk.anno['chunks'][0]['sentences']:
55 70 for tok in sent['tokens']:
56   - orth = tok['orth']
57   - if orth in orths:
58   - orths[orth] += 1
  71 + f = tok[form]
  72 + if f in forms_freqs:
  73 + forms_freqs[f] += 1
59 74 else:
60   - orths[orth] = 1
  75 + forms_freqs[f] = 1
61 76  
62   - def _write_freq_list(self, forms, freq_path):
  77 + def _write_freq_list(self, forms_freqs, freq_path):
63 78 with open(freq_path, 'w') as freqfile:
64   - for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True):
65   - freqfile.write(f'{orth}\t{freq}\n')
  79 + for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True):
  80 + freqfile.write(f'{form}\t{freq}\n')
... ...