Commit 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3

Authored by Bartłomiej Nitoń
1 parent 95e06a45

Add base-form counting to the count_prasa_freq command.

collector/projects/nkjp/management/commands/count_prasa_orth_freq.py renamed to collector/projects/nkjp/management/commands/count_prasa_freq.py
@@ -9,6 +9,13 @@ class Command(BaseCommand): @@ -9,6 +9,13 @@ class Command(BaseCommand):
9 9
10 def add_arguments(self, parser): 10 def add_arguments(self, parser):
11 11
  12 + parser.add_argument('-f',
  13 + '--form',
  14 + action='store',
  15 + dest='form',
  16 + type=str,
  17 + help='form: base or orth')
  18 +
12 parser.add_argument('-o', 19 parser.add_argument('-o',
13 '--output', 20 '--output',
14 action='store', 21 action='store',
@@ -22,12 +29,20 @@ class Command(BaseCommand): @@ -22,12 +29,20 @@ class Command(BaseCommand):
22 print('Error: Output must be selected!') 29 print('Error: Output must be selected!')
23 return 30 return
24 31
25 - orths = {} 32 + if not options['form']:
  33 + print('Error: Form must be selected!')
  34 + return
  35 +
  36 + if options['form'] not in ['base', 'orth']:
  37 + print('Error: Selected form must be orth or base!')
  38 + return
  39 +
  40 + forms_freqs = {}
26 project = Project.objects.get(name='nkjp') 41 project = Project.objects.get(name='nkjp')
27 - self._count_frequency(project, orths)  
28 - self._write_freq_list(orths, options['output']) 42 + self._count_frequency(project, options['form'], forms_freqs)
  43 + self._write_freq_list(forms_freqs, options['output'])
29 44
30 - def _count_frequency(self, project, orths): 45 + def _count_frequency(self, project, form, forms_freqs):
31 print('Counting stats for %s:' % project.name) 46 print('Counting stats for %s:' % project.name)
32 for pipeline in project.pipelines.all(): 47 for pipeline in project.pipelines.all():
33 print('-- counting stats for %s:' % pipeline.name) 48 print('-- counting stats for %s:' % pipeline.name)
@@ -35,31 +50,31 @@ class Command(BaseCommand): @@ -35,31 +50,31 @@ class Command(BaseCommand):
35 if doc.metadata.filter(name='channel').exists() and \ 50 if doc.metadata.filter(name='channel').exists() and \
36 doc.metadata.get(name='channel').value.startswith('prasa_'): 51 doc.metadata.get(name='channel').value.startswith('prasa_'):
37 print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value)) 52 print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value))
38 - self._count_doc_segments(doc, orths) 53 + self._count_doc_segments(doc, form, forms_freqs)
39 54
40 - def _count_doc_segments(self, doc, orths): 55 + def _count_doc_segments(self, doc, form, forms_freqs):
41 for chunk in doc.chunks.order_by('sequence'): 56 for chunk in doc.chunks.order_by('sequence'):
42 if chunk.utterances.exists(): 57 if chunk.utterances.exists():
43 for utt in chunk.utterances.order_by('sequence'): 58 for utt in chunk.utterances.order_by('sequence'):
44 if utt.anno: 59 if utt.anno:
45 for sent in utt.anno['chunks'][0]['sentences']: 60 for sent in utt.anno['chunks'][0]['sentences']:
46 for tok in sent['tokens']: 61 for tok in sent['tokens']:
47 - orth = tok['orth']  
48 - if orth in orths:  
49 - orths[orth] += 1 62 + f = tok[form]
  63 + if f in forms_freqs:
  64 + forms_freqs[f] += 1
50 else: 65 else:
51 - orths[orth] = 1 66 + forms_freqs[f] = 1
52 else: 67 else:
53 if chunk.anno: 68 if chunk.anno:
54 for sent in chunk.anno['chunks'][0]['sentences']: 69 for sent in chunk.anno['chunks'][0]['sentences']:
55 for tok in sent['tokens']: 70 for tok in sent['tokens']:
56 - orth = tok['orth']  
57 - if orth in orths:  
58 - orths[orth] += 1 71 + f = tok[form]
  72 + if f in forms_freqs:
  73 + forms_freqs[f] += 1
59 else: 74 else:
60 - orths[orth] = 1 75 + forms_freqs[f] = 1
61 76
62 - def _write_freq_list(self, forms, freq_path): 77 + def _write_freq_list(self, forms_freqs, freq_path):
63 with open(freq_path, 'w') as freqfile: 78 with open(freq_path, 'w') as freqfile:
64 - for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True):  
65 - freqfile.write(f'{orth}\t{freq}\n') 79 + for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True):
  80 + freqfile.write(f'{form}\t{freq}\n')