From 2082bcf0bfa6bc8318b4c6bbe97f5c273a5edce3 Mon Sep 17 00:00:00 2001
From: Bartłomiej Nitoń <bartek.niton@gmail.com>
Date: Wed, 19 Oct 2022 12:09:36 +0200
Subject: [PATCH] Add base forms counting to count_prasa_freq command.

---
 collector/projects/nkjp/management/commands/count_prasa_freq.py      | 80 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 collector/projects/nkjp/management/commands/count_prasa_orth_freq.py | 65 -----------------------------------------------------------------
 2 files changed, 80 insertions(+), 65 deletions(-)
 create mode 100644 collector/projects/nkjp/management/commands/count_prasa_freq.py
 delete mode 100644 collector/projects/nkjp/management/commands/count_prasa_orth_freq.py

diff --git a/collector/projects/nkjp/management/commands/count_prasa_freq.py b/collector/projects/nkjp/management/commands/count_prasa_freq.py
new file mode 100644
index 0000000..93e7489
--- /dev/null
+++ b/collector/projects/nkjp/management/commands/count_prasa_freq.py
@@ -0,0 +1,80 @@
+import operator
+
+from django.core.management.base import BaseCommand
+
+from pipeline.models import Project
+
+
+class Command(BaseCommand):
+    """Write a frequency list of base or orth token forms from 'prasa_' channel documents."""
+
+    def add_arguments(self, parser):
+        """Register the --form (base or orth) and --output (file path) options."""
+        parser.add_argument('-f',
+                            '--form',
+                            action='store',
+                            dest='form',
+                            type=str,
+                            help='form: base or orth')
+
+        parser.add_argument('-o',
+                            '--output',
+                            action='store',
+                            dest='output',
+                            type=str,
+                            help='output path')
+
+    def handle(self, *args, **options):
+        """Validate the options, count forms in the 'nkjp' project and write the list."""
+        if not options['output']:
+            print('Error: Output must be selected!')
+            return
+
+        if not options['form']:
+            print('Error: Form must be selected!')
+            return
+
+        if options['form'] not in ['base', 'orth']:
+            print('Error: Selected form must be orth or base!')
+            return
+
+        forms_freqs = {}
+        project = Project.objects.get(name='nkjp')
+        self._count_frequency(project, options['form'], forms_freqs)
+        self._write_freq_list(forms_freqs, options['output'])
+
+    def _count_frequency(self, project, form, forms_freqs):
+        """Add to forms_freqs the counts from indexed documents whose channel starts with 'prasa_'."""
+        print('Counting stats for %s:' % project.name)
+        for pipeline in project.pipelines.all():
+            print('-- counting stats for %s:' % pipeline.name)
+            for doc in pipeline.documents.filter(indexed=True):
+                # One query per document; the first 'channel' row is used if several exist.
+                channel = doc.metadata.filter(name='channel').first()
+                if channel is not None and channel.value.startswith('prasa_'):
+                    print('---- counting segments in %s (%s)' % (doc.id, channel.value))
+                    self._count_doc_segments(doc, form, forms_freqs)
+
+    def _count_doc_segments(self, doc, form, forms_freqs):
+        """Count forms per chunk: from its utterances if it has any, else from the chunk itself."""
+        for chunk in doc.chunks.order_by('sequence'):
+            if chunk.utterances.exists():
+                for utt in chunk.utterances.order_by('sequence'):
+                    self._count_anno_tokens(utt.anno, form, forms_freqs)
+            else:
+                self._count_anno_tokens(chunk.anno, form, forms_freqs)
+
+    def _count_anno_tokens(self, anno, form, forms_freqs):
+        """Increment forms_freqs[tok[form]] for each token in anno['chunks'][0]; skip falsy anno."""
+        if not anno:
+            return
+        for sent in anno['chunks'][0]['sentences']:
+            for tok in sent['tokens']:
+                forms_freqs[tok[form]] = forms_freqs.get(tok[form], 0) + 1
+
+    def _write_freq_list(self, forms_freqs, freq_path):
+        """Write one 'form<TAB>freq' line per form to freq_path, highest frequency first."""
+        # Explicit UTF-8: the locale's default encoding may be unable to encode every form.
+        with open(freq_path, 'w', encoding='utf-8') as freqfile:
+            for form, freq in sorted(forms_freqs.items(), key=operator.itemgetter(1), reverse=True):
+                freqfile.write(f'{form}\t{freq}\n')
diff --git a/collector/projects/nkjp/management/commands/count_prasa_orth_freq.py b/collector/projects/nkjp/management/commands/count_prasa_orth_freq.py
deleted file mode 100644
index e91e023..0000000
--- a/collector/projects/nkjp/management/commands/count_prasa_orth_freq.py
+++ /dev/null
@@ -1,65 +0,0 @@
-import operator
-
-from django.core.management.base import BaseCommand
-
-from pipeline.models import Project
-
-
-class Command(BaseCommand):
-
-    def add_arguments(self, parser):
-
-        parser.add_argument('-o',
-                            '--output',
-                            action='store',
-                            dest='output',
-                            type=str,
-                            help='output path')
-
-    def handle(self, *args, **options):
-
-        if not options['output']:
-            print('Error: Output must be selected!')
-            return
-
-        orths = {}
-        project = Project.objects.get(name='nkjp')
-        self._count_frequency(project, orths)
-        self._write_freq_list(orths, options['output'])
-
-    def _count_frequency(self, project, orths):
-        print('Counting stats for %s:' % project.name)
-        for pipeline in project.pipelines.all():
-            print('-- counting stats for %s:' % pipeline.name)
-            for doc in pipeline.documents.filter(indexed=True):
-                if doc.metadata.filter(name='channel').exists() and \
-                        doc.metadata.get(name='channel').value.startswith('prasa_'):
-                    print('---- counting segments in %s (%s)' % (doc.id, doc.metadata.get(name='channel').value))
-                    self._count_doc_segments(doc, orths)
-
-    def _count_doc_segments(self, doc, orths):
-        for chunk in doc.chunks.order_by('sequence'):
-            if chunk.utterances.exists():
-                for utt in chunk.utterances.order_by('sequence'):
-                    if utt.anno:
-                        for sent in utt.anno['chunks'][0]['sentences']:
-                            for tok in sent['tokens']:
-                                orth = tok['orth']
-                                if orth in orths:
-                                    orths[orth] += 1
-                                else:
-                                    orths[orth] = 1
-            else:
-                if chunk.anno:
-                    for sent in chunk.anno['chunks'][0]['sentences']:
-                        for tok in sent['tokens']:
-                            orth = tok['orth']
-                            if orth in orths:
-                                orths[orth] += 1
-                            else:
-                                orths[orth] = 1
-
-    def _write_freq_list(self, forms, freq_path):
-        with open(freq_path, 'w') as freqfile:
-            for orth, freq in sorted(forms.items(), key=operator.itemgetter(1), reverse=True):
-                freqfile.write(f'{orth}\t{freq}\n')
--
libgit2 0.22.2