Commit 85f97aaca15c0c8d03a6034f46318665d6b62394 (1 parent: 8e747a23)
Change conllu writer to match CURLICAT project needs.
Showing 9 changed files with 75 additions and 32 deletions.
collector/projects/curlicat/management/commands/export_curlicat.py
... | ... | @@ -52,8 +52,7 @@ class Command(BaseCommand): |
52 | 52 | os.makedirs(options['output'], exist_ok=True) |
53 | 53 | for doc in documents: |
54 | 54 | conllu_path = os.path.join(doc.path, 'text.conllup') |
55 | - if not os.path.isfile(conllu_path) or not doc.annotated(): | |
56 | - continue | |
57 | - print('Exporting %s.' % doc.name) | |
58 | - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | |
59 | - shutil.copyfile(conllu_path, dest_path) | |
55 | + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()): | |
56 | + print('Exporting %s.' % doc.name) | |
57 | + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | |
58 | + shutil.copyfile(conllu_path, dest_path) | |
... | ... |
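The export filter is inverted here from an early-exit `continue` into a positive condition. A minimal sketch of the new predicate, using only names that appear in the diff (the standalone helper itself is illustrative, not part of the commit):

```python
import os

def should_export(doc, conllu_path):
    # Export only when the CoNLL-U-Plus file exists on disk and the document has
    # either an annotated abstract or at least one Polish chunk above the
    # project's segment threshold (pl_chunks_longer_than_min, added below).
    return os.path.isfile(conllu_path) and (
        doc.get_abstract_anno() or doc.pl_chunks_longer_than_min())
```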
collector/projects/curlicat/mappings.py
1 | 1 | from urllib.parse import urljoin |
2 | 2 | |
3 | 3 | CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', |
4 | - 'CURLICAT:NE', 'CURLICAT:NP'] | |
4 | + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE'] | |
5 | 5 | |
6 | 6 | DOC_TYPES = [] |
7 | 7 | |
... | ... | @@ -35,6 +35,8 @@ PUBLISHERS = {} |
35 | 35 | |
36 | 36 | PUBLISHERS2ABBREV = {} |
37 | 37 | |
38 | +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21 | |
39 | + | |
38 | 40 | |
39 | 41 | def get_abstract_anno(document): |
40 | 42 | for meta in document.metadata.all(): |
... | ... |
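Two things change in the CURLICAT mappings: a CURLICAT:IATE column is appended to the CoNLL-U-Plus header, and a per-project MIN_SEGMENTS_BY_CHUNK threshold is introduced. A sketch of how the threshold is resolved dynamically, mirroring the importlib lookup used in storage/models.py later in this diff (the `project_name` variable is illustrative):

```python
import importlib

# Each project ships its own value: 100 for curlicat, 0 for ppc (see below).
project_name = 'curlicat'
mappings = importlib.import_module('projects.%s.mappings' % project_name)
min_segments = mappings.MIN_SEGMENTS_BY_CHUNK
```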
collector/projects/marcell/mappings.py
collector/projects/ppc/mappings.py
... | ... | @@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = { |
499 | 499 | 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' |
500 | 500 | } |
501 | 501 | |
502 | +MIN_SEGMENTS_BY_CHUNK = 0 | |
503 | + | |
502 | 504 | |
503 | 505 | def get_lang(document): |
504 | 506 | return document.lang |
... | ... |
collector/projects/ppc/models.py
... | ... | @@ -15,6 +15,13 @@ class Utterance(models.Model): |
15 | 15 | def words_count(self): |
16 | 16 | return len(self.text.split()) |
17 | 17 | |
18 | + def segments_count(self): | |
19 | + segments_count = 0 | |
20 | + for chunk in self.anno['chunks']: | |
21 | + for sent in chunk['sentences']: | |
22 | + segments_count += len(sent['tokens']) | |
23 | + return segments_count | |
24 | + | |
18 | 25 | class Meta: |
19 | 26 | db_table = 'utterance' |
20 | 27 | ordering = ['sequence'] |
... | ... |
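Utterance.segments_count() walks the annotation JSON chunk by chunk and sentence by sentence, summing token counts. A rough picture of the structure it expects, inferred from the loop in the hunk above (only the keys 'chunks', 'sentences' and 'tokens' come from the diff; the values are invented):

```python
# Assumed shape of Utterance.anno; illustrative content only.
anno = {
    'chunks': [
        {'sentences': [
            {'tokens': [{'id': 's1.1', 'form': 'Przykladowe'},
                        {'id': 's1.2', 'form': 'zdanie'},
                        {'id': 's1.3', 'form': '.'}]},
        ]},
    ],
}
# For this annotation, segments_count() would return 3.
```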
collector/storage/models.py
... | ... | @@ -167,6 +167,15 @@ class Document(models.Model): |
167 | 167 | words_count += chunk.words_count() |
168 | 168 | return words_count |
169 | 169 | |
170 | + def pl_chunks_longer_than_min(self): | |
171 | + min_segments = importlib.import_module('projects.%s.mappings' % | |
172 | + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK | |
173 | + longer = [] | |
174 | + for chunk in self.chunks.filter(lang='pl'): | |
175 | + if chunk.anno and chunk.segments_count() > min_segments: | |
176 | + longer.append(chunk) | |
177 | + return longer | |
178 | + | |
170 | 179 | class Meta: |
171 | 180 | db_table = 'document' |
172 | 181 | ordering = ['id'] |
... | ... | @@ -189,6 +198,15 @@ class Chunk(models.Model): |
189 | 198 | words_count += utt.words_count() |
190 | 199 | return words_count |
191 | 200 | |
201 | + def segments_count(self): | |
202 | + segments_count = 0 | |
203 | + for chunk in self.anno['chunks']: | |
204 | + for sent in chunk['sentences']: | |
205 | + segments_count += len(sent['tokens']) | |
206 | + for utt in self.utterances.all(): | |
207 | + segments_count += utt.segments_count() | |
208 | + return segments_count | |
209 | + | |
192 | 210 | class Meta: |
193 | 211 | db_table = 'chunk' |
194 | 212 | ordering = ['sequence'] |
... | ... |
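Document.pl_chunks_longer_than_min() returns the list of Polish chunks whose segment count exceeds the project threshold, and Chunk.segments_count() combines tokens found directly in the chunk annotation with those of its utterances. A minimal sketch of how the new helper is consumed elsewhere in this commit (`doc` stands for an arbitrary Document instance):

```python
# The returned list doubles as a boolean gate in the writer and the exporter.
long_pl_chunks = doc.pl_chunks_longer_than_min()  # list of Chunk objects, possibly empty
if long_pl_chunks:
    content_type = 'full_text'   # _write_metadata() in writers/conllu.py
else:
    content_type = 'abstract'    # only title/abstract paragraphs are written
```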
collector/terminology/management/commands/match_iate_terms.py
... | ... | @@ -29,9 +29,9 @@ class Command(BaseCommand): |
29 | 29 | return |
30 | 30 | |
31 | 31 | if not options['prefix']: |
32 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name') | |
32 | + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name') | |
33 | 33 | else: |
34 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline'], | |
34 | + docs = Document.objects.filter(pipeline__name=options['pipeline'], | |
35 | 35 | name__startswith=options['prefix']).order_by('name') |
36 | 36 | |
37 | 37 | iate.annotate(docs) |
... | ... |
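The document lookup now filters on the pipeline's own name instead of the project name. Assuming standard Django ORM relation lookups, the two queries in the hunk differ as follows (both lines are copied from the old and new versions of the command):

```python
# Old: every document whose pipeline belongs to a project with the given name.
docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name')

# New: only documents whose pipeline itself carries that name.
docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name')
```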
collector/terminology/models.py
... | ... | @@ -68,6 +68,13 @@ class IATETerm(models.Model): |
68 | 68 | db_table = 'iate_term' |
69 | 69 | ordering = ['tid'] |
70 | 70 | |
71 | + def eurovoc_terms(self): | |
72 | + eurovoc_ids = [] | |
73 | + for subject in self.subject_field.split(';'): | |
74 | + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False): | |
75 | + eurovoc_ids.append(evlabel.term.tid) | |
76 | + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus']) | |
77 | + | |
71 | 78 | def __str__(self): |
72 | 79 | return ' | '.join([str(label) for label in self.labels.all()]) |
73 | 80 | |
... | ... |
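IATETerm.eurovoc_terms() maps the semicolon-separated subject_field of an IATE entry onto EuroVoc domain/thesaurus terms via their English labels. An illustrative walk-through; the tid and subject_field values are made up, while the model and field names come from the diff:

```python
# Hypothetical entry; used_for=False restricts matching to descriptor labels,
# not "used for" aliases, and only 'domain'/'thesaurus' EuroVoc terms are kept.
term = IATETerm(tid='1443829', subject_field='finance; EU institution')
domains = term.eurovoc_terms()  # queryset of matching EuroVocTerm objects
```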
collector/writers/conllu.py
... | ... | @@ -4,14 +4,14 @@ import os |
4 | 4 | |
5 | 5 | from natsort import natsorted |
6 | 6 | |
7 | -from projects.marcell.models import IATETerm, EuroVocTerm | |
7 | +from terminology.models import IATETerm, EuroVocTerm | |
8 | 8 | |
9 | 9 | |
10 | 10 | DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] |
11 | 11 | |
12 | 12 | |
13 | 13 | def write(document): |
14 | - if document.annotated(): | |
14 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): | |
15 | 15 | print('Writing %s in CONLLU format.' % document.name) |
16 | 16 | |
17 | 17 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
... | ... | @@ -26,7 +26,7 @@ def write(document): |
26 | 26 | |
27 | 27 | |
28 | 28 | def write_to_dir(document, export_dir_path): |
29 | - if document.annotated(): | |
29 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): | |
30 | 30 | print('Writing %s in CONLLU format.' % document.name) |
31 | 31 | |
32 | 32 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
... | ... | @@ -73,6 +73,10 @@ def _write_metadata(document, text_file): |
73 | 73 | metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) |
74 | 74 | metadata['url'] = document.get_source_url() |
75 | 75 | |
76 | + metadata['content_type'] = 'abstract' | |
77 | + if document.pl_chunks_longer_than_min(): | |
78 | + metadata['content_type'] = 'full_text' | |
79 | + | |
76 | 80 | if document.issue: |
77 | 81 | metadata['publishing_company'] = document.issue.journal.publishing_company.name |
78 | 82 | metadata['journal'] = document.issue.journal.title |
... | ... | @@ -120,22 +124,7 @@ def _en(translations, pl_name): |
120 | 124 | |
121 | 125 | def _write_paragraphs(document, text_file, cols): |
122 | 126 | ci = 1 |
123 | - | |
124 | - title_anno = document.get_title_anno() | |
125 | - if title_anno: | |
126 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
127 | - text_file.write('# newpar id = %s\n' % par_id) | |
128 | - _write_sentences(par_id, title_anno, text_file, cols) | |
129 | - ci += 1 | |
130 | - | |
131 | - abstract_anno = document.get_abstract_anno() | |
132 | - if abstract_anno: | |
133 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
134 | - text_file.write('# newpar id = %s\n' % par_id) | |
135 | - _write_sentences(par_id, abstract_anno, text_file, cols) | |
136 | - ci += 1 | |
137 | - | |
138 | - if document.chunks_annotated(): | |
127 | + if document.pl_chunks_longer_than_min(): | |
139 | 128 | for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): |
140 | 129 | if chunk.utterances.exists(): |
141 | 130 | for ui, utt in enumerate(chunk.utterances.order_by('sequence')): |
... | ... | @@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols): |
147 | 136 | par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) |
148 | 137 | text_file.write('# newpar id = %s\n' % par_id) |
149 | 138 | _write_sentences(par_id, chunk.anno, text_file, cols) |
139 | + else: | |
140 | + title_anno = document.get_title_anno() | |
141 | + if title_anno: | |
142 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
143 | + text_file.write('# newpar id = %s\n' % par_id) | |
144 | + _write_sentences(par_id, title_anno, text_file, cols) | |
145 | + ci += 1 | |
146 | + | |
147 | + abstract_anno = document.get_abstract_anno() | |
148 | + if abstract_anno: | |
149 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
150 | + text_file.write('# newpar id = %s\n' % par_id) | |
151 | + _write_sentences(par_id, abstract_anno, text_file, cols) | |
152 | + ci += 1 | |
150 | 153 | |
151 | 154 | |
152 | 155 | def _write_sentences(par_id, anno, text_file, cols): |
... | ... | @@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate): |
469 | 472 | def _get_local_longest_iate_terms(tokens, iate): |
470 | 473 | local_terms = [] |
471 | 474 | for term in iate: |
472 | - for tok in tokens: | |
473 | - if tok['id'] in term['tokens']: | |
474 | - local_terms.append(term) | |
475 | - break | |
475 | + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \ | |
476 | + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts', | |
477 | + 'preferredTerm-admn-sts']: | |
478 | + for tok in tokens: | |
479 | + if tok['id'] in term['tokens']: | |
480 | + local_terms.append(term) | |
481 | + break | |
476 | 482 | return _get_longest_terms(local_terms) |
477 | 483 | |
478 | 484 | |
... | ... | @@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate): |
485 | 491 | ordered_term_tokens = natsorted(term['tokens']) |
486 | 492 | if tok['id'] == ordered_term_tokens[0]: |
487 | 493 | term_obj = IATETerm.objects.get(tid=term['id']) |
488 | - domains = term_obj.eurovoc_terms.order_by('tid') | |
494 | + domains = term_obj.eurovoc_terms().order_by('tid') | |
489 | 495 | if domains.exists(): |
490 | 496 | iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) |
491 | 497 | else: |
... | ... |
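In writers/conllu.py the import of IATETerm/EuroVocTerm moves to the new terminology app, full-text chunks take precedence over title/abstract paragraphs, eurovoc_terms is now called as a method, and _get_local_longest_iate_terms only keeps sufficiently reliable terms. A sketch of that filter as a standalone predicate; the accepted types, the reliability cut-off and the status whitelist are copied from the diff, the helper itself is illustrative:

```python
ACCEPTED_TYPES = ('fullForm', 'phrase', 'shortForm', 'formula')
ACCEPTED_STATUSES = ('', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts',
                     'preferredTerm-admn-sts')

def is_acceptable(term):
    # term is one entry of the per-sentence IATE annotation dict used above.
    return (term['type'] in ACCEPTED_TYPES
            and term['reliability_code'] > 6
            and term['administrative_status'] in ACCEPTED_STATUSES)
```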