Commit 85f97aaca15c0c8d03a6034f46318665d6b62394

Authored by Bartłomiej Nitoń
1 parent 8e747a23

Change the conllu writer to match the CURLICAT project's needs.

collector/projects/curlicat/management/commands/export_curlicat.py
... ... @@ -52,8 +52,7 @@ class Command(BaseCommand):
52 52 os.makedirs(options['output'], exist_ok=True)
53 53 for doc in documents:
54 54 conllu_path = os.path.join(doc.path, 'text.conllup')
55   - if not os.path.isfile(conllu_path) or not doc.annotated():
56   - continue
57   - print('Exporting %s.' % doc.name)
58   - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name))
59   - shutil.copyfile(conllu_path, dest_path)
  55 + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()):
  56 + print('Exporting %s.' % doc.name)
  57 + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name))
  58 + shutil.copyfile(conllu_path, dest_path)
... ...
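The export gate changes here: previously a document only had to pass doc.annotated(); now its text.conllup is copied when the file exists on disk and the document carries either an annotated abstract or Polish chunks above the new per-project segment threshold. A minimal sketch of the new predicate in isolation (export_qualifies is a hypothetical helper name; the real checks live on the Document model shown further down):

    import os

    def export_qualifies(doc):
        # The file must exist on disk, and the document must carry either an
        # annotated abstract or at least one Polish chunk above the project's
        # segment threshold (doc.annotated() is no longer consulted).
        conllu_path = os.path.join(doc.path, 'text.conllup')
        return os.path.isfile(conllu_path) and \
            bool(doc.get_abstract_anno() or doc.pl_chunks_longer_than_min())
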
collector/projects/curlicat/mappings.py
1 1 from urllib.parse import urljoin
2 2  
3 3 CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
4   - 'CURLICAT:NE', 'CURLICAT:NP']
  4 + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE']
5 5  
6 6 DOC_TYPES = []
7 7  
... ... @@ -35,6 +35,8 @@ PUBLISHERS = {}
35 35  
36 36 PUBLISHERS2ABBREV = {}
37 37  
  38 +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21
  39 +
38 40  
39 41 def get_abstract_anno(document):
40 42 for meta in document.metadata.all():
... ...
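Each project's mappings module now exposes MIN_SEGMENTS_BY_CHUNK: 100 for curlicat, 0 for marcell and ppc (see below), so only CURLICAT actually filters chunks by length. The constant is resolved per project at runtime, mirroring the importlib lookup inside Document.pl_chunks_longer_than_min(); a sketch, with min_segments_for as a hypothetical helper:

    import importlib

    def min_segments_for(project_name):
        # 'curlicat' -> 100; 'marcell' and 'ppc' -> 0 (no length filtering).
        mappings = importlib.import_module('projects.%s.mappings' % project_name)
        return mappings.MIN_SEGMENTS_BY_CHUNK
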
collector/projects/marcell/mappings.py
... ... @@ -74,6 +74,8 @@ PUBLISHERS = {'WDU': 'Dziennik Ustaw',
74 74 PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU',
75 75 'Monitor Polski': 'WMP'}
76 76  
  77 +MIN_SEGMENTS_BY_CHUNK = 0
  78 +
77 79  
78 80 def get_lang(document):
79 81 return document.lang
... ...
collector/projects/ppc/mappings.py
... ... @@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = {
499 499 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan'
500 500 }
501 501  
  502 +MIN_SEGMENTS_BY_CHUNK = 0
  503 +
502 504  
503 505 def get_lang(document):
504 506 return document.lang
... ...
collector/projects/ppc/models.py
... ... @@ -15,6 +15,13 @@ class Utterance(models.Model):
15 15 def words_count(self):
16 16 return len(self.text.split())
17 17  
  18 + def segments_count(self):
  19 + segments_count = 0
  20 + for chunk in self.anno['chunks']:
  21 + for sent in chunk['sentences']:
  22 + segments_count += len(sent['tokens'])
  23 + return segments_count
  24 +
18 25 class Meta:
19 26 db_table = 'utterance'
20 27 ordering = ['sequence']
... ...
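segments_count() walks the utterance's annotation payload: a dict of chunks, each holding sentences, each holding token lists. A hypothetical payload illustrating the assumed shape and the resulting count (the token dicts' contents are made up; only their number matters here):

    anno = {'chunks': [
        {'sentences': [
            {'tokens': [{'form': 'Wysoka'}, {'form': 'Izbo'}, {'form': '!'}]},
            {'tokens': [{'form': 'Dziękuję'}, {'form': '.'}]},
        ]},
    ]}
    total = sum(len(sent['tokens'])
                for chunk in anno['chunks']
                for sent in chunk['sentences'])
    assert total == 5  # 3 tokens in the first sentence, 2 in the second
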
collector/storage/models.py
... ... @@ -167,6 +167,15 @@ class Document(models.Model):
167 167 words_count += chunk.words_count()
168 168 return words_count
169 169  
  170 + def pl_chunks_longer_than_min(self):
  171 + min_segments = importlib.import_module('projects.%s.mappings' %
  172 + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK
  173 + longer = []
  174 + for chunk in self.chunks.filter(lang='pl'):
  175 + if chunk.anno and chunk.segments_count() > min_segments:
  176 + longer.append(chunk)
  177 + return longer
  178 +
170 179 class Meta:
171 180 db_table = 'document'
172 181 ordering = ['id']
... ... @@ -189,6 +198,15 @@ class Chunk(models.Model):
189 198 words_count += utt.words_count()
190 199 return words_count
191 200  
  201 + def segments_count(self):
  202 + segments_count = 0
  203 + for chunk in self.anno['chunks']:
  204 + for sent in chunk['sentences']:
  205 + segments_count += len(sent['tokens'])
  206 + for utt in self.utterances.all():
  207 + segments_count += utt.segments_count()
  208 + return segments_count
  209 +
192 210 class Meta:
193 211 db_table = 'chunk'
194 212 ordering = ['sequence']
... ...
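pl_chunks_longer_than_min() returns a list, so callers can use it both as a boolean gate and as the set of qualifying chunks; this is also how the writer decides the new content_type metadata field (see conllu.py below). A hypothetical caller:

    long_chunks = document.pl_chunks_longer_than_min()
    content_type = 'full_text' if long_chunks else 'abstract'
    print('%s: %d qualifying Polish chunks, content_type=%s'
          % (document.name, len(long_chunks), content_type))
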
collector/terminology/management/commands/match_iate_terms.py
... ... @@ -29,9 +29,9 @@ class Command(BaseCommand):
29 29 return
30 30  
31 31 if not options['prefix']:
32   - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name')
  32 + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name')
33 33 else:
34   - docs = Document.objects.filter(pipeline__project__name=options['pipeline'],
  34 + docs = Document.objects.filter(pipeline__name=options['pipeline'],
35 35 name__startswith=options['prefix']).order_by('name')
36 36  
37 37 iate.annotate(docs)
... ...
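The command now selects documents by pipeline name instead of the owning project's name, which matters when one project runs several pipelines. Equivalent queries in a hypothetical Django shell session ('curlicat' and 'journal-' are placeholder values, and the import path is assumed from collector/storage/models.py):

    from storage.models import Document  # assumed import path

    docs = Document.objects.filter(pipeline__name='curlicat').order_by('name')
    # With the command's prefix option, the name filter narrows further:
    docs = Document.objects.filter(pipeline__name='curlicat',
                                   name__startswith='journal-').order_by('name')
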
collector/terminology/models.py
... ... @@ -68,6 +68,13 @@ class IATETerm(models.Model):
68 68 db_table = 'iate_term'
69 69 ordering = ['tid']
70 70  
  71 + def eurovoc_terms(self):
  72 + eurovoc_ids = []
  73 + for subject in self.subject_field.split(';'):
  74 + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False):
  75 + eurovoc_ids.append(evlabel.term.tid)
  76 + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus'])
  77 +
71 78 def __str__(self):
72 79 return ' | '.join([str(label) for label in self.labels.all()])
73 80  
... ...
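eurovoc_terms() replaces what the writer previously accessed as an attribute (see conllu.py below): it resolves the term's semicolon-separated subject_field labels against English EuroVoc labels and returns the matching domain/thesaurus terms. A hypothetical usage, with the tid value made up:

    term = IATETerm.objects.get(tid='3539858')  # hypothetical IATE term id
    for ev_term in term.eurovoc_terms().order_by('tid'):
        print(ev_term.tid, ev_term.type)  # only 'domain' and 'thesaurus' types
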
collector/writers/conllu.py
... ... @@ -4,14 +4,14 @@ import os
4 4  
5 5 from natsort import natsorted
6 6  
7   -from projects.marcell.models import IATETerm, EuroVocTerm
  7 +from terminology.models import IATETerm, EuroVocTerm
8 8  
9 9  
10 10 DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
11 11  
12 12  
13 13 def write(document):
14   - if document.annotated():
  14 + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min():
15 15 print('Writing %s in CONLLU format.' % document.name)
16 16  
17 17 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS
... ... @@ -26,7 +26,7 @@ def write(document):
26 26  
27 27  
28 28 def write_to_dir(document, export_dir_path):
29   - if document.annotated():
  29 + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min():
30 30 print('Writing %s in CONLLU format.' % document.name)
31 31  
32 32 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS
... ... @@ -73,6 +73,10 @@ def _write_metadata(document, text_file):
73 73 metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()]))
74 74 metadata['url'] = document.get_source_url()
75 75  
  76 + metadata['content_type'] = 'abstract'
  77 + if document.pl_chunks_longer_than_min():
  78 + metadata['content_type'] = 'full_text'
  79 +
76 80 if document.issue:
77 81 metadata['publishing_company'] = document.issue.journal.publishing_company.name
78 82 metadata['journal'] = document.issue.journal.title
... ... @@ -120,22 +124,7 @@ def _en(translations, pl_name):
120 124  
121 125 def _write_paragraphs(document, text_file, cols):
122 126 ci = 1
123   -
124   - title_anno = document.get_title_anno()
125   - if title_anno:
126   - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
127   - text_file.write('# newpar id = %s\n' % par_id)
128   - _write_sentences(par_id, title_anno, text_file, cols)
129   - ci += 1
130   -
131   - abstract_anno = document.get_abstract_anno()
132   - if abstract_anno:
133   - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
134   - text_file.write('# newpar id = %s\n' % par_id)
135   - _write_sentences(par_id, abstract_anno, text_file, cols)
136   - ci += 1
137   -
138   - if document.chunks_annotated():
  127 + if document.pl_chunks_longer_than_min():
139 128 for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci):
140 129 if chunk.utterances.exists():
141 130 for ui, utt in enumerate(chunk.utterances.order_by('sequence')):
... ... @@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols):
147 136 par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
148 137 text_file.write('# newpar id = %s\n' % par_id)
149 138 _write_sentences(par_id, chunk.anno, text_file, cols)
  139 + else:
  140 + title_anno = document.get_title_anno()
  141 + if title_anno:
  142 + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
  143 + text_file.write('# newpar id = %s\n' % par_id)
  144 + _write_sentences(par_id, title_anno, text_file, cols)
  145 + ci += 1
  146 +
  147 + abstract_anno = document.get_abstract_anno()
  148 + if abstract_anno:
  149 + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
  150 + text_file.write('# newpar id = %s\n' % par_id)
  151 + _write_sentences(par_id, abstract_anno, text_file, cols)
  152 + ci += 1
150 153  
151 154  
152 155 def _write_sentences(par_id, anno, text_file, cols):
... ... @@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate):
469 472 def _get_local_longest_iate_terms(tokens, iate):
470 473 local_terms = []
471 474 for term in iate:
472   - for tok in tokens:
473   - if tok['id'] in term['tokens']:
474   - local_terms.append(term)
475   - break
  475 + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \
  476 + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts',
  477 + 'preferredTerm-admn-sts']:
  478 + for tok in tokens:
  479 + if tok['id'] in term['tokens']:
  480 + local_terms.append(term)
  481 + break
476 482 return _get_longest_terms(local_terms)
477 483  
478 484  
... ... @@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate):
485 491 ordered_term_tokens = natsorted(term['tokens'])
486 492 if tok['id'] == ordered_term_tokens[0]:
487 493 term_obj = IATETerm.objects.get(tid=term['id'])
488   - domains = term_obj.eurovoc_terms.order_by('tid')
  494 + domains = term_obj.eurovoc_terms().order_by('tid')
489 495 if domains.exists():
490 496 iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains])))
491 497 else:
... ...
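Taken together, the writer changes tighten which IATE matches reach the new CURLICAT:IATE column (only fullForm, phrase, shortForm, and formula terms with a reliability code above 6 and an acceptable administrative status) and emit, on the first token of each surviving term, a cell of the form index:termId-eurovocIds. A sketch of the cell construction with made-up term and domain ids:

    ti, term_id, domain_tids = 1, '3539858', ['1021', '2446']  # made-up ids
    cell = '%d:%s-%s' % (ti, term_id, ','.join(domain_tids))
    assert cell == '1:3539858-1021,2446'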