Commit 85f97aaca15c0c8d03a6034f46318665d6b62394

Authored by Bartłomiej Nitoń
1 parent 8e747a23

Change conllu writer to match CURLICAT project needs.

collector/projects/curlicat/management/commands/export_curlicat.py
@@ -52,8 +52,7 @@ class Command(BaseCommand):
52 os.makedirs(options['output'], exist_ok=True) 52 os.makedirs(options['output'], exist_ok=True)
53 for doc in documents: 53 for doc in documents:
54 conllu_path = os.path.join(doc.path, 'text.conllup') 54 conllu_path = os.path.join(doc.path, 'text.conllup')
55 - if not os.path.isfile(conllu_path) or not doc.annotated():  
56 - continue  
57 - print('Exporting %s.' % doc.name)  
58 - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name))  
59 - shutil.copyfile(conllu_path, dest_path) 55 + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()):
  56 + print('Exporting %s.' % doc.name)
  57 + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name))
  58 + shutil.copyfile(conllu_path, dest_path)
collector/projects/curlicat/mappings.py
1 from urllib.parse import urljoin 1 from urllib.parse import urljoin
2 2
3 CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', 3 CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC',
4 - 'CURLICAT:NE', 'CURLICAT:NP'] 4 + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE']
5 5
6 DOC_TYPES = [] 6 DOC_TYPES = []
7 7
@@ -35,6 +35,8 @@ PUBLISHERS = {}
35 35
36 PUBLISHERS2ABBREV = {} 36 PUBLISHERS2ABBREV = {}
37 37
  38 +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21
  39 +
38 40
39 def get_abstract_anno(document): 41 def get_abstract_anno(document):
40 for meta in document.metadata.all(): 42 for meta in document.metadata.all():
collector/projects/marcell/mappings.py
@@ -74,6 +74,8 @@ PUBLISHERS = {'WDU': 'Dziennik Ustaw',
74 PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU', 74 PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU',
75 'Monitor Polski': 'WMP'} 75 'Monitor Polski': 'WMP'}
76 76
  77 +MIN_SEGMENTS_BY_CHUNK = 0
  78 +
77 79
78 def get_lang(document): 80 def get_lang(document):
79 return document.lang 81 return document.lang
collector/projects/ppc/mappings.py
@@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = {
499 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' 499 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan'
500 } 500 }
501 501
  502 +MIN_SEGMENTS_BY_CHUNK = 0
  503 +
502 504
503 def get_lang(document): 505 def get_lang(document):
504 return document.lang 506 return document.lang
collector/projects/ppc/models.py
@@ -15,6 +15,13 @@ class Utterance(models.Model):
15 def words_count(self): 15 def words_count(self):
16 return len(self.text.split()) 16 return len(self.text.split())
17 17
  18 + def segments_count(self):
  19 + segments_count = 0
  20 + for chunk in self.anno['chunks']:
  21 + for sent in chunk['sentences']:
  22 + segments_count += len(sent['tokens'])
  23 + return segments_count
  24 +
18 class Meta: 25 class Meta:
19 db_table = 'utterance' 26 db_table = 'utterance'
20 ordering = ['sequence'] 27 ordering = ['sequence']
collector/storage/models.py
@@ -167,6 +167,15 @@ class Document(models.Model):
167 words_count += chunk.words_count() 167 words_count += chunk.words_count()
168 return words_count 168 return words_count
169 169
  170 + def pl_chunks_longer_than_min(self):
  171 + min_segments = importlib.import_module('projects.%s.mappings' %
  172 + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK
  173 + longer = []
  174 + for chunk in self.chunks.filter(lang='pl'):
  175 + if chunk.anno and chunk.segments_count() > min_segments:
  176 + longer.append(chunk)
  177 + return longer
  178 +
170 class Meta: 179 class Meta:
171 db_table = 'document' 180 db_table = 'document'
172 ordering = ['id'] 181 ordering = ['id']
@@ -189,6 +198,15 @@ class Chunk(models.Model):
189 words_count += utt.words_count() 198 words_count += utt.words_count()
190 return words_count 199 return words_count
191 200
  201 + def segments_count(self):
  202 + segments_count = 0
  203 + for chunk in self.anno['chunks']:
  204 + for sent in chunk['sentences']:
  205 + segments_count += len(sent['tokens'])
  206 + for utt in self.utterances.all():
  207 + segments_count += utt.segments_count()
  208 + return segments_count
  209 +
192 class Meta: 210 class Meta:
193 db_table = 'chunk' 211 db_table = 'chunk'
194 ordering = ['sequence'] 212 ordering = ['sequence']
collector/terminology/management/commands/match_iate_terms.py
@@ -29,9 +29,9 @@ class Command(BaseCommand):
29 return 29 return
30 30
31 if not options['prefix']: 31 if not options['prefix']:
32 - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name') 32 + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name')
33 else: 33 else:
34 - docs = Document.objects.filter(pipeline__project__name=options['pipeline'], 34 + docs = Document.objects.filter(pipeline__name=options['pipeline'],
35 name__startswith=options['prefix']).order_by('name') 35 name__startswith=options['prefix']).order_by('name')
36 36
37 iate.annotate(docs) 37 iate.annotate(docs)
collector/terminology/models.py
@@ -68,6 +68,13 @@ class IATETerm(models.Model):
68 db_table = 'iate_term' 68 db_table = 'iate_term'
69 ordering = ['tid'] 69 ordering = ['tid']
70 70
  71 + def eurovoc_terms(self):
  72 + eurovoc_ids = []
  73 + for subject in self.subject_field.split(';'):
  74 + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False):
  75 + eurovoc_ids.append(evlabel.term.tid)
  76 + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus'])
  77 +
71 def __str__(self): 78 def __str__(self):
72 return ' | '.join([str(label) for label in self.labels.all()]) 79 return ' | '.join([str(label) for label in self.labels.all()])
73 80
collector/writers/conllu.py
@@ -4,14 +4,14 @@ import os
4 4
5 from natsort import natsorted 5 from natsort import natsorted
6 6
7 -from projects.marcell.models import IATETerm, EuroVocTerm 7 +from terminology.models import IATETerm, EuroVocTerm
8 8
9 9
10 DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] 10 DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC']
11 11
12 12
13 def write(document): 13 def write(document):
14 - if document.annotated(): 14 + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min():
15 print('Writing %s in CONLLU format.' % document.name) 15 print('Writing %s in CONLLU format.' % document.name)
16 16
17 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS 17 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS
@@ -26,7 +26,7 @@ def write(document):
26 26
27 27
28 def write_to_dir(document, export_dir_path): 28 def write_to_dir(document, export_dir_path):
29 - if document.annotated(): 29 + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min():
30 print('Writing %s in CONLLU format.' % document.name) 30 print('Writing %s in CONLLU format.' % document.name)
31 31
32 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS 32 cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS
@@ -73,6 +73,10 @@ def _write_metadata(document, text_file):
73 metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) 73 metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()]))
74 metadata['url'] = document.get_source_url() 74 metadata['url'] = document.get_source_url()
75 75
  76 + metadata['content_type'] = 'abstract'
  77 + if document.pl_chunks_longer_than_min():
  78 + metadata['content_type'] = 'full_text'
  79 +
76 if document.issue: 80 if document.issue:
77 metadata['publishing_company'] = document.issue.journal.publishing_company.name 81 metadata['publishing_company'] = document.issue.journal.publishing_company.name
78 metadata['journal'] = document.issue.journal.title 82 metadata['journal'] = document.issue.journal.title
@@ -120,22 +124,7 @@ def _en(translations, pl_name):
120 124
121 def _write_paragraphs(document, text_file, cols): 125 def _write_paragraphs(document, text_file, cols):
122 ci = 1 126 ci = 1
123 -  
124 - title_anno = document.get_title_anno()  
125 - if title_anno:  
126 - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)  
127 - text_file.write('# newpar id = %s\n' % par_id)  
128 - _write_sentences(par_id, title_anno, text_file, cols)  
129 - ci += 1  
130 -  
131 - abstract_anno = document.get_abstract_anno()  
132 - if abstract_anno:  
133 - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)  
134 - text_file.write('# newpar id = %s\n' % par_id)  
135 - _write_sentences(par_id, abstract_anno, text_file, cols)  
136 - ci += 1  
137 -  
138 - if document.chunks_annotated(): 127 + if document.pl_chunks_longer_than_min():
139 for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): 128 for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci):
140 if chunk.utterances.exists(): 129 if chunk.utterances.exists():
141 for ui, utt in enumerate(chunk.utterances.order_by('sequence')): 130 for ui, utt in enumerate(chunk.utterances.order_by('sequence')):
@@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols):
147 par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) 136 par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
148 text_file.write('# newpar id = %s\n' % par_id) 137 text_file.write('# newpar id = %s\n' % par_id)
149 _write_sentences(par_id, chunk.anno, text_file, cols) 138 _write_sentences(par_id, chunk.anno, text_file, cols)
  139 + else:
  140 + title_anno = document.get_title_anno()
  141 + if title_anno:
  142 + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
  143 + text_file.write('# newpar id = %s\n' % par_id)
  144 + _write_sentences(par_id, title_anno, text_file, cols)
  145 + ci += 1
  146 +
  147 + abstract_anno = document.get_abstract_anno()
  148 + if abstract_anno:
  149 + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci)
  150 + text_file.write('# newpar id = %s\n' % par_id)
  151 + _write_sentences(par_id, abstract_anno, text_file, cols)
  152 + ci += 1
150 153
151 154
152 def _write_sentences(par_id, anno, text_file, cols): 155 def _write_sentences(par_id, anno, text_file, cols):
@@ -469,10 +472,13 @@ def _get_local_longest_iate_terms(tokens, iate):
469 def _get_local_longest_iate_terms(tokens, iate): 472 def _get_local_longest_iate_terms(tokens, iate):
470 local_terms = [] 473 local_terms = []
471 for term in iate: 474 for term in iate:
472 - for tok in tokens:  
473 - if tok['id'] in term['tokens']:  
474 - local_terms.append(term)  
475 - break 475 + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \
  476 + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts',
  477 + 'preferredTerm-admn-sts']:
  478 + for tok in tokens:
  479 + if tok['id'] in term['tokens']:
  480 + local_terms.append(term)
  481 + break
476 return _get_longest_terms(local_terms) 482 return _get_longest_terms(local_terms)
477 483
478 484
@@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate):
485 ordered_term_tokens = natsorted(term['tokens']) 491 ordered_term_tokens = natsorted(term['tokens'])
486 if tok['id'] == ordered_term_tokens[0]: 492 if tok['id'] == ordered_term_tokens[0]:
487 term_obj = IATETerm.objects.get(tid=term['id']) 493 term_obj = IATETerm.objects.get(tid=term['id'])
488 - domains = term_obj.eurovoc_terms.order_by('tid') 494 + domains = term_obj.eurovoc_terms().order_by('tid')
489 if domains.exists(): 495 if domains.exists():
490 iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) 496 iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains])))
491 else: 497 else: