Commit 85f97aaca15c0c8d03a6034f46318665d6b62394 (1 parent: 8e747a23)
Change conllu writer to match CURLICAT project needs.
Showing 9 changed files with 75 additions and 32 deletions.
collector/projects/curlicat/management/commands/export_curlicat.py
... | ... | @@ -52,8 +52,7 @@ class Command(BaseCommand): |
52 | 52 | os.makedirs(options['output'], exist_ok=True) |
53 | 53 | for doc in documents: |
54 | 54 | conllu_path = os.path.join(doc.path, 'text.conllup') |
55 | - if not os.path.isfile(conllu_path) or not doc.annotated(): | |
56 | - continue | |
57 | - print('Exporting %s.' % doc.name) | |
58 | - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | |
59 | - shutil.copyfile(conllu_path, dest_path) | |
55 | + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()): | |
56 | + print('Exporting %s.' % doc.name) | |
57 | + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | |
58 | + shutil.copyfile(conllu_path, dest_path) | |
... | ... |
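The export filter is inverted here from an early-exit `continue` into a positive condition. A minimal sketch of the new predicate, using only names that appear in the diff (the standalone helper itself is illustrative, not part of the commit):

```python
import os

def should_export(doc, conllu_path):
    # Export only when the CoNLL-U-Plus file exists on disk and the document has
    # either an annotated abstract or at least one Polish chunk above the
    # project's segment threshold (pl_chunks_longer_than_min, added below).
    return os.path.isfile(conllu_path) and (
        doc.get_abstract_anno() or doc.pl_chunks_longer_than_min())
```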
collector/projects/curlicat/mappings.py
1 | 1 | from urllib.parse import urljoin |
2 | 2 | |
3 | 3 | CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', |
4 | - 'CURLICAT:NE', 'CURLICAT:NP'] | |
4 | + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE'] | |
5 | 5 | |
6 | 6 | DOC_TYPES = [] |
7 | 7 | |
... | ... | @@ -35,6 +35,8 @@ PUBLISHERS = {} |
35 | 35 | |
36 | 36 | PUBLISHERS2ABBREV = {} |
37 | 37 | |
38 | +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21 | |
39 | + | |
38 | 40 | |
39 | 41 | def get_abstract_anno(document): |
40 | 42 | for meta in document.metadata.all(): |
... | ... |
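Two things change in the CURLICAT mappings: a CURLICAT:IATE column is appended to the CoNLL-U-Plus header, and a per-project MIN_SEGMENTS_BY_CHUNK threshold is introduced. A sketch of how the threshold is resolved dynamically, mirroring the importlib lookup used in storage/models.py later in this diff (the `project_name` variable is illustrative):

```python
import importlib

# Each project ships its own value: 100 for curlicat, 0 for ppc (see below).
project_name = 'curlicat'
mappings = importlib.import_module('projects.%s.mappings' % project_name)
min_segments = mappings.MIN_SEGMENTS_BY_CHUNK
```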
collector/projects/marcell/mappings.py
collector/projects/ppc/mappings.py
... | ... | @@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = { |
499 | 499 | 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' |
500 | 500 | } |
501 | 501 | |
502 | +MIN_SEGMENTS_BY_CHUNK = 0 | |
503 | + | |
502 | 504 | |
503 | 505 | def get_lang(document): |
504 | 506 | return document.lang |
... | ... |
collector/projects/ppc/models.py
... | ... | @@ -15,6 +15,13 @@ class Utterance(models.Model): |
15 | 15 | def words_count(self): |
16 | 16 | return len(self.text.split()) |
17 | 17 | |
18 | + def segments_count(self): | |
19 | + segments_count = 0 | |
20 | + for chunk in self.anno['chunks']: | |
21 | + for sent in chunk['sentences']: | |
22 | + segments_count += len(sent['tokens']) | |
23 | + return segments_count | |
24 | + | |
18 | 25 | class Meta: |
19 | 26 | db_table = 'utterance' |
20 | 27 | ordering = ['sequence'] |
... | ... |
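Utterance.segments_count() walks the annotation JSON chunk by chunk and sentence by sentence, summing token counts. A rough picture of the structure it expects, inferred from the loop in the hunk above (only the keys 'chunks', 'sentences' and 'tokens' come from the diff; the values are invented):

```python
# Assumed shape of Utterance.anno; illustrative content only.
anno = {
    'chunks': [
        {'sentences': [
            {'tokens': [{'id': 's1.1', 'form': 'Przykladowe'},
                        {'id': 's1.2', 'form': 'zdanie'},
                        {'id': 's1.3', 'form': '.'}]},
        ]},
    ],
}
# For this annotation, segments_count() would return 3.
```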
collector/storage/models.py
... | ... | @@ -167,6 +167,15 @@ class Document(models.Model): |
167 | 167 | words_count += chunk.words_count() |
168 | 168 | return words_count |
169 | 169 | |
170 | + def pl_chunks_longer_than_min(self): | |
171 | + min_segments = importlib.import_module('projects.%s.mappings' % | |
172 | + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK | |
173 | + longer = [] | |
174 | + for chunk in self.chunks.filter(lang='pl'): | |
175 | + if chunk.anno and chunk.segments_count() > min_segments: | |
176 | + longer.append(chunk) | |
177 | + return longer | |
178 | + | |
170 | 179 | class Meta: |
171 | 180 | db_table = 'document' |
172 | 181 | ordering = ['id'] |
... | ... | @@ -189,6 +198,15 @@ class Chunk(models.Model): |
189 | 198 | words_count += utt.words_count() |
190 | 199 | return words_count |
191 | 200 | |
201 | + def segments_count(self): | |
202 | + segments_count = 0 | |
203 | + for chunk in self.anno['chunks']: | |
204 | + for sent in chunk['sentences']: | |
205 | + segments_count += len(sent['tokens']) | |
206 | + for utt in self.utterances.all(): | |
207 | + segments_count += utt.segments_count() | |
208 | + return segments_count | |
209 | + | |
192 | 210 | class Meta: |
193 | 211 | db_table = 'chunk' |
194 | 212 | ordering = ['sequence'] |
... | ... |
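Document.pl_chunks_longer_than_min() returns the list of Polish chunks whose segment count exceeds the project threshold, and Chunk.segments_count() combines tokens found directly in the chunk annotation with those of its utterances. A minimal sketch of how the new helper is consumed elsewhere in this commit (`doc` stands for an arbitrary Document instance):

```python
# The returned list doubles as a boolean gate in the writer and the exporter.
long_pl_chunks = doc.pl_chunks_longer_than_min()  # list of Chunk objects, possibly empty
if long_pl_chunks:
    content_type = 'full_text'   # _write_metadata() in writers/conllu.py
else:
    content_type = 'abstract'    # only title/abstract paragraphs are written
```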
collector/terminology/management/commands/match_iate_terms.py
... | ... | @@ -29,9 +29,9 @@ class Command(BaseCommand): |
29 | 29 | return |
30 | 30 | |
31 | 31 | if not options['prefix']: |
32 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name') | |
32 | + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name') | |
33 | 33 | else: |
34 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline'], | |
34 | + docs = Document.objects.filter(pipeline__name=options['pipeline'], | |
35 | 35 | name__startswith=options['prefix']).order_by('name') |
36 | 36 | |
37 | 37 | iate.annotate(docs) |
... | ... |
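The document lookup now filters on the pipeline's own name instead of the project name. Assuming standard Django ORM relation lookups, the two queries in the hunk differ as follows (both lines are copied from the old and new versions of the command):

```python
# Old: every document whose pipeline belongs to a project with the given name.
docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name')

# New: only documents whose pipeline itself carries that name.
docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name')
```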
collector/terminology/models.py
... | ... | @@ -68,6 +68,13 @@ class IATETerm(models.Model): |
68 | 68 | db_table = 'iate_term' |
69 | 69 | ordering = ['tid'] |
70 | 70 | |
71 | + def eurovoc_terms(self): | |
72 | + eurovoc_ids = [] | |
73 | + for subject in self.subject_field.split(';'): | |
74 | + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False): | |
75 | + eurovoc_ids.append(evlabel.term.tid) | |
76 | + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus']) | |
77 | + | |
71 | 78 | def __str__(self): |
72 | 79 | return ' | '.join([str(label) for label in self.labels.all()]) |
73 | 80 | |
... | ... |
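IATETerm.eurovoc_terms() maps the semicolon-separated subject_field of an IATE entry onto EuroVoc domain/thesaurus terms via their English labels. An illustrative walk-through; the tid and subject_field values are made up, while the model and field names come from the diff:

```python
# Hypothetical entry; used_for=False restricts matching to descriptor labels,
# not "used for" aliases, and only 'domain'/'thesaurus' EuroVoc terms are kept.
term = IATETerm(tid='1443829', subject_field='finance; EU institution')
domains = term.eurovoc_terms()  # queryset of matching EuroVocTerm objects
```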
collector/writers/conllu.py
... | ... | @@ -4,14 +4,14 @@ import os |
4 | 4 | |
5 | 5 | from natsort import natsorted |
6 | 6 | |
7 | -from projects.marcell.models import IATETerm, EuroVocTerm | |
7 | +from terminology.models import IATETerm, EuroVocTerm | |
8 | 8 | |
9 | 9 | |
10 | 10 | DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] |
11 | 11 | |
12 | 12 | |
13 | 13 | def write(document): |
14 | - if document.annotated(): | |
14 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): | |
15 | 15 | print('Writing %s in CONLLU format.' % document.name) |
16 | 16 | |
17 | 17 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
... | ... | @@ -26,7 +26,7 @@ def write(document): |
26 | 26 | |
27 | 27 | |
28 | 28 | def write_to_dir(document, export_dir_path): |
29 | - if document.annotated(): | |
29 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): | |
30 | 30 | print('Writing %s in CONLLU format.' % document.name) |
31 | 31 | |
32 | 32 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
... | ... | @@ -73,6 +73,10 @@ def _write_metadata(document, text_file): |
73 | 73 | metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) |
74 | 74 | metadata['url'] = document.get_source_url() |
75 | 75 | |
76 | + metadata['content_type'] = 'abstract' | |
77 | + if document.pl_chunks_longer_than_min(): | |
78 | + metadata['content_type'] = 'full_text' | |
79 | + | |
76 | 80 | if document.issue: |
77 | 81 | metadata['publishing_company'] = document.issue.journal.publishing_company.name |
78 | 82 | metadata['journal'] = document.issue.journal.title |
... | ... | @@ -120,22 +124,7 @@ def _en(translations, pl_name): |
120 | 124 | |
121 | 125 | def _write_paragraphs(document, text_file, cols): |
122 | 126 | ci = 1 |
123 | - | |
124 | - title_anno = document.get_title_anno() | |
125 | - if title_anno: | |
126 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
127 | - text_file.write('# newpar id = %s\n' % par_id) | |
128 | - _write_sentences(par_id, title_anno, text_file, cols) | |
129 | - ci += 1 | |
130 | - | |
131 | - abstract_anno = document.get_abstract_anno() | |
132 | - if abstract_anno: | |
133 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
134 | - text_file.write('# newpar id = %s\n' % par_id) | |
135 | - _write_sentences(par_id, abstract_anno, text_file, cols) | |
136 | - ci += 1 | |
137 | - | |
138 | - if document.chunks_annotated(): | |
127 | + if document.pl_chunks_longer_than_min(): | |
139 | 128 | for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): |
140 | 129 | if chunk.utterances.exists(): |
141 | 130 | for ui, utt in enumerate(chunk.utterances.order_by('sequence')): |
... | ... | @@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols): |
147 | 136 | par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) |
148 | 137 | text_file.write('# newpar id = %s\n' % par_id) |
149 | 138 | _write_sentences(par_id, chunk.anno, text_file, cols) |
139 | + else: | |
140 | + title_anno = document.get_title_anno() | |
141 | + if title_anno: | |
142 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
143 | + text_file.write('# newpar id = %s\n' % par_id) | |
144 | + _write_sentences(par_id, title_anno, text_file, cols) | |
145 | + ci += 1 | |
146 | + | |
147 | + abstract_anno = document.get_abstract_anno() | |
148 | + if abstract_anno: | |
149 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | |
150 | + text_file.write('# newpar id = %s\n' % par_id) | |
151 | + _write_sentences(par_id, abstract_anno, text_file, cols) | |
152 | + ci += 1 | |
150 | 153 | |
151 | 154 | |
152 | 155 | def _write_sentences(par_id, anno, text_file, cols): |
... | ... | @@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate): |
469 | 472 | def _get_local_longest_iate_terms(tokens, iate): |
470 | 473 | local_terms = [] |
471 | 474 | for term in iate: |
472 | - for tok in tokens: | |
473 | - if tok['id'] in term['tokens']: | |
474 | - local_terms.append(term) | |
475 | - break | |
475 | + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \ | |
476 | + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts', | |
477 | + 'preferredTerm-admn-sts']: | |
478 | + for tok in tokens: | |
479 | + if tok['id'] in term['tokens']: | |
480 | + local_terms.append(term) | |
481 | + break | |
476 | 482 | return _get_longest_terms(local_terms) |
477 | 483 | |
478 | 484 | |
... | ... | @@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate): |
485 | 491 | ordered_term_tokens = natsorted(term['tokens']) |
486 | 492 | if tok['id'] == ordered_term_tokens[0]: |
487 | 493 | term_obj = IATETerm.objects.get(tid=term['id']) |
488 | - domains = term_obj.eurovoc_terms.order_by('tid') | |
494 | + domains = term_obj.eurovoc_terms().order_by('tid') | |
489 | 495 | if domains.exists(): |
490 | 496 | iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) |
491 | 497 | else: |
... | ... |
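In writers/conllu.py the import of IATETerm/EuroVocTerm moves to the new terminology app, full-text chunks take precedence over title/abstract paragraphs, eurovoc_terms is now called as a method, and _get_local_longest_iate_terms only keeps sufficiently reliable terms. A sketch of that filter as a standalone predicate; the accepted types, the reliability cut-off and the status whitelist are copied from the diff, the helper itself is illustrative:

```python
ACCEPTED_TYPES = ('fullForm', 'phrase', 'shortForm', 'formula')
ACCEPTED_STATUSES = ('', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts',
                     'preferredTerm-admn-sts')

def is_acceptable(term):
    # term is one entry of the per-sentence IATE annotation dict used above.
    return (term['type'] in ACCEPTED_TYPES
            and term['reliability_code'] > 6
            and term['administrative_status'] in ACCEPTED_STATUSES)
```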