Commit 85f97aaca15c0c8d03a6034f46318665d6b62394
1 parent: 8e747a23
Change conllu writer to match CURLICAT project needs.
Showing 9 changed files with 75 additions and 32 deletions.
collector/projects/curlicat/management/commands/export_curlicat.py
@@ -52,8 +52,7 @@ class Command(BaseCommand): | @@ -52,8 +52,7 @@ class Command(BaseCommand): | ||
52 | os.makedirs(options['output'], exist_ok=True) | 52 | os.makedirs(options['output'], exist_ok=True) |
53 | for doc in documents: | 53 | for doc in documents: |
54 | conllu_path = os.path.join(doc.path, 'text.conllup') | 54 | conllu_path = os.path.join(doc.path, 'text.conllup') |
55 | - if not os.path.isfile(conllu_path) or not doc.annotated(): | ||
56 | - continue | ||
57 | - print('Exporting %s.' % doc.name) | ||
58 | - dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | ||
59 | - shutil.copyfile(conllu_path, dest_path) | 55 | + if os.path.isfile(conllu_path) and (doc.get_abstract_anno() or doc.pl_chunks_longer_than_min()): |
56 | + print('Exporting %s.' % doc.name) | ||
57 | + dest_path = os.path.join(options['output'], '%s-%s.conllup' % (doc.get_lang(), doc.name)) | ||
58 | + shutil.copyfile(conllu_path, dest_path) |
collector/projects/curlicat/mappings.py
1 | from urllib.parse import urljoin | 1 | from urllib.parse import urljoin |
2 | 2 | ||
3 | CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', | 3 | CONLLU_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC', |
4 | - 'CURLICAT:NE', 'CURLICAT:NP'] | 4 | + 'CURLICAT:NE', 'CURLICAT:NP', 'CURLICAT:IATE'] |
5 | 5 | ||
6 | DOC_TYPES = [] | 6 | DOC_TYPES = [] |
7 | 7 | ||
@@ -35,6 +35,8 @@ PUBLISHERS = {} | @@ -35,6 +35,8 @@ PUBLISHERS = {} | ||
35 | 35 | ||
36 | PUBLISHERS2ABBREV = {} | 36 | PUBLISHERS2ABBREV = {} |
37 | 37 | ||
38 | +MIN_SEGMENTS_BY_CHUNK = 100 # counted 92.21 | ||
39 | + | ||
38 | 40 | ||
39 | def get_abstract_anno(document): | 41 | def get_abstract_anno(document): |
40 | for meta in document.metadata.all(): | 42 | for meta in document.metadata.all(): |
collector/projects/marcell/mappings.py
@@ -74,6 +74,8 @@ PUBLISHERS = {'WDU': 'Dziennik Ustaw', | @@ -74,6 +74,8 @@ PUBLISHERS = {'WDU': 'Dziennik Ustaw', | ||
74 | PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU', | 74 | PUBLISHERS2ABBREV = {'Dziennik Ustaw': 'WDU', |
75 | 'Monitor Polski': 'WMP'} | 75 | 'Monitor Polski': 'WMP'} |
76 | 76 | ||
77 | +MIN_SEGMENTS_BY_CHUNK = 0 | ||
78 | + | ||
77 | 79 | ||
78 | def get_lang(document): | 80 | def get_lang(document): |
79 | return document.lang | 81 | return document.lang |
collector/projects/ppc/mappings.py
@@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = { | @@ -499,6 +499,8 @@ MISSING_PARTICIPANTS = { | ||
499 | 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' | 499 | 'PodsekretarzStanuWMinisterstwieGospodarkiDariuszBogdan': 'Podsekretarz Stanu w Ministerstwie Gospodarki Dariusz Bogdan' |
500 | } | 500 | } |
501 | 501 | ||
502 | +MIN_SEGMENTS_BY_CHUNK = 0 | ||
503 | + | ||
502 | 504 | ||
503 | def get_lang(document): | 505 | def get_lang(document): |
504 | return document.lang | 506 | return document.lang |
collector/projects/ppc/models.py
@@ -15,6 +15,13 @@ class Utterance(models.Model): | @@ -15,6 +15,13 @@ class Utterance(models.Model): | ||
15 | def words_count(self): | 15 | def words_count(self): |
16 | return len(self.text.split()) | 16 | return len(self.text.split()) |
17 | 17 | ||
18 | + def segments_count(self): | ||
19 | + segments_count = 0 | ||
20 | + for chunk in self.anno['chunks']: | ||
21 | + for sent in chunk['sentences']: | ||
22 | + segments_count += len(sent['tokens']) | ||
23 | + return segments_count | ||
24 | + | ||
18 | class Meta: | 25 | class Meta: |
19 | db_table = 'utterance' | 26 | db_table = 'utterance' |
20 | ordering = ['sequence'] | 27 | ordering = ['sequence'] |
collector/storage/models.py
@@ -167,6 +167,15 @@ class Document(models.Model): | @@ -167,6 +167,15 @@ class Document(models.Model): | ||
167 | words_count += chunk.words_count() | 167 | words_count += chunk.words_count() |
168 | return words_count | 168 | return words_count |
169 | 169 | ||
170 | + def pl_chunks_longer_than_min(self): | ||
171 | + min_segments = importlib.import_module('projects.%s.mappings' % | ||
172 | + self.pipeline.project.name).MIN_SEGMENTS_BY_CHUNK | ||
173 | + longer = [] | ||
174 | + for chunk in self.chunks.filter(lang='pl'): | ||
175 | + if chunk.anno and chunk.segments_count() > min_segments: | ||
176 | + longer.append(chunk) | ||
177 | + return longer | ||
178 | + | ||
170 | class Meta: | 179 | class Meta: |
171 | db_table = 'document' | 180 | db_table = 'document' |
172 | ordering = ['id'] | 181 | ordering = ['id'] |
@@ -189,6 +198,15 @@ class Chunk(models.Model): | @@ -189,6 +198,15 @@ class Chunk(models.Model): | ||
189 | words_count += utt.words_count() | 198 | words_count += utt.words_count() |
190 | return words_count | 199 | return words_count |
191 | 200 | ||
201 | + def segments_count(self): | ||
202 | + segments_count = 0 | ||
203 | + for chunk in self.anno['chunks']: | ||
204 | + for sent in chunk['sentences']: | ||
205 | + segments_count += len(sent['tokens']) | ||
206 | + for utt in self.utterances.all(): | ||
207 | + segments_count += utt.segments_count() | ||
208 | + return segments_count | ||
209 | + | ||
192 | class Meta: | 210 | class Meta: |
193 | db_table = 'chunk' | 211 | db_table = 'chunk' |
194 | ordering = ['sequence'] | 212 | ordering = ['sequence'] |
collector/terminology/management/commands/match_iate_terms.py
@@ -29,9 +29,9 @@ class Command(BaseCommand): | @@ -29,9 +29,9 @@ class Command(BaseCommand): | ||
29 | return | 29 | return |
30 | 30 | ||
31 | if not options['prefix']: | 31 | if not options['prefix']: |
32 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline']).order_by('name') | 32 | + docs = Document.objects.filter(pipeline__name=options['pipeline']).order_by('name') |
33 | else: | 33 | else: |
34 | - docs = Document.objects.filter(pipeline__project__name=options['pipeline'], | 34 | + docs = Document.objects.filter(pipeline__name=options['pipeline'], |
35 | name__startswith=options['prefix']).order_by('name') | 35 | name__startswith=options['prefix']).order_by('name') |
36 | 36 | ||
37 | iate.annotate(docs) | 37 | iate.annotate(docs) |
collector/terminology/models.py
@@ -68,6 +68,13 @@ class IATETerm(models.Model): | @@ -68,6 +68,13 @@ class IATETerm(models.Model): | ||
68 | db_table = 'iate_term' | 68 | db_table = 'iate_term' |
69 | ordering = ['tid'] | 69 | ordering = ['tid'] |
70 | 70 | ||
71 | + def eurovoc_terms(self): | ||
72 | + eurovoc_ids = [] | ||
73 | + for subject in self.subject_field.split(';'): | ||
74 | + for evlabel in EuroVocLabel.objects.filter(lang='en', text=subject.strip(), used_for=False): | ||
75 | + eurovoc_ids.append(evlabel.term.tid) | ||
76 | + return EuroVocTerm.objects.filter(tid__in=eurovoc_ids, type__in=['domain', 'thesaurus']) | ||
77 | + | ||
71 | def __str__(self): | 78 | def __str__(self): |
72 | return ' | '.join([str(label) for label in self.labels.all()]) | 79 | return ' | '.join([str(label) for label in self.labels.all()]) |
73 | 80 |
collector/writers/conllu.py
@@ -4,14 +4,14 @@ import os | @@ -4,14 +4,14 @@ import os | ||
4 | 4 | ||
5 | from natsort import natsorted | 5 | from natsort import natsorted |
6 | 6 | ||
7 | -from projects.marcell.models import IATETerm, EuroVocTerm | 7 | +from terminology.models import IATETerm, EuroVocTerm |
8 | 8 | ||
9 | 9 | ||
10 | DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] | 10 | DEFAULT_COLS = ['ID', 'FORM', 'LEMMA', 'UPOS', 'XPOS', 'FEATS', 'HEAD', 'DEPREL', 'DEPS', 'MISC'] |
11 | 11 | ||
12 | 12 | ||
13 | def write(document): | 13 | def write(document): |
14 | - if document.annotated(): | 14 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): |
15 | print('Writing %s in CONLLU format.' % document.name) | 15 | print('Writing %s in CONLLU format.' % document.name) |
16 | 16 | ||
17 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS | 17 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
@@ -26,7 +26,7 @@ def write(document): | @@ -26,7 +26,7 @@ def write(document): | ||
26 | 26 | ||
27 | 27 | ||
28 | def write_to_dir(document, export_dir_path): | 28 | def write_to_dir(document, export_dir_path): |
29 | - if document.annotated(): | 29 | + if document.get_abstract_anno() or document.get_title_anno() or document.pl_chunks_longer_than_min(): |
30 | print('Writing %s in CONLLU format.' % document.name) | 30 | print('Writing %s in CONLLU format.' % document.name) |
31 | 31 | ||
32 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS | 32 | cols = importlib.import_module('projects.%s.mappings' % document.pipeline.project.name).CONLLU_COLS |
@@ -73,6 +73,10 @@ def _write_metadata(document, text_file): | @@ -73,6 +73,10 @@ def _write_metadata(document, text_file): | ||
73 | metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) | 73 | metadata['enkeywords'] = ' | '.join(sorted([keyword.label for keyword in document.get_en_keywords()])) |
74 | metadata['url'] = document.get_source_url() | 74 | metadata['url'] = document.get_source_url() |
75 | 75 | ||
76 | + metadata['content_type'] = 'abstract' | ||
77 | + if document.pl_chunks_longer_than_min(): | ||
78 | + metadata['content_type'] = 'full_text' | ||
79 | + | ||
76 | if document.issue: | 80 | if document.issue: |
77 | metadata['publishing_company'] = document.issue.journal.publishing_company.name | 81 | metadata['publishing_company'] = document.issue.journal.publishing_company.name |
78 | metadata['journal'] = document.issue.journal.title | 82 | metadata['journal'] = document.issue.journal.title |
@@ -120,22 +124,7 @@ def _en(translations, pl_name): | @@ -120,22 +124,7 @@ def _en(translations, pl_name): | ||
120 | 124 | ||
121 | def _write_paragraphs(document, text_file, cols): | 125 | def _write_paragraphs(document, text_file, cols): |
122 | ci = 1 | 126 | ci = 1 |
123 | - | ||
124 | - title_anno = document.get_title_anno() | ||
125 | - if title_anno: | ||
126 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | ||
127 | - text_file.write('# newpar id = %s\n' % par_id) | ||
128 | - _write_sentences(par_id, title_anno, text_file, cols) | ||
129 | - ci += 1 | ||
130 | - | ||
131 | - abstract_anno = document.get_abstract_anno() | ||
132 | - if abstract_anno: | ||
133 | - par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | ||
134 | - text_file.write('# newpar id = %s\n' % par_id) | ||
135 | - _write_sentences(par_id, abstract_anno, text_file, cols) | ||
136 | - ci += 1 | ||
137 | - | ||
138 | - if document.chunks_annotated(): | 127 | + if document.pl_chunks_longer_than_min(): |
139 | for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): | 128 | for ci, chunk in enumerate(document.chunks.order_by('sequence'), ci): |
140 | if chunk.utterances.exists(): | 129 | if chunk.utterances.exists(): |
141 | for ui, utt in enumerate(chunk.utterances.order_by('sequence')): | 130 | for ui, utt in enumerate(chunk.utterances.order_by('sequence')): |
@@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols): | @@ -147,6 +136,20 @@ def _write_paragraphs(document, text_file, cols): | ||
147 | par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | 136 | par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) |
148 | text_file.write('# newpar id = %s\n' % par_id) | 137 | text_file.write('# newpar id = %s\n' % par_id) |
149 | _write_sentences(par_id, chunk.anno, text_file, cols) | 138 | _write_sentences(par_id, chunk.anno, text_file, cols) |
139 | + else: | ||
140 | + title_anno = document.get_title_anno() | ||
141 | + if title_anno: | ||
142 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | ||
143 | + text_file.write('# newpar id = %s\n' % par_id) | ||
144 | + _write_sentences(par_id, title_anno, text_file, cols) | ||
145 | + ci += 1 | ||
146 | + | ||
147 | + abstract_anno = document.get_abstract_anno() | ||
148 | + if abstract_anno: | ||
149 | + par_id = '%s-%s-p%d' % (document.get_lang(), document.name, ci) | ||
150 | + text_file.write('# newpar id = %s\n' % par_id) | ||
151 | + _write_sentences(par_id, abstract_anno, text_file, cols) | ||
152 | + ci += 1 | ||
150 | 153 | ||
151 | 154 | ||
152 | def _write_sentences(par_id, anno, text_file, cols): | 155 | def _write_sentences(par_id, anno, text_file, cols): |
@@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate): | @@ -469,10 +472,13 @@ def _get_local_iate_terms(tokens, iate): | ||
469 | def _get_local_longest_iate_terms(tokens, iate): | 472 | def _get_local_longest_iate_terms(tokens, iate): |
470 | local_terms = [] | 473 | local_terms = [] |
471 | for term in iate: | 474 | for term in iate: |
472 | - for tok in tokens: | ||
473 | - if tok['id'] in term['tokens']: | ||
474 | - local_terms.append(term) | ||
475 | - break | 475 | + if term['type'] in ['fullForm', 'phrase', 'shortForm', 'formula'] and term['reliability_code'] > 6 and \ |
476 | + term['administrative_status'] in ['', 'admittedTerm-admn-sts', 'deprecatedTerm-admn-sts', | ||
477 | + 'preferredTerm-admn-sts']: | ||
478 | + for tok in tokens: | ||
479 | + if tok['id'] in term['tokens']: | ||
480 | + local_terms.append(term) | ||
481 | + break | ||
476 | return _get_longest_terms(local_terms) | 482 | return _get_longest_terms(local_terms) |
477 | 483 | ||
478 | 484 | ||
@@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate): | @@ -485,7 +491,7 @@ def _get_iate_col_value(tok, iate): | ||
485 | ordered_term_tokens = natsorted(term['tokens']) | 491 | ordered_term_tokens = natsorted(term['tokens']) |
486 | if tok['id'] == ordered_term_tokens[0]: | 492 | if tok['id'] == ordered_term_tokens[0]: |
487 | term_obj = IATETerm.objects.get(tid=term['id']) | 493 | term_obj = IATETerm.objects.get(tid=term['id']) |
488 | - domains = term_obj.eurovoc_terms.order_by('tid') | 494 | + domains = term_obj.eurovoc_terms().order_by('tid') |
489 | if domains.exists(): | 495 | if domains.exists(): |
490 | iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) | 496 | iate_vals.append('%d:%s-%s' % (ti, term['id'], ','.join([domain.tid for domain in domains]))) |
491 | else: | 497 | else: |