get_exact_domains_mapping.py
2.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import argparse
import os
def main():
    """Run the command-line tool end to end.

    Parses the command-line arguments, collects the domain mappings from the
    corpora root directory given by ``--input`` and hands them to
    ``write_mappings`` together with the ``--output`` path.

    Raises:
        SystemExit: If the input directory argument is empty. The message is
            written to stderr and the process exits with status 1.
    """
    args = parse_arguments()
    if not args.input:
        # argparse only guarantees that -i was supplied, not that it is
        # non-empty; stop here instead of walking an empty path and
        # emitting a header-only report.
        raise SystemExit('Error: Corpora root directory must be selected!')
    mappings = get_mappings(args.input)
    write_mappings(mappings, args.output)
def parse_arguments():
    """Parse the command-line options of the script.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``. It carries
        ``input`` (required, ``-i/--input``) and ``output`` (optional,
        ``-o/--output``; ``None`` when omitted).
    """
    cli = argparse.ArgumentParser(
        description='Generate domains mapping list for conllup corpora.',
    )
    cli.add_argument(
        '-o',
        '--output',
        help='output tsv file with domains mappings',
    )
    # A dedicated group so the mandatory option is listed under its own
    # heading in the --help output.
    mandatory = cli.add_argument_group('required arguments')
    mandatory.add_argument(
        '-i',
        '--input',
        help='corpora root directory',
        required=True,
    )
    namespace = cli.parse_args()
    return namespace
def get_mappings(root_directory):
    """Collect the distinct disciplines values recorded for each domain.

    Walks ``root_directory`` recursively and, for every file whose name ends
    in ``.conllup`` or ``.conllu``, looks up its domain and disciplines
    values.

    Args:
        root_directory: Directory to walk.

    Returns:
        A dict mapping each domain string to a list of the distinct
        disciplines strings found together with it, in first-seen order.
    """
    corpus_suffixes = ('.conllup', '.conllu')
    collected = {}
    for current_dir, _subdirs, names in os.walk(root_directory):
        for name in names:
            if not name.endswith(corpus_suffixes):
                continue
            path = os.path.join(current_dir, name)
            domain_value = get_domain(path)
            disciplines_value = get_disciplines(path)
            # One list per domain; each disciplines value is stored once.
            known = collected.setdefault(domain_value, [])
            if disciplines_value not in known:
                known.append(disciplines_value)
    return collected
def get_domain(filepath):
    """Return the value of the first ``domain`` metadata line in a file.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The stripped value of the first ``#``-prefixed ``name = value`` line
        whose name is ``domain``, or an empty string when there is none.
    """
    # The CoNLL-U format is specified as UTF-8, so decode explicitly instead
    # of relying on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line) or not is_metadata(line):
                continue
            # Comment lines without '=' (e.g. "# newpar") hold no name/value
            # pair; skip them rather than failing while splitting.
            if '=' not in line:
                continue
            name, value = get_metadata(line)
            if name == 'domain':
                return value.strip()
    return ''
def get_disciplines(filepath):
    """Return the value of the first ``disciplines`` metadata line in a file.

    Args:
        filepath: Path of the file to scan.

    Returns:
        The stripped value of the first ``#``-prefixed ``name = value`` line
        whose name is ``disciplines``, or an empty string when there is none.
    """
    # The CoNLL-U format is specified as UTF-8, so decode explicitly instead
    # of relying on the platform's default encoding.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            if is_segment(line) or not is_metadata(line):
                continue
            # Comment lines without '=' (e.g. "# newpar") hold no name/value
            # pair; skip them rather than failing while splitting.
            if '=' not in line:
                continue
            name, value = get_metadata(line)
            if name == 'disciplines':
                return value.strip()
    return ''
def is_segment(line):
    """Return True when ``line`` is non-empty and starts with a digit.

    Callers use this to skip such lines while looking for metadata.
    """
    # Slicing keeps the empty string safe: ''.isdigit() is False.
    return line[:1].isdigit()
def is_metadata(line):
    """Return True when ``line`` starts with the ``#`` comment marker."""
    first_char = line[:1]
    return first_char == '#'
def get_metadata(line):
    """Split a ``#``-prefixed metadata line into its name and value.

    The line is split at the first ``=``. Leading ``#`` characters and
    surrounding whitespace are removed from the name, and surrounding
    whitespace from the value.

    Args:
        line: A comment line such as ``# domain = law``.

    Returns:
        A ``(name, value)`` tuple. ``value`` is an empty string when the line
        contains no ``=`` (e.g. ``# newpar``).
    """
    # partition() always yields three parts, so a line without '=' gives an
    # empty value instead of raising IndexError.
    raw_name, _separator, raw_value = line.partition('=')
    name = raw_name.lstrip('#').strip()
    value = raw_value.strip()
    return name, value
def write_mappings(mappings, outpath):
    """Write the formatted mappings to a file, or print them.

    Args:
        mappings: Mapping passed on to ``stats_to_string`` for formatting.
        outpath: Path of the file to write. When empty or ``None`` the
            report is printed to standard output instead.
    """
    report = stats_to_string(mappings)
    if outpath:
        # Write UTF-8 explicitly so non-ASCII names do not depend on (or
        # fail under) the platform's default encoding.
        with open(outpath, 'w', encoding='utf-8') as outfile:
            outfile.write(report)
    else:
        print(report)
def stats_to_string(mappings):
    """Render the mappings as tab-separated text with a header row.

    Args:
        mappings: Dict mapping a domain to an iterable of disciplines values.

    Returns:
        A string with one header line followed by one ``domain<TAB>value``
        line per pair, ordered by domain and then by value, joined with
        newlines and without a trailing newline.
    """
    header = 'CURLICAT Domain\tScientificDiscipline(s)'
    rows = [
        f'{domain}\t{discipline}'
        for domain in sorted(mappings)
        for discipline in sorted(mappings[domain])
    ]
    return '\n'.join([header, *rows])
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == '__main__':
    main()