harmonize_licences.py
3.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import argparse
import json
import os
# Lookup table from the verbose (Polish) Creative Commons licence names that
# appear in the corpus metadata to their short canonical identifiers.
LICENCES_MAP = {
    # Attribution
    'CC BY - Creative Commons Uznanie Autorstwa 4.0': (
        'CC BY 4.0'
    ),
    'CC BY - Creative Commons Uznanie Autorstwa 3.0 PL': (
        'CC BY 3.0 PL'
    ),
    # Attribution-ShareAlike / Attribution-NonCommercial
    'CC BY-SA Creative Commons Uznanie Autorstwa - Na tych samych warunkach 4.0': (
        'CC BY-SA 4.0'
    ),
    'CC BY-NC Creative Commons Uznanie Autorstwa - Użycie niekomercyjne 4.0': (
        'CC BY-NC 4.0'
    ),
    'CC BY-SA Creative Commons Uznanie Autorstwa - Na tych samych warunkach 3.0 PL': (
        'CC BY-SA 3.0 PL'
    ),
    'CC BY-NC Creative Commons Uznanie Autorstwa - Użycie niekomercyjne 3.0 PL': (
        'CC BY-NC 3.0 PL'
    ),
    # Attribution-NonCommercial-ShareAlike
    'CC BY-NC-SA Creative Commons Uznanie autorstwa - Użycie niekomercyjne - Na tych samych warunkach 4.0': (
        'CC BY-NC-SA 4.0'
    ),
    'CC BY-NC-SA Creative Commons Uznanie autorstwa - Użycie niekomercyjne - Na tych samych warunkach 3.0 PL': (
        'CC BY-NC-SA 3.0 PL'
    ),
}
def main():
    """Entry point: parse the command line and harmonize the corpus licences.

    Raises:
        SystemExit: with status 1 when the ``--input`` path is not an
            existing directory.
    """
    args = parse_arguments()
    if not os.path.isdir(args.input):
        print('Error: Input must be a root corpora directory!')
        # Abort instead of falling through: walking a non-directory finds no
        # files, so the run would otherwise "succeed" without doing anything.
        raise SystemExit(1)
    harmonize_licenses(args.input, args.output)
def parse_arguments():
    """Build the command-line parser and parse the process arguments.

    Returns:
        The ``argparse.Namespace`` with the mandatory ``input`` (corpora root
        directory) and ``output`` (output directory) attributes.
    """
    cli = argparse.ArgumentParser(description='Harmonize licences.')
    # Mandatory options live in their own group so that the help output does
    # not list them under argparse's default "options" heading.
    mandatory = cli.add_argument_group('required arguments')
    option_specs = (
        ('-i', '--input', 'corpora root directory'),
        ('-o', '--output', 'output directory'),
    )
    for short_flag, long_flag, description in option_specs:
        mandatory.add_argument(short_flag, long_flag, help=description, required=True)
    return cli.parse_args()
def harmonize_licenses(root_directory, harmonized_outpath):
    """Copy every CoNLL-U(-Plus) file under a root with a harmonized licence.

    Walks ``root_directory`` recursively. Each ``.conllup`` / ``.conllu`` file
    is copied into ``harmonized_outpath`` with its ``Licence`` metadata line
    rewritten:

    * files that are not full texts (see ``is_full_text``) get ``CC0``;
    * full texts get the short name their licence maps to in ``LICENCES_MAP``;
    * a licence that has no entry in ``LICENCES_MAP`` is left unchanged
      (a warning is printed unless it already is one of the short names).

    The output is flat: files are written directly into ``harmonized_outpath``
    under their base name, so equally named files from different
    sub-directories overwrite one another.

    Args:
        root_directory: directory tree to search for corpus files.
        harmonized_outpath: directory receiving the harmonized copies; it is
            created on demand when the first matching file is found.
    """
    already_harmonized = set(LICENCES_MAP.values())
    for root, _dirs, files in os.walk(root_directory):
        for filename in files:
            if not filename.endswith(('.conllup', '.conllu')):
                continue
            src = os.path.join(root, filename)
            if not is_full_text(src):
                licence = 'CC0'
            else:
                declared = get_licence(src)
                licence = LICENCES_MAP.get(declared)
                if licence is None:
                    # Unmapped value: keep it as is rather than aborting the
                    # whole run with a KeyError half-way through the corpus.
                    licence = declared
                    if declared not in already_harmonized:
                        print(f'Warning: unknown licence {declared!r} in {src}; left unchanged.')
            os.makedirs(harmonized_outpath, exist_ok=True)
            dst = os.path.join(harmonized_outpath, filename)
            copy_and_harmonize(src, dst, licence)
def get_licence(filepath):
    """Return the value of the first ``Licence`` metadata line of a file.

    Args:
        filepath: path of the CoNLL-U(-Plus) file to inspect.

    Returns:
        The text after ``=`` on the first ``# Licence = ...`` line, or the
        string ``'0'`` when the file has no such line.
    """
    # Read explicitly as UTF-8: the licence names contain Polish diacritics,
    # which a locale-dependent default encoding may decode incorrectly.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Only "# name = value" comments can carry the licence; comments
            # without '=' are skipped instead of being split.
            if is_metadata(line) and '=' in line:
                name, value = get_metadata(line)
                if name == 'Licence':
                    return value
    return '0'
def is_segment(line):
    """Tell whether ``line`` is a segment line, i.e. starts with a digit.

    Empty lines are never segments.
    """
    return bool(line) and line[0].isdigit()
def is_metadata(line):
    """Tell whether ``line`` is a metadata (comment) line, i.e. starts with '#'."""
    return line.startswith('#')
def get_metadata(line):
    """Split a ``# name = value`` metadata line into its name and value.

    The line is split on the first ``=`` only, so the value may itself
    contain ``=``. Leading ``#`` characters and surrounding whitespace are
    removed from the name; surrounding whitespace is removed from the value.

    Args:
        line: a metadata line.

    Returns:
        A ``(name, value)`` tuple. When the line contains no ``=`` the whole
        comment text is returned as the name and the value is ``''``.
    """
    # partition() never fails on a missing separator, unlike indexing the
    # result of split('=', 1).
    raw_name, _separator, raw_value = line.partition('=')
    return raw_name.lstrip('#').strip(), raw_value.strip()
def is_full_text(filepath):
    """Tell whether a corpus file is a full text.

    The decision is taken from the first ``DocumentType`` metadata line: the
    file is a full text only when that value is ``paper``.

    Args:
        filepath: path of the CoNLL-U(-Plus) file to inspect.

    Returns:
        ``True`` when the first ``DocumentType`` value equals ``'paper'``;
        ``False`` for any other value or when the line is missing.
    """
    # Read explicitly as UTF-8 so the result does not depend on the locale.
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Comments without '=' carry no name/value pair; skip them.
            if is_metadata(line) and '=' in line:
                name, value = get_metadata(line)
                if name == 'DocumentType':
                    return value.strip() == 'paper'
    return False
def copy_and_harmonize(src, dst, licence):
    """Copy ``src`` to ``dst``, replacing the value of every ``Licence`` line.

    Each line is stripped of surrounding whitespace. Metadata lines named
    ``Licence`` are rewritten as ``# Licence = <licence>``; all other lines
    are copied as they are. Every output line, including the last one, ends
    with a newline, so the blank line closing the final sentence is kept.

    Args:
        src: path of the file to read.
        dst: path of the file to write (overwritten if it exists).
        licence: licence name to put on the ``Licence`` metadata lines.
    """
    harmonized_lines = []
    # Both files are handled as UTF-8 so the copy does not depend on the locale.
    with open(src, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # The '=' check keeps plain comments (no name/value pair) away
            # from get_metadata(); they are copied unchanged.
            if is_metadata(line) and '=' in line and get_metadata(line)[0] == 'Licence':
                line = f'# Licence = {licence}'
            harmonized_lines.append(line)
    # The source is fully read before the destination is opened, so the
    # function still works if both paths point at the same file.
    with open(dst, 'w', encoding='utf-8') as dst_file:
        dst_file.writelines(f'{line}\n' for line in harmonized_lines)
# Run the command-line tool only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()