to_raw.py
3.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import argparse
import os
def main():
    """Validate the command-line arguments and build the raw text corpora.

    Raises:
        SystemExit: If the input or output argument is empty, or if the
            output path is not an existing directory. The error message is
            written to stderr and the process exits with status 1.
    """
    args = parse_arguments()
    # Abort on the first invalid argument instead of merely reporting it;
    # continuing would run the conversion against unusable paths.
    if not args.input:
        raise SystemExit('Error: Corpora root directory must be selected!')
    if not args.output:
        raise SystemExit('Error: Directory for raw data must be selected!')
    if not os.path.isdir(args.output):
        raise SystemExit('Error: Selected output must be a directory!')
    create_raw_text_corpora(args.input, args.output)
def parse_arguments():
    """Parse the command line and return the populated namespace.

    Two mandatory options are defined: ``-i/--input`` and ``-o/--output``.

    Returns:
        argparse.Namespace: Namespace with ``input`` and ``output`` attributes.
    """
    arg_parser = argparse.ArgumentParser(
        description='Create raw text corpora from conllup files.')
    # A dedicated group so the options are listed as required in --help.
    group = arg_parser.add_argument_group('required arguments')
    options = (
        ('-i', '--input', 'corpora root directory'),
        ('-o', '--output', 'output directory (must exist)'),
    )
    for short_flag, long_flag, help_text in options:
        group.add_argument(short_flag, long_flag, help=help_text, required=True)
    return arg_parser.parse_args()
def create_raw_text_corpora(conllu_root_directory, raw_text_directory):
    """Extract raw text from every corpus file below a root directory.

    Walks *conllu_root_directory* recursively. For each file ending in
    ``.conllup`` or ``.conllu`` the matching sub-directory is created under
    *raw_text_directory* and the file is handed to ``extract_text_and_save``.

    Args:
        conllu_root_directory: Root of the directory tree to scan.
        raw_text_directory: Root of the directory tree to write into.
    """
    for current_dir, _subdirs, names in os.walk(conllu_root_directory):
        for name in names:
            if not name.endswith(('.conllup', '.conllu')):
                continue
            print('Extracting text from %s.' % name)
            source_path = os.path.join(current_dir, name)
            # Mirror the file's location relative to the corpus root.
            relative_path = os.path.relpath(source_path, conllu_root_directory)
            target_dir = os.path.dirname(
                os.path.join(raw_text_directory, relative_path))
            os.makedirs(target_dir, exist_ok=True)
            extract_text_and_save(source_path, target_dir)
def _save_document(document_id, document_text, raw_text_directory):
    """Write one collected document to ``<document_id>.txt``, if it has text and an id."""
    if document_text and document_id:
        raw_file_path = os.path.join(raw_text_directory, '%s.txt' % document_id)
        save_raw_text(document_text.strip(), raw_file_path)


def extract_text_and_save(filepath, raw_text_directory):
    """Extract the raw text of every document in a corpus file and save it.

    The file is scanned for ``# name = value`` comment lines:

    * ``newdoc id`` starts a new document; the document collected so far is
      saved as ``<document id>.txt`` in *raw_text_directory*.
    * ``newpar id`` starts a new paragraph (a blank line in the output).
    * ``text`` contributes one sentence. Once any ``newpar id`` has been seen
      in the file, sentences within a paragraph are joined with a single
      space; otherwise every sentence is separated by a blank line.

    Text that appears before the first ``newdoc id`` is saved under the base
    name of *filepath* (without extension). Documents with an empty id or
    without any collected text are not written.

    Args:
        filepath: Path of the corpus file to read (decoded as UTF-8).
        raw_text_directory: Existing directory that receives the ``.txt`` files.
    """
    paragraphs_specified = False
    # Until a `newdoc id` is seen, fall back to the source file's base name so
    # text in files without document markers is still collected and saved.
    document_id = os.path.splitext(os.path.basename(filepath))[0]
    document_text = ''
    with open(filepath, 'r', encoding='utf-8') as conllup_file:
        for line in conllup_file:
            line = line.strip()
            # Only `# name = value` comments are of interest; a comment
            # without '=' has no value to read, so it is skipped as well.
            if not is_metadata(line) or '=' not in line:
                continue
            name, value = get_metadata(line)
            if name == 'newdoc id':
                _save_document(document_id, document_text, raw_text_directory)
                document_id = value
                document_text = ''
            elif name == 'newpar id':
                paragraphs_specified = True
                document_text += '\n\n'
            elif name == 'text':
                if not paragraphs_specified:
                    document_text += '\n\n%s' % value
                elif document_text.endswith('\n\n'):
                    # First sentence of a paragraph: no leading space.
                    document_text += value
                else:
                    document_text += ' %s' % value
    # The last document has no following `newdoc id` to trigger its save.
    _save_document(document_id, document_text, raw_text_directory)
def is_metadata(line):
    """Return True when *line* is a comment line, i.e. begins with ``#``."""
    return line.startswith('#')
def get_metadata(line):
    """Split a ``# name = value`` comment line into its name and value.

    Only the first ``=`` separates name from value, so values may contain
    ``=`` themselves. A line without any ``=`` yields an empty value rather
    than raising.

    Args:
        line: A comment line, normally starting with ``#``.

    Returns:
        tuple: ``(name, value)`` with leading ``#`` characters removed from
        the name and surrounding whitespace stripped from both parts.
    """
    # partition() always returns three parts, so a missing '=' is safe.
    name, _separator, value = line.partition('=')
    return name.lstrip('#').strip(), value.strip()
def save_raw_text(text, raw_file_path):
    """Write *text* to *raw_file_path*, replacing any existing file.

    Args:
        text: The string to write.
        raw_file_path: Destination file path; its directory must exist.
    """
    # Encode explicitly as UTF-8 so the output does not depend on the
    # platform's locale encoding.
    with open(raw_file_path, 'w', encoding='utf-8') as raw_file:
        raw_file.write(text)
# Run the conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()