|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
|
import re
import sys
from typing import Match
def prepare(m: Match) -> str:
    """Return the first captured group of *m* with XML entities decoded.

    The three predefined XML entities that can appear in element text
    (``&amp;``, ``&gt;``, ``&lt;``) are replaced by the characters they
    stand for, and a trailing newline is appended so each match ends up on
    its own output line.

    Args:
        m: A regex match whose group 1 holds the raw text to clean up.

    Returns:
        The decoded text of group 1 followed by ``'\\n'``.
    """
    entities = {"&amp;": "&", "&gt;": ">", "&lt;": "<"}
    # Decode in a single pass over the text: chained replacements would
    # double-decode input such as "&amp;lt;" into "<" instead of "&lt;".
    pattern = "|".join(re.escape(entity) for entity in entities)
    decoded = re.sub(pattern, lambda hit: entities[hit.group(0)], m.group(1))
    return decoded + '\n'
def _extract(src_path: str, dst_path: str, pattern: str) -> None:
    """Write every match of *pattern* found in *src_path* to *dst_path*.

    The whole source file is read at once, each match is passed through
    ``prepare`` (which yields group 1 plus a newline), and the results are
    written in order of appearance.

    Args:
        src_path: File to search.
        dst_path: File to create or overwrite with the extracted text.
        pattern: Regular expression with one capturing group.
    """
    # The source is opened first so a missing input does not leave behind
    # an empty, truncated output file.
    # NOTE(review): UTF-8 is assumed for both files instead of the platform
    # default encoding -- confirm against the actual corpus files.
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        dst.writelines(prepare(hit) for hit in re.finditer(pattern, src.read()))


if __name__ == '__main__':
    # Usage: <script> <input_dir>
    # Reads text.xml and the fold_*.txt files from <input_dir> and writes the
    # extract_*.txt files into the current working directory.
    if len(sys.argv) < 2:
        sys.exit('usage: ' + sys.argv[0] + ' <input_dir>')
    base_dir = sys.argv[1]

    # Text content of the <ab ...>...</ab> elements in the XML source.
    _extract(base_dir + '/text.xml', 'extract_text.txt', r'<ab.+?>(.+?)</ab>')

    # The four fold_* files are all searched with the same pattern: the line
    # following a "-ab;" marker, up to its last ";".
    for layer in ('text', 'segm', 'morph', 'sense'):
        _extract(base_dir + '/fold_' + layer + '.txt',
                 'extract_fold_' + layer + '.txt',
                 r'-ab;\n(.*);')
|