# preProcess.py
import re
import sys
from typing import Dict, Match, Tuple

def posDict(ann_segm: str) -> Dict[str, Tuple[str, str, bool]]:
	"""Index the ``<seg>`` elements of a segmentation document by ``xml:id``.

	Each self-closing ``<seg>`` whose ``corresp`` attribute looks like
	``text.xml#string-range(txt_<ab>,<pos>,<len>)`` yields one entry mapping
	its ``xml:id`` to ``(<ab>, <pos>, rejected)``:

	* ``<ab>`` is the text after ``txt_`` up to and including ``-ab``;
	* ``<pos>`` is the first number of the range, kept as a string;
	* ``rejected`` is True when the matched element text contains
	  ``nkjp:rejected="true"``.

	If the same ``xml:id`` is matched more than once, the last match wins.
	"""
	seg_pattern = r'<seg corresp="text.xml#string-range\(txt_(.+?-ab),(\d+?),\d+?\)".*?xml:id="(.+?)"/>'
	index: Dict[str, Tuple[str, str, bool]] = {}
	for seg in re.finditer(seg_pattern, ann_segm):
		block_id, offset, seg_id = seg.groups()
		is_rejected = 'nkjp:rejected="true"' in seg.group(0)
		index[seg_id] = (block_id, offset, is_rejected)
	return index

def tamperFun(m: Match, pD: Dict[str, Tuple[str, str, bool]]) -> str:
	"""Rebuild one matched ``<seg ...>`` opening tag with extra attributes.

	``m`` must expose three groups: the text before the referenced segment
	id, the id itself, and the text after it (up to, but excluding, the
	closing ``>``). The id is looked up in ``pD`` and the resulting
	``(ab, pos, rejected)`` triple is appended as ``pos="..."`` and
	``ab="..."`` attributes, followed by ``nkjp:rejected="true"`` when the
	flag is set.

	Raises:
		KeyError: if the referenced id is not a key of ``pD``.
	"""
	seg_id = m.group(2)
	block_id, offset, rejected = pD[seg_id]
	rejected_attr = ' nkjp:rejected="true"' if rejected else ''
	pieces = [
		m.group(1),
		seg_id,
		m.group(3),
		' pos="', offset, '"',
		' ab="', block_id, '"',
		rejected_attr,
		'>',
	]
	return ''.join(pieces)

def tamper(source: str, pD: Dict[str, Tuple[str, str, bool]]) -> str:
	"""Return ``source`` with every matching ``<seg>`` opening tag rewritten.

	A tag matches when it starts with
	``<seg corresp="ann_segmentation.xml#<id>" xml:id="...">``; each such tag
	is replaced by the result of ``tamperFun`` for that match and ``pD``.
	The pattern is applied with ``re.DOTALL``, so ``.`` also matches
	newlines. Text outside the matched tags is left untouched.
	"""
	seg_open_tag = re.compile(
		r'(<seg corresp="ann_segmentation.xml#)(.+?)(" xml:id=".+?")>',
		re.DOTALL,
	)

	def rewrite(match: Match) -> str:
		# Bind the lookup table so the callback fits re's one-argument form.
		return tamperFun(match, pD)

	return seg_open_tag.sub(rewrite, source)

if __name__ == '__main__':
	# Command line: argv[1] is the document whose <seg> tags are rewritten,
	# argv[2] is the segmentation document used to build the lookup table.
	if len(sys.argv) < 3:
		# sys.exit (not the site-provided exit()) works even when the site
		# module is not loaded; passing a string writes it to stderr and
		# exits with status 1, so the error neither pollutes the generated
		# output on stdout nor looks like success to the caller.
		sys.exit("Too few arguments.")
	with open(sys.argv[1], 'r') as source_file, open(sys.argv[2], 'r') as ann_segm:
		pD = posDict(ann_segm.read())
		# The rewritten document goes to stdout.
		print(tamper(source_file.read(), pD))