# preProcess.py
# 1 KB
import re
import sys
from typing import Dict, Match, Tuple
def posDict(ann_segm: str) -> Dict[str, Tuple[str, str, bool]]:
    """Build a lookup table of segments keyed by their ``xml:id``.

    Scans *ann_segm* for self-closing ``<seg .../>`` elements whose
    ``corresp`` attribute has the form
    ``text.xml#string-range(txt_<ab>,<first>,<second>)`` and which carry an
    ``xml:id`` attribute later on the same line.

    Args:
        ann_segm: Full text of the segmentation document.

    Returns:
        A dict mapping each ``xml:id`` value to a tuple
        ``(ab, first, rejected)`` where ``ab`` is the text following
        ``txt_`` up to and including ``-ab``, ``first`` is the first number
        of the ``string-range`` (kept as a string; presumably the start
        offset -- confirm against the corpus format), and ``rejected`` is
        True when the matched element contains ``nkjp:rejected="true"``.
        The second ``string-range`` number is not recorded. If an ``xml:id``
        occurs more than once, the last occurrence wins.
    """
    seg_pattern = r'<seg corresp="text.xml#string-range\(txt_(.+?-ab),(\d+?),\d+?\)".*?xml:id="(.+?)"/>'
    rejected_marker = 'nkjp:rejected="true"'

    index = {}
    for found in re.finditer(seg_pattern, ann_segm):
        ab_id, first_number, seg_id = found.groups()
        # The rejection flag may sit anywhere inside the matched element,
        # so it is looked up in the whole match rather than in a group.
        index[seg_id] = (ab_id, first_number, rejected_marker in found.group(0))
    return index
def tamperFun(m: Match, pD: Dict[str, Tuple[str, str, bool]]) -> str:
    """Rebuild one matched ``<seg ...>`` opening tag with extra attributes.

    Args:
        m: A match with three groups: the tag text up to and including
            ``#``, the referenced segment id, and the remainder of the tag
            up to (but excluding) the closing ``>``.
        pD: Lookup table mapping a segment id to ``(ab, pos, rejected)``.

    Returns:
        The three groups concatenated unchanged, followed by
        `` pos="<pos>" ab="<ab>"``, then `` nkjp:rejected="true"`` when the
        entry is flagged as rejected, then the closing ``>``.

    Raises:
        KeyError: If the id captured in group 2 is not a key of *pD*.
    """
    ab_id, position, is_rejected = pD[m.group(2)]
    opening, seg_id, remainder = m.group(1, 2, 3)

    pieces = [opening, seg_id, remainder, ' pos="', position, '" ab="', ab_id, '"']
    if is_rejected:
        pieces.append(' nkjp:rejected="true"')
    pieces.append('>')
    return ''.join(pieces)
def tamper(source: str, pD: Dict[str, Tuple[str, str, bool]]) -> str:
    """Annotate every ``<seg>`` tag in *source* that refers to the segmentation.

    Each opening tag of the form
    ``<seg corresp="ann_segmentation.xml#<id>" xml:id="...">`` is replaced
    by the result of ``tamperFun`` for that match; all other text is
    returned unchanged.

    Args:
        source: Text of the document to rewrite.
        pD: Lookup table passed through to ``tamperFun`` for every match.

    Returns:
        The rewritten document text.
    """
    # DOTALL lets the lazy ``.+?`` groups run across line breaks, so a tag
    # whose attributes are wrapped over several lines is still matched.
    seg_open_tag = re.compile(
        r'(<seg corresp="ann_segmentation.xml#)(.+?)(" xml:id=".+?")>',
        flags=re.DOTALL,
    )

    def enrich(found: Match) -> str:
        return tamperFun(found, pD)

    return seg_open_tag.sub(enrich, source)
if __name__ == '__main__':
    # Usage: <script> SOURCE_XML ANN_SEGMENTATION_XML
    #
    # Reads the segmentation document (second argument), builds the segment
    # lookup table from it, rewrites the <seg> tags of the first document
    # accordingly, and writes the result to standard output.
    if len(sys.argv) < 3:
        # sys.exit() with a string prints it to stderr and exits with
        # status 1. That keeps the message out of stdout (which carries the
        # rewritten document) and lets callers detect the failure. It also
        # avoids the bare exit() helper, which is only injected by the
        # ``site`` module and may be missing (e.g. ``python -S``).
        sys.exit("Too few arguments.")
    # NOTE(review): both files are opened with the platform default text
    # encoding; if the inputs are always UTF-8, consider passing
    # encoding="utf-8" explicitly -- confirm against the corpus files.
    with open(sys.argv[1], 'r') as i, open(sys.argv[2], 'r') as ann_segm:
        pD = posDict(ann_segm.read())
        print(tamper(i.read(), pD))