Blame view

NKJP/validation/checkSentences.py 1.09 KB
Jan Lupa authored
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
from enum import Enum
import sys

class State(Enum):
	"""Parser state carried from one input line to the next by the main loop."""

	# Nothing is pending; set after an indent-0 line or a token (indent-3) line.
	DEF = 0
	# An indent-1 line was just read; the following indent-0 line is stored
	# as the current paragraph text.
	NEWPAR = 1
	# An indent-2 line was just read; the next token line restarts the
	# sentence offsets.
	NEWSEN = 2
	# A blank line has already been handled; further blank lines are ignored
	# until some other line changes the state.
	SKIP = 3

def indent(l: str) -> int:
	"""Return the indentation level of line *l*.

	The line is sampled at every second character starting from index 0
	(indices 0, 2, 4, ...); the result is the number of consecutive sampled
	positions that hold a tab character.

	NOTE(review): the stride of two is kept from the original code; it
	presumably matches the layout of the input files - confirm.

	Args:
		l: A line of text, with or without its trailing newline.

	Returns:
		The indentation level as an ``int``; 0 for an empty string or a
		line that does not start with a tab.
	"""
	i = 0
	# Bounds check first: a line that is empty, or whose sampled positions
	# are all tabs (e.g. "\t\n"), would otherwise index past the end.
	while i < len(l) and l[i] == '\t':
		i += 2
	# Floor division keeps the declared int return type (i is always even).
	return i // 2

if __name__ == '__main__':
	# Validate the file named by the first command-line argument: rebuild each
	# sentence from its token lines and compare it with the corresponding
	# slice of the paragraph text, printing a report for every difference.
	path = sys.argv[1]
	with open(path) as src:
		state = State.DEF
		ab = ""      # current paragraph text (indent-0 line following an indent-1 line)
		sen = ""     # sentence text rebuilt from token lines
		senBeg = 0   # offset in `ab` where the current sentence starts
		senEnd = 0   # offset in `ab` just past the last token appended to `sen`
		for line in src:
			if len(line) == 1:
				# A one-character line (normally a bare newline) closes the
				# sentence collected so far. SKIP prevents re-checking on
				# consecutive blank lines.
				if state != State.SKIP:
					# Non-breaking spaces are treated as ordinary spaces.
					sen = sen.replace(chr(160), ' ')
					ab = ab.replace(chr(160), ' ')
					expected = ab[senBeg:senEnd]
					if sen != expected:
						print("Mismatch in " + path + ":\n" + sen + "\n" + expected)
						# Positions in `sen` that differ from the paragraph
						# slice. Positions past the end of the slice count as
						# differences instead of raising IndexError.
						print([
							pos for pos, ch in enumerate(sen)
							if pos >= len(expected) or ch != expected[pos]
						])
					sen = ""
					state = State.SKIP
				continue

			# NOTE(review): a sentence still pending when the file ends
			# without a trailing blank line is never compared - confirm
			# whether the input format guarantees that blank line.
			level = indent(line)
			if level == 0:
				# Only the indent-0 line right after an indent-1 line is
				# taken as paragraph text (trailing newline included).
				if state == State.NEWPAR:
					ab = line
				state = State.DEF
			elif level == 1:
				state = State.NEWPAR
			elif level == 2:
				state = State.NEWSEN
			elif level == 3:
				# Token line: '; '-separated fields, of which field 1 is the
				# start offset, field 2 the length and field 3 the text.
				fields = line.split('; ')
				beg = int(fields[1])
				if state == State.NEWSEN:
					# First token of a sentence fixes where it starts.
					senBeg = beg
					senEnd = beg
				length = int(fields[2])
				# Fill the gap between the previous token's end and this
				# token's start with spaces (no padding if the gap is <= 0).
				sen += ' ' * (beg - senEnd)
				sen += fields[3]
				senEnd = beg + length
				state = State.DEF