preprocess_frequencies.py
586 Bytes
#!/usr/bin/env python3
import sys
import math
def _frequency_key(lemma, tag):
    """Return the ``lemma<TAB>category`` key under which counts are summed.

    ``tag`` is a colon-separated attribute string.  The category is the
    first attribute, except for ``subst`` tags, where it becomes
    ``"subst_"`` plus the third attribute.

    Raises:
        IndexError: if a ``subst`` tag has fewer than three attributes.
    """
    attrs = tag.split(":")
    if attrs[0] == "subst":
        # NOTE(review): attrs[2] is presumably the grammatical case in an
        # NKJP-style tag (subst:number:case:gender) -- confirm against data.
        category = "subst_" + attrs[2]
    else:
        category = attrs[0]
    return lemma + "\t" + category


def aggregate_frequencies(lines):
    """Sum the counts of tab-separated records per ``lemma<TAB>category`` key.

    Each record must have at least four tab-separated fields; field 1 is the
    lemma, field 2 the colon-separated tag and field 3 an integer count
    (field 0 is ignored).  Lines that are empty or whitespace-only are
    skipped.

    Args:
        lines: any iterable of text lines (an open file works and is
            consumed lazily).

    Returns:
        A dict mapping each key to its total count, in first-seen order.

    Raises:
        ValueError: if a non-blank line has too few fields, a ``subst`` tag
            with too few attributes, or a non-integer count.  The message
            carries the 1-based line number.
    """
    totals = {}
    for line_no, line in enumerate(lines, start=1):
        if not line.strip():
            # A trailing blank line should not abort the whole run.
            continue
        fields = line.split("\t")
        try:
            key = _frequency_key(fields[1], fields[2])
            # int() tolerates the trailing newline when the count is the
            # last field on the line.
            count = int(fields[3])
        except (IndexError, ValueError) as err:
            raise ValueError(
                "line {}: malformed record {!r}".format(line_no, line)
            ) from err
        totals[key] = totals.get(key, 0) + count
    return totals


def log_scale(totals):
    """Map every total ``n`` to ``log10(1 + n)``, keeping key order.

    ``math.log10`` is used instead of ``math.log(x, 10)`` because the
    two-argument form divides two natural logarithms and loses precision
    (e.g. it yields 2.9999999999999996 for 1000).
    """
    return {key: math.log10(1 + count) for key, count in totals.items()}


def main(argv):
    """Read the file named by ``argv[1]`` and print ``key<TAB>log-frequency``.

    Exits with a usage message on stderr when no file name is given.
    Additional arguments are ignored.
    """
    if len(argv) < 2:
        sys.exit("usage: {} FREQUENCY_FILE".format(argv[0] if argv else "prog"))
    # Explicit encoding keeps results independent of the platform locale.
    # NOTE(review): assumes the frequency list is UTF-8 -- confirm.
    with open(argv[1], encoding="utf-8") as f:
        totals = aggregate_frequencies(f)
    for key, value in log_scale(totals).items():
        print(key + "\t" + str(value))


if __name__ == "__main__":
    main(sys.argv)