# preprocess_frequencies.py 586 Bytes
#!/usr/bin/env python3
import sys
import math

def parse_line(line):
    """Parse one tab-separated input line into ``(key, count)``.

    The line must have at least four tab-separated fields.  Field 0 is
    ignored, field 1 is the lemma, field 2 is a colon-separated tag and
    field 3 is an integer count (surrounding whitespace, including the
    trailing newline, is accepted by ``int``).

    The key is ``lemma + "\\t" + category`` where ``category`` is
    ``"subst_" + <third tag attribute>`` when the first tag attribute is
    ``subst``, and the first tag attribute otherwise.

    Raises:
        ValueError: if the line has fewer than four fields, if a ``subst``
            tag has fewer than three attributes, or if the count is not
            an integer.
    """
    fields = line.split("\t")
    if len(fields) < 4:
        raise ValueError(
            "expected at least 4 tab-separated fields, got {}".format(len(fields))
        )
    lemma = fields[1]
    attrs = fields[2].split(":")
    if attrs[0] == "subst":
        if len(attrs) < 3:
            raise ValueError(
                "'subst' tag has fewer than 3 attributes: {!r}".format(fields[2])
            )
        category = "subst_" + attrs[2]
    else:
        category = attrs[0]
    return lemma + "\t" + category, int(fields[3])


def count_frequencies(lines):
    """Sum the counts of all *lines* that share the same key.

    *lines* is any iterable of strings (an open text file works, so the
    input is streamed instead of being loaded into memory at once).
    Whitespace-only lines are skipped.

    Returns:
        A dict mapping each key (see ``parse_line``) to its total count,
        in order of first appearance.

    Raises:
        ValueError: for a malformed line; the message is prefixed with
            the 1-based line number.
    """
    totals = {}
    for lineno, line in enumerate(lines, start=1):
        if not line.strip():
            # A trailing empty line should not abort the whole run.
            continue
        try:
            key, count = parse_line(line)
        except ValueError as err:
            raise ValueError("line {}: {}".format(lineno, err)) from err
        totals[key] = totals.get(key, 0) + count
    return totals


def log_frequency(count):
    """Return ``log10(1 + count)``.

    ``math.log10`` is used rather than ``math.log(x, 10)`` because the
    two-argument form computes ``log(x) / log(10)`` and loses precision
    (e.g. it yields 2.9999999999999996 for x == 1000).
    """
    return math.log10(1 + count)


def main(argv=None):
    """Read the file named by the first argument and print one
    ``key<TAB>log10(1 + total)`` line per key to standard output.

    *argv* defaults to ``sys.argv``; arguments after the first are ignored.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit("usage: preprocess_frequencies.py FILE")

    # NOTE(review): the file is opened with the platform default encoding,
    # as before -- confirm the input encoding and pass it explicitly if known.
    with open(argv[1]) as f:
        totals = count_frequencies(f)

    for key, count in totals.items():
        print(key + "\t" + str(log_frequency(count)))


if __name__ == "__main__":
    main()