prepare2grams.py 1.13 KB

Edit Raw Blame History

from collections import Counter

# First words of interest: do / w / na ('we' is counted as 'w')

# Input file '2grams' has 75395184 lines

# Summed frequency of every kept digram, keyed by the string 'w1 w2'.
prep_2grams = Counter()
# Tally of the characters occurring in kept second words; used as a sanity
# check that no punctuation or similar gets through the cleaning.
characters = Counter()

# Characters that are stripped from second words and at which second words
# are cut.  Written as adjacent literals grouped by kind; the resulting
# string is a single constant.
STOPLIST = (
    '0123456789'
    '.,:;?!'
    '()[]'
    '"\'„“”˝‘’`‹›«»'
    '\\/_-–—'
    '<>=+@#$%^&*~|'
    '…§°²•×‰½→'
)

# Single pass over the input file, whose lines have the form "freq w1 w2".
# Digrams whose first word is 'do', 'w' or 'na' ('we' is folded into 'w')
# are kept: the second word is cleaned of STOPLIST characters, the frequency
# is added to prep_2grams and the word's characters are tallied in
# characters.
_stop_chars = frozenset(STOPLIST)  # constant-time membership test for the cut

with open('2grams', encoding='utf-8', mode='r') as fin:
    for line_no, line in enumerate(fin, 1):
        if line_no % 500000 == 0:
            # progress indicator for the very long input
            print(line_no)
        fields = line.split()
        if not fields:
            # A blank line carries no digram; skip it rather than fail on
            # the unpacking below.
            continue
        # Lines with a different number of fields still raise ValueError.
        freq, w1, w2 = fields
        freq = int(freq)
        if freq < 3:
            # Stops reading at the first rare digram -- presumably the input
            # is sorted by descending frequency; TODO confirm.
            break
        if w1 == 'we':
            w1 = 'w'
        if w1 not in ('do', 'w', 'na'):
            continue
        w2 = w2.strip(STOPLIST)
        # Keep only what precedes the first remaining STOPLIST character,
        # eg. ‘do pomocy!przy’ -> 'pomocy'.
        cut = next(
            (pos for pos, ch in enumerate(w2) if ch in _stop_chars),
            len(w2),
        )
        w2 = w2[:cut]
        if not w2:
            # The second token consisted solely of STOPLIST characters;
            # counting it would create an entry with an empty second word.
            continue
        prep_2grams['{} {}'.format(w1, w2)] += freq
        characters.update(w2)

# Write the aggregated digrams, most frequent first, one per line in the
# form "<digram><TAB><summed frequency>".
with open('2grams_prep_nkjp', mode='w', encoding='utf-8') as fout:
    # most_common() yields (key, count) pairs, i.e. (digram, total).
    for digram, total in prep_2grams.most_common():
        print('{}\t{}'.format(digram, total), file=fout)

# Empty line before the character report.
print()

# Report every character found in the kept second words together with its
# number of occurrences, most frequent first.  most_common() yields
# (key, count) pairs, i.e. (character, occurrences).
for char, occurrences in characters.most_common():
    print(char, occurrences)