prepare2grams.py
1.13 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
from collections import Counter
# Collect 2-grams whose first word is one of the prepositions do / w / na
# ('we' is folded into 'w') and sum their frequencies per cleaned digram.
# 2grams: 75395184 lines

# Characters that must not survive in the second word of a digram.
STOPLIST = '0123456789.,:;?!()[]"\'„“”˝‘’`‹›«»\\/_-–—<>=+@#$%^&*~|…§°²•×‰½→'
# Set form of STOPLIST for O(1) membership tests while scanning a word.
_STOP_CHARS = frozenset(STOPLIST)
# First words whose digrams are kept.
PREPOSITIONS = ('do', 'w', 'na')
# Reading stops at the first line whose frequency is below this value.
MIN_FREQ = 3
# A progress line (the current line number) is printed every this many lines.
PROGRESS_EVERY = 500000


def clean_word(word):
    """Return *word* without STOPLIST characters around or after its core.

    Leading and trailing STOPLIST characters are stripped, then the word is
    cut at the first STOPLIST character that remains inside it, e.g.
    'pomocy!przy' becomes 'pomocy'. The result may be an empty string when
    the word consists only of STOPLIST characters.
    """
    stripped = word.strip(STOPLIST)
    for idx, char in enumerate(stripped):
        if char in _STOP_CHARS:
            return stripped[:idx]
    return stripped


def count_prep_2grams(lines, min_freq=MIN_FREQ, progress_every=PROGRESS_EVERY):
    """Aggregate preposition digram frequencies from 'freq w1 w2' lines.

    Args:
        lines: iterable of text lines, each holding three whitespace-separated
            fields: an integer frequency, the first word and the second word.
        min_freq: processing stops at the first line whose frequency is lower
            than this. NOTE(review): this presumes the input is sorted by
            descending frequency -- confirm against the producer of '2grams'.
        progress_every: print the line number every this many lines; a falsy
            value disables progress output.

    Returns:
        A pair ``(prep_2grams, characters)`` of Counters: summed frequency per
        'w1 w2' digram, and the number of occurrences of each character in the
        accepted second words (to check that no punctuation etc. gets
        through).

    Raises:
        ValueError: if the frequency field of a three-field line is not an
            integer.
    """
    prep_2grams = Counter()
    characters = Counter()
    for line_no, line in enumerate(lines, start=1):
        if progress_every and line_no % progress_every == 0:
            print(line_no)
        fields = line.split()
        if len(fields) != 3:
            # Blank or malformed line: skip it instead of aborting the run.
            continue
        freq, w1, w2 = fields
        if w1 == 'we':
            w1 = 'w'
        freq = int(freq)
        if freq < min_freq:
            break
        if w1 not in PREPOSITIONS:
            continue
        w2 = clean_word(w2)
        if not w2:
            # Second word was only digits/punctuation; there is no digram.
            continue
        prep_2grams['{} {}'.format(w1, w2)] += freq
        characters.update(w2)
    return prep_2grams, characters


def main():
    """Read '2grams', write the digram counts and print character counts."""
    with open('2grams', encoding='utf-8', mode='r') as fin:
        prep_2grams, characters = count_prep_2grams(fin)
    with open('2grams_prep_nkjp', encoding='utf-8', mode='w') as fout:
        # most_common() yields (element, count) pairs, so each output line is
        # 'digram<TAB>frequency', most frequent first.
        for digram, freq in prep_2grams.most_common():
            print('{}\t{}'.format(digram, freq), file=fout)
    print()
    for char, n in characters.most_common():
        print(char, n)


if __name__ == '__main__':
    main()