load_frequency.py
1.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#-*- coding:utf-8 -*-
# author: B.Niton
import re
import os
import codecs
from django.core.management.base import BaseCommand
from dictionary.models import *
from settings import PROJECT_PATH
# Input files read by load_frequency() to fill the frequency_300M and
# frequency_1M fields of verb lemmas; both live in the project's data directory.
# NOTE(review): "300M"/"1M" presumably refer to the size of the corpora the
# counts were taken from -- confirm.
FREQ_300M_PATH = os.path.join(PROJECT_PATH, 'data', 'dis-verbs-300M-counts.txt')
FREQ_1M_PATH = os.path.join(PROJECT_PATH, 'data', 'dis-verbs-1M-counts.txt')
class Command(BaseCommand):
    """Management command that imports lemma frequencies by calling load_frequency()."""

    help = 'Load lemma frequency informations to system.'

    def handle(self, *args, **options):
        """Run the frequency import.

        Positional arguments and options are accepted but ignored.  ``*args``
        is part of the signature because BaseCommand passes positional
        command-line arguments to ``handle``; without it any stray argument
        would raise ``TypeError`` before the import starts.
        """
        load_frequency()
# One line of a counts file: an optional leading "**" marker, the lemma (a run
# of non-whitespace characters) and its integer count; anything after the count
# is ignored.  Compiled once here instead of once per input line.  A plain raw
# string is used because the ``ur`` prefix is a syntax error on Python 3.
_FREQ_PATTERN = re.compile(r'^(\*\*)?[\s]*([^\s]+)[\s]*([\d]+).*$')


def _parse_frequency_line(line):
    """Return ``(lemma, count)`` parsed from *line*, or ``None`` if it does not match.

    Surrounding whitespace is stripped before matching.  The count is
    returned as an ``int``.
    """
    match = _FREQ_PATTERN.match(line.strip())
    if match is None:
        return None
    return match.group(2), int(match.group(3))


def _load_frequency_file(path, field_name):
    """Fill the *field_name* attribute of verb lemmas from the counts file at *path*.

    The file is read as UTF-8, line by line; lines that do not match the
    expected format are skipped.  For every parsed lemma name, all ``Lemma``
    rows whose entry has that name and the part-of-speech tag ``'verb'`` are
    looked up.  A row is updated, printed and saved only when its current
    *field_name* value is 0, so already populated frequencies are preserved.
    """
    # Mode 'r' rather than 'rt': codecs.open() appends 'b' to the mode itself,
    # and the resulting 'rtb' is rejected by Python 3.
    with codecs.open(path, 'r', 'utf-8') as infile:
        for line in infile:
            parsed = _parse_frequency_line(line)
            if parsed is None:
                continue
            lemma_str, frequency = parsed
            lemmas = Lemma.objects.filter(entry_obj__name=lemma_str,
                                          entry_obj__pos__tag='verb')
            for lemma in lemmas:
                # NOTE(review): the update (not only the print) is assumed to
                # be guarded by the == 0 check -- confirm against the
                # original indentation.
                if getattr(lemma, field_name) == 0:
                    print(lemma)
                    setattr(lemma, field_name, frequency)
                    lemma.save()


def load_frequency():
    """Load lemma frequencies from both counts files into the database.

    Reads FREQ_300M_PATH into ``Lemma.frequency_300M`` and then FREQ_1M_PATH
    into ``Lemma.frequency_1M``.  Takes no arguments and returns ``None``;
    each updated lemma is printed to standard output.
    """
    _load_frequency_file(FREQ_300M_PATH, 'frequency_300M')
    _load_frequency_file(FREQ_1M_PATH, 'frequency_1M')