# load_frequency.py 1.89 KB
#-*- coding:utf-8 -*-
# author: B.Niton

import re
import os
import codecs

from django.core.management.base import BaseCommand


from dictionary.models import *
from settings import PROJECT_PATH

FREQ_300M_PATH = os.path.join(PROJECT_PATH, 'data', 'dis-verbs-300M-counts.txt')
FREQ_1M_PATH = os.path.join(PROJECT_PATH, 'data', 'dis-verbs-1M-counts.txt')

class Command(BaseCommand):
    """Management command that runs the lemma frequency import.

    All of the work is delegated to the module-level ``load_frequency``
    function.
    """
    help = 'Load lemma frequency informations to system.'

    def handle(self, *args, **options):
        """Run the frequency import.

        ``*args`` is accepted to match the ``handle(*args, **options)``
        signature documented for ``BaseCommand``; neither the positional
        arguments nor the options are used.
        """
        load_frequency()

# One frequency line: an optional leading "**", a non-whitespace token (the
# lemma), then a run of digits (the count). Anything after the digits is
# ignored. Compiled once here instead of once per input line.
_FREQ_LINE_RE = re.compile(r'^(\*\*)?[\s]*([^\s]+)[\s]*([\d]+).*$')


def _load_frequency_file(path, field_name):
    """Copy counts from the file at ``path`` into ``Lemma.<field_name>``.

    Each line of the UTF-8 file is matched against ``_FREQ_LINE_RE``; lines
    that do not match are skipped. For every matching line, all ``Lemma``
    rows whose ``entry`` equals the parsed lemma and whose
    ``entry_obj__pos__tag`` is ``'verb'`` are looked up. A row is updated
    (and printed) only if its current value of ``field_name`` is 0, so
    non-zero frequencies are never overwritten.
    """
    # codecs.open always opens the underlying file in binary mode, so the
    # mode is plain 'r' rather than 'rt' (which would become 'rtb').
    with codecs.open(path, 'r', 'utf-8') as infile:
        for line in infile:
            match = _FREQ_LINE_RE.match(line.strip())
            if not match:
                continue
            # Groups 2 and 3 cannot contain whitespace, so no strip needed.
            lemma_str = match.group(2)
            frequency = int(match.group(3))
            lemmas = Lemma.objects.filter(entry=lemma_str,
                                          entry_obj__pos__tag='verb')
            for lemma in lemmas:
                if getattr(lemma, field_name) == 0:
                    print(lemma)
                    setattr(lemma, field_name, frequency)
                    lemma.save()


def load_frequency():
    """Fill in the 300M and 1M frequency fields of verb lemmas.

    Reads ``FREQ_300M_PATH`` into ``frequency_300M`` first, then
    ``FREQ_1M_PATH`` into ``frequency_1M``. Only values that are currently 0
    are set; every updated lemma is printed and saved.
    """
    _load_frequency_file(FREQ_300M_PATH, 'frequency_300M')
    _load_frequency_file(FREQ_1M_PATH, 'frequency_1M')