# load_entries_relations.py
#-*- coding:utf-8 -*-

import codecs

from django.core.management.base import BaseCommand

from dictionary.models import Lemma, POS, get_or_create_entry

# Input file for add_relations(..., 'noun'); that call is currently commented
# out in Command.handle.
NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns/nouns+verb-freq.txt'

# Input file for add_relations(..., 'adj'); that call is currently commented
# out in Command.handle.
ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'

# Input file read by check_if_deriv_good_to_add().
CHECK_PATH = 'data/nverbs/nouns/deriv_nouns-adj-freq-sel.txt'

class Command(BaseCommand):
    """Management command that runs the entry-relation maintenance steps.

    The step that is executed is selected by editing ``handle``; the command
    itself takes no arguments.
    """
    args = 'none'
    help = """
    Add relations between entries from given file.
    """

    def handle(self, *args, **options):
        """Run the currently enabled step.

        ``*args`` and ``**options`` are accepted to match the
        ``handle(*args, **options)`` signature Django calls, but are ignored.
        """
        # Relation-loading steps, switched on by hand when needed:
        #add_relations(NOUN_VERB_RELATIONS_PATH, 'noun')
        #add_relations(ADJ_VERB_RELATIONS_PATH, 'adj')
        check_if_deriv_good_to_add('adj', 'noun', 'data/nverbs/nouns/deriv_nouns-adj-existing-20150928.txt')

def add_relations(entries_path, pos_tag):
    """Relate the entries listed in a frequency file to their verbs.

    Every line of ``entries_path`` is split on whitespace. Field 0 is the
    name of the entry to relate and field 3 is the verb name, optionally
    prefixed with ``(``. Fields 1 and 2 are present in the file but are not
    used here. Lines with fewer than four fields are skipped.

    For each line whose verb has a current (``old=False``) lemma with POS tag
    ``'verb'``, the entry is fetched or created with the POS given by
    ``pos_tag``, both entries are added to each other's ``rel_entries``, and
    the line is printed. Lines whose verb lemma does not exist are skipped.

    Args:
        entries_path: path of the UTF-8 encoded input file.
        pos_tag: tag of the POS assigned to the field-0 entries.

    Returns:
        A list that is always empty: nothing is collected into it. It is
        kept so existing callers that use the return value keep working.

    Errors other than a missing verb lemma (unreadable file, unknown POS
    tag, ambiguous lemma lookup) propagate to the caller.
    """
    entries = []
    pos = POS.objects.get(tag=pos_tag)
    # codecs.open always opens the underlying file in binary mode; an
    # explicit "t" flag is redundant and is rejected on Python 3.
    with codecs.open(entries_path, "r", 'utf-8') as freq_file:
        for line in freq_file:
            fields = line.split()
            if len(fields) < 4:
                # Blank or truncated line: no verb column to read.
                continue
            nverb = fields[0]
            verb = fields[3].lstrip('(')
            try:
                verb_obj = Lemma.objects.get(old=False, entry_obj__name=verb,
                                             entry_obj__pos__tag='verb')
            except Lemma.DoesNotExist:
                continue
            nverb_entry, _created = get_or_create_entry(nverb, pos)
            verb_entry = verb_obj.entry_obj
            # The relation is stored in both directions.
            verb_entry.rel_entries.add(nverb_entry)
            nverb_entry.rel_entries.add(verb_entry)
            print(line)
    return entries

def add_relations_by_nverb_entries(entries, entries_path, from_pos_tag, to_pos_tag):
    """Relate already existing entries, restricted to a given selection.

    Every line of ``entries_path`` is split on whitespace. Field 0 is the
    name of the target entry and field 3 is the name of the source entry,
    optionally prefixed with ``(``. Fields 1 and 2 are present in the file
    but are not used here. Lines with fewer than four fields are skipped.

    A line is processed only when its field-0 name is contained in
    ``entries``. If current (``old=False``) lemmas exist for both names, the
    two entries are added to each other's ``rel_entries`` and the line is
    printed; if either lemma is missing the line is skipped. Nothing is
    created by this function.

    Args:
        entries: container tested with ``in`` against the field-0 names.
        entries_path: path of the UTF-8 encoded input file.
        from_pos_tag: POS tag of the field-3 (source) entries.
        to_pos_tag: POS tag of the field-0 (target) entries.

    Errors other than a missing lemma (unreadable file, unknown POS tag,
    ambiguous lemma lookup) propagate to the caller.
    """
    print('Adding relations!')
    from_pos = POS.objects.get(tag=from_pos_tag)
    to_pos = POS.objects.get(tag=to_pos_tag)
    # codecs.open always opens the underlying file in binary mode; an
    # explicit "t" flag is redundant and is rejected on Python 3.
    with codecs.open(entries_path, "r", 'utf-8') as freq_file:
        for line in freq_file:
            fields = line.split()
            if len(fields) < 4:
                # Blank or truncated line: no source column to read.
                continue
            nverb = fields[0]
            if nverb not in entries:
                continue
            verb = fields[3].lstrip('(')
            try:
                verb_obj = Lemma.objects.get(old=False, entry_obj__name=verb,
                                             entry_obj__pos=from_pos)
                nverb_obj = Lemma.objects.get(old=False, entry_obj__name=nverb,
                                              entry_obj__pos=to_pos)
            except Lemma.DoesNotExist:
                continue
            nverb_entry = nverb_obj.entry_obj
            verb_entry = verb_obj.entry_obj
            # The relation is stored in both directions.
            verb_entry.rel_entries.add(nverb_entry)
            nverb_entry.rel_entries.add(verb_entry)
            print(line)

def check_if_deriv_good_to_add(from_pos_tag, to_pos_tag, outpath, inpath=None):
    """Copy the input lines that describe a derivation which can be added.

    Every input line is split on whitespace. Field 0 is the name of the
    target entry and field 3 is the name of the source entry, optionally
    prefixed with ``(``. Lines with fewer than four fields are skipped.

    A line is written unchanged to ``outpath`` (and printed) when no current
    (``old=False``) lemma exists for the target name with POS tag
    ``to_pos_tag`` while a current lemma does exist for the source name with
    POS tag ``from_pos_tag``.

    Args:
        from_pos_tag: POS tag of the field-3 (source) entries.
        to_pos_tag: POS tag of the field-0 (target) entries.
        outpath: path of the UTF-8 output file; it is overwritten.
        inpath: path of the UTF-8 input file; defaults to ``CHECK_PATH``.

    Errors other than a missing source lemma (unreadable files, ambiguous
    lemma lookup) propagate to the caller.
    """
    if inpath is None:
        inpath = CHECK_PATH
    # The input is opened first so that a missing input file does not
    # truncate an existing output file. codecs.open always opens the
    # underlying file in binary mode; an explicit "t" flag is redundant and
    # is rejected on Python 3.
    with codecs.open(inpath, "r", 'utf-8') as freq_file, \
            codecs.open(outpath, "w", 'utf-8') as good_file:
        for line in freq_file:
            fields = line.split()
            if len(fields) < 4:
                # Blank or truncated line: no source column to read.
                continue
            to_entry = fields[0]
            from_entry = fields[3].lstrip('(')
            if Lemma.objects.filter(old=False, entry_obj__name=to_entry,
                                    entry_obj__pos__tag=to_pos_tag).exists():
                # The target entry is already in the dictionary.
                continue
            try:
                # Only the existence of the source lemma matters; the
                # fetched object itself is not used.
                Lemma.objects.get(old=False, entry_obj__name=from_entry,
                                  entry_obj__pos__tag=from_pos_tag)
            except Lemma.DoesNotExist:
                continue
            good_file.write(line)
            print(line)