# add_nverb_entries.py
#-*- coding:utf-8 -*-

import codecs
from operator import itemgetter

from django.core.management.base import BaseCommand

from dictionary.models import Lemma, Lemma_Status, POS, Vocabulary, \
                              get_or_create_entry
from dictionary.management.commands.load_initial_nverb_frames import add_initial_frames_by_entries
from dictionary.management.commands.load_entries_relations import add_relations_by_nverb_entries

# Number of entries placed in each vocabulary batch before starting a new one
# (see the verbs_per_dict counter in load_entries).
VERBS_IN_DICT = 2000
# Frequency column used for ordering entries: '1M' or '300M'
# (selects the 'freq_1M' / 'freq_300M' key in get_entries_by_freq).
ORDERING = '300M'

################# NOUNS ################################
#VERBAL_NOUNS_PATH = 'data/nverbs/nouns/merged_nouns-freq.txt'
#NOUNS_VAL_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt'

# loading initial entries
NOUNS_ADDED_PATH = 'data/nverbs/nouns2consider/added-merged_nouns_val_20171102.txt'
NOUNS_ERROR_PATH = 'data/nverbs/nouns2consider/error-merged_nouns_val_20171102.txt'

NOUNS_FRAMES_PATH = 'data/nverbs/nouns2consider/merged_nouns_val-poss.txt' # entries with valence frames from the 'tajny' (secret) resource
SELECTED_NOUNS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt'

# adding entries relations
NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt'
NOUN_ADJ_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+adj2consider-clarin2.txt'

################## ADJS ################################
#VERBAL_ADJS_PATH = 'data/nverbs/adjs/merged_adjs-freq.txt'
##ADJS_VAL_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'
#ADJS_RELATIONS_PATH = 'data/nverbs/adjs/ver_adjs+verb-freq_cuted.txt'
#
## loading initial entries
#ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val_20141219.txt'
#ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val_20141219.txt'
#ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'
#
## adding entries relations
#ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'
#
################## ADVS ################################
#VERBAL_ADVS_PATH = 'data/nverbs/advs/merged_advs-sel-1M-300M.txt' # only selected frequencies
#ADVS_VAL_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt'
#
## loading initial entries
#ADVS_ADDED_PATH = 'data/nverbs/advs/added-merged_advs_val.txt'
#ADVS_ERROR_PATH = 'data/nverbs/advs/error-merged_advs_val.txt'
#ADVS_FRAMES_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt'

## adding entries relations # no data available yet
#ADV_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'


class Command(BaseCommand):
    """Load selected deverbal-noun entries into the dictionary.

    Reads the entry list from SELECTED_NOUNS_PATH, orders it by corpus
    frequency (ORDERING), creates Lemma records batched into vocabularies,
    then attaches initial valence frames and noun<->verb entry relations.
    Adjective and adverb variants of the same pipeline are kept below as
    commented-out templates from earlier runs.
    """
    args = 'none'

    def handle(self, **options):
        # load nouns
#        entries_with_val = get_entries(NOUNS_VAL_PATH)
#        entries = get_entries_by_freq(VERBAL_NOUNS_PATH, ORDERING)
#        load_entries(entries, B_entries, 'data/added_nouns_20140627.txt', ORDERING, 'noun', 
#                     'clarin_nouns', 1, 1, 0)      

        # load nouns
        entries_to_add = get_entries(SELECTED_NOUNS_PATH)
        ordered_entries = get_entries_by_freq(SELECTED_NOUNS_PATH, ORDERING)
        #related_entries = get_related_entries(NOUNS_RELATIONS_PATH, 'noun')
        # Vocabulary batches are numbered 3..3 here (first_dict_idx=last_dict_idx=3);
        # min_freq=0 means no frequency cutoff.
        added_entries = load_entries(ordered_entries, entries_to_add, 'data/nverbs/nouns2consider/added_nouns_20171103.txt', ORDERING, 'noun',
                                     'clarin2_nouns', 3, 3, 0)
        add_initial_frames_by_entries(added_entries, 
                                      NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH, 
                                      'noun')
        add_relations_by_nverb_entries(added_entries, NOUN_VERB_RELATIONS_PATH, 'verb', 'noun')
        #add_relations_by_nverb_entries(added_entries, NOUN_ADJ_RELATIONS_PATH, 'adj', 'noun')
        
#        # load adjectives
##        entries_with_val = get_entries(ADJS_VAL_PATH)
#        entries = get_entries_by_freq(VERBAL_ADJS_PATH, ORDERING)
#        related_entries = get_related_entries(ADJS_RELATIONS_PATH, 'adj')
#        added_entries = load_entries(entries, related_entries, 'data/added_adjs_20141219.txt', ORDERING, 'adj', 
#                                     'clarin_adjs', 3, 3, 0)
#        add_initial_frames_by_entries(added_entries, 
#                                      ADJS_FRAMES_PATH, ADJS_ADDED_PATH, ADJS_ERROR_PATH, 
#                                      'adj')
#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')

#        # load adverbs
#        entries_with_val = get_entries(ADVS_VAL_PATH)
#        entries = get_entries_by_freq(VERBAL_ADVS_PATH, ORDERING)
#        added_entries = load_entries(entries, entries_with_val, 'data/added_advs_20141114.txt', ORDERING, 'adv', 
#                                     'clarin_advs', 1, 1, 0)
#        add_initial_frames_by_entries(added_entries, 
#                                      ADVS_FRAMES_PATH, ADVS_ADDED_PATH, ADVS_ERROR_PATH, 
#                                      'adverb')
#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')
    
def get_entries(entries_path):
    entries = []
    try:
        entries_file = codecs.open(entries_path, "rt", 'utf-8')
        for line in entries_file:
            line_ls = line.split('\t')
            entries.append({'entry'  : line_ls[0].strip()})
        entries = list(set(entries))
    finally:
        entries_file.close()
        return entries 
    
def get_entries_by_freq(entries_path, ordering):
    entries = []
    try:
        freq_file = codecs.open(entries_path, "rt", 'utf-8')
        for line in freq_file:
            line_ls = line.split()
            entries.append({'entry'  : line_ls[0].strip(),
                            'freq_1M': int(line_ls[1].strip()),
                            'freq_300M': int(line_ls[2].strip())})
        entries = sorted(entries, key=itemgetter('freq_%s' % ordering), reverse=True)
    finally:
        freq_file.close()
        return entries

def load_entries(sorted_entries, entries_to_add, added_path, ordering, pos_tag, 
                  dict_basename, first_dict_idx, last_dict_idx, min_freq):
    """Create Lemma records for entries present in *entries_to_add*.

    :param sorted_entries: frequency dicts from get_entries_by_freq (processed
        in the given order, i.e. highest frequency first).
    :param entries_to_add: dicts with an 'entry' key; only entries found here
        are loaded.
    :param added_path: UTF-8 report file; one ``entry\\t1M\\t300M`` line per
        lemma actually created.
    :param ordering: '1M' or '300M' -- frequency column compared to *min_freq*.
    :param pos_tag: POS tag used to look up/create the entry object.
    :param dict_basename: base name for Vocabulary batches; batch index is
        appended unless it is 0.
    :param first_dict_idx, last_dict_idx: inclusive range of batch indices;
        each batch holds VERBS_IN_DICT new lemmas.
    :param min_freq: minimum frequency for an entry to be loaded.
    :returns: list of entry strings for which a new Lemma was created.

    Fixes vs. the original: the local variable ``dict`` shadowed the builtin
    (renamed to ``dict_idx``); ``return`` inside ``finally`` silently swallowed
    any database/IO error and reported a partial load as success -- errors now
    propagate while the report file is still closed.
    """
    print('Loading entries!!')
    added_entries = []
    added_file = codecs.open(added_path, "wt", 'utf-8')
    try:
        dict_idx = first_dict_idx
        new_last_dict = last_dict_idx
        pos_obj = POS.objects.get(tag=pos_tag)
        verbs_per_dict = VERBS_IN_DICT
        # lowest-priority status is the initial one for freshly added lemmas
        initial_status = Lemma_Status.objects.order_by('priority')[0]
        for entry in sorted_entries:
            found_entry = next((item for item in entries_to_add
                                if item['entry'] == entry['entry']), None)
            if found_entry and entry['freq_%s' % ordering] >= min_freq:
                # NOTE(review): a fresh Vocabulary row is saved for EVERY
                # matching entry (as in the original code), not once per
                # batch -- confirm this duplication is intended.
                if dict_idx == 0:
                    new_voc = Vocabulary(name=dict_basename)
                else:
                    new_voc = Vocabulary(name=dict_basename + str(dict_idx))
                new_voc.save()

                lemmas = Lemma.objects.filter(entry=entry['entry'])
                if lemmas.count() == 0:
                    # current batch is full -> advance to the next one
                    if verbs_per_dict == 0:
                        verbs_per_dict = VERBS_IN_DICT
                        dict_idx += 1
                        if dict_idx > new_last_dict:
                            break
                        new_voc = Vocabulary(name=dict_basename + str(dict_idx))
                        new_voc.save()
                    val_entry, created = get_or_create_entry(entry['entry'], pos_obj)
                    new_lemma = Lemma(entry=entry['entry'],
                                      entry_obj=val_entry,
                                      vocabulary=new_voc,
                                      status=initial_status,
                                      old=False,
                                      frequency_300M=entry['freq_300M'],
                                      frequency_1M=entry['freq_1M'])
                    new_lemma.save()
                    verbs_per_dict -= 1
                    added_file.write('%s\t%s\t%s\n' % (entry['entry'],
                                                       entry['freq_1M'],
                                                       entry['freq_300M']))
                    added_entries.append(entry['entry'])
                    print(entry)
    finally:
        added_file.close()
    return added_entries
    
def get_related_entries(relations_path, pos_tag): 
    """Collect non-verb entries related to already-loaded verb lemmas.

    Reads whitespace-separated lines ``nverb 1M-freq 300M-freq (verb ...``
    from *relations_path* and keeps an nverb when:
    it has no Lemma yet, its source verb has an active (``old=False``) verb
    Lemma whose status is neither u'w obr\\u00f3bce' nor u'do obr\\u00f3bki',
    and that verb has more than just the single ``subj{np(str)}`` frame.

    :param pos_tag: validated against the POS table (lookup raises if the
        tag is unknown; the object itself is not otherwise used here).
    :returns: list of ``{'entry', 'verb', 'freq_1M', 'freq_300M'}`` dicts.

    Fix vs. the original: ``return`` inside ``finally`` silently swallowed
    any parse/database error and returned partial results; it also raised
    NameError in ``finally`` when the open itself failed. Errors now
    propagate and the file is still closed reliably.
    """
    print('Checking relations!!!')
    entries = []
    # kept for its validation side effect: raises POS.DoesNotExist on bad tag
    pos = POS.objects.get(tag=pos_tag)
    relations_file = codecs.open(relations_path, "rt", 'utf-8')
    try:
        for line in relations_file:
            line_ls = line.split()
            verb = line_ls[3].lstrip('(').strip()
            nverb = line_ls[0].strip()
            if (not Lemma.objects.filter(entry=nverb).exists() and 
                Lemma.objects.filter(entry=verb, entry_obj__pos__tag='verb').exists()):
                verb_active_lemma = Lemma.objects.get(entry=verb, 
                                                      entry_obj__pos__tag='verb',
                                                      old=False)
                lemma_status_str = verb_active_lemma.status.status
                if (not lemma_status_str == u'w obróbce' and 
                    not lemma_status_str == u'do obróbki'):
                    # skip verbs whose only frame is the trivial subject frame
                    if (verb_active_lemma.frames.count() == 1 and 
                        verb_active_lemma.frames.filter(text_rep=u'subj{np(str)}').exists()): 
                        pass
                    else:
                        entries.append({'entry'  : nverb,
                                        'verb'   : verb,
                                        'freq_1M': int(line_ls[1].strip()),
                                        'freq_300M': int(line_ls[2].strip())}) 
                        print(line)
    finally:
        relations_file.close()
    return entries