# -*- coding: utf-8 -*-
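"""Management command for adding new verb entries to the Slowal tool
database from frequency lists."""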

import codecs
import re

from django.core.management.base import BaseCommand
from lxml import etree

from dictionary.models import Entry, Lemma, Lemma_Status, POS, Vocabulary

VERBS_IN_DICT = 2000
POLANSKI_PATH = 'data/dictionary.xml'
B_PATH = 'data/B_frames/B_cz_frames.txt'
PATH_300M = 'data/susp-1.1-verbs-300M-counts.txt'
NEW_VERBS_PATH = 'data/new_verbs_140213.txt'
NWALENTY_PATH = 'data/checked-nwalenty.txt'

# Adding new entries, using the scripts:
# add_verbs
# load_frequency

# !!! after adding entries, check that there are no un-added multi-relations of the form \).*\( !!!
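# Usage note (assuming this module is installed as a Django management
# command named add_verbs):
#   python manage.py add_verbs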

ADJS_RELATIONS_PATH = 'data/nverbs/relations/merged_adjs+verb-freq.txt'
NOUNS_RELATIONS_PATH = 'data/nverbs/relations/nouns+verb-freq.txt'

class Command(BaseCommand):
    args = 'none'
    help = """
    Add verbs from given freqency list. Script checks if verb 
    is not already included in Slowal tool database.
    """

    def handle(self, *args, **options):
        #add_verbs_intersec_freq()
        #get_new_verbs()
        #add_verbs(NEW_VERBS_PATH, 'data/added_verbs_20140701_pol_i_tajny.txt')
        #add_verbs(PATH_300M, 'data/added_verbs_20140701.txt')
        verbs = add_verbs('data/verbs/verbs2consider-1M-300M-all.txt',
                          'data/verbs/added_verbs_20170303.txt')
        # add_relations_by_verb_entries(verbs, ADJS_RELATIONS_PATH, 'adj')
        # add_relations_by_verb_entries(verbs, NOUNS_RELATIONS_PATH, 'noun')
        
def add_verbs(verbs_path, added_path):
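    """Add verbs from the frequency list at verbs_path that are not yet in
    the database. Input lines have the columns: verb, 1M frequency, 300M
    frequency; lines starting with '*' are skipped. New lemmas are assigned
    to successive vocabularies holding VERBS_IN_DICT entries each. Added
    verbs are logged to added_path and returned as a list."""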
    added_verbs = []
    added_file = codecs.open(added_path, 'wt', 'utf-8')
    dict_basename = 'clarin2_verbs1'
    dict_idx = 0
    new_last_dict = 1
    verbs_per_dict = VERBS_IN_DICT
    min_freq = 0
    with codecs.open(verbs_path, 'rt', 'utf8') as infile:
        if dict_idx == 0:
            new_voc = Vocabulary(name=dict_basename)
            new_voc.save()
        else:
            new_voc = Vocabulary(name=dict_basename+str(dict_idx))
            new_voc.save()
        initial_status = Lemma_Status.objects.order_by('priority')[0]
        for line in infile:
            line = line.strip()
            if line.startswith('*'):
                continue
            print line
            line_ls = line.split()
            word = line_ls[0].strip()
            freq_1M = int(line_ls[1].strip())
            freq_300M = int(line_ls[2].strip())
            lemmas = Lemma.objects.filter(entry=word)
            if lemmas.count() == 0 and freq_300M >= min_freq:
                if verbs_per_dict == 0:
                    verbs_per_dict = VERBS_IN_DICT
                    dict_idx += 1
                    if dict_idx > new_last_dict:
                        break
                    else:
                        new_voc = Vocabulary(name=dict_basename+str(dict_idx))
                        new_voc.save()
                entry_obj = get_verb_entry(word)
                new_lemma = Lemma(entry_obj=entry_obj, 
                                  entry=word, vocabulary=new_voc, 
                                  status=initial_status, old=False,
                                  frequency_300M=freq_300M, frequency_1M=freq_1M)
                new_lemma.save()
                verbs_per_dict -= 1
                added_file.write('%s\t%s\t%d\n' % (new_voc.name, word, freq_300M))
                added_verbs.append(word)
    added_file.close()
    return added_verbs

def get_verb_entry(verb):
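    """Return the Entry for verb with part of speech 'verb'. An existing
    entry tagged 'unk' is upgraded to 'verb'; if no entry exists, a new
    one is created."""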
    try:
        entry = Entry.objects.get(name=verb, pos__tag='verb')
    except Entry.DoesNotExist:
        try:
            entry = Entry.objects.get(name=verb, pos__tag='unk')
            verb_pos = POS.objects.get(tag='verb')
            entry.pos = verb_pos
            entry.save()
        except Entry.DoesNotExist:
            verb_pos = POS.objects.get(tag='verb')
            entry = Entry(name=verb, pos=verb_pos)
            entry.save()
    return entry

def add_relations_by_verb_entries(entries, relations_path, pos_tag): 
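    """Add mutual relations between the verb entries in entries and related
    entries of the given pos_tag, read from relations_path. The related
    (non-verb) lemma is expected in the first column and the verb in the
    fourth column (prefixed with '('). Pairs whose lemmas are missing from
    the database are skipped."""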
    print 'Adding relations!'
    pos = POS.objects.get(tag=pos_tag)
    freq_file = codecs.open(relations_path, "rt", 'utf-8')
    try:
        for line in freq_file:
            line_ls = line.split()
            verb = line_ls[3].lstrip('(').strip()
            try:
                nverb = line_ls[0].strip()
                if verb in entries:
                    verb_obj = Lemma.objects.get(old=False, entry=verb, entry_obj__pos__tag='verb')
                    nverb_obj = Lemma.objects.get(old=False, entry=nverb, entry_obj__pos=pos)
                    nverb_entry = nverb_obj.entry_obj
                    verb_entry = verb_obj.entry_obj
                    verb_entry.rel_entries.add(nverb_entry)
                    nverb_entry.rel_entries.add(verb_entry)
                    print line
            except Lemma.DoesNotExist:
                pass
    finally:
        freq_file.close()

def get_polanski_verbs(inpath):  
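    """Collect unique verbs from the <orth> elements of the Polański
    dictionary XML at inpath, stripping the reflexive marker 'się'."""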
    verbs = []  
    tree = etree.parse(inpath) 
    words = tree.xpath("//*[local-name() = 'orth']")   
    for word in words:
        verb = word.text.replace(u'się', '').strip()
        if verb not in verbs:
            verbs.append(verb)
            print verb
    return verbs

def get_B_verbs(inpath):  
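    """Extract unique verb lemmas from the B frames file at inpath. Each
    line is expected to start with the lemma, followed by a number and the
    rest of the frame description."""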
    verbs = []
    line_pattern = re.compile(ur"^([^\d]+)[\d]+(.*)$")
    f = codecs.open(inpath, "rt", 'utf-8')
    try:
        for line in f:
            m = line_pattern.match(line)
            if not m:
                print '!!!!!!!!!!!!!!!!match error!!!!!!!!!!!!!!!!!!!!!!!'
            else:
                lemma_str = m.group(1).strip()
                lemma_ls = lemma_str.split()
                if not lemma_ls[0] in verbs:
                    verbs.append(lemma_ls[0])
    finally:
        f.close()
    return verbs
    
def load_B_lemmas(inpath, voc_name):
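    """Load lemmas (second whitespace-separated column of inpath) into the
    vocabulary voc_name, creating them with the initial 'do obróbki'
    status when no non-old lemma with that entry exists yet."""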
    print 'Loading %s dict.' % (voc_name)
    f = codecs.open(inpath, "rt", 'utf-8')
    try:
        voc_obj, _created = Vocabulary.objects.get_or_create(name=voc_name)
        initial_status = Lemma_Status.objects.get(status=u'do obróbki')
        for line in f:
            line_ls = line.split()
            entry = line_ls[1].strip()
            try:
                Lemma.objects.get(old=False, entry=entry)
            except Lemma.DoesNotExist:
                lemma_obj, created = Lemma.objects.get_or_create(old=False, 
                                                                 entry=entry, 
                                                                 vocabulary=voc_obj,
                                                                 status=initial_status)
                if created:
                    voc_obj.lemmas.add(lemma_obj)
    finally:
        f.close()
    
def compare_to_300M(pol_verbs, b_verbs, path_300M, outpath, nwalenty_path):
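    """Split the 300M frequency list into verbs known from the Polański
    dictionary or the B frames (written to outpath) and unknown ones
    (written to nwalenty_path). Verbs already present as non-old lemmas are
    skipped. pol_verbs_to_check collects Polański verbs absent from the B
    frames; it is currently unused."""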
    pol_verbs_to_check = []
    file_300M = codecs.open(path_300M, "rt", 'utf-8')
    outfile = codecs.open(outpath, 'wt', 'utf-8')
    nwalenty_file = codecs.open(nwalenty_path, 'wt', 'utf-8')
    try:
        for line in file_300M:
            print line.strip()
            if line.strip().startswith('*'):
                continue
            line_ls = line.split()
            entry = line_ls[0].strip()
            if entry in pol_verbs and not entry in b_verbs:
                pol_verbs_to_check.append(entry)
            if Lemma.objects.filter(old=False, entry=entry).exists():
                continue
            if entry in b_verbs or entry in pol_verbs:
                outfile.write(line)
            else:
                nwalenty_file.write(line)
    finally:
        file_300M.close()
        outfile.close()
        nwalenty_file.close()
 
def get_new_verbs():
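    """Build the candidate list of new verbs by comparing the Polański and
    B verbs against the 300M frequency list."""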
    pol_verbs = get_polanski_verbs(POLANSKI_PATH)
    b_verbs = get_B_verbs(B_PATH)
    compare_to_300M(pol_verbs, b_verbs, PATH_300M, NEW_VERBS_PATH, NWALENTY_PATH)

def add_verbs_intersec_freq():
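    """Add verbs from the Polański frequency list (lines of the form
    '<freq> <verb>') that are not yet in the database, distributing them
    over the vocabularies clarin6..clarin10."""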
    verbs_path = 'data/polanski_verbs_freq_list.txt'
    added_path = 'data/added_verbs_clarin6.txt'
    added_file = codecs.open(added_path, 'wt', 'utf-8')
    dict_basename = 'clarin'
    dict_idx = 6
    new_last_dict = 10
    verbs_per_dict = VERBS_IN_DICT
    with codecs.open(verbs_path, 'rt', 'utf8') as infile:
        new_voc = Vocabulary(name=dict_basename+str(dict_idx))
        new_voc.save()
        initial_status = Lemma_Status.objects.order_by('priority')[0]
        ngram_pattern = re.compile(ur'^[\s]*([\d]+)[\s]*([^\s]+).*$')
        for line in infile:
            line = line.strip()
            m = ngram_pattern.match(line)
            if m:
                freq = int(m.group(1).strip()) 
                word = m.group(2).strip()
                lemmas = Lemma.objects.filter(entry=word)
                if lemmas.count() == 0:
                    if verbs_per_dict == 0:
                        verbs_per_dict = VERBS_IN_DICT
                        dict_idx += 1
                        if dict_idx > new_last_dict:
                            break
                        else:
                            new_voc = Vocabulary(name=dict_basename+str(dict_idx))
                            new_voc.save()
                    new_lemma = Lemma(entry=word, vocabulary=new_voc, 
                                      status=initial_status, old=False)
                    new_lemma.save()
                    verbs_per_dict -= 1
                    added_file.write(dict_basename+str(dict_idx) + ' ' + word +
                                     ' ' + str(freq) + '\n')
    added_file.close()