#-*- coding:utf-8 -*-

#Copyright (c) 2012, Bartłomiej Nitoń
#All rights reserved.

#Redistribution and use in source and binary forms, with or without modification, are permitted provided
#that the following conditions are met:

#    Redistributions of source code must retain the above copyright notice, this list of conditions and
#    the following disclaimer.
#    Redistributions in binary form must reproduce the above copyright notice, this list of conditions
#    and the following disclaimer in the documentation and/or other materials provided with the distribution.

# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

import codecs
import re

from django.core.management.base import BaseCommand
from lxml import etree

from dictionary.models import Entry, Lemma, Lemma_Status, POS, Vocabulary

# Number of lemmas placed in one vocabulary before a new one is started.
VERBS_IN_DICT = 600

POLANSKI_PATH = 'data/dictionary.xml'
B_PATH = 'data/B_frames/B_cz_frames.txt'
PATH_300M = 'data/susp-1.1-verbs-300M-counts.txt'
NEW_VERBS_PATH = 'data/new_verbs_140213.txt'
NWALENTY_PATH = 'data/checked-nwalenty.txt'

# New entries are added with the scripts:
#    add_verbs
#    load_frequency
# !!! After adding the entries, check that no multi-relations of the form
# \).*\( were left un-added !!!
ADJS_RELATIONS_PATH = 'data/nverbs/relations/merged_adjs+verb-freq.txt'
NOUNS_RELATIONS_PATH = 'data/nverbs/relations/nouns+verb-freq.txt'


class Command(BaseCommand):
    """Management command that adds new verb lemmas and links their relations."""

    args = 'none'
    help = """ Add verbs from given freqency list. Script checks if verb is not already included in Slowal tool database. """

    def handle(self, **options):
        """Add the chosen verbs, then link adjective and noun relations to them."""
        # Earlier one-off invocations, kept for reference:
        #add_verbs_intersec_freq()
        #get_new_verbs()
        #add_verbs(NEW_VERBS_PATH, 'data/added_verbs_20140701_pol_i_tajny.txt')
        #add_verbs(PATH_300M, 'data/added_verbs_20140701.txt')
        verbs = add_verbs('data/new+plwn-chosen.txt', 'data/added_verbs_20150701.txt')
        add_relations_by_verb_entries(verbs, ADJS_RELATIONS_PATH, 'adj')
        add_relations_by_verb_entries(verbs, NOUNS_RELATIONS_PATH, 'noun')


def _create_vocabulary(name):
    """Create, save and return a new ``Vocabulary`` called *name*."""
    vocabulary = Vocabulary(name=name)
    vocabulary.save()
    return vocabulary


def add_verbs(verbs_path, added_path):
    """Create lemmas for verbs from a frequency list that are not in the database yet.

    Every non-blank line of *verbs_path* that does not start with ``*`` must
    hold a word followed by an integer frequency, separated by whitespace.
    A word is added when no ``Lemma`` with that ``entry`` exists and its
    frequency is at least the minimum configured below. Lemmas are put into
    numbered ``clarin<N>`` vocabularies holding up to ``VERBS_IN_DICT`` lemmas
    each; processing stops once the last allowed vocabulary is full.

    One ``<vocabulary>\\t<word>\\t<freq>`` line is written to *added_path*
    for every lemma created.

    Returns the list of added words, in input order.
    Raises IndexError / ValueError for a non-blank line that lacks the
    frequency column or whose frequency is not an integer.
    """
    added_verbs = []
    dict_basename = 'clarin'
    voc_index = 18        # number of the first vocabulary to fill
    last_voc_index = 18   # number of the last vocabulary allowed to be created
    verbs_left = VERBS_IN_DICT
    min_freq = 0

    # Both files are managed by ``with`` so the report file is flushed and
    # closed even when a database call raises half-way through.
    with codecs.open(added_path, 'wt', 'utf-8') as added_file:
        with codecs.open(verbs_path, 'rt', 'utf8') as infile:
            # An index of 0 selects the bare base name, without a number.
            if voc_index == 0:
                voc_name = dict_basename
            else:
                voc_name = dict_basename + str(voc_index)
            new_voc = _create_vocabulary(voc_name)
            initial_status = Lemma_Status.objects.order_by('priority')[0]

            for line in infile:
                line = line.strip()
                # Blank lines used to crash on the column access below.
                if not line or line.startswith('*'):
                    continue
                print(line)
                columns = line.split()
                word = columns[0]
                freq = int(columns[1])
                if freq < min_freq or Lemma.objects.filter(entry=word).exists():
                    continue

                if verbs_left == 0:
                    # Current vocabulary is full: move on to the next one, or
                    # stop when the allowed range is exhausted.
                    verbs_left = VERBS_IN_DICT
                    voc_index += 1
                    if voc_index > last_voc_index:
                        break
                    voc_name = dict_basename + str(voc_index)
                    new_voc = _create_vocabulary(voc_name)

                new_lemma = Lemma(entry_obj=get_verb_entry(word), entry=word,
                                  vocabulary=new_voc, status=initial_status,
                                  old=False)
                new_lemma.save()
                verbs_left -= 1
                added_file.write('%s\t%s\t%d\n' % (voc_name, word, freq))
                added_verbs.append(word)
    return added_verbs


def get_verb_entry(verb):
    """Return the ``Entry`` for *verb* tagged as a verb, creating it if needed.

    Lookup order: an entry named *verb* with POS tag ``verb``; otherwise an
    entry with POS tag ``unk``, which is re-tagged as a verb and saved;
    otherwise a brand new verb entry is created and saved.
    """
    try:
        return Entry.objects.get(name=verb, pos__tag='verb')
    except Entry.DoesNotExist:
        pass

    verb_pos = POS.objects.get(tag='verb')
    try:
        entry = Entry.objects.get(name=verb, pos__tag='unk')
    except Entry.DoesNotExist:
        entry = Entry(name=verb, pos=verb_pos)
    else:
        entry.pos = verb_pos
    entry.save()
    return entry


def add_relations_by_verb_entries(entries, relations_path, pos_tag):
    """Link verbs from *entries* with related words listed in *relations_path*.

    For each line of the file the first whitespace-separated token is the
    related word and the fourth token (with leading ``(`` removed) is the
    verb. When the verb is in *entries* and both current (``old=False``)
    lemmas exist, their entries are added to each other's ``rel_entries``
    and the line is printed. Lines with fewer than four tokens are skipped.

    *pos_tag* is the POS tag the related word's entry must have.
    """
    print('Adding relations!')
    pos = POS.objects.get(tag=pos_tag)
    # Build the lookup set once instead of scanning the list for every line.
    wanted_verbs = set(entries)
    with codecs.open(relations_path, "rt", 'utf-8') as freq_file:
        for line in freq_file:
            columns = line.split()
            if len(columns) < 4:
                continue
            nverb = columns[0]
            verb = columns[3].lstrip('(')
            if verb not in wanted_verbs:
                continue
            try:
                verb_obj = Lemma.objects.get(old=False, entry=verb,
                                             entry_obj__pos__tag='verb')
                nverb_obj = Lemma.objects.get(old=False, entry=nverb,
                                              entry_obj__pos=pos)
            except Lemma.DoesNotExist:
                # Best effort: relations to lemmas missing from the database
                # are silently skipped.
                continue
            nverb_entry = nverb_obj.entry_obj
            verb_entry = verb_obj.entry_obj
            verb_entry.rel_entries.add(nverb_entry)
            nverb_entry.rel_entries.add(verb_entry)
            print(line)


def get_polanski_verbs(inpath):
    """Return the unique verbs found in ``orth`` elements of the XML at *inpath*.

    The substring ``się`` is removed from each element's text and the result
    is stripped. Verbs are returned in order of first appearance and each
    newly seen verb is printed. Elements without text are skipped.
    """
    verbs = []
    seen = set()  # O(1) duplicate check; ``verbs`` keeps the order
    tree = etree.parse(inpath)
    for word in tree.xpath("//*[local-name() = 'orth']"):
        if word.text is None:
            continue
        verb = word.text.replace(u'się', '').strip()
        if verb not in seen:
            seen.add(verb)
            verbs.append(verb)
            print(verb)
    return verbs


def get_B_verbs(inpath):
    """Return the unique lemmas found in the frames file at *inpath*.

    Each line is expected to start with non-digit text followed by digits;
    the first whitespace-separated token of the non-digit part is taken as
    the lemma. Lines that do not fit are reported on stdout and skipped.
    Lemmas are returned in order of first appearance.
    """
    # Pure-ASCII pattern; a plain raw literal behaves like the former ur''
    # literal on Python 2 and also parses on Python 3.
    line_pattern = re.compile(r"^([^\d]+)[\d]+(.*)$")
    verbs = []
    seen = set()
    with codecs.open(inpath, "rt", 'utf-8') as f:
        for line in f:
            m = line_pattern.match(line)
            lemma_ls = m.group(1).split() if m else []
            if not lemma_ls:
                print('!!!!!!!!!!!!!!!!match error!!!!!!!!!!!!!!!!!!!!!!!')
                continue
            lemma = lemma_ls[0]
            if lemma not in seen:
                seen.add(lemma)
                verbs.append(lemma)
    return verbs


def load_B_lemmas(inpath, voc_name):
    """Create lemmas for the entries listed in *inpath* inside vocabulary *voc_name*.

    The second whitespace-separated token of each line is the entry. A lemma
    is created, with the status ``do obróbki``, only when no current
    (``old=False``) lemma with that entry exists. The vocabulary is fetched
    or created by name. Lines with fewer than two tokens are skipped.
    """
    print('Loading %s dict.' % (voc_name))
    with codecs.open(inpath, "rt", 'utf-8') as f:
        voc_obj, _ = Vocabulary.objects.get_or_create(name=voc_name)
        initial_status = Lemma_Status.objects.get(status=u'do obróbki')
        for line in f:
            columns = line.split()
            if len(columns) < 2:
                continue
            entry = columns[1]
            # exists() also copes with several matching lemmas, where get()
            # would raise MultipleObjectsReturned.
            if Lemma.objects.filter(old=False, entry=entry).exists():
                continue
            lemma_obj, created = Lemma.objects.get_or_create(
                old=False, entry=entry, vocabulary=voc_obj,
                status=initial_status)
            if created:
                voc_obj.lemmas.add(lemma_obj)


def compare_to_300M(pol_verbs, b_verbs, path_300M, outpath, nwalenty_path):
    """Split the frequency list at *path_300M* into two output files.

    Every line is printed (stripped). Blank lines and lines starting with
    ``*`` are skipped, as are entries (first token of the line) that already
    have a current (``old=False``) lemma. Remaining lines are copied to
    *outpath* when the entry is in *pol_verbs* or *b_verbs*, and to
    *nwalenty_path* otherwise.
    """
    # Sets give constant-time membership tests inside the loop.
    pol_verbs = set(pol_verbs)
    b_verbs = set(b_verbs)
    # All three files are closed by ``with``; previously the third file was
    # never closed and a failed open raised NameError in the cleanup.
    with codecs.open(path_300M, "rt", 'utf-8') as file_300M, \
            codecs.open(outpath, 'wt', 'utf-8') as outfile, \
            codecs.open(nwalenty_path, 'wt', 'utf-8') as nwalenty_file:
        for line in file_300M:
            stripped = line.strip()
            print(stripped)
            if not stripped or stripped.startswith('*'):
                continue
            entry = stripped.split()[0]
            if Lemma.objects.filter(old=False, entry=entry).exists():
                continue
            if entry in b_verbs or entry in pol_verbs:
                outfile.write(line)
            else:
                nwalenty_file.write(line)


def get_new_verbs():
    """Build the new-verbs and nwalenty lists from the configured source files."""
    pol_verbs = get_polanski_verbs(POLANSKI_PATH)
    b_verbs = get_B_verbs(B_PATH)
    compare_to_300M(pol_verbs, b_verbs, PATH_300M, NEW_VERBS_PATH, NWALENTY_PATH)


def add_verbs_intersec_freq():
    """Create lemmas from ``data/polanski_verbs_freq_list.txt``.

    Each usable line starts with an integer frequency followed by a word.
    Words without any existing ``Lemma`` are added to vocabularies
    ``clarin6`` .. ``clarin10``, up to ``VERBS_IN_DICT`` lemmas each, and a
    ``<vocabulary> <word> <freq>`` line is written to
    ``data/added_verbs_clarin6.txt`` for each one.

    NOTE(review): unlike ``add_verbs`` the lemmas created here get no
    ``entry_obj`` - confirm whether that is still intended.
    """
    verbs_path = 'data/polanski_verbs_freq_list.txt'
    added_path = 'data/added_verbs_clarin6.txt'
    dict_basename = 'clarin'
    voc_index = 6
    last_voc_index = 10
    verbs_left = VERBS_IN_DICT
    # Compiled once; pure-ASCII pattern, so a plain raw literal behaves like
    # the former ur'' literal on Python 2.
    ngram_pattern = re.compile(r'^[\s]*([\d]+)[\s]*([^\s]+).*$')

    with codecs.open(added_path, 'wt', 'utf-8') as added_file:
        with codecs.open(verbs_path, 'rt', 'utf8') as infile:
            voc_name = dict_basename + str(voc_index)
            new_voc = _create_vocabulary(voc_name)
            initial_status = Lemma_Status.objects.order_by('priority')[0]
            for line in infile:
                m = ngram_pattern.match(line.strip())
                if not m:
                    continue
                freq = int(m.group(1))
                word = m.group(2)
                if Lemma.objects.filter(entry=word).exists():
                    continue

                if verbs_left == 0:
                    # Current vocabulary is full: start the next one or stop.
                    verbs_left = VERBS_IN_DICT
                    voc_index += 1
                    if voc_index > last_voc_index:
                        break
                    voc_name = dict_basename + str(voc_index)
                    new_voc = _create_vocabulary(voc_name)

                new_lemma = Lemma(entry=word, vocabulary=new_voc,
                                  status=initial_status, old=False)
                new_lemma.save()
                verbs_left -= 1
                added_file.write('%s %s %d\n' % (voc_name, word, freq))