#-*- coding:utf-8 -*- # author: B.Niton import codecs import operator import re from django.core.management.base import BaseCommand from dictionary.models import * from common.util import triple_arg_poss from django.db.models import Q SKL_FRAMES_PATH = 'data/ramki_skl_v_130925.txt' NLEMMA_PATH = 'data/brak_frames.txt' UNDERLINE_PATH = 'data/underline_frames.txt' class Command(BaseCommand): help = 'Creates składnica frame models from składnica frames.' def handle(self, file_path=SKL_FRAMES_PATH, **options): load_frames() def update_arg(arg): arg = arg.replace(u'mian', u'nom') arg = arg.replace(u'bier', u'acc') arg = arg.replace(u'cel', u'dat') arg = arg.replace(u'dop', u'gen') arg = arg.replace(u'miej', u'loc') arg = arg.replace(u'narz', u'inst') arg = arg.replace(u'pop', u'postp') for atr in Atribute_Value.objects.filter(related=True): for main_atr in atr.main_attr_values.all(): arg = arg.replace(u'(%s)' % atr.value, u'(%s)' % main_atr.value) arg = arg.replace(u'(%s,' % atr.value, u'(%s,' % main_atr.value) arg = arg.replace(u',%s)' % atr.value, u',%s)' % main_atr.value) arg = arg.replace(u',%s,' % atr.value, u',%s,' % main_atr.value) return arg def possible_args(arg, pos, preps): possibilities = [] if arg == 'subj': possibilities.extend(triple_arg_poss('np(str)', ['subj'])) elif arg == 'np(bier)': possibilities.extend(triple_arg_poss('np(str)', ['', 'obj'])) possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss('np(part)', ['', 'obj', 'subj'])) elif arg == 'np(dop)': possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss('np(part)', ['', 'obj', 'subj'])) #if pos == 'ger': # przechodzi tez na biernik possibilities.extend(triple_arg_poss('np(str)', ['', 'obj'])) possibilities.extend(triple_arg_poss('np(acc)', ['', 'obj', 'subj'])) elif (arg.startswith('prepnp(jak,') or arg.startswith('prepnp(jako,') or arg.startswith(u'prepnp(niż,')): prepnp_atr_ls = arg.replace('prepnp(', '').replace(')', '').split(',') preposition = prepnp_atr_ls[0] case = prepnp_atr_ls[1] possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) if case == 'mian' or case == 'bier': possibilities.extend(triple_arg_poss('prepnp(' + preposition + ',str)', ['', 'obj', 'subj'])) elif(arg == u'prepnp(na temat,dop)'): # nie znajduje tego w wywleczonych possibilities.extend(triple_arg_poss(u'comprepnp(na temat)', ['', 'obj', 'subj'])) elif(arg == u"prepnp(w sprawie,dop)"): possibilities.extend(triple_arg_poss(u'comprepnp(w sprawie)', ['', 'obj', 'subj'])) elif(arg == u"prepnp(z powodu,dop)"): possibilities.extend(triple_arg_poss(u'comprepnp(z powodu)', ['', 'obj', 'subj'])) elif(arg == u"adjp(mian)"): possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss('adjp(pred)', ['', 'obj', 'subj'])) elif(arg == u"adjp(narz)"): possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss('adjp(pred)', ['', 'obj', 'subj'])) elif(arg.startswith(u'sentp')): # liczy przecinki by sprawdzic liczbe atrybutow w 'sentp' number_of_commas = arg.count(u',') conv_arg = arg.replace(u'pz', u'int') if(number_of_commas == 0): possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'cp')), ['', 'obj', 'subj'])) elif(number_of_commas == 1): possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'ncp')), ['', 'obj', 'subj'])) elif(number_of_commas == 2): possibilities.extend(triple_arg_poss(update_arg(conv_arg.replace(u'sentp', u'prepncp')), ['', 'obj', 'subj'])) elif(arg == u'advp'): possibilities.extend(triple_arg_poss(u'xp(_)', ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss(u'advp(pron)', ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss(u'advp(misc)', ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss(u'advp(pred)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(locat)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(abl)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(adl)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(perl)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(temp)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(dur)', ['', 'obj', 'subj'])) #possibilities.extend(triple_arg_poss(u'advp(mod)', ['', 'obj', 'subj'])) for prepnp in preps: q_prepnps = [] possibilities.extend(prepnp['poss_args']) for prep_arg in prepnp['poss_args']: q_prepnps.append(Q(realizations=prep_arg)) if q_prepnps: xps = Argument.objects.filter(type='xp').filter(reduce(operator.or_, q_prepnps)).all() for xp in xps: possibilities.extend(triple_arg_poss(xp.text_rep, ['', 'obj', 'subj'])) elif(arg == u'prepnp(przez,bier)'): possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) #if pos == 'ger' or pos == 'ppas': possibilities.extend(triple_arg_poss('np(str)', ['subj'])) elif arg.startswith('infp('): conv_arg = copy.deepcopy(arg) conv_arg = conv_arg.replace('(nd)', '(imperf)') conv_arg = conv_arg.replace('(dk)', '(perf)') possibilities.extend(triple_arg_poss(update_arg(conv_arg), ['', 'obj', 'subj'])) possibilities.extend(triple_arg_poss('infp(_)', ['', 'obj', 'subj'])) elif(arg == u''): possibilities.extend(triple_arg_poss('E', ['subj'])) possibilities.extend(triple_arg_poss('np(str)', ['subj'])) else: possibilities.extend(triple_arg_poss(update_arg(arg), ['', 'obj', 'subj'])) return possibilities def mark_sub_frames(lemma): print smart_str(lemma) for frame in lemma.skladnica_frames.all(): frame.sub_frames.clear() frame.visible = True frame.save() for frame in lemma.skladnica_frames.all(): print frame bigger_frames = lemma.skladnica_frames.all() bigger_frames = bigger_frames.filter(reflex=frame.reflex).exclude(pk=frame.pk) for arg in frame.arguments.all(): bigger_frames = bigger_frames.filter(arguments__text_id=arg.text_id) if bigger_frames.count() > 0: frame.visible = False frame.save() for bigger_frame in bigger_frames.all(): bigger_frame.sub_frames.add(frame) for bigger_frame in bigger_frames.all(): if(bigger_frame.arguments.count() == frame.arguments.count()): if not frame.skladnica_frames.annotate(arguments_count=Count('arguments')).filter(arguments_count__gt= frame.arguments.count()).count() > 0: bigger_frame.visible = False bigger_frame.save() frame.visible = True frame.save() for sub_frame in frame.sub_frames.all(): if sub_frame.arguments.count() < frame.arguments.count(): sub_frame.visible = False sub_frame.save() if(frame.old_frame_value == '' and lemma.skladnica_frames.count() != lemma.skladnica_frames.filter(old_frame_value='').count()): frame.visible = False frame.save() def load_frames(): print 'Be patient, it can take a while.' # for lemma in Lemma.objects.filter(old=False): ### UWAGA NA TO, POTEM WYWALIC # for frame in lemma.skladnica_frames.all(): # frame.delete() try: f = codecs.open(SKL_FRAMES_PATH, "rt", 'utf-8') nlemmafile = codecs.open(NLEMMA_PATH, 'wt', 'utf-8') underfile = codecs.open(UNDERLINE_PATH, 'wt', 'utf-8') example_source_obj = NKJP_Source.objects.get(source=u'korpus ręcznie anotowany (1.2M segmentów)') example_opinion_obj = NKJP_Opinion.objects.get(opinion=u'dobry') try: for line in f: line_pattern = re.compile(ur"^([^\s]+)[\s]*([^\s]+)[\s]*((\[[^\]]*\])|(_))[\s]*(\[[^\]]*\])[\s]*\[sent_id='([^']*)'\][\s]*\[text=([^\]]*)\][\s]*$") m = line_pattern.match(line) if not m: print smart_str(line) continue if m: lemma_str = m.group(1).strip() pos = m.group(2).strip() frame_str = m.group(3).strip() prep_str = m.group(6).strip() sent_id_str = m.group(7).strip() example_str = m.group(8).strip().strip("'") print smart_str(lemma_str) lemmas = Lemma.objects.filter(entry=lemma_str, old=False) # TODO powinno objac tez pewnie lematy ze starych wersji w kontroli zmian if lemmas.count() == 0: nlemmafile.write(line.strip() + u'\n') continue if frame_str == '_': underfile.write(line.strip() + u'\n') frame_str = '[]' tokens = frame_str.replace('[', '').replace(']', '').split(',') preps = [] prep_obj = None if not prep_str == '[]': for prep in prep_str.replace('[', '').replace(']', '').split(';'): if not prep.startswith('cat='): preps.append({'arg': prep, 'poss_args': possible_args(prep, pos, preps), 'poss_positions': []}) if preps: prep_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=preps[0]['arg'], text_id=preps[0]['arg'], prepnp=None) prep_obj.possible_args.add(*preps[0]['poss_args']) args_ls = [] arg_str = '' for tok in tokens: arg_str += tok.strip() + ',' if (('(' in arg_str and ')' in arg_str) or (not ('(' in arg_str) and not (')' in arg_str))): args_ls.append(arg_str.strip().rstrip(',')) arg_str = '' skladnica_args_ls = [] args_ls.sort() for arg in args_ls: possibilities = [] if arg != 'sie': if prep_obj: prep_text_rep = prep_obj.text_rep else: prep_text_rep = '' possibilities = possible_args(arg, pos, preps) if arg == 'advp' and prep_text_rep: arg_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=arg, text_id=arg+'_'+prep_text_rep, prepnp=prep_obj) else: arg_obj, created = Skladnica_Argument.objects.get_or_create(text_rep=arg, text_id=arg, prepnp=None) arg_obj.possible_args.add(*possibilities) skladnica_args_ls.append(arg_obj) #prepnp=prep_obj) if 'subj' in args_ls: property_obj = Old_Frame_Property.objects.get(name='V') else: property_obj = Old_Frame_Property.objects.get(name='Q') if 'sie' in args_ls: reflex = True else: reflex = False nkjp_example_obj = NKJP_Example(sentence=example_str, source=example_source_obj, comment='', opinion=example_opinion_obj) # nowododany warunek wyszukiwania pole wyszukiwania nkjp_example_obj.save() skladnica_frame_obj = Old_Frame(property=property_obj, reflex=reflex, example=nkjp_example_obj, pos_tag=pos, old_frame_value='+'.join(args_ls), sent_id=sent_id_str) skladnica_frame_obj.save() skladnica_frame_obj.arguments.add(*skladnica_args_ls) for lemma in lemmas.all(): lemma.skladnica_frames.add(skladnica_frame_obj) for lemma in Lemma.objects.filter(old=False, entry=u'być'): mark_sub_frames(lemma) for old_lemma in Lemma.objects.filter(entry=lemma.entry, old=True): old_lemma.skladnica_frames.clear() old_lemma.skladnica_frames.add(*lemma.skladnica_frames.all()) finally: f.close() nlemmafile.close() underfile.close() except IOError: return 'Error: Can not work on file %s, check if it exists!' % SKL_FRAMES_PATH