Commit f32e16af02ed2668d359df4bf8b99ea32206c97e

Authored by Bartłomiej Nitoń
1 parent f7e76d70

Added scripts for adding new entries to the dictionary.
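
The new scripts are standard Django management commands; assuming the project's usual manage.py setup, the entry loader added here would be invoked as:

>> python manage.py add_nverb_entries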

INSTALL_PL
... ... @@ -9,6 +9,8 @@ Install Django version 1.4.8:
9 9  
10 10 Install Django south:
11 11 >> sudo apt-get install python-django-south
  12 +or:
  13 +>> sudo pip install South
12 14  
13 15 Install Django extensions version 1.6.7:
14 16 >> sudo pip install django-extensions==1.6.7
... ...
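Either installation route can be sanity-checked from Python afterwards; a minimal check, assuming South installed cleanly and exposes its usual version attribute:

>> python -c "import south; print south.__version__"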
dictionary/management/commands/add_nverb_entries.py 0 → 100644
  1 +#-*- coding:utf-8 -*-
  2 +
  3 +import codecs
  4 +from operator import itemgetter
  5 +
  6 +from django.core.management.base import BaseCommand
  7 +
  8 +from dictionary.models import Lemma, Lemma_Status, POS, Vocabulary, \
  9 +    get_or_create_entry
  10 +from dictionary.management.commands.load_initial_nverb_frames import add_initial_frames_by_entries
  11 +from dictionary.management.commands.load_entries_relations import add_relations_by_nverb_entries
  12 +
  13 +VERBS_IN_DICT = 2000
  14 +ORDERING = '300M'
  15 +
  16 +################# NOUNS ################################
  17 +#VERBAL_NOUNS_PATH = 'data/nverbs/nouns/merged_nouns-freq.txt'
  18 +#NOUNS_VAL_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt'
  19 +
  20 +# loading initial entries
  21 +NOUNS_ADDED_PATH = 'data/nverbs/nouns2consider/added-merged_nouns_val_20171102.txt'
  22 +NOUNS_ERROR_PATH = 'data/nverbs/nouns2consider/error-merged_nouns_val_20171102.txt'
  23 +
  24 +NOUNS_FRAMES_PATH = 'data/nverbs/nouns2consider/merged_nouns_val-poss.txt' # entries with valence frames from 'tajny'
  25 +SELECTED_NOUNS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt'
  26 +
  27 +# adding entries relations
  28 +NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt'
  29 +NOUN_ADJ_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+adj2consider-clarin2.txt'
  30 +
  31 +################## ADJS ################################
  32 +#VERBAL_ADJS_PATH = 'data/nverbs/adjs/merged_adjs-freq.txt'
  33 +##ADJS_VAL_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'
  34 +#ADJS_RELATIONS_PATH = 'data/nverbs/adjs/ver_adjs+verb-freq_cuted.txt'
  35 +#
  36 +## loading initial entries
  37 +#ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val_20141219.txt'
  38 +#ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val_20141219.txt'
  39 +#ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'
  40 +#
  41 +## adding entries relations
  42 +#ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'
  43 +#
  44 +################## ADVS ################################
  45 +#VERBAL_ADVS_PATH = 'data/nverbs/advs/merged_advs-sel-1M-300M.txt' # only selected frequencies
  46 +#ADVS_VAL_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt'
  47 +#
  48 +## loading initial entries
  49 +#ADVS_ADDED_PATH = 'data/nverbs/advs/added-merged_advs_val.txt'
  50 +#ADVS_ERROR_PATH = 'data/nverbs/advs/error-merged_advs_val.txt'
  51 +#ADVS_FRAMES_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt'
  52 +
  53 +## adding entries relations # no data yet
  54 +#ADV_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'
  55 +
  56 +
  57 +class Command(BaseCommand):
  58 +    args = 'none'
  59 +
  60 +    def handle(self, **options):
  61 +        # load nouns (earlier run, kept commented out for reference)
  62 +#        entries_with_val = get_entries(NOUNS_VAL_PATH)
  63 +#        entries = get_entries_by_freq(VERBAL_NOUNS_PATH, ORDERING)
  64 +#        load_entries(entries, B_entries, 'data/added_nouns_20140627.txt', ORDERING, 'noun',
  65 +#                     'clarin_nouns', 1, 1, 0)
  66 +
  67 +        # load nouns
  68 +        entries_to_add = get_entries(SELECTED_NOUNS_PATH)
  69 +        ordered_entries = get_entries_by_freq(SELECTED_NOUNS_PATH, ORDERING)
  70 +        #related_entries = get_related_entries(NOUNS_RELATIONS_PATH, 'noun')
  71 +        added_entries = load_entries(ordered_entries, entries_to_add, 'data/nverbs/nouns2consider/added_nouns_20171103.txt', ORDERING, 'noun',
  72 +                                     'clarin2_nouns', 3, 3, 0)
  73 +        add_initial_frames_by_entries(added_entries,
  74 +                                      NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH,
  75 +                                      'noun')
  76 +        add_relations_by_nverb_entries(added_entries, NOUN_VERB_RELATIONS_PATH, 'verb', 'noun')
  77 +        #add_relations_by_nverb_entries(added_entries, NOUN_ADJ_RELATIONS_PATH, 'adj', 'noun')
  78 +
  79 +#        # load adjectives
  80 +##        entries_with_val = get_entries(ADJS_VAL_PATH)
  81 +#        entries = get_entries_by_freq(VERBAL_ADJS_PATH, ORDERING)
  82 +#        related_entries = get_related_entries(ADJS_RELATIONS_PATH, 'adj')
  83 +#        added_entries = load_entries(entries, related_entries, 'data/added_adjs_20141219.txt', ORDERING, 'adj',
  84 +#                                     'clarin_adjs', 3, 3, 0)
  85 +#        add_initial_frames_by_entries(added_entries,
  86 +#                                      ADJS_FRAMES_PATH, ADJS_ADDED_PATH, ADJS_ERROR_PATH,
  87 +#                                      'adj')
  88 +#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')
  89 +
  90 +#        # load adverbs
  91 +#        entries_with_val = get_entries(ADVS_VAL_PATH)
  92 +#        entries = get_entries_by_freq(VERBAL_ADVS_PATH, ORDERING)
  93 +#        added_entries = load_entries(entries, entries_with_val, 'data/added_advs_20141114.txt', ORDERING, 'adv',
  94 +#                                     'clarin_advs', 1, 1, 0)
  95 +#        add_initial_frames_by_entries(added_entries,
  96 +#                                      ADVS_FRAMES_PATH, ADVS_ADDED_PATH, ADVS_ERROR_PATH,
  97 +#                                      'adverb')
  98 +#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')
  99 +
  100 +def get_entries(entries_path):
  101 +    entries = set()
  102 +    try:
  103 +        entries_file = codecs.open(entries_path, "rt", 'utf-8')
  104 +        for line in entries_file:
  105 +            line_ls = line.split('\t')
  106 +            # deduplicate plain strings (a set of dicts would raise TypeError)
  107 +            entries.add(line_ls[0].strip())
  108 +    finally:
  109 +        entries_file.close()
  110 +    return [{'entry': entry} for entry in entries]
  111 +
  112 +def get_entries_by_freq(entries_path, ordering):
  113 +    entries = []
  114 +    try:
  115 +        freq_file = codecs.open(entries_path, "rt", 'utf-8')
  116 +        for line in freq_file:
  117 +            line_ls = line.split()
  118 +            entries.append({'entry' : line_ls[0].strip(),
  119 +                            'freq_1M': int(line_ls[1].strip()),
  120 +                            'freq_300M': int(line_ls[2].strip())})
  121 +        entries = sorted(entries, key=itemgetter('freq_%s' % ordering), reverse=True)
  122 +    finally:
  123 +        freq_file.close()
  124 +    return entries
  125 +
  126 +def load_entries(sorted_entries, entries_to_add, added_path, ordering, pos_tag,
  127 +                 dict_basename, first_dict_idx, last_dict_idx, min_freq):
  128 +    print 'Loading entries!!'
  129 +    try:
  130 +        added_entries = []
  131 +        added_file = codecs.open(added_path, "wt", 'utf-8')
  132 +        dict_idx = first_dict_idx  # renamed so the dict builtin is not shadowed
  133 +        new_last_dict = last_dict_idx
  134 +        pos_obj = POS.objects.get(tag=pos_tag)
  135 +        verbs_per_dict = VERBS_IN_DICT
  136 +        initial_status = Lemma_Status.objects.order_by('priority')[0]
  137 +        for entry in sorted_entries:
  138 +            found_entry = next((item for item in entries_to_add if item['entry'] == entry['entry']), None)
  139 +            if found_entry and entry['freq_%s' % ordering] >= min_freq:
  140 +                # get_or_create avoids saving a duplicate vocabulary on every iteration
  141 +                if dict_idx == 0:
  142 +                    new_voc, _ = Vocabulary.objects.get_or_create(name=dict_basename)
  143 +                else:
  144 +                    new_voc, _ = Vocabulary.objects.get_or_create(
  145 +                        name=dict_basename + str(dict_idx))
  146 +
  147 +                lemmas = Lemma.objects.filter(entry=entry['entry'])
  148 +                if lemmas.count() == 0:
  149 +                    if verbs_per_dict == 0:
  150 +                        verbs_per_dict = VERBS_IN_DICT
  151 +                        dict_idx += 1
  152 +                        if dict_idx > new_last_dict:
  153 +                            break
  154 +                        else:
  155 +                            new_voc, _ = Vocabulary.objects.get_or_create(
  156 +                                name=dict_basename + str(dict_idx))
  157 +                    val_entry, created = get_or_create_entry(entry['entry'], pos_obj)
  158 +                    new_lemma = Lemma(entry=entry['entry'],
  159 +                                      entry_obj=val_entry,
  160 +                                      vocabulary=new_voc,
  161 +                                      status=initial_status,
  162 +                                      old=False,
  163 +                                      frequency_300M=entry['freq_300M'],
  164 +                                      frequency_1M=entry['freq_1M'])
  165 +                    new_lemma.save()
  166 +                    verbs_per_dict -= 1
  167 +                    added_file.write('%s\t%s\t%s\n' % (entry['entry'],
  168 +                                                       entry['freq_1M'],
  169 +                                                       entry['freq_300M']))
  170 +                    added_entries.append(entry['entry'])
  171 +                    print entry
  172 +    finally:
  173 +        added_file.close()
  174 +    return added_entries
  175 +
  176 +def get_related_entries(relations_path, pos_tag):
  177 +    print 'Checking relations!!!'
  178 +    entries = []
  179 +    pos = POS.objects.get(tag=pos_tag)
  180 +    try:
  181 +        relations_file = codecs.open(relations_path, "rt", 'utf-8')
  182 +        for line in relations_file:
  183 +            #print line
  184 +            line_ls = line.split()
  185 +            verb = line_ls[3].lstrip('(').strip()
  186 +            nverb = line_ls[0].strip()
  187 +            if (not Lemma.objects.filter(entry=nverb).exists() and
  188 +                    Lemma.objects.filter(entry=verb, entry_obj__pos__tag='verb').exists()):
  189 +                #entries.append(nverb)
  190 +                verb_active_lemma = Lemma.objects.get(entry=verb,
  191 +                                                      entry_obj__pos__tag='verb',
  192 +                                                      old=False)
  193 +                lemma_status_str = verb_active_lemma.status.status
  194 +                if (not lemma_status_str == u'w obróbce' and
  195 +                        not lemma_status_str == u'do obróbki'):
  196 +                    if (verb_active_lemma.frames.count() == 1 and
  197 +                            verb_active_lemma.frames.filter(text_rep=u'subj{np(str)}').exists()):
  198 +                        pass
  199 +                    else:
  200 +                        entries.append({'entry' : nverb,
  201 +                                        'verb' : verb,
  202 +                                        'freq_1M': int(line_ls[1].strip()),
  203 +                                        'freq_300M': int(line_ls[2].strip())})
  204 +                        print line
  205 +    finally:
  206 +        relations_file.close()
  207 +    return entries
... ...
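A note on the input files read above: get_entries uses only the first tab-separated column of each line, while get_entries_by_freq expects at least three whitespace-separated columns, the entry itself followed by its 1M-corpus and 300M-corpus frequencies (with ORDERING = '300M' the list is sorted by the last one). A sample line with made-up values:

myślenie 153 45820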
dictionary/management/commands/get_lemmas_list.py 0 → 100644
  1 +# -*- coding:utf-8 -*-
  2 +
  3 +import codecs
  4 +import datetime
  5 +import os
  6 +
  7 +from django.core.management.base import BaseCommand
  8 +
  9 +from dictionary.models import Lemma
  10 +from settings import PROJECT_PATH
  11 +
  12 +
  13 +POS = 'verb'
  14 +OUTPATH = os.path.join(PROJECT_PATH, 'data', '%ss-%s.txt' % (POS, datetime.datetime.now().strftime('%Y%m%d')))
  15 +
  16 +
  17 +class Command(BaseCommand):
  18 +    help = 'Get lemmas existing in Walenty'
  19 +
  20 +    def handle(self, *args, **options):
  21 +        lemmas = Lemma.objects.filter(old=False, entry_obj__pos__tag=POS)
  22 +        lemmas = lemmas.exclude(status__status=u'do usunięcia').order_by('entry_obj__name')
  23 +        write_lemmas(lemmas)
  24 +
  25 +
  26 +def write_lemmas(lemmas):
  27 +    # open before the try block so outfile is always defined in finally
  28 +    outfile = codecs.open(OUTPATH, 'w', 'utf-8')
  29 +    try:
  30 +        for lemma in lemmas:
  31 +            outfile.write('%s\n' % lemma.entry_obj.name)
  32 +    finally:
  33 +        outfile.close()
... ...
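This one is a plain query-and-dump command; with the defaults above it writes one lemma per line to data/verbs-<current date>.txt:

>> python manage.py get_lemmas_list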
dictionary/management/commands/load_entries_relations.py 0 → 100644
  1 +#-*- coding:utf-8 -*-
  2 +
  3 +import codecs
  4 +
  5 +from django.core.management.base import BaseCommand
  6 +
  7 +from dictionary.models import Lemma, POS, get_or_create_entry
  8 +
  9 +NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns/nouns+verb-freq.txt'
  10 +
  11 +ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'
  12 +
  13 +CHECK_PATH = 'data/nverbs/nouns/deriv_nouns-adj-freq-sel.txt'
  14 +
  15 +class Command(BaseCommand):
  16 +    args = 'none'
  17 +    help = """
  18 +    Add relations between entries from a given file.
  19 +    """
  20 +
  21 +    def handle(self, **options):
  22 +        #add_relations(NOUN_VERB_RELATIONS_PATH, 'noun')
  23 +        #add_relations(ADJ_VERB_RELATIONS_PATH, 'adj')
  24 +        check_if_deriv_good_to_add('adj', 'noun', 'data/nverbs/nouns/deriv_nouns-adj-existing-20150928.txt')
  25 +
  26 +def add_relations(entries_path, pos_tag):
  27 +    entries = []
  28 +    pos = POS.objects.get(tag=pos_tag)
  29 +    try:
  30 +        freq_file = codecs.open(entries_path, "rt", 'utf-8')
  31 +        for line in freq_file:
  32 +            #print line
  33 +            line_ls = line.split()
  34 +            verb = line_ls[3].lstrip('(').strip()
  35 +            try:
  36 +                nverb = line_ls[0].strip()
  37 +                verb_obj = Lemma.objects.get(old=False, entry=verb, entry_obj__pos__tag='verb')
  38 +                nverb_obj = Lemma.objects.get(old=False, entry=nverb, entry_obj__pos=pos)
  39 +                entry = {'entry' : nverb,
  40 +                         'verb' : verb,
  41 +                         'freq_1M': int(line_ls[1].strip()),
  42 +                         'freq_300M': int(line_ls[2].strip())}
  43 +                nverb_entry, created = get_or_create_entry(entry['entry'], pos)
  44 +#                try:
  45 +#                    val_entry = Entry.objects.get(name=entry['entry'])
  46 +#                    if val_entry.pos.tag == 'verb':
  47 +#                        continue
  48 +#                    val_entry.pos = pos
  49 +#                    val_entry.save()
  50 +#                except Entry.DoesNotExist:
  51 +#                    val_entry = Entry(name=entry['entry'], pos=pos)
  52 +#                    val_entry.save()
  53 +                verb_entry = verb_obj.entry_obj
  54 +                verb_entry.rel_entries.add(nverb_entry)
  55 +                nverb_entry.rel_entries.add(verb_entry)
  56 +                print line
  57 +            except Lemma.DoesNotExist:
  58 +                pass
  59 +    finally:
  60 +        freq_file.close()
  61 +    return entries
  62 +
  63 +def add_relations_by_nverb_entries(entries, entries_path, from_pos_tag, to_pos_tag):
  64 +    print 'Adding relations!'
  65 +    from_pos = POS.objects.get(tag=from_pos_tag)
  66 +    to_pos = POS.objects.get(tag=to_pos_tag)
  67 +    try:
  68 +        freq_file = codecs.open(entries_path, "rt", 'utf-8')
  69 +        for line in freq_file:
  70 +            #print line
  71 +            line_ls = line.split()
  72 +            verb = line_ls[3].lstrip('(').strip()
  73 +            try:
  74 +                nverb = line_ls[0].strip()
  75 +                if nverb in entries:
  76 +                    verb_obj = Lemma.objects.get(old=False, entry=verb, entry_obj__pos=from_pos)
  77 +                    nverb_obj = Lemma.objects.get(old=False, entry=nverb, entry_obj__pos=to_pos)
  78 +                    entry = {'entry' : nverb,
  79 +                             'verb' : verb,
  80 +                             'freq_1M': int(line_ls[1].strip()),
  81 +                             'freq_300M': int(line_ls[2].strip())}
  82 +                    # link the non-verbal entry and its source verb in both directions
  83 +                    nverb_entry = nverb_obj.entry_obj
  84 +                    verb_entry = verb_obj.entry_obj
  85 +                    verb_entry.rel_entries.add(nverb_entry)
  86 +                    nverb_entry.rel_entries.add(verb_entry)
  87 +                    print line
  88 +            except Lemma.DoesNotExist:
  89 +                pass
  90 +    finally:
  91 +        freq_file.close()
  92 +def check_if_deriv_good_to_add(from_pos_tag, to_pos_tag, outpath):
  93 +    freq_file = codecs.open(CHECK_PATH, "rt", 'utf-8')
  94 +    good_file = codecs.open(outpath, "wt", 'utf-8')
  95 +    try:
  96 +        for line in freq_file:
  97 +            line_ls = line.split()
  98 +            to_entry = line_ls[0].strip()
  99 +            from_entry = line_ls[3].lstrip('(').strip()
  100 +            if not Lemma.objects.filter(old=False, entry=to_entry,
  101 +                                        entry_obj__pos__tag=to_pos_tag).exists():
  102 +                try:
  103 +                    from_lemma = Lemma.objects.get(old=False, entry=from_entry,
  104 +                                                   entry_obj__pos__tag=from_pos_tag)
  105 +                    good_file.write(line)
  106 +                    print line
  107 +                except Lemma.DoesNotExist:
  108 +                    pass
  109 +    finally:
  110 +        good_file.close()
  111 +        freq_file.close()
  112 +
0 113 \ No newline at end of file
... ...
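For reference, the relation loaders in this file split each line on whitespace and take the non-verbal entry from column 1, its two corpus frequencies from columns 2 and 3, and the related verb, prefixed with an opening parenthesis, from column 4. A sample line with made-up values:

myślenie 153 45820 (myśleć 2711 812637)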
dictionary/management/commands/load_initial_nverb_frames.py 0 → 100644
  1 +#-*- coding:utf-8 -*-
  2 +
  3 +#Copyright (c) 2014, Bartłomiej Nitoń
  4 +#All rights reserved.
  5 +
  6 +#Redistribution and use in source and binary forms, with or without modification, are permitted provided
  7 +#that the following conditions are met:
  8 +
  9 +# Redistributions of source code must retain the above copyright notice, this list of conditions and
  10 +# the following disclaimer.
  11 +# Redistributions in binary form must reproduce the above copyright notice, this list of conditions
  12 +# and the following disclaimer in the documentation and/or other materials provided with the distribution.
  13 +
  14 +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED
  15 +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
  16 +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
  17 +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
  18 +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  19 +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  20 +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  21 +# POSSIBILITY OF SUCH DAMAGE.
  22 +
  23 +import codecs
  24 +import itertools
  25 +from operator import itemgetter
  26 +
  27 +from django.core.management.base import BaseCommand
  28 +
  29 +#from dictionary.common_func import arg_data_to_arg, args_to_position, \
  30 +#    positions_to_frame
  31 +from dictionary.models import Argument, Argument_Model, Frame_Opinion, \
  32 +    Frame_Opinion_Value, Lemma, positions_to_frame, \
  33 +    get_or_create_position
  34 +
  35 +
  36 +NOUNS_ADDED_PATH = 'data/nverbs/nouns/added-merged_nouns_val.txt'
  37 +NOUNS_ERROR_PATH = 'data/nverbs/nouns/error-merged_nouns_val.txt'
  38 +NOUNS_FRAMES_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt'
  39 +
  40 +ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val.txt'
  41 +ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val.txt'
  42 +ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt'
  43 +
  44 +class Command(BaseCommand):
  45 +    args = 'none'
  46 +    help = """
  47 +    Adds initial nverb frames.
  48 +    """
  49 +
  50 +    def handle(self, **options):
  51 +        #add_initial_frames(NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH, 'noun')
  52 +        add_initial_frames(ADJS_FRAMES_PATH, ADJS_ADDED_PATH, ADJS_ERROR_PATH, 'adj')
  53 +
  54 +def add_initial_frames(frames_path, added_path, error_path, pos_tag):
  55 +    try:
  56 +        added_file = codecs.open(added_path, "wt", 'utf-8')
  57 +        error_file = codecs.open(error_path, "wt", 'utf-8')
  58 +        frames_file = codecs.open(frames_path, "rt", 'utf-8')
  59 +        for line in frames_file:
  60 +            line = line.strip()
  61 +            pred_val = ''
  62 +            if line.startswith('%'):  # skip comment lines
  63 +                continue
  64 +            lemma, frames_str, pred_val = get_frames_info(line)
  65 +            try:
  66 +                lemma_obj = Lemma.objects.get(entry=lemma, old=False,
  67 +                                              status__status=u'do obróbki',
  68 +                                              entry_obj__pos__tag=pos_tag)
  69 +                #lemma_obj.frames.clear()
  70 +                print lemma_obj
  71 +                try:
  72 +                    parse_and_add_frames(lemma_obj, frames_str, pred_val)
  73 +                    added_file.write(u'%s\n' % line)
  74 +                except Exception:  # log any parsing problem and move on
  75 +                    error_file.write(u'%s\n' % line)
  76 +            except Lemma.DoesNotExist:
  77 +                pass
  78 +    finally:
  79 +        added_file.close()
  80 +        error_file.close()
  81 +        frames_file.close()
  82 +
  83 +def add_initial_frames_by_entries(entries, frames_path, added_path, error_path, pos_tag):
  84 +    print 'Adding initial frames!'
  85 +    try:
  86 +        added_file = codecs.open(added_path, "wt", 'utf-8')
  87 +        error_file = codecs.open(error_path, "wt", 'utf-8')
  88 +        frames_file = codecs.open(frames_path, "rt", 'utf-8')
  89 +        for line in frames_file:
  90 +            line = line.strip()
  91 +            pred_val = ''
  92 +            if line.startswith('%'):  # skip comment lines
  93 +                continue
  94 +            lemma, frames_str, pred_val = get_frames_info(line)
  95 +            if lemma in entries:
  96 +                try:
  97 +                    lemma_obj = Lemma.objects.get(entry=lemma, old=False,
  98 +                                                  status__status=u'do obróbki',
  99 +                                                  entry_obj__pos__tag=pos_tag)
  100 +                    print lemma_obj
  101 +                    try:
  102 +                        parse_and_add_frames(lemma_obj, frames_str, pred_val)
  103 +                        added_file.write(u'%s\n' % line)
  104 +                    except Exception:  # log any parsing problem and move on
  105 +                        error_file.write(u'%s\n' % line)
  106 +                except Lemma.DoesNotExist:
  107 +                    pass
  108 +    finally:
  109 +        added_file.close()
  110 +        error_file.close()
  111 +        frames_file.close()
  112 +
  113 +def get_frames_info(line):
  114 +    predicativity_val = ''
  115 +    line_parts = line.split('\t')
  116 +    lemma = line_parts[0].strip()
  117 +    frames_str = line_parts[1].strip()
  118 +    if len(line_parts) == 3 and line_parts[2] == 'PRED':
  119 +        predicativity_val = 'pred'
  120 +    return lemma, frames_str, predicativity_val
  121 +
  122 +def parse_and_add_frames(lemma_obj, frames_str, predicativity_val):
  123 +    poss_ls = []
  124 +    valence_ls = [arg.strip() for arg in frames_str.split('+')]
  125 +    for pos_arg in valence_ls:
  126 +        pos_arg = pos_arg.strip()
  127 +        possible_args = pos_arg.split('/')
  128 +        possible_args = coordinate_arguments(possible_args)
  129 +        poss_ls.append(possible_args)
  130 +    confs = itertools.product(*poss_ls)
  131 +    for frame_args in list(confs):
  132 +        frame_args = list(set(frame_args))  # --> there was a bug here that produced odd positions containing the same argument several times; this line should help
  133 +        frame_obj, frame_opinion_obj = create_frame(frame_args, predicativity_val)
  134 +        lemma_obj.frames.add(frame_obj)
  135 +        lemma_obj.frame_opinions.add(frame_opinion_obj)
  136 +
  137 +def coordinate_arguments(arguments):
  138 +    coordinated_args = []
  139 +    for arg in arguments:
  140 +        arg_type, attributes = arg_from_text_rep(arg)
  141 +        case, preposition = get_arg_case_and_preposition(arg)
  142 +        coordinated_arg = next((ca for ca in coordinated_args
  143 +                                if ca['case'] == case and ca['preposition'] == preposition), None)
  144 +        if coordinated_arg and case:
  145 +            coordinated_arg['argument'] += ';%s' % arg
  146 +        else:
  147 +            coordinated_arg = {'argument': arg,
  148 +                               'case': case,
  149 +                               'preposition': preposition}
  150 +            coordinated_args.append(coordinated_arg)
  151 +        if arg_type == 'ncp':
  152 +            additional_arg = u'np(%s)' % case
  153 +            coordinated_arg['argument'] += ';%s' % additional_arg
  154 +        elif arg_type == 'prepncp':
  155 +            additional_arg = u'prepnp(%s,%s)' % (preposition, case)
  156 +            coordinated_arg['argument'] += ';%s' % additional_arg
  157 +
  158 +    return [arg['argument'] for arg in coordinated_args]
  159 +
  160 +def arg_from_text_rep(argument):
  161 +    attributes = []
  162 +    arg_parts = argument.split('(')
  163 +    arg_type = arg_parts[0]
  164 +    if len(arg_parts) > 1:
  165 +        attributes = arg_parts[1].rstrip(')').replace("'", "").split(',')
  166 +    return arg_type, attributes
  167 +
  168 +def get_arg_case_and_preposition(argument):
  169 +    case = ''
  170 +    preposition = ''
  171 +    argument = arg_conversion(argument)
  172 +    arg_type, attributes = arg_from_text_rep(argument)
  173 +    argument_model = Argument_Model.objects.get(arg_model_name=arg_type)
  174 +    attribute_models = argument_model.atribute_models.order_by('priority')
  175 +    for attr_model, attr_text_rep in zip(attribute_models, attributes):
  176 +        if attr_model.atr_model_name == u'PRZYPADEK':  # case attribute
  177 +            case = attr_text_rep
  178 +        elif attr_model.atr_model_name == u'PRZYIMEK':  # preposition attribute
  179 +            preposition = attr_text_rep
  180 +    return case, preposition
  181 +
  182 +def arg_conversion(arg_text_rep):
  183 +    arg_text_rep = arg_text_rep.replace('!', '').replace('*', '').replace('?', '')  # strip opinion markers
  184 +    if arg_text_rep == 'advp':
  185 +        arg_text_rep = u'xp(_)'
  186 +    elif arg_text_rep.startswith('comprepnp'):
  187 +        arg_text_rep = arg_text_rep.replace("'", "").replace(',gen', '')
  188 +    return arg_text_rep
  189 +
  190 +def create_frame(frame_args, predicativity_val):
  191 +    positions_objs, frame_opinion_value = get_positions(frame_args)
  192 +    frame_obj = positions_to_frame(positions_objs,
  193 +                                   reflex='',
  194 +                                   negativity='',
  195 +                                   predicativity=predicativity_val,
  196 +                                   aspect='')
  197 +    frame_opinion_obj, _created = Frame_Opinion.objects.get_or_create(frame=frame_obj,
  198 +                                                                      value=frame_opinion_value)
  199 +    return frame_obj, frame_opinion_obj
  200 +
  201 +def get_positions(args_strs):
  202 +    poss_objs = []
  203 +    frame_opinions = []
  204 +    for poss_args_str in args_strs:
  205 +        frame_opinions.append(possible_frame_opinion(poss_args_str))
  206 +        poss_objs.append(create_position(poss_args_str))
  207 +    frame_opinion = sorted(frame_opinions, key=itemgetter('priority'), reverse=False)[0]
  208 +    frame_opinion_value = Frame_Opinion_Value.objects.get(value=frame_opinion['opinion'])
  209 +    return poss_objs, frame_opinion_value
  210 +
  211 +def possible_frame_opinion(arg_str):
  212 +    opinion = {'opinion': 'pewny',
  213 +               'priority': '4'}
  214 +    if '!' in arg_str:
  215 +        opinion = {'opinion': u'zły',
  216 +                   'priority': '1'}
  217 +    elif '?' in arg_str:
  218 +        opinion = {'opinion': u'wątpliwy',
  219 +                   'priority': '2'}
  220 +    elif '*' in arg_str:
  221 +        opinion = {'opinion': u'archaiczny',
  222 +                   'priority': '3'}
  223 +    return opinion
  224 +
  225 +def create_position(args_str):
  226 +    arg_objs = []
  227 +    for arg_text_rep in args_str.split(';'):
  228 +        arg_text_rep = arg_conversion(arg_text_rep)
  229 +#        try:
  230 +        arg_obj = Argument.objects.get(text_rep=arg_text_rep)
  231 +#        except Argument.DoesNotExist:  # TODO: disable this during the initial upload of entries
  232 +#            arg_type, attributes = arg_from_text_rep(arg_text_rep)
  233 +#            arg_obj = arg_data_to_arg(arg_type, attributes)
  234 +        arg_objs.append(arg_obj)
  235 +    pos_obj = get_or_create_position(categories=[], arguments=arg_objs)
  236 +    return pos_obj
  237 +
  238 +
  239 +
0 240 \ No newline at end of file
... ...
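For reference, the frame files parsed above are tab-separated: a lemma, a frame string and an optional PRED marker, with '%' starting comment lines. Within the frame string, '+' separates positions, '/' separates alternative arguments of one position, and the '!', '?' and '*' markers map to the opinions zły, wątpliwy and archaiczny. A made-up sample line (tabs shown as <TAB>):

myślenie<TAB>np(gen)/ncp(gen,że)+prepnp(o,loc)<TAB>PRED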