Commit f32e16af02ed2668d359df4bf8b99ea32206c97e
1 parent
f7e76d70
Added scripts for adding new entries to dictionary.
Showing
5 changed files
with
592 additions
and
0 deletions
INSTALL_PL
dictionary/management/commands/add_nverb_entries.py
0 → 100644
1 | +#-*- coding:utf-8 -*- | |
2 | + | |
3 | +import codecs | |
4 | +from operator import itemgetter | |
5 | + | |
6 | +from django.core.management.base import BaseCommand | |
7 | + | |
8 | +from dictionary.models import Lemma, Lemma_Status, POS, Vocabulary, \ | |
9 | + get_or_create_entry | |
10 | +from dictionary.management.commands.load_initial_nverb_frames import add_initial_frames_by_entries | |
11 | +from dictionary.management.commands.load_entries_relations import add_relations_by_nverb_entries | |
12 | + | |
13 | +VERBS_IN_DICT = 2000 | |
14 | +ORDERING = '300M' | |
15 | + | |
16 | +################# NOUNS ################################ | |
17 | +#VERBAL_NOUNS_PATH = 'data/nverbs/nouns/merged_nouns-freq.txt' | |
18 | +#NOUNS_VAL_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt' | |
19 | + | |
20 | +# loading initial entries | |
21 | +NOUNS_ADDED_PATH = 'data/nverbs/nouns2consider/added-merged_nouns_val_20171102.txt' | |
22 | +NOUNS_ERROR_PATH = 'data/nverbs/nouns2consider/error-merged_nouns_val_20171102.txt' | |
23 | + | |
24 | +NOUNS_FRAMES_PATH = 'data/nverbs/nouns2consider/merged_nouns_val-poss.txt' # hasla z ramkami walencyjnymi z tajnego | |
25 | +SELECTED_NOUNS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt' | |
26 | + | |
27 | +# adding entries relations | |
28 | +NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+verb2consider-clarin2-add.txt' | |
29 | +NOUN_ADJ_RELATIONS_PATH = 'data/nverbs/nouns2consider/nouns+adj2consider-clarin2.txt' | |
30 | + | |
31 | +################## ADJS ################################ | |
32 | +#VERBAL_ADJS_PATH = 'data/nverbs/adjs/merged_adjs-freq.txt' | |
33 | +##ADJS_VAL_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt' | |
34 | +#ADJS_RELATIONS_PATH = 'data/nverbs/adjs/ver_adjs+verb-freq_cuted.txt' | |
35 | +# | |
36 | +## loading initial entries | |
37 | +#ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val_20141219.txt' | |
38 | +#ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val_20141219.txt' | |
39 | +#ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt' | |
40 | +# | |
41 | +## adding entries relations | |
42 | +#ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt' | |
43 | +# | |
44 | +################## ADVS ################################ | |
45 | +#VERBAL_ADVS_PATH = 'data/nverbs/advs/merged_advs-sel-1M-300M.txt' # frekwencja tylko wybrana | |
46 | +#ADVS_VAL_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt' | |
47 | +# | |
48 | +## loading initial entries | |
49 | +#ADVS_ADDED_PATH = 'data/nverbs/advs/added-merged_advs_val.txt' | |
50 | +#ADVS_ERROR_PATH = 'data/nverbs/advs/error-merged_advs_val.txt' | |
51 | +#ADVS_FRAMES_PATH = 'data/nverbs/advs/merged_advs_val_popr_usu_gdyby_20141113.txt' | |
52 | + | |
53 | +## adding entries relations # na razie brak danych | |
54 | +#ADV_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt' | |
55 | + | |
56 | + | |
class Command(BaseCommand):
    """Add selected non-verbal (noun) entries to the dictionary.

    Reads the selected entry list, orders it by corpus frequency, creates
    Lemma objects grouped into numbered Vocabulary buckets, attaches the
    initial valence frames and links each noun to its related verb entry.
    """
    args = 'none'
    help = 'Add new non-verbal entries to the dictionary.'

    def handle(self, *args, **options):
        # *args accepted for consistency with Django's management-command
        # convention (cf. get_lemmas_list.Command.handle); the original
        # handle(self, **options) signature would break on positional args.

        # load nouns
        entries_to_add = get_entries(SELECTED_NOUNS_PATH)
        ordered_entries = get_entries_by_freq(SELECTED_NOUNS_PATH, ORDERING)
        #related_entries = get_related_entries(NOUNS_RELATIONS_PATH, 'noun')
        added_entries = load_entries(ordered_entries, entries_to_add,
                                     'data/nverbs/nouns2consider/added_nouns_20171103.txt',
                                     ORDERING, 'noun', 'clarin2_nouns', 3, 3, 0)
        add_initial_frames_by_entries(added_entries,
                                      NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH,
                                      'noun')
        add_relations_by_nverb_entries(added_entries, NOUN_VERB_RELATIONS_PATH, 'verb', 'noun')
        #add_relations_by_nverb_entries(added_entries, NOUN_ADJ_RELATIONS_PATH, 'adj', 'noun')

        # Earlier runs kept for reference -- nouns (2014):
#        entries_with_val = get_entries(NOUNS_VAL_PATH)
#        entries = get_entries_by_freq(VERBAL_NOUNS_PATH, ORDERING)
#        load_entries(entries, B_entries, 'data/added_nouns_20140627.txt', ORDERING, 'noun',
#                     'clarin_nouns', 1, 1, 0)

        # adjectives (2014):
#        entries = get_entries_by_freq(VERBAL_ADJS_PATH, ORDERING)
#        related_entries = get_related_entries(ADJS_RELATIONS_PATH, 'adj')
#        added_entries = load_entries(entries, related_entries, 'data/added_adjs_20141219.txt', ORDERING, 'adj',
#                                     'clarin_adjs', 3, 3, 0)
#        add_initial_frames_by_entries(added_entries,
#                                      ADJS_FRAMES_PATH, ADJS_ADDED_PATH, ADJS_ERROR_PATH,
#                                      'adj')
#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')

        # adverbs (2014):
#        entries_with_val = get_entries(ADVS_VAL_PATH)
#        entries = get_entries_by_freq(VERBAL_ADVS_PATH, ORDERING)
#        added_entries = load_entries(entries, entries_with_val, 'data/added_advs_20141114.txt', ORDERING, 'adv',
#                                     'clarin_advs', 1, 1, 0)
#        add_initial_frames_by_entries(added_entries,
#                                      ADVS_FRAMES_PATH, ADVS_ADDED_PATH, ADVS_ERROR_PATH,
#                                      'adverb')
#        add_relations_by_nverb_entries(added_entries, ADJ_VERB_RELATIONS_PATH, 'adj')
99 | + | |
100 | +def get_entries(entries_path): | |
101 | + entries = [] | |
102 | + try: | |
103 | + entries_file = codecs.open(entries_path, "rt", 'utf-8') | |
104 | + for line in entries_file: | |
105 | + line_ls = line.split('\t') | |
106 | + entries.append({'entry' : line_ls[0].strip()}) | |
107 | + entries = list(set(entries)) | |
108 | + finally: | |
109 | + entries_file.close() | |
110 | + return entries | |
111 | + | |
112 | +def get_entries_by_freq(entries_path, ordering): | |
113 | + entries = [] | |
114 | + try: | |
115 | + freq_file = codecs.open(entries_path, "rt", 'utf-8') | |
116 | + for line in freq_file: | |
117 | + line_ls = line.split() | |
118 | + entries.append({'entry' : line_ls[0].strip(), | |
119 | + 'freq_1M': int(line_ls[1].strip()), | |
120 | + 'freq_300M': int(line_ls[2].strip())}) | |
121 | + entries = sorted(entries, key=itemgetter('freq_%s' % ordering), reverse=True) | |
122 | + finally: | |
123 | + freq_file.close() | |
124 | + return entries | |
125 | + | |
def load_entries(sorted_entries, entries_to_add, added_path, ordering, pos_tag,
                 dict_basename, first_dict_idx, last_dict_idx, min_freq):
    # Create new Lemma objects for every entry of `sorted_entries` (a
    # frequency-ordered list of {'entry', 'freq_1M', 'freq_300M'} dicts)
    # that also appears in `entries_to_add` and whose frequency for the
    # chosen `ordering` ('1M' or '300M') is at least `min_freq`.
    #
    # New lemmas go into Vocabulary buckets named `dict_basename` + index
    # (bare `dict_basename` for index 0); each bucket takes at most
    # VERBS_IN_DICT lemmas and indices run from `first_dict_idx` up to
    # `last_dict_idx` -- loading stops once the last bucket fills.
    # Added entry names are logged to `added_path` (tab-separated with both
    # frequencies) and returned as a list of entry strings.
    print 'Loading entries!!'
    try:
        added_entries = []
        added_file = codecs.open(added_path, "wt", 'utf-8')
        dict = first_dict_idx  # NOTE(review): shadows the builtin `dict`
        new_last_dict = last_dict_idx
        pos_obj = POS.objects.get(tag=pos_tag)
        verbs_per_dict = VERBS_IN_DICT
        # The status with the lowest priority value is taken as the initial
        # one -- presumably 'do obróbki' ("to do"); confirm against data.
        initial_status = Lemma_Status.objects.order_by('priority')[0]
        for entry in sorted_entries:
            found_entry = next((item for item in entries_to_add if item['entry'] == entry['entry']), None)
            if found_entry and entry['freq_%s' % ordering] >= min_freq:
                # NOTE(review): a Vocabulary row is created and saved for
                # every matching entry, even when the entry already exists
                # as a Lemma -- looks like it can produce duplicate
                # vocabulary rows; confirm whether this is intended.
                if dict == 0:
                    new_voc = Vocabulary(name=dict_basename)
                    new_voc.save()
                else:
                    new_voc = Vocabulary(name=dict_basename+str(dict))
                    new_voc.save()

                lemmas = Lemma.objects.filter(entry = entry['entry'])
                if lemmas.count() == 0:
                    # Bucket full: advance to the next index, stopping
                    # altogether once we pass `last_dict_idx`.
                    if verbs_per_dict == 0:
                        verbs_per_dict = VERBS_IN_DICT
                        dict += 1
                        if dict > new_last_dict:
                            break
                        else:
                            new_voc = Vocabulary(name=dict_basename+str(dict))
                            new_voc.save()
                    val_entry, created = get_or_create_entry(entry['entry'], pos_obj)
                    new_lemma = Lemma(entry=entry['entry'],
                                      entry_obj=val_entry,
                                      vocabulary=new_voc,
                                      status=initial_status,
                                      old=False,
                                      frequency_300M=entry['freq_300M'],
                                      frequency_1M=entry['freq_1M'])
                    new_lemma.save()
                    verbs_per_dict -= 1
                    added_file.write('%s\t%s\t%s\n' % (entry['entry'],
                                                       entry['freq_1M'],
                                                       entry['freq_300M']))
                    added_entries.append(entry['entry'])
                    print entry
    finally:
        added_file.close()
    return added_entries
175 | + | |
def get_related_entries(relations_path, pos_tag):
    # Collect candidate non-verb entries from `relations_path` whose base
    # verb already exists in the dictionary with a meaningful description.
    # Expected line format (whitespace-separated):
    #   <nverb> <freq_1M> <freq_300M> (<verb>...   -- TODO confirm.
    # Returns a list of {'entry', 'verb', 'freq_1M', 'freq_300M'} dicts.
    print 'Checking relations!!!'
    entries = []
    # NOTE(review): `pos` is looked up but never used below -- confirm
    # whether the nverb lookup was meant to filter on it.
    pos = POS.objects.get(tag=pos_tag)
    try:
        relations_file = codecs.open(relations_path, "rt", 'utf-8')
        for line in relations_file:
            #print line
            line_ls = line.split()
            verb = line_ls[3].lstrip('(').strip()
            nverb = line_ls[0].strip()
            # Only consider pairs where the non-verb is not yet in the
            # dictionary but the base verb is.
            if (not Lemma.objects.filter(entry=nverb).exists() and
                Lemma.objects.filter(entry=verb, entry_obj__pos__tag='verb').exists()):
                #entries.append(nverb)
                verb_active_lemma = Lemma.objects.get(entry=verb,
                                                      entry_obj__pos__tag='verb',
                                                      old=False)
                lemma_status_str = verb_active_lemma.status.status
                # Skip verbs that are still being processed
                # ('w obróbce' = in progress, 'do obróbki' = to do).
                if (not lemma_status_str == u'w obróbce' and
                    not lemma_status_str == u'do obróbki'):
                    # A verb whose only frame is the bare subject frame
                    # carries no useful valence information -- skip it.
                    if (verb_active_lemma.frames.count() == 1 and
                        verb_active_lemma.frames.filter(text_rep=u'subj{np(str)}').exists()):
                        pass
                    else:
                        entries.append({'entry' : nverb,
                                        'verb' : verb,
                                        'freq_1M': int(line_ls[1].strip()),
                                        'freq_300M': int(line_ls[2].strip())})
                        print line
    finally:
        relations_file.close()
    return entries
... | ... |
dictionary/management/commands/get_lemmas_list.py
0 → 100644
1 | +# -*- coding:utf-8 -*- | |
2 | + | |
3 | +import codecs | |
4 | +import datetime | |
5 | +import os | |
6 | + | |
7 | +from django.core.management.base import BaseCommand | |
8 | + | |
9 | +from dictionary.models import Lemma | |
10 | +from settings import PROJECT_PATH | |
11 | + | |
12 | + | |
13 | +POS = 'verb' | |
14 | +OUTPATH = os.path.join(PROJECT_PATH, 'data', '%ss-%s.txt' % (POS, datetime.datetime.now().strftime('%Y%m%d'))) | |
15 | + | |
16 | + | |
class Command(BaseCommand):
    """Dump the entry names of all current Walenty lemmas of POS."""
    help = 'Get lemmas existing in Walenty'

    def handle(self, *args, **options):
        # Current (non-archival) lemmas of the configured part of speech,
        # minus entries marked for deletion, in alphabetical entry order.
        selected = (Lemma.objects
                    .filter(old=False, entry_obj__pos__tag=POS)
                    .exclude(status__status=u'do usunięcia')
                    .order_by('entry_obj__name'))
        write_lemmas(selected)
24 | + | |
25 | + | |
def write_lemmas(lemmas, outpath=None):
    """Write one entry name per lemma, one per line, UTF-8 encoded.

    :param lemmas: iterable of lemma objects exposing ``entry_obj.name``.
    :param outpath: destination path; defaults to the module-level OUTPATH
        (backward-compatible generalization -- callers may now choose the
        target file).

    Fixes: the old ``finally`` clause raised NameError (masking the real
    error) when ``codecs.open`` itself failed; ``with`` closes the file
    safely in all cases.
    """
    if outpath is None:
        outpath = OUTPATH
    with codecs.open(outpath, 'w', 'utf-8') as outfile:
        for lemma in lemmas:
            outfile.write('%s\n' % lemma.entry_obj.name)
... | ... |
dictionary/management/commands/load_entries_relations.py
0 → 100644
1 | +#-*- coding:utf-8 -*- | |
2 | + | |
3 | +import codecs | |
4 | + | |
5 | +from django.core.management.base import BaseCommand | |
6 | + | |
7 | +from dictionary.models import Lemma, POS, get_or_create_entry | |
8 | + | |
9 | +NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns/nouns+verb-freq.txt' | |
10 | + | |
11 | +ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt' | |
12 | + | |
13 | +CHECK_PATH = 'data/nverbs/nouns/deriv_nouns-adj-freq-sel.txt' | |
14 | + | |
class Command(BaseCommand):
    """Management command for linking related dictionary entries."""
    args = 'none'
    help = """
    Add relations between entries from given file.
    """

    def handle(self, **options):
        # Alternative runs, kept for reference:
        #   add_relations(NOUN_VERB_RELATIONS_PATH, 'noun')
        #   add_relations(ADJ_VERB_RELATIONS_PATH, 'adj')
        report_path = 'data/nverbs/nouns2consider/deriv_nouns-adj-existing-20150928.txt'
        check_if_deriv_good_to_add('adj', 'noun', report_path)
25 | + | |
def add_relations(entries_path, pos_tag):
    # Link non-verb lemmas to their base verbs. For each line of
    # `entries_path` (whitespace-separated: <nverb> <freq_1M> <freq_300M>
    # (<verb>... -- TODO confirm format), make the verb's Entry and the
    # non-verb's Entry mutually related via rel_entries (both directions).
    # Lines where either lemma is missing are silently skipped.
    #
    # NOTE(review): `entries` is never appended to, so the function always
    # returns an empty list -- confirm whether callers rely on the return.
    entries = []
    pos = POS.objects.get(tag=pos_tag)
    try:
        freq_file = codecs.open(entries_path, "rt", 'utf-8')
        for line in freq_file:
            #print line
            line_ls = line.split()
            verb = line_ls[3].lstrip('(').strip()
            try:
                nverb = line_ls[0].strip()
                # Both lookups double as existence checks; DoesNotExist
                # skips the pair below.
                verb_obj = Lemma.objects.get(old=False, entry=verb, entry_obj__pos__tag='verb')
                nverb_obj = Lemma.objects.get(old=False, entry=nverb, entry_obj__pos=pos)
                entry = {'entry' : nverb,
                         'verb' : verb,
                         'freq_1M': int(line_ls[1].strip()),
                         'freq_300M': int(line_ls[2].strip())}
                nverb_entry, created = get_or_create_entry(entry['entry'], pos)
#                try:
#                    val_entry = Entry.objects.get(name=entry['entry'])
#                    if val_entry.pos.tag == 'verb':
#                        continue
#                    val_entry.pos = pos
#                    val_entry.save()
#                except Entry.DoesNotExist:
#                    val_entry = Entry(name=entry['entry'], pos=pos)
#                    val_entry.save()
                verb_entry = verb_obj.entry_obj
                verb_entry.rel_entries.add(nverb_entry)
                nverb_entry.rel_entries.add(verb_entry)
                print line
            except Lemma.DoesNotExist:
                pass
    finally:
        freq_file.close()
    return entries
62 | + | |
def add_relations_by_nverb_entries(entries, entries_path, from_pos_tag, to_pos_tag):
    # Like add_relations, but restricted to the non-verb entry names listed
    # in `entries` (a collection of entry strings, e.g. the return value of
    # load_entries). For each matching line of `entries_path`, the Entry of
    # the `from_pos_tag` lemma (3rd column, stripped of the leading '(')
    # and the Entry of the `to_pos_tag` lemma (1st column) are linked in
    # both directions via rel_entries. Missing lemmas are silently skipped.
    print 'Adding relations!'
    from_pos = POS.objects.get(tag=from_pos_tag)
    to_pos = POS.objects.get(tag=to_pos_tag)
    try:
        freq_file = codecs.open(entries_path, "rt", 'utf-8')
        for line in freq_file:
            #print line
            line_ls = line.split()
            verb = line_ls[3].lstrip('(').strip()
            try:
                nverb = line_ls[0].strip()
                if nverb in entries:
                    verb_obj = Lemma.objects.get(old=False, entry=verb, entry_obj__pos=from_pos)
                    nverb_obj = Lemma.objects.get(old=False, entry=nverb, entry_obj__pos=to_pos)
                    # NOTE(review): `entry` is built but only its parts are
                    # used implicitly through the objects above -- confirm.
                    entry = {'entry' : nverb,
                             'verb' : verb,
                             'freq_1M': int(line_ls[1].strip()),
                             'freq_300M': int(line_ls[2].strip())}
                    nverb_entry = nverb_obj.entry_obj
                    verb_entry = verb_obj.entry_obj
                    verb_entry.rel_entries.add(nverb_entry)
                    nverb_entry.rel_entries.add(verb_entry)
                    print line
            except Lemma.DoesNotExist:
                pass
    finally:
        freq_file.close()
91 | + | |
def check_if_deriv_good_to_add(from_pos_tag, to_pos_tag, outpath):
    """Write to *outpath* the CHECK_PATH lines describing derivation pairs
    that are safe to add: the target entry (1st column, *to_pos_tag*) does
    not yet exist as a current lemma, while the source entry (4th column,
    stripped of the leading '(', *from_pos_tag*) does.

    Fixes: the original's try/finally was commented out, so both files
    leaked on any exception; ``with`` now closes them reliably. The unused
    ``from_lemma`` binding was dropped -- the ``get`` call is kept purely
    as an existence check (Lemma.MultipleObjectsReturned still propagates,
    as before).
    """
    with codecs.open(CHECK_PATH, "rt", 'utf-8') as freq_file, \
         codecs.open(outpath, "wt", 'utf-8') as good_file:
        for line in freq_file:
            line_ls = line.split()
            to_entry = line_ls[0].strip()
            from_entry = line_ls[3].lstrip('(').strip()
            if Lemma.objects.filter(old=False, entry=to_entry,
                                    entry_obj__pos__tag=to_pos_tag).exists():
                continue  # target already in the dictionary -- nothing to add
            try:
                Lemma.objects.get(old=False, entry=from_entry,
                                  entry_obj__pos__tag=from_pos_tag)
            except Lemma.DoesNotExist:
                continue  # source lemma missing -- pair not addable
            good_file.write(line)
            print(line)
112 | + | |
0 | 113 | \ No newline at end of file |
... | ... |
dictionary/management/commands/load_initial_nverb_frames.py
0 → 100644
1 | +#-*- coding:utf-8 -*- | |
2 | + | |
3 | +#Copyright (c) 2014, Bartłomiej Nitoń | |
4 | +#All rights reserved. | |
5 | + | |
6 | +#Redistribution and use in source and binary forms, with or without modification, are permitted provided | |
7 | +#that the following conditions are met: | |
8 | + | |
9 | +# Redistributions of source code must retain the above copyright notice, this list of conditions and | |
10 | +# the following disclaimer. | |
11 | +# Redistributions in binary form must reproduce the above copyright notice, this list of conditions | |
12 | +# and the following disclaimer in the documentation and/or other materials provided with the distribution. | |
13 | + | |
14 | +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED | |
15 | +# WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A | |
16 | +# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR | |
17 | +# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED | |
18 | +# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
19 | +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING | |
20 | +# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE | |
21 | +# POSSIBILITY OF SUCH DAMAGE. | |
22 | + | |
23 | +import codecs | |
24 | +import itertools | |
25 | +from operator import itemgetter | |
26 | + | |
27 | +from django.core.management.base import BaseCommand | |
28 | + | |
29 | +#from dictionary.common_func import arg_data_to_arg, args_to_position, \ | |
30 | +# positions_to_frame | |
31 | +from dictionary.models import Argument, Argument_Model, Frame_Opinion, \ | |
32 | + Frame_Opinion_Value, Lemma, positions_to_frame, \ | |
33 | + get_or_create_position | |
34 | + | |
35 | + | |
36 | +NOUNS_ADDED_PATH = 'data/nverbs/nouns/added-merged_nouns_val.txt' | |
37 | +NOUNS_ERROR_PATH = 'data/nverbs/nouns/error-merged_nouns_val.txt' | |
38 | +NOUNS_FRAMES_PATH = 'data/nverbs/nouns/merged_nouns_val-poss.txt' | |
39 | + | |
40 | +ADJS_ADDED_PATH = 'data/nverbs/adjs/added-merged_adjs_val.txt' | |
41 | +ADJS_ERROR_PATH = 'data/nverbs/adjs/error-merged_adjs_val.txt' | |
42 | +ADJS_FRAMES_PATH = 'data/nverbs/adjs/merged_adjs_val-P1.txt' | |
43 | + | |
class Command(BaseCommand):
    """Management command that seeds non-verbal lemmas with their frames."""
    args = 'none'
    help = """
    Adds initial nverb frames.
    """

    def handle(self, **options):
        # Noun run kept for reference:
        #   add_initial_frames(NOUNS_FRAMES_PATH, NOUNS_ADDED_PATH, NOUNS_ERROR_PATH, 'noun')
        add_initial_frames(ADJS_FRAMES_PATH, ADJS_ADDED_PATH, ADJS_ERROR_PATH, 'adj')
53 | + | |
def add_initial_frames(frames_path, added_path, error_path, pos_tag):
    """Attach initial valence frames to lemmas listed in *frames_path*.

    Each non-comment line (comment lines start with '%') carries a lemma,
    its frames string and an optional PRED marker (see get_frames_info).
    Only current (old=False) lemmas of *pos_tag* still in the
    u'do obróbki' (to-do) status are touched; lines whose lemma is missing
    are skipped. Successfully parsed lines are logged to *added_path*,
    failing ones to *error_path* -- best effort, parsing errors do not
    stop the run.

    Fixes: the bare ``except:`` is narrowed to ``except Exception`` so
    KeyboardInterrupt/SystemExit are no longer swallowed; ``with``
    replaces the try/finally whose close calls raised NameError when an
    earlier ``codecs.open`` failed; the dead ``pred_val = ''`` assignment
    (always overwritten by get_frames_info) was removed.
    """
    with codecs.open(added_path, "wt", 'utf-8') as added_file, \
         codecs.open(error_path, "wt", 'utf-8') as error_file, \
         codecs.open(frames_path, "rt", 'utf-8') as frames_file:
        for line in frames_file:
            line = line.strip()
            if line.startswith('%'):  # comment line in the data file
                continue
            lemma, frames_str, pred_val = get_frames_info(line)
            try:
                lemma_obj = Lemma.objects.get(entry=lemma, old=False,
                                              status__status=u'do obróbki',
                                              entry_obj__pos__tag=pos_tag)
            except Lemma.DoesNotExist:
                continue  # lemma not (or no longer) pending -- skip line
            print(lemma_obj)
            try:
                parse_and_add_frames(lemma_obj, frames_str, pred_val)
                added_file.write(u'%s\n' % line)
            except Exception:
                # Best effort: record the offending line and keep going.
                error_file.write(u'%s\n' % line)
82 | + | |
def add_initial_frames_by_entries(entries, frames_path, added_path, error_path, pos_tag):
    """Attach initial valence frames, restricted to lemmas named in *entries*.

    Same processing as add_initial_frames, but a data line is only applied
    when its lemma occurs in *entries* (a collection of entry strings,
    e.g. the return value of load_entries). Successfully parsed lines are
    logged to *added_path*, failing ones to *error_path*.

    Fixes: bare ``except:`` narrowed to ``except Exception``; ``with``
    replaces the try/finally whose close calls raised NameError when an
    earlier ``codecs.open`` failed; dead ``pred_val = ''`` assignment
    removed.
    """
    print('Adding initial frames!')
    with codecs.open(added_path, "wt", 'utf-8') as added_file, \
         codecs.open(error_path, "wt", 'utf-8') as error_file, \
         codecs.open(frames_path, "rt", 'utf-8') as frames_file:
        for line in frames_file:
            line = line.strip()
            if line.startswith('%'):  # comment line in the data file
                continue
            lemma, frames_str, pred_val = get_frames_info(line)
            if lemma not in entries:
                continue
            try:
                lemma_obj = Lemma.objects.get(entry=lemma, old=False,
                                              status__status=u'do obróbki',
                                              entry_obj__pos__tag=pos_tag)
            except Lemma.DoesNotExist:
                continue  # lemma not (or no longer) pending -- skip line
            print(lemma_obj)
            try:
                parse_and_add_frames(lemma_obj, frames_str, pred_val)
                added_file.write(u'%s\n' % line)
            except Exception:
                # Best effort: record the offending line and keep going.
                error_file.write(u'%s\n' % line)
112 | + | |
def get_frames_info(line):
    """Split a tab-separated data line into (lemma, frames string, predicativity).

    Predicativity is 'pred' when the line has exactly three fields and the
    third is the literal marker 'PRED'; otherwise it is the empty string.
    """
    fields = line.split('\t')
    is_pred = len(fields) == 3 and fields[2] == 'PRED'
    return fields[0].strip(), fields[1].strip(), 'pred' if is_pred else ''
121 | + | |
def parse_and_add_frames(lemma_obj, frames_str, predicativity_val):
    # Parse a '+'-separated frames string into positions ('/' separates
    # alternative realizations of one position), build every combination
    # of alternatives as a Frame, and attach the frames and their derived
    # opinions to `lemma_obj`.
    poss_ls = []
    valence_ls = [arg.strip() for arg in frames_str.split('+')]
    for pos_arg in valence_ls:
        pos_arg = pos_arg.strip()
        possible_args = pos_arg.split('/')
        possible_args = coordinate_arguments(possible_args)
        poss_ls.append(possible_args)
    confs = itertools.product(*poss_ls)
    for frame_args in list(confs):
        # --> there was a bug here that created strange positions holding
        # the same argument several times; this dedup line may help.
        frame_args = list(set(frame_args))
        frame_obj, frame_opinion_obj = create_frame(frame_args, predicativity_val)
        lemma_obj.frames.add(frame_obj)
        lemma_obj.frame_opinions.add(frame_opinion_obj)
136 | + | |
def coordinate_arguments(arguments):
    # Merge alternative argument realizations that share the same case and
    # preposition into a single coordinated argument whose text reps are
    # joined with ';'. For clause arguments (ncp/prepncp) an equivalent
    # nominal realization (np/prepnp with the same case/preposition) is
    # appended as well. Returns the list of coordinated text reps.
    coordinated_args = []
    for arg in arguments:
        arg_type, attributes = arg_from_text_rep(arg)
        case, preposition = get_arg_case_and_preposition(arg)
        # Find an already-collected argument with the same case and
        # preposition (the generator's `arg` has its own scope and does
        # not clobber the loop variable).
        coordinated_arg = next((arg for arg in coordinated_args if (arg['case'] == case and
                                                                    arg['preposition'] == preposition)), None)
        if coordinated_arg and case:
            coordinated_arg['argument'] += ';%s' % arg
        else:
            coordinated_arg = {'argument': arg,
                               'case': case,
                               'preposition': preposition}
            coordinated_args.append(coordinated_arg)
        if arg_type == 'ncp':
            additional_arg = u'np(%s)' % case
            coordinated_arg['argument'] += ';%s' % additional_arg
        elif arg_type == 'prepncp':
            additional_arg = u'prepnp(%s,%s)' % (preposition, case)
            coordinated_arg['argument'] += ';%s' % additional_arg

    return [arg['argument'] for arg in coordinated_args]
159 | + | |
def arg_from_text_rep(argument):
    """Split an argument text representation into (type, attribute list).

    'prepnp(za,acc)' yields ('prepnp', ['za', 'acc']); a bare type such as
    'subj' yields ('subj', []). Single quotes in the attribute section are
    dropped; only the text up to any second '(' is considered.
    """
    pieces = argument.split('(')
    if len(pieces) == 1:
        return pieces[0], []
    return pieces[0], pieces[1].rstrip(')').replace("'", "").split(',')
167 | + | |
def get_arg_case_and_preposition(argument):
    # Extract the grammatical case and preposition of an argument text rep
    # by pairing its attribute values positionally with the argument
    # model's attribute definitions (in priority order). Attributes named
    # PRZYPADEK (case) and PRZYIMEK (preposition) are picked out; ones
    # that are absent stay ''.
    case = ''
    preposition = ''
    argument = arg_conversion(argument)
    arg_type, attributes = arg_from_text_rep(argument)
    argument_model = Argument_Model.objects.get(arg_model_name=arg_type)
    attribute_models = argument_model.atribute_models.order_by('priority')
    # zip truncates to the shorter sequence, so extra attributes (or extra
    # model slots) are silently ignored.
    for attr_model, attr_text_rep in zip(attribute_models, attributes):
        if attr_model.atr_model_name == u'PRZYPADEK':
            case = attr_text_rep
        elif attr_model.atr_model_name == u'PRZYIMEK':
            preposition = attr_text_rep
    return case, preposition
181 | + | |
def arg_conversion(arg_text_rep):
    """Normalize a raw argument text representation.

    Strips the opinion markers '!', '*' and '?', maps the legacy 'advp'
    type to u'xp(_)', and for comprepnp arguments drops the quoting and
    the ',gen' case attribute.
    """
    cleaned = arg_text_rep.replace('!', '').replace('*', '').replace('?', '')
    if cleaned == 'advp':
        return u'xp(_)'
    if cleaned.startswith('comprepnp'):
        return cleaned.replace("'", "").replace(',gen', '')
    return cleaned
189 | + | |
def create_frame(frame_args, predicativity_val):
    # Build (or fetch) a Frame from the coordinated position strings in
    # `frame_args` and attach a Frame_Opinion derived from the opinion
    # markers carried by the arguments. Returns (frame, frame_opinion).
    positions_objs, frame_opinion_value = get_positions(frame_args)
    frame_obj = positions_to_frame(positions_objs,
                                   reflex='',
                                   negativity='',
                                   predicativity=predicativity_val,
                                   aspect='')
    frame_opinion_obj, xx = Frame_Opinion.objects.get_or_create(frame=frame_obj,
                                                                value=frame_opinion_value)
    return frame_obj, frame_opinion_obj
200 | + | |
def get_positions(args_strs):
    # Create a Position object for every coordinated-arguments string and
    # derive the overall frame opinion: each string votes with the opinion
    # implied by its markers, and the vote with the lowest 'priority'
    # value (i.e. the most severe) wins.
    poss_objs = []
    frame_opinions = []
    for poss_args_str in args_strs:
        frame_opinions.append(possible_frame_opinion(poss_args_str))
        poss_objs.append(create_position(poss_args_str))
    frame_opinion = sorted(frame_opinions, key=itemgetter('priority'), reverse=False)[0]
    frame_opinion_value = Frame_Opinion_Value.objects.get(value=frame_opinion['opinion'])
    return poss_objs, frame_opinion_value
210 | + | |
def possible_frame_opinion(arg_str):
    """Map the opinion markers in *arg_str* to a frame opinion dict.

    Markers are checked in severity order: '!' (zły/bad) before '?'
    (wątpliwy/doubtful) before '*' (archaiczny/archaic); with no marker
    the opinion is 'pewny' (certain). A lower 'priority' value wins when
    opinions from several positions are combined.
    """
    for marker, opinion, priority in (('!', u'zły', '1'),
                                      ('?', u'wątpliwy', '2'),
                                      ('*', u'archaiczny', '3')):
        if marker in arg_str:
            return {'opinion': opinion, 'priority': priority}
    return {'opinion': 'pewny', 'priority': '4'}
224 | + | |
def create_position(args_str):
    # Resolve every ';'-separated argument text rep to an existing
    # Argument object and wrap the set in a (possibly shared) Position.
    arg_objs = []
    for arg_text_rep in args_str.split(';'):
        arg_text_rep = arg_conversion(arg_text_rep)
#        try:
        # Arguments are expected to exist already; Argument.DoesNotExist
        # propagates to the caller, which logs the line as an error.
        arg_obj = Argument.objects.get(text_rep=arg_text_rep)
#        except Argument.DoesNotExist: # TODO: disable when initially loading entries
#            arg_type, attributes = arg_from_text_rep(arg_text_rep)
#            arg_obj = arg_data_to_arg(arg_type, attributes)
        arg_objs.append(arg_obj)
    pos_obj = get_or_create_position(categories=[], arguments=arg_objs)
    return pos_obj
237 | + | |
238 | + | |
239 | + | |
0 | 240 | \ No newline at end of file |
... | ... |