load_entries_relations.py
3.94 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#-*- coding:utf-8 -*-
import codecs
from django.core.management.base import BaseCommand
from dictionary.models import Lemma, POS, get_or_create_entry
# Input listings used by this command.  The functions in this module split each
# line on whitespace and read field 0 as the derived word and field 3 (with any
# leading '(' stripped) as the word it is related to; add_relations also parses
# fields 1 and 2 as integers ('freq_1M' / 'freq_300M').
# NOTE(review): the paths are relative -- presumably to the directory the
# command is started from; confirm.

# Listing for add_relations(..., 'noun'); only referenced from a commented-out
# call in Command.handle.
NOUN_VERB_RELATIONS_PATH = 'data/nverbs/nouns/nouns+verb-freq.txt'
# Listing for add_relations(..., 'adj'); only referenced from a commented-out
# call in Command.handle.
ADJ_VERB_RELATIONS_PATH = 'data/nverbs/adjs/merged_adjs+verb-freq.txt'
# Listing read by check_if_deriv_good_to_add.
CHECK_PATH = 'data/nverbs/nouns/deriv_nouns-adj-freq-sel.txt'
class Command(BaseCommand):
    """Management command entry point for the relation-loading helpers.

    In its current form the command only runs ``check_if_deriv_good_to_add``
    for adjective -> noun pairs and writes the accepted lines to a fixed
    output file; the relation loaders are left disabled.
    """
    args = 'none'
    help = """
Add relations between entries from given file.
"""

    def handle(self, *args, **options):
        """Run the derivative check.

        ``*args`` is accepted (and ignored) so the method matches the
        ``BaseCommand.handle(*args, **options)`` contract and does not raise
        ``TypeError`` when the framework passes positional arguments through.
        ``options`` is not used.
        """
        # Disabled calls kept from the original; re-enable to load relations.
        #add_relations(NOUN_VERB_RELATIONS_PATH, 'noun')
        #add_relations(ADJ_VERB_RELATIONS_PATH, 'adj')
        check_if_deriv_good_to_add('adj', 'noun', 'data/nverbs/nouns/deriv_nouns-adj-existing-20150928.txt')
def add_relations(entries_path, pos_tag):
    """Relate the entries listed in a frequency file to their verbs.

    Every line of ``entries_path`` is split on whitespace and read as
    ``<word> <freq_1M> <freq_300M> (<verb> ...``.  When a current
    (``old=False``) lemma with part of speech ``'verb'`` exists for the verb,
    the word's entry is fetched or created via ``get_or_create_entry`` and
    the two entries are added to each other's ``rel_entries``.  Each line
    handled that way is echoed to stdout.

    Args:
        entries_path: path of the UTF-8 encoded listing.
        pos_tag: tag of the ``POS`` object used for the non-verb entries.

    Returns:
        A list with one dict per related line, holding the keys ``'entry'``,
        ``'verb'``, ``'freq_1M'`` and ``'freq_300M'``.

    Raises:
        POS.DoesNotExist: if no ``POS`` with tag ``pos_tag`` exists.
        IOError: if ``entries_path`` cannot be opened.
        ValueError: if a frequency column of a matched line is not an integer.
    """
    entries = []
    pos = POS.objects.get(tag=pos_tag)
    # ``with`` closes the file on every path.  The previous try/finally opened
    # the file inside ``try``, so a failed open ended in a NameError raised
    # from ``finally`` instead of the real I/O error.
    with codecs.open(entries_path, "rt", 'utf-8') as freq_file:
        for line in freq_file:
            line_ls = line.split()
            if len(line_ls) < 4:
                # Blank or truncated line: there is no verb column to read.
                continue
            nverb = line_ls[0]
            verb = line_ls[3].lstrip('(')
            try:
                verb_obj = Lemma.objects.get(old=False, entry_obj__name=verb,
                                             entry_obj__pos__tag='verb')
                entry = {'entry': nverb,
                         'verb': verb,
                         'freq_1M': int(line_ls[1]),
                         'freq_300M': int(line_ls[2])}
                nverb_entry, _created = get_or_create_entry(entry['entry'], pos)
                verb_entry = verb_obj.entry_obj
                # Register the relation from both sides.
                verb_entry.rel_entries.add(nverb_entry)
                nverb_entry.rel_entries.add(verb_entry)
                # Collect the record; it used to be built and then dropped,
                # which made the function always return an empty list.
                entries.append(entry)
                print(line)
            except Lemma.DoesNotExist:
                # No current verb lemma for this line: nothing to relate.
                pass
    return entries
def add_relations_by_nverb_entries(entries, entries_path, from_pos_tag, to_pos_tag):
    """Relate already existing entries for the words selected by ``entries``.

    Every line of ``entries_path`` is split on whitespace; field 0 is the
    target word and field 3 (leading ``(`` stripped) is the source word.
    Lines whose target word is contained in ``entries`` are processed: the
    current (``old=False``) lemmas of both words are looked up and their
    entries are added to each other's ``rel_entries``.  Nothing is created;
    a line is skipped when either lemma is missing.  Each related line is
    echoed to stdout.  The frequency columns (fields 1 and 2) are not used.

    Args:
        entries: container tested with ``word in entries`` to select lines.
        entries_path: path of the UTF-8 encoded listing.
        from_pos_tag: tag of the ``POS`` of the source word (field 3).
        to_pos_tag: tag of the ``POS`` of the target word (field 0).

    Raises:
        POS.DoesNotExist: if either tag has no ``POS`` object.
        IOError: if ``entries_path`` cannot be opened.
    """
    print('Adding relations!')
    from_pos = POS.objects.get(tag=from_pos_tag)
    to_pos = POS.objects.get(tag=to_pos_tag)
    # ``with`` closes the file on every path.  The previous try/finally opened
    # the file inside ``try``, so a failed open ended in a NameError raised
    # from ``finally`` instead of the real I/O error.
    with codecs.open(entries_path, "rt", 'utf-8') as freq_file:
        for line in freq_file:
            line_ls = line.split()
            if len(line_ls) < 4:
                # Blank or truncated line: there is no source-word column.
                continue
            nverb = line_ls[0]
            if nverb not in entries:
                continue
            verb = line_ls[3].lstrip('(')
            try:
                verb_obj = Lemma.objects.get(old=False, entry_obj__name=verb,
                                             entry_obj__pos=from_pos)
                nverb_obj = Lemma.objects.get(old=False, entry_obj__name=nverb,
                                              entry_obj__pos=to_pos)
                nverb_entry = nverb_obj.entry_obj
                verb_entry = verb_obj.entry_obj
                # Register the relation from both sides.
                verb_entry.rel_entries.add(nverb_entry)
                nverb_entry.rel_entries.add(verb_entry)
                print(line)
            except Lemma.DoesNotExist:
                # One of the two lemmas is missing: nothing to relate.
                pass
def check_if_deriv_good_to_add(from_pos_tag, to_pos_tag, outpath, inpath=None):
    """Copy the listing lines that describe a new, addable derivative.

    Every line of the input listing is split on whitespace; field 0 is the
    derived word and field 3 (leading ``(`` stripped) is the word it derives
    from.  A line is written unchanged to ``outpath`` -- and echoed to
    stdout -- when no current (``old=False``) lemma of the derived word with
    part of speech ``to_pos_tag`` exists, while a current lemma of the source
    word with part of speech ``from_pos_tag`` does.

    Args:
        from_pos_tag: POS tag the source word (field 3) must have.
        to_pos_tag: POS tag under which the derived word (field 0) must not
            exist yet.
        outpath: path of the UTF-8 file receiving the accepted lines; it is
            opened for writing, so existing content is replaced.
        inpath: path of the UTF-8 listing to read; defaults to ``CHECK_PATH``.

    Raises:
        IOError: if either file cannot be opened.
    """
    if inpath is None:
        inpath = CHECK_PATH
    # Nested ``with`` blocks close both files on every path.  The previous
    # try/finally opened both files inside ``try``: a failed open ended in a
    # NameError raised from ``finally``, and when the output file could not
    # be opened the input file was never closed.
    with codecs.open(inpath, "rt", 'utf-8') as freq_file:
        with codecs.open(outpath, "wt", 'utf-8') as good_file:
            for line in freq_file:
                line_ls = line.split()
                if len(line_ls) < 4:
                    # Blank or truncated line: there is no source-word column.
                    continue
                to_entry = line_ls[0]
                from_entry = line_ls[3].lstrip('(')
                if Lemma.objects.filter(old=False, entry_obj__name=to_entry,
                                        entry_obj__pos__tag=to_pos_tag).exists():
                    # The derived word is already in the dictionary.
                    continue
                try:
                    # Only the existence of the source lemma matters here.
                    Lemma.objects.get(old=False, entry_obj__name=from_entry,
                                      entry_obj__pos__tag=from_pos_tag)
                    good_file.write(line)
                    print(line)
                except Lemma.DoesNotExist:
                    # The source word is unknown: the line is not addable.
                    pass