load_morfologik.py
5.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
#-*- coding:utf-8 -*-
import sys
import json
from django.db import connection, transaction
from django.db.models import Max
from django.core.management.base import BaseCommand, CommandError
from common.util import no_history, debug
from dictionary.models import Lexeme, Pattern, LexemeAssociation, \
LexemeInflectionPattern, PartOfSpeech, Vocabulary, InflectionCharacteristic, \
CrossReference, CrossReferenceType, ClassificationValue
# Id range reserved for lexemes created by this loader.
START_ID = 500000
END_ID = 1000000

# Resume numbering right after the highest id already used in the range;
# start fresh at START_ID when the range is still empty.
_max_used_id = Lexeme.objects.filter(
    pk__gte=START_ID, pk__lt=END_ID).aggregate(Max('id'))['id__max']
next_id = _max_used_id + 1 if _max_used_id else START_ID
class Command(BaseCommand):
    # CLI usage string (Polish: '<vocabulary name> <source name> <input file name>').
    # NOTE(review): the order documented here (vocab, source, file) contradicts
    # the parameter order of handle() below (input_file, vocab_name, source) —
    # confirm against actual invocations which ordering is correct.
    args = '<nazwa słownika> <nazwa źródła> <nazwa pliku wejściowego>'
    help = 'Load prepared lexeme data'

    def handle(self, input_file, vocab_name, source, **options):
        # Entry point: delegates all work to the module-level loader.
        load_morfologik(input_file, vocab_name, source)
# Module-level state shared by the loader functions below; filled in by
# load_morfologik().
source = None
vocab = None  # ugly, but reworking everything is not worth the effort

# Lookup tables built once at import time so the per-line processing
# below never has to query the database for these objects.
pos_table = dict(
    (pos.symbol, pos) for pos in PartOfSpeech.objects.all())
ic_table = dict(
    ((ic.entry, ic.part_of_speech.symbol), ic)
    for ic in InflectionCharacteristic.objects.all())
pattern_table = dict(
    (pat.name, pat) for pat in Pattern.objects.all())
def associate(l):
    """Attach lexeme *l* to the global target vocabulary.

    Emits a debug message when the association already existed, except for
    parts of speech ('ppas', 'pact', 'ger') where repeats are expected.
    """
    association, was_created = LexemeAssociation.objects.get_or_create(
        lexeme=l, vocabulary=vocab)
    if was_created:
        return
    if l.part_of_speech.symbol in ('ppas', 'pact', 'ger'):
        return
    debug(l.entry, u'wielokrotne przypisanie leksemu do słownika!')
def add_cr(l_from, l_to, symbol):
    """Create and save a cross-reference of type *symbol* from l_from to l_to.

    The cross-reference type is looked up by symbol together with both
    lexemes' parts of speech.
    """
    cr_type = CrossReferenceType.objects.get(
        symbol=symbol,
        from_pos=l_from.part_of_speech,
        to_pos=l_to.part_of_speech)
    reference = CrossReference(
        from_lexeme=l_from, to_lexeme=l_to, type=cr_type)
    reference.save()
def create_lexeme(entry, homonym_number, part_of_speech, status, comment,
                  commonness=None):
    """Create, save and vocabulary-associate a new lexeme; return it.

    Allocates ids from the module-global ``next_id`` counter (advanced on
    every call).  When *commonness* is given, the lexeme is also added to
    the matching ClassificationValue.
    """
    global next_id
    lexeme = Lexeme(
        id=next_id, entry=entry, homonym_number=homonym_number,
        part_of_speech=part_of_speech, source=source, status=status,
        comment=comment, owner_vocabulary=vocab)
    lexeme.save()
    if commonness:
        ClassificationValue.objects.get(label=commonness).lexemes.add(lexeme)
    associate(lexeme)
    next_id += 1
    return lexeme
def create_negated(l):
    """Create the "nie"-prefixed (negated) counterpart of lexeme *l*.

    Copies every inflection pattern with a "nie"-prefixed root, then links
    both lexemes with the adjnie/nieadj cross-references.
    """
    negated = create_lexeme(
        u"nie" + l.entry, l.homonym_number, l.part_of_speech,
        "cand" if l.status == "cand" else "desc", '')
    for lip in l.lexemeinflectionpattern_set.all():
        # Negated forms get the "0-" adjective characteristic unless the
        # source inflection already uses entry "0-".
        if lip.inflection_characteristic.entry == "0-":
            ic = lip.inflection_characteristic
        else:
            ic = ic_table[("0-", "adj")]
        LexemeInflectionPattern(
            lexeme=negated, index=lip.index, pattern=lip.pattern,
            root=u"nie" + lip.root,
            inflection_characteristic=ic).save()
    add_cr(l, negated, "adjnie")
    add_cr(negated, l, "nieadj")
def check_der(verb, pos, entry, patterns):
    """Find an existing derived lexeme matching *entry*, *pos* and *patterns*.

    A candidate matches when its first inflection pattern shares *verb*'s
    inflection characteristic and its pattern-name set equals *patterns*.
    Returns the first match (emitting a debug message when ambiguous), or
    None when nothing matches or *verb* has no inflection patterns.
    """
    verb_lips = verb.lexemeinflectionpattern_set.all()
    if not verb_lips:
        return None
    ic = verb_lips[0].inflection_characteristic.entry
    wanted_patterns = set(patterns)
    matched = []
    candidates = Lexeme.objects.filter(
        deleted=False, entry=entry, part_of_speech__symbol=pos,
        lexemeinflectionpattern__inflection_characteristic__entry=ic)
    for candidate in candidates:
        candidate_lips = candidate.lexemeinflectionpattern_set.all()
        if candidate_lips[0].inflection_characteristic.entry != ic:
            continue
        candidate_patterns = set(
            candidate.patterns.values_list('name', flat=True))
        if candidate_patterns == wanted_patterns:
            matched.append(candidate)
    if len(matched) > 1:
        debug(entry, u'niejednoznaczny derywat')
    return matched[0] if matched else None
def create_derived(l, pos, entry, patterns):
    """Reuse or create a lexeme derived from verb *l* and cross-link them.

    An existing derivative (per check_der) is merely associated with the
    vocabulary when needed; otherwise a fresh lexeme is created, copying
    the inflection patterns of *l* whose pattern names occur in *patterns*.
    """
    derived = check_der(l, pos, entry, patterns)
    if derived:
        if vocab not in derived.vocabularies.all():
            associate(derived)
    else:
        # copying homonym_number makes no sense, but what would?
        derived = create_lexeme(
            entry, l.homonym_number, pos_table[pos], l.status, u'')
        for lip in l.lexemeinflectionpattern_set.all():
            if lip.pattern.name not in patterns:
                continue
            # strip the "q" prefix to map the verb characteristic onto
            # the derived part of speech
            ic_entry = lip.inflection_characteristic.entry.lstrip("q")
            LexemeInflectionPattern(
                lexeme=derived, index=lip.index, pattern=lip.pattern,
                root=lip.root,
                inflection_characteristic=ic_table[(ic_entry, pos)]).save()
    add_cr(l, derived, "ver" + pos)
    add_cr(derived, l, pos + "ver")
def load_morfologik(filename, vocab_name, source_):
    """Load lexeme data from *filename* (one JSON object per line).

    Lines with lexeme source 'sgjp' only get associated with the target
    vocabulary; lines with source 'morfologik' create a new lexeme with
    its inflection patterns, derived lexemes and optional negation.
    Everything runs in one manually managed transaction, committed at
    the end (old Django transaction API).

    Raises ValueError on an unrecognized root 'type' in the input data.
    """
    global vocab, source
    vocab = Vocabulary.objects.get(id=vocab_name)
    source = source_
    transaction.commit_unless_managed()
    transaction.enter_transaction_management()
    transaction.managed(True)
    no_history()
    with open(filename) as file:
        for line in file:
            data = json.loads(line.decode('utf-8'))
            lexeme_data = data['lexeme']
            if lexeme_data['source'] == 'sgjp':
                # existing SGJP lexeme: just attach it to the vocabulary
                l = Lexeme.objects.get(pk=lexeme_data['id'])
                associate(l)
            elif lexeme_data['source'] == 'morfologik':
                l = create_lexeme(
                    lexeme_data['entry'], lexeme_data['homonym_number'],
                    pos_table[lexeme_data['part_of_speech']],
                    lexeme_data['status'], lexeme_data['comment'],
                    lexeme_data.get('commonness'))
                for lip_data in data['lips']:
                    pattern = pattern_table[lip_data['pattern']]
                    ic = ic_table[tuple(lip_data['ic'])]
                    root_data = lip_data['root']
                    if root_data['type'] == 'string':
                        root = root_data['root']
                    elif root_data['type'] == 'compute':
                        root = l.get_root(pattern, ic)
                    else:
                        # previously an unknown type left ``root`` unbound
                        # and crashed with an opaque NameError below
                        raise ValueError(
                            'unknown root type: %r' % root_data['type'])
                    LexemeInflectionPattern(
                        lexeme=l, index=lip_data['ind'], pattern=pattern,
                        root=root, inflection_characteristic=ic).save()
                if 'derived' in data:
                    for pos, entry, patterns in data['derived']:
                        create_derived(l, pos, entry, patterns)
                if 'negated' in data:
                    create_negated(l)
    transaction.commit()
    transaction.leave_transaction_management()