import_witek.py
4.74 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
# -*- coding: utf-8 -*-
import sys
from django.core.management.base import BaseCommand
from django.db.models import Max
from django.db.transaction import atomic
from common.util import uniopen, no_history
from dictionary.models import Lexeme, Vocabulary, LexemeInflectionPattern, \
Qualifier, ClassificationValue, LexemeCV, Gender, \
LexemeAttributeValue, CrossReferenceType, CrossReference
from patterns.models import Pattern
class Command(BaseCommand):
    """Management command that imports lexemes from a ';'-separated file."""

    help = "My shiny new management command."

    def handle(self, filename, comment, *args, **options):
        """Open ``filename`` with ``uniopen`` and import the lines it yields.

        ``comment`` is passed through unchanged to ``import_lexemes``, which
        forwards it to every lexeme it creates.
        """
        source_lines = uniopen(filename)
        import_lexemes(source_lines, comment)
# Vocabulary whose id is 'WSJP'. It is fetched once, when this module is
# imported, so that record has to exist in the database beforehand.
# new_lexeme() uses it as the owner of every lexeme it creates.
WSJP = Vocabulary.objects.get(id='WSJP')
@atomic
def import_lexemes(lines, comment):
    """Import every lexeme described by ``lines`` inside one transaction.

    Each line is a ';'-separated record whose second field names the part of
    speech.  Records for 'subst', 'adv' and 'adj' are handed to the matching
    importer; records with any other part of speech are ignored.  Blank
    lines are skipped.

    ``lines`` is any iterable of text lines and ``comment`` is forwarded to
    the importers unchanged.  An exception raised by an importer propagates
    out of this function (and therefore out of the ``atomic`` wrapper).
    """
    # no_history() comes from common.util; presumably it switches off history
    # tracking for the duration of the import -- TODO confirm.
    no_history()
    # Built here rather than at module level because the importers are
    # defined further down in the file.
    importers = {
        'subst': import_subst,
        'adv': import_adv,
        'adj': import_adj,
    }
    for line in lines:
        record = line.strip()
        if not record:
            # A stray empty line (e.g. the one ending the file) would
            # otherwise abort the whole import with an IndexError below.
            continue
        elements = record.split(';')
        importer = importers.get(elements[1])
        if importer is not None:
            importer(elements, comment)
# Id handed to the most recently created lexeme; None until new_lexeme() has
# been called for the first time in this process.
next_id = None


def new_lexeme(entry, pos, comment):
    """Create, save and return a new Lexeme owned by the WSJP vocabulary.

    Ids are allocated by hand: the first call takes the highest existing
    Lexeme id (1 is used when there are no lexemes yet) plus one, and every
    later call increments the module-level ``next_id`` counter.

    ``entry`` becomes the lexeme's entry, ``pos`` its part-of-speech id and
    ``comment`` its comment.  The lexeme gets the STATUS_DESCRIBED status
    and is also added to the WSJP vocabulary via ``WSJP.add_lexeme``.
    """
    global next_id
    if next_id is None:
        highest = Lexeme.all_objects.aggregate(Max('id'))['id__max']
        # Max() over an empty table yields None, which cannot be incremented.
        next_id = (highest or 0) + 1
    else:
        next_id += 1
    lexeme = Lexeme(
        id=next_id, entry=entry, part_of_speech_id=pos,
        status=Lexeme.STATUS_DESCRIBED,
        # The *_id attribute expects the raw key value, not a model instance.
        owner_vocabulary_id=WSJP.pk,
        comment=comment)
    lexeme.save()
    WSJP.add_lexeme(lexeme)
    return lexeme
def import_subst(elements, comment):
    """Create a 'subst' lexeme, with its inflection patterns, from one record.

    ``elements`` must hold exactly five fields:
    ``entry, pos, gender, pattern_data, commonness``.

    * ``gender`` is one or more gender symbols separated by '/'.
    * ``pattern_data`` is one or more '/'-separated items; each item is a
      pattern name, optionally preceded by a qualifier label and a space
      (the item is split at its last space).
    * ``commonness`` is the label of the classification value to attach.

    One inflection pattern is created per pattern item (single gender) or
    per gender (single pattern item).  A record that has several genders
    and several pattern items is reported on stderr and skipped.

    Raises ValueError when the number of fields is wrong (after reporting
    the record on stderr) or when no root can be found for a pattern.
    """
    try:
        entry, pos, gender, pattern_data, commonness = elements
    except ValueError:
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'subst'

    genders = [
        Gender.objects.get(symbol=symbol) for symbol in gender.split('/')]
    parsed_items = [item.rsplit(' ', 1) for item in pattern_data.split('/')]
    several_genders = len(genders) > 1
    if several_genders and len(parsed_items) > 1:
        print >>sys.stderr, 'mnogie wzory i rodzaje', elements
        return

    # Pair every pattern item with the gender it is to be saved with.
    if several_genders:
        pairs = [(parsed_items[0], one_gender) for one_gender in genders]
    else:
        pairs = [(item, genders[0]) for item in parsed_items]

    lexeme = new_lexeme(entry, 'subst', comment)
    commonness_value = ClassificationValue.objects.get(
        classification__name=u'pospolitość', label=commonness)
    LexemeCV.objects.create(
        lexeme=lexeme, classification_value=commonness_value)

    for index, (parts, lip_gender) in enumerate(pairs, 1):
        lip = LexemeInflectionPattern(
            lexeme=lexeme, index=index, gender=lip_gender)
        # rsplit(' ', 1) gives [pattern] or [qualifier, pattern].
        pattern_name = parts[-1]
        qualifier_label = parts[0] if len(parts) > 1 else None
        lip.pattern = Pattern.objects.get(name=pattern_name)
        lip.root = lip.get_root()
        if lip.get_root() is None:
            raise ValueError(u"%s: can't find root" % repr(entry))
        lip.save()
        if qualifier_label:
            lip.qualifiers.add(Qualifier.objects.get(label=qualifier_label))
# Both objects are looked up once, when this module is imported.
# NDM is the pattern named 'ndm'; import_adv() gives it to every adverb.
NDM = Pattern.objects.get(name='ndm') # hardcoded pattern
# Cross-reference type with symbol 'advadj'; import_adv() uses it for the
# reference from an adverb to its adjective.
ADVADJ = CrossReferenceType.objects.get(symbol='advadj')
def import_adv(elements, comment):
    """Create an 'adv' lexeme and link it to its adjective, from one record.

    ``elements`` must hold exactly four fields:
    ``entry, pos, pattern_name, adj_entry``; ``pattern_name`` has to be
    'ndm'.  The new lexeme gets a single inflection pattern using NDM.

    If exactly one lexeme has the entry ``adj_entry``, an ADVADJ cross
    reference from the adverb to it is created.  Otherwise nothing is
    linked and a message is written to stderr (a different one for "no
    match" and for "more than one match").

    Raises ValueError when the number of fields is wrong (after reporting
    the record on stderr).
    """
    try:
        entry, pos, pattern_name, adj_entry = elements
    except ValueError:
        print >>sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adv' and pattern_name == 'ndm'

    adverb = new_lexeme(entry, 'adv', comment)
    lip = LexemeInflectionPattern(lexeme=adverb, index=1, pattern=NDM)
    lip.root = lip.get_root()
    lip.save()

    candidates = Lexeme.objects.filter(entry=adj_entry)
    found = len(candidates)
    if found == 1:
        CrossReference.objects.create(
            from_lexeme=adverb, to_lexeme=candidates.get(), type=ADVADJ)
        return
    # Zero or several candidates: report the problem, create no reference.
    if found == 0:
        message = 'Brak przymiotnika: %s (%s)'
    else:
        message = 'Niejednoznaczny przymiotnik: %s (%s)'
    print >>sys.stderr, message % (adj_entry, entry)
# Lexeme attribute values used by import_adj(), looked up once when this
# module is imported.  For each of the two attributes there is one constant
# for the value 'obecna' and a NO_* constant for the value 'nieobecna'
# (Polish for "present" and "absent").
POPRZ = LexemeAttributeValue.objects.get(
value=u'obecna', attribute__name=u'forma poprz.')
NO_POPRZ = LexemeAttributeValue.objects.get(
value=u'nieobecna', attribute__name=u'forma poprz.')
ZLOZ = LexemeAttributeValue.objects.get(
value=u'obecna', attribute__name=u'forma złoż.')
NO_ZLOZ = LexemeAttributeValue.objects.get(
value=u'nieobecna', attribute__name=u'forma złoż.')
def import_adj(elements, comment):
    """Create an 'adj' lexeme with its attribute values, from one record.

    ``elements`` must hold exactly five fields:
    ``entry, pos, pattern_name, zloz, poprz``.  A ``zloz`` / ``poprz``
    field starting with '+' selects ZLOZ / POPRZ; anything else selects
    NO_ZLOZ / NO_POPRZ.  The lexeme gets a single inflection pattern using
    the pattern called ``pattern_name``.

    Raises ValueError when the number of fields is wrong (after reporting
    the record on stderr).
    """
    try:
        entry, pos, pattern_name, zloz, poprz = elements
    except ValueError:
        print >> sys.stderr, 'zla liczba kolumn', elements
        raise
    assert pos == 'adj'

    adjective = new_lexeme(entry, 'adj', comment)

    zloz_value = ZLOZ if zloz[0] == '+' else NO_ZLOZ
    zloz_value.add_lexeme(adjective)
    poprz_value = POPRZ if poprz[0] == '+' else NO_POPRZ
    poprz_value.add_lexeme(adjective)

    lip = LexemeInflectionPattern(
        lexeme=adjective, index=1,
        pattern=Pattern.objects.get(name=pattern_name))
    lip.root = lip.get_root()
    lip.save()