import_tei.py
11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#! /usr/bin/python
# -*- coding: utf-8 -*-
from django.core.management.base import BaseCommand
import sys, os, shutil, codecs, copy, errno
from xml.sax import saxutils, handler, make_parser
from importer.WalentyXML import WalentyTeiHandler
from importer.WalentyPreprocessXML import WalentyPreprocessTeiHandler
from shellvalier.settings import BASE_DIR
from connections.models import POS, Status
from examples.models import ExampleOpinion, ExampleSource
from syntax.models import SchemaOpinion, Aspect, InherentSie, Negativity, Predicativity, SyntacticFunction, Control, PredicativeControl, Position
from syntax.models_phrase import (
Case, PhraseAspect, AdverbialCategory, PhraseNegativity, PhraseInherentSie,
Number, Gender, Degree,
LemmaOperator, LemmaCooccur,
ModificationType,
)
from semantics.models import FrameOpinion, ArgumentRole, SemanticRole, RoleAttribute, PredefinedSelectionalPreference, SelectionalPreferenceRelation
class Command(BaseCommand):
    """Management command that runs the full Walenty TEI import.

    The command takes no arguments of its own; it simply delegates to
    ``import_tei()``.
    """

    # Usage string for positional arguments; this command expects none.
    args = 'none'
    help = 'Import dictionary constants and the Walenty TEI XML dump into the database.'

    def handle(self, *args, **options):
        """Run the import.

        Accepts ``*args`` as well as ``**options`` so the override matches
        ``BaseCommand.handle(*args, **options)``; both are ignored.
        """
        import_tei()
def import_tei():
    """Fill the constant tables, then load the Walenty TEI dump in two SAX passes.

    The first pass runs ``WalentyPreprocessTeiHandler`` and collects its
    ``entry_meanings``, ``meanings`` and ``frames`` attributes.  The second
    pass parses the same file again with a ``WalentyTeiHandler`` built from
    those three collections.
    """
    # Other dump locations that have been used here (kept for reference):
    #   os.path.join(BASE_DIR, 'data', 'walenty', 'walenty_20200801.xml')
    #   os.path.join(BASE_DIR, 'data', 'walenty', 'walenty_20200801_smaller.xml')
    tei_location = os.path.join(BASE_DIR, 'data', 'tei', 'walenty_tei.xml')
    # If tei_location is absolute, os.path.join drops the script directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    tei_path = os.path.join(script_dir, tei_location)

    import_constants()

    sax_parser = make_parser()
    # Do not include external general entities while parsing.
    sax_parser.setFeature(handler.feature_external_ges, False)

    # Pass 1: gather meanings and frames.
    preprocessor = WalentyPreprocessTeiHandler()
    sax_parser.setContentHandler(preprocessor)
    sax_parser.parse(tei_path)

    # Pass 2: the real import, using what pass 1 collected.
    main_handler = WalentyTeiHandler(
        preprocessor.entry_meanings,
        preprocessor.meanings,
        preprocessor.frames,
    )
    sax_parser.setContentHandler(main_handler)
    sax_parser.parse(tei_path)
def import_constants():
    """Run every constant-table importer once, in a fixed order."""
    steps = (
        import_poses,
        import_statuses,
        import_schema_opinions,
        import_frame_opinions,
        import_aspects,
        import_inherent_sies,
        import_negativities,
        import_predicativities,
        import_syntactic_functions,
        import_control_tags,
        import_semantic_roles,
        import_predefined_preferences,
        import_preference_relations,
        import_examples_sources,
        import_examples_opinions,
        import_phrase_attributes,
        import_lemma_operators,
        import_modification_types,
    )
    for step in steps:
        step()
def import_poses():
    """Save one ``POS`` row for each part-of-speech tag."""
    for tag in (u'unk', u'adj', u'noun', u'adv', u'verb'):
        POS(tag=tag).save()
def import_statuses():
    """Save one ``Status`` row per (priority, key) pair."""
    status_table = (
        (10, u'do obróbki'),
        (20, u'w obróbce'),
        (25, u'do usunięcia'),
        (30, u'gotowe'),
        (35, u'zalążkowe'),
        (40, u'sprawdzone'),
        (50, u'(F) w obróbce'),
        (60, u'(F) gotowe'),
        (70, u'(F) sprawdzone'),
        (80, u'(S) w obróbce'),
        (90, u'(S) gotowe'),
        (100, u'(S) sprawdzone'),
    )
    for priority, key in status_table:
        Status(key=key, priority=priority).save()
def import_schema_opinions():
    """Save one ``SchemaOpinion`` row per (priority, key) pair."""
    opinion_table = (
        (60, u'vul'),
        (50, u'col'),
        (40, u'dat'),
        (30, u'bad'),
        (20, u'unc'),
        (10, u'cer'),
    )
    for priority, key in opinion_table:
        SchemaOpinion(key=key, priority=priority).save()
def import_frame_opinions():
    """Save one ``FrameOpinion`` row per (priority, key) pair."""
    opinion_table = (
        (70, u'met'),
        (60, u'vul'),
        (50, u'col'),
        (40, u'dat'),
        (30, u'bad'),
        (20, u'unc'),
        (10, u'cer'),
        (80, u'dom'),
        (90, u'rar'),
        (100, u'unk'),
    )
    for priority, key in opinion_table:
        FrameOpinion(key=key, priority=priority).save()
def import_aspects():
    """Save one ``Aspect`` row per (priority, name) pair."""
    aspect_table = (
        (10, u'imperf'),
        (20, u'perf'),
        (32, u'_'),
        (42, u''),
    )
    for priority, label in aspect_table:
        Aspect(name=label, priority=priority).save()
def import_inherent_sies():
    """Save one ``InherentSie`` row per (priority, name) pair."""
    sie_table = (
        (10, u'false'),
        (20, u'true'),
    )
    for priority, label in sie_table:
        InherentSie(name=label, priority=priority).save()
def import_negativities():
    """Save one ``Negativity`` row per (priority, name) pair."""
    negativity_table = (
        (20, u'aff'),
        (10, u'neg'),
        (31, u'_'),
        (41, u''),
    )
    for priority, label in negativity_table:
        Negativity(name=label, priority=priority).save()
def import_predicativities():
    """Save one ``Predicativity`` row per (priority, name) pair."""
    predicativity_table = (
        (20, u'false'),
        (10, u'true'),
    )
    for priority, label in predicativity_table:
        Predicativity(name=label, priority=priority).save()
def import_syntactic_functions():
    """Save one ``SyntacticFunction`` row per (priority, name) pair."""
    function_table = (
        (0, u'subj'),
        (20, u'head'),
        (10, u'obj'),
    )
    for priority, label in function_table:
        SyntacticFunction(name=label, priority=priority).save()
def import_control_tags():
    """Save the ``Control`` tags, then the ``PredicativeControl`` tags."""
    control_table = (
        (10, u'controller'),
        (20, u'controllee'),
        (30, u'controller2'),
        (40, u'controllee2'),
    )
    for priority, label in control_table:
        Control(name=label, priority=priority).save()

    pred_control_table = (
        (10, u'pred_controller'),
        (20, 'pred_controllee'),
    )
    for priority, label in pred_control_table:
        PredicativeControl(name=label, priority=priority).save()
def import_semantic_roles():
    """Save semantic roles, role attributes and their ``ArgumentRole`` pairings.

    After the ``SemanticRole`` and ``RoleAttribute`` rows are stored, each
    role gets one ``ArgumentRole`` with no attribute, followed by one
    ``ArgumentRole`` for every stored ``RoleAttribute``.
    """
    # (priority, role name, colour string -- presumably 'R,G,B')
    role_specs = (
        (10, u'Initiator', u'91,106,217'),
        (20, u'Stimulus', u'62,173,226'),
        (30, u'Condition', u'127,199,195'),
        (40, u'Factor', u'82,150,87'),
        (50, u'Experiencer', u'149,195,86'),
        (60, u'Theme', u'90,179,69'),
        (70, u'Recipient', u'203,77,141'),
        (80, u'Result', u'231,155,159'),
        (90, u'Instrument', u'199,221,60'),
        (100, u'Manner', u'191,48,44'),
        (110, u'Purpose', u'171,85,186'),
        (120, u'Attribute', u'220,53,47'),
        (130, u'Location', u'187,129,45'),
        (140, u'Path', u'224,121,44'),
        (150, u'Time', u'242,236,54'),
        (160, u'Duration', u'233,192,6'),
        (170, u'Measure', u'238,72,154'),
        # NOTE(review): 256 is outside the usual 0-255 channel range -- confirm
        # whether consumers of this value clamp it or '255,255,255' was meant.
        (180, u'Lemma', u'256,256,256'),
    )
    # priorities set so that, when role and attribute priorities are added,
    # Role_Source < Role_Foreground < Role_Background < Role_Goal
    # and Role can be inserted anywhere into that hierarchy
    # (priority, attribute name, gradient)
    attribute_specs = (
        (1, u'Source', u'left'),
        (3, u'Foreground', u'top'),
        (5, u'Background', u'bottom'),
        (7, u'Goal', u'right'),
    )

    for priority, role_name, colour in role_specs:
        SemanticRole(role=role_name, color=colour, priority=priority).save()

    for priority, attribute_name, gradient in attribute_specs:
        RoleAttribute(
            attribute=attribute_name, gradient=gradient, priority=priority
        ).save()

    for semantic_role in SemanticRole.objects.all():
        # The bare role first, then the role paired with each attribute.
        ArgumentRole(role=semantic_role, attribute=None).save()
        for role_attribute in RoleAttribute.objects.all():
            ArgumentRole(role=semantic_role, attribute=role_attribute).save()
def import_predefined_preferences():
    """Save one ``PredefinedSelectionalPreference`` row per key."""
    preference_keys = (
        u'ALL',
        u'LUDZIE',
        u'ISTOTY',
        u'PODMIOTY',
        u'KOMUNIKAT',
        u'KONCEPCJA',
        u'WYTWÓR',
        u'JADŁO',
        u'CZAS',
        u'OBIEKTY',
        u'CECHA',
        u'CZYNNOŚĆ',
        u'KIEDY',
        u'CZEMU',
        u'ILOŚĆ',
        u'POŁOŻENIE',
        u'DOBRA',
        u'MIEJSCE',
        u'SYTUACJA',
        u'OTOCZENIE',
    )
    for key in preference_keys:
        PredefinedSelectionalPreference(key=key).save()
def import_preference_relations():
    """Save one ``SelectionalPreferenceRelation`` row per (plwn_id, key) pair."""
    relation_table = (
        (14, u'meronimia'),
        (15, u'holonimia'),
        (20, u'meronimia (typu część)'),
        (21, u'meronimia (typu porcja)'),
        (22, u'meronimia (typu miejsce)'),
        (23, u'meronimia (typu element)'),
        (24, u'meronimia (typu materiał)'),
        (25, u'holonimia (typu część)'),
        (26, u'holonimia (typu porcja)'),
        (27, u'holonimia (typu miejsce)'),
        (28, u'holonimia (typu element)'),
        (29, u'holonimia (typu materiał)'),
        (51, u'nosiciel stanu/cechy'),
        (52, u'stan/cecha'),
        (61, u'synonimia międzyparadygmatyczna'),
        (64, u'meronimia (typu element taksonomiczny)'),
        (65, u'holonimia (typu element taksonomiczny)'),
        (108, u'fuzzynimia synsetów'),
        (-1, u'RELAT'),
    )
    for plwn_id, key in relation_table:
        SelectionalPreferenceRelation(plwn_id=plwn_id, key=key).save()
def import_examples_sources():
    """Save one ``ExampleSource`` row per (priority, key) pair."""
    source_table = (
        (0, u'NKJP0.5M'),
        (1, u'NKJP1.2M'),
        (2, u'NKJP30M'),
        (3, u'NKJP250M'),
        (4, u'NKJP300M'),
        (5, u'NKJP500M'),
        (6, u'NKJP1800M'),
        (7, u'linguistic_literature'),
        (8, u'other_literature'),
        (9, u'own'),
    )
    for priority, key in source_table:
        ExampleSource(key=key, priority=priority).save()
def import_examples_opinions():
    """Save one ``ExampleOpinion`` row per (priority, key) pair."""
    opinion_table = (
        (0, 'zły'),
        (1, 'wątpliwy'),
        (2, 'dobry'),
    )
    for priority, key in opinion_table:
        ExampleOpinion(key=key, priority=priority).save()
def import_phrase_attributes():
    """Import all phrase-level attribute tables and create the dummy position.

    Runs the individual phrase attribute importers in order, then saves an
    empty ``Position`` and verifies that it received id 1.

    Raises:
        AssertionError: if the saved dummy position's id is not 1.
    """
    import_cases()
    import_phrase_aspects()
    import_phrase_negativities()
    import_phrase_inherent_sies()
    import_adverbial_categories()
    import_numbers()
    import_genders()
    import_degrees()
    # TODO this is quite terrible... create a dummy position for storing phrases inside a lex
    # the store() method for phrases requires a position
    dummy_position = Position()
    dummy_position.save()
    # Raised explicitly rather than via `assert` so the check also runs when
    # Python is started with -O; presumably other code looks the dummy
    # position up by id 1 -- verify against the phrase store() callers.
    if dummy_position.id != 1:
        raise AssertionError(
            'dummy Position must have id 1, got %r' % (dummy_position.id,)
        )
def import_cases():
    """Save one ``Case`` row per (priority, name) pair."""
    case_table = (
        (0, u'str'),
        (1, u'nom'),
        (2, u'gen'),
        (3, u'dat'),
        (4, u'acc'),
        (5, u'inst'),
        (6, u'loc'),
        (10, u'pred'),
        (11, u'part'),
        (12, u'postp'),
        (13, u'agr'),
    )
    for priority, label in case_table:
        Case(name=label, priority=priority).save()
def import_phrase_aspects():
    """Save one ``PhraseAspect`` row per (priority, name) pair."""
    aspect_table = (
        (10, u'imperf'),
        (20, u'perf'),
        (30, u'_'),
    )
    for priority, label in aspect_table:
        PhraseAspect(name=label, priority=priority).save()
def import_phrase_negativities():
    """Save one ``PhraseNegativity`` row per (priority, name) pair."""
    negativity_table = (
        (10, u'aff'),
        (20, u'neg'),
        (30, u'_'),
    )
    for priority, label in negativity_table:
        PhraseNegativity(name=label, priority=priority).save()
def import_phrase_inherent_sies():
    """Save one ``PhraseInherentSie`` row per (priority, name) pair."""
    sie_table = (
        (10, u'się'),
        (20, u''),
    )
    for priority, label in sie_table:
        PhraseInherentSie(name=label, priority=priority).save()
def import_adverbial_categories():
    """Save one ``AdverbialCategory`` row per (priority, name) pair."""
    category_table = (
        (1, u'locat'),
        (2, u'abl'),
        (3, u'adl'),
        (4, u'perl'),
        (5, u'temp'),
        (6, u'dur'),
        (7, 'mod'),
        (8, 'caus'),
        (9, 'dest'),
        (10, 'instr'),
        (11, 'pron'),
        (12, 'misc'),
    )
    for priority, label in category_table:
        AdverbialCategory(name=label, priority=priority).save()
def import_numbers():
    """Save one ``Number`` row per (priority, name) pair."""
    number_table = (
        (1, u'sg'),
        (2, u'pl'),
        (10, u'agr'),
        (20, u'_'),
    )
    for priority, label in number_table:
        Number(name=label, priority=priority).save()
def import_genders():
    """Save one ``Gender`` row per (priority, name) pair."""
    gender_table = (
        (1, u'm1'),
        (2, u'm2'),
        (3, u'm3'),
        (4, u'f'),
        (5, u'n'),
        (10, u'agr'),
    )
    for priority, label in gender_table:
        Gender(name=label, priority=priority).save()
def import_degrees():
    """Save one ``Degree`` row per (priority, name) pair."""
    degree_table = (
        (1, u'pos'),
        (2, u'com'),
        (3, u'sup'),
        (20, u'_'),
    )
    for priority, label in degree_table:
        Degree(name=label, priority=priority).save()
def import_lemma_operators():
    """Save the ``LemmaOperator`` rows, then the ``LemmaCooccur`` rows."""
    operator_table = (
        (1, u'xor'),
        (2, u'or'),
    )
    for priority, label in operator_table:
        LemmaOperator(name=label, priority=priority).save()

    cooccur_table = (
        (1, u'concat'),
        (2, u'coord'),
    )
    for priority, label in cooccur_table:
        LemmaCooccur(name=label, priority=priority).save()
def import_modification_types():
    """Save one ``ModificationType`` row per (priority, name) pair."""
    modification_table = (
        (1, u'ratr'),
        (2, u'ratr1'),
        (3, u'atr'),
        (4, u'atr1'),
        (5, u'natr'),
    )
    for priority, label in modification_table:
        ModificationType(name=label, priority=priority).save()