import_expansions.py
7.03 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import os
from xml.sax import handler, make_parser
from django.core.management.base import BaseCommand
from importer.Phrase import phrase_from_tree
from importer.Position import Position
from importer.WalentyXML import XMLNode
from shellvalier.settings import BASE_DIR
from phrase_expansions.models import ExpansionOpinion, PhraseExpansionType, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription
from entries.phrase_descriptions.descriptions import phrase_description2
class Command(BaseCommand):
    """Management command that runs ``import_expansions()``.

    The command takes no arguments of its own; whatever Django passes to
    ``handle`` is ignored.
    """

    args = 'none'
    help = ''

    def handle(self, *args, **options):
        """Entry point invoked by Django; delegates to ``import_expansions``.

        ``*args`` is accepted (and ignored) so the signature matches
        ``BaseCommand.handle(*args, **options)``.
        """
        import_expansions()
# Translates the Polish opinion labels found in the expansions XML into the
# short keys under which ExpansionOpinion rows are stored.
OPINION_MAP = {
    'archaiczna': 'dat',  # "archaic"
    'pewna': 'cer',  # "certain"
    'potoczna': 'col',  # "colloquial"
    'wątpliwa': 'unc',  # "doubtful"
}
def import_expansions():
    """Rebuild the phrase-expansion tables from the expansions XML file.

    Steps:
      1. Parse ``data/walenty/phrase_types_expand_20200926.xml`` (under
         ``BASE_DIR``) with ``ExpansionsTeiHandler``.
      2. Delete every row of ExpansionOpinion, PhraseExpansion,
         ExpansionPosition, ExpansionPhrase and ExpansionPhraseDescription.
      3. Recreate the four ExpansionOpinion rows.
      4. For each parsed ``(phrase_type, phrase_subtype)`` key create a
         PhraseExpansionType and, below it, its expansions, positions,
         phrases and the 'pl' / 'en' descriptions.  Priorities are 1-based
         and follow the order in which items were parsed.

    Steps 2-4 run inside one database transaction, so an error midway
    (e.g. an opinion label missing from ``OPINION_MAP``, which raises
    ``KeyError``) rolls everything back instead of leaving emptied or
    half-filled tables.

    Prints every created expansion type and, at the end, the total number
    of expansions parsed.
    """
    # Imported locally so the module-level import block stays untouched.
    from django.db import transaction

    xml_file = os.path.join(BASE_DIR, 'data', 'walenty', 'phrase_types_expand_20200926.xml')
    # When BASE_DIR is absolute, os.path.join discards the directory prefix
    # and xml_path equals xml_file; the prefix only matters for a relative
    # BASE_DIR.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    tei_handler = ExpansionsTeiHandler()
    parser = make_parser()
    parser.setContentHandler(tei_handler)
    parser.parse(xml_path)
    expansions = tei_handler._expansions

    with transaction.atomic():
        # NOTE(review): PhraseExpansionType rows are created below but are not
        # part of this clean-up -- confirm whether re-running the import is
        # expected to keep the previously created types.
        for cls in (ExpansionOpinion, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription):
            cls.objects.all().delete()

        # Keep the freshly created opinions at hand so the loop below does
        # not have to query the database once per expansion.
        opinions_by_key = {}
        for pri, short in [(50, u'col'), (40, u'dat'), (20, u'unc'), (10, u'cer')]:
            opinions_by_key[short] = ExpansionOpinion.objects.create(key=short, priority=pri)

        for (phrase_type, phrase_subtype), exps in expansions.items():
            # objects.create() already saves the row; no extra save() needed.
            expansion_type = PhraseExpansionType.objects.create(
                phrase_type=phrase_type,
                phrase_subtype=phrase_subtype,
            )
            print(expansion_type)
            for i, (positions, opinion) in enumerate(exps, start=1):
                expansion = PhraseExpansion.objects.create(
                    expansion_type=expansion_type,
                    opinion=opinions_by_key[OPINION_MAP[opinion]],
                    priority=i,
                )
                # Track the largest number of positions among this type's
                # expansions; persisted once after the loop.
                expansion_type.max_positions = max(expansion_type.max_positions, len(positions))
                for j, pos in enumerate(positions, start=1):
                    position = ExpansionPosition.objects.create(expansion=expansion, priority=j)
                    for text_rep, desc_pl, desc_en in pos:
                        phrase = ExpansionPhrase.objects.create(position=position, text_rep=text_rep)
                        ExpansionPhraseDescription.objects.create(phrase=phrase, lang='pl', description=desc_pl)
                        ExpansionPhraseDescription.objects.create(phrase=phrase, lang='en', description=desc_en)
            expansion_type.save()

    print(sum(map(len, expansions.values())))
class ExpansionsTeiHandler(handler.ContentHandler):
    """SAX content handler that collects phrase expansions from ``entry`` elements.

    From the first ``entry`` start tag onwards every element is wrapped in an
    ``XMLNode`` and linked to its parent.  When an ``entry`` element closes,
    the finished subtree is handed to ``get_expansions``.

    Results accumulate in ``self._expansions``, a dict of the form::

        (phrase_type, subtype) -> [(positions, opinion), ...]

    where ``positions`` is a list of positions and each position is a list of
    ``(text_rep, desc_pl, desc_en)`` tuples.
    """

    def __init__(self):
        super().__init__()
        self._subtree = None  # root node of the entry being built
        self._current = None  # innermost element that is still open
        self._constructing = False  # set at the first 'entry'; never cleared here
        self._content = ""  # character data gathered since the last reset
        self._expansions = dict()

    def startElement(self, name, attrs):
        """Open a node for ``name`` once an ``entry`` has been seen."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if self._constructing:
            node = XMLNode(name, attrs, self._current)
            if self._current is not None:
                self._current.addChild(node)
            else:
                # No open parent: this node is the root of a new subtree.
                self._subtree = node
            self._current = node

    def endElement(self, name):
        """Close the current node; process the subtree when an ``entry`` ends."""
        if self._current is not None:
            self._current.setContent(self._content.strip())
            self._current = self._current._parent
            if name == 'entry':
                if self._current is not None:
                    # NOTE(review): TEIStructureError is not imported in the
                    # visible part of this file -- confirm where it is defined.
                    raise TEIStructureError()
                typ = self._subtree._children[0]._attrs['type']
                self.get_expansions(self._subtree, typ)
            self._content = ''

    def characters(self, content):
        """Append a chunk of character data to the pending text buffer."""
        self._content += content

    def get_expansions(self, tree, phrase_type):
        """Extract the expansions of one ``entry`` subtree into ``self._expansions``.

        Args:
            tree: root node of the entry; ``tree._children[0]`` describes the
                phrase type, ``tree._children[1]._children[0]._children`` are
                the individual expansions.
            phrase_type: value of the ``type`` attribute of ``tree._children[0]``.

        Raises:
            ValueError: for a phrase type or realisation kind this importer
                does not know.  Previously this surfaced as an
                UnboundLocalError or, for a realisation kind, could silently
                reuse the positions of the preceding expansion.
            AssertionError: when the (phrase_type, subtype) pair was already
                stored, or an expansion has no positions.
        """
        subtype = self._phrase_subtype(tree, phrase_type)
        key = (phrase_type, subtype)
        assert key not in self._expansions, 'duplicate expansion entry: {!r}'.format(key)

        # Built on first use: only phrases without a hand-written description
        # need it.
        dummy_position = None
        expansions = []
        for exp in tree._children[1]._children[0]._children:
            opinion = exp._children[0]._children[0]._attrs['value']
            realisation = exp._children[1]
            kind = realisation._attrs['name']
            if kind == 'phrases':
                positions = [realisation]
            elif kind == 'positions':
                positions = [pos._children[0] for pos in realisation._children[0]._children]
            else:
                raise ValueError('unsupported expansion realisation: {!r}'.format(kind))

            expansion_positions = []
            for position in positions:
                expansion_position = []
                for phrase in position._children[0]._children:
                    entry = self._special_phrase_entry(phrase)
                    if entry is None:
                        if dummy_position is None:
                            dummy_position = Position(None, None, None, None, None)
                        phr = phrase_from_tree(phrase)
                        desc_pl = phrase_description2(phr, dummy_position, None, 'pl')
                        desc_en = phrase_description2(phr, dummy_position, None, 'en')
                        entry = (str(phr), desc_pl, desc_en)
                    expansion_position.append(entry)
                expansion_positions.append(expansion_position)
            assert expansion_positions, 'expansion without positions in {!r}'.format(key)
            expansions.append((expansion_positions, opinion))
        self._expansions[key] = expansions

    def _phrase_subtype(self, tree, phrase_type):
        """Return the subtype stored under the entry's first child (None for distrp/possp)."""
        head = tree._children[0]
        if phrase_type == 'advp':
            return head._children[0]._children[0]._attrs['value']
        if phrase_type == 'xp':
            return head._children[0]._children[0]._children[0]._children[0]._attrs['value']
        if phrase_type == 'comprepnp':
            return head._children[0]._children[0]._content
        if phrase_type in ('distrp', 'possp'):
            return None
        raise ValueError('unsupported phrase type: {!r}'.format(phrase_type))

    def _special_phrase_entry(self, phrase):
        """Return ``(text_rep, desc_pl, desc_en)`` for adverb/advp/comprepnp phrases, else None."""
        typ = phrase._attrs['type']
        if typ == 'adverb':
            adverb = phrase._children[0]._children[0]._attrs['value']
            return (adverb, 'przysłówek <i>{}</i>'.format(adverb), '<i>{}</i> adverb'.format(adverb))
        if typ == 'advp':
            # xp realised by advp(cat)
            advpcat = phrase._children[-1]._children[0]._attrs['value']
            return ('advp({})'.format(advpcat), '???', '???')
        if typ == 'comprepnp':
            prep = phrase._children[1]._children[0]._content
            return (
                'comprepnp({})'.format(prep),
                'fraza rzeczownikowo-przyimkowa z przyimkiem złożonym <i>{}</i>'.format(prep),
                'nominal-prepositional phrase with <i>{}</i> complex preposition'.format(prep),
            )
        return None