utils.py
7.87 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
from itertools import product
import re
from importer.Phrase import *
from .morph_generation import MorphologyError, select_form
# Position of a dependent relative to its head: before (PRE) or after (POST).
PRE, POST = 0, 1


def _default_order(head_type, dep_type):
    """Return PRE or POST for a head/dependent type pair, or None if no rule fits.

    Rules are scanned in order.  Within a rule the "after" test runs last,
    so it wins if a dependent type were ever listed on both sides.
    """
    # Each row: (head type, dependents placed before, dependents placed after).
    # ``None`` in the last column means "every dependent goes after the head".
    rules = (
        (
            NP,
            (AdjP, LexAdjP, LexPPasP, LexPActP, PossP, LexQub, Fixed,),
            # LexAdvP follows the head, e.g. "nic więcej"
            (NP, LexNP, PrepNP, ComPrepNP, LexPrepNP, LexPrepGerP, CP, LexCP,
             NCP, XP, LexXP, LexAdvP, OR,),
        ),
        (
            NumP,
            (AdjP, LexAdjP, PossP),
            # XP, e.g. "w pół drogi ‹dokądś›"
            # NP, e.g. "na dwóch biegunach ‹kogoś/czegoś›"
            (NP, XP,),
        ),
        (
            AdjP,
            (AdvP, LexAdvP, AdjP, LexAdjP, LexQub,),
            # NP, e.g. "pełny czegoś"
            # Fixed, e.g. "samo przez się"
            (NP, LexNP, PrepNP, LexPrepNP, XP, LexXP, Compar, LexCompar, Fixed),
        ),
        (
            AdvP,
            (XP, AdvP,),
            # LexNP, e.g. "dalej własnego nosa"
            # LexPrepNP, e.g. "prosto w oczy"
            # LexCP, e.g. "tak, że..."
            (LexCompar, NP, LexNP, PrepNP, LexPrepNP, LexCP,),
        ),
        (InfP, (), None),
        (Qub, (LexQub,), ()),
    )
    placement = None
    for rule_head, before, after in rules:
        if not head_type == rule_head:
            continue
        if dep_type in before:
            placement = PRE
        if after is None or dep_type in after:
            placement = POST
    return placement


def build_phrase(head, dep, head_type, dep_type, order_override=None):
    """Join a head and its dependent into one space-separated phrase.

    Args:
        head: text of the head.
        dep: text of the dependent.
        head_type: phrase type of the head; selects the ordering rule.
        dep_type: phrase type of the dependent.
        order_override: when not None, bypasses the type-based rules;
            'pre' puts the dependent first, any other value puts it last.

    Returns:
        '<dep> <head>' for PRE ordering, '<head> <dep>' for POST ordering.

    Raises:
        RuntimeError: if no ordering rule matches the given type pair.
    """
    if order_override is None:
        order = _default_order(head_type, dep_type)
    elif order_override == 'pre':
        order = PRE
    else:
        order = POST

    if order == PRE:
        return '{} {}'.format(dep, head)
    if order == POST:
        return '{} {}'.format(head, dep)
    raise RuntimeError('couldn’t build phrase: {} {} {} {}'.format(head, dep, head_type, dep_type))
def correct_lemma(lemma):
    """Return the lemma without surrounding apostrophes.

    The plural lemma 'bliźnięta' is additionally replaced by 'bliźnię'.
    """
    # TODO see notes
    stripped = lemma.strip("'")
    return 'bliźnię' if stripped == 'bliźnięta' else stripped
# Numeral lemmas written as digits, mapped to their spelled-out lemma.
NUM_LEMMA = {
    '2': 'dwa',
    '3': 'trzy',
    '5': 'pięć',
}


def correct_num_lemma(lemma):
    """Return the spelled-out lemma for a digit lemma, or the lemma unchanged."""
    try:
        return NUM_LEMMA[lemma]
    except KeyError:
        return lemma
def correct_pos(lemma, pos):
    """Return the part-of-speech tag to use for ``lemma``.

    A handful of lemmas have a fixed tag that replaces whatever ``pos``
    was passed in; for every other lemma ``pos`` is returned unchanged.
    """
    overrides = (
        (('siebie',), 'siebie'),
        (('ja', 'ty', 'my', 'wy'), 'ppron12'),
        (('on',), 'ppron3'),
        (('oba',), 'num'),
        (('jeden',), 'adj'),
    )
    for lemmas, fixed_pos in overrides:
        if lemma in lemmas:
            return fixed_pos
    return pos
def correct_num(lemma, num):
    """Return the grammatical number value(s) to use for ``lemma``.

    Returns a single string, or the list ['sg', 'pl'] when the number is
    unspecified ('_') and the lemma does not fix it.
    """
    if lemma == 'siebie':
        # 'siebie' carries no number at all
        return ''
    if num == '_' and lemma in ('ja', 'ty'):
        return 'sg'
    if lemma in ('oba', 'plecy', 'usta',):
        return 'pl'
    # TODO (?) -- 'agr' is resolved to singular
    if lemma in ('pół', 'półtora') or num == 'agr':
        return 'sg'
    # TODO _ -> sg or _ -> sg and pl?
    if num == '_':
        return ['sg', 'pl']
    return num
def correct_gend(gend):
    """Return the gender value, with 'agr' replaced by 'm1'."""
    return 'm1' if gend == 'agr' else gend
def _polarity_cases(affirmative, negated):
    """Build a negativity -> case mapping; unspecified ('_') follows 'aff'."""
    return {'_': affirmative, 'aff': affirmative, 'neg': negated}


# TODO is the mapping for no function correct?
# TODO the mapping should be more complex, e.g. most lex(np)s should be in acc (dać kosza etc.),
# but adjps seem to need nom: chrzest bojowy
# Structural case ('str'): syntactic function -> negativity -> case.
STR_CASE = {
    'subj': _polarity_cases('nom', 'nom'),
    'obj': _polarity_cases('acc', 'gen'),
    None: _polarity_cases('acc', 'gen'),
}

# Agreeing case ('agr'): syntactic function -> case.
AGR_CASE = {
    'subj': 'nom',
    'obj': 'acc',
    'head': 'nom',
    None: 'nom',
}

# Predicative case ('pred'): syntactic function -> negativity -> case.
PRED_CASE = {
    'subj': _polarity_cases('nom', 'nom'),
    'obj': _polarity_cases('acc', 'gen'),
    None: _polarity_cases('inst', 'inst'),
}


def correct_case(case, function, negativity='_'):
    """Resolve an abstract case value to a concrete one.

    Args:
        case: the case value; 'str', 'agr', 'part', 'pred' and 'postp'
            are resolved, anything else is returned unchanged.
        function: syntactic function used as key into the case tables
            ('subj', 'obj', 'head' or None).
        negativity: '_', 'aff' or 'neg'; only used for 'str' and 'pred'.

    Raises:
        KeyError: if ``function`` or ``negativity`` is missing from the
            table selected by ``case``.
    """
    if case in ('str', 'pred'):
        table = STR_CASE if case == 'str' else PRED_CASE
        return table[function][negativity]
    if case == 'agr':
        return AGR_CASE[function]
    # TODO both gen and acc?
    if case == 'part':
        return 'gen'
    if case == 'postp':
        return 'dat'
    return case
def correct_deg(deg):
    """Return the degree value(s) to try for ``deg``.

    'pos' and '_' expand to lists of alternatives (the empty string
    stands for "no degree at all"); other values are returned unchanged.
    """
    if deg == '_':
        return ['pos', 'com', 'sup', '']
    # positive degree = positive or no degree at all
    return [deg, ''] if deg == 'pos' else deg
def correct_congr(lemma):
    """Return the congr/rec value(s) to try for a numeral lemma.

    'pół' and 'półtora' are always 'rec'.  Any other lemma gets the list
    of alternatives ['congr', 'rec', ''].
    """
    # heuristic: if both congr and rec forms available, prefer congr
    # no congr/rec also possible
    return 'rec' if lemma in ('pół', 'półtora') else ['congr', 'rec', '']
def correct_aff(aff):
    """Expand an unspecified ('_') value to ['aff', 'neg']; pass others through."""
    return ['aff', 'neg'] if aff == '_' else aff
# Negation prefix (including its trailing space) for each negativity value.
NEG = {
    '_': '(nie) ',
    'aff': '',
    'neg': 'nie ',
}


def correct_neg(neg):
    """Return the negation prefix for ``neg``; raises KeyError if unknown."""
    prefix = NEG[neg]
    return prefix
# Text of the 'się' marker (including its trailing space) for each input value.
SIE = {
    '': '',
    'się': 'się ',
}


def correct_sie(sie):
    """Return the 'się' text for ``sie``; raises KeyError if unknown."""
    marker = SIE[sie]
    return marker
def correct_feats(lemma, feats, praep=False):
    """Return ``feats`` extended with the extra features some lemmas need.

    Args:
        lemma: the lemma the features belong to.
        feats: list of features; items may be strings or nested lists of
            alternatives.
        praep: for lemma 'on' only, selects 'praep' instead of 'npraep'.

    Returns:
        A new list for the special lemmas ('on', 'ja', 'ty', 'my', 'wy',
        'oba'); the original ``feats`` object for every other lemma.
    """
    if lemma == 'on':
        return feats + ['m1', 'nakc', 'praep' if praep else 'npraep']
    if lemma in ('ja', 'ty',):
        # Unaccented forms (mi, ci, cię): any dative, or acc/gen of 'ty'.
        # Plain membership tests are used instead of set.intersection so that
        # unhashable items (nested alternative lists) in ``feats`` don't raise.
        unaccented = 'dat' in feats or (
            lemma == 'ty' and ('acc' in feats or 'gen' in feats)
        )
        akc = 'nakc' if unaccented else 'akc'
        return feats + ['m1', [akc, '']]
    if lemma in ('my', 'wy'):
        return feats + ['m1']
    if lemma == 'oba':
        return feats + ['congr', 'ncol']
    return feats
def get_subst_attrs(lemma, tag):
    """Extract attributes from a colon-separated tag.

    For lemma 'siebie' only the case (second field) is returned; otherwise
    number, case and gender are read from the second to fourth fields.

    Raises:
        IndexError: if the tag has fewer fields than are read.
    """
    fields = tag.split(':')
    if lemma == 'siebie':
        return {'case': fields[1]}
    number, case, gender = fields[1], fields[2], fields[3]
    return {'num': number, 'case': case, 'gend': gender}
def get_gender(lemma):
    """Return the gender features of ``lemma`` as a two-item list.

    The gender is read from the tag of the form that ``get_form`` selects
    for subst/sg/nom: everything from the fourth colon-separated field on.
    The second item of the result is always a list of alternatives ending
    with '' (feature absent).

    Raises:
        IndexError: if the tag has no gender field.
    """
    tag = get_form(lemma, ['subst', 'sg', 'nom'])[1]
    # 1 or 2 values: ['f'], ['n', 'ncol'], ...
    gend = tag.split(':')[3:]
    if len(gend) != 2:
        # choose ncol for e.g. czterech/czworo m1
        return [gend[0], ['ncol', '']]
    # no col/ncol variant for jeden, wiele etc.
    gend[1] = [gend[1], '']
    return gend
def get_form(lemma, feats):
    """Select a single form of ``lemma`` matching ``feats``.

    Lemmas starting with 'E(' are not looked up: they yield an empty form
    with the tag 'subst:pl:nom:<x>', where <x> is the lemma with any
    leading/trailing 'E', '(' and ')' characters stripped.

    Otherwise every callable item of ``feats`` is replaced by its value for
    ``lemma`` and the result is passed to ``select_form``.
    """
    if lemma.startswith('E('):
        inner = lemma.strip('E()')
        return ('', 'subst:pl:nom:{}'.format(inner))

    def resolve(feat):
        return feat(lemma) if hasattr(feat, '__call__') else feat

    return select_form(lemma, [resolve(feat) for feat in feats])
def get_forms(lemma, feats):
    """Select every form of ``lemma`` reachable from the alternatives in ``feats``.

    Args:
        lemma: the lemma to inflect.
        feats: list of features.  Each item is a string, an iterable of
            alternative strings, or a callable taking the lemma and
            returning one of those.

    Returns:
        A list with the ``select_form`` result for each combination of
        alternatives that could be selected, in ``itertools.product`` order.

    Raises:
        MorphologyError: if no combination yields a form; the message
            joins the individual error messages with ' + '.
    """
    resolved = [f(lemma) if callable(f) else f for f in feats]
    # A bare string is a single alternative, not an iterable of characters.
    alternatives = [[f] if isinstance(f, str) else f for f in resolved]
    forms = []
    errors = []
    for combination in product(*alternatives):
        try:
            forms.append(select_form(lemma, combination))
        except MorphologyError as e:
            errors.append(str(e))
    if forms:
        return forms
    raise MorphologyError('couldn’t select form: {}'.format(' + '.join(errors)))
# For each preposition that has a variant ending in '-e': regex patterns
# (matched against the lower-cased following text) that trigger that variant.
WOK_PREP = {
    'bez' : ('^mn',), # beze mnie
    'nad' : (
        '^mn',
        '^wszystko' # nade wszystko, but: nad wszystkim
    ),
    'od' : ('^mn',),
    'pod' : ('^mn',),
    'przed' : (
        '^mn',
        '^wszystkim$' # przede wszystkim, but: przed wszystkimi
    ),
    'przez' : ('^mn',),
    'w' : (
        '^dwoje', # but: w dwojaki
        '^dwój',
        '^fr',
        '^mgl',
        '^mnie$', # but: w mniejszych
        '^wc',
        '^wn',
        '^wp',
        '^wr',
        '^ws',
        '^wt',
        '^wz',
        '^wł',
        '^znaki$',
        '^śnie',
    ),
    'z' : (
        '^mnie$',
        '^mną$',
        '^sobą$',
        '^sc',
        '^sf',
        '^sk',
        '^sm',
        '^sn',
        '^sp',
        '^st',
        '^sw',
        '^szc',
        '^szk',
        '^szp',
        '^szt',
        '^szw',
        '^sł',
        '^wsc',
        '^wsi',
        '^wsk',
        '^wsp',
        '^wst',
        '^wszec',
        '^wszystk',
        '^wz',
        '^zb',
        '^zd',
        '^zg',
        '^zj',
        '^zm',
        '^zn',
        '^zr',
        '^zw',
        '^zł',
        '^łz',
        '^ś',
        '^ź',
    ),
}


def combine_with_prep(prep, rest):
    """Prefix ``rest`` with the preposition ``prep``, separated by a space.

    If ``prep`` is listed in ``WOK_PREP`` and the lower-cased ``rest``
    matches one of its patterns, 'e' is appended to the preposition
    (e.g. 'z' + 'mną' -> 'ze mną').

    Args:
        prep: the preposition, compared case-sensitively against the table.
        rest: the text following the preposition.

    Returns:
        The combined string.
    """
    patterns = WOK_PREP.get(prep)
    if patterns:
        # Lower-case once instead of once per pattern.
        lowered = rest.lower()
        if any(re.match(pattern, lowered) for pattern in patterns):
            return '{}e {}'.format(prep, rest)
    return '{} {}'.format(prep, rest)