utils.py
5.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
from itertools import product
from importer.Phrase import *
from .morph_generation import select_form
# Word-order markers used by build_phrase: with PRE the dependent is placed
# before the head, with POST it is placed after the head.
PRE = 0
POST = 1
def build_phrase(head, dep, head_type, dep_type):
    """Join a head and one dependent into a two-word phrase string.

    The word order is picked from the phrase types of the head and of the
    dependent: the dependent goes either before the head (PRE) or after
    it (POST).

    Args:
        head: Text of the head, inserted with ``str.format``.
        dep: Text of the dependent, inserted with ``str.format``.
        head_type: Phrase type of the head; compared against the phrase
            type names (NP, NumP, AdjP, ...), presumably provided by the
            star import from ``importer.Phrase`` -- TODO confirm.
        dep_type: Phrase type of the dependent, compared the same way.

    Returns:
        ``'<dep> <head>'`` for PRE order, ``'<head> <dep>'`` for POST order.

    Raises:
        ZeroDivisionError: If no order is defined for the type pair. The
            exception type is the one the former ``1/0`` statement raised,
            kept so existing callers see the same error.
    """
    order = None
    if head_type == NP:
        if dep_type in (AdjP, LexAdjP, LexPPasP, LexPActP, PossP, LexQub, Fixed,):
            order = PRE
        # LexAdvP follows the noun, e.g. 'nic więcej' ("nothing more")
        if dep_type in (NP, LexNP, PrepNP, ComPrepNP, LexPrepNP, LexPrepGerP, CP, LexCP, XP, LexXP, LexAdvP,):
            order = POST
    if head_type == NumP:
        if dep_type in (AdjP, LexAdjP, PossP):
            order = PRE
        # XP: 'w pół drogi ‹dokądś›' ("halfway ‹to somewhere›")
        # NP: 'na dwóch biegunach ‹kogoś/czegoś›'
        #     ("at the two poles ‹of someone/something›")
        if dep_type in (NP, XP,):
            order = POST
    if head_type == AdjP:
        if dep_type in (AdvP, LexAdvP, AdjP, LexAdjP, LexQub,):
            order = PRE
        # NP: 'pełny czegoś' ("full of something")
        # Fixed: 'samo przez się' ("by itself")
        if dep_type in (NP, LexNP, PrepNP, LexPrepNP, XP, LexXP, Compar, LexCompar, Fixed):
            order = POST
    if head_type == AdvP:
        if dep_type in (XP, AdvP,):
            order = PRE
        # LexNP: 'dalej własnego nosa' ("beyond one's own nose")
        # LexPrepNP: 'prosto w oczy' ("straight in the eyes")
        # LexCP: 'tak, że...' ("so that...")
        if dep_type in (LexCompar, NP, LexNP, LexPrepNP, LexCP,):
            order = POST
    if head_type == InfP:
        # Every dependent of an infinitival head follows it.
        order = POST
    if head_type == Qub:
        if dep_type in (LexQub,):
            # Fix: this branch used to `return PRE`, handing the caller the
            # integer 0 instead of a phrase; it must only select the order.
            order = PRE
    if order == PRE:
        return '{} {}'.format(dep, head)
    if order == POST:
        return '{} {}'.format(head, dep)
    # Unsupported type pair: keep the diagnostic print of the arguments and
    # fail with the same exception type as before, now with a message.
    print(head, dep, head_type, dep_type)
    raise ZeroDivisionError(
        'no word order defined for head type {!r} with dependent type {!r}'
        .format(head_type, dep_type)
    )
def correct_lemma(lemma):
    """Return the lemma without surrounding apostrophes.

    Leading and trailing ``'`` characters are stripped. The stripped lemma
    'bliźnięta' is additionally replaced by 'bliźnię'.
    """
    # TODO see notes
    stripped = lemma.strip("'")
    return 'bliźnię' if stripped == 'bliźnięta' else stripped
# Numerals written with a digit, mapped to their word lemmas.
NUM_LEMMA = {
    '2': 'dwa',
    '3': 'trzy',
    '5': 'pięć',
}


def correct_num_lemma(lemma):
    """Return the word lemma for a numeral lemma written with a digit.

    Lemmas that have no entry in NUM_LEMMA are returned unchanged.
    """
    if lemma in NUM_LEMMA:
        return NUM_LEMMA[lemma]
    return lemma
def correct_pos(lemma, pos):
    """Return the part-of-speech tag to use for the given lemma.

    A fixed tag is returned for a handful of lemmas ('siebie', the personal
    pronouns, 'oba', 'jeden'); for every other lemma ``pos`` is returned
    unchanged.
    """
    # (lemmas, tag) pairs; the lemma groups do not overlap.
    overrides = (
        (('siebie',), 'siebie'),
        (('ja', 'ty', 'my', 'wy'), 'ppron12'),
        (('on',), 'ppron3'),
        (('oba',), 'num'),
        (('jeden',), 'adj'),
    )
    for lemmas, forced_pos in overrides:
        if lemma in lemmas:
            return forced_pos
    return pos
def correct_num(lemma, num):
    """Return the grammatical number value(s) to use for the lemma.

    Returns:
        ``''`` for 'siebie'; ``'pl'`` for 'oba', 'plecy' and 'usta';
        ``'sg'`` for 'pół' and 'półtora', for 'ja'/'ty' with number '_',
        and for number 'agr'; the list ``['sg', 'pl']`` for any other
        lemma with number '_'; otherwise ``num`` unchanged.
    """
    if lemma == 'siebie':
        return ''
    if lemma in ('oba', 'plecy', 'usta'):
        return 'pl'
    forced_singular = (
        (lemma in ('ja', 'ty') and num == '_')
        or lemma in ('pół', 'półtora')
        # TODO (?)
        or num == 'agr'
    )
    if forced_singular:
        return 'sg'
    # TODO _ -> sg or _ -> sg and pl?
    if num == '_':
        return ['sg', 'pl']
    return num
def correct_gend(gend):
    """Return the gender value to use: 'agr' becomes 'm1', anything else is kept."""
    return 'm1' if gend == 'agr' else gend
# TODO is the mapping for no function correct?
# TODO the mapping should be more complex, e.g. most lex(np)s should be in acc (dać kosza etc.),
# but adjps seem to need nom: chrzest bojowy
# Case chosen for the 'str' case value: first key is the function
# ('subj', 'obj' or None), second key is the negativity value.
STR_CASE = {
    'subj': {
        '_': 'nom',
        'aff': 'nom',
        'neg': 'nom',
    },
    'obj': {
        '_': 'acc',
        'aff': 'acc',
        'neg': 'gen',
    },
    None: {
        '_': 'acc',
        'aff': 'acc',
        'neg': 'gen',
    },
}

# Case chosen for the 'agr' case value, keyed by function.
AGR_CASE = {
    'subj': 'nom',
    'obj': 'acc',
    None: 'nom',
}


def correct_case(case, function, negativity='_'):
    """Map a symbolic case value to a concrete case.

    Args:
        case: Case value; 'str', 'agr', 'part', 'pred' and 'postp' are
            translated, any other value is returned as is.
        function: Key into STR_CASE / AGR_CASE ('subj', 'obj' or None);
            only consulted for 'str' and 'agr'.
        negativity: Key into the STR_CASE sub-table ('_', 'aff' or 'neg');
            only consulted for 'str'.

    Returns:
        The concrete case, or ``case`` itself when no translation applies.

    Raises:
        KeyError: If ``function`` or ``negativity`` is not a key of the
            table consulted for 'str' / 'agr'.
    """
    if case == 'str':
        return STR_CASE[function][negativity]
    if case == 'agr':
        return AGR_CASE[function]
    fixed_cases = (
        # TODO both gen and acc?
        ('part', 'gen'),
        # TODO other cases in case of control?
        ('pred', 'inst'),
        ('postp', 'dat'),
    )
    for symbolic, concrete in fixed_cases:
        if case == symbolic:
            return concrete
    return case
def correct_deg(deg):
    """Return the degree value(s) to use for a degree specification.

    'pos' yields ``['pos', '']`` (positive degree, or no degree at all),
    '_' yields every degree plus the empty value, and any other value is
    returned unchanged.
    """
    if deg == 'pos':
        # positive degree = positive or no degree at all
        alternatives = [deg, '']
    elif deg == '_':
        alternatives = ['pos', 'com', 'sup', '']
    else:
        alternatives = deg
    return alternatives
def correct_congr(lemma):
    """Return the congr/rec value(s) to use for a lemma.

    'pół' and 'półtora' get 'rec'; every other lemma gets the list
    ``['congr', 'rec', '']``.
    """
    governs_only = lemma in ('pół', 'półtora')
    # heuristic: if both congr and rec forms available, prefer congr
    # no congr/rec also possible
    return 'rec' if governs_only else ['congr', 'rec', '']
def correct_aff(aff):
    """Return the affirmation value(s): '_' expands to ['aff', 'neg'], others are kept."""
    return ['aff', 'neg'] if aff == '_' else aff
# Text prefix for each negation value; non-empty prefixes end with a space.
NEG = {
    '_': '(nie) ',
    'aff': '',
    'neg': 'nie ',
}


def correct_neg(neg):
    """Return the text prefix stored in NEG for a negation value.

    Raises:
        KeyError: If ``neg`` is not a key of NEG.
    """
    prefix = NEG[neg]
    return prefix
# Text emitted for the 'się' marker: nothing when the marker is empty,
# otherwise the word followed by a space.
SIE = {
    '': '',
    'się': 'się ',
}


def correct_sie(sie):
    """Return the text stored in SIE for the given marker value.

    Raises:
        KeyError: If ``sie`` is not a key of SIE.
    """
    text = SIE[sie]
    return text
def correct_feats(lemma, feats, praep=False):
    """Extend a feature list with the extra features some lemmas need.

    Args:
        lemma: Lemma the features are for.
        feats: List of features; it is never modified in place.
        praep: Only used for 'on': selects 'praep' instead of 'npraep'.

    Returns:
        A new list ``feats + extras`` for 'on', 'ja', 'ty', 'my', 'wy' and
        'oba'; for any other lemma the ``feats`` object itself.
    """
    if lemma == 'on':
        extras = ['m1', 'akc', 'praep' if praep else 'npraep']
    elif lemma in ('ja', 'ty'):
        # the nested list offers two alternatives for this position
        extras = ['m1', ['akc', '']]
    elif lemma in ('my', 'wy'):
        extras = ['m1']
    elif lemma == 'oba':
        extras = ['congr', 'ncol']
    else:
        return feats
    return feats + extras
def get_subst_attrs(lemma, tag):
    """Extract attributes from a colon-separated tag.

    For 'siebie' only the second tag field is read, as 'case'. For every
    other lemma the second, third and fourth fields are read as 'num',
    'case' and 'gend'.

    Raises:
        IndexError: If the tag has fewer fields than are read.
    """
    fields = tag.split(':')
    if lemma != 'siebie':
        return dict(num=fields[1], case=fields[2], gend=fields[3])
    return dict(case=fields[1])
def get_gender_for_num(lemma):
    """Build the gender features a numeral needs to agree with a lemma.

    The tag of the lemma's 'subst', 'sg', 'nom' form (second element of
    what get_form returns) is split on ':' and the fields from the fourth
    one onwards are taken as the gender part.

    Returns:
        A two-element list: the first gender field, followed by a list of
        alternatives for the second position.
    """
    tag = get_form(lemma, ['subst', 'sg', 'nom'])[1]
    # 1 or 2 values: ['f'], ['n', 'ncol'], ...
    gender_fields = tag.split(':')[3:]
    if len(gender_fields) != 2:
        # choose ncol for e.g. czterech/czworo m1
        return [gender_fields[0], ['ncol', '']]
    gender, collectivity = gender_fields
    # no col/ncol variant for jeden, wiele etc.
    return [gender, [collectivity, '']]
def get_form(lemma, feats):
    """Look up a single form of the lemma for the given features.

    Args:
        lemma: Lemma to look up. A lemma starting with 'E(' is not looked
            up at all (presumably a placeholder rather than a real word --
            TODO confirm).
        feats: Features; callable entries are replaced by the result of
            calling them with the lemma.

    Returns:
        Whatever select_form returns for the lemma and resolved features.
        For an 'E(' lemma: a tuple of an empty form and a 'subst:pl:nom:'
        tag completed with the lemma stripped of leading/trailing 'E',
        '(' and ')' characters.
    """
    if not lemma.startswith('E('):
        resolved = []
        for feat in feats:
            resolved.append(feat(lemma) if hasattr(feat, '__call__') else feat)
        return select_form(lemma, resolved)
    inner = lemma.strip('E()')
    return ('', 'subst:pl:nom:{}'.format(inner))
def get_forms(lemma, feats):
    """Return all forms of the lemma found for the given feature alternatives.

    Args:
        lemma: Lemma to look up.
        feats: One entry per feature position. An entry is a string (a
            single value), an iterable of alternative values, or a callable
            that is called with the lemma and returns either of those.

    Returns:
        A non-empty list with the select_form result for every combination
        of alternatives for which select_form did not raise.

    Raises:
        ZeroDivisionError: If no combination produced a form. The exception
            type is the one the former ``1/0`` statement raised, kept so
            existing callers see the same error.
    """
    resolved = [f(lemma) if hasattr(f, '__call__') else f for f in feats]
    # A bare string is one value, not an iterable of characters.
    alternatives = [[f] if isinstance(f, str) else f for f in resolved]
    forms = []
    for combination in product(*alternatives):
        try:
            forms.append(select_form(lemma, combination))
        except Exception:
            # Best effort: combinations without a form are skipped. Catching
            # Exception instead of using a bare except lets KeyboardInterrupt
            # and SystemExit propagate.
            continue
    if not forms:
        raise ZeroDivisionError(
            'no form of {!r} found for features {!r}'.format(lemma, feats)
        )
    return forms