import_warszawa.py
3.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#-*- coding:utf-8 -*-
from django.core.management.base import BaseCommand
from dictionary.models import Lexeme
from dictionary.management.commands.import_morfologik import create_lexeme, \
create_lip, print_data
class Command(BaseCommand):
    """Management command that imports Warsaw names from an input file."""

    args = '<input file name>'
    help = 'importuje nazwy warszawskie'

    def handle(self, filename, **options):
        """Open the file named ``filename`` and pass it to import_warszawa.

        filename -- path of the input file
        options -- additional keyword options; ignored here
        """
        # The context manager closes the file even if the import raises.
        with open(filename) as input_file:
            import_warszawa(input_file)
def inflection_characteristic(forms, pos):
    """Return the inflection characteristic symbol for a set of forms.

    forms -- non-empty sequence of (form, tag) pairs
    pos -- part of speech symbol; only 'subst' and 'adj' occur in the
           Warsaw names data and only these are supported

    For 'subst' the result is 'm1' when the first tag contains 'depr' or
    ends with 'm1'; otherwise it is the last colon-separated segment of
    the first tag (presumably the gender -- confirm against the data).
    For 'adj' the result is '' when any form is tagged 'adja' and '0-'
    otherwise.

    Raises IndexError when forms is empty (or a 'subst' tag has no colon)
    and ValueError when pos is not supported.
    """
    first_tag = forms[0][1]
    if pos == 'subst':
        if 'depr' in first_tag or first_tag.endswith('m1'):
            return 'm1'
        return first_tag.rsplit(':', 1)[1]
    if pos == 'adj':
        # '3+' forms do not occur in this data set
        if any(form_tag == 'adja' for _form, form_tag in forms):
            return ''
        return '0-'
    # Previously this path fell through to an unbound local variable.
    raise ValueError('unsupported part of speech: %r' % (pos,))
def process_forms(forms, base, pos, patterns):
    """Print import data for one lexeme unless an equivalent one exists.

    forms -- sequence of (form, tag) pairs of the lexeme
    base -- entry (base form) of the lexeme
    pos -- part of speech symbol
    patterns -- set of pattern names collected for the lexeme

    Existing lexemes with the same entry and part of speech are looked
    up. If one of them has exactly the same set of pattern names and its
    list of inflection characteristic symbols is either [ic] or ['3+'],
    nothing is done. Otherwise a new lexeme with its inflection patterns
    is built and handed to print_data: with status 'cand' when homonyms
    exist (a discrepancy), with status 'desc' when there are none.
    """
    ic = inflection_characteristic(forms, pos)

    # NOTE: a check that compared the forms generated from the patterns
    # with the imported forms used to run here; it was disabled once all
    # patterns had been verified to match.

    # Look for existing lexemes by base, pos, ic and patterns.
    homonyms = Lexeme.objects.filter(entry=base, part_of_speech__symbol=pos)
    accepted_ics = ([ic], ['3+'])
    for lexeme in homonyms:
        lexeme_lips = lexeme.lexemeinflectionpattern_set.all()
        pattern_names = set(lip.pattern.name for lip in lexeme_lips)
        ic_symbols = [
            lip.inflection_characteristic.symbol for lip in lexeme_lips]
        if ic_symbols in accepted_ics and pattern_names == patterns:
            # Already present -- nothing to import.
            return

    if homonyms:
        status, comment = 'cand', u'z nazw warszawskich; rozbieżność'
    else:
        status, comment = 'desc', u'z nazw warszawskich'

    new_lips = [
        create_lip(pattern, None, index, ic, pos)
        for index, pattern in enumerate(patterns, 1)
    ]
    print_data({
        'lexeme': create_lexeme(base, 1, pos, status, comment),
        'lips': new_lips,
    })
def import_warszawa(input_file):
    """Read tab-separated form records and process them lexeme by lexeme.

    input_file -- iterable of lines; each non-blank line must hold seven
                  tab-separated fields: id, lip index, part of speech,
                  pattern, form, base and tag. Byte strings are decoded
                  as UTF-8; already decoded text is used as is.

    Consecutive lines sharing the same id are grouped: their (form, tag)
    pairs and pattern names are collected and passed to process_forms
    together with the base and part of speech of the group's first line.
    Blank lines are skipped and empty input results in no processing.

    Raises ValueError when a non-blank line does not have seven fields.
    """
    last_id = None
    forms = None
    last_base = None
    last_pos = None
    patterns = None
    for raw_line in input_file:
        line = raw_line.strip()
        if not line:
            # A blank line carries no record; unpacking it would fail.
            continue
        if isinstance(line, bytes):
            line = line.decode('utf-8')
        # The lip index is unpacked to validate the field count only.
        w_id, _lip_ind, pos, pattern, form, base, tag = line.split('\t')
        if w_id != last_id:
            # A new id starts a new lexeme; flush the previous group.
            if last_id is not None:
                process_forms(forms, last_base, last_pos, patterns)
            last_id = w_id
            last_base = base
            last_pos = pos
            forms = []
            patterns = set()
        forms.append((form, tag))
        patterns.add(pattern)
    # Flush the final group, but only if any record was read at all.
    if last_id is not None:
        process_forms(forms, last_base, last_pos, patterns)