check_morfologik.py
4.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#-*- coding:utf-8 -*-
import sys
from django.core.management.base import BaseCommand, CommandError
from common.util import debug
from dictionary.models import Lexeme
class Command(BaseCommand):
    """Django management command that runs the Morfologik import check.

    Takes two positional arguments: a lexical class symbol and the path of
    the input file to compare against.
    """

    help = 'Check Morfologik import'
    args = '<symbol części mowy> <nazwa pliku wejściowego>'

    def handle(self, lc_sym, input_file, **options):
        """Run the check for ``lc_sym`` against ``input_file``.

        ``options`` is accepted for compatibility with the command
        framework and is not used.
        """
        check_morfologik(lc_sym=lc_sym, input_file=input_file)
# Suffix table used by get_forms() to expand verb base forms into full forms.
#
# Key:   (base form label, tag).  The label is matched against the base form
#        labels of the lexeme's pattern endings; the tag must be among the
#        tags get_forms() activates for the lexeme.
# Value: '|'-separated suffixes appended to every base form with that label.
#        An empty alternative (including an entirely empty string) keeps the
#        bare base form itself.
#
# NOTE: there are currently no q* inflection characteristics anyway...
v_forms = {
    ('1', 'allq'): u'',
    ('1', 'all'): u'cie|my|sz',

    ('2', 'all'): u'',

    ('3', 'all'): u'',
    ('3', 'ndk'): u'c',
    ('3', 'pact'): u'ca|cą|ce|cego|cej|cemu|cy|cych|cym|cymi',

    ('4', 'all'): u'|że|my|myż|cie|cież',

    ('5', 'allq'): u'',

    ('6', 'all'): u'|by|byś|bym',
    ("6'", 'dk'): u'szy',

    ('7', 'all'): u'em|eś',

    ('8', 'allq'): u'o|oby',
    ('8', 'all'): (
        u'a|aby|abyś|abym|am|aś|obym|obyś|om|oś'
        u'|y|yby|ybyście|ybyśmy|yście|yśmy'
    ),

    ('9', 'all'): u'i|iby|ibyście|ibyśmy|iście|iśmy',

    ('10', 'all'): u'o',
    ('10', 'ppas'): u'a|ą|e|ego|ej|emu|y|ych|ym|ymi',

    ('11', 'ger'): u'ie|ia|iach|iami|iem|iom|iu',
    ('11pg', 'ger'): u'',

    ('12', 'ppas'): u'',
}
def get_forms(l, lc_sym):
    """Return the set of forms the database predicts for lexeme ``l``.

    For every lexical class except ``'v'`` the forms are read from
    ``l.lexemeform_set``.  For ``'adj'``, if the lexeme has an ``'adjnie'``
    reference, the forms of the referenced lexeme are added as well, except
    those returned by its ``all_forms(label_filter=r'^0|3\\+$')``.

    For ``'v'`` the forms are rebuilt instead: each inflection pattern
    ending yields a base form (``root + ending``) grouped by base form
    label, and every base form is expanded with the suffixes listed in
    ``v_forms`` for the tags active on this lexeme.  For the ``'pact'``,
    ``'ppas'`` and ``'ger'`` tags (presumably participles and the gerund --
    TODO confirm) a ``'nie'``-prefixed copy of each form is added too.

    Returns an empty set for a verb without any inflection pattern.
    """
    if lc_sym != 'v':
        l_forms = set(l.lexemeform_set.values_list('form', flat=True))
        if lc_sym == 'adj':
            neg = l.refs_to.filter(type__symbol='adjnie')
            if neg:
                l_neg = neg[0].to_lexeme
                neg_forms = l_neg.lexemeform_set.values_list('form', flat=True)
                # Raw string: '\+' must reach the regex engine untouched.
                added_forms = l_neg.all_forms(label_filter=r'^0|3\+$')
                l_forms |= set(
                    form for form in neg_forms if form not in added_forms)
        return l_forms

    # 'allq' entries apply to every verb; the remaining tags are conditional.
    tags = ['allq']
    for ref_symbol, tag in (('verpact', 'pact'),
                            ('verppas', 'ppas'),
                            ('verger', 'ger')):
        if l.refs_to.filter(type__symbol=ref_symbol):
            tags.append(tag)

    # Evaluated once and reused below instead of building the query twice.
    lips = l.lexemeinflectionpattern_set.all()
    if not lips:
        return set()

    # Only the first pattern's inflection characteristic decides the tags.
    ic = lips[0].inflection_characteristic.symbol
    if not ic.startswith('q'):
        tags.append('all')
    if 'ndk' in ic:
        tags.append('ndk')
    # Strip 'ndk' first so that it is not mistaken for a plain 'dk'.
    if 'dk' in ic.replace('ndk', ''):
        tags.append('dk')

    # base form label -> set of base forms (root + ending) over all patterns
    base_forms = {}
    for lip in lips:
        for ending in lip.pattern.endings.all():
            label = ending.base_form_label.symbol
            base_forms.setdefault(label, set()).add(lip.root + ending.string)

    l_forms = set()
    for (label, tag), suffixes in v_forms.items():
        if tag not in tags or label not in base_forms:
            continue
        new_forms = set(
            base_form + suffix
            for base_form in base_forms[label]
            for suffix in suffixes.split('|'))
        l_forms |= new_forms
        if tag in ('pact', 'ppas', 'ger'):
            l_forms |= set('nie' + form for form in new_forms)
    return l_forms
def check_forms(lc_sym, forms):
    """Print a report when no Morfologik lexeme reproduces ``forms``.

    ``forms`` is a sequence of forms whose first element is used as the
    entry to look up; ``lc_sym`` is the lexical class symbol.  Every lexeme
    associated with the 'Morfologik' vocabulary that has this entry and
    lexical class is compared with the given forms.  Nothing is printed as
    soon as one lexeme is accepted:

    * its forms (see ``get_forms``) are empty -- such lexemes are treated
      as not interesting here;
    * its forms equal ``forms`` exactly;
    * ``'subst'`` only: the lexeme has an 'm1' inflection pattern, its
      comment contains u'formę depr', and ``forms`` plus the lexeme's
      ``^pl:nom$`` forms equal its forms;
    * ``'subst'`` only: the comment contains u'rozszerzone singulare' or
      u'rozszerzyć sgtant', or the lexeme's owner vocabulary is not
      'Morfologik', and ``forms`` equals the lexeme's ``^sg:`` forms;
    * ``'adj'`` only: ``forms`` plus the lexeme's ``^0$`` forms equal its
      forms.

    Otherwise the entry is printed, followed by one ``missing|extra`` line
    per candidate lexeme (forms only in the input | forms only in the
    database), each list sorted so that the report is reproducible.

    An empty ``forms`` has no entry to look up and is ignored.
    """
    if not forms:
        return
    entry = forms[0]
    forms = set(forms)
    morf_lexemes = Lexeme.objects.filter(
        lexemeassociation__vocabulary__id='Morfologik', entry=entry,
        part_of_speech__lexical_class__symbol=lc_sym)
    for l in morf_lexemes:
        # Defensive re-check; the query above already filters on this field.
        if l.part_of_speech.lexical_class.symbol != lc_sym:
            continue
        l_forms = get_forms(l, lc_sym)
        if not l_forms:
            break  # a lexeme without forms is not interesting here
        if forms == l_forms:
            break
        if lc_sym == 'subst':
            m1_lips = l.lexemeinflectionpattern_set.filter(
                inflection_characteristic__symbol='m1')
            if m1_lips and u'formę depr' in l.comment:
                if forms | l.all_forms(label_filter='^pl:nom$') == l_forms:
                    break
            if (u'rozszerzone singulare' in l.comment
                    or u'rozszerzyć sgtant' in l.comment
                    or l.owner_vocabulary.id != 'Morfologik'):
                if forms == l.all_forms(label_filter='^sg:'):
                    break
        elif lc_sym == 'adj':
            # Applied unconditionally; it is not restricted by l.comment.
            if forms | l.all_forms(label_filter='^0$') == l_forms:
                break
    else:  # no lexeme matched
        # Parenthesised single-argument print behaves the same as the
        # print statement on Python 2.
        print(entry.encode('utf-8'))
        for l in morf_lexemes:
            l_forms = get_forms(l, lc_sym)
            missing = ', '.join(sorted(forms - l_forms))
            extra = ', '.join(sorted(l_forms - forms))
            print(('%s|%s' % (missing, extra)).encode('utf-8'))
def check_morfologik(lc_sym, input_file):
    """Check every group of forms listed in ``input_file``.

    The file is UTF-8 text with one ``form<TAB>tag`` pair per line; blank
    lines separate groups.  Each non-empty group is passed to
    ``check_forms`` together with ``lc_sym``.  Only the form column is
    used; the tag is ignored.

    Runs of blank lines are skipped rather than producing empty groups,
    and a final group is checked even if the file does not end with a
    blank line.

    Raises ValueError if a non-blank line does not consist of exactly two
    tab-separated fields.
    """
    forms = []
    # Binary mode with an explicit decode reads the same bytes on every
    # platform; line endings are stripped by hand below.
    with open(input_file, 'rb') as infile:
        for raw_line in infile:
            line = raw_line.decode('utf-8').rstrip('\r\n')
            if line:
                form, _tag = line.split('\t')
                forms.append(form)
            elif forms:
                check_forms(lc_sym, forms)
                forms = []
    # Flush the last group when the file lacks a trailing blank line.
    if forms:
        check_forms(lc_sym, forms)