# table_analyzer.py (5.2 KB)
# -*- coding: utf-8 -*-
from django.core.management import BaseCommand
from dictionary.models import Variant, Cell
class Command(BaseCommand):
    """Management command that prints the table-cell analysis.

    All of the work is delegated to the module-level ``analyze_table``.
    """

    args = ''
    help = 'Prints table cells grouped by normalized tag.'

    def handle(self, *args, **options):
        """Run the analysis.

        The signature follows ``BaseCommand.handle(*args, **options)`` so
        that positional arguments forwarded by the framework do not raise
        ``TypeError``; both ``args`` and ``options`` are ignored.
        """
        analyze_table()
# Inflection-characteristic symbols.  The verb templates below pass one of
# these tuples as the last field of an ``attrs`` entry, and analyze_table()
# uses it to restrict cells by inflection-characteristic symbol.
nQ_ICS = (
    'dk',
    'ndk',
    'ndk/dk',
    'dk/ndk',
    'ndk/(dk)',
    'dk/(ndk)',
)
Q_ICS = (
    'qndk',
    'qdk',
    'qndk/dk',
)

# One entry per table template, unpacked by analyze_table() as
#   (name, variant, parts_of_speech, pattern_types, attrs)
# where ``attrs`` is a tuple of
#   (name, values, inflection_characteristic_symbols)
# An empty ``pattern_types`` tuple means "do not filter by pattern type".
TABLE_TEMPLATES = [
    (u'rzeczowniki', '1',
     ('subst', 'osc', 'skrs'),
     ('f', 'm', 'n', '0'),
     ()),
    (u'pron', '1',
     ('subst',),
     ('z0', "z0'", 'z1', 'z1p', 'z2'),
     ()),
    (u'my/wy', '1',
     ('ppron',),
     ('a',),
     ()),
    (u'ja/ty/się', '1',
     ('ppron',),
     ('b', "b'"),
     ()),
    (u'on', '1',
     ('ppron',),
     ('c',),
     ()),
    # optional forms...
    (u'przymiotniki', '1',
     ('adj',),
     (),
     ()),
    # to be merged with adj
    (u'adjcom', '1',
     ('adjcom',),
     (),
     ()),
    (u'czasowniki', '1',
     ('v',),
     ('', '67', 'b'),
     ((u'właściwy', ('', '(Q)'), nQ_ICS),)),
    (u'winien', '1',
     ('v',),
     ('p',),
     ((u'właściwy', ('', '(Q)'), nQ_ICS),)),
    (u'niewłaściwe', '1',
     ('v', 'pred'),
     (),
     ((u'właściwy', ('Q',), Q_ICS),)),
    # eventually together with subst
    (u'ger', '1',
     ('ger',),
     (),
     ()),
    # eventually together with adj
    (u'imiesłowy', '1',
     ('pact', 'ppas', 'appas'),
     (),
     ()),
    (u'num a', '1',
     ('num',),
     ('a', 'a1', "a'"),
     ()),
    (u'num b', '1',
     ('num',),
     ('b',),
     ()),
    (u'num cd', '1',
     ('num',),
     ('c', 'd', "d'", 'd"'),
     ()),
    (u'nieodmienne', '1',
     ('adv', 'advcom', 'advndm', 'burk', 'prep',
      'pref', 'comp', 'conj', 'interj', 'qub'),
     (),
     ()),
]

# Tag prefixes (the text before the first ':' of a cell tag) that select
# which part of the tag analyze_table() replaces with a placeholder.
ASPECT_POS = {
    'fin', 'imps', 'impt', 'inf',
    'praet', 'ger', 'pact', 'ppas',
}
GENDER_POS = {'subst'}
PERSON_POS = {'ppron12'}
CASE_POS = {'prep'}
# Sets of inflection-characteristic labels that are printed as a short
# abbreviation instead of in full.  The sets are pairwise different, so at
# most one entry can match a given set.
_EXACT_IC_SET_LABELS = (
    ({u'qndk/dk', u'qndk/(dk)', u'qdk', u'qndk'}, 'q'),
    ({u'ndk/(dk)', u'dk/ndk', u'ndk', u'dk', u'ndk/dk', u'dk/(ndk)'}, '-q'),
    ({u'dk/(ndk)', u'dk/ndk', u'ndk', u'ndk/(dk)', u'ndk/dk'}, 'ndk'),
    ({u':adj', u'0-:adj', u'3+:adj'}, '*'),
    ({u'dk/(ndk):v', u'ndk/dk:v', u'dk/ndk:v',
      u'ndk:v', u'ndk/(dk):v', u'dk:v'}, '*'),
    ({u'dk/(ndk):ger', u'ndk/dk:ger', u'dk/ndk:ger',
      u'ndk:ger', u'ndk/(dk):ger', u'dk:ger'}, '*'),
    ({u'dk/(ndk):pact', u'ndk/dk:pact', u'dk/ndk:pact',
      u'ndk:pact', u'ndk/(dk):pact', u'dk:pact'}, '*'),
    ({u'ndk/dk:ppas', u'dk/ndk:ppas', u'dk:ppas',
      u'ndk/(dk):appas', u'dk/(ndk):ppas', u'ndk/dk:appas',
      u'dk/ndk:appas', u'dk:appas', u'ndk:appas',
      u'dk/(ndk):appas', u'ndk/(dk):ppas', u'ndk:ppas'}, '*'),
)

# Any set containing all of these labels (extra members allowed) is printed
# as '*'.
_ALL_SUBST_ICS = {
    u'p2:subst', u'm3:subst', u'f:subst',
    u'n1:subst', u'm2:subst', u'p3:subst', u'm1:subst',
    u'p1:subst', u'n2:subst',
}


def _normalize_tag(tag, ic_symbol):
    """Return ``tag`` with its characteristic-dependent part masked.

    The text before the first ':' selects the rule:

    * in ASPECT_POS  -- 'imperf.perf', 'imperf' and 'perf' become 'ASPEKT';
    * in GENDER_POS  -- the last ':'-separated field becomes 'RODZAJ', but
      only when it equals ``ic_symbol``;
    * in PERSON_POS  -- every occurrence of ``ic_symbol`` becomes 'OSOBA';
    * in CASE_POS    -- every occurrence of ``ic_symbol`` becomes
      'PRZYPADEK'.

    Any other tag is returned unchanged.
    """
    pos = tag.split(':', 1)[0]
    if pos in ASPECT_POS:
        # The combined value must go first, otherwise it would be rewritten
        # piecewise into 'ASPEKT.ASPEKT'.
        tag = tag.replace('imperf.perf', 'ASPEKT')
        return tag.replace('imperf', 'ASPEKT').replace('perf', 'ASPEKT')
    if pos in GENDER_POS and tag.split(':')[-1] == ic_symbol:
        return tag[:tag.rfind(':')] + ':RODZAJ'
    # Replacing an empty string would insert the placeholder between every
    # character of the tag, so both substitutions require a non-empty symbol.
    if pos in PERSON_POS and ic_symbol:
        return tag.replace(ic_symbol, 'OSOBA')
    if pos in CASE_POS and ic_symbol:
        return tag.replace(ic_symbol, 'PRZYPADEK')
    return tag


def _abbreviate_ic_set(ic_set):
    """Return the short label for a recognised ``ic_set``, else ``None``."""
    if ic_set >= _ALL_SUBST_ICS:
        return '*'
    for known, label in _EXACT_IC_SET_LABELS:
        if ic_set == known:
            return label
    return None


def analyze_table():
    """Print, for every entry of TABLE_TEMPLATES, its cells grouped by tag.

    For each template the cells of the 'Morfeusz' variant are selected by
    part of speech, optionally by pattern type and by the
    inflection-characteristic symbols of every ``attrs`` entry.  The cells
    are then grouped as::

        normalized tag -> (base form label symbol, prefix, suffix)
                       -> pattern type symbol
                       -> set of unicode(inflection characteristic)

    and written to standard output: a blank line and the tag, followed by
    one 'prefix-label-suffix {pattern type: characteristics}' line per base
    form label.  Recognised characteristic sets are abbreviated (see
    ``_abbreviate_ic_set``).

    Raises whatever ``Variant.objects.get`` raises when the 'Morfeusz'
    variant is missing.
    """
    variant = Variant.objects.get(id='Morfeusz')
    # The template's own name and variant fields are not used here.
    for _name, _variant, parts_of_speech, p_types, attrs in TABLE_TEMPLATES:
        cells = Cell.objects.filter(
            table_template__variant=variant,
            table_template__inflection_characteristic__part_of_speech__symbol__in=
            parts_of_speech)
        if p_types:
            cells = cells.filter(
                table_template__pattern_type__symbol__in=p_types)
        for _attr_name, _values, ics in attrs:
            cells = cells.filter(
                table_template__inflection_characteristic__symbol__in=ics)
        # Load the related rows read in the loop below together with the
        # cells instead of issuing extra queries for every cell.
        cells = cells.select_related(
            'table_template__inflection_characteristic__part_of_speech',
            'table_template__pattern_type')

        groups = {}
        for cell in cells:
            template = cell.table_template
            ic = template.inflection_characteristic
            key = _normalize_tag(cell.tag, ic.symbol)
            bfl = (cell.base_form_label.symbol, cell.prefix, cell.suffix)
            pattern_type = template.pattern_type.symbol
            by_pattern_type = groups.setdefault(key, {}).setdefault(bfl, {})
            # unicode() is kept: this module is Python 2 code.
            by_pattern_type.setdefault(pattern_type, set()).add(unicode(ic))

        # Single-argument print calls behave like the print statement on
        # Python 2 and stay valid syntax on Python 3.
        for key, by_label in sorted(groups.items()):
            print('')
            print(key)
            for bfl, pairs in by_label.items():
                for pattern_type, ic_set in list(pairs.items()):
                    label = _abbreviate_ic_set(ic_set)
                    if label is not None:
                        pairs[pattern_type] = label
                symbol, prefix, suffix = bfl
                print('%s %s' % (prefix + '-' + symbol + '-' + suffix, pairs))