Commit 24148c0f9d2ef1de5e525289cd3ca7bb54f09a10

Authored by janek37
1 parent fd456679

poprawiony eksport (nieposortowany)

common/util.py
... ... @@ -10,8 +10,8 @@ def uniopen(filename):
10 10 return (line.decode('utf-8').rstrip('\n') for line in open(filename))
11 11  
12 12  
13   -def uniprint(text):
14   - print text.encode('utf-8')
  13 +def uniprint(text, file=sys.stdout):
  14 + print >>file, text.encode('utf-8')
15 15  
16 16  
17 17 def debug(entry, text):
... ...
dictionary/export.py
... ... @@ -2,8 +2,9 @@
2 2  
3 3 import sys
4 4 from django.db import connection
5   -from common.util import debug, flatten
6   -from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender
  5 +from common.util import debug, flatten, uniprint
  6 +from dictionary.lexeme_form_query import attr_clauses_combinations, FROM_CLAUSES, EXPORT_CLAUSES, WHERE_CLAUSES
  7 +from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender, TableTemplate
7 8  
8 9 ADJPREDYKATYWNE = [
9 10 u'ciekaw',
... ... @@ -40,6 +41,9 @@ ASPECT_TRANSLATION = {
40 41 }
41 42  
42 43  
  44 +NESTED_POS = ('adjcom','advcom','ger','pact','ppas','appas')
  45 +
  46 +
43 47 def qualifier_clause(q_id):
44 48 return '''not exists (
45 49 select * from kwalifikatory_leksemow where lexeme_id = l.id and
... ... @@ -61,7 +65,7 @@ def magic_qualifier_clause():
61 65 def export_lexemes(data=None, output_file=None):
62 66 if not data:
63 67 data = {
64   - 'vocabs': ['PoliMorf'],
  68 + 'vocabs': ['SGJP'],
65 69 'antivocabs': [],
66 70 'variant': 'Morfeusz',
67 71 'excluding_qualifiers': [],
... ... @@ -110,7 +114,7 @@ def export_lexemes(data=None, output_file=None):
110 114 "case".attribute_value_id
111 115 '''
112 116  
113   - table_joins = '''
  117 + table_joins = FROM_CLAUSES + EXPORT_CLAUSES + '''
114 118 join leksemy_w_slownikach ls on (ls.l_id = l.id)
115 119 left join dictionary_lexemeav aspect
116 120 on (l.id = aspect.lexeme_id and %(aspect)s)
... ... @@ -118,22 +122,6 @@ def export_lexemes(data=None, output_file=None):
118 122 on (l.id = person.lexeme_id and %(person)s)
119 123 left join dictionary_lexemeav "case"
120 124 on (l.id = "case".lexeme_id and %(case)s)
121   - join odmieniasie o on (o.l_id = l.id)
122   - join wzory w on (o.w_id = w.id)
123   - join dictionary_tabletemplate_pattern_types tt_pt on
124   - w.typ = tt_pt.patterntype_id
125   - join dictionary_tabletemplate tt on
126   - (tt_pt.tabletemplate_id = tt.id)
127   - join dictionary_tabletemplate_parts_of_speech tt_pos on
128   - (tt.id = tt_pos.tabletemplate_id and
129   - l.pos = tt_pos.partofspeech_id)
130   - join dictionary_exportcell ec on tt.id = ec.table_template_id
131   - join dictionary_exportcell_pattern_types ec_pt on
132   - (ec.id = ec_pt.exportcell_id and w.typ = ec_pt.patterntype_id)
133   - left join dictionary_exportcell_genders ec_g on
134   - ec.id = ec_g.exportcell_id
135   - join zakonczenia z on
136   - (o.w_id = z.w_id and ec.base_form_label_id = z.efobaz)
137 125 ''' % {
138 126 'aspect': 'aspect.attribute_value_id in (%s)'
139 127 % ', '.join(str(id) for id in aspects),
... ... @@ -143,117 +131,106 @@ def export_lexemes(data=None, output_file=None):
143 131 % ', '.join(str(id) for id in cases),
144 132 }
145 133  
146   - table_clause = '''
147   - true = all (
148   - select attr_val.id in (select lexemeattributevalue_id from
149   - dictionary_tabletemplate_attribute_values tt_attr_val
150   - where tt_attr_val.tabletemplate_id = tt.id)
151   - from dictionary_lexemeav lav
152   - join dictionary_lexemeattributevalue attr_val
153   - on lav.attribute_value_id = attr_val.id
154   - join dictionary_tabletemplate_attributes tt_attr
155   - on (attr_val.attribute_id = tt_attr.lexemeattribute_id and
156   - tt.id = tt_attr.tabletemplate_id)
157   - where lav.lexeme_id = l.id) and
158   - true = all (
159   - select attr_val.id in (select lexemeattributevalue_id from
160   - dictionary_exportcell_attribute_values ec_attr_val
161   - where ec_attr_val.exportcell_id = ec.id)
162   - from dictionary_lexemeav lav
163   - join dictionary_lexemeattributevalue attr_val
164   - on lav.attribute_value_id = attr_val.id
165   - join dictionary_tabletemplate_cell_attributes ec_attr
166   - on (attr_val.attribute_id = ec_attr.lexemeattribute_id and
167   - tt.id = ec_attr.tabletemplate_id)
168   - where lav.lexeme_id = l.id) and
169   - '''
  134 + table_clause = WHERE_CLAUSES
170 135  
171 136 cursor = connection.cursor()
172   - query = """
  137 + params_part = (list(data['vocabs']) + list(data['antivocabs']) +
  138 + [data['variant']])
  139 + tts = TableTemplate.objects.filter(
  140 + variant_id=data['variant']).prefetch_related(
  141 + 'attributes__values', 'cell_attributes__values', 'parts_of_speech')
  142 + for tt in tts:
  143 + uniprint(u'exporting table: %s' % tt.name, file=sys.stderr)
  144 + attr_clauses, cell_attr_combinations, tt_attr_combinations = \
  145 + attr_clauses_combinations(tt)
  146 + nested = tt.parts_of_speech.all()[0] in NESTED_POS
  147 + for tt_c in tt_attr_combinations:
  148 + for cell_c in cell_attr_combinations:
  149 + if not nested:
  150 + params = params_part + flatten(data['magic_qualifiers'])
  151 + query = """
173 152 select distinct haslo, %(select)s %(clas_field)s
174   -from leksemy l
175   - left join dictionary_lexemeav refl
176   - on (l.id = refl.lexeme_id and %(refl)s)
177 153 %(table_joins)s
178 154 %(clas_join)s
  155 + left join dictionary_lexemeav refl
  156 + on (l.id = refl.lexeme_id and %(refl)s)
179 157 where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
180   - l.pos in ('v','subst','osc','adj','adv', 'num','advndm','fraz','comp',
181   - 'conj','interj','prep','part','ppron','pred') and
182   - variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s
183   - --and haslo < 'b'
184   -union all
185   --- wymagające gniazdowania przy hasłowaniu: adjcom, advcom, derywaty:
  158 + and variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s and
  159 + %(attr_clauses)s"""
  160 + else:
  161 + params = params_part
  162 + query = """
186 163 select distinct g.haslo as haslo, %(select)s %(clas_field)s
187   -from leksemy l
  164 + %(table_joins)s
  165 + %(clas_join)s
188 166 join odsylacze on l.id=l_id_od
189 167 join leksemy g on (l_id_do=g.id and g.usuniety = false)
190 168 left join dictionary_lexemeav refl
191 169 on (g.id = refl.lexeme_id and %(refl)s)
192   - %(table_joins)s
193   - %(clas_join)s
194 170 where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
195   - typods_id in (%(crtype_ids)s) and
196   - l.pos in ('adjcom','advcom','ger','pact','ppas','appas') and
197   - variant_id=%%s and l.status<>'cand' and l.usuniety = false
198   - --and g.haslo < 'b'
199   -order by haslo, leksem_id
200   - """ % {
201   - 'vocabs': vocabs_placeholders,
202   - 'antivocabs': antivocabs_clause,
203   - 'x_qual': qualifier_clauses,
204   - 'magic': magic_qualifier_clauses,
205   - 'crtype_ids': ', '.join(str(id) for id in crtype_ids),
206   - 'clas_field': ', classification_value_id' if data['commonness'] else '',
207   - 'select': select,
208   - 'table_joins': table_joins,
209   - 'table_clause': table_clause,
210   - 'clas_join':
211   - 'left outer join dictionary_lexemecv wkl '
212   - 'on (wkl.lexeme_id=l.id and wkl.classification_value_id in (%s))'
213   - % ', '.join(str(id) for id in cv_ids) if data['commonness'] else '',
214   - 'refl':
215   - 'refl.attribute_value_id in (%s)'
216   - % ', '.join(str(id) for id in refls),
217   - }
218   - params_part = (list(data['vocabs']) + list(data['antivocabs']) +
219   - [data['variant']])
220   - params = params_part + flatten(data['magic_qualifiers']) + params_part
221   - cursor.execute(query, params)
222   - refl = data['refl']
223   - cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
224   - for row in cursor:
225   - if data['commonness']:
226   - entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
227   - person_id, case_id, cv_id = row
228   - else:
229   - entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
230   - person_id, case_id = row
231   - form = form.lstrip('+') # odmienne postfiksy
232   - tags = None
233   - if tag == 'adja':
234   - form = form.rstrip('+')
235   - if tag == 'adjc':
236   - if form not in ADJPREDYKATYWNE:
237   - tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
238   - if refl and pos in ('v', 'pact', 'ger'):
239   - if refl_id in refls:
240   - tag += ':' + REFL_TRANSLATION[refls[refl_id]]
241   - else:
242   - debug(entry, u'Nieznana zwrotność: %s' % refl_id)
243   - if 'RODZAJ' in tag:
244   - tag = tag.replace('RODZAJ', genders[gender_id])
245   - if 'ASPEKT' in tag:
246   - tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]])
247   - if 'OSOBA' in tag:
248   - tag = tag.replace('OSOBA', persons[person_id])
249   - if 'PRZYPADEK' in tag:
250   - tag = tag.replace('PRZYPADEK', cases[case_id])
251   - tags = tags or [tag]
252   - for tag in tags:
253   - if data['commonness']:
254   - cv = cv_table[cv_id] if cv_id else ''
255   - output_file.write(
256   - (u'%s\t%s\t%s\t%s\n' % (form, entry, tag, cv)).encode('utf-8'))
257   - else:
258   - output_file.write(
259   - (u'%s\t%s\t%s\n' % (form, entry, tag)).encode('utf-8'))
  171 + and typods_id in (%(crtype_ids)s) and
  172 + variant_id=%%s and l.status<>'cand' and l.usuniety = false and
  173 + %(attr_clauses)s"""
  174 + query = query % {
  175 + 'vocabs': vocabs_placeholders,
  176 + 'antivocabs': antivocabs_clause,
  177 + 'x_qual': qualifier_clauses,
  178 + 'magic': magic_qualifier_clauses,
  179 + 'crtype_ids': ', '.join(str(id) for id in crtype_ids),
  180 + 'clas_field': ', classification_value_id'
  181 + if data['commonness'] else '',
  182 + 'select': select,
  183 + 'table_joins': table_joins,
  184 + 'table_clause': table_clause,
  185 + 'clas_join':
  186 + '''left outer join dictionary_lexemecv wkl
  187 + on (wkl.lexeme_id=l.id and
  188 + wkl.classification_value_id in (%s))'''
  189 + % ', '.join(str(id) for id in cv_ids)
  190 + if data['commonness'] else '',
  191 + 'refl':
  192 + 'refl.attribute_value_id in (%s)'
  193 + % ', '.join(str(id) for id in refls),
  194 + 'attr_clauses': ' and '.join(attr_clauses)
  195 + }
  196 + #print query
  197 + #print [tt.id] + params + list(tt_c + cell_c + cell_c)
  198 + cursor.execute(
  199 + query, [tt.id] + params + list(tt_c + cell_c + cell_c))
  200 + refl = data['refl']
  201 + cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
  202 + for row in cursor:
  203 + if data['commonness']:
  204 + entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
  205 + person_id, case_id, cv_id = row
  206 + else:
  207 + entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
  208 + person_id, case_id = row
  209 + form = form.lstrip('+') # odmienne postfiksy
  210 + tags = None
  211 + if tag == 'adja':
  212 + form = form.rstrip('+')
  213 + if tag == 'adjc':
  214 + if form not in ADJPREDYKATYWNE:
  215 + tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
  216 + if refl and pos in ('v', 'pact', 'ger'):
  217 + if refl_id in refls:
  218 + tag += ':' + REFL_TRANSLATION[refls[refl_id]]
  219 + else:
  220 + debug(entry, u'Nieznana zwrotność: %s' % refl_id)
  221 + if 'RODZAJ' in tag:
  222 + tag = tag.replace('RODZAJ', genders[gender_id])
  223 + if 'ASPEKT' in tag:
  224 + tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]])
  225 + if 'OSOBA' in tag:
  226 + tag = tag.replace('OSOBA', persons[person_id])
  227 + if 'PRZYPADEK' in tag:
  228 + tag = tag.replace('PRZYPADEK', cases[case_id])
  229 + tags = tags or [tag]
  230 + for tag in tags:
  231 + if data['commonness']:
  232 + cv = cv_table[cv_id] if cv_id else ''
  233 + line = u'%s\t%s\t%s\t%s' % (form, entry, tag, cv)
  234 + else:
  235 + line = u'%s\t%s\t%s' % (form, entry, tag)
  236 + uniprint(line, file=output_file)
260 237 \ No newline at end of file
... ...
dictionary/lexeme_form_query.py
... ... @@ -10,18 +10,31 @@ join dictionary_tabletemplate_pattern_types tt_pt
10 10 on (w.typ = tt_pt.patterntype_id and tt.id = tt_pt.tabletemplate_id)
11 11 join dictionary_tabletemplate_parts_of_speech tt_pos
12 12 on (tt.id = tt_pos.tabletemplate_id and
13   - l.pos = tt_pos.partofspeech_id)
14   -join dictionary_tablecell tc
15   - on tt.id = tc.table_template_id
16   -join dictionary_tablecell_pattern_types tc_pt
17   - on (tc.id = tc_pt.tablecell_id and w.typ = tc_pt.patterntype_id)
18   -left join dictionary_tablecell_genders tc_g
19   - on tc.id = tc_g.tablecell_id
  13 + l.pos = tt_pos.partofspeech_id)'''
  14 +
  15 +TABLE_CLAUSES = '''
  16 +join dictionary_tablecell cell
  17 + on tt.id = cell.table_template_id
  18 +join dictionary_tablecell_pattern_types cell_pt
  19 + on (cell.id = cell_pt.tablecell_id and w.typ = cell_pt.patterntype_id)
  20 +left join dictionary_tablecell_genders cell_g
  21 + on cell.id = cell_g.tablecell_id
  22 +join zakonczenia z
  23 + on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz)
  24 +'''
  25 +
  26 +EXPORT_CLAUSES = '''
  27 +join dictionary_exportcell cell
  28 + on tt.id = cell.table_template_id
  29 +join dictionary_exportcell_pattern_types cell_pt
  30 + on (cell.id = cell_pt.exportcell_id and w.typ = cell_pt.patterntype_id)
  31 +left join dictionary_exportcell_genders cell_g
  32 + on cell.id = cell_g.exportcell_id
20 33 join zakonczenia z
21   - on (o.w_id = z.w_id and tc.base_form_label_id = z.efobaz)
  34 + on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz)
22 35 '''
23 36  
24   -WHERE_CLAUSES = '(not tt.takes_gender or tc_g.gender_id = o.gender_id)'
  37 +WHERE_CLAUSES = '(not tt.takes_gender or cell_g.gender_id = o.gender_id)'
25 38  
26 39 def value_combinations(attributes, attr_vals=None):
27 40 if len(attributes) == 0:
... ... @@ -47,8 +60,8 @@ def attr_clauses_combinations(tt):
47 60 for _ in xrange(len(tt_attrs) + len(cell_attrs))]
48 61 attr_clauses += ['''
49 62 %s in (select lexemeattributevalue_id from
50   - dictionary_tablecell_attribute_values tc_attr_val
51   - where tc_attr_val.tablecell_id = tc.id)'''
  63 + dictionary_tablecell_attribute_values cell_attr_val
  64 + where cell_attr_val.tablecell_id = cell.id)'''
52 65 for _ in cell_attrs]
53 66 if not attr_clauses:
54 67 attr_clauses = ['true']
... ...
dictionary/management/commands/cache_form_qualifiers.py
... ... @@ -2,8 +2,9 @@
2 2  
3 3 from django.db import connection, transaction
4 4 from django.core.management.base import BaseCommand
  5 +from common.util import uniprint
5 6 from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \
6   - attr_clauses_combinations
  7 + attr_clauses_combinations, TABLE_CLAUSES
7 8 from dictionary.models import TableTemplate
8 9  
9 10  
... ... @@ -27,7 +28,7 @@ def create_forms():
27 28 tts = TableTemplate.objects.filter(variant_id=VARIANT).prefetch_related(
28 29 'attributes__values', 'cell_attributes__values')
29 30 for tt in tts:
30   - print 'caching ending qualifiers for: %s' % tt.name
  31 + uniprint('caching ending qualifiers for: %s' % tt.name)
31 32 attr_clauses, cell_attr_combinations, tt_attr_combinations = \
32 33 attr_clauses_combinations(tt)
33 34 for tt_c in tt_attr_combinations:
... ... @@ -37,7 +38,8 @@ def create_forms():
37 38 kz.qualifier_id as qualifier_id
38 39 %s join kwalifikatory_zakonczen kz
39 40 on (z.id = kz.ending_id)
40   - where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES)
  41 + where %s and ''' % (
  42 + FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES)
41 43 select_query += ' and '.join(attr_clauses)
42 44 cursor.execute('''
43 45 insert into dictionary_lexemeformqualifier
... ...
dictionary/management/commands/create_forms.py
... ... @@ -4,7 +4,7 @@ from django.db import connection, transaction
4 4 from django.core.management.base import BaseCommand
5 5 from common.util import uniprint
6 6 from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \
7   - attr_clauses_combinations
  7 + attr_clauses_combinations, TABLE_CLAUSES
8 8 from dictionary.models import TableTemplate
9 9  
10 10  
... ... @@ -36,7 +36,8 @@ def create_forms():
36 36 select_query = '''
37 37 select distinct l.id as lexeme_id,
38 38 tc.prefix||o.rdzen||z.zak||tc.suffix as form
39   - %s where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES)
  39 + %s where %s and ''' % (
  40 + FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES)
40 41 select_query += ' and '.join(attr_clauses)
41 42 cursor.execute('''
42 43 insert into dictionary_lexemeform (lexeme_id, form)
... ...
dictionary/management/commands/export_lexemes.py
... ... @@ -5,7 +5,6 @@ from dictionary.export import export_lexemes
5 5  
6 6  
7 7 class Command(BaseCommand):
8   - args = 'none'
9 8 help = 'Temporary export script'
10 9  
11 10 def handle(self, **options):
... ...