Commit 24148c0f9d2ef1de5e525289cd3ca7bb54f09a10
1 parent
fd456679
poprawiony eksport (nieposortowany)
Showing
6 changed files
with
133 additions
and
141 deletions
common/util.py
... | ... | @@ -10,8 +10,8 @@ def uniopen(filename): |
10 | 10 | return (line.decode('utf-8').rstrip('\n') for line in open(filename)) |
11 | 11 | |
12 | 12 | |
13 | -def uniprint(text): | |
14 | - print text.encode('utf-8') | |
13 | +def uniprint(text, file=sys.stdout): | |
14 | + print >>file, text.encode('utf-8') | |
15 | 15 | |
16 | 16 | |
17 | 17 | def debug(entry, text): |
... | ... |
dictionary/export.py
... | ... | @@ -2,8 +2,9 @@ |
2 | 2 | |
3 | 3 | import sys |
4 | 4 | from django.db import connection |
5 | -from common.util import debug, flatten | |
6 | -from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender | |
5 | +from common.util import debug, flatten, uniprint | |
6 | +from dictionary.lexeme_form_query import attr_clauses_combinations, FROM_CLAUSES, EXPORT_CLAUSES, WHERE_CLAUSES | |
7 | +from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender, TableTemplate | |
7 | 8 | |
8 | 9 | ADJPREDYKATYWNE = [ |
9 | 10 | u'ciekaw', |
... | ... | @@ -40,6 +41,9 @@ ASPECT_TRANSLATION = { |
40 | 41 | } |
41 | 42 | |
42 | 43 | |
44 | +NESTED_POS = ('adjcom','advcom','ger','pact','ppas','appas') | |
45 | + | |
46 | + | |
43 | 47 | def qualifier_clause(q_id): |
44 | 48 | return '''not exists ( |
45 | 49 | select * from kwalifikatory_leksemow where lexeme_id = l.id and |
... | ... | @@ -61,7 +65,7 @@ def magic_qualifier_clause(): |
61 | 65 | def export_lexemes(data=None, output_file=None): |
62 | 66 | if not data: |
63 | 67 | data = { |
64 | - 'vocabs': ['PoliMorf'], | |
68 | + 'vocabs': ['SGJP'], | |
65 | 69 | 'antivocabs': [], |
66 | 70 | 'variant': 'Morfeusz', |
67 | 71 | 'excluding_qualifiers': [], |
... | ... | @@ -110,7 +114,7 @@ def export_lexemes(data=None, output_file=None): |
110 | 114 | "case".attribute_value_id |
111 | 115 | ''' |
112 | 116 | |
113 | - table_joins = ''' | |
117 | + table_joins = FROM_CLAUSES + EXPORT_CLAUSES + ''' | |
114 | 118 | join leksemy_w_slownikach ls on (ls.l_id = l.id) |
115 | 119 | left join dictionary_lexemeav aspect |
116 | 120 | on (l.id = aspect.lexeme_id and %(aspect)s) |
... | ... | @@ -118,22 +122,6 @@ def export_lexemes(data=None, output_file=None): |
118 | 122 | on (l.id = person.lexeme_id and %(person)s) |
119 | 123 | left join dictionary_lexemeav "case" |
120 | 124 | on (l.id = "case".lexeme_id and %(case)s) |
121 | - join odmieniasie o on (o.l_id = l.id) | |
122 | - join wzory w on (o.w_id = w.id) | |
123 | - join dictionary_tabletemplate_pattern_types tt_pt on | |
124 | - w.typ = tt_pt.patterntype_id | |
125 | - join dictionary_tabletemplate tt on | |
126 | - (tt_pt.tabletemplate_id = tt.id) | |
127 | - join dictionary_tabletemplate_parts_of_speech tt_pos on | |
128 | - (tt.id = tt_pos.tabletemplate_id and | |
129 | - l.pos = tt_pos.partofspeech_id) | |
130 | - join dictionary_exportcell ec on tt.id = ec.table_template_id | |
131 | - join dictionary_exportcell_pattern_types ec_pt on | |
132 | - (ec.id = ec_pt.exportcell_id and w.typ = ec_pt.patterntype_id) | |
133 | - left join dictionary_exportcell_genders ec_g on | |
134 | - ec.id = ec_g.exportcell_id | |
135 | - join zakonczenia z on | |
136 | - (o.w_id = z.w_id and ec.base_form_label_id = z.efobaz) | |
137 | 125 | ''' % { |
138 | 126 | 'aspect': 'aspect.attribute_value_id in (%s)' |
139 | 127 | % ', '.join(str(id) for id in aspects), |
... | ... | @@ -143,117 +131,106 @@ def export_lexemes(data=None, output_file=None): |
143 | 131 | % ', '.join(str(id) for id in cases), |
144 | 132 | } |
145 | 133 | |
146 | - table_clause = ''' | |
147 | - true = all ( | |
148 | - select attr_val.id in (select lexemeattributevalue_id from | |
149 | - dictionary_tabletemplate_attribute_values tt_attr_val | |
150 | - where tt_attr_val.tabletemplate_id = tt.id) | |
151 | - from dictionary_lexemeav lav | |
152 | - join dictionary_lexemeattributevalue attr_val | |
153 | - on lav.attribute_value_id = attr_val.id | |
154 | - join dictionary_tabletemplate_attributes tt_attr | |
155 | - on (attr_val.attribute_id = tt_attr.lexemeattribute_id and | |
156 | - tt.id = tt_attr.tabletemplate_id) | |
157 | - where lav.lexeme_id = l.id) and | |
158 | - true = all ( | |
159 | - select attr_val.id in (select lexemeattributevalue_id from | |
160 | - dictionary_exportcell_attribute_values ec_attr_val | |
161 | - where ec_attr_val.exportcell_id = ec.id) | |
162 | - from dictionary_lexemeav lav | |
163 | - join dictionary_lexemeattributevalue attr_val | |
164 | - on lav.attribute_value_id = attr_val.id | |
165 | - join dictionary_tabletemplate_cell_attributes ec_attr | |
166 | - on (attr_val.attribute_id = ec_attr.lexemeattribute_id and | |
167 | - tt.id = ec_attr.tabletemplate_id) | |
168 | - where lav.lexeme_id = l.id) and | |
169 | - ''' | |
134 | + table_clause = WHERE_CLAUSES | |
170 | 135 | |
171 | 136 | cursor = connection.cursor() |
172 | - query = """ | |
137 | + params_part = (list(data['vocabs']) + list(data['antivocabs']) + | |
138 | + [data['variant']]) | |
139 | + tts = TableTemplate.objects.filter( | |
140 | + variant_id=data['variant']).prefetch_related( | |
141 | + 'attributes__values', 'cell_attributes__values', 'parts_of_speech') | |
142 | + for tt in tts: | |
143 | + uniprint(u'exporting table: %s' % tt.name, file=sys.stderr) | |
144 | + attr_clauses, cell_attr_combinations, tt_attr_combinations = \ | |
145 | + attr_clauses_combinations(tt) | |
146 | + nested = tt.parts_of_speech.all()[0] in NESTED_POS | |
147 | + for tt_c in tt_attr_combinations: | |
148 | + for cell_c in cell_attr_combinations: | |
149 | + if not nested: | |
150 | + params = params_part + flatten(data['magic_qualifiers']) | |
151 | + query = """ | |
173 | 152 | select distinct haslo, %(select)s %(clas_field)s |
174 | -from leksemy l | |
175 | - left join dictionary_lexemeav refl | |
176 | - on (l.id = refl.lexeme_id and %(refl)s) | |
177 | 153 | %(table_joins)s |
178 | 154 | %(clas_join)s |
155 | + left join dictionary_lexemeav refl | |
156 | + on (l.id = refl.lexeme_id and %(refl)s) | |
179 | 157 | where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s |
180 | - l.pos in ('v','subst','osc','adj','adv', 'num','advndm','fraz','comp', | |
181 | - 'conj','interj','prep','part','ppron','pred') and | |
182 | - variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s | |
183 | - --and haslo < 'b' | |
184 | -union all | |
185 | --- wymagające gniazdowania przy hasłowaniu: adjcom, advcom, derywaty: | |
158 | + and variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s and | |
159 | + %(attr_clauses)s""" | |
160 | + else: | |
161 | + params = params_part | |
162 | + query = """ | |
186 | 163 | select distinct g.haslo as haslo, %(select)s %(clas_field)s |
187 | -from leksemy l | |
164 | + %(table_joins)s | |
165 | + %(clas_join)s | |
188 | 166 | join odsylacze on l.id=l_id_od |
189 | 167 | join leksemy g on (l_id_do=g.id and g.usuniety = false) |
190 | 168 | left join dictionary_lexemeav refl |
191 | 169 | on (g.id = refl.lexeme_id and %(refl)s) |
192 | - %(table_joins)s | |
193 | - %(clas_join)s | |
194 | 170 | where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s |
195 | - typods_id in (%(crtype_ids)s) and | |
196 | - l.pos in ('adjcom','advcom','ger','pact','ppas','appas') and | |
197 | - variant_id=%%s and l.status<>'cand' and l.usuniety = false | |
198 | - --and g.haslo < 'b' | |
199 | -order by haslo, leksem_id | |
200 | - """ % { | |
201 | - 'vocabs': vocabs_placeholders, | |
202 | - 'antivocabs': antivocabs_clause, | |
203 | - 'x_qual': qualifier_clauses, | |
204 | - 'magic': magic_qualifier_clauses, | |
205 | - 'crtype_ids': ', '.join(str(id) for id in crtype_ids), | |
206 | - 'clas_field': ', classification_value_id' if data['commonness'] else '', | |
207 | - 'select': select, | |
208 | - 'table_joins': table_joins, | |
209 | - 'table_clause': table_clause, | |
210 | - 'clas_join': | |
211 | - 'left outer join dictionary_lexemecv wkl ' | |
212 | - 'on (wkl.lexeme_id=l.id and wkl.classification_value_id in (%s))' | |
213 | - % ', '.join(str(id) for id in cv_ids) if data['commonness'] else '', | |
214 | - 'refl': | |
215 | - 'refl.attribute_value_id in (%s)' | |
216 | - % ', '.join(str(id) for id in refls), | |
217 | - } | |
218 | - params_part = (list(data['vocabs']) + list(data['antivocabs']) + | |
219 | - [data['variant']]) | |
220 | - params = params_part + flatten(data['magic_qualifiers']) + params_part | |
221 | - cursor.execute(query, params) | |
222 | - refl = data['refl'] | |
223 | - cv_table = dict(ClassificationValue.objects.values_list('id', 'label')) | |
224 | - for row in cursor: | |
225 | - if data['commonness']: | |
226 | - entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \ | |
227 | - person_id, case_id, cv_id = row | |
228 | - else: | |
229 | - entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \ | |
230 | - person_id, case_id = row | |
231 | - form = form.lstrip('+') # odmienne postfiksy | |
232 | - tags = None | |
233 | - if tag == 'adja': | |
234 | - form = form.rstrip('+') | |
235 | - if tag == 'adjc': | |
236 | - if form not in ADJPREDYKATYWNE: | |
237 | - tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"] | |
238 | - if refl and pos in ('v', 'pact', 'ger'): | |
239 | - if refl_id in refls: | |
240 | - tag += ':' + REFL_TRANSLATION[refls[refl_id]] | |
241 | - else: | |
242 | - debug(entry, u'Nieznana zwrotność: %s' % refl_id) | |
243 | - if 'RODZAJ' in tag: | |
244 | - tag = tag.replace('RODZAJ', genders[gender_id]) | |
245 | - if 'ASPEKT' in tag: | |
246 | - tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]]) | |
247 | - if 'OSOBA' in tag: | |
248 | - tag = tag.replace('OSOBA', persons[person_id]) | |
249 | - if 'PRZYPADEK' in tag: | |
250 | - tag = tag.replace('PRZYPADEK', cases[case_id]) | |
251 | - tags = tags or [tag] | |
252 | - for tag in tags: | |
253 | - if data['commonness']: | |
254 | - cv = cv_table[cv_id] if cv_id else '' | |
255 | - output_file.write( | |
256 | - (u'%s\t%s\t%s\t%s\n' % (form, entry, tag, cv)).encode('utf-8')) | |
257 | - else: | |
258 | - output_file.write( | |
259 | - (u'%s\t%s\t%s\n' % (form, entry, tag)).encode('utf-8')) | |
171 | + and typods_id in (%(crtype_ids)s) and | |
172 | + variant_id=%%s and l.status<>'cand' and l.usuniety = false and | |
173 | + %(attr_clauses)s""" | |
174 | + query = query % { | |
175 | + 'vocabs': vocabs_placeholders, | |
176 | + 'antivocabs': antivocabs_clause, | |
177 | + 'x_qual': qualifier_clauses, | |
178 | + 'magic': magic_qualifier_clauses, | |
179 | + 'crtype_ids': ', '.join(str(id) for id in crtype_ids), | |
180 | + 'clas_field': ', classification_value_id' | |
181 | + if data['commonness'] else '', | |
182 | + 'select': select, | |
183 | + 'table_joins': table_joins, | |
184 | + 'table_clause': table_clause, | |
185 | + 'clas_join': | |
186 | + '''left outer join dictionary_lexemecv wkl | |
187 | + on (wkl.lexeme_id=l.id and | |
188 | + wkl.classification_value_id in (%s))''' | |
189 | + % ', '.join(str(id) for id in cv_ids) | |
190 | + if data['commonness'] else '', | |
191 | + 'refl': | |
192 | + 'refl.attribute_value_id in (%s)' | |
193 | + % ', '.join(str(id) for id in refls), | |
194 | + 'attr_clauses': ' and '.join(attr_clauses) | |
195 | + } | |
196 | + #print query | |
197 | + #print [tt.id] + params + list(tt_c + cell_c + cell_c) | |
198 | + cursor.execute( | |
199 | + query, [tt.id] + params + list(tt_c + cell_c + cell_c)) | |
200 | + refl = data['refl'] | |
201 | + cv_table = dict(ClassificationValue.objects.values_list('id', 'label')) | |
202 | + for row in cursor: | |
203 | + if data['commonness']: | |
204 | + entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \ | |
205 | + person_id, case_id, cv_id = row | |
206 | + else: | |
207 | + entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \ | |
208 | + person_id, case_id = row | |
209 | + form = form.lstrip('+') # odmienne postfiksy | |
210 | + tags = None | |
211 | + if tag == 'adja': | |
212 | + form = form.rstrip('+') | |
213 | + if tag == 'adjc': | |
214 | + if form not in ADJPREDYKATYWNE: | |
215 | + tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"] | |
216 | + if refl and pos in ('v', 'pact', 'ger'): | |
217 | + if refl_id in refls: | |
218 | + tag += ':' + REFL_TRANSLATION[refls[refl_id]] | |
219 | + else: | |
219 | + debug(entry, u'Nieznana zwrotność: %s' % refl_id) | |
221 | + if 'RODZAJ' in tag: | |
222 | + tag = tag.replace('RODZAJ', genders[gender_id]) | |
223 | + if 'ASPEKT' in tag: | |
224 | + tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]]) | |
225 | + if 'OSOBA' in tag: | |
226 | + tag = tag.replace('OSOBA', persons[person_id]) | |
227 | + if 'PRZYPADEK' in tag: | |
228 | + tag = tag.replace('PRZYPADEK', cases[case_id]) | |
229 | + tags = tags or [tag] | |
230 | + for tag in tags: | |
231 | + if data['commonness']: | |
232 | + cv = cv_table[cv_id] if cv_id else '' | |
233 | + line = u'%s\t%s\t%s\t%s' % (form, entry, tag, cv) | |
234 | + else: | |
235 | + line = u'%s\t%s\t%s' % (form, entry, tag) | |
236 | + uniprint(line, file=output_file) | |
260 | 237 | \ No newline at end of file |
... | ... |
dictionary/lexeme_form_query.py
... | ... | @@ -10,18 +10,31 @@ join dictionary_tabletemplate_pattern_types tt_pt |
10 | 10 | on (w.typ = tt_pt.patterntype_id and tt.id = tt_pt.tabletemplate_id) |
11 | 11 | join dictionary_tabletemplate_parts_of_speech tt_pos |
12 | 12 | on (tt.id = tt_pos.tabletemplate_id and |
13 | - l.pos = tt_pos.partofspeech_id) | |
14 | -join dictionary_tablecell tc | |
15 | - on tt.id = tc.table_template_id | |
16 | -join dictionary_tablecell_pattern_types tc_pt | |
17 | - on (tc.id = tc_pt.tablecell_id and w.typ = tc_pt.patterntype_id) | |
18 | -left join dictionary_tablecell_genders tc_g | |
19 | - on tc.id = tc_g.tablecell_id | |
13 | + l.pos = tt_pos.partofspeech_id)''' | |
14 | + | |
15 | +TABLE_CLAUSES = ''' | |
16 | +join dictionary_tablecell cell | |
17 | + on tt.id = cell.table_template_id | |
18 | +join dictionary_tablecell_pattern_types cell_pt | |
19 | + on (cell.id = cell_pt.tablecell_id and w.typ = cell_pt.patterntype_id) | |
20 | +left join dictionary_tablecell_genders cell_g | |
21 | + on cell.id = cell_g.tablecell_id | |
22 | +join zakonczenia z | |
23 | + on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz) | |
24 | +''' | |
25 | + | |
26 | +EXPORT_CLAUSES = ''' | |
27 | +join dictionary_exportcell cell | |
28 | + on tt.id = cell.table_template_id | |
29 | +join dictionary_exportcell_pattern_types cell_pt | |
30 | + on (cell.id = cell_pt.exportcell_id and w.typ = cell_pt.patterntype_id) | |
31 | +left join dictionary_exportcell_genders cell_g | |
32 | + on cell.id = cell_g.exportcell_id | |
20 | 33 | join zakonczenia z |
21 | - on (o.w_id = z.w_id and tc.base_form_label_id = z.efobaz) | |
34 | + on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz) | |
22 | 35 | ''' |
23 | 36 | |
24 | -WHERE_CLAUSES = '(not tt.takes_gender or tc_g.gender_id = o.gender_id)' | |
37 | +WHERE_CLAUSES = '(not tt.takes_gender or cell_g.gender_id = o.gender_id)' | |
25 | 38 | |
26 | 39 | def value_combinations(attributes, attr_vals=None): |
27 | 40 | if len(attributes) == 0: |
... | ... | @@ -47,8 +60,8 @@ def attr_clauses_combinations(tt): |
47 | 60 | for _ in xrange(len(tt_attrs) + len(cell_attrs))] |
48 | 61 | attr_clauses += [''' |
49 | 62 | %s in (select lexemeattributevalue_id from |
50 | - dictionary_tablecell_attribute_values tc_attr_val | |
51 | - where tc_attr_val.tablecell_id = tc.id)''' | |
63 | + dictionary_tablecell_attribute_values cell_attr_val | |
64 | + where cell_attr_val.tablecell_id = cell.id)''' | |
52 | 65 | for _ in cell_attrs] |
53 | 66 | if not attr_clauses: |
54 | 67 | attr_clauses = ['true'] |
... | ... |
dictionary/management/commands/cache_form_qualifiers.py
... | ... | @@ -2,8 +2,9 @@ |
2 | 2 | |
3 | 3 | from django.db import connection, transaction |
4 | 4 | from django.core.management.base import BaseCommand |
5 | +from common.util import uniprint | |
5 | 6 | from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \ |
6 | - attr_clauses_combinations | |
7 | + attr_clauses_combinations, TABLE_CLAUSES | |
7 | 8 | from dictionary.models import TableTemplate |
8 | 9 | |
9 | 10 | |
... | ... | @@ -27,7 +28,7 @@ def create_forms(): |
27 | 28 | tts = TableTemplate.objects.filter(variant_id=VARIANT).prefetch_related( |
28 | 29 | 'attributes__values', 'cell_attributes__values') |
29 | 30 | for tt in tts: |
30 | - print 'caching ending qualifiers for: %s' % tt.name | |
31 | + uniprint('caching ending qualifiers for: %s' % tt.name) | |
31 | 32 | attr_clauses, cell_attr_combinations, tt_attr_combinations = \ |
32 | 33 | attr_clauses_combinations(tt) |
33 | 34 | for tt_c in tt_attr_combinations: |
... | ... | @@ -37,7 +38,8 @@ def create_forms(): |
37 | 38 | kz.qualifier_id as qualifier_id |
38 | 39 | %s join kwalifikatory_zakonczen kz |
39 | 40 | on (z.id = kz.ending_id) |
40 | - where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES) | |
41 | + where %s and ''' % ( | |
42 | + FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES) | |
41 | 43 | select_query += ' and '.join(attr_clauses) |
42 | 44 | cursor.execute(''' |
43 | 45 | insert into dictionary_lexemeformqualifier |
... | ... |
dictionary/management/commands/create_forms.py
... | ... | @@ -4,7 +4,7 @@ from django.db import connection, transaction |
4 | 4 | from django.core.management.base import BaseCommand |
5 | 5 | from common.util import uniprint |
6 | 6 | from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \ |
7 | - attr_clauses_combinations | |
7 | + attr_clauses_combinations, TABLE_CLAUSES | |
8 | 8 | from dictionary.models import TableTemplate |
9 | 9 | |
10 | 10 | |
... | ... | @@ -36,7 +36,8 @@ def create_forms(): |
36 | 36 | select_query = ''' |
37 | 37 | select distinct l.id as lexeme_id, |
38 | 38 | tc.prefix||o.rdzen||z.zak||tc.suffix as form |
39 | - %s where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES) | |
39 | + %s where %s and ''' % ( | |
40 | + FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES) | |
40 | 41 | select_query += ' and '.join(attr_clauses) |
41 | 42 | cursor.execute(''' |
42 | 43 | insert into dictionary_lexemeform (lexeme_id, form) |
... | ... |