poprawiony eksport (nieposortowany)

janek37
1 parent fd456679
Showing 6 changed files with 133 additions and 141 deletions
common/util.py
dictionary/export.py
dictionary/lexeme_form_query.py
dictionary/management/commands/cache_form_qualifiers.py
dictionary/management/commands/create_forms.py
dictionary/management/commands/export_lexemes.py
@@ -10,8 +10,8 @@ def uniopen(filename):
     return (line.decode('utf-8').rstrip('\n') for line in open(filename))
  
  
-def uniprint(text):
-    print text.encode('utf-8')
+def uniprint(text, file=sys.stdout):
+    print >>file, text.encode('utf-8')
  
  
 def debug(entry, text):
@@ -2,8 +2,9 @@
  
 import sys
 from django.db import connection
-from common.util import debug, flatten
-from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender
+from common.util import debug, flatten, uniprint
+from dictionary.lexeme_form_query import attr_clauses_combinations, FROM_CLAUSES, EXPORT_CLAUSES, WHERE_CLAUSES
+from dictionary.models import CrossReferenceType, ClassificationValue, LexemeAttributeValue, Gender, TableTemplate
  
 ADJPREDYKATYWNE = [
     u'ciekaw',
@@ -40,6 +41,9 @@ ASPECT_TRANSLATION = {
 }
  
  
+NESTED_POS = ('adjcom','advcom','ger','pact','ppas','appas')
+
+
 def qualifier_clause(q_id):
     return '''not exists (
     select * from kwalifikatory_leksemow where lexeme_id = l.id and
@@ -61,7 +65,7 @@ def magic_qualifier_clause():
 def export_lexemes(data=None, output_file=None):
     if not data:
         data = {
-            'vocabs': ['PoliMorf'],
+            'vocabs': ['SGJP'],
             'antivocabs': [],
             'variant': 'Morfeusz',
             'excluding_qualifiers': [],
@@ -110,7 +114,7 @@ def export_lexemes(data=None, output_file=None):
         "case".attribute_value_id
     '''
  
-    table_joins = '''
+    table_joins = FROM_CLAUSES + EXPORT_CLAUSES + '''
     join leksemy_w_slownikach ls on (ls.l_id = l.id)
     left join dictionary_lexemeav aspect
         on (l.id = aspect.lexeme_id and %(aspect)s)
@@ -118,22 +122,6 @@ def export_lexemes(data=None, output_file=None):
         on (l.id = person.lexeme_id and %(person)s)
     left join dictionary_lexemeav "case"
         on (l.id = "case".lexeme_id and %(case)s)
-    join odmieniasie o on (o.l_id = l.id)
-    join wzory w on (o.w_id = w.id)
-    join dictionary_tabletemplate_pattern_types tt_pt on
-        w.typ = tt_pt.patterntype_id
-    join dictionary_tabletemplate tt on
-        (tt_pt.tabletemplate_id = tt.id)
-    join dictionary_tabletemplate_parts_of_speech tt_pos on
-        (tt.id = tt_pos.tabletemplate_id and
-            l.pos = tt_pos.partofspeech_id)
-    join dictionary_exportcell ec on tt.id = ec.table_template_id
-    join dictionary_exportcell_pattern_types ec_pt on
-        (ec.id = ec_pt.exportcell_id and w.typ = ec_pt.patterntype_id)
-    left join dictionary_exportcell_genders ec_g on
-        ec.id = ec_g.exportcell_id
-    join zakonczenia z on
-        (o.w_id = z.w_id and ec.base_form_label_id = z.efobaz)
     ''' % {
         'aspect': 'aspect.attribute_value_id in (%s)'
             % ', '.join(str(id) for id in aspects),
@@ -143,117 +131,106 @@ def export_lexemes(data=None, output_file=None):
             % ', '.join(str(id) for id in cases),
     }
  
-    table_clause = '''
-    true = all (
-        select attr_val.id in (select lexemeattributevalue_id from
-                dictionary_tabletemplate_attribute_values tt_attr_val
-            where tt_attr_val.tabletemplate_id = tt.id)
-        from dictionary_lexemeav lav
-            join dictionary_lexemeattributevalue attr_val
-                on lav.attribute_value_id = attr_val.id
-            join dictionary_tabletemplate_attributes tt_attr
-                on (attr_val.attribute_id = tt_attr.lexemeattribute_id and
-                    tt.id = tt_attr.tabletemplate_id)
-        where lav.lexeme_id = l.id) and
-    true = all (
-        select attr_val.id in (select lexemeattributevalue_id from
-                dictionary_exportcell_attribute_values ec_attr_val
-            where ec_attr_val.exportcell_id = ec.id)
-        from dictionary_lexemeav lav
-            join dictionary_lexemeattributevalue attr_val
-                on lav.attribute_value_id = attr_val.id
-            join dictionary_tabletemplate_cell_attributes ec_attr
-                on (attr_val.attribute_id = ec_attr.lexemeattribute_id and
-                    tt.id = ec_attr.tabletemplate_id)
-        where lav.lexeme_id = l.id) and
-    '''
+    table_clause = WHERE_CLAUSES
  
     cursor = connection.cursor()
-    query = """
+    params_part = (list(data['vocabs']) + list(data['antivocabs']) +
+                   [data['variant']])
+    tts = TableTemplate.objects.filter(
+        variant_id=data['variant']).prefetch_related(
+            'attributes__values', 'cell_attributes__values', 'parts_of_speech')
+    for tt in tts:
+        uniprint(u'exporting table: %s' % tt.name, file=sys.stderr)
+        attr_clauses, cell_attr_combinations, tt_attr_combinations = \
+            attr_clauses_combinations(tt)
+        nested = tt.parts_of_speech.all()[0] in NESTED_POS
+        for tt_c in tt_attr_combinations:
+            for cell_c in cell_attr_combinations:
+                if not nested:
+                    params = params_part + flatten(data['magic_qualifiers'])
+                    query = """
 select distinct haslo, %(select)s %(clas_field)s
-from leksemy l
-    left join dictionary_lexemeav refl
-        on (l.id = refl.lexeme_id and %(refl)s)
     %(table_joins)s
     %(clas_join)s
+    left join dictionary_lexemeav refl
+        on (l.id = refl.lexeme_id and %(refl)s)
 where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
-    l.pos in ('v','subst','osc','adj','adv', 'num','advndm','fraz','comp',
-              'conj','interj','prep','part','ppron','pred') and
-    variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s
-    --and haslo < 'b'
-union all
--- wymagające gniazdowania przy hasłowaniu: adjcom, advcom, derywaty:
+    and variant_id=%%s and l.status<>'cand' and l.usuniety = false %(magic)s and
+    %(attr_clauses)s"""
+                else:
+                    params = params_part
+                    query = """
 select distinct g.haslo as haslo, %(select)s %(clas_field)s
-from leksemy l
+    %(table_joins)s
+    %(clas_join)s
     join odsylacze on l.id=l_id_od
     join leksemy g on (l_id_do=g.id and g.usuniety = false)
     left join dictionary_lexemeav refl
         on (g.id = refl.lexeme_id and %(refl)s)
-    %(table_joins)s
-    %(clas_join)s
 where ls.slownik in (%(vocabs)s) and %(antivocabs)s %(x_qual)s %(table_clause)s
-    typods_id in (%(crtype_ids)s) and
-    l.pos in ('adjcom','advcom','ger','pact','ppas','appas') and
-    variant_id=%%s and l.status<>'cand' and l.usuniety = false
-    --and g.haslo < 'b'
-order by haslo, leksem_id
-    """ % {
-        'vocabs': vocabs_placeholders,
-        'antivocabs': antivocabs_clause,
-        'x_qual': qualifier_clauses,
-        'magic': magic_qualifier_clauses,
-        'crtype_ids': ', '.join(str(id) for id in crtype_ids),
-        'clas_field': ', classification_value_id' if data['commonness'] else '',
-        'select': select,
-        'table_joins': table_joins,
-        'table_clause': table_clause,
-        'clas_join':
-            'left outer join dictionary_lexemecv wkl '
-            'on (wkl.lexeme_id=l.id and wkl.classification_value_id in (%s))'
-            % ', '.join(str(id) for id in cv_ids) if data['commonness'] else '',
-        'refl':
-            'refl.attribute_value_id in (%s)'
-            % ', '.join(str(id) for id in refls),
-    }
-    params_part = (list(data['vocabs']) + list(data['antivocabs']) +
-                   [data['variant']])
-    params = params_part + flatten(data['magic_qualifiers']) + params_part
-    cursor.execute(query, params)
-    refl = data['refl']
-    cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
-    for row in cursor:
-        if data['commonness']:
-            entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
-                person_id, case_id, cv_id = row
-        else:
-            entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
-                person_id, case_id = row
-        form = form.lstrip('+') # odmienne postfiksy
-        tags = None
-        if tag == 'adja':
-            form = form.rstrip('+')
-        if tag == 'adjc':
-            if form not in ADJPREDYKATYWNE:
-                tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
-        if refl and pos in ('v', 'pact', 'ger'):
-            if refl_id in refls:
-                tag += ':' + REFL_TRANSLATION[refls[refl_id]]
-            else:
-                debug(entry, u'Nieznana zwrotność: %s' % refl_id)
-        if 'RODZAJ' in tag:
-            tag = tag.replace('RODZAJ', genders[gender_id])
-        if 'ASPEKT' in tag:
-            tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]])
-        if 'OSOBA' in tag:
-            tag = tag.replace('OSOBA', persons[person_id])
-        if 'PRZYPADEK' in tag:
-            tag = tag.replace('PRZYPADEK', cases[case_id])
-        tags = tags or [tag]
-        for tag in tags:
-            if data['commonness']:
-                cv = cv_table[cv_id] if cv_id else ''
-                output_file.write(
-                    (u'%s\t%s\t%s\t%s\n' % (form, entry, tag, cv)).encode('utf-8'))
-            else:
-                output_file.write(
-                    (u'%s\t%s\t%s\n' % (form, entry, tag)).encode('utf-8'))
+    and typods_id in (%(crtype_ids)s) and
+    variant_id=%%s and l.status<>'cand' and l.usuniety = false and
+    %(attr_clauses)s"""
+                query = query % {
+                    'vocabs': vocabs_placeholders,
+                    'antivocabs': antivocabs_clause,
+                    'x_qual': qualifier_clauses,
+                    'magic': magic_qualifier_clauses,
+                    'crtype_ids': ', '.join(str(id) for id in crtype_ids),
+                    'clas_field': ', classification_value_id'
+                        if data['commonness'] else '',
+                    'select': select,
+                    'table_joins': table_joins,
+                    'table_clause': table_clause,
+                    'clas_join':
+                        '''left outer join dictionary_lexemecv wkl
+                            on (wkl.lexeme_id=l.id and
+                                wkl.classification_value_id in (%s))'''
+                        % ', '.join(str(id) for id in cv_ids)
+                            if data['commonness'] else '',
+                    'refl':
+                        'refl.attribute_value_id in (%s)'
+                        % ', '.join(str(id) for id in refls),
+                    'attr_clauses': ' and '.join(attr_clauses)
+                }
+                #print query
+                #print [tt.id] + params + list(tt_c + cell_c + cell_c)
+                cursor.execute(
+                    query, [tt.id] + params + list(tt_c + cell_c + cell_c))
+                refl = data['refl']
+                cv_table = dict(ClassificationValue.objects.values_list('id', 'label'))
+                for row in cursor:
+                    if data['commonness']:
+                        entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
+                            person_id, case_id, cv_id = row
+                    else:
+                        entry, form, pos, tag, _id, refl_id, gender_id, aspect_id, \
+                            person_id, case_id = row
+                    form = form.lstrip('+') # odmienne postfiksy
+                    tags = None
+                    if tag == 'adja':
+                        form = form.rstrip('+')
+                    if tag == 'adjc':
+                        if form not in ADJPREDYKATYWNE:
+                            tags = ["adj:sg:nom:m1.m2.m3:pos", "adj:sg:acc:m3:pos"]
+                    if refl and pos in ('v', 'pact', 'ger'):
+                        if refl_id in refls:
+                            tag += ':' + REFL_TRANSLATION[refls[refl_id]]
+                        else:
+                            debug(entry, u'Nieznana zwrotność: %s' % refl_id)
+                    if 'RODZAJ' in tag:
+                        tag = tag.replace('RODZAJ', genders[gender_id])
+                    if 'ASPEKT' in tag:
+                        tag = tag.replace('ASPEKT', ASPECT_TRANSLATION[aspects[aspect_id]])
+                    if 'OSOBA' in tag:
+                        tag = tag.replace('OSOBA', persons[person_id])
+                    if 'PRZYPADEK' in tag:
+                        tag = tag.replace('PRZYPADEK', cases[case_id])
+                    tags = tags or [tag]
+                    for tag in tags:
+                        if data['commonness']:
+                            cv = cv_table[cv_id] if cv_id else ''
+                            line = u'%s\t%s\t%s\t%s' % (form, entry, tag, cv)
+                        else:
+                            line = u'%s\t%s\t%s' % (form, entry, tag)
+                        uniprint(line, file=output_file)
 \ No newline at end of file
@@ -10,18 +10,31 @@ join dictionary_tabletemplate_pattern_types tt_pt
     on (w.typ = tt_pt.patterntype_id and tt.id = tt_pt.tabletemplate_id)
 join dictionary_tabletemplate_parts_of_speech tt_pos
     on (tt.id = tt_pos.tabletemplate_id and
-        l.pos = tt_pos.partofspeech_id)
-join dictionary_tablecell tc
-    on tt.id = tc.table_template_id
-join dictionary_tablecell_pattern_types tc_pt
-    on (tc.id = tc_pt.tablecell_id and w.typ = tc_pt.patterntype_id)
-left join dictionary_tablecell_genders tc_g
-    on tc.id = tc_g.tablecell_id
+        l.pos = tt_pos.partofspeech_id)'''
+
+TABLE_CLAUSES = '''
+join dictionary_tablecell cell
+    on tt.id = cell.table_template_id
+join dictionary_tablecell_pattern_types cell_pt
+    on (cell.id = cell_pt.tablecell_id and w.typ = cell_pt.patterntype_id)
+left join dictionary_tablecell_genders cell_g
+    on cell.id = cell_g.tablecell_id
+join zakonczenia z
+    on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz)
+'''
+
+EXPORT_CLAUSES = '''
+join dictionary_exportcell cell
+    on tt.id = cell.table_template_id
+join dictionary_exportcell_pattern_types cell_pt
+    on (cell.id = cell_pt.exportcell_id and w.typ = cell_pt.patterntype_id)
+left join dictionary_exportcell_genders cell_g
+    on cell.id = cell_g.exportcell_id
 join zakonczenia z
-    on (o.w_id = z.w_id and tc.base_form_label_id = z.efobaz)
+    on (o.w_id = z.w_id and cell.base_form_label_id = z.efobaz)
 '''
  
-WHERE_CLAUSES = '(not tt.takes_gender or tc_g.gender_id = o.gender_id)'
+WHERE_CLAUSES = '(not tt.takes_gender or cell_g.gender_id = o.gender_id)'
  
 def value_combinations(attributes, attr_vals=None):
     if len(attributes) == 0:
@@ -47,8 +60,8 @@ def attr_clauses_combinations(tt):
         for _ in xrange(len(tt_attrs) + len(cell_attrs))]
     attr_clauses += ['''
             %s in (select lexemeattributevalue_id from
-            dictionary_tablecell_attribute_values tc_attr_val
-            where tc_attr_val.tablecell_id = tc.id)'''
+            dictionary_tablecell_attribute_values cell_attr_val
+            where cell_attr_val.tablecell_id = cell.id)'''
         for _ in cell_attrs]
     if not attr_clauses:
         attr_clauses = ['true']
@@ -2,8 +2,9 @@
  
 from django.db import connection, transaction
 from django.core.management.base import BaseCommand
+from common.util import uniprint
 from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \
-    attr_clauses_combinations
+    attr_clauses_combinations, TABLE_CLAUSES
 from dictionary.models import TableTemplate
  
  
@@ -27,7 +28,7 @@ def create_forms():
     tts = TableTemplate.objects.filter(variant_id=VARIANT).prefetch_related(
         'attributes__values', 'cell_attributes__values')
     for tt in tts:
-        print 'caching ending qualifiers for: %s' % tt.name
+        uniprint('caching ending qualifiers for: %s' % tt.name)
         attr_clauses, cell_attr_combinations, tt_attr_combinations = \
             attr_clauses_combinations(tt)
         for tt_c in tt_attr_combinations:
@@ -37,7 +38,8 @@ def create_forms():
                         kz.qualifier_id as qualifier_id
                     %s join kwalifikatory_zakonczen kz
                         on (z.id = kz.ending_id)
-                    where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES)
+                    where %s and ''' % (
+                        FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES)
                 select_query += ' and '.join(attr_clauses)
                 cursor.execute('''
                     insert into dictionary_lexemeformqualifier
@@ -4,7 +4,7 @@ from django.db import connection, transaction
 from django.core.management.base import BaseCommand
 from common.util import uniprint
 from dictionary.lexeme_form_query import FROM_CLAUSES, WHERE_CLAUSES, \
-    attr_clauses_combinations
+    attr_clauses_combinations, TABLE_CLAUSES
 from dictionary.models import TableTemplate
  
  
@@ -36,7 +36,8 @@ def create_forms():
                 select_query = '''
                     select distinct l.id as lexeme_id,
                         tc.prefix||o.rdzen||z.zak||tc.suffix as form
-                    %s where %s and ''' % (FROM_CLAUSES, WHERE_CLAUSES)
+                    %s where %s and ''' % (
+                        FROM_CLAUSES + TABLE_CLAUSES, WHERE_CLAUSES)
                 select_query += ' and '.join(attr_clauses)
                 cursor.execute('''
                     insert into dictionary_lexemeform (lexeme_id, form)
@@ -5,7 +5,6 @@ from dictionary.export import export_lexemes
  
  
 class Command(BaseCommand):
-    args = 'none'
     help = 'Temporary export script'
  
     def handle(self, **options):