# import_expansions.py 7.03 KB
import os

from xml.sax import handler, make_parser

from django.core.management.base import BaseCommand

from importer.Phrase import phrase_from_tree
from importer.Position import Position
from importer.WalentyXML import XMLNode
from shellvalier.settings import BASE_DIR

from phrase_expansions.models import ExpansionOpinion, PhraseExpansionType, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription

from entries.phrase_descriptions.descriptions import phrase_description2

class Command(BaseCommand):
    """Management command that runs ``import_expansions()``."""

    # Legacy attribute kept as-is for backward compatibility.
    args = 'none'
    # Shown by ``manage.py help <command>``; was previously an empty string.
    help = 'Import phrase type expansions from the Walenty XML file into the database.'

    def handle(self, **options):
        """Run the import. The parsed command-line ``options`` are not used."""
        import_expansions()

# Maps the opinion labels read from the XML source to the short keys of the
# ExpansionOpinion rows created by import_expansions().
OPINION_MAP = {
    'archaiczna': 'dat',
    'pewna': 'cer',
    'potoczna': 'col',
    'wątpliwa': 'unc',
}

def import_expansions():
    """Rebuild the phrase-expansion tables from the expansions XML file.

    The file ``data/walenty/phrase_types_expand_20200926.xml`` (resolved via
    ``BASE_DIR``) is parsed with ``ExpansionsTeiHandler``.  All existing
    ``ExpansionOpinion``, ``PhraseExpansion``, ``ExpansionPosition``,
    ``ExpansionPhrase`` and ``ExpansionPhraseDescription`` rows are deleted
    and recreated from the parsed data, together with one
    ``PhraseExpansionType`` per parsed ``(phrase_type, phrase_subtype)`` key.

    The database work runs in a single transaction, so a failure midway does
    not leave the tables emptied or partially filled.

    Prints every created expansion type and, at the end, the total number of
    imported expansions.

    Raises:
        KeyError: if an expansion carries an opinion label that is missing
            from ``OPINION_MAP``.
    """
    # Local import: keeps this function self-contained without touching the
    # module-level import block.
    from django.db import transaction

    xml_file = os.path.join(BASE_DIR, 'data', 'walenty', 'phrase_types_expand_20200926.xml')
    # When BASE_DIR is absolute, os.path.join() discards the first component
    # and xml_path equals xml_file; the prefix only matters if BASE_DIR is
    # relative.
    xml_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), xml_file)

    # Parse everything first, so a malformed file never touches the database.
    content_handler = ExpansionsTeiHandler()
    parser = make_parser()
    parser.setContentHandler(content_handler)
    parser.parse(xml_path)
    expansions = content_handler._expansions

    with transaction.atomic():
        # NOTE(review): PhraseExpansionType rows are not cleared here although
        # a new one is created for every parsed type below -- confirm whether
        # repeated runs are expected to start from an empty table.
        for cls in (ExpansionOpinion, PhraseExpansion, ExpansionPosition, ExpansionPhrase, ExpansionPhraseDescription):
            cls.objects.all().delete()

        # Keep the freshly created opinions at hand instead of querying the
        # database again for every single expansion.
        opinions_by_key = {
            key: ExpansionOpinion.objects.create(key=key, priority=priority)
            for priority, key in ((50, 'col'), (40, 'dat'), (20, 'unc'), (10, 'cer'))
        }

        for (phrase_type, phrase_subtype), exps in expansions.items():
            # objects.create() already saves the row; no extra save() needed.
            expansion_type = PhraseExpansionType.objects.create(
                phrase_type=phrase_type,
                phrase_subtype=phrase_subtype,
            )
            print(expansion_type)
            for exp_priority, (positions, opinion_label) in enumerate(exps, start=1):
                expansion = PhraseExpansion.objects.create(
                    expansion_type=expansion_type,
                    opinion=opinions_by_key[OPINION_MAP[opinion_label]],
                    priority=exp_priority,
                )
                expansion_type.max_positions = max(expansion_type.max_positions, len(positions))
                for pos_priority, phrases in enumerate(positions, start=1):
                    position = ExpansionPosition.objects.create(
                        expansion=expansion,
                        priority=pos_priority,
                    )
                    for text_rep, desc_pl, desc_en in phrases:
                        phrase = ExpansionPhrase.objects.create(position=position, text_rep=text_rep)
                        ExpansionPhraseDescription.objects.create(phrase=phrase, lang='pl', description=desc_pl)
                        ExpansionPhraseDescription.objects.create(phrase=phrase, lang='en', description=desc_en)
            # Persist the max_positions value accumulated in the loop above.
            expansion_type.save()

    print(sum(map(len, expansions.values())))

class TEIStructureError(Exception):
    """Raised when the parsed XML lacks the structure this importer expects."""


class ExpansionsTeiHandler(handler.ContentHandler):
    """SAX content handler collecting phrase expansions from ``entry`` elements.

    For every ``entry`` element an ``XMLNode`` tree is built; when the element
    closes, the tree is converted by ``get_expansions`` and stored in
    ``self._expansions``, a dict of the form::

        {(phrase_type, subtype): [(positions, opinion), ...]}

    where ``positions`` is a list of positions, each position being a list of
    ``(text_rep, desc_pl, desc_en)`` tuples.
    """

    def __init__(self):
        handler.ContentHandler.__init__(self)
        # Root node of the subtree currently (or most recently) built.
        self._subtree = None
        # Innermost open node; None when no subtree is being built.
        self._current = None
        # True while the parser is inside an ``entry`` element.
        self._constructing = False
        # Character data accumulated since the last reset.
        self._content = ""
        self._expansions = dict()

    def startElement(self, name, attrs):
        """Open a new node for ``name`` if an ``entry`` subtree is being built."""
        if name == 'entry':
            self._constructing = True
            self._content = ""
        if not self._constructing:
            return
        node = XMLNode(name, attrs, self._current)
        if self._current is None:
            self._subtree = node
        else:
            self._current.addChild(node)
        self._current = node

    def endElement(self, name):
        """Close the current node; a closed ``entry`` is converted to expansions.

        Raises:
            TEIStructureError: if an ``entry`` closes while another node of
                the subtree is still open.
        """
        if self._current is None:
            return
        self._current.setContent(self._content.strip())
        self._current = self._current._parent
        if name == 'entry':
            if self._current is not None:
                raise TEIStructureError('<entry> closed while an enclosing node is still open')
            # The entry is complete: stop building nodes until the next one.
            self._constructing = False
            typ = self._subtree._children[0]._attrs['type']
            self.get_expansions(self._subtree, typ)
        self._content = ''

    def characters(self, content):
        """Accumulate character data for the element being read."""
        self._content += content

    @staticmethod
    def _entry_subtype(tree, phrase_type):
        """Return the subtype stored in ``tree`` for ``phrase_type`` (may be None)."""
        if phrase_type == 'advp':
            return tree._children[0]._children[0]._children[0]._attrs['value']
        if phrase_type == 'xp':
            return tree._children[0]._children[0]._children[0]._children[0]._children[0]._attrs['value']
        if phrase_type == 'comprepnp':
            return tree._children[0]._children[0]._children[0]._content
        if phrase_type in ('distrp', 'possp'):
            return None
        raise TEIStructureError('unsupported phrase type: {!r}'.format(phrase_type))

    @staticmethod
    def _phrase_record(phrase, dummy_position):
        """Return the ``(text_rep, desc_pl, desc_en)`` tuple for one phrase node."""
        typ = phrase._attrs['type']
        if typ == 'adverb':
            adverb = phrase._children[0]._children[0]._attrs['value']
            return (adverb, 'przysłówek <i>{}</i>'.format(adverb), '<i>{}</i> adverb'.format(adverb))
        if typ == 'advp':
            # xp realised by advp(cat); the descriptions are placeholders.
            advpcat = phrase._children[-1]._children[0]._attrs['value']
            return ('advp({})'.format(advpcat), '???', '???')
        if typ == 'comprepnp':
            prep = phrase._children[1]._children[0]._content
            return (
                'comprepnp({})'.format(prep),
                'fraza rzeczownikowo-przyimkowa z przyimkiem złożonym <i>{}</i>'.format(prep),
                'nominal-prepositional phrase with <i>{}</i> complex preposition'.format(prep),
            )
        # Every other phrase type goes through the generic phrase machinery.
        phr = phrase_from_tree(phrase)
        desc_pl = phrase_description2(phr, dummy_position, None, 'pl')
        desc_en = phrase_description2(phr, dummy_position, None, 'en')
        return (str(phr), desc_pl, desc_en)

    def get_expansions(self, tree, phrase_type):
        """Convert one ``entry`` tree into expansions and store them.

        Args:
            tree: root node of the ``entry`` subtree.
            phrase_type: value of the ``type`` attribute of the tree's first
                child; one of ``advp``, ``xp``, ``comprepnp``, ``distrp``,
                ``possp``.

        The result is stored under ``self._expansions[(phrase_type, subtype)]``
        as a list of ``(positions, opinion)`` pairs.

        Raises:
            TEIStructureError: for an unsupported phrase type, a repeated
                ``(phrase_type, subtype)`` key, an expansion whose second
                child is neither ``phrases`` nor ``positions``, or an
                expansion without positions.
        """
        dummy_position = Position(None, None, None, None, None)
        key = (phrase_type, self._entry_subtype(tree, phrase_type))
        if key in self._expansions:
            raise TEIStructureError('duplicate expansions entry: {!r}'.format(key))

        expansions = []
        for exp in tree._children[1]._children[0]._children:
            opinion = exp._children[0]._children[0]._attrs['value']
            container = exp._children[1]
            kind = container._attrs['name']
            if kind == 'phrases':
                # A bare phrase list is treated as a single position.
                positions = [container]
            elif kind == 'positions':
                positions = [pos._children[0] for pos in container._children[0]._children]
            else:
                raise TEIStructureError('unexpected expansion content: {!r}'.format(kind))
            expansion_positions = [
                [self._phrase_record(phrase, dummy_position) for phrase in position._children[0]._children]
                for position in positions
            ]
            if not expansion_positions:
                raise TEIStructureError('expansion without positions in entry {!r}'.format(key))
            expansions.append((expansion_positions, opinion))
        self._expansions[key] = expansions