# load_resources.py
# -*- coding:utf-8 -*-

import re
import sys
import time
from optparse import make_option

import jsonpickle
from django.core.management.base import BaseCommand
from lxml import etree
# Thrift-generated types (RequestStatus, RequestPart, ObjectRequest, TText,
# TParagraph, ...) are provided by the wildcard ttypes imports below.
from multiservice.facade import Multiservice
from multiservice.facade.ttypes import *
from multiservice.types.ttypes import *
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport

from webapp.models import Category, Expression, Meaning, Segment, Source


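# Multiservice NLP web service (IPI PAN) used to analyse expressions; the chain
# runs Concraft (morphosyntactic tagging), Spejd (shallow parsing), Nerf (named
# entity recognition) and MentionDetector (mention detection).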
PORT = 20000
HOST = 'multiservice.nlp.ipipan.waw.pl'
PROCESS_CHAIN = ['Concraft', 'Spejd', 'Nerf', 'MentionDetector']


class Command(BaseCommand):
    help = 'Load resource entries from an XML file into the system.'

    option_list = BaseCommand.option_list + (
        make_option('--path',
                    action='store',
                    dest='path',
                    type='string',
                    default='',
                    help='Path to the resource XML file'),
        make_option('--source',
                    action='store',
                    dest='source',
                    type='string',
                    default='',
                    help='Key of the source (e.g. sjp, szarada, plwn, wikidata) to attribute expressions to'),
    )

    def handle(self, *args, **options):
        load_sources_data()
        load_resource(options['source'], options['path'])


def load_sources_data():
    # Make sure the predefined data sources exist (get_or_create keeps this
    # idempotent); descriptions are kept in Polish.
    # SJP: Polish dictionary (spelling, foreign words, word games).
    Source.objects.get_or_create(key='sjp', name='SJP', url='http://sjp.pl/',
                description=u'Słownik języka polskiego, ortograficzny, wyrazów obcych i słownik do gier w jednym.')
    # szarada.net: online crossword portal.
    Source.objects.get_or_create(key='szarada', name='szarada.net', url='http://szarada.net/',
                description=u'Internetowy świat krzyżówek')
    # Słowosieć (plWordNet): semantic lexicon of Polish.
    Source.objects.get_or_create(key='plwn', name=u'Słowosieć', url='http://plwordnet.pwr.wroc.pl/',
                description=u'Słowosieć (z ang. wordnet) – to słownik semantyczny, który odzwierciedla system leksykalny języka polskiego.')
    # Wikidane (Wikidata): free, multilingual, collaborative knowledge base.
    Source.objects.get_or_create(key='wikidata', name=u'Wikidane', url='https://www.wikidata.org/',
                description=u'Wikidane, w języku angielskim Wikidata – projekt internetowy mający na celu stworzenie wolnej, otwartej, wielojęzycznej bazy różnorodnych danych. Głównym zastosowaniem tej bazy danych jest używanie jej w projektach Wikimedia Foundation, przede wszystkim w Wikipedii.')


def load_resource(name, path):
    source = Source.objects.get(key=name)
    # Stream the XML with iterparse so large resource files are not read into
    # memory at once; clear each entry after it has been processed.
    for _, element in etree.iterparse(path):
        if element.tag == 'entry':
            load_entry(source, element)
            element.clear()


def load_entry(source, entry):
    wikilink = ''
    plwn_synset = 0
    # First pass over the entry: collect the wikilink and plWordNet synset
    # identifiers attached to any of the expression elements.
    for desc in entry.getchildren():
        print desc.text
        if desc.attrib.get('wikilink'):
            wikilink = desc.attrib['wikilink']
        if desc.attrib.get('synset'):
            plwn_synset = int(desc.attrib['synset'])

    # Every entry starts with the placeholder 'unk' ('niezdefiniowana' = undefined)
    # category; load_expression may later replace it with a named-entity category.
    category, _ = Category.objects.get_or_create(key='unk', name='unk', description='niezdefiniowana')
    meaning = Meaning.objects.create(plWN_synset=plwn_synset, wikilink=wikilink, category=category)

    # Second pass: the text of every child element is an expression of this meaning.
    for desc in entry.getchildren():
        parse_and_load_expression(source, meaning, desc.text)


def parse_and_load_expression(source, meaning, expression):
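    # Send the expression to the remote Multiservice chain, poll until the request
    # finishes, and on success load the analysed tokens and mentions into the DB.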
    transport, client = getThriftTransportAndClient(HOST, PORT)
    request = createRequest(expression, PROCESS_CHAIN)
    try:
        token = client.putObjectRequest(request)
        status = None
        while status not in [RequestStatus.DONE, RequestStatus.FAILED]:
            status = client.getRequestStatus(token)
            time.sleep(0.1)
        if status == RequestStatus.DONE:
            result = client.getResultObject(token)
            load_expression(source, expression, meaning, result)
        else:
            print >> sys.stderr, client.getException(token)
    finally:
        transport.close()


def getThriftTransportAndClient(host, port):
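    # Wrap the raw socket in a buffered transport with the binary Thrift protocol;
    # if opening the connection fails, close the transport before re-raising.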
    transport = TSocket.TSocket(host, port)
    try:
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Multiservice.Client(protocol)
        transport.open()
        return (transport, client)
    except:
        transport.close()
        raise


def createRequest(text, serviceNames):
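    # Blank lines separate paragraphs of the input text; each requested service
    # becomes one step of the processing chain.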
    ttext = TText(paragraphs=[TParagraph(text=chunk)
                              for chunk in re.split(r'\n\n+', text)])
    chain = [RequestPart(serviceName=name) for name in serviceNames]
    request = ObjectRequest(ttext, chain)
    return request


def load_expression(source, expression, meaning, result):
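    # Persist the analysed expression: the largest detected mention provides the
    # candidate named-entity category, the expression is stored with its segments,
    # and a mention shorter than the whole expression is stored as a sub-expression.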
    biggest_mention = None
    biggest_mention_tokens = []
    main_category = ''
    expr_segs = []
    head = None
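    # Round-trip through jsonpickle to turn the Thrift result object into plain
    # dictionaries and lists that are easy to traverse.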
    jsonStr = jsonpickle.encode(result, unpicklable=False)
    jsonObj = jsonpickle.decode(jsonStr)
    for para in jsonObj['paragraphs']:
        for sent in para['sentences']:
            expr_segs.extend(sent['tokens'])
            for mention in sent['mentions']:
                # Keep the mention covering the most tokens; the named-entity type
                # of its head becomes the candidate category for the meaning.
                if biggest_mention is None or len(mention['childIds']) > len(biggest_mention['childIds']):
                    biggest_mention = mention
                    biggest_mention_tokens, head, category = parse_mention_info(sent, mention)
                    if category:
                        main_category = category

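    # Replace the placeholder 'unk' category only for multi-token expressions or
    # expressions starting with a capital letter (presumably proper names).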
    if main_category and (len(expr_segs) > 1 or expression[0].isupper()):
        category_obj, _ = Category.objects.get_or_create(key=main_category, name=main_category)
        meaning.category = category_obj
        meaning.save()

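    # Only add the expression if this meaning does not already have a main
    # expression with the same text.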
    if not meaning.expressions.filter(text=expression, main_expression=None).exists():
        expr_obj = Expression.objects.create(text=expression, meaning=meaning, score=0.0, NKJP_freq=0)
        expr_obj.sources.add(source)
        add_segments(expr_obj, expr_segs, head)

        if biggest_mention_tokens and len(expr_segs) != len(biggest_mention_tokens):
            mention_text = ' '.join([tok['orth'] for tok in biggest_mention_tokens])
            mention_obj = Expression.objects.create(text=mention_text, meaning=meaning, main_expression=expr_obj,
                                                    score=0.0, NKJP_freq=0)
            mention_obj.sources.add(source)
            add_segments(mention_obj, biggest_mention_tokens, head)


def parse_mention_info(sentence, mention):
    # Resolve the mention's child token ids against the sentence tokens and look
    # up the named-entity category of its (first) head token.
    tokens = []
    for token_id in mention['childIds']:
        tokens.append(next(token for token in sentence['tokens'] if token['id'] == token_id))
    head = next(token for token in sentence['tokens'] if token['id'] == mention['headIds'][0])
    category = get_category(sentence, head)
    return tokens, head, category


def get_category(sentence, mention_head):
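    # Return the type of the named entity (from the NER annotation) that covers
    # the mention head, or an empty string if the head is not part of any name.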
    for name in sentence['names']:
        if mention_head['id'] in name['childIds']:
            return name['type']
    return ''


def add_segments(expr_obj, tokens, head):
    for position, seg in enumerate(tokens, start=1):
        is_head = (seg == head)
        Segment.objects.create(position_in_expr=position, expression=expr_obj, orth=seg['orth'],
                               base=seg['chosenInterpretation']['base'], ctag=seg['chosenInterpretation']['ctag'],
                               msd=seg['chosenInterpretation']['msd'], is_head=is_head)