# -*- coding:utf-8 -*-
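"""Django management command that loads a paraphrase resource into Periphraser.

Every XML <entry> is run through the Multiservice NLP chain (Concraft, Spejd,
Nerf, MentionDetector) at multiservice.nlp.ipipan.waw.pl and the resulting
expressions are stored as Expression/Segment objects linked to their Source.

Example invocation (assuming the file lives under management/commands/ as
load_resources.py; the resource path is only a placeholder and --source must
be one of the keys created in load_sources_data):

    python manage.py load_resources --source=plwn --path=/path/to/resource.xml
"""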

import re
import sys
import time
from optparse import make_option

import jsonpickle
from django.core.management.base import BaseCommand
from multiservice.facade import Multiservice
from multiservice.facade.ttypes import *
from multiservice.types.ttypes import *
from lxml import etree
from thrift.protocol import TBinaryProtocol
from thrift.transport import TSocket, TTransport

from webapp.models import Category, Expression, Segment, Source, \
    SourceLink, get_or_create_meaning


PORT = 20000
HOST = 'multiservice.nlp.ipipan.waw.pl'
PROCESS_CHAIN = ['Concraft', 'Spejd', 'Nerf', 'MentionDetector']

EXPR_DELIMITERS = [',']


class Command(BaseCommand):
    help = 'Load target resource to Periphraser.'

    option_list = BaseCommand.option_list + (
        make_option('--path',
                    action='store',
                    dest='path',
                    type='str',
                    default='',
                    help='Path to the resource XML file'),
        make_option('--source',
                    action='store',
                    dest='source',
                    type='str',
                    default='',
                    help='Source key of the resource (e.g. sjp, plwn)'),
    )

    def handle(self, *args, **options):
        load_sources_data()
        load_resource(options['source'], options['path'])


def load_sources_data():
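    """Create the predefined Source entries (SJP, szarada.net, Słowosieć,
    Wikidane) if they are not in the database yet."""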
    Source.objects.get_or_create(key='sjp', name='SJP', url='http://sjp.pl/',
                                 description=u'Słownik języka polskiego, ortograficzny, wyrazów obcych i słownik do gier w jednym.')
    Source.objects.get_or_create(key='szarada', name='szarada.net', url='http://szarada.net/',
                                 description=u'Internetowy świat krzyżówek')
    Source.objects.get_or_create(key='plwn', name=u'Słowosieć', url='http://plwordnet.pwr.wroc.pl/',
                                 description=u'Słowosieć (z ang. wordnet) – to słownik semantyczny, który odzwierciedla system leksykalny języka polskiego.')
    Source.objects.get_or_create(key='wikidata', name=u'Wikidane', url='https://www.wikidata.org/',
                                 description=u'Wikidane, w języku angielskim Wikidata – projekt internetowy mający na celu stworzenie wolnej, otwartej, wielojęzycznej bazy różnorodnych danych. Głównym zastosowaniem tej bazy danych jest używanie jej w projektach Wikimedia Foundation, przede wszystkim w Wikipedii.')


def load_resource(name, path):
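    """Iterate over the resource XML at `path` and load every <entry>
    that carries more than one description."""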
    source = Source.objects.get(key=name)
    for _, element in etree.iterparse(path):
        if element.tag == 'entry' and descriptions_exists(element):
            load_entry(source, element)


def descriptions_exists(entry):
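    """An entry qualifies for loading only when it has more than one child
    description element."""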
    return len(entry) > 1


def load_entry(source, entry):
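    """Create (or fetch) the Meaning for the entry's synset/wikilink, load
    each non-label description as an expression, and drop meanings that end
    up with fewer than two expressions."""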
    wikilink = ''
    plwn_synset = 0
    for desc in entry:
        if 'wikilink' in desc.attrib and desc.attrib['wikilink']:
            wikilink = desc.attrib['wikilink']
        if 'synset' in desc.attrib and desc.attrib['synset']:
            plwn_synset = int(desc.attrib['synset'])

    meaning, _ = get_or_create_meaning(plwn_synset, wikilink)
    for desc in entry:
        if desc_is_label(desc, source):
            continue
        print desc.text
        parse_and_load_expression(source, meaning, desc)
    if meaning.expressions.count() < 2:
        meaning.delete()


def desc_is_label(desc, source):
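    """Return True for Wikidata descriptions that merely label meta pages
    (disambiguation pages, categories, project lists) instead of describing
    the entry itself."""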
    if source.key == 'wikidata':
        expr = desc.text.lower()
        if expr.startswith(u'strona ujednoznaczniająca'):
            return True
        elif expr.startswith(u'kategoria'):
            return True
        elif expr.startswith(u'lista projektu'):
            return True
        elif expr.startswith(u'lista w projekcie'):
            return True
    return False


def parse_and_load_expression(source, meaning, desc):
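    """Send the description text through the Multiservice processing chain
    and load the analysed expression; abort the whole import if the remote
    request fails."""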
    transport, client = getThriftTransportAndClient(HOST, PORT)
    expression = desc.text
    request = createRequest(expression, PROCESS_CHAIN)
    try:
        token = client.putObjectRequest(request)
        status = None
        while status not in [RequestStatus.DONE, RequestStatus.FAILED]:
            status = client.getRequestStatus(token)
            time.sleep(0.1)
        if status == RequestStatus.DONE:
            result = client.getResultObject(token)
            load_expression(source, desc, meaning, result)
        else:
            print >> sys.stderr, client.getException(token)
            sys.exit("Stopped loading data!")
    finally:
        transport.close()


def getThriftTransportAndClient(host, port):
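    """Open a buffered Thrift transport to the Multiservice host and return
    it together with a Multiservice client."""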
    transport = TSocket.TSocket(host, port)
    try:
        transport = TTransport.TBufferedTransport(transport)
        protocol = TBinaryProtocol.TBinaryProtocol(transport)
        client = Multiservice.Client(protocol)
        transport.open()
        return (transport, client)
    except:
        transport.close()
        raise


def createRequest(text, serviceNames):
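    """Wrap `text` in a TText (one TParagraph per blank-line-separated chunk)
    and build an ObjectRequest for the given chain of services."""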
    ttext = TText(paragraphs=[TParagraph(text=chunk)
                              for chunk in re.split(r'\n\n+', text)])
    chain = [RequestPart(serviceName=name) for name in serviceNames]
    request = ObjectRequest(ttext, chain)
    return request


def load_expression(source, desc, meaning, result):
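    """Convert the Thrift result to plain dicts and save the expression:
    split on delimiters into simpler expressions when each part maps to a
    detected mention, otherwise store it whole with its biggest mention."""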
    jsonStr = jsonpickle.encode(result, unpicklable=False)
    jsonObj = jsonpickle.decode(jsonStr)

    simpler_exprs = []
    detected_mentions = get_detected_mentions(jsonObj)
    if detected_mentions:
        simpler_exprs = split_expr(jsonObj)
    if simpler_exprs:
        for expr in simpler_exprs:
            save_expression(source=source, desc=desc, meaning=meaning,
                            nerf_category=expr['category'],
                            expr_segs=expr['tokens'],
                            biggest_mention_tokens=expr['tokens'],
                            head=expr['head'])
    else:
        main_category, expr_segs, head, biggest_mention_tokens = get_expr_info(jsonObj)
        save_expression(source, desc, meaning, main_category,
                        expr_segs, biggest_mention_tokens, head)


def get_detected_mentions(jsonObj):
    mentions = []
    for para in jsonObj['paragraphs']:
        for sent in para['sentences']:
            for mnt in sent['mentions']:
                mentions.append(mnt)
    return mentions


def split_expr(jsonObj):
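    """Split the analysed text on EXPR_DELIMITERS and return one entry per
    part (tokens, head, category). Every part must correspond exactly to a
    detected mention; otherwise return an empty list."""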
    mentions = []
    for para in jsonObj['paragraphs']:
        for sent in para['sentences']:
            expr_tokens = []
            for tok in sent['tokens']:
                if tok['orth'] in EXPR_DELIMITERS:
                    match = get_matching_mention(sent, expr_tokens)
                    if match:
                        tokens, head, category = parse_mention_info(sent, match)
                        mentions.append({'tokens': tokens,
                                         'head': head,
                                         'category': category})
                    else:
                        return []
                    expr_tokens = []
                elif tok['chosenInterpretation']['ctag'] != 'interp':
                    expr_tokens.append(tok)
            if expr_tokens:
                match = get_matching_mention(sent, expr_tokens)
                if match:
                    tokens, head, category = parse_mention_info(sent, match)
                    mentions.append({'tokens': tokens,
                                     'head': head,
                                     'category': category})
                else:
                    return []
    return mentions


def get_matching_mention(sent, tokens_to_match):
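    """Return the mention whose non-punctuation token ids are exactly the
    ids of `tokens_to_match`, or None when there is no such mention."""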
    tokens_to_match_ids = get_tokens_ids(tokens_to_match)
    for mention in sent['mentions']:
        tokens, _, _ = parse_mention_info(sent, mention)
        tokens_ids = get_tokens_ids(tokens)
        if set(tokens_ids) == set(tokens_to_match_ids):
            return mention
    return None


def get_tokens_ids(tokens):
    return [tok['id'] for tok in tokens if tok['chosenInterpretation']['ctag'] != 'interp']


def get_expr_info(jsonObj):
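    """Collect all tokens of the expression and find the largest detected
    mention; return the main category, all tokens, the mention head and the
    mention's tokens."""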
    biggest_mention = None
    biggest_mention_tokens = []
    main_category = ''
    expr_segs = []
    head = None
    for para in jsonObj['paragraphs']:
        for sent in para['sentences']:
            expr_segs.extend(sent['tokens'])
            for mention in sent['mentions']:
                if (biggest_mention is None or
                        len(mention['childIds']) > len(biggest_mention['childIds'])):
                    biggest_mention = mention
                    biggest_mention_tokens, head, category = parse_mention_info(sent, mention)
                    if category:
                        main_category = category
    return main_category, expr_segs, head, biggest_mention_tokens


def parse_mention_info(sentence, mention):
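    """Resolve a mention's child and head ids to token dicts and look up the
    named-entity category of its head."""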
    tokens = []
    for token_id in mention['childIds']:
        tokens.append(next(token for token in sentence['tokens'] if token['id'] == token_id))
    head = next(token for token in sentence['tokens'] if token['id'] == mention['headIds'][0])
    category = get_category(sentence, head)
    return tokens, head, category


def get_category(sentence, mention_head):
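    """Return the type of the Nerf name that contains the mention head, or
    an empty string when the head is not part of any recognised name."""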
    for name in sentence['names']:
        if mention_head['id'] in name['childIds']:
            return name['type']
    return ''


def save_expression(source, desc, meaning, nerf_category,
                    expr_segs, biggest_mention_tokens, head):
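    """Attach the description categories (plus the Nerf category for
    multi-token or capitalised expressions) to the meaning and create the
    Expression with its Segments and SourceLink; when the biggest mention is
    shorter than the whole expression, store it as an extra sub-expression."""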
    expression = get_expr_text(expr_segs)
    expression_upper = expression.upper()

    categories = []
    if desc.attrib['categories']:
        categories = desc.attrib['categories'].split(';')
    if nerf_category and (len(expr_segs) > 1 or expression[0].isupper()):
        categories.append(nerf_category)
    meaning.add_categories(categories)

    if not meaning.expressions.filter(text=expression_upper, main_expression=None).exists():
        exact_link = u''
        if 'entrylink' in desc.attrib:
            exact_link = desc.attrib['entrylink']

        is_catchword = str2boolean(desc.attrib['catchword']) or len(expr_segs) == 1
        expr_obj = Expression.objects.create(text=expression_upper, meaning=meaning,
                                             score=0.0, NKJP_freq=0, is_catchword=is_catchword)
        add_segments(expr_obj, expr_segs, head)
        SourceLink.objects.create(source=source, exact_link=exact_link, expression=expr_obj)

        if biggest_mention_tokens and len(expr_segs) != len(biggest_mention_tokens):
            mention_text = get_expr_text(biggest_mention_tokens)
            mention_text_upper = mention_text.upper()
            mention_obj = Expression.objects.create(text=mention_text_upper, meaning=meaning,
                                                    main_expression=expr_obj,
                                                    score=0.0, NKJP_freq=0, is_catchword=False)
            add_segments(mention_obj, biggest_mention_tokens, head)


def get_expr_text(tokens):
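    """Rebuild the surface text from tokens, inserting a space before every
    token unless noPrecedingSpace is set."""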
    expr = ''
    for tok in tokens:
        if tok['noPrecedingSpace']:
            expr += tok['orth']
        else:
            expr += ' %s' % tok['orth']
    return expr


def str2boolean(bool_str):
    return bool_str == 'true'


def add_segments(expr_obj, tokens, head):
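    """Store one Segment per token with its chosen interpretation, marking
    the mention head."""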
    for position, seg in enumerate(tokens, 1):
        is_head = seg == head
        Segment.objects.create(position_in_expr=position, expression=expr_obj,
                               orth=seg['orth'], base=seg['chosenInterpretation']['base'],
                               ctag=seg['chosenInterpretation']['ctag'],
                               msd=seg['chosenInterpretation']['msd'], is_head=is_head)