#!/usr/bin/python
# -*- coding: utf-8 -*-
# Copyright (C) 2014 Michał Lenart
# Modified by Bartłomiej Nitoń (2016)
# This is a Thrift client for the Multiservice platform.
# It is available under the same license as Multiservice itself.
#

import codecs
import os
import sys
import re
import jsonpickle
import json
import time
import StringIO
from optparse import OptionParser
from thrift.transport import TSocket
from multiservice.facade import Multiservice
from multiservice.facade.ttypes import *
from multiservice.types.ttypes import *

import json2tei
import tei_writer

# File-name suffix appended to an output path, keyed by output format name.
EXTENSIONS = {
    'json': '.json',
    'tei': '.xml',
    'packagedtei': '.xml',
}


def createSampleRequest(text, serviceNames):
    """Build an ObjectRequest for *text* to be run through *serviceNames*.

    The text is split into paragraphs on runs of two or more newlines;
    empty chunks are dropped. Each service name becomes one RequestPart,
    in the order given.
    """
    paragraphs = []
    for piece in re.split(r'\n\n+', text):
        # Leading/trailing blank-line runs produce empty pieces; skip them.
        if piece:
            paragraphs.append(TParagraph(text=piece))
    document = TText(paragraphs=paragraphs)

    pipeline = [RequestPart(serviceName=service) for service in serviceNames]
    return ObjectRequest(document, pipeline)


def getThriftTransportAndClient(host, port):
    """Open a buffered binary-protocol connection to *host*:*port*.

    Returns a ``(transport, client)`` tuple; the caller is responsible for
    closing the transport. If anything fails while setting up or opening
    the connection, the transport is closed and the error is re-raised.

    NOTE(review): TTransport and TBinaryProtocol are not imported by name
    in this file; presumably they arrive through the wildcard ttypes
    imports -- confirm.
    """
    sock = TSocket.TSocket(host, port)
    # Tracks whichever transport layer currently needs closing on failure.
    transport = sock
    try:
        transport = TTransport.TBufferedTransport(sock)
        client = Multiservice.Client(
            TBinaryProtocol.TBinaryProtocol(transport))
        transport.open()
    except:
        # Deliberately catch everything: clean up, then propagate unchanged.
        transport.close()
        raise
    return (transport, client)


def getResultAsJSON(result):
    """Return *result* as a pretty-printed JSON string.

    The object is first encoded with jsonpickle (``unpicklable=False``),
    then re-parsed and dumped again so that the output has sorted keys
    and a 4-space indent.
    """
    encoded = jsonpickle.encode(result, unpicklable=False)
    # json.loads parses the string directly; no file-like wrapper is needed.
    return json.dumps(json.loads(encoded), sort_keys=True, indent=4)


def go():
    """Parse the command line and run the requested processing.

    Positional arguments are the names of the services forming the
    processing chain; at least one is required. Depending on ``--input``:

    * not given        -- text is read from standard input;
    * a directory      -- every file in it is processed, and ``--output``
                          must be an existing directory;
    * any other path   -- the path is read as a single input file.

    All error messages are written to standard error.
    """
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', action='store',
                      dest='port', default=20000,
                      help='multiservice port; default: 20000')
    parser.add_option('--host', type='string', action='store',
                      dest='host', default='multiservice.nlp.ipipan.waw.pl',
                      help='multiservice host; default: multiservice.nlp.ipipan.waw.pl')
    parser.add_option('-f', '--format', type='string', action='store',
                      dest='format', default='json',
                      help='format; tei, packagedtei or json; default: json')
    parser.add_option('-o', '--output', type='string', action='store',
                      dest='output', default='',
                      help='output path; if not specified writes output to standard output')
    parser.add_option('-i', '--input', type='string', action='store',
                      dest='input', default='',
                      help='input file or dir path; if not specified standard input is used')
    (opts, args) = parser.parse_args()

    if not args:
        print >> sys.stderr, "Processing chain was not specified!"
        return

    if not opts.input:
        # process_text requires in_dir; pass it (and the output path)
        # explicitly so the standard-input route honours --output too.
        process_text(sys.stdin.read(), opts, args, False, opts.output)
    elif os.path.isdir(opts.input):
        if opts.output and os.path.isdir(opts.output):
            process_directory(opts.input, opts, args)
        else:
            print >> sys.stderr, "Output must be a directory!"
    elif os.path.exists(opts.input):
        process_file(opts.input, opts, args, False, opts.output)
    else:
        # An explicit but missing path is an error; silently falling back
        # to standard input would leave the program waiting for input.
        print >> sys.stderr, "Input path does not exist: %s" % opts.input


def process_directory(input, opts, service_names):
    input = os.path.abspath(input)
    output = os.path.abspath(opts.output)
    for filename in sorted(os.listdir(input)):
        print filename
        textname = os.path.splitext(os.path.basename(filename))[0]
        textoutput = os.path.join(output, textname)
        textinput = os.path.join(input, filename)
        process_file(textinput, opts, service_names, True, textoutput)


def process_file(input, opts, service_names, in_dir, output):
    """Read the UTF-8 file *input* and process its contents.

    *in_dir* and *output* are passed through to process_text unchanged.
    """
    # codecs.open appends 'b' to the mode itself when an encoding is given,
    # so plain 'r' is used; 'rt' would become the invalid mode 'rtb'.
    with codecs.open(input, 'r', 'utf-8') as textfile:
        text = textfile.read()
    # The file is closed before the remote request starts.
    process_text(text, opts, service_names, in_dir, output)


def process_text(text, opts, service_names, in_dir=False, output=None):
    """Send *text* through the service chain and write out the result.

    A request is submitted to the server at ``opts.host``:``opts.port``
    and its status is polled every 0.1 s until it is DONE or FAILED
    (there is no timeout). On success the result is converted according
    to ``opts.format`` ('json', 'tei' or 'packagedtei'; anything else
    falls back to JSON with a warning on standard error) and handed to
    write_result. On failure the server-side exception is printed to
    standard error. The transport is always closed.

    *in_dir* defaults to False so that callers handling a single text
    may omit it.
    """
    request = createSampleRequest(text, service_names)
    transport, client = getThriftTransportAndClient(opts.host, opts.port)
    try:
        token = client.putObjectRequest(request)
        finished = (RequestStatus.DONE, RequestStatus.FAILED)
        status = client.getRequestStatus(token)
        while status not in finished:
            # Sleep only between polls, not after the final status arrives.
            time.sleep(0.1)
            status = client.getRequestStatus(token)

        if status != RequestStatus.DONE:
            print >> sys.stderr, client.getException(token)
            return

        fmt = opts.format.lower()
        result = client.getResultObject(token)
        if fmt == 'packagedtei':
            result = json2tei.convert(result, service_names, True)
        elif fmt == 'tei':
            result = json2tei.convert(result, service_names, False)
        else:
            if fmt != 'json':
                print >> sys.stderr, "Unknown format, changed to json!"
                fmt = 'json'
            result = getResultAsJSON(result)
        write_result(result, fmt, in_dir, output)
    finally:
        transport.close()


def write_result(result, format, in_dir, output):
    """Write *result* to *output*, or to standard output if it is empty.

    For 'json' and 'packagedtei' the result is written to a single UTF-8
    file; when *in_dir* is true the extension from EXTENSIONS is appended
    to *output* first. For 'tei' the result is written into the directory
    *output*, which is created if missing; nothing is written if that
    path exists but is not a directory.
    """
    if not output:
        print >> sys.stdout, result
    elif format in ('packagedtei', 'json'):
        if in_dir:
            output = output + EXTENSIONS[format]
        # codecs.open appends 'b' to the mode itself when an encoding is
        # given, so plain 'w' is used; 'wt' would become the invalid 'wtb'.
        with codecs.open(output, 'w', 'utf-8') as outfile:
            outfile.write(result)
    elif format == 'tei' and create_and_check_tei_dir(output):
        tei_writer.write_as_tei_dir(result, output)


def create_and_check_tei_dir(output):
    """Make sure *output* is a usable directory; return True if it is.

    An existing directory is accepted as is and a missing path is created
    with os.mkdir. If the path exists but is not a directory, a message is
    printed to standard error and False is returned.
    """
    if os.path.isdir(output):
        return True
    if os.path.exists(output):
        print >> sys.stderr, 'Selected output is not a directory.'
        return False
    os.mkdir(output)
    return True


# Run the command-line client only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    go()