# lookup_users_ids.py 4.06 KB
import time
import sys
import json
import cPickle as pickle
import os
import datetime
import csv
import codecs
import cStringIO
import urllib

import oauth2


def load_users(path):
    """Load the distinct user names listed one per line in a UTF-8 text file.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped so that an empty name is never returned, and a leading UTF-8
    byte-order mark (if present) is dropped instead of being glued to the
    first name. A one-line summary is printed to stdout.

    Args:
        path: location of the UTF-8 encoded file, one user name per line.

    Returns:
        The distinct names, each encoded back to UTF-8 bytes, as a list
        sorted in ascending order.
    """
    userset = set()
    line_count = 0
    # "utf-8-sig" decodes plain UTF-8 as well, but also consumes a BOM.
    with codecs.open(path, 'r', encoding="utf-8-sig") as f:
        # Iterate lazily rather than materialising the file with readlines().
        for line in f:
            line_count += 1
            name = line.strip()
            if name:
                userset.add(name.encode("utf-8"))

    # One pre-formatted string: the output is the same whether ``print`` is
    # the Python 2 statement or the print() function.
    print("%d distinct user names loaded (out of %d lines)." % (len(userset), line_count))

    return sorted(userset)


def load_tokens(loc):
    """Read one set of OAuth credentials from a comma-separated tokens file.

    Blank lines and lines starting with "#" are ignored. The first remaining
    line must hold exactly four comma-separated fields, in this order:
    consumer key, consumer secret, access token key, access token secret.
    Whitespace around each field is stripped. Only that first credentials
    line is used; later lines are not read.

    Args:
        loc: path of the tokens file.

    Returns:
        A dict with the keys "CLIENT_KEY", "CLIENT_SECRET", "ATOKEN_KEY" and
        "ATOKEN_SECRET".

    Raises:
        ValueError: if the first credentials line does not have four fields,
            or if the file contains no credentials line at all.
    """
    field_names = ("CLIENT_KEY", "CLIENT_SECRET", "ATOKEN_KEY", "ATOKEN_SECRET")
    # The context manager guarantees the handle is closed, also on errors.
    with open(loc, "r") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            parts = [x.strip() for x in line.split(",")]
            if len(parts) != len(field_names):
                # Report only the field count; never echo secrets.
                raise ValueError(
                    "Expected %d comma-separated fields in tokens file, got %d"
                    % (len(field_names), len(parts)))
            return dict(zip(field_names, parts))
    raise ValueError("No credentials found in tokens file: %s" % loc)


def request(url):
    resp = {}
    try:
        consumer = oauth2.Consumer(key=tokens["CLIENT_KEY"], secret=tokens["CLIENT_SECRET"])
        token = oauth2.Token(key=tokens["ATOKEN_KEY"], secret=tokens["ATOKEN_SECRET"])
        client = oauth2.Client(consumer, token)
        resp, content = client.request(url, method="GET")
    except Exception as ex:
        print "Exception for", url, ":", ex
        resp['status'] = 404
        content = '[]'
    return resp, content


def execute_request(url, WAIT_PERIOD, max_errors):
    """Fetch ``url`` and return its decoded JSON body, retrying on failure.

    Status 200 returns the parsed body. Status 429 triggers a rate-limit
    lookup and a sleep before retrying; these retries are not counted and
    are repeated for as long as 429 keeps coming back. Any other status
    counts as an error: the function sleeps ``WAIT_PERIOD`` seconds and
    retries until ``max_errors`` errors have occurred.

    Args:
        url: the address to fetch via ``request``.
        WAIT_PERIOD: seconds to sleep after a non-200, non-429 response.
        max_errors: number of such responses after which to give up.

    Returns:
        The JSON-decoded response body.

    Raises:
        Exception: once ``max_errors`` error responses have been received.
    """
    rate_limit_url = 'https://api.twitter.com/1.1/application/rate_limit_status.json'
    error_count = 0
    while True:
        r, c = request(url)
        st = int(r['status'])
        if st == 200:
            return json.loads(c)
        elif st == 429:
            print("Checking rate limit...")
            wait_time = 60  # fallback when the reset time cannot be determined
            r, c = request(rate_limit_url)
            try:
                limits = json.loads(c)['resources']['users']['/users/show']
                if int(limits['remaining']) == 0:
                    # Clamp to at least one second: a reset time that is
                    # already in the past would otherwise give a negative
                    # duration, which time.sleep() rejects.
                    wait_time = max(int(limits['reset']) - time.time() + 1, 1)
            except (ValueError, KeyError, TypeError):
                # Best effort only: an unparsable or unexpectedly shaped
                # status body keeps the default wait. Narrowed from a bare
                # except so KeyboardInterrupt/SystemExit still propagate.
                pass
            print('Rate limit reached, waiting %i seconds' % wait_time)
            time.sleep(wait_time)
        else:
            print('Error %i, waiting %i seconds, error count: %i' % (st, WAIT_PERIOD, error_count))
            time.sleep(WAIT_PERIOD)
            error_count += 1
            if error_count >= max_errors:
                raise Exception("Max errors reached, skipping request.")


def update_data(screen_name2user_id, screen_name):
    url = 'https://api.twitter.com/1.1/users/show.json?' \
          + 'screen_name=' + screen_name + "&include_entities=false"
    try:
        response = execute_request(url, WAIT_PERIOD, 5)
        screen_name2user_id[screen_name] = response["id"]
    except Exception as ex:
        print "Exception for name:", screen_name, ":", ex

#### main ########################################################################

# constants
WAIT_PERIOD = 2  # time until retry for a failed Twitter API call

# load parameters
if len(sys.argv) != 4:
    print "Wrong number of arguments! Try: tokens_file user_names_file target_info_file"
    sys.exit()

TOKENS_FILE = sys.argv[1]
USER_NAMES_FILE = sys.argv[2]
TARGET_FILE = sys.argv[3]

# load tokens and userlist
tokens = load_tokens(TOKENS_FILE)
user_names = load_users(USER_NAMES_FILE)

screen_name2user_id = {}
wrong_users = []
c = 1
for name in user_names:
    print "Processing name", c, ":", name
    c += 1
    update_data(screen_name2user_id, name)
    if name not in screen_name2user_id:
        wrong_users.append(name)

print len(wrong_users), "users not found:"
for u in sorted(wrong_users):
    print "\t", u

with open(TARGET_FILE, "w") as f:
    for screen_name in sorted(screen_name2user_id):
        f.write(str(screen_name2user_id[screen_name])+"\t"+screen_name+"\n")

print "Done"