# sample_tweets.py (1.57 KB)
import sys
from bson.json_util import dumps
from pymongo import MongoClient


def load_user_ids(user_file):
    """Read user ids from a tab-separated text file.

    The id is the first tab-separated column of each line; any further
    columns are ignored.  Lines that are empty or contain only whitespace
    are skipped, so a blank line (for example a trailing newline at the end
    of the file) does not produce an empty id.

    Args:
        user_file: Path of the text file to read.

    Returns:
        A list of user id strings in file order.  Duplicates are kept.
    """
    ids = []
    with open(user_file, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Nothing on this line; an empty id would match no user.
                continue
            # Only the first column is needed, so split at most once.
            ids.append(stripped.split("\t", 1)[0])
    return ids


if len(sys.argv) != 5:
    print "Wrong number of arguments! Try: python", sys.argv[
        0], "mongo_database:mongo_collection target_file user_list count"
    sys.exit(1)

client = MongoClient()
spl = sys.argv[1].split(":")
tweetsDB = client[spl[0]][spl[1]]
target_file = sys.argv[2]
user_list = sys.argv[3]
count = int(sys.argv[4])

user_ids = load_user_ids(user_list)

print "Sampling from database to:", target_file, "taking", count, "tweets total from", len(user_ids), "users."

user_id2cursor = {}
i = 1
for user_id in sorted(user_ids):
    try:
        cursor = tweetsDB.find({"user.id_str": user_id, "analysis.language.value": "pl"}).sort("created_at_mongo", -1)
        cursor.batch_size(1)
        print i, "user queried."
        i += 1
        user_id2cursor[user_id] = cursor
    except Exception as ex:
        print "Error loading tweets of user", user_id, ":", ex

tweets = []
while len(tweets) < count:
    for user_id in sorted(user_ids):
        cursor = user_id2cursor[user_id]
        tweet = next(cursor, None)
        if tweet:
            print len(tweets), "tweets sampled."
            tweets.append(tweet)
            if len(tweets) >= count:
                break

with open(target_file, 'w') as f:
    f.write(dumps(tweets))

print "Saved", len(tweets), "in file:", target_file