# clean_tweets.py -- deduplicate crawled tweets and split them per user
import json
import os
import sys
import codecs
import datetime
import gc

def get_date(json):
    """Return a tweet's creation time, or ``datetime.datetime.min`` as a sentinel.

    :param json: decoded tweet dict. (The parameter name shadows the ``json``
        module; it is kept for backward compatibility and the module is not
        used inside this function.)
    :return: the parsed ``created_at`` timestamp, or ``datetime.datetime.min``
        when the field is missing, malformed, or not a string.
    """
    if "created_at" not in json:
        return datetime.datetime.min
    try:
        # Twitter's classic timestamp format, e.g. "Mon Jan 02 03:04:05 +0000 2017".
        dt = datetime.datetime.strptime(json["created_at"], '%a %b %d %H:%M:%S +0000 %Y')
        return dt
    except (ValueError, TypeError):
        # ValueError: unparseable string; TypeError: non-string value.
        # The original bare `except:` would also have swallowed
        # KeyboardInterrupt/SystemExit, which must propagate.
        return datetime.datetime.min


if len(sys.argv) < 2:
    print "Wrong number of arguments! Try: python", sys.argv[0], "tweets_directory [new_tweets_directory]"
    sys.exit(1)

directory = sys.argv[1]
target_dir = None
if len(sys.argv) == 3:
    target_dir = sys.argv[2]

print "Stats for directory: ", directory
files = []
for root, dirs, filess in os.walk(directory):
    for f in filess:
        files.append(os.path.join(root, f))
print len(files), "files found"

i = 1
errors = {}
id2tweet_lines = {}
duplicates = 0
for file in sorted(files):
    print "Processing file", i, "out of", len(files)
    i += 1

    with codecs.open(file, encoding="utf8") as f:
        l = 0
        for line in f:
            try:
                data = json.loads(line)

                id = data["id_str"] # when no id_str is present, we get an exception and skip tweet
                data["user"]["id_str"] # when no id_str or user is present, we get an exception and skip tweet
                data["created_at"] # when no created_at is present, we get an exception and skip tweet

                if id not in id2tweet_lines:
                    id2tweet_lines[id] = []
                else:
                    duplicates += 1

                id2tweet_lines[id].append(line)

            except Exception as ex:
                if str(ex) not in errors:
                    errors[str(ex)] = 0
                errors[str(ex)] += 1

            l += 1

print "Loaded all tweets"
print len(id2tweet_lines), "tweets with distinct id loaded."
print duplicates, "tweets with duplicate id found."

gc.collect()

if target_dir is not None:
    user_id2tweet_lines = {}
    for id in sorted(id2tweet_lines):
        newest_line = id2tweet_lines[id][0]
        newest_tweet = json.loads(newest_line)
        for tweet_line in id2tweet_lines[id]:
            tweet = json.loads(tweet_line)
            if get_date(newest_tweet) < get_date(tweet):
                newest_tweet = tweet
                newest_line = tweet_line

        if get_date(newest_tweet) == datetime.datetime.min:
            print id, "has no tweet with created_at field."
            continue

        try:
            user_id = newest_tweet["user"]["id_str"]
            if user_id not in user_id2tweet_lines:
                user_id2tweet_lines[user_id] = []
            user_id2tweet_lines[user_id].append(newest_line)
        except Exception as ex:
            if str(ex) not in errors:
                errors[str(ex)] = 0
            errors[str(ex)] += 1

    gc.collect()

    # save tweets
    for user_id in user_id2tweet_lines:
        print "Saving", len(user_id2tweet_lines[user_id]), "tweets of user", user_id
        with open(os.path.join(target_dir, user_id), "w") as f:
            tweet_lines = user_id2tweet_lines[user_id]
            tweet_lines.sort(key=lambda t: long(json.loads(t)["id_str"]))
            for tweet_line in tweet_lines:
                f.write(tweet_line)


# Final report: every distinct error message with its occurrence count,
# in sorted order so runs are comparable.
print "Error counts:"
for message, count in sorted(errors.items()):
    print message, count

print "Done!"