# clean_tweets.py
import json
import os
import sys
import codecs
import datetime
import gc
def get_date(tweet):
    """Return the tweet's creation time as a ``datetime``.

    Parses Twitter's ``created_at`` format (e.g.
    ``"Mon Jan 01 00:00:00 +0000 2018"``). Falls back to
    ``datetime.datetime.min`` when the field is absent, malformed, or not a
    string, so callers can safely compare results to pick the newest copy.
    """
    if "created_at" not in tweet:
        return datetime.datetime.min
    try:
        # Twitter API timestamp; offset is always +0000 (UTC) in this format.
        return datetime.datetime.strptime(tweet["created_at"], '%a %b %d %H:%M:%S +0000 %Y')
    except (ValueError, TypeError):
        # Was a bare except; narrowed to the failures strptime can raise
        # (bad format string value, or a non-string created_at).
        return datetime.datetime.min
# ---- command-line handling (Python 2 script) ----
# Usage: python clean_tweets.py tweets_directory [new_tweets_directory]
# With only the input directory given, the script runs in stats-only mode;
# with a second directory, deduplicated tweets are written there too.
if len(sys.argv) < 2:
    print "Wrong number of arguments! Try: python", sys.argv[0], "tweets_directory [new_tweets_directory]"
    sys.exit(1)
directory = sys.argv[1]
target_dir = None  # stays None => nothing is written in pass 2
if len(sys.argv) == 3:
    target_dir = sys.argv[2]
print "Stats for directory: ", directory
# Recursively collect every file path under the input directory.
files = []
for root, dirs, filess in os.walk(directory):
    for f in filess:
        files.append(os.path.join(root, f))
print len(files), "files found"
i = 1  # 1-based progress counter for the processing loop below
errors = {}  # error message (str) -> occurrence count
id2tweet_lines = {}  # tweet id_str -> list of raw JSON lines seen for that id
duplicates = 0  # number of lines whose id_str had already been seen
# ---- pass 1: read every line of every file, grouping raw lines by tweet id ----
# NOTE(review): loop variables 'file', 'id' and 'l' shadow Python builtins;
# left untouched here. 'l' is incremented but never read afterwards.
for file in sorted(files):
    print "Processing file", i, "out of", len(files)
    i += 1
    with codecs.open(file, encoding="utf8") as f:
        l = 0
        for line in f:
            try:
                data = json.loads(line)
                id = data["id_str"] # when no id_str is present, we get an exception and skip tweet
                data["user"]["id_str"] # when no id_str or user is present, we get an exception and skip tweet
                data["created_at"] # when no created_at is present, we get an exception and skip tweet
                if id not in id2tweet_lines:
                    id2tweet_lines[id] = []
                else:
                    duplicates += 1
                # Keep the raw line (not the parsed dict) so tweets can be
                # re-written verbatim later; duplicate copies are kept too —
                # the newest one is selected in pass 2.
                id2tweet_lines[id].append(line)
            except Exception as ex:
                # Deliberate best-effort: tally failures by message, keep going.
                if str(ex) not in errors:
                    errors[str(ex)] = 0
                errors[str(ex)] += 1
            l += 1
print "Loaded all tweets"
print len(id2tweet_lines), "tweets with distinct id loaded."
print duplicates, "tweets with duplicate id found."
gc.collect()  # release parse temporaries before the memory-heavy second pass
# ---- pass 2 (only when an output directory was given) ----
# For each tweet id keep the copy with the latest created_at, group the
# survivors by author, then write one file per user id containing that
# user's tweets sorted by ascending numeric tweet id.
if target_dir is not None:
    user_id2tweet_lines = {}  # user id_str -> raw lines of that user's kept tweets
    for id in sorted(id2tweet_lines):
        # Select the duplicate copy with the latest created_at timestamp.
        newest_line = id2tweet_lines[id][0]
        newest_tweet = json.loads(newest_line)
        for tweet_line in id2tweet_lines[id]:
            tweet = json.loads(tweet_line)
            if get_date(newest_tweet) < get_date(tweet):
                newest_tweet = tweet
                newest_line = tweet_line
        # get_date returns datetime.min for a missing/unparseable created_at;
        # such tweets are reported and dropped entirely.
        if get_date(newest_tweet) == datetime.datetime.min:
            print id, "has no tweet with created_at field."
            continue
        try:
            user_id = newest_tweet["user"]["id_str"]
            if user_id not in user_id2tweet_lines:
                user_id2tweet_lines[user_id] = []
            user_id2tweet_lines[user_id].append(newest_line)
        except Exception as ex:
            # Same error-tallying scheme as pass 1.
            if str(ex) not in errors:
                errors[str(ex)] = 0
            errors[str(ex)] += 1
    gc.collect()
    # save tweets
    for user_id in user_id2tweet_lines:
        print "Saving", len(user_id2tweet_lines[user_id]), "tweets of user", user_id
        # Output file is named after the user id.
        # NOTE(review): assumes target_dir already exists — confirm, or mkdir first.
        with open(os.path.join(target_dir, user_id), "w") as f:
            tweet_lines = user_id2tweet_lines[user_id]
            # Sort numerically by tweet id ('long' => this is Python 2 only).
            tweet_lines.sort(key=lambda t: long(json.loads(t)["id_str"]))
            for tweet_line in tweet_lines:
                f.write(tweet_line)
# ---- final report: every distinct error message with its occurrence count ----
print "Error counts:"
for error in sorted(errors):
    print error, errors[error]
print "Done!"