# sample_tweets.py
import sys
from bson.json_util import dumps
from pymongo import MongoClient
def load_user_ids(user_file):
    """Read user IDs from a tab-separated text file, one user per line.

    Only the first tab-separated column of each line is used; any further
    columns are ignored.

    Args:
        user_file: Path of the file to read.

    Returns:
        List of user ID strings in file order (duplicates are kept).
        Blank or whitespace-only lines are skipped, so the list never
        contains an empty ID.
    """
    ids = []
    with open(user_file, 'r') as f:
        for line in f:
            # Strip the field as well as the line: stray padding around the
            # first column would otherwise become part of the ID.
            user_id = line.strip().split("\t")[0].strip()
            if user_id:
                ids.append(user_id)
    return ids
def open_user_cursors(collection, user_ids):
    """Open one tweet cursor per user.

    Each cursor selects documents whose ``user.id_str`` equals the user ID
    and whose ``analysis.language.value`` is ``"pl"``, sorted by
    ``created_at_mongo`` in descending order.

    Args:
        collection: Object exposing a pymongo-style ``find`` method.
        user_ids: Iterable of user ID strings.

    Returns:
        Dict mapping user ID to its cursor.  A user whose query raised is
        reported on stdout and left out of the dict.
    """
    cursors = {}
    queried = 0
    for user_id in sorted(user_ids):
        try:
            cursor = collection.find(
                {"user.id_str": user_id, "analysis.language.value": "pl"}
            ).sort("created_at_mongo", -1)
            # NOTE(review): presumably keeps prefetching small because only
            # one tweet per user is taken each round -- confirm.
            cursor.batch_size(1)
        except Exception as ex:
            print("Error loading tweets of user %s : %s" % (user_id, ex))
            continue
        queried += 1
        print("%d user queried." % queried)
        cursors[user_id] = cursor
    return cursors


def round_robin_sample(cursors, count):
    """Collect up to ``count`` items, taking one per cursor per round.

    Args:
        cursors: Iterable of iterators, visited in the given order.
        count: Maximum number of items to collect.

    Returns:
        List of collected items.  It is shorter than ``count`` when every
        cursor runs dry first; the loop stops instead of spinning forever.
    """
    exhausted = object()  # sentinel: distinguishes "no more items" from any item
    sampled = []
    active = list(cursors)
    while active and len(sampled) < count:
        still_active = []
        for cursor in active:
            if len(sampled) >= count:
                break
            item = next(cursor, exhausted)
            if item is exhausted:
                # Drop the drained cursor so an all-empty round ends the loop.
                continue
            sampled.append(item)
            print("%d tweets sampled." % len(sampled))
            still_active.append(cursor)
        active = still_active
    return sampled


def main():
    """Sample tweets round-robin across users and dump them to a file.

    Command line arguments:
        mongo_database:mongo_collection target_file user_list count
    """
    if len(sys.argv) != 5:
        print("Wrong number of arguments! Try: python %s "
              "mongo_database:mongo_collection target_file user_list count"
              % sys.argv[0])
        sys.exit(1)

    spl = sys.argv[1].split(":")
    if len(spl) < 2:
        # Without the separator there is no collection name to look up.
        print("Expected mongo_database:mongo_collection, got: %s" % sys.argv[1])
        sys.exit(1)

    client = MongoClient()
    tweetsDB = client[spl[0]][spl[1]]
    target_file = sys.argv[2]
    user_list = sys.argv[3]
    count = int(sys.argv[4])

    user_ids = load_user_ids(user_list)
    print("Sampling from database to: %s taking %d tweets total from %d users."
          % (target_file, count, len(user_ids)))

    user_id2cursor = open_user_cursors(tweetsDB, user_ids)
    # Iterate the dict rather than user_ids: users whose query failed have no
    # cursor, and a user listed more than once still gets one turn per round.
    ordered_cursors = [user_id2cursor[user_id]
                       for user_id in sorted(user_id2cursor)]
    tweets = round_robin_sample(ordered_cursors, count)

    with open(target_file, 'w') as f:
        f.write(dumps(tweets))
    print("Saved %d in file: %s" % (len(tweets), target_file))


if __name__ == "__main__":
    main()