-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_extractor_rest.py
39 lines (34 loc) · 1008 Bytes
/
data_extractor_rest.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from datetime import datetime
from pymongo import MongoClient
import cPickle
import sys
import csv
import os
# Export tweets from two sources into a single CSV file.
#
# Command-line arguments:
#   sys.argv[1]  output CSV path (opened in append mode, so existing rows are kept)
#   sys.argv[2]  working directory; the process chdir()s into it before any file
#                is opened, so relative paths in argv[1] and argv[3] resolve there
#   sys.argv[3]  path of a pickle file holding a mapping whose values are iterables
#                of tweet objects exposing .text, .user.screen_name and .created_at
#
# Every row written is [created_at, screen_name, utf-8 encoded text].

client = MongoClient()
db = client["StreamingDb"]
collection = db["StreamingCollection"]

# The document count is taken once up front and is only used to scale the
# progress percentage printed while exporting.
# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed in
# 4.0; switch to count_documents({}) when the driver is upgraded.
ln = collection.count()
t_1 = datetime.now()

os.chdir(sys.argv[2])

# Open the output once for the whole run instead of once per row: re-opening a
# file for every tweet is loop-invariant work, and the `with` block guarantees
# the handle is flushed and closed even if a row fails part-way through.
# Side effect of opening up front: the file is created even when both sources
# turn out to be empty.
# "ab" is kept because the Python 2 csv module expects a binary-mode file.
with open(sys.argv[1], "ab") as f_name:
    writer = csv.writer(f_name)

    # Pass 1: documents stored in the MongoDB collection.
    for i, obs in enumerate(collection.find()):
        writer.writerow([
            obs["created_at"],
            obs["user"]["screen_name"],
            obs["text"].encode("utf-8"),
        ])
        # Guard the division: the count above may be 0 (or simply stale) if
        # documents were inserted between count() and find().
        percent = (i * 100.) / ln if ln else 0.0
        # Same space-separated progress line as before:
        # percent done, elapsed time, row index, source tag.
        print("%s %s %s db" % (percent, datetime.now() - t_1, i))

    # Pass 2: timelines previously saved to a pickle file.
    # NOTE(review): unpickling can execute arbitrary code; only point argv[3]
    # at files produced by a trusted process.
    with open(sys.argv[3], "rb") as pickled:
        rest_timelines = cPickle.load(pickled)

    for link in rest_timelines:
        for tweet in rest_timelines[link]:
            writer.writerow([
                tweet.created_at,
                tweet.user.screen_name,
                tweet.text.encode("utf-8"),
            ])