-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathstreaming_scraper.py
69 lines (56 loc) · 2.22 KB
/
streaming_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import ConfigParser
import cPickle
import json
import os
import sys
import time
from datetime import datetime

import tweepy
@classmethod
def parse(cls, api, raw):
    """Parse ``raw`` with the saved original parser and keep the raw payload.

    Delegates to ``cls.first_parse(api, raw)`` and then stores
    ``json.dumps(raw)`` (a JSON *string*) on the result as its ``json``
    attribute before returning it.
    """
    parsed = cls.first_parse(api, raw)
    # Attach the serialized payload so listeners can write it out verbatim.
    parsed.json = json.dumps(raw)
    return parsed
# Monkey-patch tweepy's Status model: keep the library's own parser reachable
# as ``first_parse`` and install the module-level ``parse`` wrapper in its
# place. Order matters: the original must be saved before it is overwritten.
tweepy.models.Status.first_parse = tweepy.models.Status.parse
tweepy.models.Status.parse = parse
class StreamWatcherListener(tweepy.StreamListener):
    """Stream listener that logs statuses and errors to local text files.

    Statuses are appended to ``statuses.txt`` and error codes to
    ``errors.txt``; both paths are relative to the current working directory.
    """

    def on_status(self, status):
        """Append ``status.json`` to ``statuses.txt``, one entry per line.

        Returns True.
        """
        json_status = status.json
        # NOTE: the patched ``Status.parse`` in this file already stores a JSON
        # string in ``status.json``, so this second ``json.dumps`` writes a
        # JSON-encoded string *containing* JSON; readers must decode twice.
        # The on-disk format is kept as-is so existing output stays consistent.
        with open("statuses.txt", "ab") as output:
            output.write(json.dumps(json_status))
            output.write("\n")
        return True

    def on_error(self, status_code):
        """Print the error code, append it to ``errors.txt`` and return True."""
        print('An error has occured! Status code = %s' % status_code)
        with open("errors.txt", "ab") as error:
            error.write(str(status_code))
            error.write("\n")
        return True  # keep stream alive

    def on_timeout(self):
        """Print a notice when the stream times out."""
        print('Snoozing Zzzzzz')
def authorize(consumer_key, consumer_secret,
              access_token, access_token_secret,
              timeout=60):
    """Create a tweepy ``Stream`` authenticated with the given OAuth credentials.

    Builds an ``OAuthHandler`` from the consumer key/secret, sets the access
    token pair on it, and returns a ``tweepy.streaming.Stream`` wired to a new
    ``StreamWatcherListener``. ``timeout`` is forwarded to the ``Stream``
    constructor unchanged.
    """
    handler = tweepy.OAuthHandler(consumer_key, consumer_secret)
    handler.set_access_token(access_token, access_token_secret)
    listener = StreamWatcherListener()
    return tweepy.streaming.Stream(handler, listener, timeout=timeout)
# The first command-line argument is the working directory. It must contain
# ``twitter.config`` and ``twitter_links.pkl``; the listener's output files
# are also written relative to it.
if len(sys.argv) < 2:
    sys.exit("usage: streaming_scraper.py <working_directory>")
os.chdir(sys.argv[1])

# Read the OAuth credentials from the [Twitter] section of twitter.config.
Config = ConfigParser.ConfigParser()
Config.read("twitter.config")
consumer_key = Config.get("Twitter", "consumer_key")
consumer_secret = Config.get("Twitter", "consumer_secret")
access_token = Config.get("Twitter", "access_token")
access_token_secret = Config.get("Twitter", "access_token_secret")
api = authorize(consumer_key, consumer_secret,
                access_token, access_token_secret)

# Load the user list.
# NOTE: unpickling can execute arbitrary code; only load a trusted
# twitter_links.pkl.
with open("twitter_links.pkl", "rb") as links_file:
    twitter_dict = cPickle.load(links_file)

# Keep the last "/"-separated segment of the first element of each entry.
# NOTE(review): these values are later passed as ``follow=``, which the Twitter
# streaming API documents as numeric user IDs -- confirm the stored links end
# in an ID rather than a screen name.
twitter_links = [twitter_dict[k][0].split("/")[-1] for k in twitter_dict]
# HACK: the streaming scraper keeps crashing with an IncompleteRead error, so
# the stream is simply restarted in a loop whenever it raises. A lot of this
# will have to be rewritten at a lower level at some point, but this keeps the
# scraper working for now.
# Seconds to pause before reconnecting, so that a persistent failure does not
# turn into a tight loop of immediate reconnect attempts.
RECONNECT_DELAY_SECONDS = 5

t_1 = datetime.now()
while True:
    try:
        # Runs until the stream ends or raises.
        api.filter(follow=twitter_links)
    except Exception as e:
        # Log the error with the time elapsed since the script started
        # (t_1 is never reset, so this is not time since the last reconnect).
        print('%s %s' % (e, datetime.now() - t_1))
    time.sleep(RECONNECT_DELAY_SECONDS)