# PremiumAPI.py
import json
import time
from datetime import datetime, timedelta
from email.utils import parsedate_tz

from searchtweets import collect_results, gen_rule_payload, load_credentials

def to_datetime(datestring):
    """Convert a Twitter 'created_at' timestamp to a naive UTC datetime."""
    time_tuple = parsedate_tz(datestring.strip())
    dt = datetime(*time_tuple[:6])
    # Subtract the parsed UTC offset so every timestamp is normalized to UTC.
    return dt - timedelta(seconds=time_tuple[-1])
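
# Quick illustration (hypothetical timestamp, in Twitter's created_at format):
#   to_datetime('Wed Apr 01 17:30:00 +0000 2020') == datetime(2020, 4, 1, 17, 30)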
premium_search_args = load_credentials(r'C:\Data\Python\Twitter\twitter_keys_free.yaml',
                                       yaml_key="search_tweets_premium",
                                       env_overwrite=False)
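
# A minimal sketch of what the credentials YAML might look like, following the
# searchtweets README (the endpoint URL and placeholder values here are
# assumptions; substitute your own app's dev environment and keys):
#
#   search_tweets_premium:
#     account_type: premium
#     endpoint: https://api.twitter.com/1.1/tweets/search/fullarchive/dev.json
#     consumer_key: <CONSUMER_KEY>
#     consumer_secret: <CONSUMER_SECRET>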
# Earlier version that paginated manually, walking the time window backwards
# one call at a time until no tweets were returned:
# def scrape_tweets(keyword, low, high):
#     tweet_dict = {}
#     num = 0
#     while low < high:
#         results_per_call = 100
#         max_results = 100
#         pt_rule = keyword + ' lang:en'
#         low_string = low.strftime('%Y%m%d%H%M')
#         high_string = high.strftime('%Y%m%d%H%M')
#         rule = gen_rule_payload(pt_rule=pt_rule, from_date=low_string, to_date=high_string,
#                                 results_per_call=results_per_call)
#         premium_search_args['tweetify'] = False
#         tweets = collect_results(rule, max_results=max_results, result_stream_args=premium_search_args)
#         if tweets == []:
#             break
#         # Move the window's upper bound to just before the oldest tweet returned.
#         end = to_datetime(tweets[len(tweets) - 1]['created_at'])
#         for tweet in tweets:
#             if tweet['lang'] != 'en':
#                 continue
#             tweet_dict[num] = tweet
#             num += 1
#         high = end - timedelta(seconds=1)
#         print(num)
#         print(high)
#         delay = max_results / results_per_call
#         time.sleep(delay)
#     with open('tweets1early.json', 'w', encoding='utf-8') as f:
#         json.dump(tweet_dict, f)
def scrape(keyword, low, high):
    """Collect up to max_results English tweets matching keyword between low and high,
    and dump them to tweets.json."""
    tweet_dicts = []
    results_per_call = 100
    max_results = 5000
    pt_rule = keyword + ' lang:en'
    low_string = low.strftime('%Y%m%d%H%M')
    high_string = high.strftime('%Y%m%d%H%M')
    rule = gen_rule_payload(pt_rule=pt_rule, from_date=low_string, to_date=high_string,
                            results_per_call=results_per_call)
    # Return raw dicts rather than Tweet objects so they serialize to JSON directly.
    premium_search_args['tweetify'] = False
    tweets = collect_results(rule, max_results=max_results, result_stream_args=premium_search_args)
    for tweet in tweets:
        # The lang:en operator should already filter, but double-check defensively.
        if tweet['lang'] != 'en':
            continue
        tweet_dicts.append(tweet)
    print('Dumping')
    with open('tweets.json', 'w', encoding='utf-8') as f:
        json.dump(tweet_dicts, f)
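
# Reloading the dump later is symmetric (illustrative):
#   with open('tweets.json', encoding='utf-8') as f:
#       tweets = json.load(f)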
# Pre-run checklist:
#   - update key file path in code - good
#   - update endpoint URL - good
#   - update start and end dates - good
#   - update results_per_call and max_results - good
#   - update keyword
if __name__ == '__main__':
    keyword = '''("lost my job" OR "laid me off" OR "I have been laid off" OR "I\'ve been laid off" OR "I got laid off" OR "I was laid off" OR "I am laid off" OR "Iam laid off" OR "I am being laid off" OR "I got on unemployment" OR "I am on unemployment" OR "I am going on unemployment" OR "I am getting on unemployment" OR "I went on unemployment" OR "I have been on unemployment" OR "I\'ve been on unemployment" OR "Iam on unemployment" OR "I am unemployed" OR "I have been unemployed" OR "I\'ve been unemployed" OR "Iam unemployed" OR "I got unemployed" OR "I don\'t have a job" OR "I dont have a job" OR "I do not have a job") (profile_country:US OR place_country:US)'''
    # keyword = 'telehealth (physicaltherapy OR "physical therapy")'
    # keyword = 'telehealth (athletictrainer OR "athletic trainer")'
    # keyword = 'telerehabilitation'
    low = datetime(year=2020, month=4, day=1)
    high = datetime(year=2020, month=6, day=1)
    scrape(keyword, low, high)