-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathupdate-data.py
executable file
·166 lines (137 loc) · 5.26 KB
/
update-data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/usr/bin/env python
'''
Fetch and insert Recurse Center API data into database.
This script fetches the list of profiles from the Recurse Center API using the
personal access token specified in the environment variable
RC_API_ACCESS_TOKEN. Batches are not fetched separately: they are taken from
the stints embedded in each profile.
It connects to the database specified in the environment variable DATABASE_URL,
opens a transaction, deletes the current data (if any), and inserts the new
data. The database schema must already exist.
'''
import argparse
import json
import logging
import psycopg2
import re
import requests
import sys
def getEnvVar(var_name, fallback=""):
    """Return the value of the environment variable ``var_name``.

    Args:
        var_name: Name of the environment variable to read.
        fallback: Value used when the variable is unset or empty.

    Returns:
        The variable's value, or ``fallback`` when the variable has no value.

    Raises:
        SystemExit: With exit status 1 when neither the environment nor
            ``fallback`` provides a non-empty value.
    """
    # Imported locally so the function also works when this module is
    # imported rather than run as a script (the file-level ``import os``
    # lives inside the ``__main__`` guard).
    import os

    value = os.getenv(var_name) or fallback
    if not value:
        # One format string with a lazy %-argument: passing the message as
        # several positional strings makes logging treat the extras as
        # format arguments and fail to render the record.
        logging.error(
            "''%s'' value not found."
            " Ensure a .env or .flaskenv file is present"
            " with this environment variable set.",
            var_name,
        )
        # Non-zero status so callers (shells, schedulers) see the failure.
        sys.exit(1)
    # NOTE(review): this writes the raw value to the log; the script calls
    # this for DATABASE_URL and RC_API_ACCESS_TOKEN, so credentials end up
    # in INFO output -- consider masking.
    logging.info("%s: %s", var_name, value)
    return value
def get_people(token, timeout=30):
    """Fetch every profile from the Recurse Center API.

    Pages through the ``/api/v1/profiles`` endpoint 50 records at a time
    until the API returns an empty page.

    Args:
        token: Personal access token sent as a Bearer ``Authorization``
            header.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the decoded JSON items of all pages, in API order.

    Raises:
        requests.HTTPError: If any page request returns a 4xx/5xx status.
        requests.Timeout: If a request exceeds ``timeout``.
    """
    people = []
    url = 'https://www.recurse.com/api/v1/profiles'
    limit = 50
    offset = 0

    # The context manager closes the session's pooled connections even if a
    # request raises part-way through pagination.
    with requests.Session() as session:
        session.headers.update({'Authorization': f'Bearer {token}'})
        while True:
            r = session.get(
                url,
                params={'limit': limit, 'offset': offset},
                timeout=timeout,
            )
            # No-op for successful responses; raises for 4xx/5xx.
            r.raise_for_status()
            page = r.json()
            # Any empty/falsy payload ends pagination, not only ``[]``,
            # so an unexpected empty body cannot cause an endless loop.
            if not page:
                break
            people.extend(page)
            offset += limit

    return people
def replace_data(database_url, people):
    """Replace the database contents with the given profiles.

    Opens a connection to ``database_url``, deletes the existing rows and
    inserts the new ones, then commits. If anything fails the work is
    rolled back and the exception is re-raised. The cursor and connection
    are closed in every case.

    Args:
        database_url: Connection string passed to ``psycopg2.connect``.
        people: List of profile dicts handed to ``insert_data``.
    """
    connection = psycopg2.connect(database_url)
    try:
        cursor = connection.cursor()
        try:
            delete_data(cursor)
            logging.info('Deleted existing tables')
            insert_data(cursor, people)
            connection.commit()
            # Logged only after the commit succeeded.
            logging.info('Completed database update')
        except Exception:
            # Leave the previous data intact when the refresh fails.
            connection.rollback()
            raise
        finally:
            cursor.close()
    finally:
        connection.close()
def delete_data(cursor):
    """Delete all rows from the stints, people and batches tables.

    The statements are executed on ``cursor`` in that order.
    NOTE(review): presumably stints goes first because it references the
    other two tables -- confirm against the schema.
    """
    statements = (
        'DELETE FROM stints',
        'DELETE FROM people',
        'DELETE FROM batches',
    )
    for statement in statements:
        cursor.execute(statement)
def _extract_middle_name(first_name, last_name, name):
    """Return the part of ``name`` between ``first_name`` and ``last_name``.

    Returns an empty string when any of the three values is missing or when
    ``name`` does not have the "<first> ... <last>" shape, so the full
    display name is never stored as a middle name.
    """
    if not (first_name and last_name and name):
        return ""
    pattern = r"^%s\s*(.*)\s+%s" % (re.escape(first_name),
                                    re.escape(last_name))
    middle_name, replacements = re.subn(pattern, r"\1", name)
    return middle_name if replacements else ""


def insert_data(cursor, people):
    """Insert people, their stints and the referenced batches.

    For every profile one row is inserted into ``people`` and one row into
    ``stints`` per stint. Each batch referenced by a stint is inserted into
    ``batches`` the first time its id is seen.

    Args:
        cursor: DB-API cursor; only ``execute(sql, params)`` is used.
        people: List of profile dicts. The keys read are ``id``,
            ``first_name``, ``last_name``, ``name``, ``image_path`` and
            ``stints``; each stint may carry ``batch``, ``type``,
            ``start_date``, ``end_date`` and ``title``.
    """
    processed_batches = set()

    for person in people:
        person_id = person.get('id')
        first_name = person.get('first_name')
        last_name = person.get('last_name')
        image_path = person.get('image_path')
        middle_name = _extract_middle_name(
            first_name, last_name, person.get('name'))

        # Log the same image value that is stored below.
        logging.debug("Person #%s: %s %s %s; %s",
                      person_id, first_name, middle_name, last_name,
                      image_path)

        cursor.execute("INSERT INTO people"
                       " (person_id, first_name, middle_name,"
                       " last_name, image_url)"
                       " VALUES (%s, %s, %s, %s, %s)",
                       [person_id,
                        first_name,
                        middle_name,
                        last_name,
                        image_path,
                        ]
                       )

        # A profile without a ``stints`` key is treated as having none.
        for stint in person.get('stints') or []:
            batch_id = None
            batch = stint.get('batch')

            if batch:
                batch_id = batch.get('id')

                # Many people share a batch; insert each batch only once.
                if batch_id not in processed_batches:
                    logging.debug("  Batch %s, \"%s\", %s - %s",
                                  batch_id,
                                  batch.get('name'),
                                  batch.get('short_name'),
                                  batch.get('alt_name'))

                    cursor.execute("INSERT INTO batches "
                                   " (batch_id, name, short_name, alt_name)"
                                   " VALUES (%s, %s, %s, %s)",
                                   [batch_id,
                                    batch.get('name'),
                                    batch.get('short_name'),
                                    batch.get('alt_name'),
                                    ]
                                   )

                    processed_batches.add(batch_id)

            logging.debug("  Stint: %s, Batch: %s, %s - %s",
                          stint.get('type'),
                          batch_id,
                          stint.get('start_date'),
                          stint.get('end_date'))

            cursor.execute("INSERT INTO stints"
                           " (person_id, batch_id, stint_type,"
                           " start_date, end_date, title)"
                           " VALUES (%s, %s, %s, %s, %s, %s)",
                           [person_id,
                            batch_id,
                            stint.get('type'),
                            stint.get('start_date'),
                            stint.get('end_date'),
                            stint.get('title'),
                            ]
                           )

    logging.info('Inserted %s people', len(people))
    logging.info('Inserted %s batches', len(processed_batches))
def main(database_url, token):
    """Run one full refresh: fetch all profiles, then replace the stored data.

    Args:
        database_url: Connection string forwarded to ``replace_data``.
        token: API access token forwarded to ``get_people``.
    """
    logging.info('Starting Faces database update...')

    profiles = get_people(token)
    logging.info('Found %s people', len(profiles))

    replace_data(database_url, profiles)
if __name__ == "__main__":
    # Script entry point: load configuration from the environment
    # (optionally populated from a dotenv file) and run the update.
    import os
    from dotenv import load_dotenv

    load_dotenv()
    logging.basicConfig(level=logging.INFO)

    # Both settings are required; getEnvVar exits when one is missing.
    database_url, token = (
        getEnvVar('DATABASE_URL'),
        getEnvVar('RC_API_ACCESS_TOKEN'),
    )

    main(database_url, token)