-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfunctions.py
406 lines (379 loc) · 18.3 KB
/
functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
import urllib2; # For retrieving the contents of a page
from bs4 import BeautifulSoup; # For parsing the contents of a page to HTML
import re # For regular expressions
import pandas as pd # For creating dataframes
# Function that takes in a URL and returns the contents of the page as HTML
def loadPage(url):
page = urllib2.urlopen(url)
data = BeautifulSoup(page, "lxml")
return data
# Function that takes in a table(s) as a parameter and outputs a dictionary with information relating to each film.
# The process of how it extracts the data from the tables differs depending on the pageType parameter.
# The third parameter winnersOnly is a boolean with a default value of True. If it is set to False then it returns it also returns a list of Oscar winners as part of the dictionary
def extractFilmData(pageType, tables, winnersOnly=True):
# Winning style of table cells
winnerStyle = 'background:#FAEB86'
# List for storing films
films = []
# Best Picture
if pageType == "picture":
# List for storing the producers of each film
producers = []
# List for Oscar winners
winners = []
# Each decade is in a seperate table so 10 in total
for i in range(10):
# If the winnersOnly parameter is True we only want the rows with winners data in them.
# Else we want all rows
if winnersOnly:
search = tables[i].findAll("tr", style=re.compile('^'+winnerStyle))
else:
search = tables[i].findAll("tr")
# Loop through the list of rows
for row in search:
# Variable to determine whether or not film is an Oscar winner. Default value is False
isWinner = False
# Only if winnersOnly is False AND the row has a style attribute
if not winnersOnly and row.has_attr("style"):
# Check that the starts with the winnerStyle ie. 'background:#FAEB86' or 'background:#FAEB86;'.
# If it is then it is an Oscar winner so set isWinner to True
if row["style"].startswith(winnerStyle):
isWinner = True
# Get each cell within that row
cells = row.findAll('td')
if len(cells) > 1:
# Get the text from within the first cell, encode it and add it to the films list
film = cells[0].find(text=True).encode('utf-8').strip()
films.append(film)
# If winnersOnly is False and isWinner is True then add the film to the list of winners
if not winnersOnly and isWinner:
winners.append(film)
# Get the text from within the second cell - text saved as an array
producer = cells[1].findAll(text=True)
if len(producer) > 1:
# If the last index is a line break then remove it from variable
if producer[len(producer)-1] == '\n':
producer = producer[:-1]
# If first item ends in a line break then remove it
if producer[0].endswith("\n"):
producer = producer[0][0:-1]
# Join all the text to a single string and append it to the list of nominees
producers.append("".join(producer).encode('utf-8').strip())
# Return lists of films and producers as a dictionary
# If winnersOnly parameter is True then also return list of Oscar winners
if winnersOnly:
return {"films": films, "names": producers}
else:
return {"films": films, "names": producers, "winners": winners}
# Best Director
elif pageType == "director":
# List for storing the director of each film
directors = []
# If the winnersOnly parameter is False then create a dictionary for storing each winning director and film
if not winnersOnly:
winners = {'director': [], 'film': []}
# All data is in one table so return all rows
for row in tables[0].findAll("tr"):
# If the winnersOnly parameter is True we only want the cells with winners data in them.
# Otherwise we want all cells from within that row
if winnersOnly:
cells = row.findAll("td", style=re.compile('^'+winnerStyle))
else:
cells = row.findAll("td")
# Should find the director in first cell and the film in second cell so check there is enough
if len(cells) > 1:
# Variable to determine whether or not film is an Oscar winner. Default value is False
isWinner = False
# Only if winnersOnly is False AND the cell has a style attribute
if not winnersOnly and cells[0].has_attr("style"):
# Check that the cell starts with the winnerStyle ie. 'background:#FAEB86' or 'background:#FAEB86;'.
# If it is then it is an Oscar winner so set isWinner to True
if cells[0]["style"].startswith(winnerStyle):
isWinner = True
# Get all text from the first cell. We don't want the last part as this has "\n" characters
director = cells[0].findAll(text=True)[:-1]
# Join together array into a string and add to list of directors
director = "".join(director).encode('utf-8').strip()
directors.append(director)
# Get the text from within the second cell, encode it and add it to the list of nominees
film = cells[1].find(text=True).encode('utf-8').strip()
films.append(film)
# If winnersOnly is False AND isWinner is True then add the director film to the winners dictionary
if not winnersOnly and isWinner:
winners['director'].append(director)
winners['film'].append(film)
# In case a director has more than one film nominated, then the second film will be in a new cell
elif len(cells) == 1:
# In this scenario the cell will have no rowspan or colspan
if cells[0].has_attr("rowspan") == False and cells[0].has_attr("colspan") == False:
film = cells[0].find(text=True).encode('utf-8').strip()
films.append(film)
# Return lists of films and directors as a dictionary
# If winnersOnly parameter is True then also return the dictionary of Oscar winners
if winnersOnly:
return {"films": films, "names": directors}
else:
return {"films": films, "names": directors, "winners": winners}
# Any of Best Actor, Best Actress, Best Supporting Actor, Best Supporting Actress
elif pageType in ["actor", "actress", "supporting actor", "supporting actress"]:
# List for storing the actors/actresses for each film
actors = []
# If the winnersOnly parameter is False then create a dictionary for storing each winning actor/actress and film
if not winnersOnly:
# Depending on the value of the pageType parameter create a dictionary for winners with either an actor or actress column
if pageType in ["actor", "supporting actor"]:
winners = {'actor': [], 'film': []}
else:
winners = {'actress': [], 'film': []}
# All data is in one table so return all rows
for row in tables[0].findAll("tr"):
# If the winnersOnly parameter is True we only want the rows with winners data in them.
# Otherwise we want all rows
if winnersOnly:
cells = row.findAll("td", style=re.compile('^'+winnerStyle))
else:
cells = row.findAll('td')
# Variable to store the current cell being searched
index = 0
if len(cells) > 1:
# Variable to determine whether or not film is an Oscar winner. Default value is False
isWinner = False
# Only if winnersOnly is False AND the row has a style attribute
if not winnersOnly and cells[0].has_attr("style"):
# Check that the cell starts with the winnerStyle ie. 'background:#FAEB86' or 'background:#FAEB86;'.
# If it is then it is an Oscar winner so set isWinner to True
if cells[0]["style"].startswith(winnerStyle):
isWinner = True
# Iterate through each cell
for cell in cells:
# Find all hyperlinks within the cell
links = cell.findAll("a")
# Iterate through each hyperlink
for link in links:
# We are only interested in links that include a title attribute
if link.has_attr('title'):
# First link contains the actor
if index == 0:
actor = link.find(text=True).encode('utf-8').strip()
actors.append(actor)
# Third link contains the film
elif index == 2:
film = link.find(text=True).encode('utf-8').strip()
films.append(film)
# If winnersOnly is False AND isWinner is True then add the film to the winners dictionary
if not winnersOnly and isWinner:
winners['film'].append(film)
# Depending on the pageType add the actor to the appropriate column in the winners dictionary
if pageType in ["actor", "supporting actor"]:
winners['actor'].append(actor)
else:
winners['actress'].append(actor)
# Increment index after each cell
index = index+1
# Return lists of films and actors as a dictionary
# If winnersOnly parameter is True then also return the dictionary of Oscar winners
if winnersOnly:
return {"films": films, "names": actors}
else:
return {"films": films, "names": actors, "winners": winners}
# Function that takes in a table(s) as a parameter and outputs a list of years
# The third parameter winnersOnly is a boolean with a default value of True. If it is set to False then it returns it instead returns a dictionary containing each year
# and the number of films nominated for that year ie. the rowcount
def extractYears(pageType, tables, winnersOnly=True):
# List to store the years
years = []
# List to store the rowcount for each year
list_nrows = []
# Best Picture
if pageType == "picture":
# Each decade is in a seperate table so 10 in total
for i in range(10):
for cell in tables[i].findAll("td", style="text-align:center"):
# Get each link within that cell
links = cell.findAll('a')
# If year is all in one link and in a format like 1927/28 then we want the first two and last two charaters so that
# it will be 1928
if len(links[0].find(text=True)) == 7:
year = links[0].find(text=True)[0:2] + links[0].find(text=True)[5:7]
# If year is in two links and the second link has four numbers ie. 1927/1928 then we want the second link
elif len(links[1].find(text=True)) == 4 and links[1].find(text=True).isnumeric():
year = links[1].find(text=True)
# If the year is in two links and the second link has two numbers ie. 1927/28 then we want the first two characters
# of the first link and the last two of the second link
elif len(links[1].find(text=True)) == 2 and links[1].find(text=True).isnumeric():
year = links[0].find(text=True)[0:2] + links[1].find(text=True)
# Else the year is in a standard format ie. 1928 so get four characters
else:
year = links[0].find(text=True)[0:4]
# Add year to list of years
years.append(int(year))
if not winnersOnly and cell.has_attr('rowspan'):
# Add year and nrows to dataframe
list_nrows.append(cell["rowspan"])
# Best Director, Best Actor or Best Actress
elif pageType in ["director", "actor", "actress"]:
# All data is in one table so find all the row which match
for row in tables[0].findAll("th", scope="row"):
# Get each link within that cell
links = row.findAll('a')
# If year is all in one link and in a format like 1927/28 then we want the first two and last two charaters so that
# it will be 1928
if len(links[0].find(text=True)) == 7:
year = links[0].find(text=True)[0:2] + links[0].find(text=True)[5:7]
# If year is in two links and the second link has four numbers ie. 1927/1928 then we want the second link
elif len(links[1].find(text=True)) == 4 and links[1].find(text=True).isnumeric():
year = links[1].find(text=True)
# If the year is in two links and the second link has two numbers ie. 1927/28 then we want the first two characters
# of the first link and the last two of the second link
elif len(links[1].find(text=True)) == 2 and links[1].find(text=True).isnumeric():
year = links[0].find(text=True)[0:2] + links[1].find(text=True)
# Else the year is in a standard format ie. 1928 so get four characters
else:
year = links[0].find(text=True)[0:4]
# Add year to list of years
years.append(int(year))
# If winnersOnly is False and row has a rowspan attribute then add the rowspan value to the list of rowcounts
if not winnersOnly and row.has_attr('rowspan'):
list_nrows.append(row['rowspan'])
# Best Supporting Actor or Best Supporting Actress
else:
# All data is in one table so find the rows which match
for row in tables[0].findAll("th", scope="row"):
# Get each link within that cell
links = row.findAll('a')
if len(links) > 1:
# All years are in the standard four digit format so add it to the list
year = links[0].find(text=True)
years.append(int(year))
# If winnersOnly is False and row has a rowspan attribute then add the rowspan value to the list of rowcounts
if not winnersOnly and row.has_attr('rowspan'):
list_nrows.append(row['rowspan'])
# If winnersOnly is True then return the list of years.
# Otherwise return a dictionary containing the list of years and the list of rowcounts
if winnersOnly:
return years
else:
return { "years": years, "numFilms": list_nrows }
# Function for creating a dataframe using input parameters - data_type (type estring), data, numFilmsByYear, winners (all type dictionary)
def createDataFrame(data_type, data, numFilmsByYear, winners):
# Depending on the data_type parameter, a dataframe is created with different columns.
# Lists are created using the columns of the data parameter
if data_type == "picture":
df_frame = pd.DataFrame(columns = ["Year","Film","Producers","Winner"])
films = data["films"]
producers = data["producers"]
elif data_type == "director":
df_frame = pd.DataFrame(columns = ["Year","Director","Film","Winner"])
films = data["films"]
directors = data["directors"]
elif data_type in ["actor", "supporting actor"]:
df_frame = pd.DataFrame(columns = ["Year","Actor","Film","Winner"])
films = data["films"]
actors = data["actors"]
elif data_type in ["actress", "supporting actress"]:
df_frame = pd.DataFrame(columns = ["Year","Actress","Film","Winner"])
films = data["films"]
actresses = data["actresses"]
# Variable to track the current list index while looping through lists
start = 0
# Loop through each year stored in the dictionary
for i in range(len(numFilmsByYear['years'])):
# Get the year at the current index
year = numFilmsByYear['years'][i]
# Get the number of films nominated for that year
numFilms = int(numFilmsByYear['numFilms'][i])
# A variable end is calculated by adding start anf numFilms together.
end = start + numFilms
# If data_type is "picture", then subtract 1 from it
if data_type == "picture":
end = end - 1
# Loop through each number from start to end
for j in range(start, end):
# We use a try/except here in case goes out of bounds
try:
# Variable to determine whether or not film is an Oscar winner - default value is "No"
isWinner = "No"
# Best Picture
if data_type == "picture":
# Variables to store the film and producer for the current index
film = films[j]
producer = producers[j]
# If the film is in the list of winners then set variable to "Yes"
if film in winners:
isWinner = "Yes"
# We then want to find the key that stores that film so we can delete it.
# This is necessary in case of remakes of previous winners eg. "Mutiny on the Bounty".
# Break out of loop once deleted
for key, value in enumerate(winners):
if value == film:
del winners[key]
break
# Append all variable to data frame
df_frame.loc[df_frame.shape[0]] = [year, film, producer, isWinner]
# Best Director
elif data_type == "director":
# Variables to store the film and director for the current index
film = films[j]
director = directors[j]
# Check that both the director and film are in the list of winners.
# If they are then change the value of isWinner to "Yes"
if director in winners['director'] and film in winners['film']:
isWinner = "Yes"
# We then want to find the key that stores that film so we can delete it.
# This is necessary in case of a previous winner being nominated for a later film
for key, value in enumerate(winners['director']):
if value == director:
del winners['director'][key]
del winners['film'][key]
break
# Append all variable to data frame
df_frame.loc[df_frame.shape[0]] = [year, director, film, isWinner]
# Best Actor/ Supporting Actor
elif data_type in ["actor", "supporting actor"]:
# Variables to store the film and actor for the current index
film = films[j]
actor = actors[j]
# Check that both the actor and film are in the list of winners.
# If they are then change the value of isWinner to "Yes"
if actor in winners['actor'] and film in winners['film']:
isWinner = "Yes"
# We then want to find the key that stores that film so we can delete it.
# This is necessary in case of a previous winner being nominated for a later film
# Break out of loop once deleted
for key, value in enumerate(winners['actor']):
if value == actor:
del winners['actor'][key]
del winners['film'][key]
break
# Append all variable to data frame
df_frame.loc[df_frame.shape[0]] = [year, actor, film, isWinner]
# Best Actress/Supporting Actress
elif data_type in ["actress", "supporting actress"]:
# Variables to store the film and actress for the current index
film = films[j]
actress = actresses[j]
# Check that both the actress and film are in the list of winners.
# If they are then change the value of isWinner to "Yes"
if actress in winners['actress'] and film in winners['film']:
isWinner = "Yes"
# We then want to find the key that stores that film so we can delete it.
# This is necessary in case of a previous winner being nominated for a later film
# Break out of loop once deleted
for key, value in enumerate(winners['actress']):
if value == actress:
del winners['actress'][key]
del winners['film'][key]
break
# # Append all variable to data frame
df_frame.loc[df_frame.shape[0]] = [year, actress, film, isWinner]
# In case of an index out of bounds exception, break out of loop
except IndexError:
break
# Increment start by numFilms. If data_type is "picture" then subtract 1 from it
start = start + numFilms
if data_type == "picture":
start = start - 1
# Now with all the data in the dataframe, convert the year column to an integer
df_frame["Year"] = df_frame["Year"].astype(int)
# Return the dataframe
return df_frame