-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathDATA101-flightEDA
121 lines (89 loc) · 3.2 KB
/
DATA101-flightEDA
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#----------------------------------------------
# DATA 101 - HOMEWORK 2
#
#----------------------------------------------
# Please save this script as "YOUR LAST NAME_HW2.R" and upload the script to Canvas.
# You should also upload a word document containing your write up and graphs.
# Please type your code into the sections outlined below.
#----------------------------------------------
# Question 1
# This simplifies the code by shaving off the "fat" of the data we're looking to
# analyze. In other words, it cleans the data and enables us to view more
# accurately the information that we want to work with.
#----------------------------------------------
# Question 2
# The data:
# Load the tidyverse and the nycflights13 package that provides `flights`.
library(tidyverse)
# Install nycflights13 only when it is missing, so re-running the script does
# not reinstall the package every time.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
# Print a preview of the flights table to the console.
flights
# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(flights)
}
# Calculating for the delay:
# Group the flights by calendar day first; `by_day` has to exist before it can
# be summarised. Then average the departure delay within each day, ignoring
# missing delays.
by_day <- group_by(flights, year, month, day)
avg_delay <- summarise(by_day, delay = mean(dep_delay, na.rm = TRUE))
# Flights not cancelled: keep only rows that have both a departure delay and an
# arrival delay recorded.
not_cancelled <- flights %>%
  filter(!is.na(dep_delay), !is.na(arr_delay))
# Share of flights that were not cancelled, computed from the row counts rather
# than from typed-in numbers. (The figures previously hard-coded here were
# 327346 of 336776, i.e. about 97.2% not cancelled, which leaves about 2.8%.)
nrow(not_cancelled) / nrow(flights)
# Number of cancelled flights (previously hard-coded as 336776 - 327346 = 9430).
# The per-day cancelled counts computed below should add up to this number.
nrow(flights) - nrow(not_cancelled)
# For every calendar day, count the cancelled flights (rows with a missing
# arrival or departure delay) and the total number of flights.
cancelled_per_day <- flights %>%
  mutate(cancelled = is.na(arr_delay) | is.na(dep_delay)) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_num = sum(cancelled),
    flights_num = n()
  ) %>%
  # Drop the grouping that summarise() leaves behind so later steps work on a
  # plain, ungrouped table (the same way cancelled_and_delays is built).
  ungroup()
# Tally how many non-cancelled flights share each arrival-delay value.
not_cancelled %>%
  count(arr_delay)
# Scatter plot of cancelled flights against total flights, one point per day.
ggplot(cancelled_per_day) +
  geom_point(aes(x = flights_num, y = cancelled_num))
# Separate scatter plot of the daily average departure delay. One ggplot object
# cannot be added to another with `+`, so this plot is its own statement.
ggplot(avg_delay) +
  geom_point(aes(x = delay, y = day))
# Per calendar day: the proportion of cancelled flights together with the
# average departure and arrival delays (missing delays are ignored in the
# averages). A flight counts as cancelled when either delay is missing.
cancelled_and_delays <- flights %>%
  mutate(cancelled = is.na(dep_delay) | is.na(arr_delay)) %>%
  group_by(year, month, day) %>%
  summarise(
    cancelled_prop = mean(cancelled),
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    avg_arr_delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  ungroup()
# Scatter plot of the daily cancelled proportion against the daily average
# departure delay, one point per day.
ggplot(cancelled_and_delays, aes(x = avg_dep_delay, y = cancelled_prop)) +
  geom_point()
# Yes, our intuition seems to hold: the proportion of cancelled flights tends to
# increase as the average departure delay increases.
#----------------------------------------------
# Question 3
# Part 1 answer:
# Horizontal bar chart with one bar per `hour` value. Each bar is split by
# whether the flight had a positive departure delay or no recorded departure
# delay at all.
ggplot(
  flights,
  aes(x = factor(hour), fill = dep_delay > 0 | is.na(dep_delay))
) +
  geom_bar() +
  coord_flip()
# Part 2 answer:
# Horizontal bar chart with one bar per `hour` value, split by whether the
# flight left on time or early, i.e. its departure delay is zero or negative.
# The fill used to be dep_delay <= dep_time, which compares a delay with a
# departure time and therefore does not pick out on-time flights.
# NOTE(review): assumes dep_delay is in minutes with negative values meaning an
# early departure; confirm against the nycflights13 documentation. The written
# conclusion for this part was drawn from the earlier chart; re-check it.
ggplot(flights, aes(x = factor(hour), fill = dep_delay <= 0)) +
  geom_bar() +
  coord_flip()
# Based on the graph, 8:00 AM is the best time to book a flight if you want your
# departure time to be on time or early.
#----------------------------------------------
# Question 4
# Horizontal bar chart with one bar per carrier, split by whether the
# departure delay was 30 or more.
ggplot(flights, aes(x = factor(carrier), fill = dep_delay >= 30)) +
  geom_bar() +
  coord_flip()
# According to the graph, the carrier that is most likely to experience a delay of
# 30 or more minutes is EV.
#----------------------------------------------
# Question 5
# Average arrival delay per destination, smallest first. Flights without a
# recorded arrival delay are dropped so they cannot produce missing averages.
# flights_num is the number of flights behind each average.
avg_arr_delay_by_dest <- flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(dest) %>%
  summarise(
    avg_arr_delay = mean(arr_delay),
    flights_num = n()
  ) %>%
  ungroup() %>%
  arrange(avg_arr_delay)
avg_arr_delay_by_dest
# Horizontal bar chart of those averages, ordered by average arrival delay.
# The earlier chart only counted flights with arr_delay <= 1 per destination,
# which does not show an average.
# NOTE(review): re-check the written conclusion against this table; averages
# based on very few flights (see flights_num) may not be representative.
ggplot(
  avg_arr_delay_by_dest,
  aes(x = reorder(dest, avg_arr_delay), y = avg_arr_delay)
) +
  geom_col() +
  coord_flip()
# According to the graph, ORD has the smallest average arrival delay, which is
# consistent with assumptions that could be made about its central location and
# size (which means larger and faster planes) compared with other destinations.