-
Notifications
You must be signed in to change notification settings - Fork 7
/
07-big_data_bigquery.R
108 lines (80 loc) · 3.12 KB
/
07-big_data_bigquery.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
##########################################################################
# Jose Cajide - @jrcajide
# Master Data Science: Getting data from Google BigQuery
##########################################################################
list.of.packages <- c("dplyr", "tidyverse", "bigrquery", "sqldf", "DBI", "ggplot2")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# devtools::install_github("rstats-db/bigrquery")
library(bigrquery)
library(ggplot2)
library(sqldf)
library(dplyr)
# If you don't already have a Google project...
# Login into the Google Developer Console
# Create a project and activate the BigQuery API
# Open public dataset:
# https://bigquery.cloud.google.com/table/datascience-open-data:flights.flights
# Execute your first query...
# SELECT * FROM `datascience-open-data.flights.flights` LIMIT 1000
project <- "master-ds-test" # Pon aquí tu proyecto
con <- DBI::dbConnect(bigquery(),
project = "datascience-open-data",
dataset = "flights",
billing = "datascience-open-data"
)
DBI::dbListTables(con)
flights_db <- tbl(con, "flights",project='datascience-open-data')
# SQL
sql <- "SELECT
COUNT(DISTINCT Year) AS years,
COUNT(DISTINCT UniqueCarrier) AS carriers,
COUNT(DISTINCT Dest) AS airports
FROM
`datascience-open-data.flights.flights`"
resumen <- query_exec(sql, project = "datascience-open-data", use_legacy_sql = F)
# Which is the flight cancellation ratio by year?
# Dplyr
flights_db %>% head
# Let BQ do the aggregation
cancellations <- flights_db %>%
group_by(Year) %>%
summarise(n_flights = n(),
n_canc = sum(Cancelled)) %>%
collect()
# Then apply transformations over your small local data frame for easy EDA
cancellations_by_year <- cancellations %>%
mutate(ratio = n_canc / n_flights) %>%
arrange(desc(Year))
ggplot(cancellations_by_year, aes(x=Year, y=ratio, group=1)) +
geom_line() +
theme_light()
# Segment! Segement! Segment!: Segment by Destination Airport
# Calculate
cancellations <- flights_db %>%
group_by(Year,Dest) %>%
summarise(n_flights = n(),
n_canc = sum(Cancelled)) %>%
collect()
cancellations_1 <- cancellations %>%
mutate(ratio = n_canc / n_flights) %>%
arrange(n_flights)
ggplot(cancellations_1, aes(x=Year, y=ratio, group=Dest, color=Dest)) +
geom_line(show.legend=F, aes(color = Dest), size=0.6, alpha=.5) +
scale_x_continuous()+
theme_light()
# What's happening?
cancellations_1 %>% summary()
hist(cancellations_1$n_flights, breaks = 100)
hist(cancellations_1$n_canc, breaks = 100)
cancellations_2 <- cancellations %>%
filter(abs(n_flights - mean(n_flights)) > 2*sd(n_flights) ) %>%
mutate(ratio = n_canc / n_flights,
top = if_else(ratio > .04, "Y", "N") )%>%
arrange(n_flights)
ggplot(cancellations_2, aes(x=Year, y=ratio, group=Dest, color=top)) +
geom_line(show.legend=T, aes(color = top), size=.6, alpha=.9) +
scale_x_continuous()+
scale_colour_brewer(palette = "Set1", direction = -1) +
theme_light()
DBI::dbDisconnect(con)