# =======================================
# === Setup / Install and Credentials ===
# =======================================
# install packages from github
library(devtools)
devtools::install_github("azure/doAzureParallel")
# import packages
library(doAzureParallel)
# set azure credentials
doAzureParallel::setCredentials("credentials.json")
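# If you don't already have a credentials file, doAzureParallel can generate a
# template for you to fill in with your Batch and Storage account keys
# (a minimal sketch; the file name here is just an example):
# doAzureParallel::generateCredentialsConfig("credentials.json")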
# In your cluster config, add the data.table package to the CRAN packages and
# Azure/rAzureBatch and Azure/doAzureParallel to the GitHub packages so that they
# are installed on all of the nodes.
# Since reading the large datasets requires a lot of memory, we recommend using
# Standard_D11_v2 nodes.
# "rPackages": {
# "cran": ["data.table"],
# "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"]
# }
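# For reference, here is a minimal sketch of what the cluster config file used
# below ("resource_files_cluster.json") might look like with those packages added.
# The specific values (pool name, node counts) are assumptions for illustration;
# you can generate a template with
# doAzureParallel::generateClusterConfig("resource_files_cluster.json")
# and edit it to match your needs:
#
# {
#   "name": "resource-files-cluster",
#   "vmSize": "Standard_D11_v2",
#   "maxTasksPerNode": 1,
#   "poolSize": {
#     "dedicatedNodes": { "min": 6, "max": 6 },
#     "lowPriorityNodes": { "min": 0, "max": 0 },
#     "autoscaleFormula": "QUEUE"
#   },
#   "rPackages": {
#     "cran": ["data.table"],
#     "github": ["Azure/rAzureBatch", "Azure/doAzureParallel"],
#     "bioconductor": []
#   },
#   "commandLine": []
# }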
# ===================================================
# === Setting up your cluster with resource files ===
# ===================================================
# Now we will use resource files to get our dataset onto each node of our cluster.
# Currently, the data is stored in Azure Blob Storage in an account called 'playdatastore',
# in a public container called "nyc-taxi-dataset". (By default, containers created with
# doAzureParallel / Azure Storage Explorer have private permissions.)
# To get this dataset onto each node, we will create a resourceFile object for each blob.
# We then pass the resourceFiles when building the cluster so that each node knows to
# download these files after it is provisioned.
# Using the NYC taxi datasets, http://www.nyc.gov/html/tlc/html/about/trip_record_data.shtml
azureStorageUrl <- "http://playdatastore.blob.core.windows.net/nyc-taxi-dataset"
resource_files <- list(
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-1.csv"), filePath = "yellow_tripdata_2016-1.csv"),
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-2.csv"), filePath = "yellow_tripdata_2016-2.csv"),
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-3.csv"), filePath = "yellow_tripdata_2016-3.csv"),
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-4.csv"), filePath = "yellow_tripdata_2016-4.csv"),
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-5.csv"), filePath = "yellow_tripdata_2016-5.csv"),
rAzureBatch::createResourceFile(httpUrl = paste0(azureStorageUrl, "/yellow_tripdata_2016-6.csv"), filePath = "yellow_tripdata_2016-6.csv")
)
# add the parameter 'resourceFiles' to download files to nodes
cluster <- makeCluster("resource_files_cluster.json", resourceFiles = resource_files)
# when the cluster is provisioned, register the cluster as your parallel backend
registerDoAzureParallel(cluster)
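# Optionally, confirm the backend is registered and see how many workers are
# available (a quick check using foreach's getDoParWorkers()):
# getDoParWorkers()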
# ======================================================
# === Setting up storage account to write results to ===
# ======================================================
# Set up the storage location to write your results to.
# This step lets you upload your results from within your doAzureParallel foreach loop:
#
# 1. Replace "mystorageaccount" with the name of the storage account you wish to write your results to.
# 2. Create an output container named "nyc-taxi-graphs" to store your results in.
# 3. Create a SasToken that allows us to read and write ("rw") to the container.
# 4. Note the parameter 'sr = "c"' in the createSasToken call: it simply means
#    the token is scoped to that entire container in storage.
#
storageAccountName <- "mystorageaccount"
outputsContainer <- "nyc-taxi-graphs"
rAzureBatch::createContainer(outputsContainer)
# permissions: r = read, w = write.
outputSas <- rAzureBatch::createSasToken(permission = "rw", sr = "c", outputsContainer)
# =======================================================
# === Foreach with resourceFiles & writing to storage ===
# =======================================================
results <- foreach(i = 1:6) %dopar% {
library(data.table)
library(ggplot2)
library(rAzureBatch)
# To access your Azure resource files on a node, use the special
# AZ_BATCH_NODE_STARTUP_DIR environment variable to locate the directory they were downloaded to
fileDirectory <- paste0(Sys.getenv("AZ_BATCH_NODE_STARTUP_DIR"), "/wd")
print(fileDirectory)
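# Optional sanity check (a minimal sketch using base R): list the files that were
# downloaded to the node via resource files to confirm they are present
# print(list.files(fileDirectory))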
# columns to keep for the data frame
colsToKeep <- c("pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude", "tip_amount", "trip_distance")
# read in data from CSV that was downloaded from the resource file
file <- fread(paste0(fileDirectory, "/yellow_tripdata_2016-", i, ".csv"), select = colsToKeep)
# set the coordinates for the bounds of the plot
min_lat <- 40.5774
max_lat <- 40.9176
min_long <- -74.15
max_long <- -73.7004
# compute intensive plotting
plot <- ggplot(file, aes(x=pickup_longitude, y=pickup_latitude)) +
geom_point(size=0.06) +
scale_x_continuous(limits=c(min_long, max_long)) +
scale_y_continuous(limits=c(min_lat, max_lat)) +
scale_color_gradient(low="#CCCCCC", high="#8E44AD", trans="log") +
labs(title = paste0("Map of NYC, Plotted Using Locations of All Yellow Taxi Pickups in Month ", i))
# build image from plot
image <- paste0("nyc-taxi-", i, ".png")
ggsave(image)
# save image to the storage account using the Sas token we created above
blob <- rAzureBatch::uploadBlob(containerName = outputsContainer,
image,
sasToken = outputSas,
accountName = storageAccountName)
# return the blob url
blob$url
}
# The results object is a list of URLs pointing to files in Azure Storage. Copy and paste a link
# into your favorite browser to see the output for each run.
results
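# For example, you could open the first result directly from R (a small sketch
# using base R's browseURL):
# browseURL(results[[1]])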
# deprovision your cluster after your work is complete
stopCluster(cluster)