Skip to content
This repository has been archived by the owner on Oct 12, 2023. It is now read-only.

Commit

Permalink
Improvement on merge task performance (#223)
Browse files Browse the repository at this point in the history
* Added doParallel support

* Renamed txt file

* Fixed lintr

* Restructured merger script

* Removed some error handling cases

* Fixed syntax

* Renamed error handling test

* Added accumulator

* Using filter on function

* Proper filtering of tasks

* Fixed merge naming

* Added error handling for worker, separate merge task function

* Added buckets

* Added addSubMergeTask

* Added merge sub task functions

* Fixing file names

* Fixed sorting order for merger

* Added space

* Merger in R

* Clean up merger worker script

* Added mergeSize option

* By default one bucket

* Removed merge size flag

* Fixed test

* Fixed lint code

* Fixed more lintr issues

* Fixed lintr

* Fixed the added comments

* Fixed the if statement

* Add list combine function validation

* Removed verification

* Fixed lintr
  • Loading branch information
brnleehng authored Apr 13, 2018
1 parent 1b60e47 commit 852dba0
Show file tree
Hide file tree
Showing 7 changed files with 515 additions and 134 deletions.
188 changes: 179 additions & 9 deletions R/helpers.R → R/batchHelperFunctions.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,182 @@
addFinalMergeTask <- function(jobId, taskId, rCommand, ...){
storageCredentials <- rAzureBatch::getStorageCredentials()

args <- list(...)
dependsOn <- args$dependsOn
cloudCombine <- args$cloudCombine
containerImage <- args$containerImage

resultFile <- paste0(taskId, "-result", ".rds")
accountName <- storageCredentials$name

# Only use the download command if cloudCombine is enabled
# Otherwise just leave it empty
commands <- c()

if (!is.null(cloudCombine)) {
assign("cloudCombine", cloudCombine, .doAzureBatchGlobals)

copyCommand <- sprintf(
"%s %s %s --download --saskey $BLOBXFER_SASKEY --remoteresource . --include results/*.rds",
accountName,
jobId,
"$AZ_BATCH_TASK_WORKING_DIR"
)

downloadCommand <-
dockerRunCommand("alfpark/blobxfer:0.12.1", copyCommand, "blobxfer", FALSE)

commands <- c(downloadCommand)
}

exitConditions <- NULL
if (!is.null(args$dependsOn)) {
dependsOn <- args$dependsOn
}
else {
exitConditions <- list(default = list(dependencyAction = "satisfy"))
}

containerUrl <-
rAzureBatch::createBlobUrl(
storageAccount = storageCredentials$name,
containerName = jobId,
sasToken = rAzureBatch::createSasToken("w", "c", jobId)
)

outputFiles <- list(
list(
filePattern = resultFile,
destination = list(container = list(
path = paste0("results/", resultFile),
containerUrl = containerUrl
)),
uploadOptions = list(uploadCondition = "taskCompletion")
),
list(
filePattern = paste0(taskId, ".txt"),
destination = list(container = list(
path = paste0("logs/", taskId, ".txt"),
containerUrl = containerUrl
)),
uploadOptions = list(uploadCondition = "taskCompletion")
),
list(
filePattern = "../stdout.txt",
destination = list(container = list(
path = paste0("stdout/", taskId, "-stdout.txt"),
containerUrl = containerUrl
)),
uploadOptions = list(uploadCondition = "taskCompletion")
),
list(
filePattern = "../stderr.txt",
destination = list(container = list(
path = paste0("stderr/", taskId, "-stderr.txt"),
containerUrl = containerUrl
)),
uploadOptions = list(uploadCondition = "taskCompletion")
)
)

commands <-
c(commands,
dockerRunCommand(containerImage, rCommand))

commands <- linuxWrapCommands(commands)

sasToken <- rAzureBatch::createSasToken("rwcl", "c", jobId)
queryParameterUrl <- "?"

for (query in names(sasToken)) {
queryParameterUrl <-
paste0(queryParameterUrl,
query,
"=",
RCurl::curlEscape(sasToken[[query]]),
"&")
}

queryParameterUrl <-
substr(queryParameterUrl, 1, nchar(queryParameterUrl) - 1)

setting <- list(name = "BLOBXFER_SASKEY",
value = queryParameterUrl)

containerEnv <- list(name = "CONTAINER_NAME",
value = jobId)

rAzureBatch::addTask(
jobId,
taskId,
environmentSettings = list(setting, containerEnv),
commandLine = commands,
dependsOn = dependsOn,
outputFiles = outputFiles,
exitConditions = exitConditions
)
}

addSubMergeTask <- function(jobId, taskId, rCommand, ...){
storageCredentials <- rAzureBatch::getStorageCredentials()
accountName <- storageCredentials$name

args <- list(...)
dependsOn <- args$dependsOn
containerImage <- args$containerImage
outputFiles <- args$outputFiles

copyCommand <- sprintf(
"%s %s %s --download --saskey $BLOBXFER_SASKEY --remoteresource . --include %s/*.rds",
accountName,
jobId,
"$AZ_BATCH_TASK_WORKING_DIR",
taskId
)

exitConditions <- NULL
if (!is.null(args$dependsOn)) {
dependsOn <- args$dependsOn
}
else {
exitConditions <- list(default = list(dependencyAction = "satisfy"))
}

downloadCommand <-
dockerRunCommand("alfpark/blobxfer:0.12.1", copyCommand, "blobxfer", FALSE)

commands <- c(downloadCommand, dockerRunCommand(containerImage, rCommand))
commands <- linuxWrapCommands(commands)

sasToken <- rAzureBatch::createSasToken("rwcl", "c", jobId)
queryParameterUrl <- "?"

for (query in names(sasToken)) {
queryParameterUrl <-
paste0(queryParameterUrl,
query,
"=",
RCurl::curlEscape(sasToken[[query]]),
"&")
}

queryParameterUrl <-
substr(queryParameterUrl, 1, nchar(queryParameterUrl) - 1)

setting <- list(name = "BLOBXFER_SASKEY",
value = queryParameterUrl)

rAzureBatch::addTask(
jobId,
taskId,
commandLine = commands,
environmentSettings = list(setting),
dependsOn = dependsOn,
outputFiles = outputFiles,
exitConditions = exitConditions
)
}

.addTask <- function(jobId, taskId, rCommand, ...) {
storageCredentials <- rAzureBatch::getStorageCredentials()

Expand All @@ -14,7 +193,6 @@
maxTaskRetryCount <- args$maxTaskRetryCount
}

resultFile <- paste0(taskId, "-result", ".rds")
accountName <- storageCredentials$name

resourceFiles <- NULL
Expand Down Expand Up @@ -67,14 +245,6 @@
)

outputFiles <- list(
list(
filePattern = resultFile,
destination = list(container = list(
path = paste0("result/", resultFile),
containerUrl = containerUrl
)),
uploadOptions = list(uploadCondition = "taskCompletion")
),
list(
filePattern = paste0(taskId, ".txt"),
destination = list(container = list(
Expand Down
Loading

0 comments on commit 852dba0

Please sign in to comment.