From ec770100452ca1a869058e448b1b41c8efb810d9 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sat, 5 Dec 2015 09:53:39 -0800 Subject: [PATCH 1/6] add sample functions with seeds --- R/pkg/R/DataFrame.R | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 81b4e6b91d8a..35b2276c5b5b 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -677,25 +677,44 @@ setMethod("unique", #' collect(sample(df, TRUE, 0.5)) #'} setMethod("sample", - # TODO : Figure out how to send integer as java.lang.Long to JVM so - # we can send seed as an argument through callJMethod signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric"), - function(x, withReplacement, fraction) { + fraction = "numeric", seed = "missing"), + function(x, withReplacement, fraction, seed) { if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction) dataFrame(sdf) }) +#' @rdname sample +#' @name sample +setMethod("sample", + # we can send seed as an argument through callJMethod + signature(x = "DataFrame", withReplacement = "logical", + fraction = "numeric", seed = "numeric"), + function(x, withReplacement, fraction, seed) { + if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) + sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed)) + dataFrame(sdf) + }) + #' @rdname sample #' @name sample_frac setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric"), - function(x, withReplacement, fraction) { + fraction = "numeric", seed = "missing"), + function(x, withReplacement, fraction, seed) { sample(x, withReplacement, fraction) }) +#' @rdname sample +#' @name sample_frac +setMethod("sample_frac", + signature(x = "DataFrame", withReplacement = "logical", + fraction = "numeric", seed = "numeric"), + function(x, withReplacement, fraction, seed) { + sample(x, withReplacement, fraction, as.integer(seed)) + }) + #' nrow #' #' Returns the number of rows in a DataFrame From 2ab89e8584ce3708d04f2113d552da2b6635922d Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 6 Dec 2015 00:09:48 -0800 Subject: [PATCH 2/6] combine two variants --- R/pkg/R/DataFrame.R | 37 ++++++++++++------------------------- 1 file changed, 12 insertions(+), 25 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 35b2276c5b5b..33982f555f98 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -678,22 +678,14 @@ setMethod("unique", #'} setMethod("sample", signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric", seed = "missing"), + fraction = "numeric"), function(x, withReplacement, fraction, seed) { if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) - sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction) - dataFrame(sdf) - }) - -#' @rdname sample -#' @name sample -setMethod("sample", - # we can send seed as an argument through callJMethod - signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric", seed = "numeric"), - function(x, withReplacement, fraction, seed) { - if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) - sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed)) + if (!missing(seed)) { + sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed)) + } else { + sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction) + } dataFrame(sdf) }) @@ -701,18 +693,13 @@ setMethod("sample", #' @name sample_frac setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric", seed = "missing"), - function(x, withReplacement, fraction, seed) { - sample(x, withReplacement, fraction) - }) - -#' @rdname sample -#' @name sample_frac -setMethod("sample_frac", - signature(x = "DataFrame", withReplacement = "logical", - fraction = "numeric", seed = "numeric"), + fraction = "numeric"), function(x, withReplacement, fraction, seed) { - sample(x, withReplacement, fraction, as.integer(seed)) + if (!missing(seed)) { + sample(x, withReplacement, fraction, as.integer(seed)) + } else { + sample(x, withReplacement, fraction) + } }) #' nrow From 34d01188485ac375c811362af32e9b9aad872631 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 6 Dec 2015 21:23:20 -0800 Subject: [PATCH 3/6] added a test case --- R/pkg/inst/tests/test_sparkSQL.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/R/pkg/inst/tests/test_sparkSQL.R b/R/pkg/inst/tests/test_sparkSQL.R index 1e7cb5409970..6065e2d480c5 100644 --- a/R/pkg/inst/tests/test_sparkSQL.R +++ b/R/pkg/inst/tests/test_sparkSQL.R @@ -705,6 +705,10 @@ test_that("sample on a DataFrame", { sampled2 <- sample(df, FALSE, 0.1, 0) # set seed for predictable result expect_true(count(sampled2) < 3) + count1 <- count(sample(df, FALSE, 0.1, 0)) + count2 <- count(sample(df, FALSE, 0.1, 0)) + expect_equal(count1, count2) + # Also test sample_frac sampled3 <- sample_frac(df, FALSE, 0.1, 0) # set seed for predictable result expect_true(count(sampled3) < 3) From 4337c35e03f8d70a6346e43888eaf3b6ba341722 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 6 Dec 2015 21:41:40 -0800 Subject: [PATCH 4/6] simplified the impl of sample_frac --- R/pkg/R/DataFrame.R | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 33982f555f98..352677c72453 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -682,6 +682,8 @@ setMethod("sample", function(x, withReplacement, fraction, seed) { if (fraction < 0.0) stop(cat("Negative fraction value:", fraction)) if (!missing(seed)) { + # TODO : Figure out how to send integer as java.lang.Long to JVM so + # we can send seed as an argument through callJMethod sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction, as.integer(seed)) } else { sdf <- callJMethod(x@sdf, "sample", withReplacement, fraction) @@ -695,11 +697,7 @@ setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", fraction = "numeric"), function(x, withReplacement, fraction, seed) { - if (!missing(seed)) { sample(x, withReplacement, fraction, as.integer(seed)) - } else { - sample(x, withReplacement, fraction) - } }) #' nrow From a78109e694703686009464f2b733c72070d81928 Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Sun, 6 Dec 2015 22:17:39 -0800 Subject: [PATCH 5/6] simplified the impl of sample_frac. --- R/pkg/R/DataFrame.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index 352677c72453..fd3352fdaefc 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -697,7 +697,7 @@ setMethod("sample_frac", signature(x = "DataFrame", withReplacement = "logical", fraction = "numeric"), function(x, withReplacement, fraction, seed) { - sample(x, withReplacement, fraction, as.integer(seed)) + sample(x, withReplacement, fraction, seed) }) #' nrow From 493b3689d2e4e67ee6c8501cec419e67bd82a69e Mon Sep 17 00:00:00 2001 From: gatorsmile Date: Mon, 7 Dec 2015 13:49:45 -0800 Subject: [PATCH 6/6] added a new parm into the comment --- R/pkg/R/DataFrame.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index fd3352fdaefc..6e3edc576483 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -662,6 +662,7 @@ setMethod("unique", #' @param x A SparkSQL DataFrame #' @param withReplacement Sampling with replacement or not #' @param fraction The (rough) sample target fraction +#' @param seed Randomness seed value #' #' @family DataFrame functions #' @rdname sample