first pass at fixup for c71965

git-svn-id: https://svn.r-project.org/R/trunk@71986 00db46b3-68df-0310-9c12-caf00c1e9a41
wch · Jan 16, 2017 · 7ca0161 · 7ca0161
1 parent 2f9cf02
commit 7ca0161
Show file tree

Hide file tree

Showing 5 changed files with 75 additions and 80 deletions.
diff --git a/doc/NEWS.Rd b/doc/NEWS.Rd
@@ -592,6 +592,9 @@
       URL existence before attempting downloads; this is more robust to
       servers that do not support HEAD or range-based retrieval, but may
       create empty or incomplete files for aborted download requests.
+
+      \item Bandwidth selectors \code{bw.ucv()}, \code{bw.bcv()} and
+      \code{bw.SJ()} now avoid integer overflow for large sample sizes.
     }
   }
 }
@@ -676,15 +679,20 @@
       removes leading whitespace and only attempts to open
       \samp{http://} and \samp{https://} URLs, falling back to emailing
       the maintainer.
+
+      \item Bandwidth selectors \code{bw.ucv()} and
+      \code{bw.SJ()} gave incorrect answers or incorrectly reported an
+      error (because of integer overflow) for inputs longer than
+      46341.  Similarly for \code{b2.bcv()} at length 5793.
+
+      Another possible integer overflow is checked and may result in a
+      reported error (rather than an incorrect result) for much longer
+      inputs (millions for a smooth distribution).
 
       \item \code{findMethod} failed if the active signature had
       expanded beyond what a particular package used. (Example with
       packages \CRANpkg{XR} and \CRANpkg{XRJulia} on \acronym{CRAN}).
-
-      \item \code{bw.SJ()} and \code{density(*, bw="SJ")} no longer fail
-      because of integer overflow for large sample size n (starting from
-      n as small as 65536).
-    }
+   }
   }
 }
 

diff --git a/src/library/stats/R/bandwidths.R b/src/library/stats/R/bandwidths.R
@@ -48,22 +48,20 @@ bw.SJ <- function(x, nb = 1000L, lower = 0.1*hmax, upper = hmax,
     if (is.na(nb) || nb <= 0L) stop("invalid 'nb'")
     storage.mode(x) <- "double"
 
-    ## NB cnt[] := counts of all binned pair distances ==> O(n^2)
-    ##    sum(cnt) == choose(n,2) == n*(n-1)/2
-    Z <- .Call(C_bw_den, nb, x); d <- Z[[1L]]; cnt <- Z[[2L]]
+    method <- match.arg(method)
 
-    ## and these use (d, cnt)
     SDh <- function(h) .Call(C_bw_phi4, n, d, cnt, h)
     TDh <- function(h) .Call(C_bw_phi6, n, d, cnt, h)
 
+    Z <- .Call(C_bw_den, nb, x); d <- Z[[1L]]; cnt <- Z[[2L]]
     scale <- min(sd(x), IQR(x)/1.349)
     a <- 1.24 * scale * n^(-1/7)
     b <- 1.23 * scale * n^(-1/9)
     c1 <- 1/(2*sqrt(pi)*n)
     TD  <- -TDh(b)
     if(!is.finite(TD) || TD <= 0)
         stop("sample is too sparse to find TD", domain = NA)
-    if(match.arg(method) == "dpi")
+    if(method == "dpi")
         res <- (c1/SDh((2.394/(n * TD))^(1/7)))^(1/5)
     else {
         if(bnd.Miss <- missing(lower) || missing(upper)) {
@@ -73,8 +71,8 @@ bw.SJ <- function(x, nb = 1000L, lower = 0.1*hmax, upper = hmax,
         alph2 <- 1.357*(SDh(a)/TD)^(1/7)
         if(!is.finite(alph2))
             stop("sample is too sparse to find alph2", domain  = NA)
-        fSD <- function(h) ( c1 / SDh(alph2 * h^(5/7)) )^(1/5) - h
         itry <- 1L
+        fSD <- function(h) ( c1 / SDh(alph2 * h^(5/7)) )^(1/5) - h
 	while (fSD(lower) * fSD(upper) > 0) {
 	    if(itry > 99L || !bnd.Miss) # 1.2 ^ 99 = 69'014'979 .. enough
 		stop("no solution in the specified range of bandwidths")

diff --git a/src/library/stats/man/bandwidth.Rd b/src/library/stats/man/bandwidth.Rd
@@ -60,14 +60,19 @@ bw.SJ(x, nb = 1000, lower = 0.1 * hmax, upper = hmax,
   biased cross-validation respectively.
 
   \code{bw.SJ} implements the methods of Sheather & Jones (1991)
-  to select the bandwidth using pilot estimation of derivatives, based
-  on all pairwise binned distances, hence of complexity \eqn{O(n^2)} and
-  considerably slower than the other bandwidth selectors for larger \eqn{n}.
-    \cr
+  to select the bandwidth using pilot estimation of derivatives.\cr
   The algorithm for method \code{"ste"} solves an equation (via
   \code{\link{uniroot}}) and because of that, enlarges the interval
   \code{c(lower, upper)} when the boundaries were not user-specified and
   do not bracket the root.
+
+  The last three methods use all pairwise binned distances, hence are of
+  complexity \eqn{O(n^2)} and considerably slower than the first two for
+  larger \eqn{n}.  A faster method for the Sheather--Jones
+  direct-plug-in selector for larger \eqn{n} is available in package
+  \CRANpkg{KernSmooth}, using binning: its function
+  \code{\link[Kernsmooth]{dpik}} is a direct replacement for
+  \code{bw.SJ(method = "dpi")}.
 }
 \value{
   A bandwidth on a scale suitable for the \code{bw} argument

diff --git a/src/library/stats/man/density.Rd b/src/library/stats/man/density.Rd
@@ -1,6 +1,6 @@
 % File src/library/stats/man/density.Rd
 % Part of the R package, https://www.R-project.org
-% Copyright 1995-2017 R Core Team
+% Copyright 1995-2013 R Core Team
 % Distributed under GPL 2 or later
 
 \name{density}
@@ -29,8 +29,8 @@ density(x, \dots)
     bandwidth.  See \code{\link{bw.nrd}}. \cr The default,
     \code{"nrd0"}, has remained the default for historical and
     compatibility reasons, rather than as a general recommendation,
-    where e.g., \code{"SJ"} would rather fit (with a computational cost
-    though, for larger sample sizes), see also Venables and Ripley (2002).
+    where e.g., \code{"SJ"} would rather fit, see also Venables and
+    Ripley (2002).
 
     The specified (or computed) value of \code{bw} is multiplied by
     \code{adjust}.

diff --git a/src/library/stats/src/bandwidths.c b/src/library/stats/src/bandwidths.c
@@ -20,7 +20,6 @@
 
 #include <stdlib.h> //abs
 #include <math.h>
-
 #include <Rmath.h> // M_* constants
 #include <Rinternals.h>
 
@@ -40,23 +39,25 @@
 SEXP bw_ucv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
 {
     double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
-    int n = asInteger(sn), nbin = LENGTH(cnt), *x = INTEGER(cnt);
+    int n = asInteger(sn), nbin = LENGTH(cnt);
+    double *x = REAL(cnt);
     for (int i = 0; i < nbin; i++) {
 	double delta = i * d / h;
 	delta *= delta;
 	if (delta >= DELMAX) break;
-	term = exp(-delta / 4) - sqrt(8.0) * exp(-delta / 2);
+	term = exp(-delta / 4.) - sqrt(8.0) * exp(-delta / 2.);
 	sum += term * x[i];
     }
-    u = (0.5 + sum) / (n * h * M_SQRT_PI);
+    u = (0.5 + sum/n) / (n * h * M_SQRT_PI);
     // = 1 / (2 * n * h * sqrt(PI)) + sum / (n * n * h * sqrt(PI));
     return ScalarReal(u);
 }
 
 SEXP bw_bcv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
 {
     double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
-    int n = asInteger(sn), nbin = LENGTH(cnt), *x = INTEGER(cnt);
+    int n = asInteger(sn), nbin = LENGTH(cnt);
+    double *x = REAL(cnt);
 
     sum = 0.0;
     for (int i = 0; i < nbin; i++) {
@@ -65,7 +66,7 @@ SEXP bw_bcv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
 	term = exp(-delta / 4) * (delta * delta - 12 * delta + 12);
 	sum += term * x[i];
     }
-    u = (1 + sum/(32*n)) / (2 * n * h * M_SQRT_PI);
+    u = (1 + sum/(32.0*n)) / (2.0 * n * h * M_SQRT_PI);
     // = 1 / (2 * n * h * sqrt(PI)) + sum / (64 * n * n * h * sqrt(PI));
     return ScalarReal(u);
 }
@@ -74,53 +75,47 @@ SEXP bw_phi4(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
 {
     double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
     int n = asInteger(sn), nbin = LENGTH(cnt);
+    double *x = REAL(cnt);
 
-#define PHI4_SUM							\
-    for (int i = 0; i < nbin; i++) {					\
-	double delta = i * d / h; delta *= delta;			\
-	if (delta >= DELMAX) break;					\
-	term = exp(-delta / 2) * ((delta - 6)* delta + 3);		\
-	sum += term * x[i];						\
-    }
-    if(TYPEOF(cnt) == INTSXP) {
-	int* x = INTEGER(cnt);
-	PHI4_SUM;
-    } else { // TYPEOF(cnt) == REALSXP
-	double* x = REAL(cnt);
-	PHI4_SUM;
+    for (int i = 0; i < nbin; i++) {
+	double delta = i * d / h; delta *= delta;
+	if (delta >= DELMAX) break;
+	term = exp(-delta / 2.) * (delta * delta - 6. * delta + 3.);
+	sum += term * x[i];
     }
-    sum = 2 * sum + n * 3;	/* add in diagonal */
-    u = sum / (n * ((double)(n - 1)) * pow(h, 5.0)) * M_1_SQRT_2PI;
+    sum = 2. * sum + n * 3.;	/* add in diagonal */
+    u = sum / ((double)n * (n - 1) * pow(h, 5.0)) * M_1_SQRT_2PI;
+    // = sum / (n * (n - 1) * pow(h, 5.0) * sqrt(2 * PI));
     return ScalarReal(u);
 }
-#undef PHI4_SUM
 
 SEXP bw_phi6(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
 {
     double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
     int n = asInteger(sn), nbin = LENGTH(cnt);
-#define PHI6_SUM							\
-    for (int i = 0; i < nbin; i++) {					\
-	double delta = i * d / h; delta *= delta;			\
-	if (delta >= DELMAX) break;					\
-	term = exp(-delta / 2) *					\
-	    (((delta - 15) * delta + 45) * delta - 15);			\
-	sum += term * x[i];						\
-    }
-    if(TYPEOF(cnt) == INTSXP) {
-	int* x = INTEGER(cnt);
-	PHI6_SUM;
-    } else { // TYPEOF(cnt) == REALSXP
-	double* x = REAL(cnt);
-	PHI6_SUM;
+    double *x = REAL(cnt);
+
+    for (int i = 0; i < nbin; i++) {
+	double delta = i * d / h; delta *= delta;
+	if (delta >= DELMAX) break;
+	term = exp(-delta / 2) *
+	    (delta * delta * delta - 15 * delta * delta + 45 * delta - 15);
+	sum += term * x[i];
     }
-    sum = 2 * sum - 15 * n;	/* add in diagonal */
-    u = sum / (n * ((double)(n - 1)) * pow(h, 7.0)) * M_1_SQRT_2PI;
+    sum = 2. * sum - 15. * n;	/* add in diagonal */
+    u = sum / ((double)n * (n - 1) * pow(h, 7.0)) * M_1_SQRT_2PI;
+    // = sum / (n * (n - 1) * pow(h, 7.0) * sqrt(2 * PI));
     return ScalarReal(u);
 }
-#undef PHI6_SUM
 
 /* This would be impracticable for long vectors.  Better to bin x first */
+
+/* 
+   Use double cnt as from R 3.4.0, as counts can exceed INT_MAX for
+   large n (65537 in the worse case but typically not at n = 1 million
+   for a smooth distribution).
+*/
+
 SEXP bw_den(SEXP nbin, SEXP sx)
 {
     int nb = asInteger(nbin), n = LENGTH(sx);
@@ -130,35 +125,24 @@ SEXP bw_den(SEXP nbin, SEXP sx)
     for (int i = 0; i < n; i++) {
 	if(!R_FINITE(x[i]))
 	    error(_("non-finite x[%d] in bandwidth calculation"), i+1);
-	if     (x[i] < xmin) xmin = x[i];
-	else if(x[i] > xmax) xmax = x[i];
+	if(x[i] < xmin) xmin = x[i];
+	if(x[i] > xmax) xmax = x[i];
     }
     rang = (xmax - xmin) * 1.01;
     dd = rang / nb;
 
-    Rboolean n_lrg = n >= 65537; // <=> n(n-1)/2 > 2^31 - 1
     SEXP ans = PROTECT(allocVector(VECSXP, 2)),
-	sc = SET_VECTOR_ELT(ans, 1, allocVector(n_lrg ? REALSXP : INTSXP, nb));
+	sc = SET_VECTOR_ELT(ans, 1, allocVector(REALSXP, nb));
     SET_VECTOR_ELT(ans, 0, ScalarReal(dd));
-
-#define DO_CNT_I_J							\
-    for (int i = 0; i < nb; i++) cnt[i] = 0;				\
-									\
-    /* -> O(n^2) ; sum(cnt[]) == n(n-1)/2  => can overflow if(n_lrg) */ \
-    for (int i = 1; i < n; i++) {					\
-	int ii = (int)(x[i] / dd);					\
-	for (int j = 0; j < i; j++) {					\
-	    int jj = (int)(x[j] / dd);					\
-	    cnt[abs(ii - jj)]++;					\
-	}								\
-    }
-
-    if(n_lrg) {
-	double *cnt = REAL(sc);
-	DO_CNT_I_J
-    } else {
-	int *cnt = INTEGER(sc);
-	DO_CNT_I_J
+    double *cnt = REAL(sc);
+    for (int i = 0; i < nb; i++) cnt[i] = 0.0;
+
+    for (int i = 1; i < n; i++) {
+	int ii = (int)(x[i] / dd);
+	for (int j = 0; j < i; j++) {
+	    int jj = (int)(x[j] / dd);
+	    cnt[abs(ii - jj)] += 1.0;
+	}
     }
 
     UNPROTECT(1);