Skip to content

Commit

Permalink
first pass at fixup for c71965
Browse files Browse the repository at this point in the history
git-svn-id: https://svn.r-project.org/R/trunk@71986 00db46b3-68df-0310-9c12-caf00c1e9a41
  • Loading branch information
ripley committed Jan 16, 2017
1 parent 2f9cf02 commit 7ca0161
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 80 deletions.
18 changes: 13 additions & 5 deletions doc/NEWS.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -592,6 +592,9 @@
URL existence before attempting downloads; this is more robust to
servers that do not support HEAD or range-based retrieval, but may
create empty or incomplete files for aborted download requests.

\item Bandwidth selectors \code{bw.ucv()}, \code{bw.bcv()} and
\code{bw.SJ()} now avoid integer overflow for large sample sizes.
}
}
}
Expand Down Expand Up @@ -676,15 +679,20 @@
removes leading whitespace and only attempts to open
\samp{http://} and \samp{https://} URLs, falling back to emailing
the maintainer.

\item Bandwidth selectors \code{bw.ucv()} and
\code{bw.SJ()} gave incorrect answers or incorrectly reported an
error (because of integer overflow) for inputs longer than
46341. Similarly for \code{b2.bcv()} at length 5793.

Another possible integer overflow is checked and may result in a
reported error (rather than an incorrect result) for much longer
inputs (millions for a smooth distribution).

\item \code{findMethod} failed if the active signature had
expanded beyond what a particular package used. (Example with
packages \CRANpkg{XR} and \CRANpkg{XRJulia} on \acronym{CRAN}).

\item \code{bw.SJ()} and \code{density(*, bw="SJ")} no longer fail
because of integer overflow for large sample size n (starting from
n as small as 65536).
}
}
}
}

Expand Down
10 changes: 4 additions & 6 deletions src/library/stats/R/bandwidths.R
Original file line number Diff line number Diff line change
Expand Up @@ -48,22 +48,20 @@ bw.SJ <- function(x, nb = 1000L, lower = 0.1*hmax, upper = hmax,
if (is.na(nb) || nb <= 0L) stop("invalid 'nb'")
storage.mode(x) <- "double"

## NB cnt[] := counts of all binned pair distances ==> O(n^2)
## sum(cnt) == choose(n,2) == n*(n-1)/2
Z <- .Call(C_bw_den, nb, x); d <- Z[[1L]]; cnt <- Z[[2L]]
method <- match.arg(method)

## and these use (d, cnt)
SDh <- function(h) .Call(C_bw_phi4, n, d, cnt, h)
TDh <- function(h) .Call(C_bw_phi6, n, d, cnt, h)

Z <- .Call(C_bw_den, nb, x); d <- Z[[1L]]; cnt <- Z[[2L]]
scale <- min(sd(x), IQR(x)/1.349)
a <- 1.24 * scale * n^(-1/7)
b <- 1.23 * scale * n^(-1/9)
c1 <- 1/(2*sqrt(pi)*n)
TD <- -TDh(b)
if(!is.finite(TD) || TD <= 0)
stop("sample is too sparse to find TD", domain = NA)
if(match.arg(method) == "dpi")
if(method == "dpi")
res <- (c1/SDh((2.394/(n * TD))^(1/7)))^(1/5)
else {
if(bnd.Miss <- missing(lower) || missing(upper)) {
Expand All @@ -73,8 +71,8 @@ bw.SJ <- function(x, nb = 1000L, lower = 0.1*hmax, upper = hmax,
alph2 <- 1.357*(SDh(a)/TD)^(1/7)
if(!is.finite(alph2))
stop("sample is too sparse to find alph2", domain = NA)
fSD <- function(h) ( c1 / SDh(alph2 * h^(5/7)) )^(1/5) - h
itry <- 1L
fSD <- function(h) ( c1 / SDh(alph2 * h^(5/7)) )^(1/5) - h
while (fSD(lower) * fSD(upper) > 0) {
if(itry > 99L || !bnd.Miss) # 1.2 ^ 99 = 69'014'979 .. enough
stop("no solution in the specified range of bandwidths")
Expand Down
13 changes: 9 additions & 4 deletions src/library/stats/man/bandwidth.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -60,14 +60,19 @@ bw.SJ(x, nb = 1000, lower = 0.1 * hmax, upper = hmax,
biased cross-validation respectively.
\code{bw.SJ} implements the methods of Sheather & Jones (1991)
to select the bandwidth using pilot estimation of derivatives, based
on all pairwise binned distances, hence of complexity \eqn{O(n^2)} and
considerably slower than the other bandwidth selectors for larger \eqn{n}.
\cr
to select the bandwidth using pilot estimation of derivatives.\cr
The algorithm for method \code{"ste"} solves an equation (via
\code{\link{uniroot}}) and because of that, enlarges the interval
\code{c(lower, upper)} when the boundaries were not user-specified and
do not bracket the root.
The last three methods use all pairwise binned distances, hence are of
complexity \eqn{O(n^2)} and considerably slower than the first two for
larger \eqn{n}. A faster method for the Sheather--Jones
direct-plug-in selector for larger \eqn{n} is available in package
\CRANpkg{KernSmooth}, using binning: its function
\code{\link[Kernsmooth]{dpik}} is a direct replacement for
\code{bw.SJ(method = "dpi")}.
}
\value{
A bandwidth on a scale suitable for the \code{bw} argument
Expand Down
6 changes: 3 additions & 3 deletions src/library/stats/man/density.Rd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
% File src/library/stats/man/density.Rd
% Part of the R package, https://www.R-project.org
% Copyright 1995-2017 R Core Team
% Copyright 1995-2013 R Core Team
% Distributed under GPL 2 or later

\name{density}
Expand Down Expand Up @@ -29,8 +29,8 @@ density(x, \dots)
bandwidth. See \code{\link{bw.nrd}}. \cr The default,
\code{"nrd0"}, has remained the default for historical and
compatibility reasons, rather than as a general recommendation,
where e.g., \code{"SJ"} would rather fit (with a computational cost
though, for larger sample sizes), see also Venables and Ripley (2002).
where e.g., \code{"SJ"} would rather fit, see also Venables and
Ripley (2002).

The specified (or computed) value of \code{bw} is multiplied by
\code{adjust}.
Expand Down
108 changes: 46 additions & 62 deletions src/library/stats/src/bandwidths.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#include <stdlib.h> //abs
#include <math.h>

#include <Rmath.h> // M_* constants
#include <Rinternals.h>

Expand All @@ -40,23 +39,25 @@
SEXP bw_ucv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
{
double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
int n = asInteger(sn), nbin = LENGTH(cnt), *x = INTEGER(cnt);
int n = asInteger(sn), nbin = LENGTH(cnt);
double *x = REAL(cnt);
for (int i = 0; i < nbin; i++) {
double delta = i * d / h;
delta *= delta;
if (delta >= DELMAX) break;
term = exp(-delta / 4) - sqrt(8.0) * exp(-delta / 2);
term = exp(-delta / 4.) - sqrt(8.0) * exp(-delta / 2.);
sum += term * x[i];
}
u = (0.5 + sum) / (n * h * M_SQRT_PI);
u = (0.5 + sum/n) / (n * h * M_SQRT_PI);
// = 1 / (2 * n * h * sqrt(PI)) + sum / (n * n * h * sqrt(PI));
return ScalarReal(u);
}

SEXP bw_bcv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
{
double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
int n = asInteger(sn), nbin = LENGTH(cnt), *x = INTEGER(cnt);
int n = asInteger(sn), nbin = LENGTH(cnt);
double *x = REAL(cnt);

sum = 0.0;
for (int i = 0; i < nbin; i++) {
Expand All @@ -65,7 +66,7 @@ SEXP bw_bcv(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
term = exp(-delta / 4) * (delta * delta - 12 * delta + 12);
sum += term * x[i];
}
u = (1 + sum/(32*n)) / (2 * n * h * M_SQRT_PI);
u = (1 + sum/(32.0*n)) / (2.0 * n * h * M_SQRT_PI);
// = 1 / (2 * n * h * sqrt(PI)) + sum / (64 * n * n * h * sqrt(PI));
return ScalarReal(u);
}
Expand All @@ -74,53 +75,47 @@ SEXP bw_phi4(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
{
double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
int n = asInteger(sn), nbin = LENGTH(cnt);
double *x = REAL(cnt);

#define PHI4_SUM \
for (int i = 0; i < nbin; i++) { \
double delta = i * d / h; delta *= delta; \
if (delta >= DELMAX) break; \
term = exp(-delta / 2) * ((delta - 6)* delta + 3); \
sum += term * x[i]; \
}
if(TYPEOF(cnt) == INTSXP) {
int* x = INTEGER(cnt);
PHI4_SUM;
} else { // TYPEOF(cnt) == REALSXP
double* x = REAL(cnt);
PHI4_SUM;
for (int i = 0; i < nbin; i++) {
double delta = i * d / h; delta *= delta;
if (delta >= DELMAX) break;
term = exp(-delta / 2.) * (delta * delta - 6. * delta + 3.);
sum += term * x[i];
}
sum = 2 * sum + n * 3; /* add in diagonal */
u = sum / (n * ((double)(n - 1)) * pow(h, 5.0)) * M_1_SQRT_2PI;
sum = 2. * sum + n * 3.; /* add in diagonal */
u = sum / ((double)n * (n - 1) * pow(h, 5.0)) * M_1_SQRT_2PI;
// = sum / (n * (n - 1) * pow(h, 5.0) * sqrt(2 * PI));
return ScalarReal(u);
}
#undef PHI4_SUM

SEXP bw_phi6(SEXP sn, SEXP sd, SEXP cnt, SEXP sh)
{
double h = asReal(sh), d = asReal(sd), sum = 0.0, term, u;
int n = asInteger(sn), nbin = LENGTH(cnt);
#define PHI6_SUM \
for (int i = 0; i < nbin; i++) { \
double delta = i * d / h; delta *= delta; \
if (delta >= DELMAX) break; \
term = exp(-delta / 2) * \
(((delta - 15) * delta + 45) * delta - 15); \
sum += term * x[i]; \
}
if(TYPEOF(cnt) == INTSXP) {
int* x = INTEGER(cnt);
PHI6_SUM;
} else { // TYPEOF(cnt) == REALSXP
double* x = REAL(cnt);
PHI6_SUM;
double *x = REAL(cnt);

for (int i = 0; i < nbin; i++) {
double delta = i * d / h; delta *= delta;
if (delta >= DELMAX) break;
term = exp(-delta / 2) *
(delta * delta * delta - 15 * delta * delta + 45 * delta - 15);
sum += term * x[i];
}
sum = 2 * sum - 15 * n; /* add in diagonal */
u = sum / (n * ((double)(n - 1)) * pow(h, 7.0)) * M_1_SQRT_2PI;
sum = 2. * sum - 15. * n; /* add in diagonal */
u = sum / ((double)n * (n - 1) * pow(h, 7.0)) * M_1_SQRT_2PI;
// = sum / (n * (n - 1) * pow(h, 7.0) * sqrt(2 * PI));
return ScalarReal(u);
}
#undef PHI6_SUM

/* This would be impracticable for long vectors. Better to bin x first */

/*
Use double cnt as from R 3.4.0, as counts can exceed INT_MAX for
large n (65537 in the worse case but typically not at n = 1 million
for a smooth distribution).
*/

SEXP bw_den(SEXP nbin, SEXP sx)
{
int nb = asInteger(nbin), n = LENGTH(sx);
Expand All @@ -130,35 +125,24 @@ SEXP bw_den(SEXP nbin, SEXP sx)
for (int i = 0; i < n; i++) {
if(!R_FINITE(x[i]))
error(_("non-finite x[%d] in bandwidth calculation"), i+1);
if (x[i] < xmin) xmin = x[i];
else if(x[i] > xmax) xmax = x[i];
if(x[i] < xmin) xmin = x[i];
if(x[i] > xmax) xmax = x[i];
}
rang = (xmax - xmin) * 1.01;
dd = rang / nb;

Rboolean n_lrg = n >= 65537; // <=> n(n-1)/2 > 2^31 - 1
SEXP ans = PROTECT(allocVector(VECSXP, 2)),
sc = SET_VECTOR_ELT(ans, 1, allocVector(n_lrg ? REALSXP : INTSXP, nb));
sc = SET_VECTOR_ELT(ans, 1, allocVector(REALSXP, nb));
SET_VECTOR_ELT(ans, 0, ScalarReal(dd));

#define DO_CNT_I_J \
for (int i = 0; i < nb; i++) cnt[i] = 0; \
\
/* -> O(n^2) ; sum(cnt[]) == n(n-1)/2 => can overflow if(n_lrg) */ \
for (int i = 1; i < n; i++) { \
int ii = (int)(x[i] / dd); \
for (int j = 0; j < i; j++) { \
int jj = (int)(x[j] / dd); \
cnt[abs(ii - jj)]++; \
} \
}

if(n_lrg) {
double *cnt = REAL(sc);
DO_CNT_I_J
} else {
int *cnt = INTEGER(sc);
DO_CNT_I_J
double *cnt = REAL(sc);
for (int i = 0; i < nb; i++) cnt[i] = 0.0;

for (int i = 1; i < n; i++) {
int ii = (int)(x[i] / dd);
for (int j = 0; j < i; j++) {
int jj = (int)(x[j] / dd);
cnt[abs(ii - jj)] += 1.0;
}
}

UNPROTECT(1);
Expand Down

0 comments on commit 7ca0161

Please sign in to comment.