Skip to content

Commit

Permalink
r.quantile, r.stats.quantile, lib/stats: fix quantile algorithm (#2108)
Browse files Browse the repository at this point in the history
* use type 7 algorithm of Hyndman and Fan (1996) for quantiles, as is the default in R and numpy 
 * update manuals for `r.quantile`, `r.stats.quantile`
 * sync `r.stats quantile` to `r.quantile`
 * update test results for `r.neighbors`
  • Loading branch information
metzm authored Jan 21, 2022
1 parent 9317110 commit c700a8a
Show file tree
Hide file tree
Showing 6 changed files with 227 additions and 99 deletions.
3 changes: 2 additions & 1 deletion lib/stats/c_percentile.c
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ void c_quant(DCELL * result, DCELL * values, int n, const void *closure)
return;
}

k = n * quant;
/* algorithm type 7 of Hyndman and Fan (1996), default in R */
k = quant * (n - 1);
i0 = (int)floor(k);
i1 = (int)ceil(k);

Expand Down
98 changes: 49 additions & 49 deletions raster/r.neighbors/testsuite/test_r_neighbors.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,18 +41,18 @@ class TestNeighbors(TestCase):
"sum": 221701132.328964,
},
"perc90": {
"n": 2022154,
"null_cells": 2846,
"n": 2025000,
"null_cells": 0,
"cells": 2025000,
"min": 50.5460597991944,
"max": 153.0166015625,
"range": 102.470541763306,
"mean": 100.145484105243,
"mean_of_abs": 100.145484105243,
"stddev": 18.2647774894326,
"variance": 333.602096738483,
"coeff_var": 18.2382437437099,
"sum": 202509591.265354,
"min": 56.0450115203857,
"max": 156.317572021484,
"range": 100.272560501099,
"mean": 111.060386339868,
"mean_of_abs": 111.060386339868,
"stddev": 20.301968270214,
"variance": 412.169915644776,
"coeff_var": 18.2801167358501,
"sum": 224897282.338233,
},
"average": {
"n": 2025000,
Expand Down Expand Up @@ -116,43 +116,43 @@ class TestNeighbors(TestCase):
"n": 2025000,
"null_cells": 0,
"cells": 2025000,
"min": 55.5858826446533,
"max": 156.204046478271,
"range": 100.618163833618,
"mean": 109.505643774445,
"mean_of_abs": 109.505643774445,
"stddev": 20.3076172385227,
"variance": 412.399317906343,
"coeff_var": 18.5448133434578,
"sum": 221748928.643252,
"min": 55.5847009658813,
"max": 156.203791503906,
"range": 100.619090538025,
"mean": 109.503018547944,
"mean_of_abs": 109.503018547944,
"stddev": 20.3076172385226,
"variance": 412.398941046239,
"coeff_var": 18.5452494634586,
"sum": 221743612.559588,
},
0.03: {
"n": 2025000,
"null_cells": 0,
"cells": 2025000,
"min": 55.600062789917,
"max": 156.208636016846,
"range": 100.608573226929,
"mean": 109.552850010781,
"mean_of_abs": 109.552850010781,
"stddev": 20.3078616792495,
"variance": 412.409245983529,
"coeff_var": 18.5370455239192,
"sum": 221844521.271832,
"min": 55.5965177536011,
"max": 156.20787109375,
"range": 100.611353340149,
"mean": 109.544974331288,
"mean_of_abs": 109.544974331288,
"stddev": 20.3078125947647,
"variance": 412.407252384084,
"coeff_var": 18.5383334276472,
"sum": 221828573.020859,
},
0.95: {
"n": 2022154,
"null_cells": 2846,
"n": 2025000,
"null_cells": 0,
"cells": 2025000,
"min": 25.2730298995972,
"max": 153.0166015625,
"range": 127.743571662903,
"mean": 50.141999640313,
"mean_of_abs": 50.141999640313,
"stddev": 9.33982798423495,
"variance": 87.2323867750984,
"coeff_var": 18.6267561150991,
"sum": 101394845.140658,
"min": 56.0488119125366,
"max": 156.323718261719,
"range": 100.274906349182,
"mean": 111.161659406447,
"mean_of_abs": 111.161659406447,
"stddev": 20.3000320258108,
"variance": 412.091300248942,
"coeff_var": 18.2617209334619,
"sum": 225102360.298055,
},
},
"test_standard_options_circular": {
Expand All @@ -174,15 +174,15 @@ class TestNeighbors(TestCase):
"n": 2025000,
"null_cells": 0,
"cells": 2025000,
"min": 56.4984405517578,
"max": 156.306396484375,
"range": 99.8079559326172,
"mean": 111.82648180616,
"mean_of_abs": 111.82648180616,
"stddev": 20.2829112001524,
"variance": 411.396486753269,
"coeff_var": 18.1378425508466,
"sum": 226448625.657474,
"min": 56.2710201263428,
"max": 156.299404907227,
"range": 100.028384780884,
"mean": 111.688409057181,
"mean_of_abs": 111.688409057181,
"stddev": 20.286687968792,
"variance": 411.54970874313,
"coeff_var": 18.1636466487815,
"sum": 226169028.340791,
},
"average": {
"n": 2025000,
Expand Down
99 changes: 73 additions & 26 deletions raster/r.quantile/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
#include <grass/raster.h>
#include <grass/glocale.h>

/* TODO: replace long with either size_t or a guaranteed 64 bit integer */
struct bin
{
size_t origin;
Expand All @@ -34,11 +33,11 @@ static DCELL *quants;
static int num_slots;
static size_t *slots;
static DCELL slot_size;
/* total should be a 64bit integer */
static unsigned long total;
static grass_int64 total;
static size_t num_values;
static unsigned short *slot_bins;
static int num_bins;
static int num_bins_alloc;
static int num_bins_used;
static struct bin *bins;
static DCELL *values;

Expand All @@ -53,12 +52,38 @@ static inline int get_slot(DCELL c)
return i;
}

/* get zero-based rank for quantile */
/* generic formula for one-based rank
* rank = quant * (N + 1 - 2C) + C
* with quant = quantile, N = number of values, C = constant
* common values for C:
* C = 0
* rank = quant * (N + 1)
* recommended by NIST (National Institute of Standards and Technology)
* https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm
* C = 0.5
* rank = quant * N + 0.5
* Matlab
* C = 1
* rank = quant * (N - 1) + 1
* numpy, R, MS Excel, ...
* Noted as an alternative by NIST */
static inline double get_quantile(int n)
{
if (n >= num_quants)
double rnk;

if (n >= num_quants) {
/* stop condition for initialize_bins() */
return (double)total + total;
}

return (double)total * quants[n];
rnk = quants[n] * (total - 1);
if (rnk < 0)
rnk = 0;
if (rnk > total - 1)
rnk = total - 1;

return rnk;
}

static void get_slot_counts(int infile)
Expand Down Expand Up @@ -99,18 +124,26 @@ static void initialize_bins(void)
int bin = 0;
size_t accum = 0;
int quant = 0;
int use_next_slot = 0;

G_message(_("Computing bins"));

num_values = 0;
next = get_quantile(quant);

/* for a given quantile, two bins might be needed
* if the index for this quantile is
* > accumulated count of current bin
* and
* < accumulated count of next bin */

for (slot = 0; slot < num_slots; slot++) {
size_t count = slots[slot];
size_t accum2 = accum + count;

if (accum2 > next ||
(slot == num_slots - 1 && accum2 == next)) {
if (count > 0 &&
(accum2 > next || use_next_slot) &&
bin < num_bins_alloc) {
struct bin *b = &bins[bin];

slot_bins[slot] = ++bin;
Expand All @@ -121,18 +154,24 @@ static void initialize_bins(void)
b->min = min + slot_size * slot;
b->max = min + slot_size * (slot + 1);

while (accum2 > next)
next = get_quantile(++quant);
use_next_slot = 0;

if (accum2 - next < 1) {
use_next_slot = 1;
}
else {
while (accum2 > next)
next = get_quantile(++quant);
}

num_values += count;
}

accum = accum2;
}

num_bins = bin;
num_bins_used = bin;

G_debug(1, "Number of bins: %d", num_bins);
G_debug(1, "Number of used bins: %d", num_bins_used);
G_debug(1, "Number of values: %lu", num_values);
}

Expand Down Expand Up @@ -188,7 +227,7 @@ static void sort_bins(void)

G_message(_("Sorting bins"));

for (bin = 0; bin < num_bins; bin++) {
for (bin = 0; bin < num_bins_used; bin++) {
struct bin *b = &bins[bin];

qsort(&values[b->base], b->count, sizeof(DCELL), compare_dcell);
Expand All @@ -209,22 +248,17 @@ static void compute_quantiles(int recode)
double k, v;
size_t i0, i1;

for (; bin < num_bins; bin++) {
for (; bin < num_bins_used; bin++) {
b = &bins[bin];
if (b->origin + b->count >= next)
break;
}

if (bin < num_bins) {
if (bin < num_bins_used) {
k = next - b->origin;
i0 = (size_t)floor(k);
i1 = (size_t)ceil(k);

if (i0 > b->count - 1)
i0 = b->count - 1;
if (i1 > b->count - 1)
i1 = b->count - 1;

v = (i0 == i1)
? values[b->base + i0]
: values[b->base + i0] * (i1 - k) +
Expand Down Expand Up @@ -258,6 +292,7 @@ int main(int argc, char *argv[])
int recode;
int infile;
struct FPRange range;
int num_slots_max;

G_gisinit(argv[0]);

Expand Down Expand Up @@ -301,7 +336,7 @@ int main(int argc, char *argv[])
flag.r = G_define_flag();
flag.r->key = 'r';
flag.r->description = _("Generate recode rules based on quantile-defined intervals");

if (G_parser(argc, argv))
exit(EXIT_FAILURE);

Expand Down Expand Up @@ -341,17 +376,29 @@ int main(int argc, char *argv[])
Rast_read_fp_range(opt.input->answer, "", &range);
Rast_get_fp_range_min_max(&range, &min, &max);

rows = Rast_window_rows();
cols = Rast_window_cols();

/* minimum 1000 values per slot to reduce memory consumption */
num_slots_max = ((size_t)rows * cols) / 1000;
if (num_slots_max < 1)
num_slots_max = 1;
if (num_slots > num_slots_max) {
G_message(_("Reducing number of bins from %d to %d"),
num_slots, num_slots_max);
num_slots = num_slots_max;
}

slots = G_calloc(num_slots, sizeof(size_t));
slot_bins = G_calloc(num_slots, sizeof(unsigned short));

slot_size = (max - min) / num_slots;

rows = Rast_window_rows();
cols = Rast_window_cols();

get_slot_counts(infile);

bins = G_calloc(num_quants, sizeof(struct bin));
/* sometimes two bins are needed to calculate a quantile */
num_bins_alloc = num_quants * 2;
bins = G_calloc(num_bins_alloc, sizeof(struct bin));
initialize_bins();
G_free(slots);

Expand Down
21 changes: 19 additions & 2 deletions raster/r.quantile/r.quantile.html
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,11 @@ <h2>DESCRIPTION</h2>
<em>r.quantile</em> computes quantiles in a manner suitable
for use with large amounts of data. It is using two passes.

<h2>NOTES</h2>

Quantiles are calculated following algorithm 7 from Hyndman and Fan (1996),
which is also the default in R and numpy.

<h2>EXAMPLE</h2>

Calculation of elevation quantiles (printed to standard-out):
Expand All @@ -20,6 +25,18 @@ <h2>EXAMPLE</h2>
out=elev_quant5 rules=-
</pre></div>

<h2>REFERENCES</h2>

<ul>
<li>Hyndman and Fan (1996) <i>Sample Quantiles in Statistical
Packages</i>, <b>American Statistician</b>. American Statistical
Association. 50 (4): 361-365. DOI: <a
href="https://doi.org/10.2307/2684934>10.2307/2684934">10.2307/2684934</a></li>
<li><a
href="https://www.itl.nist.gov/div898/handbook/prc/section2/prc262.htm"><i>Engineering
Statistics Handbook: Percentile</i></a>, NIST</li>
</ul>

<h2>SEE ALSO</h2>

<em>
Expand All @@ -35,10 +52,10 @@ <h2>SEE ALSO</h2>
<a href="v.rast.stats.html">v.rast.stats</a>
</em>


<h2>AUTHOR</h2>

Glynn Clements
Glynn Clements<br>
Markus Metz
<!--
<p>
<i>Last changed: $Date$</i>
Expand Down
Loading

0 comments on commit c700a8a

Please sign in to comment.