Skip to content

Commit

Permalink
✨ feat: First draft for reservoir sampling.
Browse files Browse the repository at this point in the history
Fixes #18.
  • Loading branch information
make-github-pseudonymous-again committed Apr 8, 2021
1 parent dfb68b6 commit f6d707d
Show file tree
Hide file tree
Showing 6 changed files with 137 additions and 0 deletions.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"@aureooms/js-functools": "2.0.3",
"@aureooms/js-itertools": "5.1.0",
"@aureooms/js-memory": "4.0.0",
"@aureooms/js-red-black-tree": "^9.0.0",
"@aureooms/js-type": "1.0.4",
"@babel/core": "7.13.14",
"@babel/preset-env": "7.13.12",
Expand Down
14 changes: 14 additions & 0 deletions src/api/reservoir.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import _waterman from '../kernel/_waterman.js';
import randint from './randint.js';

/**
* Reservoir sampling.
*
* @function
* @param {number} k The size of the sample.
* @param {Iterable} iterable The input iterable.
* @param {Array} [output=new Array(k)] The output array.
* @return {Array} The output array.
*/
const reservoir = _waterman(randint);
export default reservoir;
2 changes: 2 additions & 0 deletions src/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ export {default as randfloat} from './api/randfloat.js';
export {default as randint} from './api/randint.js';
export {default as random} from './api/random.js';
export {default as randrange} from './api/randrange.js';
export {default as reservoir} from './api/reservoir.js';
export {default as sample} from './api/sample.js';
export {default as shuffle} from './api/shuffle.js';
export {default as shuffled} from './api/shuffled.js';
Expand All @@ -12,3 +13,4 @@ export {default as _fisheryates_inside_out} from './kernel/_fisheryates_inside_o
export {default as _randfloat} from './kernel/_randfloat.js';
export {default as _randint} from './kernel/_randint.js';
export {default as _shuffle} from './kernel/_shuffle.js';
export {default as _waterman} from './kernel/_waterman.js';
77 changes: 77 additions & 0 deletions src/kernel/_waterman.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
/**
* Construct a sampling function using Algorithm R due to Alan Waterman (both
* name and attribution are due to Knuth).
*
* @param {Function} randint The randint function.
* @return {Function} The sample function.
*/
const _waterman = (randint) => {
/**
* Samples k items uniformly at random from an iterable of unknown size.
*
* We want each item to have probability k/n of being selected.
*
* The algorithm works as follows:
* 1. We initialize a candidate sample with the first k items.
* 2. For each remaining item i, decide whether to insert it in the
* candidate sample with probability k/i, evicting an item from the
* candidate sample at random, or to discard it immediately (with
* probability 1-k/i),
*
* To prove that the obtained probability of inclusion for each item is correct
* we multiply two probabilities:
* 1. The probability of entering the candidate sample.
* 2. The probability of staying in the candidate sample until the end.
*
* For items 1 to k, probability 1. is 1, and probability 2. is
* (1-1/(k+1))(1-1/(k+2))...(1-1/n)
* = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n.
*
* For items i = k+1 to n, where probability 1. is k/i, and probability 2.
* is (1-1/(i+1))(1-1/(i+2))...(1-1/n)
* = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n.
*
* NOTE: Could also implement so that it yields after each input item.
* NOTE: One can reduce the expected number of random bits needed by
* avoiding generating any number above k-1:
* - First we branch on whether i < k.
* - Then we generate the random number between 0 and k-1 only if needed.
*
* To decide on the branch, flip a biased coin with parameter p = k/n.
* To do so, flip a fair coin until it differs from the binary
* representation of k/n (0.10110101...).
* The computation can be made efficient by realizing several things:
* - k is fixed and smaller than n (so divmod step can be skipped)
* - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n)
*
* This would reduce the number of necessary random bits from O(n log n) to
* expected O(n).
*
* @param {number} k The size of the sample.
* @param {Iterable} iterable The input iterable.
* @param {Array} [output=new Array(k)] The output array.
* @return {Array} The output array.
*/
const sample = (k, iterable, output = new Array(k)) => {
const it = iterable[Symbol.iterator]();

let n = 0;

for (; n < k; ++n) {
const {value, done} = it.next();
if (done) return output;
output[n] = value;
}

for (; ; ++n) {
const {value, done} = it.next();
if (done) return output;
const i = randint(0, n);
if (i < k) output[i] = value;
}
};

return sample;
};

export default _waterman;
38 changes: 38 additions & 0 deletions test/src/reservoir.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
import test from 'ava';
import {range} from '@aureooms/js-itertools';
import {increasing} from '@aureooms/js-compare';
import {RedBlackTree} from '@aureooms/js-red-black-tree';
import {reservoir, _waterman, randint} from '../../src/index.js';

const macro = (t, _, reservoir, k, n) => {
const sample = reservoir(k, range(n));
const source = RedBlackTree.from(increasing, range(n));
// We cannot use a Set as it would smoosh input duplicates

console.debug({sample});
t.is(sample.length, k);
for (const i of range(Math.min(k, n))) t.true(source.remove(sample[i]));
for (const i of range(n, k)) t.true(sample[i] === undefined);
};

macro.title = (title, algo, _, k, n) =>
title || `[${algo}] reservoir(${k}, range(${n}))`;

const algorithms = [
['Waterman', _waterman(randint)],
['API', reservoir],
];

const params = [
[0, 10],
[5, 10],
[10, 5],
[10, 10],
[50, 1000],
];

for (const [name, algorithm] of algorithms) {
for (const [k, input] of params) {
test(macro, name, algorithm, k, input);
}
}
5 changes: 5 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,11 @@
resolved "https://registry.yarnpkg.com/@aureooms/js-memory/-/js-memory-4.0.0.tgz#db87dc64b948f672d73b434ebde047b05869712c"
integrity sha1-24fcZLlI9nLXO0NOveBHsFhpcSw=

"@aureooms/js-red-black-tree@^9.0.0":
version "9.0.0"
resolved "https://registry.yarnpkg.com/@aureooms/js-red-black-tree/-/js-red-black-tree-9.0.0.tgz#ee006f24af42749546232b2d0baa13910c98f7b2"
integrity sha512-sUtY0HnwQnBUjrfwysKc6H4BJO4O2+NnrUHLqTYJyT1l1VSI+oXGffjjmMJTFpIl4L/4FEZAN0L3BiQxgR1T8g==

"@aureooms/js-type@1.0.4":
version "1.0.4"
resolved "https://registry.yarnpkg.com/@aureooms/js-type/-/js-type-1.0.4.tgz#7f9de5f5f8506ff9c8958731744b7427b62e92b7"
Expand Down

0 comments on commit f6d707d

Please sign in to comment.