From f6d707d6a763da80370095dcdc98b5a9e3a4f9fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Aur=C3=A9lien=20Ooms?= Date: Thu, 8 Apr 2021 22:46:18 +0200 Subject: [PATCH] :sparkles: feat: First draft for reservoir sampling. Fixes #18. --- package.json | 1 + src/api/reservoir.js | 14 ++++++++ src/index.js | 2 ++ src/kernel/_waterman.js | 77 +++++++++++++++++++++++++++++++++++++++++ test/src/reservoir.js | 38 ++++++++++++++++++++ yarn.lock | 5 +++ 6 files changed, 137 insertions(+) create mode 100644 src/api/reservoir.js create mode 100644 src/kernel/_waterman.js create mode 100644 test/src/reservoir.js diff --git a/package.json b/package.json index 9374769..2bbfd75 100644 --- a/package.json +++ b/package.json @@ -69,6 +69,7 @@ "@aureooms/js-functools": "2.0.3", "@aureooms/js-itertools": "5.1.0", "@aureooms/js-memory": "4.0.0", + "@aureooms/js-red-black-tree": "^9.0.0", "@aureooms/js-type": "1.0.4", "@babel/core": "7.13.14", "@babel/preset-env": "7.13.12", diff --git a/src/api/reservoir.js b/src/api/reservoir.js new file mode 100644 index 0000000..0ccf6a1 --- /dev/null +++ b/src/api/reservoir.js @@ -0,0 +1,14 @@ +import _waterman from '../kernel/_waterman.js'; +import randint from './randint.js'; + +/** + * Reservoir sampling. + * + * @function + * @param {number} k The size of the sample. + * @param {Iterable} iterable The input iterable. + * @param {Array} [output=new Array(k)] The output array. + * @return {Array} The output array. + */ +const reservoir = _waterman(randint); +export default reservoir; diff --git a/src/index.js b/src/index.js index 7365016..e81330d 100644 --- a/src/index.js +++ b/src/index.js @@ -3,6 +3,7 @@ export {default as randfloat} from './api/randfloat.js'; export {default as randint} from './api/randint.js'; export {default as random} from './api/random.js'; export {default as randrange} from './api/randrange.js'; +export {default as reservoir} from './api/reservoir.js'; export {default as sample} from './api/sample.js'; export {default as shuffle} from './api/shuffle.js'; export {default as shuffled} from './api/shuffled.js'; @@ -12,3 +13,4 @@ export {default as _fisheryates_inside_out} from './kernel/_fisheryates_inside_o export {default as _randfloat} from './kernel/_randfloat.js'; export {default as _randint} from './kernel/_randint.js'; export {default as _shuffle} from './kernel/_shuffle.js'; +export {default as _waterman} from './kernel/_waterman.js'; diff --git a/src/kernel/_waterman.js b/src/kernel/_waterman.js new file mode 100644 index 0000000..2269982 --- /dev/null +++ b/src/kernel/_waterman.js @@ -0,0 +1,77 @@ +/** + * Construct a sampling function using Algorithm R due to Alan Waterman (both + * name and attribution are due to Knuth). + * + * @param {Function} randint The randint function. + * @return {Function} The sample function. + */ +const _waterman = (randint) => { + /** + * Samples k items uniformly at random from an iterable of unknown size. + * + * We want each item to have probability k/n of being selected. + * + * The algorithm works as follows: + * 1. We initialize a candidate sample with the first k items. + * 2. For each remaining item i, decide whether to insert it in the + * candidate sample with probability k/i, evicting an item from the + * candidate sample at random, or to discard it immediately (with + * probability 1-k/i), + * + * To prove that the obtained probability of inclusion for each item is correct + * we multiply two probabilities: + * 1. The probability of entering the candidate sample. + * 2. The probability of staying in the candidate sample until the end. + * + * For items 1 to k, probability 1. is 1, and probability 2. is + * (1-1/(k+1))(1-1/(k+2))...(1-1/n) + * = (k/(k+1))((k+1)/(k+2))...((n-1)/n) which telescopes to k/n. + * + * For items i = k+1 to n, where probability 1. is k/i, and probability 2. + * is (1-1/(i+1))(1-1/(i+2))...(1-1/n) + * = (i/(i+1))((i+1)/(i+2))...((n-1)/n) which telescopes to i/n. + * + * NOTE: Could also implement so that it yields after each input item. + * NOTE: One can reduce the expected number of random bits needed by + * avoiding generating any number above k-1: + * - First we branch on whether i < k. + * - Then we generate the random number between 0 and k-1 only if needed. + * + * To decide on the branch, flip a biased coin with parameter p = k/n. + * To do so, flip a fair coin until it differs from the binary + * representation of k/n (0.10110101...). + * The computation can be made efficient by realizing several things: + * - k is fixed and smaller than n (so divmod step can be skipped) + * - k/(n+1) < k/n (so we can avoid recomputing if the biased flip > k/n) + * + * This would reduce the number of necessary random bits from O(n log n) to + * expected O(n). + * + * @param {number} k The size of the sample. + * @param {Iterable} iterable The input iterable. + * @param {Array} [output=new Array(k)] The output array. + * @return {Array} The output array. + */ + const sample = (k, iterable, output = new Array(k)) => { + const it = iterable[Symbol.iterator](); + + let n = 0; + + for (; n < k; ++n) { + const {value, done} = it.next(); + if (done) return output; + output[n] = value; + } + + for (; ; ++n) { + const {value, done} = it.next(); + if (done) return output; + const i = randint(0, n); + if (i < k) output[i] = value; + } + }; + + return sample; +}; + +export default _waterman; diff --git a/test/src/reservoir.js b/test/src/reservoir.js new file mode 100644 index 0000000..0ac62cc --- /dev/null +++ b/test/src/reservoir.js @@ -0,0 +1,38 @@ +import test from 'ava'; +import {range} from '@aureooms/js-itertools'; +import {increasing} from '@aureooms/js-compare'; +import {RedBlackTree} from '@aureooms/js-red-black-tree'; +import {reservoir, _waterman, randint} from '../../src/index.js'; + +const macro = (t, _, reservoir, k, n) => { + const sample = reservoir(k, range(n)); + const source = RedBlackTree.from(increasing, range(n)); + // We cannot use a Set as it would smoosh input duplicates + + console.debug({sample}); + t.is(sample.length, k); + for (const i of range(Math.min(k, n))) t.true(source.remove(sample[i])); + for (const i of range(n, k)) t.true(sample[i] === undefined); +}; + +macro.title = (title, algo, _, k, n) => + title || `[${algo}] reservoir(${k}, range(${n}))`; + +const algorithms = [ + ['Waterman', _waterman(randint)], + ['API', reservoir], +]; + +const params = [ + [0, 10], + [5, 10], + [10, 5], + [10, 10], + [50, 1000], +]; + +for (const [name, algorithm] of algorithms) { + for (const [k, input] of params) { + test(macro, name, algorithm, k, input); + } +} diff --git a/yarn.lock b/yarn.lock index 03e9091..6132585 100644 --- a/yarn.lock +++ b/yarn.lock @@ -42,6 +42,11 @@ resolved "https://registry.yarnpkg.com/@aureooms/js-memory/-/js-memory-4.0.0.tgz#db87dc64b948f672d73b434ebde047b05869712c" integrity sha1-24fcZLlI9nLXO0NOveBHsFhpcSw= +"@aureooms/js-red-black-tree@^9.0.0": + version "9.0.0" + resolved "https://registry.yarnpkg.com/@aureooms/js-red-black-tree/-/js-red-black-tree-9.0.0.tgz#ee006f24af42749546232b2d0baa13910c98f7b2" + integrity sha512-sUtY0HnwQnBUjrfwysKc6H4BJO4O2+NnrUHLqTYJyT1l1VSI+oXGffjjmMJTFpIl4L/4FEZAN0L3BiQxgR1T8g== + "@aureooms/js-type@1.0.4": version "1.0.4" resolved "https://registry.yarnpkg.com/@aureooms/js-type/-/js-type-1.0.4.tgz#7f9de5f5f8506ff9c8958731744b7427b62e92b7"