Improve shard efficiency of sampling for fractional stream repeats. #391

Merged
merged 26 commits into main from james/sparse-sampling
Sep 9, 2023
Changes from 13 commits
Commits
26 commits
e8d3bf9
Improve shard-efficiency of sampling fractional stream repeats.
knighton Aug 21, 2023
831ce45
Merge branch 'main' into james/sparse-sampling
knighton Aug 28, 2023
48cd9f0
Fix (np.expand_dims).
knighton Aug 28, 2023
436f0d3
Merge branch 'james/sparse-sampling' of github.com:mosaicml/streaming…
knighton Aug 28, 2023
071ae24
Merge branch 'main' into james/sparse-sampling
knighton Aug 29, 2023
c4e5623
Merge branch 'main' into james/sparse-sampling
knighton Aug 29, 2023
1a165a3
Merge branch 'main' into james/sparse-sampling
knighton Aug 29, 2023
94b3841
Merge branch 'main' into james/sparse-sampling
knighton Sep 5, 2023
f4e5ae5
Merge branch 'james/sparse-sampling' of github.com:mosaicml/streaming…
Sep 5, 2023
1f73e50
Redesign sampling for sparsity.
Sep 5, 2023
8ec04c6
Hook up args.
knighton Sep 5, 2023
7eae135
Rename get_shard_sampling -> get_sampling.
knighton Sep 6, 2023
23392bf
Fix bug, add some tests.
knighton Sep 6, 2023
3c6343b
Test sampling balance.
knighton Sep 6, 2023
a40302b
Merge branch 'main' into james/sparse-sampling
knighton Sep 7, 2023
57579b2
Merge branch 'main' into james/sparse-sampling
knighton Sep 8, 2023
d4d92cf
Misc.
knighton Sep 8, 2023
189b3f2
Fix statistics test.
knighton Sep 8, 2023
1b5c5ff
Elaborate on docstrings.
knighton Sep 8, 2023
b460e50
Rewrite algorithm to be more performant.
knighton Sep 8, 2023
c391bcf
Merge branch 'main' into james/sparse-sampling
knighton Sep 8, 2023
bcd5ecf
Merge branch 'main' into james/sparse-sampling
karan6181 Sep 8, 2023
9e4ea89
Pyright.
knighton Sep 8, 2023
7d27c4a
Merge branch 'james/sparse-sampling' of github.com:mosaicml/streaming…
knighton Sep 8, 2023
09562c3
fstring the ValueErrors.
knighton Sep 8, 2023
794aa3f
Merge branch 'main' into james/sparse-sampling
karan6181 Sep 9, 2023
8 changes: 4 additions & 4 deletions scripts/samples/bench_and_plot.py
@@ -341,11 +341,11 @@ def bench(args: Namespace, bench_name: str, desc: str, generate: Callable,
 plt.grid(which='minor', ls=':', c='#ddd', lw=0.5)
 ax = plt.gca()
 ax.xaxis.set_major_formatter(ScalarFormatter())
-ax.xaxis.get_major_formatter().set_scientific(False)
-ax.xaxis.get_major_formatter().set_useOffset(False)
+ax.xaxis.get_major_formatter().set_scientific(False) # pyright: ignore
+ax.xaxis.get_major_formatter().set_useOffset(False) # pyright: ignore
 ax.xaxis.set_minor_formatter(ScalarFormatter())
-ax.xaxis.get_minor_formatter().set_scientific(False)
-ax.xaxis.get_minor_formatter().set_useOffset(False)
+ax.xaxis.get_minor_formatter().set_scientific(False) # pyright: ignore
+ax.xaxis.get_minor_formatter().set_useOffset(False) # pyright: ignore
 ax.xaxis.set_tick_params(which='minor', pad=5)
 print(' Stats')
 for (format_name, writer_class, color), seq, rand in zip(format_infos, seqs, rands):
25 changes: 14 additions & 11 deletions streaming/base/dataset.py
@@ -28,6 +28,7 @@
 from streaming.base.distributed import maybe_init_dist
 from streaming.base.format import get_index_basename
 from streaming.base.partition import get_partitions
+from streaming.base.sampling import get_sampling
 from streaming.base.shared import (SharedArray, SharedBarrier, SharedMemory, SharedScalar,
                                    _get_path, get_shm_prefix)
 from streaming.base.shuffle import get_shuffle
@@ -203,6 +204,7 @@ class StreamingDataset(Array, IterableDataset):
 * Sampling:

     * ``sampling_method``
+    * ``sampling_granularity``


 Args:
@@ -265,6 +267,9 @@ class StreamingDataset(Array, IterableDataset):
 of this size, and samples within each block are shuffled. Defaults to ``1 << 18``.
 sampling_method (str): Which sampling method to use, either ``balanced`` or ``fixed``.
     Defaults to ``balanced``.
+sampling_granularity (int): When picking samples for a stream's final partial repeat,
+    how many samples to pick from the same shard at a time (``1`` for evenly balanced
+    across shards). Defaults to ``1``.
 """

 def __init__(self,
@@ -287,7 +292,8 @@ def __init__(self,
 shuffle_algo: str = 'py1s',
 shuffle_seed: int = 9176,
 shuffle_block_size: int = 1 << 18,
-sampling_method: str = 'balanced') -> None:
+sampling_method: str = 'balanced',
+sampling_granularity: int = 1) -> None:
 # Global arguments (which do not live in Streams).
 self.predownload = predownload
 self.cache_limit = cache_limit
@@ -299,6 +305,7 @@ def __init__(self,
 self.shuffle_seed = shuffle_seed
 self.shuffle_block_size = shuffle_block_size
 self.sampling_method = sampling_method.lower().strip()
+self.sampling_granularity = sampling_granularity

 # Check streams vs remote/local.
 if bool(streams) == (bool(remote) or bool(local)):
@@ -716,17 +723,13 @@ def _resample_streams(self, epoch: int) -> Tuple[NDArray[np.int64], NDArray[np.i

 # Calculate choose per stream shard.
 samples_per_stream_shard = self.samples_per_shard[stream_shard_ids]
-stream_samples = sum(samples_per_stream_shard)
-# the number of items to choose from each stream (calculated during dataset initialization)
+# the number of items to choose from each stream (calculated during dataset
+# initialization)
 stream_choose = self.streams[stream_id].choose
-if stream_choose == stream_samples:
-    choose_per_stream_shard = samples_per_stream_shard
-else:
-    choose_per_stream_shard = \
-        samples_per_stream_shard * stream_choose // stream_samples
-    shortfall = stream_choose - choose_per_stream_shard.sum()
-    indices = rng.choice(num_stream_shards, shortfall, False)
-    choose_per_stream_shard[indices] += 1
+use_epoch = self.sampling_method == 'balanced'
+choose_per_stream_shard = get_sampling(samples_per_stream_shard, stream_choose,
+                                       self.sampling_granularity, self.shuffle_seed,
+                                       epoch, use_epoch)

 # Iterate over each shard of this stream.
 for shard_id, shard_samples, shard_choose in zip(stream_shard_ids,
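For context, here is a minimal sketch of how the two constructor arguments added in this PR might be passed. It is illustrative only: the remote and local paths are placeholders, and the other keyword arguments shown are existing StreamingDataset parameters from this diff.

from streaming import StreamingDataset

# Hypothetical usage of the new arguments. With sampling_method='balanced', the
# epoch is folded into the seed, so a fresh sample set is drawn each epoch; with
# 'fixed', the same samples are reused across epochs. sampling_granularity
# controls how many samples are taken from one shard at a time when filling a
# stream's final partial repeat (1 balances evenly across shards).
dataset = StreamingDataset(remote='s3://my-bucket/my-dataset',  # placeholder path
                           local='/tmp/my-dataset',             # placeholder path
                           shuffle_seed=9176,
                           sampling_method='balanced',
                           sampling_granularity=1)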
75 changes: 75 additions & 0 deletions streaming/base/sampling.py
@@ -0,0 +1,75 @@
# Copyright 2023 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

"""Functionality relating to sampling."""

import numpy as np
from numpy.typing import NDArray


def get_sampling(samples_per_shard: NDArray[np.int64], choose: int, granularity: int, seed: int,
                 epoch: int, use_epoch: bool) -> NDArray[np.int64]:
    """Get how many samples to draw from each shard of the given stream.

    Args:
        samples_per_shard (NDArray[np.int64]): Array of underlying shard sizes.
        choose (int): How many samples to draw in total over all shards.
        granularity (int): How many samples to draw at a time from the same shard.
        seed (int): Seed for shuffling sampling granules.
        epoch (int): Which epoch we are sampling for.
        use_epoch (bool): Whether to factor epoch into the base seed, or use the same seed across
            epochs.

    Returns:
        NDArray[np.int64]: Array of ephemeral samples chosen per shard.
    """
    if choose < 0:
        raise ValueError('Choose must be a non-negative integer.')

    if granularity <= 0:
        raise ValueError('Granularity must be a positive integer.')

    if seed < 0:
        raise ValueError('Seed must be a non-negative integer.')

    if epoch < 0:
        raise ValueError('Epoch must be a non-negative integer.')

    # Handle whole integer repeat case.
    num_samples = sum(samples_per_shard)
    if not choose % num_samples:
        return samples_per_shard * choose // num_samples

    # Fractional repeat case.

    # Get the ordering by which we will exhaust the shards.
    pairs = []  # List of (shard ID, samples to draw).
    for shard_id, shard_samples in enumerate(samples_per_shard):
        num_granules = (shard_samples + granularity - 1) // granularity
        shard_ids = np.full(num_granules, shard_id)
        counts = np.full(num_granules, granularity)
        if shard_samples % granularity:
            counts[-1] = shard_samples % granularity
        pair = shard_ids, counts
        pairs.append(pair)
    shard_ids, counts = zip(*pairs)
    shard_ids = np.concatenate(shard_ids)
    counts = np.concatenate(counts)
    num_granules = len(shard_ids)
    epoch_seed = seed + epoch if use_epoch else seed
    rng = np.random.default_rng(epoch_seed)
    ordering = rng.permutation(num_granules)

    # Collect choose per shard.
    choose_per_shard = samples_per_shard * (choose // num_samples)
    choose %= num_samples
    for index in ordering:
        shard_id = shard_ids[index]
        count = counts[index]
        count = min(choose, int(count))
        choose_per_shard[shard_id] += count
        choose -= count
        if not choose:
            break

    return choose_per_shard
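To illustrate the behavior of get_sampling above, here is a small worked sketch (not part of the PR; the shard sizes and arguments are made up): three shards of 10 samples, asked for 75 samples total, yields two full repeats (20 per shard) plus a final partial repeat of 15 samples spread over granularity-sized granules chosen by the seeded permutation.

import numpy as np

from streaming.base.sampling import get_sampling

# Hypothetical example: 3 shards x 10 samples, choose 75 => 2.5 repeats. The two
# full repeats give 20 per shard; the remaining 15 samples are drawn from shards
# in granules of up to 2 samples, in a seed-determined shuffled order.
samples_per_shard = np.array([10, 10, 10], dtype=np.int64)
choose_per_shard = get_sampling(samples_per_shard, choose=75, granularity=2, seed=9176,
                                epoch=0, use_epoch=True)

print(choose_per_shard)  # exact split depends on the seed, but it always sums to 75
assert choose_per_shard.sum() == 75
assert (2 * samples_per_shard <= choose_per_shard).all()
assert (choose_per_shard <= 3 * samples_per_shard).all()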
40 changes: 40 additions & 0 deletions tests/test_sampling.py
@@ -0,0 +1,40 @@
# Copyright 2023 MosaicML Streaming authors
# SPDX-License-Identifier: Apache-2.0

import numpy as np

from streaming.base.sampling import get_sampling


def test_choose_per_shard_adds_up():
    for granularity in range(1, 100):
        for _ in range(10):
            samples_per_shard = np.random.choice(100, 10)
            samples = sum(samples_per_shard)
            choose = np.random.choice(samples)
            seed = np.random.choice(31337)
            epoch = np.random.choice(42)
            use_epoch = bool(np.random.choice(2))
            choose_per_shard = get_sampling(samples_per_shard, choose, granularity, seed, epoch,
                                            use_epoch)
            assert (0 <= choose_per_shard).all()
            assert (choose_per_shard <= samples_per_shard).all()
            assert sum(choose_per_shard) == choose


def test_is_deterministic():
    for granularity in range(1, 100):
        for _iter in range(3):
            samples_per_shard = np.random.choice(100, 10)
            samples = sum(samples_per_shard)
            choose = np.random.choice(samples)
            seed = np.random.choice(31337)
            epoch = np.random.choice(42)
            use_epoch = bool(np.random.choice(2))
            last = None
            for _repeat in range(2):
                choose_per_shard = get_sampling(samples_per_shard, choose, granularity, seed,
                                                epoch, use_epoch)
                if last is not None:
                    assert (last == choose_per_shard).all()
                last = choose_per_shard