[datasets] Add a Datasets class for managing datasets.
Issue #45.
ChrisCummins committed Apr 23, 2021
1 parent 32a1b4a commit 992c0e0
Showing 7 changed files with 655 additions and 51 deletions.
1 change: 1 addition & 0 deletions compiler_gym/datasets/BUILD
@@ -10,6 +10,7 @@ py_library(
        "__init__.py",
        "benchmark.py",
        "dataset.py",
        "datasets.py",
        "files_dataset.py",
        "tar_dataset.py",
    ],
2 changes: 2 additions & 0 deletions compiler_gym/datasets/__init__.py
@@ -17,6 +17,7 @@
    delete,
    require,
)
from compiler_gym.datasets.datasets import Datasets
from compiler_gym.datasets.files_dataset import FilesDataset
from compiler_gym.datasets.tar_dataset import TarDataset, TarDatasetWithManifest

@@ -27,6 +28,7 @@
    "BenchmarkSource",
    "Dataset",
    "DatasetInitError",
    "Datasets",
    "deactivate",
    "delete",
    "FilesDataset",
314 changes: 314 additions & 0 deletions compiler_gym/datasets/datasets.py
@@ -0,0 +1,314 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from collections import deque
from typing import Dict, Iterable, Optional, Set, TypeVar, Union

import numpy as np

from compiler_gym.datasets.benchmark import (
    BENCHMARK_URI_RE,
    Benchmark,
    resolve_uri_protocol,
)
from compiler_gym.datasets.dataset import Dataset

T = TypeVar("T")


def round_robin_iterables(iters: Iterable[Iterable[T]]) -> Iterable[T]:
    """Yield from the given iterators in round robin order."""
    # Use a queue of iterators to iterate over. Repeatedly pop an iterator from
    # the queue, yield the next value from it, then put it at the back of the
    # queue. The iterator is discarded once exhausted.
    iters = deque(iters)
    while len(iters) > 1:
        it = iters.popleft()
        try:
            yield next(it)
            iters.append(it)
        except StopIteration:
            pass
    # Once we have only a single iterator left, return it directly rather than
    # continuing with the round robin.
    if len(iters) == 1:
        yield from iters.popleft()
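# An illustrative sketch of the round robin ordering produced above, assuming
# the inputs are iterators (as the generators passed by Datasets.benchmarks()
# are):
#
#     >>> list(round_robin_iterables([iter("ab"), iter("c"), iter("def")]))
#     ['a', 'c', 'd', 'b', 'e', 'f']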


class Datasets(object):
    """A collection of datasets.

    This class provides a dictionary-like interface for indexing and iterating
    over multiple :class:`Dataset <compiler_gym.datasets.Dataset>` objects.
    Select a dataset by URI using:

        >>> env.datasets["benchmark://cbench-v1"]

    Check whether a dataset exists using:

        >>> "benchmark://cbench-v1" in env.datasets
        True

    Or iterate over the datasets using:

        >>> for dataset in env.datasets:
        ...     print(dataset.name)
        benchmark://cbench-v1
        benchmark://github-v0
        benchmark://npb-v0

    To select a benchmark from the datasets, use :meth:`benchmark()`:

        >>> env.datasets.benchmark("benchmark://a-v0/a")

    Use the :meth:`benchmarks()` method to iterate over every benchmark in the
    datasets in a stable round robin order:

        >>> for benchmark in env.datasets.benchmarks():
        ...     print(benchmark)
        benchmark://cbench-v1/1
        benchmark://github-v0/1
        benchmark://npb-v0/1
        benchmark://cbench-v1/2
        ...

    If you want to exclude a dataset, delete it:

        >>> del env.datasets["benchmark://b-v0"]

    To iterate over the benchmarks in a random order, use :meth:`benchmark()`
    and omit the URI:

        >>> for i in range(100):
        ...     benchmark = env.datasets.benchmark()

    This uses uniform random selection to sample across datasets. For finite
    datasets, you could weight the sample by the size of each dataset (the
    probabilities must be normalized to sum to one):

        >>> weights = [len(d) for d in env.datasets]
        >>> weights = np.array(weights) / sum(weights)
        >>> np.random.choice(list(env.datasets), p=weights).benchmark()
    """

    def __init__(
        self,
        datasets: Iterable[Dataset],
        random: Optional[np.random.Generator] = None,
    ):
        self._datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
        self._visible_datasets: Set[str] = set(
            name for name, dataset in self._datasets.items() if not dataset.hidden
        )
        self.random = random or np.random.default_rng()

    def seed(self, seed: Optional[int] = None) -> None:
        """Set the random state.

        Setting a random state will fix the order that
        :meth:`datasets.benchmark() <compiler_gym.datasets.Datasets.benchmark>`
        returns benchmarks when called without arguments.

        Calling this method recursively calls :meth:`seed()
        <compiler_gym.datasets.Dataset.seed>` on all member datasets.

        :param seed: An optional seed value.
        """
        self.random = np.random.default_rng(seed)
        for dataset in self._datasets.values():
            dataset.seed(seed)
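    # A hypothetical sketch of pinning the random benchmark order, assuming an
    # `env` with this Datasets collection attached (as in the class docstring):
    #
    #     >>> env.datasets.seed(0xCC)
    #     >>> env.datasets.benchmark()  # Deterministic for a fixed seed and datasets.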

    def datasets(self, with_deprecated: bool = False) -> Iterable[Dataset]:
        """Enumerate the datasets.

        Dataset order is consistent across runs.

        :param with_deprecated: If :code:`True`, include datasets that have
            been marked as deprecated.

        :return: An iterable sequence of :class:`Dataset
            <compiler_gym.datasets.Dataset>` instances.
        """
        datasets = self._datasets.values()
        if not with_deprecated:
            datasets = (d for d in datasets if not d.hidden)
        yield from sorted(datasets, key=lambda d: (d.sort_order, d.name))

    def __iter__(self) -> Iterable[Dataset]:
        """Iterate over the datasets.

        Dataset order is consistent across runs.

        Equivalent to :meth:`datasets.datasets()
        <compiler_gym.datasets.Datasets.datasets>`, but without the ability to
        iterate over the deprecated datasets.

        :return: An iterable sequence of :class:`Dataset
            <compiler_gym.datasets.Dataset>` instances.
        """
        return self.datasets()

    def dataset(self, dataset: Optional[Union[str, Dataset]] = None) -> Dataset:
        """Get a dataset.

        If a name is given, return the corresponding :class:`Dataset
        <compiler_gym.datasets.Dataset>`. Else, return a dataset selected
        uniformly at random from the set of available datasets.

        Use :meth:`seed() <compiler_gym.datasets.Datasets.seed>` to force a
        reproducible order for randomly selected datasets.

        Name lookup will succeed whether or not the dataset is deprecated.

        :param dataset: A dataset name, a :class:`Dataset` instance, or
            :code:`None` to select a dataset randomly.

        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.

        :raises LookupError: If :code:`dataset` is not found.
        """
        if dataset is None:
            if not self._visible_datasets:
                raise ValueError("No datasets")

            return self._datasets[self.random.choice(list(self._visible_datasets))]

        if isinstance(dataset, Dataset):
            dataset_name = dataset.name
        else:
            dataset_name = resolve_uri_protocol(dataset)

        if dataset_name not in self._datasets:
            raise LookupError(f"Dataset not found: {dataset_name}")

        return self._datasets[dataset_name]
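    # A usage sketch, assuming the cbench-v1 dataset from the class docstring
    # is registered. Name lookup resolves the URI protocol, so both named forms
    # return the same Dataset; omitting the argument samples one at random:
    #
    #     >>> env.datasets.dataset("benchmark://cbench-v1")
    #     >>> env.datasets.dataset("cbench-v1")
    #     >>> env.datasets.dataset()  # Uniform random over visible datasets.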

    def __getitem__(self, dataset: Union[str, Dataset]) -> Dataset:
        """Lookup a dataset.

        :param dataset: A dataset name or a :class:`Dataset` instance.

        :return: A :class:`Dataset <compiler_gym.datasets.Dataset>` instance.

        :raises LookupError: If :code:`dataset` is not found.
        """
        return self.dataset(dataset)

    def __setitem__(self, key: str, dataset: Dataset):
        """Add a dataset to the collection.

        :param key: The name of the dataset.

        :param dataset: The dataset to add.
        """
        self._datasets[key] = dataset
        if not dataset.hidden:
            self._visible_datasets.add(dataset.name)

    def __delitem__(self, dataset: Union[str, Dataset]):
        """Remove a dataset from the collection.

        This does not affect any underlying storage used by the dataset. See
        :meth:`uninstall() <compiler_gym.datasets.Datasets.uninstall>` to clean
        up.

        :param dataset: A :class:`Dataset <compiler_gym.datasets.Dataset>`
            instance, or the name of a dataset.

        :raises LookupError: If :code:`dataset` is not found.
        """
        dataset_name: str = self.dataset(dataset).name
        if dataset_name in self._visible_datasets:
            self._visible_datasets.remove(dataset_name)
        del self._datasets[dataset_name]

    def __contains__(self, dataset: Union[str, Dataset]) -> bool:
        """Returns whether the dataset is contained."""
        try:
            self.dataset(dataset)
            return True
        except LookupError:
            return False
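    # An illustrative sketch of membership tests and removal, assuming the
    # datasets from the class docstring. Deleting a dataset only unregisters it
    # from this collection; its files remain on disk:
    #
    #     >>> "benchmark://cbench-v1" in env.datasets
    #     True
    #     >>> del env.datasets["benchmark://cbench-v1"]
    #     >>> "benchmark://cbench-v1" in env.datasets
    #     False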

    def benchmarks(self, with_deprecated: bool = False) -> Iterable[Benchmark]:
        """Enumerate the (possibly infinite) benchmarks lazily.

        Benchmark order is consistent across runs. One benchmark from each
        dataset is returned in round robin order until all datasets have been
        fully enumerated. The order of :meth:`benchmarks()
        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.

        :param with_deprecated: If :code:`True`, include benchmarks from
            datasets that have been marked deprecated.

        :return: An iterable sequence of :class:`Benchmark
            <compiler_gym.datasets.Benchmark>` instances.
        """
        return round_robin_iterables(
            (d.benchmarks() for d in self.datasets(with_deprecated=with_deprecated))
        )
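    # A minimal sketch of consuming the (possibly infinite) benchmark stream
    # lazily, taking just the first few benchmarks:
    #
    #     >>> from itertools import islice
    #     >>> for benchmark in islice(env.datasets.benchmarks(), 5):
    #     ...     print(benchmark)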

    def benchmark_uris(self, with_deprecated: bool = False) -> Iterable[str]:
        """Enumerate the (possibly infinite) benchmark URIs.

        Benchmark URI order is consistent across runs. URIs from datasets are
        returned in round robin order. The order of :meth:`benchmarks()
        <compiler_gym.datasets.Datasets.benchmarks>` and :meth:`benchmark_uris()
        <compiler_gym.datasets.Datasets.benchmark_uris>` is the same.

        :param with_deprecated: If :code:`True`, include benchmarks from
            datasets that have been marked deprecated.

        :return: An iterable sequence of benchmark URI strings.
        """
        return round_robin_iterables(
            (d.benchmark_uris() for d in self.datasets(with_deprecated=with_deprecated))
        )

    def benchmark(self, uri: Optional[str] = None) -> Benchmark:
        """Select a benchmark.

        If a benchmark URI is given, the corresponding :class:`Benchmark
        <compiler_gym.datasets.Benchmark>` is returned, regardless of whether
        the containing dataset is installed or deprecated.

        If no URI is given, a benchmark is selected randomly. First, a dataset
        is selected uniformly at random from the set of available datasets.
        Then a benchmark is selected randomly from the chosen dataset.

        Calling :code:`benchmark()` will yield benchmarks from all available
        datasets with equal probability, regardless of how many benchmarks are
        in each dataset. Given a pool of available datasets of differing sizes,
        smaller datasets will be overrepresented and large datasets will be
        underrepresented.

        Use :meth:`seed() <compiler_gym.datasets.Datasets.seed>` to force a
        reproducible order for randomly selected benchmarks.

        :param uri: The URI of the benchmark to return. If :code:`None`, select
            a benchmark randomly using :code:`self.random`.

        :return: A :class:`Benchmark <compiler_gym.datasets.Benchmark>`
            instance.
        """
        if uri is None and not self._visible_datasets:
            raise ValueError("No datasets")
        elif uri is None:
            return self.dataset().benchmark()

        uri = resolve_uri_protocol(uri)

        match = BENCHMARK_URI_RE.match(uri)
        if not match:
            raise ValueError(f"Invalid benchmark URI: '{uri}'")

        dataset_name = match.group("dataset")
        dataset = self._datasets[dataset_name]

        if len(uri) > len(dataset_name) + 1:
            return dataset.benchmark(uri)
        else:
            return dataset.benchmark()
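    # A usage sketch with illustrative URIs: a dataset-only URI selects a
    # random benchmark from that dataset, a full URI selects that exact
    # benchmark, and no URI samples a dataset first, then a benchmark from it:
    #
    #     >>> env.datasets.benchmark("benchmark://cbench-v1")        # Random from cbench-v1.
    #     >>> env.datasets.benchmark("benchmark://cbench-v1/qsort")  # That exact benchmark.
    #     >>> env.datasets.benchmark()                               # Random dataset, random benchmark.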

    @property
    def size(self) -> int:
        """The number of visible datasets in the collection."""
        return len(self._visible_datasets)

    def __len__(self) -> int:
        """The number of visible datasets in the collection."""
        return self.size
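    # Illustrative: only visible (non-hidden) datasets are counted, so len()
    # can be smaller than the number of datasets registered:
    #
    #     >>> len(env.datasets)
    #     3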
10 changes: 10 additions & 0 deletions tests/datasets/BUILD
@@ -26,6 +26,16 @@ py_test(
    ],
)

py_test(
    name = "datasets_test",
    srcs = ["datasets_test.py"],
    deps = [
        "//compiler_gym/datasets",
        "//tests:test_main",
        "//tests/pytest_plugins:common",
    ],
)

py_test(
    name = "files_dataset_test",
    timeout = "short",