Skip to content

Commit

Permalink
WIP: extend LLVM tar
Browse files Browse the repository at this point in the history
  • Loading branch information
ChrisCummins committed Mar 24, 2021
1 parent b430c6e commit 05c1804
Show file tree
Hide file tree
Showing 10 changed files with 292 additions and 222 deletions.
9 changes: 7 additions & 2 deletions compiler_gym/bin/service.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,13 +107,18 @@ def summarize_datasets(datasets: Iterable[Dataset]) -> str:
dataset.name,
truncate(dataset.description, max_line_len=60),
dataset.n,
dataset.site_data_size_in_bytes,
dataset.site_data_size_in_bytes if dataset.installed else 0,
)
)
rows.append(("Total", "", sum(r[2] for r in rows), sum(r[3] for r in rows)))
return tabulate(
[
(n, l, humanize.intcomma(f) if f else "∞", humanize.naturalsize(s))
(
n,
l,
humanize.intcomma(f) if f else "∞",
humanize.naturalsize(s) if s else "-",
)
for n, l, f, s in rows
],
headers=("Dataset", "Description", "#. Benchmarks", "Size on disk"),
Expand Down
10 changes: 10 additions & 0 deletions compiler_gym/datasets/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def __init__(
site_data_base: Path,
long_description_url: Optional[str] = None,
random: Optional[np.random.Generator] = None,
hidden: bool = False,
):
self._name = name
components = DATASET_NAME_RE.match(name)
Expand All @@ -53,6 +54,7 @@ def __init__(
self._protocol = components.group("dataset_protocol")
self._version = int(components.group("dataset_version"))
self._long_description_url = long_description_url
self._hidden = hidden

self.random = random or np.random.default_rng()
self.logger = logging.getLogger("compiler_gym.datasets")
Expand Down Expand Up @@ -92,6 +94,10 @@ def version(self) -> int:
def long_description_url(self) -> str:
return self._long_description_url

@property
def hidden(self) -> bool:
    """Whether this dataset is excluded from user-facing listings.

    Hidden datasets are still resolvable by name (lookup goes through the
    full dataset table) but are omitted from default enumeration.
    The value is fixed at construction time via the ``hidden`` argument.

    Bug fix: the return annotation was ``-> str`` (copy-paste from the
    ``long_description_url`` property) although the getter returns the
    boolean ``self._hidden``; corrected to ``-> bool``.
    """
    return self._hidden

@property
def site_data_path(self) -> Path:
return self._site_data_path
Expand All @@ -114,6 +120,10 @@ def n(self) -> int:
"""
return 0

@property
def installed(self) -> bool:
    """Whether the dataset's data is available locally.

    Base implementation always reports ``True``, i.e. a dataset is assumed
    to need no installation step unless a subclass says otherwise.
    Subclasses with a download/extract step override this (e.g. the
    tar-based datasets check for an extraction marker file on disk).
    """
    return True

def install(self) -> None:
"""
Implementing this method is optional.
Expand Down
31 changes: 19 additions & 12 deletions compiler_gym/datasets/datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def __init__(
random: Optional[np.random.Generator] = None,
):
# A look-up table mapping dataset names to Dataset instances.
self._datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
self._all_datasets: Dict[str, Dataset] = {d.name: d for d in datasets}
self._visible_datasets: Dict[str, Dataset] = {
k: v for k, v in self._all_datasets.items() if not v.hidden
}
self._site_data_path = Path(site_data_path)
self._site_data_path.mkdir(exist_ok=True, parents=True)
self.random = random or np.random.default_rng()
Expand All @@ -38,7 +41,7 @@ def datasets(self) -> Iterable[Dataset]:
"""
Iteration order is consistent between runs.
"""
yield from sorted(self._datasets.values(), key=lambda d: d.name)
yield from sorted(self._visible_datasets.values(), key=lambda d: d.name)

def __getitem__(self, name: str) -> Dataset:
return self.dataset(name)
Expand All @@ -49,20 +52,24 @@ def __iter__(self) -> Iterable[Dataset]:
def dataset(self, name: str) -> Dataset:
if "://" not in name:
name = f"benchmark://{name}"
if name not in self._datasets:
if name not in self._all_datasets:
raise LookupError(f"Dataset not found: '{name}'")
return self._datasets[name]
return self._all_datasets[name]

def add(self, dataset: Dataset) -> None:
if dataset.name in self._datasets:
if dataset.name in self._all_datasets:
warnings.warn(f"Overwriting existing dataset '{dataset.name}'")
self._datasets[dataset.name] = dataset
self._all_datasets[dataset.name] = dataset
if not dataset.hidden:
self._visible_datasets[dataset.name] = dataset

def remove(self, dataset: Union[str, Dataset]) -> bool:
dataset_name: str = dataset.name if isinstance(dataset, Dataset) else dataset

if dataset_name in self._datasets:
del self._datasets[dataset_name]
if dataset_name in self._all_datasets:
del self._all_datasets[dataset_name]
if dataset_name in self._visible_datasets:
del self._visible_datasets[dataset_name]
return True

return False
Expand All @@ -76,11 +83,11 @@ def benchmark_uris(self) -> Iterable[str]:
yield from dataset.benchmark_uris()

def benchmark(self, uri: Optional[str] = None) -> Benchmark:
if not self._datasets:
if not self._all_datasets:
raise ValueError("No datasets available")

if uri is None:
dataset = self.random.choice(list(self._datasets.values()))
dataset = self.random.choice(list(self._visible_datasets.values()))
return dataset.benchmark()

# Prepend the default benchmark:// protocol on URIs
Expand All @@ -92,7 +99,7 @@ def benchmark(self, uri: Optional[str] = None) -> Benchmark:
raise ValueError(f"Invalid benchmark URI: '{uri}'")

dataset_name = match.group("dataset")
if dataset_name not in self._datasets:
if dataset_name not in self._all_datasets:
raise LookupError(f"Dataset not found: '{dataset_name}'")

return self._datasets[dataset_name].benchmark(uri)
return self._all_datasets[dataset_name].benchmark(uri)
37 changes: 22 additions & 15 deletions compiler_gym/datasets/tar_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from compiler_gym.util.download import download


class FilesystemDirectoryDataset(Dataset):
class BenchmarkFilesDataset(Dataset):
def __init__(
self,
dataset_root: Path,
Expand Down Expand Up @@ -100,7 +100,7 @@ def _get_benchmark_by_index(self, n: int) -> Benchmark:

class TarDataset(Dataset):

# TODO: Subclass FilesystemDirectoryDataset
# TODO: Subclass BenchmarkFilesDataset

def __init__(
self,
Expand All @@ -116,16 +116,24 @@ def __init__(
self.benchmark_file_suffix = benchmark_file_suffix
self.strip_prefix = strip_prefix

self._installed = False
self._tar_extracted_marker = self.site_data_path / ".extracted"
self._tar_lock = Lock()
self._tar_lockfile = self.site_data_path / "LOCK"
self._tar_data = self.site_data_path / "contents" / self.strip_prefix

@property
def installed(self) -> bool:
    """Whether the tar archive has been extracted into the site data dir.

    A positive answer is cached on the instance, so after the first
    ``True`` result repeated queries never touch the filesystem; until
    then each query re-checks for the extraction marker file.
    """
    # Guard clause: once we have seen the marker, trust the cached flag.
    if self._installed:
        return True
    self._installed = self._tar_extracted_marker.is_file()
    return self._installed

def install(self) -> None:
"""
Implementing this method is optional.
"""
if self._tar_extracted_marker.is_file():
if self.installed:
return

self.logger.info("Downloading %s dataset", self.name)
Expand Down Expand Up @@ -227,9 +235,13 @@ def __init__(self, manifest_url: str, manifest_sha256: str, **dataet_args):
self.manifest_sha256 = manifest_sha256
self._manifest_path = self.site_data_path / "manifest.txt"

def _install_manifest(self) -> List[str]:
@memoized_property
def _benchmark_uris(self) -> List[str]:
if self._manifest_path.is_file():
return
with open(self._manifest_path) as f:
uris = f.readlines()
self.logger.debug("Read %s manifest, %d entries", self.name, len(uris))
return uris
with self._tar_lock:
with fasteners.InterProcessLock(self._tar_lockfile):
self.logger.debug("Downloading %s manifest", self.name)
Expand All @@ -240,16 +252,11 @@ def _install_manifest(self) -> List[str]:
manifest = gzipf.read().decode("utf-8").strip()
with open(self._manifest_path, "w") as f:
f.write(manifest)
return manifest.split("\n")

@memoized_property
def _benchmark_uris(self) -> List[str]:
uris = self._install_manifest()
if not uris:
with open(self._manifest_path) as f:
uris = f.readlines()
self.logger.debug("Read %s manifest, %d entries", self.name, len(uris))
return uris
uris = manifest.split("\n")
self.logger.debug(
"Downloaded %s manifest, %d entries", self.name, len(uris)
)
return uris

@memoized_property
def n(self) -> int:
Expand Down
7 changes: 5 additions & 2 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@

from compiler_gym.datasets import Dataset, TarDatasetWithManifest
from compiler_gym.envs.llvm.datasets.cBench import CBenchDataset, CBenchLegacyDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset


class BlasDataset(TarDatasetWithManifest):
Expand All @@ -33,7 +34,7 @@ def __init__(self, site_data_base: Path):
tar_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0.tar.bz2",
tar_sha256="880269dd7a5c2508ea222a2e54c318c38c8090eb105c0a87c595e9dd31720764",
manifest_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-github-v0-manifest.gz",
manifest_sha256="d7b42ef68c9b452233baa13303d5140d6b9bf15da2ba9d4e7b0ef73524611a42",
manifest_sha256="6d0ed47f8c70868db62ae9e3d2f33dbad9fda5ef1cfe99a9855eef4618ddef1b",
license="CC BY 4.0",
long_description_url="https://github.com/ctuning/ctuning-programs",
strip_prefix="github-v0",
Expand All @@ -50,7 +51,7 @@ def __init__(self, site_data_base: Path):
tar_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0.tar.bz2",
tar_sha256="a1ae5c376af30ab042c9e54dc432f89ce75f9ebaee953bc19c08aff070f12566",
manifest_url="https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-linux-v0-manifest.gz",
manifest_sha256="dbdf82046cb5779fc48f47c4899deea51d396daddee1dd448ccf411010788b96",
manifest_sha256="6b45716ca142950e42958634366626d06f02e73d37ddce225b3ef55468011aa8",
long_description_url="https://github.com/spcl/ncc/tree/master/data",
license="GPL-2.0",
strip_prefix="linux-v0",
Expand Down Expand Up @@ -156,6 +157,7 @@ def get_llvm_datasets(site_data_base: Path) -> Iterable[Dataset]:
yield OpenCVDataset(site_data_base=site_data_base)
yield POJ104Dataset(site_data_base=site_data_base)
yield TensorflowDataset(site_data_base=site_data_base)
yield LlvmStressDataset(site_data_base=site_data_base)


__all__ = [
Expand All @@ -169,4 +171,5 @@ def get_llvm_datasets(site_data_base: Path) -> Iterable[Dataset]:
"OpenCVDataset",
"POJ104Dataset",
"TensorflowDataset",
"LlvmStressDataset",
]
7 changes: 6 additions & 1 deletion compiler_gym/envs/llvm/datasets/cBench.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from deprecated.sphinx import deprecated

from compiler_gym.datasets import TarDatasetWithManifest
from compiler_gym.datasets import Benchmark, TarDatasetWithManifest

_CBENCH_TARS = {
"darwin": (
Expand All @@ -21,6 +21,10 @@
}


class CBenchBenchmark(Benchmark):
    """Benchmark subclass for cBench programs.

    Currently adds no behavior over ``Benchmark``; it appears to be a
    placeholder extension point for cBench-specific benchmark handling
    introduced in this work-in-progress change — TODO confirm intent.
    """

    pass


class CBenchDataset(TarDatasetWithManifest):
def __init__(self, site_data_base: Path):
platform = {"darwin": "macos"}.get(sys.platform, sys.platform)
Expand Down Expand Up @@ -70,4 +74,5 @@ def __init__(self, site_data_base: Path):
strip_prefix="cBench-v0",
benchmark_file_suffix=".bc",
site_data_base=site_data_base,
hidden=True,
)
9 changes: 3 additions & 6 deletions compiler_gym/envs/llvm/datasets/llvm_stress.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,7 @@
from typing import Iterable, Optional

from compiler_gym.datasets import Benchmark, Dataset
from compiler_gym.util.runfiles_path import runfiles_path

LLVM_AS = runfiles_path("compiler_gym/third_party/llvm/bin/llvm-as")
LLVM_STRESS = runfiles_path("compiler_gym/third_party/llvm/bin/llvm-stress")
from compiler_gym.third_party import llvm

# The maximum value for the --seed argument to llvm-stress.
UINT_MAX = (2 << 32) - 1
Expand Down Expand Up @@ -38,12 +35,12 @@ def benchmark(self, uri: Optional[str] = None):
# Run llvm-stress with the given seed and pipe the output to llvm-as to
# assemble a bitcode.
llvm_stress = subprocess.Popen(
[str(LLVM_STRESS), f"--seed={seed}"],
[str(llvm.llvm_stress_path()), f"--seed={seed}"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
)
llvm_as = subprocess.Popen(
[str(LLVM_AS), "-"],
[str(llvm.llvm_as_path()), "-"],
stdin=llvm_stress.stdout,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
Expand Down
4 changes: 0 additions & 4 deletions compiler_gym/envs/llvm/service/BenchmarkFactory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,6 @@ using grpc::StatusCode;

namespace compiler_gym::llvm_service {

static const std::string kExpectedExtension = ".bc";

static const fs::path kSiteBenchmarksDir = util::getSiteDataPath("llvm/10.0.0/bitcode_benchmarks");

BenchmarkFactory::BenchmarkFactory(const boost::filesystem::path& workingDirectory,
std::optional<std::mt19937_64> rand,
size_t maxLoadedBenchmarkSize)
Expand Down
Loading

0 comments on commit 05c1804

Please sign in to comment.