facebookresearch · ChrisCummins · May 26, 2021 · May 20, 2021 · May 25, 2021
diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
@@ -10,6 +10,7 @@ py_library(
         "__init__.py",
         "anghabench.py",
         "cbench.py",
+        "chstone.py",
         "clgen.py",
         "csmith.py",
         "llvm_stress.py",

diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -9,6 +9,7 @@
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
 from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
 from compiler_gym.envs.llvm.datasets.cbench import CBenchDataset, CBenchLegacyDataset
+from compiler_gym.envs.llvm.datasets.chstone import CHStoneDataset
 from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
@@ -248,6 +249,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
         sort_order=100,
     )
     yield CBenchLegacyDataset(site_data_base=site_data_base)
+    yield CHStoneDataset(site_data_base=site_data_base)
     yield CsmithDataset(site_data_base=site_data_base, sort_order=0)
     yield GitHubDataset(site_data_base=site_data_base, sort_order=0)
     yield LinuxDataset(site_data_base=site_data_base, sort_order=0)

diff --git a/compiler_gym/envs/llvm/datasets/chstone.py b/compiler_gym/envs/llvm/datasets/chstone.py
@@ -0,0 +1,135 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import subprocess
+from concurrent.futures import as_completed
+from pathlib import Path
+from typing import Iterable
+
+from compiler_gym.datasets import Benchmark, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.envs.llvm.llvm_benchmark import ClangInvocation
+from compiler_gym.util import thread_pool
+from compiler_gym.util.filesystem import atomic_file_write
+
+URIS = [
+    "benchmark://chstone-v0/adpcm",
+    "benchmark://chstone-v0/aes",
+    "benchmark://chstone-v0/blowfish",
+    "benchmark://chstone-v0/dfadd",
+    "benchmark://chstone-v0/dfdiv",
+    "benchmark://chstone-v0/dfmul",
+    "benchmark://chstone-v0/dfsin",
+    "benchmark://chstone-v0/gsm",
+    "benchmark://chstone-v0/jpeg",
+    "benchmark://chstone-v0/mips",
+    "benchmark://chstone-v0/motion",
+    "benchmark://chstone-v0/sha",
+]
+
+
+class CHStoneDataset(TarDatasetWithManifest):
+    """A dataset of C programs from the CHStone benchmark suite.
+
+    The dataset is from:
+
+        Hara, Yuko, Hiroyuki Tomiyama, Shinya Honda, Hiroaki Takada, and Katsuya
+        Ishii. "CHStone: A benchmark program suite for practical C-based
+        high-level synthesis." In 2008 IEEE International Symposium on Circuits
+        and Systems, pp. 1192-1195. IEEE, 2008.
+
+    And is available at:
+
+        http://www.ertl.jp/chstone/
+    """
+
+    def __init__(
+        self,
+        site_data_base: Path,
+        sort_order: int = 0,
+    ):
+        super().__init__(
+            name="benchmark://chstone-v0",
+            description="Benchmarks for C-based High-Level Synthesis",
+            references={
+                "Paper": "http://www.yxi.com/applications/iscas2008-300_1027.pdf",
+                "Homepage": "http://www.ertl.jp/chstone/",
+            },
+            license="Mixture of open source and public domain licenses",
+            site_data_base=site_data_base,
+            tar_urls=[
+                "https://github.com/ChrisCummins/patmos_HLS/archive/e62d878ceb91e5a18007ca2e0a9602ee44ff7d59.tar.gz"
+            ],
+            tar_sha256="f7acab9d3c3dc7b971e62c8454bc909d84bddb6d0a96378e41beb94231739acb",
+            strip_prefix="patmos_HLS-e62d878ceb91e5a18007ca2e0a9602ee44ff7d59/benchmarks/CHStone",
+            tar_compression="gz",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+            # We provide our own manifest.
+            manifest_urls=[],
+            manifest_sha256="",
+        )
+
+    def benchmark_uris(self) -> Iterable[str]:
+        yield from URIS
+
+    def benchmark(self, uri: str) -> Benchmark:
+        self.install()
+
+        benchmark_name = uri[len(self.name) + 1 :]
+        if not benchmark_name:
+            raise LookupError(f"No benchmark specified: {uri}")
+
+        bitcode_abspath = self.dataset_root / f"{benchmark_name}.bc"
+
+        # Most of the source files are named after the parent directory, but not
+        # all.
+        c_file_name = {
+            "blowfish": "bf.c",
+            "motion": "mpeg2.c",
+            "sha": "sha_driver.c",
+            "jpeg": "main.c",
+        }.get(benchmark_name, f"{benchmark_name}.c")
+        c_file_abspath = self.dataset_root / benchmark_name / c_file_name
+
+        # If the bitcode file does not exist, compile it from the C source on demand.
+        if not bitcode_abspath.is_file():
+            if not c_file_abspath.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
+                )
+
+            with atomic_file_write(bitcode_abspath) as tmp_path:
+                compile_cmd = ClangInvocation.from_c_file(
+                    c_file_abspath,
+                    copt=[
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_path)
+                subprocess.check_call(compile_cmd, timeout=300)
+
+        return BenchmarkWithSource.create(
+            uri, bitcode_abspath, "function.c", c_file_abspath
+        )
+
+    @property
+    def size(self) -> int:
+        return len(URIS)
+
+    def compile_all(self):
+        n = self.size
+        executor = thread_pool.get_thread_pool_executor()
+        # Since the dataset is lazily compiled, simply iterating over the full
+        # set of URIs will compile everything. Do this in parallel.
+        futures = (
+            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
+        )
+        for i, future in enumerate(as_completed(futures), start=1):
+            future.result()
+            print(
+                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
+                flush=True,
+                end="",
+            )
diff --git a/compiler_gym/util/download.py b/compiler_gym/util/download.py
@@ -68,6 +68,9 @@ def _do_download_attempt(url: str, sha256: Optional[str]) -> bytes:
 
 
 def _download(urls: List[str], sha256: Optional[str], max_retries: int) -> bytes:
+    if not urls:
+        raise ValueError("No URLs to download")
+
     # Cache hit.
     if sha256 and cache_path(f"downloads/{sha256}").is_file():
         with open(str(cache_path(f"downloads/{sha256}")), "rb") as f:

diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
@@ -45,6 +45,19 @@ py_test(
     ],
 )
 
+py_test(
+    name = "chstone_test",
+    timeout = "moderate",
+    srcs = ["chstone_test.py"],
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "clgen_test",
     timeout = "moderate",

diff --git a/tests/llvm/datasets/chstone_test.py b/tests/llvm/datasets/chstone_test.py
@@ -0,0 +1,57 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the CHStone dataset."""
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import CHStoneDataset, chstone
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def chstone_dataset() -> CHStoneDataset:
+    env = gym.make("llvm-v0")
+    try:
+        ds = env.datasets["chstone-v0"]
+    finally:
+        env.close()
+    yield ds
+
+
+def test_anghabench_size(chstone_dataset: CHStoneDataset):
+    assert chstone_dataset.size == 12
+
+
+def test_missing_benchmark_name(chstone_dataset: CHStoneDataset, mocker):
+    # Mock install() so that on CI it doesn't download and unpack the tarfile.
+    mocker.patch.object(chstone_dataset, "install")
+
+    with pytest.raises(
+        LookupError, match=r"^No benchmark specified: benchmark://chstone-v0$"
+    ):
+        chstone_dataset.benchmark("benchmark://chstone-v0")
+    chstone_dataset.install.assert_called_once()
+
+    with pytest.raises(
+        LookupError, match=r"^No benchmark specified: benchmark://chstone-v0/$"
+    ):
+        chstone_dataset.benchmark("benchmark://chstone-v0/")
+    assert chstone_dataset.install.call_count == 2
+
+
+@pytest.mark.parametrize("uri", chstone.URIS)
+def test_chstone_benchmark_reset(
+    env: LlvmEnv, chstone_dataset: CHStoneDataset, uri: str
+):
+    env.reset(chstone_dataset.benchmark(uri))
+    assert env.benchmark == uri
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/llvm/datasets/llvm_datasets_test.py b/tests/llvm/datasets/llvm_datasets_test.py
@@ -16,6 +16,7 @@ def test_default_dataset_list():
             "benchmark://cbench-v1",
             "benchmark://anghabench-v1",
             "benchmark://blas-v0",
+            "benchmark://chstone-v0",
             "benchmark://clgen-v0",
             "benchmark://github-v0",
             "benchmark://linux-v0",

diff --git a/tests/util/download_test.py b/tests/util/download_test.py
@@ -98,5 +98,10 @@ def patched_download(*args):
         download.download("example", sha256="123")
 
 
+def test_download_no_urls():
+    with pytest.raises(ValueError, match="No URLs to download"):
+        download.download(urls=[])
+
+
 if __name__ == "__main__":
     main()