Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jotaibench #705

Merged
merged 18 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ py_library(
"chstone.py",
"clgen.py",
"csmith.py",
"jotaibench.py",
"llvm_stress.py",
"poj104.py",
],
Expand Down
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ cg_py_library(
SRCS
"__init__.py"
"anghabench.py"
"jotaibench.py"
"cbench.py"
"chstone.py"
"clgen.py"
Expand Down
22 changes: 22 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from compiler_gym.envs.llvm.datasets.chstone import CHStoneDataset
from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
from compiler_gym.util.runfiles_path import site_data_path
Expand Down Expand Up @@ -261,6 +262,26 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
manifest_sha256=anghabench_v0_manifest_sha256,
deprecated="Please use anghabench-v1",
)
yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0)
# Add legacy version of Jotaibench using an old manifest.
jotaibench_v0_manifest_url, jotaibench_v0_manifest_sha256 = {
"darwin": (
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
"39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
),
"linux": (
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
"3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
),
}[sys.platform]
yield JotaiBenchDataset(
name="benchmark://jotaibench-v0",
site_data_base=site_data_base,
sort_order=0,
manifest_url=jotaibench_v0_manifest_url,
manifest_sha256=jotaibench_v0_manifest_sha256,
deprecated="Please use jotaibench-v1",
)
yield BlasDataset(site_data_base=site_data_base, sort_order=0)
yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
yield CBenchDataset(site_data_base=site_data_base)
Expand Down Expand Up @@ -302,6 +323,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
"CsmithDataset",
"get_llvm_datasets",
"GitHubDataset",
"JotaiBenchDataset",
"LinuxDataset",
"LlvmStressDataset",
"MibenchDataset",
Expand Down
229 changes: 229 additions & 0 deletions compiler_gym/envs/llvm/datasets/jotaibench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import sys
from concurrent.futures import as_completed
from pathlib import Path
from typing import Optional

from compiler_gym.datasets import Benchmark, TarDataset, TarDatasetWithManifest
from compiler_gym.datasets.benchmark import BenchmarkWithSource
from compiler_gym.datasets.uri import BenchmarkUri
from compiler_gym.envs.llvm.llvm_benchmark import (
ClangInvocation,
get_system_library_flags,
)
from compiler_gym.service.proto import BenchmarkDynamicConfig, Command
from compiler_gym.util import thread_pool
from compiler_gym.util.filesystem import atomic_file_write


class JotaiBenchDataset(TarDatasetWithManifest):
    """A dataset of single-function C programs from the Jotai benchmark suite.

    The benchmarks are compilable C functions mined from GitHub source code.
    The collection is derived from the AnghaBench corpus described in:

        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
        International Symposium on Code Generation and Optimization (CGO),
        pp. 378-390. IEEE, 2021.

    And is available at:

        http://cuda.dcc.ufmg.br/Jotai/src/

    Installation
    ------------

    The JotaiBench dataset consists of C functions that are compiled to LLVM-IR
    on-demand and cached. The first time each benchmark is used there is an
    overhead of compiling it from C to bitcode. This is a one-off cost.
    """

    def __init__(
        self,
        site_data_base: Path,
        sort_order: int = 0,
        manifest_url: Optional[str] = None,
        manifest_sha256: Optional[str] = None,
        deprecated: Optional[str] = None,
        name: Optional[str] = None,
    ):
        """Construct the dataset.

        :param site_data_base: The root of the site data path where the
            downloaded archive and compiled bitcodes are cached.
        :param sort_order: A numeric value controlling the order in which this
            dataset is listed relative to others.
        :param manifest_url: Override the default manifest URL (used to expose
            deprecated legacy versions of the dataset).
        :param manifest_sha256: Override the default manifest checksum. Must be
            provided together with :code:`manifest_url`.
        :param deprecated: An optional deprecation message for legacy versions.
        :param name: Override the default dataset name
            ("benchmark://jotaibench-v1").
        """
        # Default manifest location, keyed by platform. Both platforms
        # currently share the same archive URL and checksum, but the mapping is
        # kept so that per-platform manifests can diverge later.
        manifest_url_, manifest_sha256_ = {
            "darwin": (
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true",
                "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            ),
            "linux": (
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true",
                "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            ),
        }[sys.platform]
        super().__init__(
            name=name or "benchmark://jotaibench-v1",
            description="Compile-only C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                # Point at the Jotai homepage (see class docstring), not the
                # AnghaBench homepage that was copy-pasted here previously.
                "Homepage": "http://cuda.dcc.ufmg.br/Jotai/src/",
            },
            license="GNU General Public License v3.0 (GPLv3)",
            site_data_base=site_data_base,
            manifest_urls=[manifest_url or manifest_url_],
            manifest_sha256=manifest_sha256 or manifest_sha256_,
            tar_urls=[
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true"
            ],
            tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            strip_prefix="jotaibench-v1",
            tar_compression="bz2",
            benchmark_file_suffix=".c",
            sort_order=sort_order,
            deprecated=deprecated,
        )

    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
        """Resolve a benchmark URI to a benchmark, compiling it on-demand.

        :param uri: A parsed benchmark URI whose path names a C source file in
            the dataset (without extension).
        :return: A :class:`BenchmarkWithSource` pairing the bitcode with the
            original C source.
        :raises LookupError: If the URI names no benchmark, or the C source
            file does not exist.
        """
        self.install()

        benchmark_name = uri.path[1:]
        if not benchmark_name:
            raise LookupError(f"No benchmark specified: {uri}")

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / benchmark_name

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        # If the bitcode does not exist yet, compile it on-demand. The result
        # is cached, so the C -> bitcode cost is paid only once per benchmark.
        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # atomic_file_write guarantees that a partially-written bitcode
            # file is never observed at the final path (e.g. on timeout).
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        return BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

    def compile_all(self):
        """Eagerly compile every benchmark in the dataset.

        Since the dataset is lazily compiled, simply iterating over the full
        set of URIs compiles everything. This is done in parallel on a shared
        thread pool, with progress printed to stdout.
        """
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()  # Re-raise any compilation error.
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )


class JotaiBenchRunnableDataset(TarDataset):
    """A dataset of runnable single-function C programs from the Jotai suite.

    Unlike :class:`JotaiBenchDataset`, benchmarks from this dataset carry a
    dynamic configuration (build and run commands) so that they can be
    executed, not just compiled.
    """

    def __init__(
        self,
        site_data_base: Path,
    ):
        """Construct the dataset.

        :param site_data_base: The root of the site data path where the
            downloaded archive and compiled bitcodes are cached.
        """
        super().__init__(
            name="benchmark://jotai-runnable-v1",
            description="Runnable C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                # Point at the Jotai homepage, not the AnghaBench homepage
                # that was copy-pasted here previously.
                "Homepage": "http://cuda.dcc.ufmg.br/Jotai/src/",
            },
            license="GNU General Public License v3.0 (GPLv3)",
            site_data_base=site_data_base,
            tar_urls=[
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true"
            ],
            tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            strip_prefix="jotaibench-v1",
            tar_compression="bz2",
            benchmark_file_suffix=".c",
        )

    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
        """Resolve a benchmark URI to a runnable benchmark.

        The C source is compiled to bitcode on-demand (cached), and the
        returned benchmark is given a dynamic configuration describing how to
        build and run a native executable from it.

        :param uri: A parsed benchmark URI whose path names a C source file in
            the dataset (without extension).
        :return: A :class:`BenchmarkWithSource` with a populated
            :code:`dynamic_config`.
        :raises LookupError: If the URI names no benchmark, or the C source
            file does not exist.
        """
        self.install()

        benchmark_name = uri.path[1:]
        if not benchmark_name:
            raise LookupError(f"No benchmark specified: {uri}")

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / benchmark_name

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        # If the bitcode does not exist yet, compile it on-demand. The result
        # is cached, so the C -> bitcode cost is paid only once per benchmark.
        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # atomic_file_write guarantees that a partially-written bitcode
            # file is never observed at the final path (e.g. on timeout).
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        benchmark = BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

        # Attaching a dynamic config is what makes a benchmark "runnable".
        benchmark.proto.dynamic_config.MergeFrom(
            BenchmarkDynamicConfig(
                build_cmd=Command(
                    argument=["$CC", "$IN"] + get_system_library_flags(),
                    timeout_seconds=30,
                    outfile=["a.out"],
                ),
                run_cmd=Command(
                    # NOTE(review): this is a single argv entry containing a
                    # space ("./a.out 0"). Confirm that the service
                    # shell-splits command arguments; otherwise this should
                    # likely be ["./a.out", "0"].
                    argument=["./a.out 0"],
                    timeout_seconds=30,
                    infile=[],
                    outfile=[],
                ),
            )
        )

        return benchmark

    def compile_all(self):
        """Eagerly compile every benchmark in the dataset.

        Since the dataset is lazily compiled, simply iterating over the full
        set of URIs compiles everything. This is done in parallel on a shared
        thread pool, with progress printed to stdout.
        """
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()  # Re-raise any compilation error.
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )
2 changes: 2 additions & 0 deletions docs/source/llvm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Datasets

.. autoclass:: GitHubDataset

.. autoclass:: JotaiBenchDataset

.. autoclass:: LinuxDataset

.. autoclass:: LlvmStressDataset
Expand Down
4 changes: 3 additions & 1 deletion docs/source/llvm/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://github-v0 | 49,738 | Compile-only C/C++ objects from GitHub [`Paper <https://arxiv.org/pdf/2012.01470.pdf>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://jotaibench-v1  | 18,761                   | Compile-only C/C++ functions extracted from GitHub [`Homepage <https://github.com/lac-dcc/jotai-benchmarks>`__, `Paper <https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf>`__]          | No                   |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://linux-v0 | 13,894 | Compile-only object files from C Linux kernel [`Homepage <https://www.linux.org/>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://mibench-v1 | 40 | C benchmarks [`Paper <http://vhosts.eecs.umich.edu/mibench/Publications/MiBench.pdf>`__] | No |
Expand All @@ -56,7 +58,7 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| generator://llvm-stress-v0 | ∞ | Randomly generated LLVM-IR [`Documentation <https://llvm.org/docs/CommandGuide/llvm-stress.html>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| Total | 1,158,701 | | |
| Total | 1,177,462 | | |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+

.. [#f1] Values are for the Linux datasets. Some of the datasets contain fewer
Expand Down
14 changes: 14 additions & 0 deletions tests/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ py_test(
],
)

py_test(
name = "jotaibench_test",
timeout = "long",
srcs = ["jotaibench_test.py"],
shard_count = 8,
deps = [
"//compiler_gym/envs/llvm",
"//compiler_gym/envs/llvm/datasets",
"//tests:test_main",
"//tests/pytest_plugins:common",
"//tests/pytest_plugins:llvm",
],
)

py_test(
name = "cbench_test",
timeout = "long",
Expand Down
13 changes: 13 additions & 0 deletions tests/llvm/datasets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,19 @@ cg_py_test(
tests::test_main
)

cg_py_test(
NAME
jotaibench_test
SRCS
"jotaibench_test.py"
DEPS
compiler_gym::envs::llvm::llvm
compiler_gym::envs::llvm::datasets::datasets
tests::pytest_plugins::common
tests::pytest_plugins::llvm
tests::test_main
)

cg_py_test(
NAME
llvm_datasets_test
Expand Down
Loading