diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index 3fe83022d..4f2577db0 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -13,6 +13,7 @@ py_library(
         "chstone.py",
         "clgen.py",
         "csmith.py",
+        "jotaibench.py",
         "llvm_stress.py",
         "poj104.py",
     ],
diff --git a/compiler_gym/envs/llvm/datasets/CMakeLists.txt b/compiler_gym/envs/llvm/datasets/CMakeLists.txt
index 3dd710eb6..bb776d1e6 100644
--- a/compiler_gym/envs/llvm/datasets/CMakeLists.txt
+++ b/compiler_gym/envs/llvm/datasets/CMakeLists.txt
@@ -11,6 +11,7 @@ cg_py_library(
   SRCS
     "__init__.py"
     "anghabench.py"
+    "jotaibench.py"
     "cbench.py"
     "chstone.py"
     "clgen.py"
diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 31d714b32..b6d4deb7c 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -16,6 +16,7 @@
 from compiler_gym.envs.llvm.datasets.chstone import CHStoneDataset
 from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
+from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
 from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
 from compiler_gym.util.runfiles_path import site_data_path
@@ -261,6 +262,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
         manifest_sha256=anghabench_v0_manifest_sha256,
         deprecated="Please use anghabench-v1",
     )
+    yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0)
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
     yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CBenchDataset(site_data_base=site_data_base)
@@ -302,6 +304,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
     "CsmithDataset",
     "get_llvm_datasets",
     "GitHubDataset",
+    "JotaiBenchDataset",
     "LinuxDataset",
     "LlvmStressDataset",
     "MibenchDataset",
diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
new file mode 100644
index 000000000..0012cf7ad
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -0,0 +1,229 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import subprocess
+import sys
+from concurrent.futures import as_completed
+from pathlib import Path
+from typing import Optional
+
+from compiler_gym.datasets import Benchmark, TarDataset, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.datasets.uri import BenchmarkUri
+from compiler_gym.envs.llvm.llvm_benchmark import (
+    ClangInvocation,
+    get_system_library_flags,
+)
+from compiler_gym.service.proto import BenchmarkDynamicConfig, Command
+from compiler_gym.util import thread_pool
+from compiler_gym.util.filesystem import atomic_file_write
+
+
+class JotaiBenchDataset(TarDatasetWithManifest):
+    """A dataset of C programs curated from GitHub source code.
+
+    The dataset is built from the same corpus of C functions as AnghaBench:
+
+        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
+        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
+        Fernando Magno Quinão Pereira.
"ANGHABENCH: A Suite with One Million + Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM + International Symposium on Code Generation and Optimization (CGO), + pp. 378-390. IEEE, 2021. + + And is available at: + + http://cuda.dcc.ufmg.br/Jotai/src/ + + Installation + ------------ + + The JotaiBench dataset consists of C functions that are compiled to LLVM-IR + on-demand and cached. The first time each benchmark is used there is an + overhead of compiling it from C to bitcode. This is a one-off cost. + """ + + def __init__( + self, + site_data_base: Path, + sort_order: int = 0, + manifest_url: Optional[str] = None, + manifest_sha256: Optional[str] = None, + deprecated: Optional[str] = None, + name: Optional[str] = None, + ): + manifest_url_, manifest_sha256_ = { + "darwin": ( + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", + "b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + ), + "linux": ( + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", + "b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + ), + }[sys.platform] + super().__init__( + name=name or "benchmark://jotaibench-v0", + description="Compile-only C/C++ functions extracted from GitHub", + references={ + "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf", + "Homepage": "http://cuda.dcc.ufmg.br/angha/", + }, + license="GNU General Public License v3.0 (GPLv3)", + site_data_base=site_data_base, + manifest_urls=[manifest_url or manifest_url_], + manifest_sha256=manifest_sha256 or manifest_sha256_, + tar_urls=[ + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" + ], + tar_sha256="b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + strip_prefix="jotaibench-v0", + tar_compression="bz2", + benchmark_file_suffix=".c", + sort_order=sort_order, + deprecated=deprecated, + ) + + def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark: + self.install() + + benchmark_name = uri.path[1:] + if not benchmark_name: + raise LookupError(f"No benchmark specified: {uri}") + + # The absolute path of the file, without an extension. + path_stem = self.dataset_root / benchmark_name + + bitcode_abspath = Path(f"{path_stem}.bc") + c_file_abspath = Path(f"{path_stem}.c") + + # If the file does not exist, compile it on-demand. + if not bitcode_abspath.is_file(): + if not c_file_abspath.is_file(): + raise LookupError( + f"Benchmark not found: {uri} (file not found: {c_file_abspath})" + ) + + with atomic_file_write(bitcode_abspath) as tmp_path: + compile_cmd = ClangInvocation.from_c_file( + c_file_abspath, + copt=[ + "-ferror-limit=1", # Stop on first error. + "-w", # No warnings. + ], + ).command(outpath=tmp_path) + subprocess.check_call(compile_cmd, timeout=300) + + return BenchmarkWithSource.create( + uri, bitcode_abspath, "function.c", c_file_abspath + ) + + def compile_all(self): + n = self.size + executor = thread_pool.get_thread_pool_executor() + # Since the dataset is lazily compiled, simply iterating over the full + # set of URIs will compile everything. Do this in parallel. 
+ futures = ( + executor.submit(self.benchmark, uri) for uri in self.benchmark_uris() + ) + for i, future in enumerate(as_completed(futures), start=1): + future.result() + print( + f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)", + flush=True, + end="", + ) + + +class JotaiBenchRunnableDataset(TarDataset): + def __init__( + self, + site_data_base: Path, + ): + super().__init__( + name="benchmark://jotai-runnable-v0", + description="Runnable C/C++ functions extracted from GitHub", + references={ + "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf", + "Homepage": "http://cuda.dcc.ufmg.br/angha/", + }, + license="GNU General Public License v3.0 (GPLv3)", + site_data_base=site_data_base, + tar_urls=[ + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" + ], + tar_sha256="b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + strip_prefix="jotaibench-v0", + tar_compression="bz2", + benchmark_file_suffix=".c", + ) + + def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark: + self.install() + + benchmark_name = uri.path[1:] + if not benchmark_name: + raise LookupError(f"No benchmark specified: {uri}") + + # The absolute path of the file, without an extension. + path_stem = self.dataset_root / benchmark_name + + bitcode_abspath = Path(f"{path_stem}.bc") + c_file_abspath = Path(f"{path_stem}.c") + + # If the file does not exist, compile it to a bitcode file on-demand. + if not bitcode_abspath.is_file(): + if not c_file_abspath.is_file(): + raise LookupError( + f"Benchmark not found: {uri} (file not found: {c_file_abspath})" + ) + + with atomic_file_write(bitcode_abspath) as tmp_path: + compile_cmd = ClangInvocation.from_c_file( + c_file_abspath, + copt=[ + "-ferror-limit=1", # Stop on first error. + "-w", # No warnings. + ], + ).command(outpath=tmp_path) + subprocess.check_call(compile_cmd, timeout=300) + + benchmark = BenchmarkWithSource.create( + uri, bitcode_abspath, "function.c", c_file_abspath + ) + + # This is what makes a benchmark "runnable". + benchmark.proto.dynamic_config.MergeFrom( + BenchmarkDynamicConfig( + build_cmd=Command( + argument=["$CC", "$IN"] + get_system_library_flags(), + timeout_seconds=30, + outfile=["a.out"], + ), + run_cmd=Command( + argument=["./a.out 0"], + timeout_seconds=30, + infile=[], + outfile=[], + ), + ) + ) + + return benchmark + + def compile_all(self): + n = self.size + executor = thread_pool.get_thread_pool_executor() + # Since the dataset is lazily compiled, simply iterating over the full + # set of URIs will compile everything. Do this in parallel. + futures = ( + executor.submit(self.benchmark, uri) for uri in self.benchmark_uris() + ) + for i, future in enumerate(as_completed(futures), start=1): + future.result() + print( + f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)", + flush=True, + end="", + ) diff --git a/docs/source/llvm/api.rst b/docs/source/llvm/api.rst index 36c3adc5c..06af7fffd 100644 --- a/docs/source/llvm/api.rst +++ b/docs/source/llvm/api.rst @@ -45,6 +45,8 @@ Datasets .. autoclass:: GitHubDataset +.. autoclass:: JotaiBenchDataset + .. autoclass:: LinuxDataset .. 
autoclass:: LlvmStressDataset diff --git a/docs/source/llvm/index.rst b/docs/source/llvm/index.rst index 704a2f5c7..216cc8ddb 100644 --- a/docs/source/llvm/index.rst +++ b/docs/source/llvm/index.rst @@ -40,6 +40,8 @@ We provide several datasets of open-source LLVM-IR benchmarks for use: +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ | benchmark://github-v0 | 49,738 | Compile-only C/C++ objects from GitHub [`Paper `__] | No | +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ +| benchmark://jotaibench-v1 | 18,761 | Compile-only C/C++ functions extracted from GitHub [`Homepage `__, `Paper`__] | No | ++----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ | benchmark://linux-v0 | 13,894 | Compile-only object files from C Linux kernel [`Homepage `__] | No | +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ | benchmark://mibench-v1 | 40 | C benchmarks [`Paper `__] | No | @@ -56,7 +58,7 @@ We provide several datasets of open-source LLVM-IR benchmarks for use: +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ | generator://llvm-stress-v0 | ∞ | Randomly generated LLVM-IR [`Documentation `__] | No | +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ -| Total | 1,158,701 | | | +| Total | 1,177,462 | | | +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+ .. [#f1] Values are for the Linux datasets. 
Some of the datasets contain fewer
diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index 880afaae3..5c59969b5 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -18,6 +18,20 @@ py_test(
     ],
 )

+py_test(
+    name = "jotaibench_test",
+    timeout = "long",
+    srcs = ["jotaibench_test.py"],
+    shard_count = 8,
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "cbench_test",
     timeout = "long",
diff --git a/tests/llvm/datasets/CMakeLists.txt b/tests/llvm/datasets/CMakeLists.txt
index e07084a4c..f0cd0a736 100644
--- a/tests/llvm/datasets/CMakeLists.txt
+++ b/tests/llvm/datasets/CMakeLists.txt
@@ -97,6 +97,19 @@ cg_py_test(
   tests::test_main
 )

+cg_py_test(
+  NAME
+    jotaibench_test
+  SRCS
+    "jotaibench_test.py"
+  DEPS
+    compiler_gym::envs::llvm::llvm
+    compiler_gym::envs::llvm::datasets::datasets
+    tests::pytest_plugins::common
+    tests::pytest_plugins::llvm
+    tests::test_main
+)
+
 cg_py_test(
   NAME
     llvm_datasets_test
diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py
new file mode 100644
index 000000000..e0cc408ff
--- /dev/null
+++ b/tests/llvm/datasets/jotaibench_test.py
@@ -0,0 +1,64 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the JotaiBench dataset."""
+from itertools import islice
+from pathlib import Path
+
+import gym
+import pytest
+
+import compiler_gym.envs.llvm  # noqa register environments
+from compiler_gym.envs.llvm import LlvmEnv
+from compiler_gym.envs.llvm.datasets import JotaiBenchDataset
+from tests.pytest_plugins.common import skip_on_ci
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"]
+
+
+@pytest.fixture(scope="module")
+def jotaibench_dataset() -> JotaiBenchDataset:
+    with gym.make("llvm-v0") as env:
+        ds = env.datasets["jotaibench-v0"]
+    yield ds
+
+
+def test_jotaibench_size(jotaibench_dataset: JotaiBenchDataset):
+    assert jotaibench_dataset.size == 2138894
+
+
+def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker):
+    # Mock install() so that on CI it doesn't download and unpack the tarfile.
+    mocker.patch.object(jotaibench_dataset, "install")
+
+    with pytest.raises(
+        LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v0$"
+    ):
+        jotaibench_dataset.benchmark("benchmark://jotaibench-v0")
+    jotaibench_dataset.install.assert_called_once()
+
+    with pytest.raises(
+        LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v0/$"
+    ):
+        jotaibench_dataset.benchmark("benchmark://jotaibench-v0/")
+    assert jotaibench_dataset.install.call_count == 2
+
+
+@skip_on_ci
+@pytest.mark.parametrize("index", range(250))
+def test_jotaibench_random_select(
+    env: LlvmEnv, jotaibench_dataset: JotaiBenchDataset, index: int, tmpwd: Path
+):
+    uri = next(islice(jotaibench_dataset.benchmark_uris(), index, None))
+    benchmark = jotaibench_dataset.benchmark(uri)
+    env.reset(benchmark=benchmark)
+
+    assert benchmark.source
+    benchmark.write_sources_to_directory(tmpwd)
+    assert (tmpwd / "function.c").is_file()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/llvm/datasets/llvm_datasets_test.py b/tests/llvm/datasets/llvm_datasets_test.py
index 61251f649..bf2f34f58 100644
--- a/tests/llvm/datasets/llvm_datasets_test.py
+++ b/tests/llvm/datasets/llvm_datasets_test.py
@@ -18,6 +18,7 @@ def test_default_dataset_list():
         "benchmark://chstone-v0",
         "benchmark://clgen-v0",
         "benchmark://github-v0",
+        "benchmark://jotaibench-v0",
         "benchmark://linux-v0",
         "benchmark://mibench-v1",
         "benchmark://npb-v0",
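For reference, a minimal usage sketch of the dataset registered by this patch, mirroring the API already exercised in jotaibench_test.py above. It is illustrative only, not part of the patch, and assumes the patch has been applied and the llvm-v0 environment is installed:

import gym

import compiler_gym.envs.llvm  # noqa: registers the LLVM environments

with gym.make("llvm-v0") as env:
    dataset = env.datasets["benchmark://jotaibench-v0"]
    # The first use of a benchmark compiles its C source to bitcode on demand;
    # later uses hit the cached .bc file stored next to the .c file.
    benchmark = dataset.benchmark(next(dataset.benchmark_uris()))
    env.reset(benchmark=benchmark)
    print(env.benchmark)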
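Note that JotaiBenchRunnableDataset is defined in jotaibench.py but not yielded by get_llvm_datasets(), so this patch does not expose it through env.datasets. A sketch of constructing it directly, assuming the default site data root that get_llvm_datasets() uses:

from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchRunnableDataset
from compiler_gym.util.runfiles_path import site_data_path

dataset = JotaiBenchRunnableDataset(site_data_base=site_data_path("llvm-v0"))
benchmark = dataset.benchmark(next(dataset.benchmark_uris()))
# The BenchmarkDynamicConfig attached in benchmark_from_parsed_uri() is what
# makes the benchmark runnable: build_cmd compiles the bitcode to a.out and
# run_cmd executes it with a fixed input.
print(benchmark.proto.dynamic_config.build_cmd.argument)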