Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jotaibench #705

Merged
merged 18 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ py_library(
"chstone.py",
"clgen.py",
"csmith.py",
"jotaibench.py",
"llvm_stress.py",
"poj104.py",
],
Expand Down
1 change: 1 addition & 0 deletions compiler_gym/envs/llvm/datasets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ cg_py_library(
SRCS
"__init__.py"
"anghabench.py"
"jotaibench.py"
"cbench.py"
"chstone.py"
"clgen.py"
Expand Down
22 changes: 22 additions & 0 deletions compiler_gym/envs/llvm/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from compiler_gym.envs.llvm.datasets.chstone import CHStoneDataset
from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
from compiler_gym.util.runfiles_path import site_data_path
Expand Down Expand Up @@ -261,6 +262,26 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
manifest_sha256=anghabench_v0_manifest_sha256,
deprecated="Please use anghabench-v1",
)
yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0)
# Add legacy version of Jotaibench using an old manifest.
jotaibench_v0_manifest_url, jotaibench_v0_manifest_sha256 = {
"darwin": (
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
"39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
),
"linux": (
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
"3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
),
}[sys.platform]
yield JotaiBenchDataset(
name="benchmark://jotaibench-v0",
site_data_base=site_data_base,
sort_order=0,
manifest_url=jotaibench_v0_manifest_url,
manifest_sha256=jotaibench_v0_manifest_sha256,
deprecated="Please use jotaibench-v1",
)
yield BlasDataset(site_data_base=site_data_base, sort_order=0)
yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
yield CBenchDataset(site_data_base=site_data_base)
Expand Down Expand Up @@ -302,6 +323,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
"CsmithDataset",
"get_llvm_datasets",
"GitHubDataset",
"JotaiBenchDataset",
"LinuxDataset",
"LlvmStressDataset",
"MibenchDataset",
Expand Down
229 changes: 229 additions & 0 deletions compiler_gym/envs/llvm/datasets/jotaibench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,229 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
import subprocess
import sys
from concurrent.futures import as_completed
from pathlib import Path
from typing import Optional

from compiler_gym.datasets import Benchmark, TarDataset, TarDatasetWithManifest
from compiler_gym.datasets.benchmark import BenchmarkWithSource
from compiler_gym.datasets.uri import BenchmarkUri
from compiler_gym.envs.llvm.llvm_benchmark import (
ClangInvocation,
get_system_library_flags,
)
from compiler_gym.service.proto import BenchmarkDynamicConfig, Command
from compiler_gym.util import thread_pool
from compiler_gym.util.filesystem import atomic_file_write


class JotaiBenchDataset(TarDatasetWithManifest):
    """A dataset of single-function C programs from the Jotai benchmark suite.

    The benchmarks are compilable C functions mined from GitHub source code.
    The collection is derived from the AnghaBench corpus described in:

        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
        International Symposium on Code Generation and Optimization (CGO),
        pp. 378-390. IEEE, 2021.

    And is available at:

        http://cuda.dcc.ufmg.br/Jotai/src/

    Installation
    ------------

    The JotaiBench dataset consists of C functions that are compiled to LLVM-IR
    on-demand and cached. The first time each benchmark is used there is an
    overhead of compiling it from C to bitcode. This is a one-off cost.
    """

    def __init__(
        self,
        site_data_base: Path,
        sort_order: int = 0,
        manifest_url: Optional[str] = None,
        manifest_sha256: Optional[str] = None,
        deprecated: Optional[str] = None,
        name: Optional[str] = None,
    ):
        """Construct the dataset.

        :param site_data_base: The root of the site data path where the
            downloaded archive and compiled bitcodes are cached.
        :param sort_order: A numeric value controlling the order in which this
            dataset is listed relative to others.
        :param manifest_url: Override the default manifest URL (used to expose
            deprecated legacy versions of the dataset).
        :param manifest_sha256: Override the default manifest checksum. Must be
            provided together with :code:`manifest_url`.
        :param deprecated: An optional deprecation message for legacy versions.
        :param name: Override the default dataset name
            ("benchmark://jotaibench-v1").
        """
        # Default manifest location, keyed by platform. Both platforms
        # currently share the same archive URL and checksum, but the mapping is
        # kept so that per-platform manifests can diverge later.
        manifest_url_, manifest_sha256_ = {
            "darwin": (
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true",
                "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            ),
            "linux": (
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true",
                "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            ),
        }[sys.platform]
        super().__init__(
            name=name or "benchmark://jotaibench-v1",
            description="Compile-only C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                # Point at the Jotai homepage (see class docstring), not the
                # AnghaBench homepage that was copy-pasted here previously.
                "Homepage": "http://cuda.dcc.ufmg.br/Jotai/src/",
            },
            license="GNU General Public License v3.0 (GPLv3)",
            site_data_base=site_data_base,
            manifest_urls=[manifest_url or manifest_url_],
            manifest_sha256=manifest_sha256 or manifest_sha256_,
            tar_urls=[
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true"
            ],
            tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            strip_prefix="jotaibench-v1",
            tar_compression="bz2",
            benchmark_file_suffix=".c",
            sort_order=sort_order,
            deprecated=deprecated,
        )

    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
        """Resolve a benchmark URI to a benchmark, compiling it on-demand.

        :param uri: A parsed benchmark URI whose path names a C source file in
            the dataset (without extension).
        :return: A :class:`BenchmarkWithSource` pairing the bitcode with the
            original C source.
        :raises LookupError: If the URI names no benchmark, or the C source
            file does not exist.
        """
        self.install()

        benchmark_name = uri.path[1:]
        if not benchmark_name:
            raise LookupError(f"No benchmark specified: {uri}")

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / benchmark_name

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        # If the bitcode does not exist yet, compile it on-demand. The result
        # is cached, so the C -> bitcode cost is paid only once per benchmark.
        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # atomic_file_write guarantees that a partially-written bitcode
            # file is never observed at the final path (e.g. on timeout).
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        return BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

    def compile_all(self):
        """Eagerly compile every benchmark in the dataset.

        Since the dataset is lazily compiled, simply iterating over the full
        set of URIs compiles everything. This is done in parallel on a shared
        thread pool, with progress printed to stdout.
        """
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()  # Re-raise any compilation error.
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )


class JotaiBenchRunnableDataset(TarDataset):
    """A dataset of runnable single-function C programs from the Jotai suite.

    Unlike :class:`JotaiBenchDataset`, benchmarks from this dataset carry a
    dynamic configuration (build and run commands) so that they can be
    executed, not just compiled.
    """

    def __init__(
        self,
        site_data_base: Path,
    ):
        """Construct the dataset.

        :param site_data_base: The root of the site data path where the
            downloaded archive and compiled bitcodes are cached.
        """
        super().__init__(
            name="benchmark://jotai-runnable-v1",
            description="Runnable C/C++ functions extracted from GitHub",
            references={
                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                # Point at the Jotai homepage, not the AnghaBench homepage
                # that was copy-pasted here previously.
                "Homepage": "http://cuda.dcc.ufmg.br/Jotai/src/",
            },
            license="GNU General Public License v3.0 (GPLv3)",
            site_data_base=site_data_base,
            tar_urls=[
                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true"
            ],
            tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983",
            strip_prefix="jotaibench-v1",
            tar_compression="bz2",
            benchmark_file_suffix=".c",
        )

    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
        """Resolve a benchmark URI to a runnable benchmark.

        The C source is compiled to bitcode on-demand (cached), and the
        returned benchmark is given a dynamic configuration describing how to
        build and run a native executable from it.

        :param uri: A parsed benchmark URI whose path names a C source file in
            the dataset (without extension).
        :return: A :class:`BenchmarkWithSource` with a populated
            :code:`dynamic_config`.
        :raises LookupError: If the URI names no benchmark, or the C source
            file does not exist.
        """
        self.install()

        benchmark_name = uri.path[1:]
        if not benchmark_name:
            raise LookupError(f"No benchmark specified: {uri}")

        # The absolute path of the file, without an extension.
        path_stem = self.dataset_root / benchmark_name

        bitcode_abspath = Path(f"{path_stem}.bc")
        c_file_abspath = Path(f"{path_stem}.c")

        # If the bitcode does not exist yet, compile it on-demand. The result
        # is cached, so the C -> bitcode cost is paid only once per benchmark.
        if not bitcode_abspath.is_file():
            if not c_file_abspath.is_file():
                raise LookupError(
                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
                )

            # atomic_file_write guarantees that a partially-written bitcode
            # file is never observed at the final path (e.g. on timeout).
            with atomic_file_write(bitcode_abspath) as tmp_path:
                compile_cmd = ClangInvocation.from_c_file(
                    c_file_abspath,
                    copt=[
                        "-ferror-limit=1",  # Stop on first error.
                        "-w",  # No warnings.
                    ],
                ).command(outpath=tmp_path)
                subprocess.check_call(compile_cmd, timeout=300)

        benchmark = BenchmarkWithSource.create(
            uri, bitcode_abspath, "function.c", c_file_abspath
        )

        # Attaching a dynamic config is what makes a benchmark "runnable".
        benchmark.proto.dynamic_config.MergeFrom(
            BenchmarkDynamicConfig(
                build_cmd=Command(
                    argument=["$CC", "$IN"] + get_system_library_flags(),
                    timeout_seconds=30,
                    outfile=["a.out"],
                ),
                run_cmd=Command(
                    # NOTE(review): this is a single argv entry containing a
                    # space ("./a.out 0"). Confirm that the service
                    # shell-splits command arguments; otherwise this should
                    # likely be ["./a.out", "0"].
                    argument=["./a.out 0"],
                    timeout_seconds=30,
                    infile=[],
                    outfile=[],
                ),
            )
        )

        return benchmark

    def compile_all(self):
        """Eagerly compile every benchmark in the dataset.

        Since the dataset is lazily compiled, simply iterating over the full
        set of URIs compiles everything. This is done in parallel on a shared
        thread pool, with progress printed to stdout.
        """
        n = self.size
        executor = thread_pool.get_thread_pool_executor()
        futures = (
            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
        )
        for i, future in enumerate(as_completed(futures), start=1):
            future.result()  # Re-raise any compilation error.
            print(
                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
                flush=True,
                end="",
            )
2 changes: 2 additions & 0 deletions docs/source/llvm/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ Datasets

.. autoclass:: GitHubDataset

.. autoclass:: JotaiBenchDataset

.. autoclass:: LinuxDataset

.. autoclass:: LlvmStressDataset
Expand Down
4 changes: 3 additions & 1 deletion docs/source/llvm/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://github-v0 | 49,738 | Compile-only C/C++ objects from GitHub [`Paper <https://arxiv.org/pdf/2012.01470.pdf>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://jotaibench-v1  | 18,761                   | Compile-only C/C++ functions extracted from GitHub [`Homepage <https://github.com/lac-dcc/jotai-benchmarks>`__, `Paper <https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf>`__]          | No                   |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://linux-v0 | 13,894 | Compile-only object files from C Linux kernel [`Homepage <https://www.linux.org/>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| benchmark://mibench-v1 | 40 | C benchmarks [`Paper <http://vhosts.eecs.umich.edu/mibench/Publications/MiBench.pdf>`__] | No |
Expand All @@ -56,7 +58,7 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| generator://llvm-stress-v0 | ∞ | Randomly generated LLVM-IR [`Documentation <https://llvm.org/docs/CommandGuide/llvm-stress.html>`__] | No |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
| Total | 1,158,701 | | |
| Total | 1,177,462 | | |
+----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+

.. [#f1] Values are for the Linux datasets. Some of the datasets contain fewer
Expand Down
14 changes: 14 additions & 0 deletions tests/llvm/datasets/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,20 @@ py_test(
],
)

py_test(
name = "jotaibench_test",
timeout = "long",
srcs = ["jotaibench_test.py"],
shard_count = 8,
deps = [
"//compiler_gym/envs/llvm",
"//compiler_gym/envs/llvm/datasets",
"//tests:test_main",
"//tests/pytest_plugins:common",
"//tests/pytest_plugins:llvm",
],
)

py_test(
name = "cbench_test",
timeout = "long",
Expand Down
13 changes: 13 additions & 0 deletions tests/llvm/datasets/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,19 @@ cg_py_test(
tests::test_main
)

cg_py_test(
NAME
jotaibench_test
SRCS
"jotaibench_test.py"
DEPS
compiler_gym::envs::llvm::llvm
compiler_gym::envs::llvm::datasets::datasets
tests::pytest_plugins::common
tests::pytest_plugins::llvm
tests::test_main
)

cg_py_test(
NAME
llvm_datasets_test
Expand Down
Loading