From 0ed344700f4f1e39dc52be2b6dc67ce20e7db091 Mon Sep 17 00:00:00 2001
From: canesche
Date: Sat, 4 Jun 2022 17:14:02 -0300
Subject: [PATCH 01/14] Creating the Jotai benchmark.

---
 compiler_gym/envs/llvm/datasets/BUILD         |   1 +
 .../envs/llvm/datasets/CMakeLists.txt         |   1 +
 compiler_gym/envs/llvm/datasets/__init__.py   |  18 ++
 compiler_gym/envs/llvm/datasets/jotaibench.py | 243 ++++++++++++++++++
 compiler_gym/test_jotai.py                    |  36 +++
 tests/llvm/datasets/jotaibench_test.py        |  69 +++++
 6 files changed, 368 insertions(+)
 create mode 100644 compiler_gym/envs/llvm/datasets/jotaibench.py
 create mode 100644 compiler_gym/test_jotai.py
 create mode 100644 tests/llvm/datasets/jotaibench_test.py

diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index 3fe83022d..39391e937 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -9,6 +9,7 @@ py_library(
     srcs = [
         "__init__.py",
         "anghabench.py",
+        "jotaibench.py",
        "cbench.py",
         "chstone.py",
         "clgen.py",
diff --git a/compiler_gym/envs/llvm/datasets/CMakeLists.txt b/compiler_gym/envs/llvm/datasets/CMakeLists.txt
index 3dd710eb6..bb776d1e6 100644
--- a/compiler_gym/envs/llvm/datasets/CMakeLists.txt
+++ b/compiler_gym/envs/llvm/datasets/CMakeLists.txt
@@ -11,6 +11,7 @@ cg_py_library(
   SRCS
     "__init__.py"
     "anghabench.py"
+    "jotaibench.py"
     "cbench.py"
     "chstone.py"
     "clgen.py"
diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 31d714b32..55e460f69 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -8,6 +8,7 @@
 
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
 from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
+from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
 from compiler_gym.envs.llvm.datasets.cbench import (
     CBenchDataset,
     CBenchLegacyDataset,
@@ -261,6 +262,22 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
         manifest_sha256=anghabench_v0_manifest_sha256,
         deprecated="Please use anghabench-v1",
     )
+    yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0)
+    # Add legacy version of Jotaibench using an old manifest.
+    jotaibench_v0_manifest_url, jotaibench_v0_manifest_sha256 = {
+        "linux": (
+            "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
+            "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
+        ),
+    }[sys.platform]
+    yield JotaiBenchDataset(
+        name="benchmark://jotaibench-v0",
+        site_data_base=site_data_base,
+        sort_order=0,
+        manifest_url=jotaibench_v0_manifest_url,
+        manifest_sha256=jotaibench_v0_manifest_sha256,
+        deprecated="Please use jotaibench-v1",
+    )
     yield BlasDataset(site_data_base=site_data_base, sort_order=0)
     yield CLgenDataset(site_data_base=site_data_base, sort_order=0)
     yield CBenchDataset(site_data_base=site_data_base)
@@ -294,6 +311,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 __all__ = [
     "AnghaBenchDataset",
+    "JotaiBenchDataset"
     "BlasDataset",
     "CBenchDataset",
     "CBenchLegacyDataset",
diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
new file mode 100644
index 000000000..e1140f572
--- /dev/null
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -0,0 +1,243 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+import subprocess
+import sys
+from concurrent.futures import as_completed
+from pathlib import Path
+from typing import Optional
+
+from compiler_gym.datasets import Benchmark, TarDataset, TarDatasetWithManifest
+from compiler_gym.datasets.benchmark import BenchmarkWithSource
+from compiler_gym.datasets.uri import BenchmarkUri
+from compiler_gym.envs.llvm.llvm_benchmark import (
+    ClangInvocation,
+    get_system_library_flags,
+)
+from compiler_gym.service.proto import BenchmarkDynamicConfig, Command
+from compiler_gym.util import thread_pool
+from compiler_gym.util.filesystem import atomic_file_write
+
+
+
+class JotaiBenchDataset(TarDatasetWithManifest):
+    """A dataset of C programs curated from GitHub source code.
+
+    The dataset is from:
+
+        da Silva, Anderson Faustino, Bruno Conde Kind, José Wesley de Souza
+        Magalhaes, Jerônimo Nunes Rocha, Breno Campos Ferreira Guimaraes, and
+        Fernando Magno Quinão Pereira. "ANGHABENCH: A Suite with One Million
+        Compilable C Benchmarks for Code-Size Reduction." In 2021 IEEE/ACM
+        International Symposium on Code Generation and Optimization (CGO),
+        pp. 378-390. IEEE, 2021.
+
+    And is available at:
+
+        http://cuda.dcc.ufmg.br/Jotai/src/
+
+    Installation
+    ------------
+
+    The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
+    on-demand and cached. The first time each benchmark is used there is an
+    overhead of compiling it from C to bitcode. This is a one-off cost.
+    """
+
+    def __init__(
+        self,
+        site_data_base: Path,
+        sort_order: int = 0,
+        manifest_url: Optional[str] = None,
+        manifest_sha256: Optional[str] = None,
+        deprecated: Optional[str] = None,
+        name: Optional[str] = None,
+    ):
+        manifest_url_, manifest_sha256_ = {
+            "linux": (
+                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2",
+                "7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
+            ),
+        }[sys.platform]
+        super().__init__(
+            name=name or "benchmark://jotai-v1",
+            description="Compile-only C/C++ functions extracted from GitHub",
+            references={
+                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
+                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
+            },
+            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
+            site_data_base=site_data_base,
+            manifest_urls=[manifest_url or manifest_url_],
+            manifest_sha256=manifest_sha256 or manifest_sha256_,
+            tar_urls=[
+                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2"
+            ],
+            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
+            strip_prefix="programs_no-ub_printableRetVal",
+            tar_compression="bz2",
+            benchmark_file_suffix=".bc",
+            sort_order=sort_order,
+            deprecated=deprecated,
+        )
+
+    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
+        self.install()
+
+        benchmark_name = uri.path[1:]
+        if not benchmark_name:
+            raise LookupError(f"No benchmark specified: {uri}")
+
+        # The absolute path of the file, without an extension.
+        path_stem = self.dataset_root / benchmark_name
+
+        bitcode_abspath = Path(f"{path_stem}.bc")
+        c_file_abspath = Path(f"{path_stem}.c")
+
+        # If the file does not exist, compile it on-demand.
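+        # The bitcode is produced with an atomic file write and cached next
+        # to the C source, so only the first use of a benchmark pays for the
+        # clang invocation below; later lookups reuse the cached .bc file.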
+        if not bitcode_abspath.is_file():
+            if not c_file_abspath.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
+                )
+
+            with atomic_file_write(bitcode_abspath) as tmp_path:
+                compile_cmd = ClangInvocation.from_c_file(
+                    c_file_abspath,
+                    copt=[
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_path)
+                subprocess.check_call(compile_cmd, timeout=300)
+
+        return BenchmarkWithSource.create(
+            uri, bitcode_abspath, "function.c", c_file_abspath
+        )
+
+    def compile_all(self):
+        n = self.size
+        executor = thread_pool.get_thread_pool_executor()
+        # Since the dataset is lazily compiled, simply iterating over the full
+        # set of URIs will compile everything. Do this in parallel.
+        futures = (
+            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
+        )
+        for i, future in enumerate(as_completed(futures), start=1):
+            future.result()
+            print(
+                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
+                flush=True,
+                end="",
+            )
+
+
+class JotaiBenchRunnableDataset(TarDataset):
+    """TODO."""
+
+    def __init__(
+        self,
+        site_data_base: Path,
+    ):
+        super().__init__(
+            name="benchmark://jotai-runnable-v0",
+            description="Runnable C/C++ functions extracted from GitHub",
+            references={
+                # TODO: Update these as necessary:
+                "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
+                "Homepage": "http://cuda.dcc.ufmg.br/angha/",
+            },
+            license="",  # TODO: License name.
+            site_data_base=site_data_base,
+            tar_urls=[
+                # TODO: URL of where to download a tarball that contains the
+                # benchmarks. For debugging, you could use something like
+                # Dropbox or similar. For eventual production we can host them
+                # in our S3 bucket for you.
+                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2"
+            ],
+            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",  # TODO: sha256sum of the above tarfile.
+            strip_prefix="programs_no-ub_printableRetVal",  # TODO: If there is a subdirectory to strip, specify it here.
+            tar_compression="bz2",
+            # TODO: The file extension that is used to automatically enumerate
+            # the benchmarks.
+            benchmark_file_suffix=".c",
+        )
+
+    def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
+        self.install()
+
+        benchmark_name = uri.path[1:]
+        if not benchmark_name:
+            raise LookupError(f"No benchmark specified: {uri}")
+
+        # The absolute path of the file, without an extension.
+        path_stem = self.dataset_root / benchmark_name
+
+        bitcode_abspath = Path(f"{path_stem}.bc")
+        c_file_abspath = Path(f"{path_stem}.c")
+
+        # If the file does not exist, compile it to a bitcode file on-demand.
+        if not bitcode_abspath.is_file():
+            if not c_file_abspath.is_file():
+                raise LookupError(
+                    f"Benchmark not found: {uri} (file not found: {c_file_abspath})"
+                )
+
+            with atomic_file_write(bitcode_abspath) as tmp_path:
+                compile_cmd = ClangInvocation.from_c_file(
+                    c_file_abspath,
+                    copt=[
+                        "-ferror-limit=1",  # Stop on first error.
+                        "-w",  # No warnings.
+                    ],
+                ).command(outpath=tmp_path)
+                subprocess.check_call(compile_cmd, timeout=300)
+
+        benchmark = BenchmarkWithSource.create(
+            uri, bitcode_abspath, "function.c", c_file_abspath
+        )
+
+        # TODO: Here is where we specify how to build and run the benchmark.
+        # This is what makes a benchmark "runnable".
+        benchmark.proto.dynamic_config.MergeFrom(
+            BenchmarkDynamicConfig(
+                build_cmd=Command(
+                    # TODO: Here is where you specify the command to build the
+                    # benchmark. Assuming no deps, this should be fine.
+                    argument=["$CC", "$IN"] + get_system_library_flags(),
+                    timeout_seconds=60,
+                    outfile=["a.out"],
+                ),
+                run_cmd=Command(
+                    # TODO: Here is where you specify the command to build the
+                    # benchmark. Assuming no deps, this should be fine.
+                    argument=["./a.out 0"],
+                    timeout_seconds=60,
+                    # TODO: If the benchmark needs any input files, specify it here.
+                    infile=[],
+                    # TODO: If the benchmark produces any output files, specify it
+                    # here.
+                    outfile=[],
+                ),
+            )
+        )
+
+        return benchmark
+
+    def compile_all(self):
+        n = self.size
+        executor = thread_pool.get_thread_pool_executor()
+        # Since the dataset is lazily compiled, simply iterating over the full
+        # set of URIs will compile everything. Do this in parallel.
+        futures = (
+            executor.submit(self.benchmark, uri) for uri in self.benchmark_uris()
+        )
+        for i, future in enumerate(as_completed(futures), start=1):
+            future.result()
+            print(
+                f"\r\033[KCompiled {i} of {n} programs ({i/n:.1%} complete)",
+                flush=True,
+                end="",
+            )
diff --git a/compiler_gym/test_jotai.py b/compiler_gym/test_jotai.py
new file mode 100644
index 000000000..a5f5ab6d3
--- /dev/null
+++ b/compiler_gym/test_jotai.py
@@ -0,0 +1,36 @@
+import gym
+import compiler_gym
+
+env = compiler_gym.make(
+    "llvm-v0",
+    benchmark="jotai-v1/extr_anypixelfirmwarecontrollersrcfifo.c_FIFO_available_Final",
+    observation_space="Autophase",  # selects the observation space
+    reward_space="IrInstructionCountOz",  # selects the optimization target
+)
+
+env.reset()
+#env.render()
+
+#env1 = compiler_gym.make( # creates a new environment (same as gym.make)
+#    "llvm-v0", # selects the compiler to use
+#    benchmark="cbench-v1/qsort", # selects the program to compile
+#    observation_space="Autophase", # selects the observation space
+#    reward_space="IrInstructionCountOz", # selects the optimization target
+#)
+
+#for dataset in env.datasets:
+#    print(dataset.name)
+
+#env.reset(benchmark="benchmark://jotai-v1/extr_anypixelfirmwarecontrollersrcfifo.c_FIFO_available_Final")
+
+#info = env.step(env.action_space.sample())
+#print(info)
+
+episode_reward = 0
+for i in range(1, 101):
+    observation, reward, done, info = env.step(env.action_space.sample())
+    if done:
+        break
+    episode_reward += reward
+    print(f"Step {i}, quality={episode_reward:.3%}")
+env.close()
\ No newline at end of file
diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py
new file mode 100644
index 000000000..121e23e14
--- /dev/null
+++ b/tests/llvm/datasets/jotaibench_test.py
@@ -0,0 +1,69 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for the JotaiBench dataset.""" +import sys +from itertools import islice +from pathlib import Path + +import gym +import pytest + + +import compiler_gym.envs.llvm # noqa register environments +from compiler_gym.envs.llvm import LlvmEnv +from compiler_gym.envs.llvm.datasets import JotaiBenchDataset +from tests.pytest_plugins.common import skip_on_ci +from tests.test_main import main + +pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"] + + +@pytest.fixture(scope="module") +def jotaibench_dataset() -> JotaiBenchDataset: + with gym.make("llvm-v0") as env: + ds = env.datasets["jotaibench-v1"] + yield ds + + +def test_jotaibench_size(jotaibench_dataset: JotaiBenchDataset): + if sys.platform == "darwin": + assert jotaibench_dataset.size == 1041265 + else: + assert jotaibench_dataset.size == 1041333 + + +def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): + # Mock install() so that on CI it doesn't download and unpack the tarfile. + mocker.patch.object(jotaibench_dataset, "install") + + with pytest.raises( + LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v1$" + ): + jotaibench_dataset.benchmark("benchmark://jotaibench-v1") + jotaibench_dataset.install.assert_called_once() + + with pytest.raises( + LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v1/$" + ): + jotaibench_dataset.benchmark("benchmark://jotaibench-v1/") + assert jotaibench_dataset.install.call_count == 2 + + +@skip_on_ci +@pytest.mark.parametrize("index", range(250)) +def test_jotaibench_random_select( + env: LlvmEnv, jotaibench_dataset: JotaiBenchDataset, index: int, tmpwd: Path +): + uri = next(islice(jotaibench_dataset.benchmark_uris(), index, None)) + benchmark = jotaibench_dataset.benchmark(uri) + env.reset(benchmark=benchmark) + + assert benchmark.source + benchmark.write_sources_to_directory(tmpwd) + assert (tmpwd / "function.c").is_file() + + +if __name__ == "__main__": + main() From 7f808c3e0030a83b8dca4b2061978d422e5714ec Mon Sep 17 00:00:00 2001 From: canesche Date: Sat, 4 Jun 2022 17:26:52 -0300 Subject: [PATCH 02/14] Creating the Jotai benchmark. 
---
 tests/llvm/datasets/BUILD          | 14 ++++++++++++++
 tests/llvm/datasets/CMakeLists.txt | 13 +++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/tests/llvm/datasets/BUILD b/tests/llvm/datasets/BUILD
index 880afaae3..5c59969b5 100644
--- a/tests/llvm/datasets/BUILD
+++ b/tests/llvm/datasets/BUILD
@@ -18,6 +18,20 @@ py_test(
     ],
 )
 
+py_test(
+    name = "jotaibench_test",
+    timeout = "long",
+    srcs = ["jotaibench_test.py"],
+    shard_count = 8,
+    deps = [
+        "//compiler_gym/envs/llvm",
+        "//compiler_gym/envs/llvm/datasets",
+        "//tests:test_main",
+        "//tests/pytest_plugins:common",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "cbench_test",
     timeout = "long",
diff --git a/tests/llvm/datasets/CMakeLists.txt b/tests/llvm/datasets/CMakeLists.txt
index e07084a4c..17a37e388 100644
--- a/tests/llvm/datasets/CMakeLists.txt
+++ b/tests/llvm/datasets/CMakeLists.txt
@@ -18,6 +18,19 @@ cg_py_test(
     tests::test_main
 )
 
+cg_py_test(
+  NAME
+    jotaibench_test
+  SRCS
+    "jotaibench_test.py"
+  DEPS
+    compiler_gym::envs::llvm::llvm
+    compiler_gym::envs::llvm::datasets::datasets
+    tests::pytest_plugins::common
+    tests::pytest_plugins::llvm
+    tests::test_main
+)
+
 cg_py_test(
   NAME
     cbench_test

From f76263a7e3a5eefbfb833904b3f6ecea7e8a8e0a Mon Sep 17 00:00:00 2001
From: canesche
Date: Thu, 16 Jun 2022 16:45:19 -0300
Subject: [PATCH 03/14] removing test_jotai.py

---
 compiler_gym/test_jotai.py | 36 ------------------------------------
 1 file changed, 36 deletions(-)
 delete mode 100644 compiler_gym/test_jotai.py

diff --git a/compiler_gym/test_jotai.py b/compiler_gym/test_jotai.py
deleted file mode 100644
index a5f5ab6d3..000000000
--- a/compiler_gym/test_jotai.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import gym
-import compiler_gym
-
-env = compiler_gym.make(
-    "llvm-v0",
-    benchmark="jotai-v1/extr_anypixelfirmwarecontrollersrcfifo.c_FIFO_available_Final",
-    observation_space="Autophase",  # selects the observation space
-    reward_space="IrInstructionCountOz",  # selects the optimization target
-)
-
-env.reset()
-#env.render()
-
-#env1 = compiler_gym.make( # creates a new environment (same as gym.make)
-#    "llvm-v0", # selects the compiler to use
-#    benchmark="cbench-v1/qsort", # selects the program to compile
-#    observation_space="Autophase", # selects the observation space
-#    reward_space="IrInstructionCountOz", # selects the optimization target
-#)
-
-#for dataset in env.datasets:
-#    print(dataset.name)
-
-#env.reset(benchmark="benchmark://jotai-v1/extr_anypixelfirmwarecontrollersrcfifo.c_FIFO_available_Final")
-
-#info = env.step(env.action_space.sample())
-#print(info)
-
-episode_reward = 0
-for i in range(1, 101):
-    observation, reward, done, info = env.step(env.action_space.sample())
-    if done:
-        break
-    episode_reward += reward
-    print(f"Step {i}, quality={episode_reward:.3%}")
-env.close()
\ No newline at end of file

From 4c300eb18c0dc1a5a245747fa9e3aa17fd5e8be3 Mon Sep 17 00:00:00 2001
From: canesche
Date: Thu, 16 Jun 2022 16:48:26 -0300
Subject: [PATCH 04/14] removing TODOs

---
 compiler_gym/envs/llvm/datasets/jotaibench.py | 23 +++----------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index e1140f572..6f683c9a8 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -134,8 +134,6 @@ def compile_all(self):
 
 
 class JotaiBenchRunnableDataset(TarDataset):
-    """TODO."""
-
     def __init__(
         self,
         site_data_base: Path,
@@ -144,24 +142,17 @@ def __init__(
             name="benchmark://jotai-runnable-v0",
             description="Runnable C/C++ functions extracted from GitHub",
             references={
-                # TODO: Update these as necessary:
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                 "Homepage": "http://cuda.dcc.ufmg.br/angha/",
             },
-            license="",  # TODO: License name.
+            license="",
             site_data_base=site_data_base,
             tar_urls=[
-                # TODO: URL of where to download a tarball that contains the
-                # benchmarks. For debugging, you could use something like
-                # Dropbox or similar. For eventual production we can host them
-                # in our S3 bucket for you.
                 "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2"
             ],
-            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",  # TODO: sha256sum of the above tarfile.
-            strip_prefix="programs_no-ub_printableRetVal",  # TODO: If there is a subdirectory to strip, specify it here.
+            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
+            strip_prefix="programs_no-ub_printableRetVal",
             tar_compression="bz2",
-            # TODO: The file extension that is used to automatically enumerate
-            # the benchmarks.
             benchmark_file_suffix=".c",
         )
@@ -199,26 +190,18 @@ def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
             uri, bitcode_abspath, "function.c", c_file_abspath
         )
 
-        # TODO: Here is where we specify how to build and run the benchmark.
         # This is what makes a benchmark "runnable".
         benchmark.proto.dynamic_config.MergeFrom(
             BenchmarkDynamicConfig(
                 build_cmd=Command(
-                    # TODO: Here is where you specify the command to build the
-                    # benchmark. Assuming no deps, this should be fine.
                     argument=["$CC", "$IN"] + get_system_library_flags(),
                     timeout_seconds=60,
                     outfile=["a.out"],
                 ),
                 run_cmd=Command(
-                    # TODO: Here is where you specify the command to build the
-                    # benchmark. Assuming no deps, this should be fine.
                     argument=["./a.out 0"],
                     timeout_seconds=60,
-                    # TODO: If the benchmark needs any input files, specify it here.
                     infile=[],
-                    # TODO: If the benchmark produces any output files, specify it
-                    # here.
                     outfile=[],
                 ),
             )
         )

From 79e6a0bf90f99a79c1220a6eb94564a3a1905062 Mon Sep 17 00:00:00 2001
From: canesche
Date: Tue, 19 Jul 2022 06:33:19 -0300
Subject: [PATCH 05/14] Correct bugs and update tests

---
 compiler_gym/envs/llvm/datasets/jotaibench.py | 2 +-
 tests/llvm/datasets/llvm_datasets_test.py     | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index 6f683c9a8..d03494b79 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -61,7 +61,7 @@ def __init__(
             ),
         }[sys.platform]
         super().__init__(
-            name=name or "benchmark://jotai-v1",
+            name=name or "benchmark://jotaibench-v1",
             description="Compile-only C/C++ functions extracted from GitHub",
             references={
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
diff --git a/tests/llvm/datasets/llvm_datasets_test.py b/tests/llvm/datasets/llvm_datasets_test.py
index 61251f649..95ff5ddf4 100644
--- a/tests/llvm/datasets/llvm_datasets_test.py
+++ b/tests/llvm/datasets/llvm_datasets_test.py
@@ -12,6 +12,7 @@
 def test_default_dataset_list():
     with gym.make("llvm-v0") as env:
         assert list(d.name for d in env.datasets) == [
+            "benchmark://jotaibench-v1",
             "benchmark://cbench-v1",
             "benchmark://anghabench-v1",
             "benchmark://blas-v0",

From 9b19da4fd9b3281218f6ef31aa54669e5809373b Mon Sep 17 00:00:00 2001
From: canesche
Date: Tue, 19 Jul 2022 13:15:23 -0300
Subject: [PATCH 06/14] Update jotaibench with some corrections

---
 compiler_gym/envs/llvm/datasets/jotaibench.py | 22 +++++++++----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index d03494b79..53be063a4 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -40,7 +40,7 @@ class JotaiBenchDataset(TarDatasetWithManifest):
     Installation
     ------------
 
-    The AnghaBench dataset consists of C functions that are compiled to LLVM-IR
+    The JotaiBench dataset consists of C functions that are compiled to LLVM-IR
     on-demand and cached. The first time each benchmark is used there is an
     overhead of compiling it from C to bitcode. This is a one-off cost.
     """
@@ -56,8 +56,8 @@ def __init__(
     ):
         manifest_url_, manifest_sha256_ = {
             "linux": (
-                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2",
-                "7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
+                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2",
+                "3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
             ),
         }[sys.platform]
         super().__init__(
@@ -67,15 +67,15 @@ def __init__(
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                 "Homepage": "http://cuda.dcc.ufmg.br/angha/",
             },
-            license="Unknown. See: https://github.com/brenocfg/AnghaBench/issues/1",
+            license="GNU General Public License v3.0 (GPLv3)",
             site_data_base=site_data_base,
             manifest_urls=[manifest_url or manifest_url_],
             manifest_sha256=manifest_sha256 or manifest_sha256_,
             tar_urls=[
-                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2"
+                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
             ],
-            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
-            strip_prefix="programs_no-ub_printableRetVal",
+            tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
+            strip_prefix="",
             tar_compression="bz2",
             benchmark_file_suffix=".bc",
             sort_order=sort_order,
@@ -145,13 +145,13 @@ def __init__(
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                 "Homepage": "http://cuda.dcc.ufmg.br/angha/",
             },
-            license="",
+            license="GNU General Public License v3.0 (GPLv3)", 
             site_data_base=site_data_base,
             tar_urls=[
-                "http://cuda.dcc.ufmg.br/Jotai/src/Jotai_printRetVal.tar.bz2"
+                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
             ],
-            tar_sha256="7d2c6326036d87a02318e81a29560f9bb4ead3dc33ffbd43e4fb2e95e09dd621",
-            strip_prefix="programs_no-ub_printableRetVal",
+            tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", 
+            strip_prefix="", 
             tar_compression="bz2",
             benchmark_file_suffix=".c",
         )

From d5441b163ecb7627702023b8bc42715d42497d6e Mon Sep 17 00:00:00 2001
From: canesche
Date: Tue, 19 Jul 2022 14:11:50 -0300
Subject: [PATCH 07/14] Correct benchmark strip_prefix

---
 compiler_gym/envs/llvm/datasets/jotaibench.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index 53be063a4..ccdd13219 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -75,7 +75,7 @@ def __init__(
                 "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
             ],
             tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
-            strip_prefix="",
+            strip_prefix="programs_no-ub_printableRetVal",
             tar_compression="bz2",
             benchmark_file_suffix=".bc",
             sort_order=sort_order,
@@ -151,7 +151,7 @@ def __init__(
                 "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
             ],
             tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", 
-            strip_prefix="",
+            strip_prefix="programs_no-ub_printableRetVal", 
             tar_compression="bz2",
             benchmark_file_suffix=".c",
         )

From 1deb2dfb61556137ebb33d305dbc414332abebc6 Mon Sep 17 00:00:00 2001
From: canesche
Date: Tue, 19 Jul 2022 17:36:01 +0000
Subject: [PATCH 08/14] files formatted with pre-commit run --all-files

---
 compiler_gym/envs/llvm/datasets/__init__.py   | 5 ++---
 compiler_gym/envs/llvm/datasets/jotaibench.py | 7 +++----
 tests/llvm/datasets/jotaibench_test.py        | 1 -
 3 files changed, 5 insertions(+), 8 deletions(-)

diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 55e460f69..014fe4301 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -8,7 +8,6 @@
 
 from compiler_gym.datasets import Dataset, TarDatasetWithManifest
 from compiler_gym.envs.llvm.datasets.anghabench import AnghaBenchDataset
-from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
 from compiler_gym.envs.llvm.datasets.cbench import (
     CBenchDataset,
     CBenchLegacyDataset,
@@ -17,6 +16,7 @@
 from compiler_gym.envs.llvm.datasets.chstone import CHStoneDataset
 from compiler_gym.envs.llvm.datasets.clgen import CLgenDataset
 from compiler_gym.envs.llvm.datasets.csmith import CsmithBenchmark, CsmithDataset
+from compiler_gym.envs.llvm.datasets.jotaibench import JotaiBenchDataset
 from compiler_gym.envs.llvm.datasets.llvm_stress import LlvmStressDataset
 from compiler_gym.envs.llvm.datasets.poj104 import POJ104Dataset, POJ104LegacyDataset
 from compiler_gym.util.runfiles_path import site_data_path
@@ -311,8 +311,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 __all__ = [
     "AnghaBenchDataset",
-    "JotaiBenchDataset"
-    "BlasDataset",
+    "JotaiBenchDataset" "BlasDataset",
     "CBenchDataset",
     "CBenchLegacyDataset",
diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index ccdd13219..723b7a53f 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -20,7 +20,6 @@
 from compiler_gym.util.filesystem import atomic_file_write
 
 
-
 class JotaiBenchDataset(TarDatasetWithManifest):
     """A dataset of C programs curated from GitHub source code.
 
@@ -145,13 +144,13 @@ def __init__(
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
                 "Homepage": "http://cuda.dcc.ufmg.br/angha/",
             },
-            license="GNU General Public License v3.0 (GPLv3)", 
+            license="GNU General Public License v3.0 (GPLv3)",
             site_data_base=site_data_base,
             tar_urls=[
                 "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
             ],
-            tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", 
-            strip_prefix="programs_no-ub_printableRetVal", 
+            tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
+            strip_prefix="programs_no-ub_printableRetVal",
             tar_compression="bz2",
             benchmark_file_suffix=".c",
         )
diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py
index 121e23e14..e10bd87f7 100644
--- a/tests/llvm/datasets/jotaibench_test.py
+++ b/tests/llvm/datasets/jotaibench_test.py
@@ -10,7 +10,6 @@
 import gym
 import pytest
 
-
 import compiler_gym.envs.llvm  # noqa register environments
 from compiler_gym.envs.llvm import LlvmEnv
 from compiler_gym.envs.llvm.datasets import JotaiBenchDataset

From 97d3d7dddd2086c0bb421bbe8e340b510049e7e3 Mon Sep 17 00:00:00 2001
From: canesche
Date: Mon, 8 Aug 2022 14:02:47 +0000
Subject: [PATCH 09/14] Correct format with Bazel buildifier

---
 compiler_gym/envs/llvm/datasets/BUILD | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compiler_gym/envs/llvm/datasets/BUILD b/compiler_gym/envs/llvm/datasets/BUILD
index 39391e937..4f2577db0 100644
--- a/compiler_gym/envs/llvm/datasets/BUILD
+++ b/compiler_gym/envs/llvm/datasets/BUILD
@@ -9,11 +9,11 @@ py_library(
     srcs = [
         "__init__.py",
         "anghabench.py",
-        "jotaibench.py",
         "cbench.py",
         "chstone.py",
         "clgen.py",
         "csmith.py",
+        "jotaibench.py",
         "llvm_stress.py",
         "poj104.py",
     ],

From b0cf6cf6d82c6e5e370c5150f915b716db3c6563 Mon Sep 17 00:00:00 2001
From: canesche
Date: Sun, 14 Aug 2022 20:56:18 +0000
Subject: [PATCH 10/14] Updating files with CLI corrections and tests for the
 Jotai benchmarks.
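
Points the dataset at the consolidated jotaibench.bz2 tarball, switches
benchmark enumeration to the C sources, documents the dataset, and
updates the tests. A minimal smoke-test sketch of the updated dataset,
assuming an installed CompilerGym with the LLVM environment (the
benchmark URI below is the one used by the earlier test_jotai.py
example):

    import compiler_gym

    with compiler_gym.make("llvm-v0") as env:
        env.reset(
            benchmark="benchmark://jotaibench-v1/"
            "extr_anypixelfirmwarecontrollersrcfifo.c_FIFO_available_Final"
        )
        print(env.observation["IrInstructionCount"])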
---
 compiler_gym/envs/llvm/datasets/__init__.py   | 11 +++++--
 compiler_gym/envs/llvm/datasets/jotaibench.py | 18 +++++------
 docs/source/llvm/api.rst                      |  2 ++
 docs/source/llvm/index.rst                    |  4 ++-
 tests/llvm/datasets/CMakeLists.txt            | 26 ++++++++--------
 tests/llvm/datasets/jotaibench_test.py        | 30 +++++++------------
 tests/llvm/datasets/llvm_datasets_test.py     |  2 +-
 7 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py
index 014fe4301..982378392 100644
--- a/compiler_gym/envs/llvm/datasets/__init__.py
+++ b/compiler_gym/envs/llvm/datasets/__init__.py
@@ -265,9 +265,13 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
     yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0)
     # Add legacy version of Jotaibench using an old manifest.
     jotaibench_v0_manifest_url, jotaibench_v0_manifest_sha256 = {
+        "darwin": (
+            "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
+            "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1",
+        ),
         "linux": (
-            "https://dl.fbaipublicfiles.com/compiler_gym/llvm_bitcodes-10.0.0-anghabench-v0-linux-manifest.bz2",
-            "a038d25d39ee9472662a9704dfff19c9e3512ff6a70f1067af85c5cb3784b477",
+            "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
+            "3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
         ),
     }[sys.platform]
     yield JotaiBenchDataset(
@@ -311,7 +315,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
 
 __all__ = [
     "AnghaBenchDataset",
-    "JotaiBenchDataset" "BlasDataset",
+    "BlasDataset",
     "CBenchDataset",
     "CBenchLegacyDataset",
     "CLgenDataset",
@@ -319,6 +323,7 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset
     "CsmithDataset",
     "get_llvm_datasets",
     "GitHubDataset",
+    "JotaiBenchDataset",
     "LinuxDataset",
     "LlvmStressDataset",
     "MibenchDataset",
diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py
index 723b7a53f..33b8ac852 100644
--- a/compiler_gym/envs/llvm/datasets/jotaibench.py
+++ b/compiler_gym/envs/llvm/datasets/jotaibench.py
@@ -55,7 +55,7 @@ def __init__(
     ):
         manifest_url_, manifest_sha256_ = {
             "linux": (
-                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2",
+                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2",
                 "3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
             ),
         }[sys.platform]
@@ -71,12 +71,12 @@ def __init__(
             manifest_urls=[manifest_url or manifest_url_],
             manifest_sha256=manifest_sha256 or manifest_sha256_,
             tar_urls=[
-                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
+                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2"
             ],
             tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
-            strip_prefix="programs_no-ub_printableRetVal",
+            strip_prefix="jotaibench-v1",
             tar_compression="bz2",
-            benchmark_file_suffix=".bc",
+            benchmark_file_suffix=".c",
             sort_order=sort_order,
             deprecated=deprecated,
         )
@@ -138,7 +138,7 @@ def __init__(
         site_data_base: Path,
     ):
         super().__init__(
-            name="benchmark://jotai-runnable-v0",
+            name="benchmark://jotai-runnable-v1",
             description="Runnable C/C++ functions extracted from GitHub",
             references={
                 "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf",
@@ -147,10 +147,10 @@ def __init__(
             license="GNU General Public License v3.0 (GPLv3)",
             site_data_base=site_data_base,
             tar_urls=[
-                "https://github.com/lac-dcc/jotai-benchmarks/raw/main/benchmarks/programs_no-ub_printableRetVal.bz2"
+                "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2"
             ],
             tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73",
-            strip_prefix="programs_no-ub_printableRetVal",
+            strip_prefix="jotaibench-v1",
             tar_compression="bz2",
             benchmark_file_suffix=".c",
         )
@@ -194,12 +194,12 @@ def benchmark_from_parsed_uri(self, uri: BenchmarkUri) -> Benchmark:
             BenchmarkDynamicConfig(
                 build_cmd=Command(
                     argument=["$CC", "$IN"] + get_system_library_flags(),
-                    timeout_seconds=60,
+                    timeout_seconds=30,
                     outfile=["a.out"],
                 ),
                 run_cmd=Command(
                     argument=["./a.out 0"],
-                    timeout_seconds=60,
+                    timeout_seconds=30,
                     infile=[],
                     outfile=[],
                 ),
diff --git a/docs/source/llvm/api.rst b/docs/source/llvm/api.rst
index 36c3adc5c..06af7fffd 100644
--- a/docs/source/llvm/api.rst
+++ b/docs/source/llvm/api.rst
@@ -45,6 +45,8 @@ Datasets
 
 .. autoclass:: GitHubDataset
 
+.. autoclass:: JotaiBenchDataset
+
 .. autoclass:: LinuxDataset
 
 .. autoclass:: LlvmStressDataset
diff --git a/docs/source/llvm/index.rst b/docs/source/llvm/index.rst
index 704a2f5c7..216cc8ddb 100644
--- a/docs/source/llvm/index.rst
+++ b/docs/source/llvm/index.rst
@@ -40,6 +40,8 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
 | benchmark://github-v0 | 49,738 | Compile-only C/C++ objects from GitHub [`Paper `__] | No |
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
+| benchmark://jotaibench-v1 | 18,761 | Compile-only C/C++ functions extracted from GitHub [`Homepage `__, `Paper`__] | No |
++----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
 | benchmark://linux-v0 | 13,894 | Compile-only object files from C Linux kernel [`Homepage `__] | No |
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
 | benchmark://mibench-v1 | 40 | C benchmarks [`Paper `__] | No |
@@ -56,7 +58,7 @@ We provide several datasets of open-source LLVM-IR benchmarks for use:
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
 | generator://llvm-stress-v0 | ∞ | Randomly generated LLVM-IR [`Documentation `__] | No |
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
-| Total | 1,158,701 | | |
+| Total | 1,177,462 | | |
 +----------------------------+--------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------+
 
 .. [#f1] Values are for the Linux datasets. Some of the datasets contain fewer
diff --git a/tests/llvm/datasets/CMakeLists.txt b/tests/llvm/datasets/CMakeLists.txt
index 17a37e388..f0cd0a736 100644
--- a/tests/llvm/datasets/CMakeLists.txt
+++ b/tests/llvm/datasets/CMakeLists.txt
@@ -18,19 +18,6 @@ cg_py_test(
     tests::test_main
 )
 
-cg_py_test(
-  NAME
-    jotaibench_test
-  SRCS
-    "jotaibench_test.py"
-  DEPS
-    compiler_gym::envs::llvm::llvm
-    compiler_gym::envs::llvm::datasets::datasets
-    tests::pytest_plugins::common
-    tests::pytest_plugins::llvm
-    tests::test_main
-)
-
 cg_py_test(
   NAME
     cbench_test
@@ -110,6 +97,19 @@ cg_py_test(
     tests::test_main
 )
 
+cg_py_test(
+  NAME
+    jotaibench_test
+  SRCS
+    "jotaibench_test.py"
+  DEPS
+    compiler_gym::envs::llvm::llvm
+    compiler_gym::envs::llvm::datasets::datasets
+    tests::pytest_plugins::common
+    tests::pytest_plugins::llvm
+    tests::test_main
+)
+
 cg_py_test(
   NAME
     llvm_datasets_test
diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py
index e10bd87f7..cd325c52f 100644
--- a/tests/llvm/datasets/jotaibench_test.py
+++ b/tests/llvm/datasets/jotaibench_test.py
@@ -4,18 +4,22 @@
 # LICENSE file in the root directory of this source tree.
"""Tests for the JotaiBench dataset.""" import sys -from itertools import islice -from pathlib import Path import gym import pytest import compiler_gym.envs.llvm # noqa register environments -from compiler_gym.envs.llvm import LlvmEnv + +# from compiler_gym.envs.llvm import LlvmEnv from compiler_gym.envs.llvm.datasets import JotaiBenchDataset -from tests.pytest_plugins.common import skip_on_ci + +# from tests.pytest_plugins.common import skip_on_ci from tests.test_main import main +# from itertools import islice +# from pathlib import Path + + pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"] @@ -28,9 +32,9 @@ def jotaibench_dataset() -> JotaiBenchDataset: def test_jotaibench_size(jotaibench_dataset: JotaiBenchDataset): if sys.platform == "darwin": - assert jotaibench_dataset.size == 1041265 + assert jotaibench_dataset.size == 2138885 else: - assert jotaibench_dataset.size == 1041333 + assert jotaibench_dataset.size == 2138885 def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): @@ -50,19 +54,5 @@ def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): assert jotaibench_dataset.install.call_count == 2 -@skip_on_ci -@pytest.mark.parametrize("index", range(250)) -def test_jotaibench_random_select( - env: LlvmEnv, jotaibench_dataset: JotaiBenchDataset, index: int, tmpwd: Path -): - uri = next(islice(jotaibench_dataset.benchmark_uris(), index, None)) - benchmark = jotaibench_dataset.benchmark(uri) - env.reset(benchmark=benchmark) - - assert benchmark.source - benchmark.write_sources_to_directory(tmpwd) - assert (tmpwd / "function.c").is_file() - - if __name__ == "__main__": main() diff --git a/tests/llvm/datasets/llvm_datasets_test.py b/tests/llvm/datasets/llvm_datasets_test.py index 95ff5ddf4..bf2f34f58 100644 --- a/tests/llvm/datasets/llvm_datasets_test.py +++ b/tests/llvm/datasets/llvm_datasets_test.py @@ -12,13 +12,13 @@ def test_default_dataset_list(): with gym.make("llvm-v0") as env: assert list(d.name for d in env.datasets) == [ - "benchmark://jotaibench-v1", "benchmark://cbench-v1", "benchmark://anghabench-v1", "benchmark://blas-v0", "benchmark://chstone-v0", "benchmark://clgen-v0", "benchmark://github-v0", + "benchmark://jotaibench-v1", "benchmark://linux-v0", "benchmark://mibench-v1", "benchmark://npb-v0", From e66ab791c7f395d47c388fd5d33b56e23ee00c3f Mon Sep 17 00:00:00 2001 From: canesche Date: Fri, 19 Aug 2022 16:18:52 +0000 Subject: [PATCH 11/14] Correction link to download of benchmark and test --- compiler_gym/envs/llvm/datasets/jotaibench.py | 12 ++++---- tests/llvm/datasets/jotaibench_test.py | 30 ++++++++++++------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py index 33b8ac852..f1a011180 100644 --- a/compiler_gym/envs/llvm/datasets/jotaibench.py +++ b/compiler_gym/envs/llvm/datasets/jotaibench.py @@ -55,8 +55,8 @@ def __init__( ): manifest_url_, manifest_sha256_ = { "linux": ( - "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2", - "3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", + "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", ), }[sys.platform] super().__init__( @@ -71,9 +71,9 @@ def __init__( manifest_urls=[manifest_url or manifest_url_], manifest_sha256=manifest_sha256 or manifest_sha256_, tar_urls=[ - 
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2" + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" ], - tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", + tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", strip_prefix="jotaibench-v1", tar_compression="bz2", benchmark_file_suffix=".c", @@ -147,9 +147,9 @@ def __init__( license="GNU General Public License v3.0 (GPLv3)", site_data_base=site_data_base, tar_urls=[ - "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2" + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" ], - tar_sha256="3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", + tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", strip_prefix="jotaibench-v1", tar_compression="bz2", benchmark_file_suffix=".c", diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py index cd325c52f..db837b93c 100644 --- a/tests/llvm/datasets/jotaibench_test.py +++ b/tests/llvm/datasets/jotaibench_test.py @@ -4,22 +4,18 @@ # LICENSE file in the root directory of this source tree. """Tests for the JotaiBench dataset.""" import sys +from itertools import islice +from pathlib import Path import gym import pytest import compiler_gym.envs.llvm # noqa register environments - -# from compiler_gym.envs.llvm import LlvmEnv +from compiler_gym.envs.llvm import LlvmEnv from compiler_gym.envs.llvm.datasets import JotaiBenchDataset - -# from tests.pytest_plugins.common import skip_on_ci +from tests.pytest_plugins.common import skip_on_ci from tests.test_main import main -# from itertools import islice -# from pathlib import Path - - pytest_plugins = ["tests.pytest_plugins.common", "tests.pytest_plugins.llvm"] @@ -32,9 +28,9 @@ def jotaibench_dataset() -> JotaiBenchDataset: def test_jotaibench_size(jotaibench_dataset: JotaiBenchDataset): if sys.platform == "darwin": - assert jotaibench_dataset.size == 2138885 + assert jotaibench_dataset.size == 2138894 else: - assert jotaibench_dataset.size == 2138885 + assert jotaibench_dataset.size == 2138894 def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): @@ -54,5 +50,19 @@ def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): assert jotaibench_dataset.install.call_count == 2 +@skip_on_ci +@pytest.mark.parametrize("index", range(250)) +def test_anghabench_random_select( + env: LlvmEnv, jotaibench_dataset: JotaiBenchDataset, index: int, tmpwd: Path +): + uri = next(islice(jotaibench_dataset.benchmark_uris(), index, None)) + benchmark = jotaibench_dataset.benchmark(uri) + env.reset(benchmark=benchmark) + + assert benchmark.source + benchmark.write_sources_to_directory(tmpwd) + assert (tmpwd / "function.c").is_file() + + if __name__ == "__main__": main() From a65d25c6be4c183a9d637ba7d379a82f4b031db8 Mon Sep 17 00:00:00 2001 From: canesche Date: Fri, 19 Aug 2022 18:25:02 +0000 Subject: [PATCH 12/14] adding darwin manifest url --- compiler_gym/envs/llvm/datasets/jotaibench.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py index f1a011180..033c5fffe 100644 --- a/compiler_gym/envs/llvm/datasets/jotaibench.py +++ b/compiler_gym/envs/llvm/datasets/jotaibench.py @@ -54,6 +54,10 @@ def __init__( name: Optional[str] = None, ): manifest_url_, manifest_sha256_ = 
{ + "darwin": ( + "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", + "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", + ), "linux": ( "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", From 8299a58ab93a97433002c42dbda3b4e0978a77a5 Mon Sep 17 00:00:00 2001 From: canesche Date: Tue, 23 Aug 2022 16:31:45 +0000 Subject: [PATCH 13/14] removing deprecated dataset in the same PR --- compiler_gym/envs/llvm/datasets/__init__.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/compiler_gym/envs/llvm/datasets/__init__.py b/compiler_gym/envs/llvm/datasets/__init__.py index 982378392..b6d4deb7c 100644 --- a/compiler_gym/envs/llvm/datasets/__init__.py +++ b/compiler_gym/envs/llvm/datasets/__init__.py @@ -263,25 +263,6 @@ def get_llvm_datasets(site_data_base: Optional[Path] = None) -> Iterable[Dataset deprecated="Please use anghabench-v1", ) yield JotaiBenchDataset(site_data_base=site_data_base, sort_order=0) - # Add legacy version of Jotaibench using an old manifest. - jotaibench_v0_manifest_url, jotaibench_v0_manifest_sha256 = { - "darwin": ( - "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2", - "39464256405aacefdb7550a7f990c9c578264c132804eec3daac091fa3c21bd1", - ), - "linux": ( - "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2", - "3657a36b129d462c11d451a5bc9365e73f404a814e8726b383a2d7e64faa3d73", - ), - }[sys.platform] - yield JotaiBenchDataset( - name="benchmark://jotaibench-v0", - site_data_base=site_data_base, - sort_order=0, - manifest_url=jotaibench_v0_manifest_url, - manifest_sha256=jotaibench_v0_manifest_sha256, - deprecated="Please use jotaibench-v1", - ) yield BlasDataset(site_data_base=site_data_base, sort_order=0) yield CLgenDataset(site_data_base=site_data_base, sort_order=0) yield CBenchDataset(site_data_base=site_data_base) From f82089fbc855de65ef6e24d81809927a2dff6b8d Mon Sep 17 00:00:00 2001 From: canesche Date: Tue, 23 Aug 2022 18:26:08 +0000 Subject: [PATCH 14/14] replacing jotaibench's version v1 to v0 --- compiler_gym/envs/llvm/datasets/jotaibench.py | 16 ++++++++-------- tests/llvm/datasets/jotaibench_test.py | 10 +++++----- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/compiler_gym/envs/llvm/datasets/jotaibench.py b/compiler_gym/envs/llvm/datasets/jotaibench.py index 033c5fffe..0012cf7ad 100644 --- a/compiler_gym/envs/llvm/datasets/jotaibench.py +++ b/compiler_gym/envs/llvm/datasets/jotaibench.py @@ -56,15 +56,15 @@ def __init__( manifest_url_, manifest_sha256_ = { "darwin": ( "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", - "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", + "b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", ), "linux": ( "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true", - "202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", + "b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", ), }[sys.platform] super().__init__( - name=name or "benchmark://jotaibench-v1", + name=name or "benchmark://jotaibench-v0", description="Compile-only C/C++ functions extracted from GitHub", references={ "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf", @@ -77,8 +77,8 @@ def __init__( tar_urls=[ 
"https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" ], - tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", - strip_prefix="jotaibench-v1", + tar_sha256="b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + strip_prefix="jotaibench-v0", tar_compression="bz2", benchmark_file_suffix=".c", sort_order=sort_order, @@ -142,7 +142,7 @@ def __init__( site_data_base: Path, ): super().__init__( - name="benchmark://jotai-runnable-v1", + name="benchmark://jotai-runnable-v0", description="Runnable C/C++ functions extracted from GitHub", references={ "Paper": "https://homepages.dcc.ufmg.br/~fernando/publications/papers/FaustinoCGO21.pdf", @@ -153,8 +153,8 @@ def __init__( tar_urls=[ "https://github.com/lac-dcc/jotai-benchmarks/blob/main/benchmarks/jotaibench.bz2?raw=true" ], - tar_sha256="202d14b0f3f78210c7472b7d4ef7c33d828174c30a3bced6950fc1ca88773983", - strip_prefix="jotaibench-v1", + tar_sha256="b5a51af3d4e2f77a66001635ec64ed321e0ece19873c4a888040859af7556401", + strip_prefix="jotaibench-v0", tar_compression="bz2", benchmark_file_suffix=".c", ) diff --git a/tests/llvm/datasets/jotaibench_test.py b/tests/llvm/datasets/jotaibench_test.py index db837b93c..e0cc408ff 100644 --- a/tests/llvm/datasets/jotaibench_test.py +++ b/tests/llvm/datasets/jotaibench_test.py @@ -22,7 +22,7 @@ @pytest.fixture(scope="module") def jotaibench_dataset() -> JotaiBenchDataset: with gym.make("llvm-v0") as env: - ds = env.datasets["jotaibench-v1"] + ds = env.datasets["jotaibench-v0"] yield ds @@ -38,15 +38,15 @@ def test_missing_benchmark_name(jotaibench_dataset: JotaiBenchDataset, mocker): mocker.patch.object(jotaibench_dataset, "install") with pytest.raises( - LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v1$" + LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v0$" ): - jotaibench_dataset.benchmark("benchmark://jotaibench-v1") + jotaibench_dataset.benchmark("benchmark://jotaibench-v0") jotaibench_dataset.install.assert_called_once() with pytest.raises( - LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v1/$" + LookupError, match=r"^No benchmark specified: benchmark://jotaibench-v0/$" ): - jotaibench_dataset.benchmark("benchmark://jotaibench-v1/") + jotaibench_dataset.benchmark("benchmark://jotaibench-v0/") assert jotaibench_dataset.install.call_count == 2