diff --git a/.github/workflows/fuzz.yaml b/.github/workflows/fuzz.yaml
index 84a031c01..1ae439a14 100644
--- a/.github/workflows/fuzz.yaml
+++ b/.github/workflows/fuzz.yaml
@@ -42,4 +42,4 @@ jobs:
                   BAZEL_TEST_OPTS: --config=ci
 
             - name: Test
-              run: FUZZ_TIME=600 make fuzz
+              run: FUZZ_TIME=600 make install-fuzz
diff --git a/compiler_gym/envs/compiler_env.py b/compiler_gym/envs/compiler_env.py
index 16db59fb3..d819512f4 100644
--- a/compiler_gym/envs/compiler_env.py
+++ b/compiler_gym/envs/compiler_env.py
@@ -973,23 +973,25 @@ def step(
             reward_spaces: List[Reward] = []
 
         # Perform the underlying environment step.
-        observations, rewards, done, info = self.raw_step(
+        observation_values, reward_values, done, info = self.raw_step(
             actions, observation_spaces, reward_spaces
         )
 
         # Translate observations lists back to the appropriate types.
-        if self.observation_space_spec and len(observations) == 1:
-            observations = observations[0]
+        if observations is None and self.observation_space_spec:
+            observation_values = observation_values[0]
         elif not observation_spaces:
-            observations = None
+            observation_values = None
 
         # Translate reward lists back to the appropriate types.
-        if self.reward_space_spec and len(rewards) == 1:
-            rewards = rewards[0]
+        if rewards is None and self.reward_space:
+            reward_values = reward_values[0]
+            # Update the cumulative episode reward
+            self.episode_reward += reward_values
         elif not reward_spaces:
-            rewards = None
+            reward_values = None
 
-        return observations, rewards, done, info
+        return observation_values, reward_values, done, info
 
     def render(
         self,
diff --git a/compiler_gym/spaces/reward.py b/compiler_gym/spaces/reward.py
index 1576d5154..7c2ea57ea 100644
--- a/compiler_gym/spaces/reward.py
+++ b/compiler_gym/spaces/reward.py
@@ -2,7 +2,7 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-from typing import List, Optional, Tuple
+from typing import List, Optional, Tuple, Union
 
 import numpy as np
 
@@ -137,6 +137,27 @@ def range(self) -> Tuple[RewardType, RewardType]:
     def __repr__(self):
         return self.id
 
+    def __eq__(self, other: Union["Reward", str]) -> bool:
+        """Return whether this reward space has the same id as `other`.
+
+        `other` may be a Reward, compared by id, or a string, compared
+        against this reward's id directly. Any other type is unequal.
+        """
+        if isinstance(other, str):
+            return self.id == other
+        elif isinstance(other, Reward):
+            return self.id == other.id
+        else:
+            return False
+
+    def __hash__(self) -> int:
+        """Hash on the id so that equal rewards (and equal strings) hash alike.
+
+        A class that defines __eq__ without __hash__ becomes unhashable, which
+        would prevent rewards being used in sets or as dictionary keys.
+        """
+        return hash(self.id)
+
 
 class DefaultRewardFromObservation(Reward):
     def __init__(self, observation_name: str, **kwargs):
diff --git a/compiler_gym/third_party/llvm/__init__.py b/compiler_gym/third_party/llvm/__init__.py
index d832b7a89..58be6ba2c 100644
--- a/compiler_gym/third_party/llvm/__init__.py
+++ b/compiler_gym/third_party/llvm/__init__.py
@@ -107,6 +107,11 @@ def llvm_stress_path() -> Path:
     return download_llvm_files() / "bin/llvm-stress"
 
 
+def llvm_diff_path() -> Path:
+    """Return the path of bin/llvm-diff within download_llvm_files()."""
+    return download_llvm_files() / "bin/llvm-diff"
+
+
 def opt_path() -> Path:
     """Return the path of opt."""
     return download_llvm_files() / "bin/opt"
diff --git a/compiler_gym/wrappers/core.py b/compiler_gym/wrappers/core.py
index fb9fddf99..f74b46fa5 100644
--- a/compiler_gym/wrappers/core.py
+++ b/compiler_gym/wrappers/core.py
@@ -81,7 +81,27 @@ def reset(self, *args, **kwargs):
 
     def step(self, *args, **kwargs):
+        """Step the wrapped environment and transform the returned reward.
+
+        The reward is passed through reward(), and the unwrapped environment's
+        episode_reward is corrected to accumulate the transformed value.
+        """
         observation, reward, done, info = self.env.step(*args, **kwargs)
-        return observation, self.reward(reward), done, info
+        if reward is None:
+            # The wrapped step produced no reward, so nothing was added to
+            # episode_reward and there is nothing to undo. Still pass the
+            # value through reward(), as was done before this bookkeeping.
+            return observation, self.reward(reward), done, info
+
+        # Undo the episode_reward update and reapply it once we have transformed
+        # the reward.
+        #
+        # TODO(cummins): Refactor step() so that we don't have to do this
+        # recalculation of episode_reward, as this is prone to errors if, say,
+        # the base reward returns NaN or an invalid type.
+        self.unwrapped.episode_reward -= reward
+        reward = self.reward(reward)
+        self.unwrapped.episode_reward += reward
+        return observation, reward, done, info
 
     def reward(self, reward):
         """Translate a reward to the new space."""
diff --git a/tests/fuzzing/BUILD b/tests/fuzzing/BUILD
index ac4d27556..682779ff4 100644
--- a/tests/fuzzing/BUILD
+++ b/tests/fuzzing/BUILD
@@ -6,6 +6,17 @@
 # LICENSE file in the root directory of this source tree.
 load("@rules_python//python:defs.bzl", "py_test")
 
+py_test(
+    name = "llvm_cbench_validate_fuzz_test",
+    srcs = ["llvm_cbench_validate_fuzz_test.py"],
+    tags = ["manual"],
+    deps = [
+        "//compiler_gym",
+        "//tests:test_main",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "llvm_commandline_opt_equivalence_fuzz_test",
     srcs = ["llvm_commandline_opt_equivalence_fuzz_test.py"],
@@ -68,17 +79,6 @@ py_test(
     ],
 )
 
-py_test(
-    name = "llvm_validate_fuzz_test",
-    srcs = ["llvm_validate_fuzz_test.py"],
-    tags = ["manual"],
-    deps = [
-        "//compiler_gym",
-        "//tests:test_main",
-        "//tests/pytest_plugins:llvm",
-    ],
-)
-
 py_test(
     name = "llvm_stress_fuzz_test",
     timeout = "long",
diff --git a/tests/fuzzing/llvm_validate_fuzz_test.py b/tests/fuzzing/llvm_cbench_validate_fuzz_test.py
similarity index 90%
rename from tests/fuzzing/llvm_validate_fuzz_test.py
rename to tests/fuzzing/llvm_cbench_validate_fuzz_test.py
index eae460a0b..8a5faa67e 100644
--- a/tests/fuzzing/llvm_validate_fuzz_test.py
+++ b/tests/fuzzing/llvm_cbench_validate_fuzz_test.py
@@ -5,6 +5,8 @@
 """Fuzz test for LlvmEnv.validate()."""
 import random
 
+import pytest
+
 from compiler_gym.envs import LlvmEnv
 from tests.pytest_plugins.llvm import VALIDATABLE_CBENCH_URIS
 from tests.test_main import main
@@ -16,6 +18,7 @@
 RANDOM_TRAJECTORY_LENGTH_RANGE = (1, 50)
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv):
     """This test generates a random trajectory and validates the semantics."""
     benchmark = random.choice(VALIDATABLE_CBENCH_URIS)
@@ -29,7 +32,8 @@ def test_fuzz(env: LlvmEnv):
                 break  # Broken trajectory, retry.
         else:
             print(f"Validating state {env.state}")
-            assert env.validate() == []
+            result = env.validate()
+            assert result.okay(), result
             # Stop the test.
             break
 
diff --git a/tests/fuzzing/llvm_commandline_opt_equivalence_fuzz_test.py b/tests/fuzzing/llvm_commandline_opt_equivalence_fuzz_test.py
index 12a9a9122..57b852b23 100644
--- a/tests/fuzzing/llvm_commandline_opt_equivalence_fuzz_test.py
+++ b/tests/fuzzing/llvm_commandline_opt_equivalence_fuzz_test.py
@@ -3,8 +3,8 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 """Fuzz test for LlvmEnv.commandline()."""
+import os
 import subprocess
-from difflib import unified_diff
 from pathlib import Path
 
 import pytest
@@ -22,10 +22,13 @@
 RANDOM_TRAJECTORY_LENGTH_RANGE = (1, 50)
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv, tmpwd: Path, llvm_opt: Path, llvm_diff: Path):
     """This test produces a random trajectory and then uses the commandline()
     generated with opt to check that the states are equivalent.
     """
+    del tmpwd
+
     env.reset()
     env.write_ir("input.ll")
     assert Path("input.ll").is_file()
@@ -47,20 +50,21 @@ def test_fuzz(env: LlvmEnv, tmpwd: Path, llvm_opt: Path, llvm_diff: Path):
         commandline, env={"PATH": str(llvm_opt.parent)}, shell=True, timeout=60
     )
     assert Path("output.ll").is_file()
+    os.rename("output.ll", "opt.ll")
 
-    with open("output.ll") as f1, open("env.ll") as f2:
-        # Diff the IR files but exclude the first line which is the module name.
-        diff = list(unified_diff(f1.readlines()[1:], f2.readlines()[1:]))
+    diff = subprocess.run(
+        [llvm_diff, "opt.ll", "env.ll"],
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE,
+        universal_newlines=True,
+        timeout=300,  # On expiry run() kills and reaps llvm-diff before raising.
+    )
 
-        if diff and len(diff) < 25:
-            diff = "\n".join(diff)
-            pytest.fail(f"Opt produced different output to CompilerGym:\n{diff}")
-        elif diff:
-            # If it's a big diff then we will require the user to reproduce it
-            # themselves using the environment state we printed earlier.
-            pytest.fail(
-                f"Opt produced different output to CompilerGym ({len(diff)}-line diff)"
-            )
+    if diff.returncode:
+        pytest.fail(
+            f"Opt produced different output to CompilerGym "
+            f"(returncode: {diff.returncode}):\n{diff.stdout}\n{diff.stderr}"
+        )
 
 
 if __name__ == "__main__":
diff --git a/tests/fuzzing/llvm_deterministic_action_fuzz_test.py b/tests/fuzzing/llvm_deterministic_action_fuzz_test.py
index 6e5b314a9..3c4af9617 100644
--- a/tests/fuzzing/llvm_deterministic_action_fuzz_test.py
+++ b/tests/fuzzing/llvm_deterministic_action_fuzz_test.py
@@ -24,6 +24,7 @@ def sha1(string: str):
     return sha1.hexdigest()
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv):
     """Run an action multiple times from the same starting state and check that
     the generated LLVM-IR is the same.
diff --git a/tests/fuzzing/llvm_fork_env_fuzz_test.py b/tests/fuzzing/llvm_fork_env_fuzz_test.py
index 800838e87..ac73a5f5b 100644
--- a/tests/fuzzing/llvm_fork_env_fuzz_test.py
+++ b/tests/fuzzing/llvm_fork_env_fuzz_test.py
@@ -17,6 +17,7 @@
 POST_FORK_ACTIONS = 10
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv, reward_space: str):
     """This test generates a random trajectory and checks that fork() produces
     an equivalent state. It then runs a second trajectory on the two
diff --git a/tests/fuzzing/llvm_random_actions_fuzz_test.py b/tests/fuzzing/llvm_random_actions_fuzz_test.py
index f4c68ceb7..aabff3455 100644
--- a/tests/fuzzing/llvm_random_actions_fuzz_test.py
+++ b/tests/fuzzing/llvm_random_actions_fuzz_test.py
@@ -8,6 +8,7 @@
 
 import gym
 import numpy as np
+import pytest
 
 from compiler_gym.third_party.autophase import AUTOPHASE_FEATURE_DIM
 from tests.test_main import main
@@ -18,7 +19,8 @@
 FUZZ_TIME_SECONDS = 2
 
 
-def test_benchmark_random_actions(benchmark_name: str):
+@pytest.mark.timeout(600)
+def test_fuzz(benchmark_name: str):
     """Run randomly selected actions on a benchmark until a minimum amount of time has elapsed."""
     env = gym.make(
         "llvm-v0",
diff --git a/tests/fuzzing/llvm_stress_fuzz_test.py b/tests/fuzzing/llvm_stress_fuzz_test.py
index 751fc4284..9960d67a4 100644
--- a/tests/fuzzing/llvm_stress_fuzz_test.py
+++ b/tests/fuzzing/llvm_stress_fuzz_test.py
@@ -3,6 +3,9 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 """Fuzz test LLVM backend using llvm-stress."""
+import pytest
+
+from compiler_gym.datasets import BenchmarkInitError
 from compiler_gym.envs import LlvmEnv
 from tests.pytest_plugins.random_util import apply_random_trajectory
 from tests.test_main import main
@@ -13,22 +16,28 @@
 RANDOM_TRAJECTORY_LENGTH_RANGE = (1, 10)
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv, observation_space: str, reward_space: str):
     """This test produces a random trajectory using a program generated using
     llvm-stress.
     """
-    env.benchmark = env.datasets["llvm-stress-v0"].random_benchmark()
+    benchmark = env.datasets["generator://llvm-stress-v0"].random_benchmark()
+    print(benchmark.uri)  # For debugging in case of failure.
 
     env.observation_space = observation_space
     env.reward_space = reward_space
 
-    env.reset()
-    apply_random_trajectory(
-        env,
-        random_trajectory_length_range=RANDOM_TRAJECTORY_LENGTH_RANGE,
-        timeout=10,
-    )
-    print(env.state)  # For debugging in case of failure.
+    try:
+        env.reset(benchmark=benchmark)
+        apply_random_trajectory(
+            env,
+            random_trajectory_length_range=RANDOM_TRAJECTORY_LENGTH_RANGE,
+            timeout=10,
+        )
+        print(env.state)  # For debugging in case of failure.
+    except BenchmarkInitError:
+        # Benchmark is invalid.
+        pass
 
 
 if __name__ == "__main__":
diff --git a/tests/fuzzing/llvm_trajectory_replay_fuzz_test.py b/tests/fuzzing/llvm_trajectory_replay_fuzz_test.py
index a95486f6a..b0c8b07cf 100644
--- a/tests/fuzzing/llvm_trajectory_replay_fuzz_test.py
+++ b/tests/fuzzing/llvm_trajectory_replay_fuzz_test.py
@@ -3,12 +3,11 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 """Fuzz test for LlvmEnv.validate()."""
-import random
-
 import numpy as np
+import pytest
 
+from compiler_gym.datasets import BenchmarkInitError
 from compiler_gym.envs import LlvmEnv
-from tests.pytest_plugins.llvm import BENCHMARK_NAMES
 from tests.pytest_plugins.random_util import apply_random_trajectory
 from tests.test_main import main
 
@@ -19,19 +18,26 @@
 RANDOM_TRAJECTORY_LENGTH_RANGE = (1, 50)
 
 
+@pytest.mark.timeout(600)
 def test_fuzz(env: LlvmEnv, reward_space: str):
     """This test produces a random trajectory, resets the environment, then
     replays the trajectory and checks that it produces the same state.
     """
     env.observation_space = "Autophase"
     env.reward_space = reward_space
+    benchmark = env.datasets["generator://csmith-v0"].random_benchmark()
+    print(benchmark.uri)  # For debugging in case of failure.
+
+    try:
+        env.reset(benchmark=benchmark)
+    except BenchmarkInitError:
+        return
 
-    env.reset(benchmark=random.choice(BENCHMARK_NAMES))
     trajectory = apply_random_trajectory(
         env, random_trajectory_length_range=RANDOM_TRAJECTORY_LENGTH_RANGE
     )
     print(env.state)  # For debugging in case of failure.
-    env.reset()
+    env.reset(benchmark=benchmark)
 
     for i, (action, observation, reward, done) in enumerate(trajectory, start=1):
         print(f"Replaying step {i}: {env.action_space.flags[action]}")
diff --git a/tests/llvm/BUILD b/tests/llvm/BUILD
index 22f35fbdb..f37795922 100644
--- a/tests/llvm/BUILD
+++ b/tests/llvm/BUILD
@@ -68,6 +68,17 @@ py_test(
     ],
 )
 
+py_test(
+    name = "episode_reward_test",
+    timeout = "long",
+    srcs = ["episode_reward_test.py"],
+    deps = [
+        "//compiler_gym/envs",
+        "//tests:test_main",
+        "//tests/pytest_plugins:llvm",
+    ],
+)
+
 py_test(
     name = "fork_env_test",
     timeout = "long",
diff --git a/tests/llvm/episode_reward_test.py b/tests/llvm/episode_reward_test.py
new file mode 100644
index 000000000..ba5d6e3d7
--- /dev/null
+++ b/tests/llvm/episode_reward_test.py
@@ -0,0 +1,38 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+"""Tests for LlvmEnv.episode_reward."""
+from compiler_gym.envs import LlvmEnv
+from tests.test_main import main
+
+pytest_plugins = ["tests.pytest_plugins.llvm"]
+
+
+def test_episode_reward_init_zero(env: LlvmEnv):
+    env.reward_space = "IrInstructionCount"
+    env.reset("cbench-v1/crc32")
+    assert env.episode_reward == 0  # Nothing accumulated before the first step.
+    _, reward, _, _ = env.step(env.action_space["-mem2reg"])
+    assert reward > 0
+    assert env.episode_reward == reward  # One step: total equals that reward.
+    env.reset()
+    assert env.episode_reward == 0  # reset() clears the accumulated total.
+
+
+def test_episode_reward_with_non_default_reward_space(env: LlvmEnv):
+    """Test that episode_reward is not updated when custom rewards are passed
+    to step()."""
+    env.reward_space = "IrInstructionCountOz"
+    env.reset("cbench-v1/crc32")
+    assert env.episode_reward == 0
+    _, rewards, _, _ = env.step(
+        env.action_space["-mem2reg"],
+        rewards=["IrInstructionCount"],  # Differs from env.reward_space above.
+    )
+    assert rewards[0] > 0  # Explicitly requested rewards come back as a sequence.
+    assert env.episode_reward == 0  # The running total was left untouched.
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/requirements.txt b/tests/requirements.txt
index c99c25b9c..6eadb637d 100644
--- a/tests/requirements.txt
+++ b/tests/requirements.txt
@@ -5,4 +5,5 @@ pytest-mock==3.6.0
 pytest-shard==0.1.1
 pytest-stress==1.0.1
 pytest-sugar==0.9.4
+pytest-timeout==1.4.2
 pytest-xdist==2.2.1
diff --git a/tests/wrappers/core_wrappers_test.py b/tests/wrappers/core_wrappers_test.py
index a4c8f7a0b..8080288a7 100644
--- a/tests/wrappers/core_wrappers_test.py
+++ b/tests/wrappers/core_wrappers_test.py
@@ -124,6 +124,11 @@ def reward(self, reward):
     env.reset()
     _, reward, _, _ = env.step(0)
     assert reward == -5
+    assert env.episode_reward == -5
+
+    _, reward, _, _ = env.step(0)
+    assert reward == -5
+    assert env.episode_reward == -10
 
 
 if __name__ == "__main__":