diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 58483c168b5..7b7c75f7c24 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -16,6 +16,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 sudo apt-get update
 sudo apt-get install -y pandoc
 
+# export CUBLAS_WORKSPACE_CONFIG=:4096:8
+
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
 
diff --git a/.jenkins/get_files_to_run.py b/.jenkins/get_files_to_run.py
index bdf4562a827..49541543e08 100644
--- a/.jenkins/get_files_to_run.py
+++ b/.jenkins/get_files_to_run.py
@@ -41,24 +41,27 @@ def add_to_shard(i, filename):
     all_other_files = all_files.copy()
     needs_multigpu = list(
-        filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
-    )
-    needs_a10g = list(
-        filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+        filter(lambda x: get_needs_machine(x) == "multigpu", all_files,)
     )
+    # Magic code for torchvision: for some reason, it needs to run after
+    # beginner_source/basics/data_tutorial.py. Very specifically:
+    # https://github.com/pytorch/tutorials/blob/edff1330ca6c198e8e29a3d574bfb4afbe191bfd/beginner_source/basics/data_tutorial.py#L49-L60
+    # So manually add them to the last shard. I think some other files also
+    # work but I'm too lazy to figure out which ones.
+    # add_to_shard(num_shards - 1, "beginner_source/basics/data_tutorial.py")
+    # add_to_shard(num_shards - 1, "intermediate_source/torchvision_tutorial.py")
+    # all_other_files.remove("beginner_source/basics/data_tutorial.py")
+    # all_other_files.remove("intermediate_source/torchvision_tutorial.py")
+
     for filename in needs_multigpu:
         # currently, the only job that has multigpu is the 0th worker,
         # so we'll add all the jobs that need this machine to the 0th worker
         add_to_shard(0, filename)
         all_other_files.remove(filename)
-    for filename in needs_a10g:
-        # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
-        # so we'll add all the jobs that need this machine to the 1st worker
-        add_to_shard(1, filename)
-        all_other_files.remove(filename)
 
     sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
 
     for filename in sorted_files:
+        # If you don't specify a machine, you get the default
         min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
             0
         ]
diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json
index 6e82d054b4e..4a38132b41f 100644
--- a/.jenkins/metadata.json
+++ b/.jenkins/metadata.json
@@ -1,76 +1,316 @@
 {
-  "intermediate_source/ax_multiobjective_nas_tutorial.py": {
-    "extra_files": ["intermediate_source/mnist_train_nas.py"],
-    "duration": 2000
+  "advanced_source/coding_ddpg.py": {
+    "duration": 29.18
+  },
+  "advanced_source/dynamic_quantization_tutorial.py": {
+    "duration": 317.87
+  },
+  "advanced_source/neural_style_tutorial.py": {
+    "duration": 11.97
+  },
+  "advanced_source/numpy_extensions_tutorial.py": {
+    "duration": 0.59
+  },
+  "advanced_source/pendulum.py": {
+    "duration": 131.67
+  },
+  "advanced_source/python_custom_ops.py": {
+    "duration": 3.06
+  },
+  "advanced_source/semi_structured_sparse.py": {
+    "needs": "a10g"
+  },
+  "beginner_source/Intro_to_TorchScript_tutorial.py": {
+    "duration": 0.16
+  },
+  "beginner_source/basics/autogradqs_tutorial.py": {
+    "duration": 0.12
+  },
+  "beginner_source/basics/buildmodel_tutorial.py": {
+    "duration": 0.52
+  },
+  "beginner_source/basics/data_tutorial.py": {
+    "duration": 5.02
+  },
+  "beginner_source/basics/optimization_tutorial.py": {
+ "duration": 70.39 + }, + "beginner_source/basics/quickstart_tutorial.py": { + "duration": 35.06 + }, + "beginner_source/basics/saveloadrun_tutorial.py": { + "duration": 5.83 + }, + "beginner_source/basics/tensorqs_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/basics/transforms_tutorial.py": { + "duration": 4.57 + }, + "beginner_source/blitz/autograd_tutorial.py": { + "duration": 0.69 + }, + "beginner_source/blitz/cifar10_tutorial.py": { + "duration": 87.38 + }, + "beginner_source/blitz/data_parallel_tutorial.py": { + "duration": 1.83, + "needs": "multigpu" + }, + "beginner_source/blitz/neural_networks_tutorial.py": { + "duration": 0.14 + }, + "beginner_source/blitz/tensor_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/chatbot_tutorial.py": { + "duration": 133.34 + }, + "beginner_source/data_loading_tutorial.py": { + "duration": 1.93 }, "beginner_source/dcgan_faces_tutorial.py": { - "duration": 2000 + "duration": 391.33 }, - "intermediate_source/seq2seq_translation_tutorial.py": { - "duration": 1200 + "beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py": { + "duration": 0.83 + }, + "beginner_source/examples_nn/polynomial_nn.py": { + "duration": 0.54 + }, + "beginner_source/examples_tensor/polynomial_numpy.py": { + "duration": 0.24 + }, + "beginner_source/examples_tensor/polynomial_tensor.py": { + "duration": 0.2 + }, + "beginner_source/fgsm_tutorial.py": { + "duration": 148.95 + }, + "beginner_source/flava_finetuning_tutorial.py": { + "duration": 118.96 + }, + "beginner_source/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py": { + "duration": 0.08 }, "beginner_source/hyperparameter_tuning_tutorial.py": { - "duration": 0 + "duration": 429.45 }, - "advanced_source/dynamic_quantization_tutorial.py": { - "duration": 380 + "beginner_source/introyt/autogradyt_tutorial.py": { + "duration": 0.78 }, - "beginner_source/chatbot_tutorial.py": { - "duration": 330 + "beginner_source/introyt/introyt1_tutorial.py": { + "duration": 78.55 }, - "intermediate_source/pipeline_tutorial.py": { - "duration": 320, - "needs": "linux.16xlarge.nvidia.gpu" + "beginner_source/introyt/modelsyt_tutorial.py": { + "duration": 0.03 }, - "beginner_source/blitz/data_parallel_tutorial.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "beginner_source/introyt/tensorboardyt_tutorial.py": { + "duration": 110.45 + }, + "beginner_source/introyt/tensors_deeper_tutorial.py": { + "duration": 0.41 + }, + "beginner_source/introyt/trainingyt.py": { + "duration": 180.44 + }, + "beginner_source/knowledge_distillation_tutorial.py": { + "duration": 251.75 + }, + "beginner_source/nlp/advanced_tutorial.py": { + "duration": 6.46 + }, + "beginner_source/nlp/deep_learning_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/nlp/pytorch_tutorial.py": { + "duration": 0.12 + }, + "beginner_source/nlp/sequence_models_tutorial.py": { + "duration": 0.63 + }, + "beginner_source/nlp/word_embeddings_tutorial.py": { + "duration": 0.57 + }, + "beginner_source/nn_tutorial.py": { + "duration": 25.62 + }, + "beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py": { + "duration": 2.38 + }, + "beginner_source/onnx/export_simple_model_to_onnx_tutorial.py": { + "duration": 1.44 + }, + "beginner_source/onnx/onnx_registry_tutorial.py": { + "duration": 2.45 + }, + "beginner_source/template_tutorial.py": { + "duration": 0.01 + }, + "beginner_source/transfer_learning_tutorial.py": { + "duration": 63.94 + }, + "beginner_source/vt_tutorial.py": { + "duration": 13.31 + }, + 
"intermediate_source/autograd_saved_tensors_hooks_tutorial.py": { + "duration": 0.26 + }, + "intermediate_source/ax_multiobjective_nas_tutorial.py": { + "duration": 1291.0, + "extra_files": [ + "intermediate_source/mnist_train_nas.py" + ] + }, + "intermediate_source/char_rnn_classification_tutorial.py": { + "duration": 338.98 + }, + "intermediate_source/char_rnn_generation_tutorial.py": { + "duration": 208.18 + }, + "intermediate_source/custom_function_conv_bn_tutorial.py": { + "duration": 21.68 + }, + "intermediate_source/dqn_with_rnn_tutorial.py": { + "duration": 132.19 + }, + "intermediate_source/ensembling.py": { + "duration": 1.04 + }, + "intermediate_source/forward_ad_usage.py": { + "duration": 0.12 + }, + "intermediate_source/fx_profiling_tutorial.py": { + "duration": 0.32 + }, + "intermediate_source/inductor_debug_cpu.py": { + "duration": 656.57 + }, + "intermediate_source/jacobians_hessians.py": { + "duration": 11.93 + }, + "intermediate_source/mario_rl_tutorial.py": { + "duration": 79.78 + }, + "intermediate_source/memory_format_tutorial.py": { + "duration": 0.37 }, "intermediate_source/model_parallel_tutorial.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "needs": "multigpu" }, - "intermediate_source/torchrec_intro_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/neural_tangent_kernels.py": { + "duration": 0.79 }, - "recipes_source/torch_export_aoti_python.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/optimizer_step_in_backward_tutorial.py": { + "duration": 10.23 }, - "advanced_source/pendulum.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." + "intermediate_source/parametrizations.py": { + "duration": 0.06 }, - "intermediate_source/torchvision_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." + "intermediate_source/per_sample_grads.py": { + "duration": 7.76 }, - "advanced_source/coding_ddpg.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." 
+ "intermediate_source/pinmem_nonblock.py": { + "duration": 65.39 }, - "recipes_source/compiling_optimizer_lr_scheduler.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/pipeline_tutorial.py": { + "duration": 320, + "needs": "multigpu" + }, + "intermediate_source/pruning_tutorial.py": { + "duration": 0.44 + }, + "intermediate_source/reinforcement_ppo.py": { + "duration": 142.03 + }, + "intermediate_source/reinforcement_q_learning.py": { + "duration": 402.66 + }, + "intermediate_source/scaled_dot_product_attention_tutorial.py": { + "duration": 7.03, + "needs": "a10g" + }, + "intermediate_source/seq2seq_translation_tutorial.py": { + "duration": 506.96 + }, + "intermediate_source/spatial_transformer_tutorial.py": { + "duration": 95.16 }, "intermediate_source/torch_compile_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 322.87, + "needs": "a10g" }, "intermediate_source/torch_export_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 4.44, + "needs": "a10g" }, - "intermediate_source/scaled_dot_product_attention_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/torchrec_intro_tutorial.py": { + "needs": "a10g" + }, + "intermediate_source/torchvision_tutorial.py": { + "duration": 48.01 }, "intermediate_source/transformer_building_blocks.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 65.46, + "needs": "a10g" }, - "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "prototype_source/gpu_quantization_torchao_tutorial.py": { + "duration": 814.12, + "needs": "a10g" + }, + "prototype_source/maskedtensor_adagrad.py": { + "duration": 0.03 + }, + "prototype_source/maskedtensor_advanced_semantics.py": { + "duration": 0.02 + }, + "prototype_source/maskedtensor_overview.py": { + "duration": 0.15 + }, + "prototype_source/maskedtensor_sparsity.py": { + "duration": 0.04 + }, + "prototype_source/numeric_suite_tutorial.py": { + "duration": 1.55 + }, + "recipes_source/compiling_optimizer_lr_scheduler.py": { + "duration": 16.09, + "needs": "a10g" + }, + "recipes_source/foreach_map.py": { + "duration": 12.59 + }, + "recipes_source/recipes/changing_default_device.py": { + "duration": 0.32 + }, + "recipes_source/recipes/module_load_state_dict_tips.py": { + "duration": 0.55 + }, + "recipes_source/recipes/reasoning_about_shapes.py": { + "duration": 0.02 + }, + "recipes_source/recipes/swap_tensors.py": { + "duration": 0.02 }, "recipes_source/regional_compilation.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 28.48, + "needs": "a10g" }, - "advanced_source/semi_structured_sparse.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "recipes_source/torch_compile_torch_function_modes.py": { + "duration": 6.84 }, - "prototype_source/gpu_quantization_torchao_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { + "duration": 2.98, + "needs": "a10g" + }, + "recipes_source/torch_compiler_set_stance_tutorial.py": { + "duration": 8.4 + }, + "recipes_source/torch_export_aoti_python.py": { + "duration": 28.25, + "needs": "a10g" + }, + "recipes_source/torch_logs.py": { + "duration": 2.59 } } diff --git a/.jenkins/update_metadata_times.py b/.jenkins/update_metadata_times.py new file mode 100644 index 00000000000..1f86dd08460 --- /dev/null +++ b/.jenkins/update_metadata_times.py @@ -0,0 +1,80 @@ +# Run this script locally to update the metadata.json file with the latest +# 
+# computation times from main and then make a PR to commit the change to the
+# repo.
+import os
+import re
+import requests
+from typing import List
+import json
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).absolute().parent.parent
+GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
+if GITHUB_TOKEN is None:
+    raise RuntimeError("GITHUB_TOKEN is not set")
+
+
+def get_log(id: str) -> str:
+    url = f"https://api.github.com/repos/pytorch/tutorials/actions/jobs/{id}/logs"
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"token {GITHUB_TOKEN}",
+    }
+    res = requests.get(url, headers=headers)
+    res.raise_for_status()
+    log_data = res.text
+    return log_data
+
+
+def parse_log(log: str) -> dict:
+    res = {}
+    for line in log.splitlines():
+        rematch = re.search(r" - ([^ ]+\.py): +(\d*\.\d*) sec +\d+\.\d+ MB", line)
+        if rematch:
+            res[rematch.group(1)] = float(rematch.group(2))
+    return res
+
+
+def get_log_ids() -> List[str]:
+    url = "https://api.github.com/repos/pytorch/tutorials/actions/workflows/build-tutorials.yml/runs?branch=main&status=completed&per_page=100"
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"token {GITHUB_TOKEN}",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    runs = response.json().get("workflow_runs", [])
+    for run in runs:
+        jobs_url = run.get("jobs_url")
+        if not jobs_url:
+            continue
+        jobs_response = requests.get(jobs_url, headers=headers)
+        jobs_response.raise_for_status()
+        print(json.dumps(jobs_response.json(), indent=2))
+        jobs = jobs_response.json().get("jobs", [])
+        return [job["id"] for job in jobs]
+    raise RuntimeError("No jobs found for the given SHA")
+
+
+def main():
+    log_ids = get_log_ids()
+    durations = {}
+    for log_id in log_ids:
+        log = get_log(log_id)
+        res = parse_log(log)
+        for k, v in res.items():
+            if v > durations.get(k, 0):
+                durations[k] = v
+
+    # Write back to metadata.json
+    with open(REPO_ROOT / ".jenkins/metadata.json", "r") as f:
+        metadata = json.load(f)
+    for k, v in durations.items():
+        if k not in metadata:
+            metadata[k] = {}
+        metadata[k]["duration"] = v
+    with open(REPO_ROOT / ".jenkins/metadata.json", "w") as f:
+        json.dump(metadata, f, indent=2, sort_keys=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py
index 0f1ae21a556..6b22788c2a3 100644
--- a/intermediate_source/ax_multiobjective_nas_tutorial.py
+++ b/intermediate_source/ax_multiobjective_nas_tutorial.py
@@ -245,7 +245,7 @@ def _get_event_multiplexer_for_trial(self, trial):
     mul = event_multiplexer.EventMultiplexer(max_reload_threads=20)
     mul.AddRunsFromDirectory(Path(log_dir).joinpath(str(trial.index)).as_posix(), None)
     mul.Reload()
-
+
     return mul
 
 # This indicates whether the metric is queryable while the trial is
@@ -351,7 +351,7 @@ def is_available_while_running(cls):
 #
 
-total_trials = 48  # total evaluation budget
+total_trials = 24  # total evaluation budget
 
 from ax.modelbridge.dispatch_utils import choose_generation_strategy
 
@@ -394,7 +394,7 @@ def is_available_while_running(cls):
     experiment=experiment,
     generation_strategy=gs,
     options=SchedulerOptions(
-        total_trials=total_trials, max_pending_trials=4
+        total_trials=total_trials, max_pending_trials=8
     ),
 )
 
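For context on how the "duration" values above are consumed, here is an illustrative sketch (not part of the patch) of the longest-duration-first sharding that .jenkins/get_files_to_run.py performs. The names get_duration, sharded_files, and num_shards mirror the script; the default duration for unknown files and the omission of the "needs" machine handling are simplifying assumptions.

# Illustrative sketch only: approximates the duration-based sharding in
# .jenkins/get_files_to_run.py; not the script's exact logic.
import json

def shard_by_duration(files, num_shards, metadata_path=".jenkins/metadata.json"):
    with open(metadata_path) as f:
        metadata = json.load(f)

    def get_duration(filename):
        # Files without a metadata entry fall back to 0.0 here (assumed value).
        return metadata.get(filename, {}).get("duration", 0.0)

    # sharded_files[i] holds (accumulated duration, files) for shard i.
    sharded_files = [(0.0, []) for _ in range(num_shards)]

    # Longest-processing-time-first: walk files from slowest to fastest and
    # always append to the shard with the smallest accumulated duration.
    # Shard 0 is skipped, since the real script reserves it for "multigpu" jobs.
    for filename in sorted(files, key=get_duration, reverse=True):
        idx = min(range(1, num_shards), key=lambda i: sharded_files[i][0])
        total, names = sharded_files[idx]
        sharded_files[idx] = (total + get_duration(filename), names + [filename])
    return sharded_files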