pytorch · clee2000 · Jun 4, 2025 · Jun 4, 2025 · Jun 4, 2025
diff --git a/.jenkins/build.sh b/.jenkins/build.sh
@@ -16,6 +16,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 sudo apt-get update
 sudo apt-get install -y pandoc
 
+# export CUBLAS_WORKSPACE_CONFIG=:4096:8
+
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
 

diff --git a/.jenkins/get_files_to_run.py b/.jenkins/get_files_to_run.py
@@ -41,24 +41,27 @@ def add_to_shard(i, filename):
 
     all_other_files = all_files.copy()
     needs_multigpu = list(
-        filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
-    )
-    needs_a10g = list(
-        filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+        filter(lambda x: get_needs_machine(x) == "multigpu", all_files,)
     )
+    # Ordering workaround for torchvision: for some reason, its tutorial needs to
+    # run after beginner_source/basics/data_tutorial.py.  Very specifically:
+    # https://github.com/pytorch/tutorials/blob/edff1330ca6c198e8e29a3d574bfb4afbe191bfd/beginner_source/basics/data_tutorial.py#L49-L60
+    # The lines below (currently commented out) manually add both files to the
+    # last shard.  Some other files may also work; which ones is not yet known.
+    # add_to_shard(num_shards - 1, "beginner_source/basics/data_tutorial.py")
+    # add_to_shard(num_shards - 1, "intermediate_source/torchvision_tutorial.py")
+    # all_other_files.remove("beginner_source/basics/data_tutorial.py")
+    # all_other_files.remove("intermediate_source/torchvision_tutorial.py")
+
     for filename in needs_multigpu:
         # currently, the only job that has multigpu is the 0th worker,
         # so we'll add all the jobs that need this machine to the 0th worker
         add_to_shard(0, filename)
         all_other_files.remove(filename)
-    for filename in needs_a10g:
-        # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
-        # so we'll add all the jobs that need this machine to the 1st worker
-        add_to_shard(1, filename)
-        all_other_files.remove(filename)
     sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
 
     for filename in sorted_files:
+        # Files that don't specify a machine get a default shard (index 1 to num_shards - 1)
         min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
             0
         ]