diff --git a/.jenkins/build.sh b/.jenkins/build.sh
index 58483c168b5..7b7c75f7c24 100755
--- a/.jenkins/build.sh
+++ b/.jenkins/build.sh
@@ -16,6 +16,8 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null && pwd )"
 sudo apt-get update
 sudo apt-get install -y pandoc
 
+# export CUBLAS_WORKSPACE_CONFIG=:4096:8
+
 # NS: Path to python runtime should already be part of docker container
 # export PATH=/opt/conda/bin:$PATH
 
diff --git a/.jenkins/get_files_to_run.py b/.jenkins/get_files_to_run.py
index bdf4562a827..49541543e08 100644
--- a/.jenkins/get_files_to_run.py
+++ b/.jenkins/get_files_to_run.py
@@ -41,24 +41,27 @@ def add_to_shard(i, filename):
     all_other_files = all_files.copy()
     needs_multigpu = list(
-        filter(lambda x: get_needs_machine(x) == "linux.16xlarge.nvidia.gpu", all_files,)
-    )
-    needs_a10g = list(
-        filter(lambda x: get_needs_machine(x) == "linux.g5.4xlarge.nvidia.gpu", all_files,)
+        filter(lambda x: get_needs_machine(x) == "multigpu", all_files,)
     )
+    # Magic code for torchvision: for some reason, it needs to run after
+    # beginner_source/basics/data_tutorial.py. Very specifically:
+    # https://github.com/pytorch/tutorials/blob/edff1330ca6c198e8e29a3d574bfb4afbe191bfd/beginner_source/basics/data_tutorial.py#L49-L60
+    # So manually add them to the last shard. I think some other files also
+    # work but I'm too lazy to figure out which ones.
+    # add_to_shard(num_shards - 1, "beginner_source/basics/data_tutorial.py")
+    # add_to_shard(num_shards - 1, "intermediate_source/torchvision_tutorial.py")
+    # all_other_files.remove("beginner_source/basics/data_tutorial.py")
+    # all_other_files.remove("intermediate_source/torchvision_tutorial.py")
+
     for filename in needs_multigpu:
         # currently, the only job that has multigpu is the 0th worker,
         # so we'll add all the jobs that need this machine to the 0th worker
         add_to_shard(0, filename)
         all_other_files.remove(filename)
-    for filename in needs_a10g:
-        # currently, workers 1-5 use linux.g5.4xlarge.nvidia.gpu (sm86, A10G),
-        # so we'll add all the jobs that need this machine to the 1st worker
-        add_to_shard(1, filename)
-        all_other_files.remove(filename)
 
     sorted_files = sorted(all_other_files, key=get_duration, reverse=True,)
 
     for filename in sorted_files:
+        # If you don't specify a machine, you get the default
         min_shard_index = sorted(range(1, num_shards), key=lambda i: sharded_files[i][0])[
             0
         ]
diff --git a/.jenkins/metadata.json b/.jenkins/metadata.json
index 6e82d054b4e..4a38132b41f 100644
--- a/.jenkins/metadata.json
+++ b/.jenkins/metadata.json
@@ -1,76 +1,316 @@
 {
-  "intermediate_source/ax_multiobjective_nas_tutorial.py": {
-    "extra_files": ["intermediate_source/mnist_train_nas.py"],
-    "duration": 2000
+  "advanced_source/coding_ddpg.py": {
+    "duration": 29.18
+  },
+  "advanced_source/dynamic_quantization_tutorial.py": {
+    "duration": 317.87
+  },
+  "advanced_source/neural_style_tutorial.py": {
+    "duration": 11.97
+  },
+  "advanced_source/numpy_extensions_tutorial.py": {
+    "duration": 0.59
+  },
+  "advanced_source/pendulum.py": {
+    "duration": 131.67
+  },
+  "advanced_source/python_custom_ops.py": {
+    "duration": 3.06
+  },
+  "advanced_source/semi_structured_sparse.py": {
+    "needs": "a10g"
+  },
+  "beginner_source/Intro_to_TorchScript_tutorial.py": {
+    "duration": 0.16
+  },
+  "beginner_source/basics/autogradqs_tutorial.py": {
+    "duration": 0.12
+  },
+  "beginner_source/basics/buildmodel_tutorial.py": {
+    "duration": 0.52
+  },
+  "beginner_source/basics/data_tutorial.py": {
+    "duration": 5.02
+  },
+  "beginner_source/basics/optimization_tutorial.py": {
+ "duration": 70.39 + }, + "beginner_source/basics/quickstart_tutorial.py": { + "duration": 35.06 + }, + "beginner_source/basics/saveloadrun_tutorial.py": { + "duration": 5.83 + }, + "beginner_source/basics/tensorqs_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/basics/transforms_tutorial.py": { + "duration": 4.57 + }, + "beginner_source/blitz/autograd_tutorial.py": { + "duration": 0.69 + }, + "beginner_source/blitz/cifar10_tutorial.py": { + "duration": 87.38 + }, + "beginner_source/blitz/data_parallel_tutorial.py": { + "duration": 1.83, + "needs": "multigpu" + }, + "beginner_source/blitz/neural_networks_tutorial.py": { + "duration": 0.14 + }, + "beginner_source/blitz/tensor_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/chatbot_tutorial.py": { + "duration": 133.34 + }, + "beginner_source/data_loading_tutorial.py": { + "duration": 1.93 }, "beginner_source/dcgan_faces_tutorial.py": { - "duration": 2000 + "duration": 391.33 }, - "intermediate_source/seq2seq_translation_tutorial.py": { - "duration": 1200 + "beginner_source/deploy_seq2seq_hybrid_frontend_tutorial.py": { + "duration": 0.83 + }, + "beginner_source/examples_nn/polynomial_nn.py": { + "duration": 0.54 + }, + "beginner_source/examples_tensor/polynomial_numpy.py": { + "duration": 0.24 + }, + "beginner_source/examples_tensor/polynomial_tensor.py": { + "duration": 0.2 + }, + "beginner_source/fgsm_tutorial.py": { + "duration": 148.95 + }, + "beginner_source/flava_finetuning_tutorial.py": { + "duration": 118.96 + }, + "beginner_source/hybrid_frontend/learning_hybrid_frontend_through_example_tutorial.py": { + "duration": 0.08 }, "beginner_source/hyperparameter_tuning_tutorial.py": { - "duration": 0 + "duration": 429.45 }, - "advanced_source/dynamic_quantization_tutorial.py": { - "duration": 380 + "beginner_source/introyt/autogradyt_tutorial.py": { + "duration": 0.78 }, - "beginner_source/chatbot_tutorial.py": { - "duration": 330 + "beginner_source/introyt/introyt1_tutorial.py": { + "duration": 78.55 }, - "intermediate_source/pipeline_tutorial.py": { - "duration": 320, - "needs": "linux.16xlarge.nvidia.gpu" + "beginner_source/introyt/modelsyt_tutorial.py": { + "duration": 0.03 }, - "beginner_source/blitz/data_parallel_tutorial.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "beginner_source/introyt/tensorboardyt_tutorial.py": { + "duration": 110.45 + }, + "beginner_source/introyt/tensors_deeper_tutorial.py": { + "duration": 0.41 + }, + "beginner_source/introyt/trainingyt.py": { + "duration": 180.44 + }, + "beginner_source/knowledge_distillation_tutorial.py": { + "duration": 251.75 + }, + "beginner_source/nlp/advanced_tutorial.py": { + "duration": 6.46 + }, + "beginner_source/nlp/deep_learning_tutorial.py": { + "duration": 0.26 + }, + "beginner_source/nlp/pytorch_tutorial.py": { + "duration": 0.12 + }, + "beginner_source/nlp/sequence_models_tutorial.py": { + "duration": 0.63 + }, + "beginner_source/nlp/word_embeddings_tutorial.py": { + "duration": 0.57 + }, + "beginner_source/nn_tutorial.py": { + "duration": 25.62 + }, + "beginner_source/onnx/export_control_flow_model_to_onnx_tutorial.py": { + "duration": 2.38 + }, + "beginner_source/onnx/export_simple_model_to_onnx_tutorial.py": { + "duration": 1.44 + }, + "beginner_source/onnx/onnx_registry_tutorial.py": { + "duration": 2.45 + }, + "beginner_source/template_tutorial.py": { + "duration": 0.01 + }, + "beginner_source/transfer_learning_tutorial.py": { + "duration": 63.94 + }, + "beginner_source/vt_tutorial.py": { + "duration": 13.31 + }, + 
"intermediate_source/autograd_saved_tensors_hooks_tutorial.py": { + "duration": 0.26 + }, + "intermediate_source/ax_multiobjective_nas_tutorial.py": { + "duration": 1291.0, + "extra_files": [ + "intermediate_source/mnist_train_nas.py" + ] + }, + "intermediate_source/char_rnn_classification_tutorial.py": { + "duration": 338.98 + }, + "intermediate_source/char_rnn_generation_tutorial.py": { + "duration": 208.18 + }, + "intermediate_source/custom_function_conv_bn_tutorial.py": { + "duration": 21.68 + }, + "intermediate_source/dqn_with_rnn_tutorial.py": { + "duration": 132.19 + }, + "intermediate_source/ensembling.py": { + "duration": 1.04 + }, + "intermediate_source/forward_ad_usage.py": { + "duration": 0.12 + }, + "intermediate_source/fx_profiling_tutorial.py": { + "duration": 0.32 + }, + "intermediate_source/inductor_debug_cpu.py": { + "duration": 656.57 + }, + "intermediate_source/jacobians_hessians.py": { + "duration": 11.93 + }, + "intermediate_source/mario_rl_tutorial.py": { + "duration": 79.78 + }, + "intermediate_source/memory_format_tutorial.py": { + "duration": 0.37 }, "intermediate_source/model_parallel_tutorial.py": { - "needs": "linux.16xlarge.nvidia.gpu" + "needs": "multigpu" }, - "intermediate_source/torchrec_intro_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/neural_tangent_kernels.py": { + "duration": 0.79 }, - "recipes_source/torch_export_aoti_python.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/optimizer_step_in_backward_tutorial.py": { + "duration": 10.23 }, - "advanced_source/pendulum.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "need to be here for the compiling_optimizer_lr_scheduler.py to run." + "intermediate_source/parametrizations.py": { + "duration": 0.06 }, - "intermediate_source/torchvision_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." + "intermediate_source/per_sample_grads.py": { + "duration": 7.76 }, - "advanced_source/coding_ddpg.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu", - "_comment": "does not require a5g but needs to run before gpu_quantization_torchao_tutorial.py." 
+ "intermediate_source/pinmem_nonblock.py": { + "duration": 65.39 }, - "recipes_source/compiling_optimizer_lr_scheduler.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/pipeline_tutorial.py": { + "duration": 320, + "needs": "multigpu" + }, + "intermediate_source/pruning_tutorial.py": { + "duration": 0.44 + }, + "intermediate_source/reinforcement_ppo.py": { + "duration": 142.03 + }, + "intermediate_source/reinforcement_q_learning.py": { + "duration": 402.66 + }, + "intermediate_source/scaled_dot_product_attention_tutorial.py": { + "duration": 7.03, + "needs": "a10g" + }, + "intermediate_source/seq2seq_translation_tutorial.py": { + "duration": 506.96 + }, + "intermediate_source/spatial_transformer_tutorial.py": { + "duration": 95.16 }, "intermediate_source/torch_compile_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 322.87, + "needs": "a10g" }, "intermediate_source/torch_export_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 4.44, + "needs": "a10g" }, - "intermediate_source/scaled_dot_product_attention_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "intermediate_source/torchrec_intro_tutorial.py": { + "needs": "a10g" + }, + "intermediate_source/torchvision_tutorial.py": { + "duration": 48.01 }, "intermediate_source/transformer_building_blocks.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 65.46, + "needs": "a10g" }, - "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "prototype_source/gpu_quantization_torchao_tutorial.py": { + "duration": 814.12, + "needs": "a10g" + }, + "prototype_source/maskedtensor_adagrad.py": { + "duration": 0.03 + }, + "prototype_source/maskedtensor_advanced_semantics.py": { + "duration": 0.02 + }, + "prototype_source/maskedtensor_overview.py": { + "duration": 0.15 + }, + "prototype_source/maskedtensor_sparsity.py": { + "duration": 0.04 + }, + "prototype_source/numeric_suite_tutorial.py": { + "duration": 1.55 + }, + "recipes_source/compiling_optimizer_lr_scheduler.py": { + "duration": 16.09, + "needs": "a10g" + }, + "recipes_source/foreach_map.py": { + "duration": 12.59 + }, + "recipes_source/recipes/changing_default_device.py": { + "duration": 0.32 + }, + "recipes_source/recipes/module_load_state_dict_tips.py": { + "duration": 0.55 + }, + "recipes_source/recipes/reasoning_about_shapes.py": { + "duration": 0.02 + }, + "recipes_source/recipes/swap_tensors.py": { + "duration": 0.02 }, "recipes_source/regional_compilation.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "duration": 28.48, + "needs": "a10g" }, - "advanced_source/semi_structured_sparse.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "recipes_source/torch_compile_torch_function_modes.py": { + "duration": 6.84 }, - "prototype_source/gpu_quantization_torchao_tutorial.py": { - "needs": "linux.g5.4xlarge.nvidia.gpu" + "recipes_source/torch_compile_user_defined_triton_kernel_tutorial.py": { + "duration": 2.98, + "needs": "a10g" + }, + "recipes_source/torch_compiler_set_stance_tutorial.py": { + "duration": 8.4 + }, + "recipes_source/torch_export_aoti_python.py": { + "duration": 28.25, + "needs": "a10g" + }, + "recipes_source/torch_logs.py": { + "duration": 2.59 } } diff --git a/.jenkins/update_metadata_times.py b/.jenkins/update_metadata_times.py new file mode 100644 index 00000000000..1f86dd08460 --- /dev/null +++ b/.jenkins/update_metadata_times.py @@ -0,0 +1,80 @@ +# Run this script locally to update the metadata.json file with the latest +# 
+# computation times from main and then make a PR to commit the change to the
+# repo.
+import os
+import re
+import requests
+from typing import List
+import json
+from pathlib import Path
+
+REPO_ROOT = Path(__file__).absolute().parent.parent
+GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN")
+if GITHUB_TOKEN is None:
+    raise RuntimeError("GITHUB_TOKEN is not set")
+
+
+def get_log(id: str) -> str:
+    url = f"https://api.github.com/repos/pytorch/tutorials/actions/jobs/{id}/logs"
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"token {GITHUB_TOKEN}",
+    }
+    res = requests.get(url, headers=headers)
+    res.raise_for_status()
+    log_data = res.text
+    return log_data
+
+
+def parse_log(log: str) -> dict:
+    res = {}
+    for line in log.splitlines():
+        rematch = re.search(r" - ([^ ]+\.py): +(\d*\.\d*) sec +\d+\.\d+ MB", line)
+        if rematch:
+            res[rematch.group(1)] = float(rematch.group(2))
+    return res
+
+
+def get_log_ids() -> List[str]:
+    url = "https://api.github.com/repos/pytorch/tutorials/actions/workflows/build-tutorials.yml/runs?branch=main&status=completed&per_page=100"
+    headers = {
+        "Accept": "application/vnd.github.v3+json",
+        "Authorization": f"token {GITHUB_TOKEN}",
+    }
+    response = requests.get(url, headers=headers)
+    response.raise_for_status()
+    runs = response.json().get("workflow_runs", [])
+    for run in runs:
+        jobs_url = run.get("jobs_url")
+        if not jobs_url:
+            continue
+        jobs_response = requests.get(jobs_url, headers=headers)
+        jobs_response.raise_for_status()
+        print(json.dumps(jobs_response.json(), indent=2))
+        jobs = jobs_response.json().get("jobs", [])
+        return [job["id"] for job in jobs]
+    raise RuntimeError("No jobs found for the given SHA")
+
+
+def main():
+    log_ids = get_log_ids()
+    durations = {}
+    for log_id in log_ids:
+        log = get_log(log_id)
+        res = parse_log(log)
+        for k, v in res.items():
+            if v > durations.get(k, 0):
+                durations[k] = v
+
+    # Write back to metadata.json
+    with open(REPO_ROOT / ".jenkins/metadata.json", "r") as f:
+        metadata = json.load(f)
+    for k, v in durations.items():
+        if k not in metadata:
+            metadata[k] = {}
+        metadata[k]["duration"] = v
+    with open(REPO_ROOT / ".jenkins/metadata.json", "w") as f:
+        json.dump(metadata, f, indent=2, sort_keys=True)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/intermediate_source/ax_multiobjective_nas_tutorial.py b/intermediate_source/ax_multiobjective_nas_tutorial.py
index 0f1ae21a556..6b22788c2a3 100644
--- a/intermediate_source/ax_multiobjective_nas_tutorial.py
+++ b/intermediate_source/ax_multiobjective_nas_tutorial.py
@@ -245,7 +245,7 @@ def _get_event_multiplexer_for_trial(self, trial):
     mul = event_multiplexer.EventMultiplexer(max_reload_threads=20)
     mul.AddRunsFromDirectory(Path(log_dir).joinpath(str(trial.index)).as_posix(), None)
     mul.Reload()
-
+
     return mul
 
 # This indicates whether the metric is queryable while the trial is
@@ -351,7 +351,7 @@ def is_available_while_running(cls):
 #
 
-total_trials = 48  # total evaluation budget
+total_trials = 24  # total evaluation budget
 
 from ax.modelbridge.dispatch_utils import choose_generation_strategy
 
@@ -394,7 +394,7 @@ def is_available_while_running(cls):
     experiment=experiment,
     generation_strategy=gs,
     options=SchedulerOptions(
-        total_trials=total_trials, max_pending_trials=4
+        total_trials=total_trials, max_pending_trials=8
     ),
 )
 
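For context on how the "duration" values above are consumed, here is an illustrative sketch (not part of the patch) of the longest-duration-first sharding that .jenkins/get_files_to_run.py performs. The names get_duration, sharded_files, and num_shards mirror the script; the default duration for unknown files and the omission of the "needs" machine handling are simplifying assumptions.

# Illustrative sketch only: approximates the duration-based sharding in
# .jenkins/get_files_to_run.py; not the script's exact logic.
import json

def shard_by_duration(files, num_shards, metadata_path=".jenkins/metadata.json"):
    with open(metadata_path) as f:
        metadata = json.load(f)

    def get_duration(filename):
        # Files without a metadata entry fall back to 0.0 here (assumed value).
        return metadata.get(filename, {}).get("duration", 0.0)

    # sharded_files[i] holds (accumulated duration, files) for shard i.
    sharded_files = [(0.0, []) for _ in range(num_shards)]

    # Longest-processing-time-first: walk files from slowest to fastest and
    # always append to the shard with the smallest accumulated duration.
    # Shard 0 is skipped, since the real script reserves it for "multigpu" jobs.
    for filename in sorted(files, key=get_duration, reverse=True):
        idx = min(range(1, num_shards), key=lambda i: sharded_files[i][0])
        total, names = sharded_files[idx]
        sharded_files[idx] = (total + get_duration(filename), names + [filename])
    return sharded_files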