diff --git a/distributed/tensor_parallelism/log_utils.py b/distributed/tensor_parallelism/log_utils.py
index f16d46526d..d103df892b 100644
--- a/distributed/tensor_parallelism/log_utils.py
+++ b/distributed/tensor_parallelism/log_utils.py
@@ -17,6 +17,6 @@ def rank_log(_rank, logger, msg):
 
 def verify_min_gpu_count(min_gpus: int = 2) -> bool:
     """ verification that we have at least 2 gpus to run dist examples """
-    has_cuda = torch.cuda.is_available()
-    gpu_count = torch.cuda.device_count()
-    return has_cuda and gpu_count >= min_gpus
+    has_gpu = torch.accelerator.is_available()
+    gpu_count = torch.accelerator.device_count()
+    return has_gpu and gpu_count >= min_gpus
diff --git a/distributed/tensor_parallelism/requirements.txt b/distributed/tensor_parallelism/requirements.txt
index 80fad36bf2..08392d244a 100644
--- a/distributed/tensor_parallelism/requirements.txt
+++ b/distributed/tensor_parallelism/requirements.txt
@@ -3,4 +3,6 @@
 --pre
 --extra-index-url https://download.pytorch.org/whl/nightly/cu118
 --extra-index-url https://download.pytorch.org/whl/nightly/cu121
-torch >= 2.3.0.dev0; sys_platform == "linux"
+--extra-index-url https://download.pytorch.org/whl/nightly/cu126
+--extra-index-url https://download.pytorch.org/whl/nightly/cu128
+torch >= 2.7.1; sys_platform == "linux"
diff --git a/distributed/tensor_parallelism/sequence_parallel_example.py b/distributed/tensor_parallelism/sequence_parallel_example.py
index 3324d28d4a..0be33dc0d6 100644
--- a/distributed/tensor_parallelism/sequence_parallel_example.py
+++ b/distributed/tensor_parallelism/sequence_parallel_example.py
@@ -1,3 +1,4 @@
+# torchrun --nnodes 1 --nproc-per-node 4 sequence_parallel_example.py
 import os
 import sys
 import torch
@@ -63,9 +64,10 @@ def forward(self, x):
 """
 logger = get_logger()
 
+device_type = torch.accelerator.current_accelerator().type
 # create a device mesh based on the given world_size.
 device_mesh = init_device_mesh(
-    device_type="cuda", mesh_shape=(int(os.environ["WORLD_SIZE"]),)
+    device_type=device_type, mesh_shape=(int(os.environ["WORLD_SIZE"]),)
 )
 
 _rank = device_mesh.get_rank()
@@ -75,7 +77,7 @@ def forward(self, x):
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")
 
 # create model and move it to GPU. Init_device_mesh has already assigned gpu ids...
-model = ToyModel().to("cuda")
+model = ToyModel().to(device_type)
 
 # Custom parallelization plan for the model
 sp_model = parallelize_module(
@@ -100,7 +102,8 @@ def forward(self, x):
 
 for i in range(num_iters):
     # For SP, input can be different across all ranks.
-    inp = torch.rand(20, 10, device="cuda")
+    #inp = torch.rand(20, 10, device=device_type)
+    inp = torch.rand(1, 10, device=device_type)
     output = sp_model(inp)
     output.sum().backward()
     optimizer.step()
diff --git a/distributed/tensor_parallelism/tensor_parallel_example.py b/distributed/tensor_parallelism/tensor_parallel_example.py
index 0b9c884507..627f4611eb 100755
--- a/distributed/tensor_parallelism/tensor_parallel_example.py
+++ b/distributed/tensor_parallelism/tensor_parallel_example.py
@@ -1,3 +1,4 @@
+# torchrun --nnodes 1 --nproc-per-node 4 tensor_parallel_example.py
 import os
 import sys
 import torch
@@ -76,8 +77,8 @@ def forward(self, x):
 
 # create a device mesh based on the given world_size.
 _world_size = int(os.environ["WORLD_SIZE"])
-
-device_mesh = init_device_mesh(device_type="cuda", mesh_shape=(_world_size,))
+device_type = torch.accelerator.current_accelerator().type
+device_mesh = init_device_mesh(device_type=device_type, mesh_shape=(_world_size,))
 
 _rank = device_mesh.get_rank()
 
@@ -88,8 +89,8 @@ def forward(self, x):
 
 rank_log(_rank, logger, f"Device Mesh created: {device_mesh=}")
 
-# create model and move it to GPU - init"cuda"_mesh has already mapped GPU ids.
-tp_model = ToyModel().to("cuda")
+# create model and move it to GPU - init_device_mesh has already mapped GPU ids.
+tp_model = ToyModel().to(device_type)
 
 
 # Custom parallelization plan for the model
@@ -116,7 +117,7 @@ def forward(self, x):
     # For TP, input needs to be same across all TP ranks.
     # Setting the random seed is to mimic the behavior of dataloader.
     torch.manual_seed(i)
-    inp = torch.rand(20, 10, device="cuda")
+    inp = torch.rand(20, 10, device=device_type)
     output = tp_model(inp)
     output.sum().backward()
     optimizer.step()
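
For reference, below is a minimal, illustrative sketch of the accelerator-agnostic device selection pattern this patch adopts. It is not part of the patch: the helper name pick_device_type is hypothetical, and it assumes PyTorch >= 2.7, where the torch.accelerator API used above is available.

import torch


def pick_device_type(min_gpus: int = 2) -> str:
    """Return the current accelerator type (e.g. "cuda") if enough devices exist."""
    # torch.accelerator abstracts over CUDA, XPU, and other accelerator backends,
    # replacing the torch.cuda-only checks removed by this patch.
    if not torch.accelerator.is_available():
        raise RuntimeError("No accelerator available; these examples need GPU-like devices.")
    if torch.accelerator.device_count() < min_gpus:
        raise RuntimeError(f"Need at least {min_gpus} devices to run the distributed examples.")
    return torch.accelerator.current_accelerator().type


if __name__ == "__main__":
    print(pick_device_type())  # e.g. prints "cuda" on an NVIDIA machine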