From 3589cad83a41e89447c171a8cd8f67edbb9d66b3 Mon Sep 17 00:00:00 2001 From: Quentin Anthony Date: Thu, 26 Oct 2023 12:35:20 -0400 Subject: [PATCH 1/4] Remove PP Grad Tail Check (#2538) * Only communicate grad tail if it exists Co-authored-by: Dashiell Stander * Revert previous patch and just always send the grad tail * Formatting --------- Co-authored-by: Dashiell Stander Co-authored-by: Olatunji Ruwase Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com> --- deepspeed/runtime/pipe/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/runtime/pipe/engine.py b/deepspeed/runtime/pipe/engine.py index 2542dd602b1f..b89c1aca7f2a 100644 --- a/deepspeed/runtime/pipe/engine.py +++ b/deepspeed/runtime/pipe/engine.py @@ -988,7 +988,7 @@ def _exec_send_grads(self, buffer_id): if isinstance(inputs, tuple): first_input = inputs[0] assert all([torch.is_tensor(elt) for elt in inputs[1:]]) - inputs_grad_tail = [elt.grad for elt in inputs[1:] if elt.grad is not None] + inputs_grad_tail = [elt.grad for elt in inputs[1:]] elif torch.is_tensor(inputs): first_input = inputs inputs_grad_tail = [] From 764f5b073c26de8c9fd1d888371e6708d92bca2c Mon Sep 17 00:00:00 2001 From: Ramya Ramineni <62723901+rraminen@users.noreply.github.com> Date: Thu, 26 Oct 2023 12:37:13 -0500 Subject: [PATCH 2/4] Added __HIP_PLATFORM_AMD__=1 (#4570) --- op_builder/builder.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/op_builder/builder.py b/op_builder/builder.py index 1fdcd485f5fb..62683774f20a 100644 --- a/op_builder/builder.py +++ b/op_builder/builder.py @@ -486,6 +486,9 @@ def jit_load(self, verbose=True): cxx_args.append("-DBF16_AVAILABLE") nvcc_args.append("-DBF16_AVAILABLE") + if self.is_rocm_pytorch(): + cxx_args.append("-D__HIP_PLATFORM_AMD__=1") + op_module = load(name=self.name, sources=self.strip_empty_entries(sources), extra_include_paths=self.strip_empty_entries(extra_include_paths), From 8f168c2f8d0459592b51e3f2f245729f459bd00d Mon Sep 
17 00:00:00 2001 From: Xie Zejian Date: Fri, 27 Oct 2023 05:48:07 +0800 Subject: [PATCH 3/4] fix multiple definition while building evoformer (#4556) The current builder for evoformer uses the same name for `attention.cpp` and `attention.cu`, leading to the same intermediate filename `attention.o`: ```shell march=nocona -mtune=haswell -ftree-vectorize -fPIC -fstack-protector-strong -fno-plt -O2 -ffunction-sections -pipe -isystem /home/zejianxie/.conda/envs/dll/include -DNDEBUG -D_FORTIFY_SOURCE=2 -O2 -isystem /home/zejianxie/.conda/envs/dll/include build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention_back.o ``` and ``` `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)': tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): multiple definition of `attention_impl(at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&, at::Tensor&)'; build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:tmpxft_0012bef1_00000000-6_attention.compute_86.cudafe1.cpp:(.text+0x330): first defined here /home/zejianxie/.conda/envs/dll/bin/../lib/gcc/x86_64-conda-linux-gnu/11.4.0/../../../../x86_64-conda-linux-gnu/bin/ld: build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): multiple definition of `torch::autograd::(anonymous namespace)::graph_task_id'; build/temp.linux-x86_64-cpython-310/csrc/deepspeed4science/evoformer_attn/attention.o:(.bss+0x0): first defined here ``` I used the following to reproduce the issue and confirm my fix works: ``` git clone https://github.com/NVIDIA/cutlass --depth 1 CUTLASS_PATH=$PWD/cutlass DS_BUILD_EVOFORMER_ATTN=1 pip install ./DeepSpeed --global-option="build_ext" ``` 
![image](https://github.com/microsoft/DeepSpeed/assets/41792945/9e406b37-330c-431c-8bf9-6be378dee4ff) Co-authored-by: Conglong Li --- .../evoformer_attn/{attention.cu => attention_cu.cu} | 0 op_builder/evoformer_attn.py | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename csrc/deepspeed4science/evoformer_attn/{attention.cu => attention_cu.cu} (100%) diff --git a/csrc/deepspeed4science/evoformer_attn/attention.cu b/csrc/deepspeed4science/evoformer_attn/attention_cu.cu similarity index 100% rename from csrc/deepspeed4science/evoformer_attn/attention.cu rename to csrc/deepspeed4science/evoformer_attn/attention_cu.cu diff --git a/op_builder/evoformer_attn.py b/op_builder/evoformer_attn.py index f4311848d0d4..6e7721f94e01 100644 --- a/op_builder/evoformer_attn.py +++ b/op_builder/evoformer_attn.py @@ -27,7 +27,7 @@ def extra_ldflags(self): def sources(self): src_dir = 'csrc/deepspeed4science/evoformer_attn' - return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention.cu'] + return [f'{src_dir}/attention.cpp', f'{src_dir}/attention_back.cu', f'{src_dir}/attention_cu.cu'] def nvcc_args(self): args = super().nvcc_args() From 67aaf052d2d7b37d6cd71001c9633a4e11d8817a Mon Sep 17 00:00:00 2001 From: Liangliang-Ma Date: Fri, 27 Oct 2023 20:52:32 +0800 Subject: [PATCH 4/4] Update ccl.py --- deepspeed/comm/ccl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/deepspeed/comm/ccl.py b/deepspeed/comm/ccl.py index 6e915d0ca430..38a22f20109f 100644 --- a/deepspeed/comm/ccl.py +++ b/deepspeed/comm/ccl.py @@ -170,7 +170,7 @@ def get_all_ranks_from_group(self, group): while True: results.append(super(CCLBackend, self).get_global_rank(group, rank)) rank += 1 - except ValueError: + except (ValueError, RuntimeError): pass if tuple(results) not in self.groups: self._new_group(results, group)