Polish mp async allreduce implementation. (PaddlePaddle#57965)
* Remove paddle.ones as PR 57574 has fixed the mp async allreduce overlap failure

* Add warnings for mp async all_reduce when CUDA_DEVICE_MAX_CONNECTIONS is unset.
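
The check added below passes only when CUDA_DEVICE_MAX_CONNECTIONS=1 is present in the environment of every training process. A minimal sketch of one way to satisfy it, assuming the variable is set before Paddle initializes the GPU (the setdefault call and script layout are illustrative, not part of this commit):

import os

# CUDA reads this variable when the device context is created, so set it before
# importing paddle (equivalently: `export CUDA_DEVICE_MAX_CONNECTIONS=1` in the
# launch shell or the distributed launcher's environment).
os.environ.setdefault("CUDA_DEVICE_MAX_CONNECTIONS", "1")

import paddle  # noqa: E402  # imported after the env var so the setting can take effect

With the variable set to 1, the getenv check in backward() skips both the warning and the fallback paddle.ones kernel.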
GhostScreaming authored and Frida-a committed Oct 14, 2023
1 parent fd846d2 commit e422434
Showing 1 changed file with 17 additions and 2 deletions.
19 changes: 17 additions & 2 deletions python/paddle/distributed/fleet/layers/mpu/mp_layers.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import os
+
 import paddle
 from paddle.autograd import PyLayer
 from paddle.base import core
@@ -20,6 +22,7 @@

 from ....communication.reduce import ReduceOp, _get_reduce_op
 from ...base import topology as tp
+from ...utils.log_util import logger
 from . import mp_ops
 from .random import get_rng_state_tracker

@@ -177,6 +180,9 @@ def forward(self, x):
         return output


+_raise_cuda_env_unset_warning = True
+
+
 class InnerOverlapLinear(paddle.autograd.PyLayer):
     @staticmethod
     def forward(
@@ -216,8 +222,17 @@ def backward(ctx, dy):
         task = ctx.model_parallel_group.process_group.all_reduce(
             dx, op_type, sync_op=False
         )
-        # TODO(GhostScreaming): remove it in future.
-        tmp = paddle.ones([512])
+        # Using a small operation to preempt GPU SMs for all_reduce to achieve overlap.
+        if int(os.getenv("CUDA_DEVICE_MAX_CONNECTIONS", "0")) != 1:
+            global _raise_cuda_env_unset_warning
+            if _raise_cuda_env_unset_warning:
+                logger.warning(
+                    "You set mp_async_allreduce=True, but you forgot to set the environment "
+                    "variable CUDA_DEVICE_MAX_CONNECTIONS=1, which may lead to performance "
+                    "loss. Try to export CUDA_DEVICE_MAX_CONNECTIONS=1 for better performance."
+                )
+            _raise_cuda_env_unset_warning = False
+            tmp = paddle.ones([512])

         if ctx.mp_fused_linear_param_grad_add:
             if not is_fused_linear_param_grad_add_supported():