@@ -415,7 +415,7 @@ The routines for modifying parameters and optimizer states can be used at any po
 .. code-block:: python

     [...]
-    from deepspeed.runtime.zero.utils import is_zero_param
+    from deepspeed.runtime.zero.utils import is_zero_param
     from deepspeed.utils import safe_set_full_fp32_param, safe_set_full_optimizer_state
     from deepspeed.utils import safe_set_local_fp32_param, safe_set_local_optimizer_state
     # Here is an example to zero all the fp32 parameters and optimizer states.
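The first hunk ends before the loop body of that example. For reference, here is a minimal sketch of how such a loop could continue, reusing the ``torch``, ``is_zero_param``, and ``safe_set_*`` imports shown above and assuming a ``model`` returned by ``deepspeed.initialize`` plus Adam-style optimizer state keys (``exp_avg``, ``exp_avg_sq``); it is an illustration, not the exact text of the documentation.

.. code-block:: python

    # Continues the snippet above; imports come from the lines shown in the hunk.
    for n, lp in model.named_parameters():
        # 1. For zero stage 1, 2, or 3: set the full fp32 parameter and its full
        #    optimizer states ("exp_avg"/"exp_avg_sq" assume an Adam-style optimizer).
        zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape)

        safe_set_full_fp32_param(lp, zero_tensor)
        safe_set_full_optimizer_state(lp, zero_tensor, "exp_avg")
        safe_set_full_optimizer_state(lp, zero_tensor, "exp_avg_sq")

        # 2. For zero stage 3 only: each process sets its local partition instead.
        zero_tensor_local = torch.zeros(lp.ds_tensor.shape)

        safe_set_local_fp32_param(lp, zero_tensor_local)
        safe_set_local_optimizer_state(lp, zero_tensor_local, "exp_avg")
        safe_set_local_optimizer_state(lp, zero_tensor_local, "exp_avg_sq")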
@@ -443,16 +443,16 @@ The routines for modifying gradients can be used after ``backward`` but before `

     backward(loss)
     [...]
-    from deepspeed.runtime.zero.utils import is_zero_param
+    from deepspeed.runtime.zero.utils import is_zero_param
     from deepspeed.utils import safe_set_full_grad, safe_set_local_grad
-    # Here is an example of how to zero all the gradients.
+    # Here is an example of how to zero all the gradients.
     for n, lp in model.named_parameters():
         # 1. For zero stage 1, 2, or 3 set the full gradient.
         zero_tensor = torch.zeros(lp.ds_shape) if is_zero_param(lp) else torch.zeros(lp.shape)

         safe_set_full_grad(lp, zero_tensor)

-        # 2. For zero stage 3, each process sets its local gradient partition.
+        # 2. For zero stage 3, each process sets its local gradient partition.
         zero_tensor_local = torch.zeros(lp.ds_tensor.shape)

         safe_set_local_grad(lp, zero_tensor_local)
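As a usage note, here is a minimal sketch of where the gradient-modification loop above sits in a training step; ``model_engine``, ``data_loader``, and ``criterion`` are assumed placeholders from a typical ``deepspeed.initialize`` setup and are not part of this change.

.. code-block:: python

    # Minimal sketch of the surrounding training step (assumed setup).
    for batch in data_loader:
        loss = criterion(model_engine(batch["input"]), batch["label"])
        model_engine.backward(loss)

        # Gradients can be inspected or modified here, after backward() and
        # before step(), e.g. with safe_set_full_grad / safe_set_local_grad
        # as in the loop above.

        model_engine.step()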