Disable lazy mode and promote torch.compile in CV MNIST example #46

Open · wants to merge 1 commit into base `master`
40 changes: 31 additions & 9 deletions PyTorch/examples/computer_vision/hello_world/README.md
@@ -42,34 +42,56 @@ export PYTHONPATH=$PYTHONPATH:/path/to/Model-References

**Run training on 1 HPU:**

-- 1 HPU in FP32 Lazy mode:
+- 1 HPU in FP32 Eager mode:

```bash
-PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
```

-- 1 HPU in BF16 Lazy mode:
+- 1 HPU in BF16 Eager mode:

```bash
-PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
```

+- 1 HPU in FP32 using `torch.compile()`:
+
+```bash
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --use-torch-compile
+```
+
+- 1 HPU in BF16 using `torch.compile()`:
+
+```bash
+PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast --use-torch-compile
+```

**Run training on 8 HPUs:**

**NOTE:** The mpirun `--map-by PE` attribute value may vary on your setup. For the recommended calculation, refer to the instructions detailed in [mpirun Configuration](https://docs.habana.ai/en/latest/PyTorch/PyTorch_Scaling_Guide/mpirun_Configuration.html#mpirun-configuration).

+- 8 HPUs, 1 server in FP32 Eager mode:
+
+```bash
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+```
+
+- 8 HPUs, 1 server in BF16 Eager mode:
+
+```bash
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+```

-- 8 HPUs, 1 server in FP32 Lazy mode:
+- 8 HPUs, 1 server in FP32 using `torch.compile()`:

```bash
-mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --use-torch-compile
```

-- 8 HPU, 1 server in BF16 Lazy mode:
+- 8 HPUs, 1 server in BF16 using `torch.compile()`:

```bash
-mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=1 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast
+mpirun -n 8 --bind-to core --map-by socket:PE=6 --rank-by core --report-bindings --allow-run-as-root -x PT_HPU_LAZY_MODE=0 $PYTHON mnist.py --batch-size=64 --epochs=1 --lr=1.0 --gamma=0.7 --hpu --autocast --use-torch-compile
```

#### Examples in Python Script
@@ -81,7 +103,7 @@ The `example.py` presents a basic PyTorch code example. For more details, refer
On 1 HPU in Eager mode, run the following command:

```bash
-$PYTHON example.py
+PT_HPU_LAZY_MODE=0 $PYTHON example.py
```

## Changelog
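Taken together, the commands in this README change select one of three execution modes: lazy (the historical default), eager (`PT_HPU_LAZY_MODE=0`), and eager with `torch.compile()` (`--use-torch-compile`). A pure-Python sketch of how these flags combine; the `resolve_mode` helper is hypothetical and written only for illustration, since the real selection happens inside mnist.py and the Habana PyTorch bridge:

```python
import os

def resolve_mode(use_torch_compile: bool) -> str:
    # Hypothetical helper illustrating the flag logic: lazy mode is the
    # default unless PT_HPU_LAZY_MODE=0, and torch.compile (Dynamo) is
    # mutually exclusive with lazy mode.
    lazy = os.getenv("PT_HPU_LAZY_MODE", "1") != "0"
    if use_torch_compile:
        assert not lazy, "Dynamo and lazy are mutually exclusive."
        return "torch.compile"
    return "lazy" if lazy else "eager"

os.environ["PT_HPU_LAZY_MODE"] = "0"
print(resolve_mode(use_torch_compile=True))  # torch.compile
```

Note that the environment variable must be set before the Habana library is loaded, which is why the commands above export it on the command line rather than inside the script.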
10 changes: 0 additions & 10 deletions PyTorch/examples/computer_vision/hello_world/example.py
@@ -47,18 +47,8 @@ def train(net,criterion,optimizer,trainloader,device,lazy_mode):

loss.backward()

-##############################################################################
-if(lazy_mode):
-    htcore.mark_step()
-##############################################################################
-
optimizer.step()

-##############################################################################
-if(lazy_mode):
-    htcore.mark_step()
-##############################################################################
-
train_loss += loss.item()
_, predicted = outputs.max(1)
total += targets.size(0)
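The lines deleted from example.py implement the lazy-mode pattern in which `htcore.mark_step()` marks graph-execution boundaries after the backward pass and after the optimizer step; in eager mode no such call is needed. A runnable sketch of that control flow, using a stand-in stub for `htcore` so it executes without Habana software installed:

```python
class HtcoreStub:
    """Stand-in for habana_frameworks.torch.core; counts graph flushes."""
    def __init__(self):
        self.steps = 0

    def mark_step(self):
        # In real lazy mode this launches the accumulated device graph.
        self.steps += 1

def training_step(htcore, lazy_mode):
    # loss.backward() would run here
    if lazy_mode:
        htcore.mark_step()  # flush the backward graph (removed by this PR)
    # optimizer.step() would run here
    if lazy_mode:
        htcore.mark_step()  # flush the optimizer graph (removed by this PR)

htcore = HtcoreStub()
training_step(htcore, lazy_mode=True)
print(htcore.steps)  # 2
training_step(htcore, lazy_mode=False)
print(htcore.steps)  # still 2: eager mode needs no mark_step()
```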
8 changes: 0 additions & 8 deletions PyTorch/examples/computer_vision/hello_world/mnist.py
@@ -18,9 +18,6 @@
# todo: [SW-165872] revert below W/A when PR 113374 included in pytorch fork
torch._dynamo.config.optimize_ddp = False

-def is_lazy():
-    return os.getenv("PT_HPU_LAZY_MODE", "1") != "0"
-

class Net(nn.Module):
def __init__(self, use_autocast=False):
@@ -67,8 +64,6 @@ def train_function(data, target):
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
loss = train_function(data, target)
-if is_lazy():
-    htcore.mark_step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx *
@@ -183,9 +178,6 @@ def main():
if args.use_torch_compile:
assert int(torch.__version__.split('.')[
0]) >= 2, "Graph mode is available only in PyTorch 2.x."
-assert not is_lazy(), "Dynamo and lazy are mutually exclusive."
-# Note: PT_HPU_LAZY_MODE=0 needs to be set before library is loaded,
-# setting it here would be too late - hence assertion.

utils.init_distributed_mode(args)

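The assertion kept in mnist.py gates `--use-torch-compile` on PyTorch 2.x by parsing the major version out of `torch.__version__`; only the companion lazy-mode assertion is removed, since eager mode is now the documented default. A minimal sketch of that version check, with a literal string standing in for `torch.__version__` so it runs without PyTorch installed:

```python
def supports_graph_mode(version: str) -> bool:
    # Mirrors the check in mnist.py: torch.compile (graph mode) is
    # available only in PyTorch 2.x and later.
    return int(version.split(".")[0]) >= 2

assert supports_graph_mode("2.1.0")
assert not supports_graph_mode("1.13.1")
```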