diff --git a/performance_profiling/pathology/profiling_train_base_nvtx.md b/performance_profiling/pathology/profiling_train_base_nvtx.md
index 3f67420b13..6620fba452 100644
--- a/performance_profiling/pathology/profiling_train_base_nvtx.md
+++ b/performance_profiling/pathology/profiling_train_base_nvtx.md
@@ -21,6 +21,7 @@ The pipeline that we are profiling `rain_evaluate_nvtx_profiling.py` requires [C
 Instead of the whole dataset, just for the experiment of this performance analysis, users can also download a single whole slide image `tumor_091.tif` from [here](https://drive.google.com/uc?id=1OxAeCMVqH9FGpIWpAXSEJe6cLinEGQtF), as well as its coordinates and labels (`dataset_0.json`), from [here](https://drive.google.com/uc?id=1F-lR9tXoFkPkC1yueM-_TyaFk3CO7v0s).
 
 ## Run Nsight Profiling
+In `requirements.txt`, `cupy-cuda114` is set by default. If your CUDA version is different, you may need to change it to the matching CuPy package; refer to the [CuPy installation guide](https://docs.cupy.dev/en/stable/install.html) for more details.
 With environment prepared `requirements.txt`, we use `nsys profile` to get the information regarding the training pipeline's behavior across several steps. Since an epoch for pathology is long (covering 400,000 images), here we run profile on the trainer under basic settings for 30 seconds, with 50 seconds' delay. All results shown below are from experiments performed on a DGX-2 workstation using a single V-100 GPU over the full dataset.
 
 ```python
diff --git a/performance_profiling/pathology/requirements.txt b/performance_profiling/pathology/requirements.txt
index 0a7549292c..9d22fcfea1 100644
--- a/performance_profiling/pathology/requirements.txt
+++ b/performance_profiling/pathology/requirements.txt
@@ -4,6 +4,5 @@ torchvision
 cucim==21.8.2
 cupy-cuda114
 pytorch-ignite
-nvidia-pyindex
-nvidia-dlprof[pytorch]
 nvtx
+tensorboard
diff --git a/performance_profiling/radiology/profiling_train_base_nvtx.md b/performance_profiling/radiology/profiling_train_base_nvtx.md
index a74a0c801c..b17edffe58 100644
--- a/performance_profiling/radiology/profiling_train_base_nvtx.md
+++ b/performance_profiling/radiology/profiling_train_base_nvtx.md
@@ -15,19 +15,19 @@ For training and validation steps, they are easier to track by setting NVTX anno
 
 # Profiling Spleen Segmentation Pipeline
 ## Run Nsight Profiling
-With environment prepared `requirements.txt`, we run DLprof (v1.4.0 / r21.08) on the trainer under basic settings for 6 epochs (with validation every 2 epochs). All results shown below are from experiments performed on a DGX-2 workstation using a single V-100 GPU.
+With the environment prepared from `requirements.txt`, we run `nsys profile` on the trainer under basic settings for 6 epochs (with validation every 2 epochs). All results shown below are from experiments performed on a DGX-2 workstation using a single V-100 GPU.
 
 ```python
-!dlprof --mode pytorch \
-        --reports=summary \
-        --formats json \
-        --output_path ./outputs_base \
-        python3 train_base_nvtx.py
+nsys profile \
+     --output ./outputs_base \
+     --force-overwrite true \
+     --trace-fork-before-exec true \
+     python3 train_base_nvtx.py
 ```
 
 # Identify Potential Performance Improvements
 ## Profile Results
-After profiling, DLProf provides summary regarding the training process. Also, the computing details can be visualized via Nsight System GUI. (The version of Nsight used in the tutorial is 2021.3.1.54-ee9c30a OSX)
+After profiling, the computing details can be visualized via the Nsight Systems GUI. (The version of Nsight Systems used in the tutorial is 2021.3.1.54-ee9c30a OSX)
 
 ![png](Figure/nsight_base.png)
 
@@ -59,14 +59,14 @@ One optimized solution can be found [here](https://github.com/Project-MONAI/tuto
 
 # Analyzing Performance Improvement
 ## Profile Results
-We again use DLProf to further analyze the optimized training script.
+We again use `nsys profile` to further analyze the optimized training script.
 
 ```python
-!dlprof --mode pytorch \
-        --reports=summary \
-        --formats json \
-        --output_path ./outputs_fast \
-        python3 train_fast_nvtx.py
+nsys profile \
+     --output ./outputs_fast \
+     --force-overwrite true \
+     --trace-fork-before-exec true \
+     python3 train_fast_nvtx.py
 ```
 And the profiling result is
 
diff --git a/performance_profiling/radiology/requirements.txt b/performance_profiling/radiology/requirements.txt
index 1db12ed742..61d42383d6 100644
--- a/performance_profiling/radiology/requirements.txt
+++ b/performance_profiling/radiology/requirements.txt
@@ -2,6 +2,5 @@ git+https://github.com/Project-MONAI/MONAI
 pytorch-ignite
 nibabel
 tqdm
-nvidia-pyindex
-nvidia-dlprof[pytorch]
 nvtx
+tensorboard
diff --git a/performance_profiling/radiology/train_base_nvtx.py b/performance_profiling/radiology/train_base_nvtx.py
index 0a0eef5bbd..440488c2d6 100644
--- a/performance_profiling/radiology/train_base_nvtx.py
+++ b/performance_profiling/radiology/train_base_nvtx.py
@@ -22,7 +22,6 @@
 from torch.utils.tensorboard import SummaryWriter
 torch.backends.cudnn.benchmark = True
 
-import nvidia_dlprof_pytorch_nvtx
 import nvtx
 
 from monai.apps import download_and_extract
@@ -47,7 +46,6 @@
 )
 from monai.utils import Range, set_determinism
 
-nvidia_dlprof_pytorch_nvtx.init()
 
 # set directories
 random.seed(0)
@@ -143,7 +141,7 @@
     num_workers=8
 )
 train_loader = DataLoader(
-    train_ds, num_workers=8, batch_size=4, shuffle=True
+    train_ds, num_workers=0, batch_size=4, shuffle=True
 )
 val_ds = CacheDataset(
     data=val_files,
@@ -152,7 +150,7 @@
     num_workers=8
 )
 val_loader = DataLoader(
-    val_ds, num_workers=8, batch_size=1
+    val_ds, num_workers=0, batch_size=1
 )
 
 # standard PyTorch program style: create UNet, DiceLoss and Adam optimizer
diff --git a/performance_profiling/radiology/train_fast_nvtx.py b/performance_profiling/radiology/train_fast_nvtx.py
index 652380ae87..07d2b61562 100644
--- a/performance_profiling/radiology/train_fast_nvtx.py
+++ b/performance_profiling/radiology/train_fast_nvtx.py
@@ -22,7 +22,6 @@
 from torch.utils.tensorboard import SummaryWriter
 torch.backends.cudnn.benchmark = True
 
-import nvidia_dlprof_pytorch_nvtx
 import nvtx
 
 from monai.apps import download_and_extract
@@ -51,7 +50,6 @@
 from monai.utils import set_determinism
 from monai.utils.nvtx import Range
 
-nvidia_dlprof_pytorch_nvtx.init()
 
 # set directories
 random.seed(0)