Merge plugin/vnext to main (#517)
Summary:
main updates:
    1. Add module view
    2. Add DataPipe support
    3. Add pytorch lightning view
    4. Fix some bugs

Pull Request resolved: #517

Reviewed By: chaekit

Differential Revision: D34307403

Pulled By: robieta

fbshipit-source-id: 8526342f03e2b8cd47c6121a9984d6a706058916
guyang3532 authored and facebook-github-bot committed Feb 18, 2022
1 parent b654f58 commit cdfa44f
Showing 73 changed files with 14,911 additions and 3,884 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tb_plugin_ci.yml
@@ -26,7 +26,7 @@ jobs:
          then
            echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\"]}"
          else
-           echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\", \"1.10rc\", \"stable\"]}"
+           echo "::set-output name=matrix::{\"python-version\":[3.7, 3.8, 3.9], \"cuda-version\":[\"cpu\"], \"pytorch-version\":[\"nightly\", \"1.11rc\", \"stable\"]}"
          fi
  build:
3 changes: 3 additions & 0 deletions tb_plugin/.flake8
@@ -0,0 +1,3 @@
[flake8]
max-line-length = 120
per-file-ignores = __init__.py:F401 torch_tb_profiler/io/file.py: F401
34 changes: 34 additions & 0 deletions tb_plugin/.pre-commit-config.yaml
@@ -0,0 +1,34 @@
# -------------------------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# -------------------------------------------------------------------------
default_language_version:
  python: python3.8

ci:
  autofix_prs: true
  autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions'
  autoupdate_schedule: quarterly
  # submodules: true

repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: end-of-file-fixer
        exclude: torch_tb_profiler/static/index.html
      - id: trailing-whitespace
      - id: double-quote-string-fixer

  - repo: https://github.com/pre-commit/mirrors-autopep8
    rev: v1.6.0
    hooks:
      - id: autopep8
        name: Format code
  - repo: https://github.com/PyCQA/flake8
    rev: 4.0.1
    hooks:
      - id: flake8
        args:
          - "--max-line-length=120"
          - "--per-file-ignores=__init__.py:F401 tb_plugin/torch_tb_profiler/io/file.py: F401"
        name: Check PEP8
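With this config in place, the hooks would typically be enabled locally with `pip install pre-commit` followed by `pre-commit install`, and run over the whole tree with `pre-commit run --all-files`; these are standard pre-commit commands, not part of this diff.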
8 changes: 4 additions & 4 deletions tb_plugin/README.md
@@ -219,7 +219,7 @@ We describe each of these views below.
CallStack: All call stacks of this operator, if they have been recorded in the profiling trace file.
To dump this call stack information, you should set the 'with_stack' parameter in the torch.profiler API.
TensorBoard is integrated into VSCode; if you launch TensorBoard in VSCode, clicking this CallStack will jump to the corresponding line of source code, as shown below:

![Alt text](./docs/images/vscode_stack.PNG)

Note: Each duration above is wall-clock time; it does not mean the GPU or CPU is fully utilized during that period.
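As a quick illustration of the `with_stack` option mentioned above, a minimal sketch (the model and output path are hypothetical) that records the call stacks the plugin displays:

```python
import torch
import torch.profiler

model = torch.nn.Linear(64, 8)
inputs = torch.randn(32, 64)

# with_stack=True records Python call stacks, which is what populates the
# CallStack column (and enables the VSCode jump-to-source behavior).
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU],
    on_trace_ready=torch.profiler.tensorboard_trace_handler('./result'),
    with_stack=True,
):
    for _ in range(4):
        model(inputs)
```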
@@ -270,7 +270,7 @@ We describe each of these views below.

* Mean Blocks Per SM: Blocks per SM = blocks of this kernel / SM number of this GPU. If this number is less than 1, the GPU multiprocessors are not fully utilized. "Mean Blocks per SM" is the weighted average over all runs of this kernel name, using each run's duration as weight.

* Mean Est. Achieved Occupancy: For the definition of Est. Achieved Occupancy, see [gpu_utilization](./docs/gpu_utilization.md). It is the weighted average over all runs of this kernel name, using each run's duration as weight.

The top left pie chart is a visualization of the "Total Duration" column.
It makes the breakdowns visible at a glance.
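A sketch of the duration-weighted mean described above, with hypothetical kernel runs and an assumed 80-SM device:

```python
# Duration-weighted "Mean Blocks per SM" for one kernel name,
# following the definition above. All numbers are made up.
SM_COUNT = 80  # assumed SM count of the GPU

runs = [
    {'blocks': 96, 'duration_us': 120.0},
    {'blocks': 32, 'duration_us': 40.0},
]

total_us = sum(r['duration_us'] for r in runs)
mean_blocks_per_sm = sum(
    (r['blocks'] / SM_COUNT) * r['duration_us'] for r in runs
) / total_us
print(round(mean_blocks_per_sm, 3))  # < 1 suggests under-utilized SMs
```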
@@ -387,9 +387,9 @@ We describe each of these views below.

* Self Size Increase: The memory increase associated with the operator itself, excluding its children. It sums all allocation bytes and subtracts all freed bytes.

* Allocation Count: The allocation count including all children operators.

* Self Allocation Count: The allocation count belonging to the operator itself, excluding its children.

* Allocation Size: The allocation size including all children operators. It sums all allocation bytes without subtracting freed memory.
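To make the self-vs-inclusive distinction above concrete, a toy sketch (the `Op` structure is hypothetical, not the plugin's internal representation):

```python
# Toy illustration of "self" vs. inclusive memory metrics defined above.
class Op:
    def __init__(self, name, alloc_bytes, free_bytes, children=()):
        self.name = name
        self.alloc_bytes = alloc_bytes
        self.free_bytes = free_bytes
        self.children = list(children)

    def size_increase(self):
        # Includes children: all allocations minus all frees.
        return (self.alloc_bytes - self.free_bytes
                + sum(c.size_increase() for c in self.children))

    def self_size_increase(self):
        # Excludes children: only this operator's allocations minus frees.
        return self.alloc_bytes - self.free_bytes

child = Op('aten::empty', alloc_bytes=1024, free_bytes=0)
parent = Op('aten::linear', alloc_bytes=512, free_bytes=256, children=[child])
print(parent.self_size_increase())  # 256
print(parent.size_increase())       # 1280
```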

6 changes: 3 additions & 3 deletions tb_plugin/ci_scripts/install_env.sh
@@ -24,12 +24,12 @@ pip install numpy tensorboard typing-extensions pillow pytest
if [ "$PYTORCH_VERSION" = "nightly" ]; then
    pip install --pre torch -f "https://download.pytorch.org/whl/nightly/$CUDA_VERSION/torch_nightly.html"
    pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/nightly/$CUDA_VERSION/torch_nightly.html"
-elif [ "$PYTORCH_VERSION" = "1.10rc" ]; then
+elif [ "$PYTORCH_VERSION" = "1.11rc" ]; then
    pip install --pre torch -f "https://download.pytorch.org/whl/test/$CUDA_VERSION/torch_test.html"
-   pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/test/$CUDA_VERSION/torch_test.html"
+   #pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/test/$CUDA_VERSION/torch_test.html"
+   pip install --pre torchvision --no-deps -f "https://download.pytorch.org/whl/nightly/$CUDA_VERSION/torch_nightly.html"
elif [ "$PYTORCH_VERSION" = "stable" ]; then
    pip install torch torchvision
fi

python -c "import torch; print(torch.__version__, torch.version.git_version); from torch.autograd import kineto_available; print(kineto_available())"

20 changes: 10 additions & 10 deletions tb_plugin/docs/gpu_utilization.md
@@ -1,19 +1,19 @@
* GPU Utilization: GPU busy time / all steps time. The higher, the better. All steps time is the total time of all profiler steps (also called iterations).
GPU busy time is the time during “all steps time” when there is at least one GPU kernel running on this GPU.
However, this high-level utilization metric is coarse. It can’t tell how many SMs (Stream Multiprocessors) are in use.
For example, a kernel with a single thread running continuously will get 100% GPU utilization.

* Est. SM Efficiency: Estimated Stream Multiprocessor Efficiency. The higher, the better. This metric for a kernel is SM_Eff_K = min(blocks of this kernel / SM number of this GPU, 100%).
The overall number is the sum of all kernels' SM_Eff_K, weighted by each kernel's execution duration and divided by “all steps time”.
It shows the utilization of the GPU's Stream Multiprocessors.
Although it is finer grained than the “GPU Utilization” above, it still can’t tell the whole story.
For example, a kernel with only one thread per block can’t fully utilize each SM.

* Est. Achieved Occupancy: For most cases such as memory-bandwidth-bound kernels, a higher value often translates to better performance, especially when the initial value is very low. [Reference](http://developer.download.nvidia.com/GTC/PDF/GTC2012/PresentationPDF/S0514-GTC2012-GPU-Performance-Analysis.pdf). The definition of occupancy is [here](https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm).
Occupancy is the ratio of active warps on an SM to the maximum number of active warps supported by the SM. The theoretical occupancy of a kernel is the upper-limit occupancy of that kernel, limited by multiple factors such as kernel shape, the resources the kernel uses, and the GPU compute capability.
Est. Achieved Occupancy of a kernel, OCC_K = min(threads of the kernel / SM number / max threads per SM, theoretical occupancy of the kernel).
The overall number is the weighted sum of all kernels' OCC_K, using each kernel's execution duration as weight. It shows fine-grained low-level GPU utilization.

* Kernel Time using Tensor Cores: Total GPU Time for Tensor Core kernels / Total GPU Time for all kernels. Higher is better.
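A sketch of the two per-kernel estimates defined above, with hypothetical kernel and device numbers plugged into the stated formulas (for simplicity, this toy “all steps time” equals the total kernel time, i.e., no idle time):

```python
# Est. SM Efficiency and Est. Achieved Occupancy per the formulas above.
# All device and kernel numbers below are made up for illustration.
SM_COUNT = 80              # assumed SMs on the GPU
MAX_THREADS_PER_SM = 2048  # assumed per-SM thread limit

def sm_eff_k(blocks):
    # SM_Eff_K = min(blocks of this kernel / SM number, 100%)
    return min(blocks / SM_COUNT, 1.0)

def occ_k(threads, theoretical):
    # OCC_K = min(threads / SM number / max threads per SM, theoretical occupancy)
    return min(threads / SM_COUNT / MAX_THREADS_PER_SM, theoretical)

# (blocks, threads, theoretical occupancy, duration in microseconds)
kernels = [
    (40, 40 * 256, 0.5, 100.0),
    (160, 160 * 128, 1.0, 300.0),
]
all_steps_us = sum(d for _, _, _, d in kernels)  # toy: no idle time
est_sm_eff = sum(sm_eff_k(b) * d for b, _, _, d in kernels) / all_steps_us
est_occ = sum(occ_k(t, o) * d for _, t, o, d in kernels) / all_steps_us
print(f'Est. SM Efficiency {est_sm_eff:.1%}, Est. Achieved Occupancy {est_occ:.1%}')
```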
50 changes: 50 additions & 0 deletions tb_plugin/examples/datapipe_example.py
@@ -0,0 +1,50 @@
import torch
import torch.nn as nn
import torch.optim
from torch.utils.data.dataloader_experimental import DataLoader2

from torchvision import transforms as T
import torchvision.prototype.datasets as pdatasets
import torchvision.prototype.models as models
from torchvision.prototype.datasets._builtin import Cifar10


if __name__ == "__main__":
    model = models.resnet50(models.ResNet50_Weights.ImageNet1K_V1)
    # Build a CIFAR-10 DataPipe and map the image transforms over it.
    trainset = Cifar10().to_datapipe(root='./data', decoder=pdatasets.decoder.raw)
    transform = T.Compose([T.Resize(256), T.CenterCrop(224)])
    trainset = trainset.map(transform, input_col="image")
    trainset = trainset.map(fn=T.functional.convert_image_dtype, input_col="image")
    dl = DataLoader2(trainset, batch_size=64)
    criterion = nn.CrossEntropyLoss().cuda(0)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    device = torch.device("cuda:0")
    model.to(device=device).train()

    # Profile two active steps after one wait and one warmup step; the trace
    # is written to ./result for the TensorBoard plugin to pick up.
    with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.CUDA],
            schedule=torch.profiler.schedule(
                wait=1,
                warmup=1,
                active=2),
            on_trace_ready=torch.profiler.tensorboard_trace_handler('./result', worker_name='datapipe0'),
            record_shapes=True,
            profile_memory=True,  # This will take 1 to 2 minutes. Setting it to False could greatly speed up.
            with_stack=True
    ) as p:
        for step, data in enumerate(dl, 0):
            print("step:{}".format(step))
            input_tensors = data['image']
            label_tensors = data['label']
            inputs, labels = input_tensors.to(device=device), label_tensors.to(device=device)
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if step + 1 >= 4:
                break
            p.step()
    print("done")
5 changes: 3 additions & 2 deletions tb_plugin/examples/resnet50_ddp_profiler.py
@@ -73,14 +73,16 @@ def example(rank, use_gpu=True):
            if step + 1 >= 10:
                break

+
def init_process(rank, size, fn, backend='nccl'):
    """ Initialize the distributed environment. """
    os.environ['MASTER_ADDR'] = '127.0.0.1'
    os.environ['MASTER_PORT'] = '29500'
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

-if __name__=="__main__":
+
+if __name__ == "__main__":
    size = 4
    processes = []
    mp.set_start_method("spawn")
@@ -91,4 +93,3 @@ def init_process(rank, size, fn, backend='nccl'):

    for p in processes:
        p.join()
-
2 changes: 0 additions & 2 deletions tb_plugin/examples/resnet50_profiler_api.py
@@ -1,12 +1,10 @@
import os
import torch
import torch.nn as nn
import torch.backends.cudnn as cudnn
import torch.optim
import torch.utils.data
import torchvision
import torchvision.transforms as T
import torchvision.datasets as datasets
import torchvision.models as models

import torch.profiler
7 changes: 4 additions & 3 deletions tb_plugin/fe/README.md
@@ -12,6 +12,7 @@
   echo "deb https://dl.yarnpkg.com/debian/ stable main" | sudo tee /etc/apt/sources.list.d/yarn.list
   sudo apt update && sudo apt install yarn
   ```
-3. shell `yarn`
-4. shell `yarn build`
-5. `./dist/index.html`
+3. shell `yarn` to prepare JS dependency
+4. shell `yarn build:copy`
+5. Go to `tb_plugin` folder and install the package using `python setup.py develop`
+6. Launch tensorboard
1 change: 0 additions & 1 deletion tb_plugin/fe/index.html
@@ -8,4 +8,3 @@
    <div id="app"></div>
  </body>
</html>
-
8 changes: 5 additions & 3 deletions tb_plugin/fe/package.json
@@ -14,11 +14,12 @@
    "@babel/runtime": "^7.13.10",
    "@material-ui/core": "^4.11.3",
    "@material-ui/icons": "^4.11.2",
-   "antd": "^4.15.1",
+   "antd": "^4.17.0",
    "clsx": "^1.1.1",
    "portable-fetch": "^3.0.0",
    "react": "^16.13.1",
-   "react-dom": "^16.13.1"
+   "react-dom": "^16.13.1",
+   "react-flame-graph": "^1.4.0"
  },
  "devDependencies": {
    "@types/react": "^16.9.51",
@@ -38,6 +39,7 @@
  "resolutions": {
    "portable-fetch/**/node-fetch": "^2.6.1",
    "webpack/**/browserslist": "^4.16.5",
-   "webpack-dev-server/**/chokidar": "^3.5.2"
+   "webpack-dev-server/**/chokidar": "^3.5.2",
+   "postcss/**/nanoid": "^3.1.31"
  }
}
1 change: 0 additions & 1 deletion tb_plugin/fe/prettier.json
@@ -10,4 +10,3 @@
  "proseWrap": "always",
  "endOfLine": "lf"
}
-
10 changes: 6 additions & 4 deletions tb_plugin/fe/scripts/add_header.py
@@ -3,28 +3,30 @@
import os
import sys

-HEADER='''/*---------------------------------------------------------------------------------------------
+HEADER = '''/*---------------------------------------------------------------------------------------------
 * Copyright (c) Microsoft Corporation. All rights reserved.
 *--------------------------------------------------------------------------------------------*/
'''

+
def add_header(file):
    with open(file, 'r') as f:
        contents = f.readlines()

    # do nothing if there is already header
-   if contents and contents[0].startswith("/*-"):
+   if contents and contents[0].startswith('/*-'):
        return

    with open(file, 'w') as out:
        out.write(HEADER)
        out.writelines(contents)

+
if __name__ == '__main__':
    dir = sys.argv[1]
    if not os.path.isdir(dir):
-       raise ValueError("{} is not a directory".format(dir))
+       raise ValueError('{} is not a directory'.format(dir))

-   for file in glob.glob(dir + "/*.ts"):
+   for file in glob.glob(dir + '/*.ts'):
        add_header(file)
2 changes: 1 addition & 1 deletion tb_plugin/fe/scripts/build.sh
@@ -5,7 +5,7 @@ current_dir="$( cd "$( dirname "$0" )" && pwd )"
FE_ROOT="$(dirname "$current_dir")"
cd $FE_ROOT/

-java -jar $FE_ROOT/swagger-codegen-cli.jar generate -i $FE_ROOT/src/api/openapi.yaml -l typescript-fetch -o $FE_ROOT/src/api/generated/
+java -jar $FE_ROOT/swagger-codegen-cli.jar generate -i $FE_ROOT/src/api/openapi.yaml -l typescript-fetch -o $FE_ROOT/src/api/generated/ --additional-properties modelPropertyNaming=original
rm $FE_ROOT/src/api/generated/api_test.spec.ts
yarn prettier --end-of-line lf
python $FE_ROOT/scripts/add_header.py $FE_ROOT/src/api/generated/
2 changes: 1 addition & 1 deletion tb_plugin/fe/src/api/README.md
@@ -6,7 +6,7 @@
```bash
cd fe
wget https://repo1.maven.org/maven2/io/swagger/codegen/v3/swagger-codegen-cli/3.0.25/swagger-codegen-cli-3.0.25.jar -O swagger-codegen-cli.jar
-java -jar swagger-codegen-cli.jar generate -i ./src/api/openapi.yaml -l typescript-fetch -o ./src/api/generated/
+java -jar swagger-codegen-cli.jar generate -i ./src/api/openapi.yaml -l typescript-fetch -o ./src/api/generated/ --additional-properties modelPropertyNaming=original
rm ./src/api/generated/api_test.spec.ts
yarn prettier --end-of-line lf
python ./scripts/add_header.py ./src/api/generated/