Merge pull request #182 from OpenBMB/dev

BMTrain New Version Release v1.0.0
OpenBMB · Feb 26, 2024 · dd2b5bc
2 parents 5843590 + 5713d76
commit dd2b5bc
Show file tree

Hide file tree

Showing 78 changed files with 3,492 additions and 1,772 deletions.
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -0,0 +1,34 @@
+name: Build
+
+on:
+  pull_request:
+    types: [opened, reopened, synchronize]
+    branches:
+      - 'dev'
+      - 'main'
+
+jobs:
+  build-archive-wheel:
+
+    uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
+    secrets:
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
+
+  publish:
+    needs: build-archive-wheel
+    runs-on: ubuntu-latest
+    steps:
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Set Up the Python
+      uses: actions/setup-python@v2
+      with:
+        python-version: '3.9'
+
+    - name: Download distribution files
+      uses: actions/download-artifact@v2
+      with:
+        name: dist
+        path: dist
diff --git a/.github/workflows/build_whl.yml b/.github/workflows/build_whl.yml
@@ -0,0 +1,55 @@
+name: Build wheels in docker and archive 
+
+on:
+  workflow_call:
+    secrets:
+      DOCKERHUB_TOKEN:
+        required: true
+      DOCKERHUB_USERNAME:
+        required: true
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ['37', '38', '39', '310', '311']
+
+
+    steps:
+
+    - name: Check the disk space and clear unnecessary library
+      run: |
+        rm -rf /home/runner/work/BMTrain/BMTrain/dist
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /opt/ghc
+        sudo rm -rf "/usr/local/share/boost"
+        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
+        df -hl   
+        
+    - name: Checkout code
+      uses: actions/checkout@v3
+
+    - name: Login to DockerHub
+      uses: docker/login-action@v2
+      with:
+        username: ${{ secrets.DOCKERHUB_USERNAME }}
+        password: ${{ secrets.DOCKERHUB_TOKEN }}
+
+    - name: Pull Docker image
+      run: docker pull pytorch/manylinux-cuda113:latest
+
+    - name: Run Docker image and execute script
+      run: |
+        version=${{ matrix.python-version }}
+        docker run -e BUILD_DOCKER_ENV=1 -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i pytorch/manylinux-cuda113:latest /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/pip install build; /opt/python/cp${version}*/bin/python -m build .;for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"
+
+    - name: Archive distribution files
+      uses: actions/upload-artifact@v2
+      with:
+        name: dist
+        path: |
+          dist/*.tar.gz
+          dist/*.whl
+
+    
diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml
@@ -7,50 +7,15 @@ on:
       - "v*.*.*"
 
 jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['37', '38', '39', '310', '311']
-
-
-    steps:
-
-    - name: Check the disk space and clear unnecessary library
-      run: |
-        rm -rf /home/runner/work/BMTrain/BMTrain/dist
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /opt/ghc
-        sudo rm -rf "/usr/local/share/boost"
-        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-        df -hl   
-        
-    - name: Checkout code
-      uses: actions/checkout@v3
 
-    - name: Login to DockerHub
-      uses: docker/login-action@v2
-      with:
-        username: ${{ secrets.DOCKERHUB_USERNAME }}
-        password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-    - name: Pull Docker image
-      run: docker pull pytorch/manylinux-cuda113:latest
-    - name: Run Docker image and execute script
-      run: |
-        version=${{ matrix.python-version }}
-        docker run -e BUILD_DOCKER_ENV=1 -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i pytorch/manylinux-cuda113:latest /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/pip install build; /opt/python/cp${version}*/bin/python -m build .;for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"
-        
-    - name: Archive distribution files
-      uses: actions/upload-artifact@v2
-      with:
-        name: dist
-        path: |
-          dist/*.tar.gz
-          dist/*.whl
+  build-archive-wheel:
+    uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
+    secrets:
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
 
   publish:
-    needs: build
+    needs: build-archive-wheel
     runs-on: ubuntu-latest
     steps:
     - name: Set Up the Python

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -6,47 +6,13 @@ on:
       - "v*.*.*"
 
 jobs:
-  build:
-    runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python-version: ['37', '38', '39', '310', '311']
-
-    steps:
 
-    - name: Check the disk space and clear unnecessary library
-      run: |
-        rm -rf /home/runner/work/BMTrain/BMTrain/dist
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /opt/ghc
-        sudo rm -rf "/usr/local/share/boost"
-        sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-        df -hl   
+  build-archive-wheel:
 
-    - name: Checkout code
-      uses: actions/checkout@v3
-
-    - name: Login to DockerHub
-      uses: docker/login-action@v2
-      with:
-        username: ${{ secrets.DOCKERHUB_USERNAME }}
-        password: ${{ secrets.DOCKERHUB_TOKEN }}
-
-    - name: Pull Docker image
-      run: docker pull pytorch/manylinux-cuda113:latest
-
-    - name: Run Docker image and execute script
-      run: |
-        version=${{ matrix.python-version }}
-        docker run -e BUILD_DOCKER_ENV=1 -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i pytorch/manylinux-cuda113:latest /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/pip install build;/opt/python/cp${version}*/bin/python -m build .;for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"
-      
-    - name: Archive distribution files
-      uses: actions/upload-artifact@v2
-      with:
-        name: dist
-        path: |
-          dist/*.tar.gz
-          dist/*.whl
+    uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
+    secrets:
+      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
+      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}
 
   publish:
-    needs: build
+    needs: build-archive-wheel

diff --git a/.gitignore b/.gitignore
@@ -150,4 +150,6 @@ log
 .vscode
 
 !bmtrain/dist
-tests/test_log.txt
+tests/test_log.txt
+tests/*.opt
+tests/*.ckp
diff --git a/README.md b/README.md
@@ -82,7 +82,7 @@ To enable ZeRO optimization, you need to make some simple replacements to the or
 * `torch.nn.Module` -> `bmtrain.DistributedModule`
 * `torch.nn.Parameter` -> `bmtrain.DistributedParameter`
 
-And wrap the transformer blocks with `bmtrain.CheckpointBlock`.
+And wrap the transformer blocks with `bmtrain.Block`.
 
 Here is an example.
 
@@ -118,9 +118,9 @@ class MyModule(bmt.DistributedModule): # changed here
         super().__init__()
         self.param = bmt.DistributedParameter(torch.empty(1024)) # changed here
         self.module_list = torch.nn.ModuleList([
-            bmt.CheckpointBlock(SomeTransformerBlock()), # changed here
-            bmt.CheckpointBlock(SomeTransformerBlock()), # changed here
-            bmt.CheckpointBlock(SomeTransformerBlock())  # changed here
+            bmt.Block(SomeTransformerBlock()), # changed here
+            bmt.Block(SomeTransformerBlock()), # changed here
+            bmt.Block(SomeTransformerBlock())  # changed here
         ])
 
     def forward(self):
@@ -151,9 +151,9 @@ class MyModule(bmt.DistributedModule):
         super().__init__()
         self.param = bmt.DistributedParameter(torch.empty(1024))
         self.module_list = torch.nn.ModuleList([
-            bmt.CheckpointBlock(SomeTransformerBlock()),
-            bmt.CheckpointBlock(SomeTransformerBlock()),
-            bmt.CheckpointBlock(SomeTransformerBlock())
+            bmt.Block(SomeTransformerBlock()),
+            bmt.Block(SomeTransformerBlock()),
+            bmt.Block(SomeTransformerBlock())
         ])
 
     def forward(self):
@@ -174,9 +174,9 @@ class MyModule(bmt.DistributedModule):
         super().__init__()
         self.param = bmt.DistributedParameter(torch.empty(1024))
         self.module_list = bmt.TransformerBlockList([ # changed here
-            bmt.CheckpointBlock(SomeTransformerBlock()),
-            bmt.CheckpointBlock(SomeTransformerBlock()),
-            bmt.CheckpointBlock(SomeTransformerBlock())
+            bmt.Block(SomeTransformerBlock()),
+            bmt.Block(SomeTransformerBlock()),
+            bmt.Block(SomeTransformerBlock())
         ])
 
     def forward(self):

diff --git a/bmtrain/__init__.py b/bmtrain/__init__.py
@@ -10,15 +10,17 @@
 from .layer import DistributedModule
 from .param_init import init_parameters, grouped_parameters
 from .synchronize import synchronize, sum_loss, wait_loader, gather_result
-from .block_layer import CheckpointBlock, TransformerBlockList
+from .block_layer import Block, TransformerBlockList
 from .wrapper import BMTrainModelWrapper
 from .pipe_layer import PipelineTransformerBlockList
 from . import debug
 from .store import save, load
 
-from . import benchmark
+from . import loss
+from . import distributed
+from . import nn
 from . import optim
 from . import inspect
 from . import lr_scheduler
-from . import loss
-from . import distributed
+
+CheckpointBlock = Block