Skip to content

Commit

Permalink
Merge pull request #182 from OpenBMB/dev
Browse files Browse the repository at this point in the history
BMTrain New Version Release v1.0.0
  • Loading branch information
MayDomine authored Feb 26, 2024
2 parents 5843590 + 5713d76 commit dd2b5bc
Show file tree
Hide file tree
Showing 78 changed files with 3,492 additions and 1,772 deletions.
34 changes: 34 additions & 0 deletions .github/workflows/build.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Pull-request build: builds the wheels through the reusable build_whl.yml
# workflow, then downloads the resulting `dist` artifact.
name: Build

on:
  pull_request:
    types: [opened, reopened, synchronize]
    branches:
      - 'dev'
      - 'main'

jobs:
  build-archive-wheel:
    # Reusable workflow taken from the `workflow` ref of OpenBMB/BMTrain.
    # Both DockerHub secrets are forwarded because the called workflow
    # logs in to DockerHub with them.
    uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
    secrets:
      DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
      DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}

  # In this workflow the job only checks out the code, sets up Python and
  # downloads the `dist` artifact; it contains no upload step.
  publish:
    needs: build-archive-wheel
    runs-on: ubuntu-latest
    steps:
      - name: Checkout code
        uses: actions/checkout@v3

      - name: Set Up the Python
        uses: actions/setup-python@v2
        with:
          # Quoted so the version stays a string rather than a YAML float
          # (an unquoted 3.10 would be read as 3.1).
          python-version: '3.9'

      - name: Download distribution files
        uses: actions/download-artifact@v2
        with:
          name: dist
          path: dist
55 changes: 55 additions & 0 deletions .github/workflows/build_whl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Reusable workflow: builds the package inside the
# pytorch/manylinux-cuda113 image once per Python version in the matrix and
# uploads the sdist and wheels as the `dist` artifact.
name: Build wheels in docker and archive

on:
  workflow_call:
    secrets:
      DOCKERHUB_TOKEN:
        required: true
      DOCKERHUB_USERNAME:
        required: true

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        # CPython tags without the dot; each one selects the matching
        # /opt/python/cp<tag>* interpreter inside the build image.
        python-version: ['37', '38', '39', '310', '311']

    steps:
      # Runs before checkout: drops any leftover dist directory and removes
      # large preinstalled toolchains, then prints the disk usage.
      # GITHUB_WORKSPACE replaces the hard-coded runner path so the cleanup
      # also targets the right directory when the repository name differs.
      - name: Check the disk space and clear unnecessary library
        run: |
          rm -rf "${GITHUB_WORKSPACE}/dist"
          sudo rm -rf /usr/share/dotnet
          sudo rm -rf /opt/ghc
          sudo rm -rf "/usr/local/share/boost"
          sudo rm -rf "$AGENT_TOOLSDIRECTORY"
          df -hl

      - name: Checkout code
        uses: actions/checkout@v3

      - name: Login to DockerHub
        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}

      - name: Pull Docker image
        run: docker pull pytorch/manylinux-cuda113:latest

      # Mounts the checkout at /workspace/BMTrain, builds with the selected
      # interpreter, then renames every *-linux_x86_64.whl in dist/ to the
      # manylinux2014_x86_64 platform tag.
      # `set -e` stops the in-container command chain at the first failing
      # command instead of carrying on after a failed install or build.
      - name: Run Docker image and execute script
        run: |
          version=${{ matrix.python-version }}
          docker run \
            -e BUILD_DOCKER_ENV=1 \
            -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc \
            -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" \
            -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" \
            -v ${{ github.workspace }}:/workspace/BMTrain \
            -i pytorch/manylinux-cuda113:latest \
            /bin/bash -c "set -e; cd /workspace/BMTrain; /opt/python/cp${version}*/bin/pip install build; /opt/python/cp${version}*/bin/python -m build .; for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"

      # NOTE(review): every matrix job uploads to the same artifact name
      # `dist`; confirm the pinned upload-artifact version allows that.
      - name: Archive distribution files
        uses: actions/upload-artifact@v2
        with:
          name: dist
          path: |
            dist/*.tar.gz
            dist/*.whl
47 changes: 6 additions & 41 deletions .github/workflows/publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,50 +7,15 @@ on:
- "v*.*.*"

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['37', '38', '39', '310', '311']


steps:

- name: Check the disk space and clear unnecessary library
run: |
rm -rf /home/runner/work/BMTrain/BMTrain/dist
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -hl
- name: Checkout code
uses: actions/checkout@v3

- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Pull Docker image
run: docker pull pytorch/manylinux-cuda113:latest
- name: Run Docker image and execute script
run: |
version=${{ matrix.python-version }}
docker run -e BUILD_DOCKER_ENV=1 -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i pytorch/manylinux-cuda113:latest /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/pip install build; /opt/python/cp${version}*/bin/python -m build .;for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"
- name: Archive distribution files
uses: actions/upload-artifact@v2
with:
name: dist
path: |
dist/*.tar.gz
dist/*.whl
build-archive-wheel:
uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
secrets:
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}

publish:
needs: build
needs: build-archive-wheel
runs-on: ubuntu-latest
steps:
- name: Set Up the Python
Expand Down
44 changes: 5 additions & 39 deletions .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,47 +6,13 @@ on:
- "v*.*.*"

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['37', '38', '39', '310', '311']

steps:

- name: Check the disk space and clear unnecessary library
run: |
rm -rf /home/runner/work/BMTrain/BMTrain/dist
sudo rm -rf /usr/share/dotnet
sudo rm -rf /opt/ghc
sudo rm -rf "/usr/local/share/boost"
sudo rm -rf "$AGENT_TOOLSDIRECTORY"
df -hl
build-archive-wheel:

- name: Checkout code
uses: actions/checkout@v3

- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Pull Docker image
run: docker pull pytorch/manylinux-cuda113:latest

- name: Run Docker image and execute script
run: |
version=${{ matrix.python-version }}
docker run -e BUILD_DOCKER_ENV=1 -e CUDACXX=/usr/local/cuda-11.3/bin/nvcc -e PATH="/opt/rh/devtoolset-9/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-9/root/usr/lib64:/opt/rh/devtoolset-9/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i pytorch/manylinux-cuda113:latest /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/pip install build;/opt/python/cp${version}*/bin/python -m build .;for file in dist/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"
- name: Archive distribution files
uses: actions/upload-artifact@v2
with:
name: dist
path: |
dist/*.tar.gz
dist/*.whl
uses: OpenBMB/BMTrain/.github/workflows/build_whl.yml@workflow
secrets:
DOCKERHUB_TOKEN: ${{ secrets.DOCKERHUB_TOKEN }}
DOCKERHUB_USERNAME: ${{ secrets.DOCKERHUB_USERNAME }}

publish:
needs: build
Expand Down
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,6 @@ log
.vscode

!bmtrain/dist
tests/test_log.txt
tests/test_log.txt
tests/*.opt
tests/*.ckp
20 changes: 10 additions & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ To enable ZeRO optimization, you need to make some simple replacements to the or
* `torch.nn.Module` -> `bmtrain.DistributedModule`
* `torch.nn.Parameter` -> `bmtrain.DistributedParameter`

And wrap the transformer blocks with `bmtrain.CheckpointBlock`.
And wrap the transformer blocks with `bmtrain.Block`.

Here is an example.

Expand Down Expand Up @@ -118,9 +118,9 @@ class MyModule(bmt.DistributedModule): # changed here
super().__init__()
self.param = bmt.DistributedParameter(torch.empty(1024)) # changed here
self.module_list = torch.nn.ModuleList([
bmt.CheckpointBlock(SomeTransformerBlock()), # changed here
bmt.CheckpointBlock(SomeTransformerBlock()), # changed here
bmt.CheckpointBlock(SomeTransformerBlock()) # changed here
bmt.Block(SomeTransformerBlock()), # changed here
bmt.Block(SomeTransformerBlock()), # changed here
bmt.Block(SomeTransformerBlock()) # changed here
])

def forward(self):
Expand Down Expand Up @@ -151,9 +151,9 @@ class MyModule(bmt.DistributedModule):
super().__init__()
self.param = bmt.DistributedParameter(torch.empty(1024))
self.module_list = torch.nn.ModuleList([
bmt.CheckpointBlock(SomeTransformerBlock()),
bmt.CheckpointBlock(SomeTransformerBlock()),
bmt.CheckpointBlock(SomeTransformerBlock())
bmt.Block(SomeTransformerBlock()),
bmt.Block(SomeTransformerBlock()),
bmt.Block(SomeTransformerBlock())
])

def forward(self):
Expand All @@ -174,9 +174,9 @@ class MyModule(bmt.DistributedModule):
super().__init__()
self.param = bmt.DistributedParameter(torch.empty(1024))
self.module_list = bmt.TransformerBlockList([ # changed here
bmt.CheckpointBlock(SomeTransformerBlock()),
bmt.CheckpointBlock(SomeTransformerBlock()),
bmt.CheckpointBlock(SomeTransformerBlock())
bmt.Block(SomeTransformerBlock()),
bmt.Block(SomeTransformerBlock()),
bmt.Block(SomeTransformerBlock())
])

def forward(self):
Expand Down
10 changes: 6 additions & 4 deletions bmtrain/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,17 @@
from .layer import DistributedModule
from .param_init import init_parameters, grouped_parameters
from .synchronize import synchronize, sum_loss, wait_loader, gather_result
from .block_layer import CheckpointBlock, TransformerBlockList
from .block_layer import Block, TransformerBlockList
from .wrapper import BMTrainModelWrapper
from .pipe_layer import PipelineTransformerBlockList
from . import debug
from .store import save, load

from . import benchmark
from . import loss
from . import distributed
from . import nn
from . import optim
from . import inspect
from . import lr_scheduler
from . import loss
from . import distributed

# Backward-compatible alias: `CheckpointBlock` stays importable from this
# package and refers to the same class as `Block`.
CheckpointBlock = Block
Loading

0 comments on commit dd2b5bc

Please sign in to comment.