BMTrain v0.2.3 #115

Merged · 13 commits · Jul 24, 2023
Changes from all commits
83 changes: 61 additions & 22 deletions .github/workflows/publish.yaml
@@ -1,26 +1,65 @@
name: "Publish to PyPI"
name: Build and Publish to PyPI

on:
release:
types:
- published
push:
tags:
- "v*.*.*"

jobs:
build-n-publish:
name: Build and publish
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['36', '37', '38', '39']

steps:
- uses: actions/checkout@master
- uses: actions/setup-python@v2
with:
python-version: '3.7'
architecture: 'x64'
- name: Run build script
run: |
pip install twine --user
pip install wheel
pip install torch==1.11.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html
python setup.py sdist --format=gztar
- name: Publish distribution 📦 to PyPI
uses: pypa/gh-action-pypi-publish@master
with:
user: __token__
password: ${{ secrets.pypi_password }}
- name: Checkout code
uses: actions/checkout@v3

- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Pull Docker image
run: docker pull maydomine/bmtrain-manylinux:cu110

- name: Run Docker image and execute script
run: |
version=${{ matrix.python-version }}
docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"

- name: Archive distribution files
uses: actions/upload-artifact@v2
with:
name: dist
path: |
sdist/*.tar.gz
wheel/*.whl

publish:
needs: build
runs-on: ubuntu-latest
steps:
- name: Set Up the Python
uses: actions/setup-python@v2
with:
python-version: 3.9

- name: Install twine
run: python -m pip install twine

- name: Download distribution files
uses: actions/download-artifact@v2
with:
name: dist
path: dist

- name: Publish to PyPI
env:
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
run: |
cd dist
python -m twine upload sdist/*.tar.gz wheel/*.whl
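The wheels produced inside the manylinux container come out of `bdist_wheel` tagged `linux_x86_64`, and the shell loop at the end of the `docker run` command retags them as `manylinux2014_x86_64` so PyPI will accept the upload. A minimal local sketch of that rename step, assuming the same `wheel/` output directory as the workflow above:

```python
from pathlib import Path

# Retag plain linux_x86_64 wheels as manylinux2014_x86_64, mirroring the
# shell loop in the docker run step; "wheel/" is the directory passed to
# `setup.py bdist_wheel -d ./wheel/`.
for whl in Path("wheel").glob("*-linux_x86_64.whl"):
    new_name = whl.name.replace("-linux_x86_64", "-manylinux2014_x86_64")
    whl.rename(whl.with_name(new_name))
```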
71 changes: 71 additions & 0 deletions .github/workflows/release.yml
@@ -0,0 +1,71 @@
name: Publish release in Github

on:
push:
tags:
- "v*.*.*"

jobs:
build:
runs-on: ubuntu-latest
strategy:
matrix:
python-version: ['36', '37', '38', '39']

steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Login to DockerHub
uses: docker/login-action@v2
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_TOKEN }}

- name: Pull Docker image
run: docker pull maydomine/bmtrain-manylinux:cu110

- name: Run Docker image and execute script
run: |
version=${{ matrix.python-version }}
docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done"

- name: Archive distribution files
uses: actions/upload-artifact@v2
with:
name: dist
path: |
sdist/*.tar.gz
wheel/*.whl

publish:
needs: build
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v3

- name: Set Up the Python
uses: actions/setup-python@v2
with:
python-version: 3.9

- name: Download distribution files
uses: actions/download-artifact@v2
with:
name: dist
path: dist

- name: Upload Distribution Files
uses: softprops/action-gh-release@v1
with:
body_path: "Release.txt"
files: |
dist/sdist/*.tar.gz
dist/wheel/*.whl
prerelease: false
token: ${{ secrets.RELEASE_TOKEN }}
release_tag: ${{ steps.create_release.outputs.tag }}
github_token: ${{ secrets.GITHUB_TOKEN }}
env:
GITHUB_REPOSITORY: MayDomine/BMTrain
62 changes: 62 additions & 0 deletions CMakeLists.txt
@@ -0,0 +1,62 @@
cmake_minimum_required(VERSION 3.18)
project(bmtrain)
enable_language(C)
enable_language(CXX)
set(CMAKE_CUDA_ARCHITECTURES "61;62;70;72;75;80")
enable_language(CUDA)
set(CMAKE_CXX_STANDARD 14)
set(CMAKE_CXX_STANDARD_REQUIRED True)
set(CMAKE_CUDA_STANDARD 14)
set(CMAKE_CUDA_STANDARD_REQUIRED True)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80")

if(NOT DEFINED ENV{BUILD_DOCKER_ENV} OR "$ENV{BUILD_DOCKER_ENV}" STREQUAL "0")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86")
endif()

set(CMAKE_BUILD_RPATH $ORIGIN)
set(CMAKE_INSTALL_RPATH $ORIGIN)
set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/)

find_package(NCCL REQUIRED)
find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module REQUIRED)
message (STATUS "Python_EXECUTABLE: ${Python_EXECUTABLE}")
execute_process(COMMAND ${Python_EXECUTABLE} "-c"
"import pybind11; print(pybind11.get_cmake_dir())"
OUTPUT_VARIABLE PYBIND11_CMAKE_DIR
OUTPUT_STRIP_TRAILING_WHITESPACE)
message (STATUS "PYBIND11_CMAKE_DIR: ${PYBIND11_CMAKE_DIR}")
list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR})
find_package(pybind11 REQUIRED)

message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}")

file(GLOB_RECURSE SOURCES "csrc/*.cpp")
file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu")

set(AVX_FLAGS "${AVX_FLAGS} -march=native")

pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES})

set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")

target_link_libraries(C PRIVATE
"-Wl,-Bsymbolic"
"-Wl,-Bsymbolic-functions"
${NCCL_LIBRARIES}
)
target_include_directories(C PRIVATE ${NCCL_INCLUDE_DIRS})
target_compile_definitions(C
PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO})

set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80")

target_include_directories(C
PRIVATE "csrc/include"
PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
)
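The pybind11 CMake config is located by asking the build interpreter for `pybind11.get_cmake_dir()` and appending the result to `CMAKE_PREFIX_PATH`, which is what lets `find_package(pybind11 REQUIRED)` succeed without a system-wide pybind11 install. The same query outside CMake, assuming `pybind11` is installed for the interpreter that CMake resolves:

```python
import pybind11

# Directory containing pybind11Config.cmake; CMakeLists.txt appends this to
# CMAKE_PREFIX_PATH so find_package(pybind11) can locate it.
print(pybind11.get_cmake_dir())
```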



3 changes: 3 additions & 0 deletions Release.txt
@@ -0,0 +1,3 @@
# BMTrain New Version Release v0.2.3
- easier to install (without torch dependency while compiling)
- compatible with torch 2.0
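With compilation handled by CMake/pybind11 and prebuilt manylinux wheels published by the workflows above, installing the release should no longer require torch at build time; in most environments it presumably reduces to `pip install bmtrain` (assuming the published wheels cover the target Python and CUDA combination), with torch installed separately.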
11 changes: 9 additions & 2 deletions bmtrain/block_layer.py
@@ -9,6 +9,7 @@
from .checkpointing import ScopedTensorInspectorContext
from . import debug
import copy
import inspect


# this flag controls the ZeRO level: 0 means normal ZeRO-3, 1 means forward without releasing parameters, 2 means backward without gathering parameters
@@ -491,6 +492,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
if key in state_dict:
# load here
input_param = state_dict[key]
if input_param.__class__.__name__ == "DistributedTensorWrapper":
input_param = input_param.broadcast()
if input_param.shape != it["shape"]:
error_msgs.append('size mismatch for {}: copying a param with shape {} from checkpoint, '
'the shape in current model is {}.'
@@ -617,10 +620,14 @@ def init_parameters(self):
torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:]
del tmp_tensor

def _named_members(self, get_members_fn, prefix='', recurse=True):
def _named_members(self, get_members_fn, prefix='', recurse=True, **kwargs):
r"""Helper method for yielding various names + members of modules."""
return self._module._named_members(get_members_fn, prefix, recurse)

# compatibility with torch 2.0
if "remove_duplicate" in inspect.signature(torch.nn.Module._named_members).parameters and "remove_duplicate" not in kwargs:
kwargs['remove_duplicate'] = True
return self._module._named_members(get_members_fn, prefix, recurse, **kwargs)

def named_modules(self, memo = None, prefix: str = '', remove_duplicate: bool = True):
r"""Returns an iterator over all modules in the network, yielding
both the name of the module as well as the module itself.
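The `_named_members` override now forwards `**kwargs` and injects `remove_duplicate=True` only when the installed torch accepts it, which is what keeps the wrapper working on both torch 1.x and 2.0. A minimal sketch of the same signature-probing pattern, standalone rather than BMTrain's actual helper:

```python
import inspect
import torch

def named_members_kwargs(**kwargs):
    # torch 2.0 added a `remove_duplicate` parameter to Module._named_members;
    # probe the signature so the same call also works on older torch builds.
    params = inspect.signature(torch.nn.Module._named_members).parameters
    if "remove_duplicate" in params and "remove_duplicate" not in kwargs:
        kwargs["remove_duplicate"] = True
    return kwargs
```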
13 changes: 10 additions & 3 deletions bmtrain/distributed/ops.py
@@ -20,22 +20,29 @@
def send_activations(hidden_state, next_rank, comm):
send_meta(hidden_state, next_rank, comm)
ncclSend(hidden_state.storage(), next_rank, comm)

def recv_activations(prev_rank, comm):
dtype, shape = recv_meta(prev_rank, comm)
hidden_state = torch.empty(shape, dtype=dtype, device="cuda")
ncclRecv(hidden_state.storage(), prev_rank, comm)
return hidden_state

def send_meta(x, next_rank, comm):
meta = [len(x.size()), DTYPE_LIST.index(x.dtype)] + list(x.size())
meta_data = torch.tensor(data=meta, device=x.device, dtype=torch.long)
meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int)
meta_data[0] = len(x.size())
meta_data[1] = DTYPE_LIST.index(x.dtype)
meta_data[2:len(x.size())+2] = torch.tensor(x.size(), device="cuda", dtype=torch.int)
meta_data = meta_data.contiguous()
ncclSend(meta_data.storage(), next_rank, comm)

def recv_meta(prev_rank, comm):
meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.long)
meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int)
ncclRecv(meta_data.storage(), prev_rank, comm)
n_dims = meta_data[0].item()
dtype = DTYPE_LIST[meta_data[1].item()]
shape = meta_data[2:n_dims+2].tolist()
return dtype,shape

class OpBroadcast(torch.autograd.Function):
@staticmethod
def forward(ctx, src, root, comm = None):
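`send_meta`/`recv_meta` now exchange a fixed-length int32 buffer (50 entries) holding the dimension count, a dtype index, and the shape, so the receiver can post a receive of known size before the tensor itself arrives. A rough sketch of the pack/unpack halves without the NCCL calls; the dtype table here is an illustrative subset, not the module's actual DTYPE_LIST:

```python
import torch

DTYPE_LIST = [torch.float16, torch.float32, torch.int32, torch.int64]  # illustrative subset
META_LEN = 50  # fixed size, so the receiver always knows how many ints to expect

def pack_meta(x: torch.Tensor) -> torch.Tensor:
    meta = torch.zeros(META_LEN, dtype=torch.int, device=x.device)
    meta[0] = x.dim()                    # number of dimensions
    meta[1] = DTYPE_LIST.index(x.dtype)  # dtype encoded as an index
    meta[2:x.dim() + 2] = torch.tensor(x.size(), dtype=torch.int, device=x.device)
    return meta

def unpack_meta(meta: torch.Tensor):
    n_dims = int(meta[0])
    dtype = DTYPE_LIST[int(meta[1])]
    shape = meta[2:n_dims + 2].tolist()
    return dtype, shape
```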
2 changes: 2 additions & 0 deletions bmtrain/init.py
@@ -149,11 +149,13 @@ def __init__(self,config):
self.prev_rank = self.stage_id-1 if self.stage_id > 0 else -1
self.tails = self.pp_group[self.pipe_idx, self.stage_id:].tolist()
self.heads = self.pp_group[self.pipe_idx, :self.stage_id + 1].tolist()

def get_group_id(self,group_name):
if group_name == "pipe":
return self.pipe_idx
elif group_name == "zero":
return self.zero_idx

def get_group_rank(self,group_name):
if group_name == "pipe":
return self.stage_id
2 changes: 2 additions & 0 deletions bmtrain/layer.py
@@ -82,6 +82,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
key = prefix + name
if key in state_dict:
input_param = state_dict[key]
if input_param.__class__.__name__ == "DistributedTensorWrapper":
input_param = input_param.broadcast()
# This is used to avoid copying uninitialized parameters into
# non-lazy modules, since they dont have the hook to do the checks
# in such case, it will error when accessing the .shape attribute.
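Both `Layer._load_from_state_dict` here and the block-layer version above use the same duck-typed guard: if a checkpoint entry is a `DistributedTensorWrapper`, it is materialized with `.broadcast()` before the shape check and copy, presumably so checkpoint data loaded on a single rank becomes an ordinary tensor on every rank. A condensed sketch of that guard (the wrapper class itself lives elsewhere in BMTrain and is not shown in this diff):

```python
def materialize(param):
    # DistributedTensorWrapper entries are converted to plain tensors via
    # broadcast() before being compared and copied into the module.
    if param.__class__.__name__ == "DistributedTensorWrapper":
        param = param.broadcast()
    return param
```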