From c8e465d059be925d1dc274c4bce5d68f3622b5f1 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 29 May 2023 18:06:47 +0800 Subject: [PATCH 01/13] modify the version check to make it work with torch2.0 --- bmtrain/optim/adam.py | 4 ++-- bmtrain/optim/adam_offload.py | 4 ++-- bmtrain/utils.py | 12 ++++++++++++ 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index 689b151e..dda46f18 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -4,7 +4,7 @@ from . import _cuda as C from .. import nccl import inspect - +from ..utils import check_torch_version from copy import deepcopy from itertools import chain from collections import defaultdict @@ -110,7 +110,7 @@ def step(self, closure=None, scale=1): [state['exp_avg']], [state["exp_avg_sq"]], [], - [state["step"]] if int(torch.__version__.split('.')[1]) < 12 + [state["step"]] if check_torch_version("1.12.0") < 0 else [torch.tensor(state["step"])], amsgrad=False, beta1=group['betas'][0], diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index 1b337268..7e80b866 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -5,7 +5,7 @@ from .. import nccl import torch.optim._functional as F import inspect - +from ..utils import check_torch_version from copy import deepcopy from itertools import chain from collections import defaultdict @@ -136,7 +136,7 @@ def step(self, closure=None, scale=1): [state["exp_avg"]], [state["exp_avg_sq"]], [], - [state["step"]] if int(torch.__version__.split('.')[1]) < 12 + [state["step"]] if check_torch_version("1.12.0") < 0 else [torch.tensor(state["step"])], amsgrad=False, beta1=beta1, diff --git a/bmtrain/utils.py b/bmtrain/utils.py index 2df9a3a5..391b5a8f 100644 --- a/bmtrain/utils.py +++ b/bmtrain/utils.py @@ -5,6 +5,18 @@ ALIGN = 4 ROW_WIDTH = 60 +def check_torch_version(version_str): + """ + Checks if the current torch version is greater than or equal to the given version. + version_str (str): The version to compare with, in the format of "x.y.z" ,and the func will convert it into a int value of x*100+y*10+z. 
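+    For example, "1.12.0" is packed as 1*10000 + 12*100 + 0 = 11200, matching the computation below.
+    Returns the difference (current - given): a negative value means the installed torch is older than version_str, e.g. check_torch_version("1.12.0") < 0 on torch 1.11.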
+ """ + version_int_arr = [int(v) for v in version_str.split(".")] + + version_int = version_int_arr[0] * 10000 + version_int_arr[1] * 100 + version_int_arr[2] + torch_version = torch.__version__.split("+")[0] + current_version_int_arr = [int(v) for v in torch_version.split(".")] + current_version_int = current_version_int_arr[0] * 10000 + current_version_int_arr[1] * 100 + current_version_int_arr[2] + return current_version_int - version_int def round_up(x, d): return (x + d - 1) // d * d From 41612bc9420dc157f7e2215abcd3dc8aca137bba Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 31 May 2023 20:13:27 +0800 Subject: [PATCH 02/13] torch 2.0 named_members interface && fix pipeline communication bug --- bmtrain/block_layer.py | 2 +- bmtrain/distributed/ops.py | 13 ++++++++++--- bmtrain/init.py | 2 ++ 3 files changed, 13 insertions(+), 4 deletions(-) diff --git a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 5269673b..2210c297 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -617,7 +617,7 @@ def init_parameters(self): torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:] del tmp_tensor - def _named_members(self, get_members_fn, prefix='', recurse=True): + def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate=True): r"""Helper method for yielding various names + members of modules.""" return self._module._named_members(get_members_fn, prefix, recurse) diff --git a/bmtrain/distributed/ops.py b/bmtrain/distributed/ops.py index 6124622d..ef69659a 100644 --- a/bmtrain/distributed/ops.py +++ b/bmtrain/distributed/ops.py @@ -20,22 +20,29 @@ def send_activations(hidden_state, next_rank, comm): send_meta(hidden_state, next_rank, comm) ncclSend(hidden_state.storage(), next_rank, comm) + def recv_activations(prev_rank, comm): dtype, shape = recv_meta(prev_rank, comm) hidden_state = torch.empty(shape, dtype=dtype, device="cuda") ncclRecv(hidden_state.storage(), prev_rank, comm) return hidden_state + def send_meta(x, next_rank, comm): - meta = [len(x.size()), DTYPE_LIST.index(x.dtype)] + list(x.size()) - meta_data = torch.tensor(data=meta, device=x.device, dtype=torch.long) + meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int) + meta_data[0] = len(x.size()) + meta_data[1] = DTYPE_LIST.index(x.dtype) + meta_data[2:len(x.size())+2] = torch.tensor(x.size(), device="cuda", dtype=torch.int) + meta_data = meta_data.contiguous() ncclSend(meta_data.storage(), next_rank, comm) + def recv_meta(prev_rank, comm): - meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.long) + meta_data = torch.tensor(data=[0]*50, device="cuda", dtype=torch.int) ncclRecv(meta_data.storage(), prev_rank, comm) n_dims = meta_data[0].item() dtype = DTYPE_LIST[meta_data[1].item()] shape = meta_data[2:n_dims+2].tolist() return dtype,shape + class OpBroadcast(torch.autograd.Function): @staticmethod def forward(ctx, src, root, comm = None): diff --git a/bmtrain/init.py b/bmtrain/init.py index b98064b7..5c3006d2 100644 --- a/bmtrain/init.py +++ b/bmtrain/init.py @@ -149,11 +149,13 @@ def __init__(self,config): self.prev_rank = self.stage_id-1 if self.stage_id > 0 else -1 self.tails = self.pp_group[self.pipe_idx, self.stage_id:].tolist() self.heads = self.pp_group[self.pipe_idx, :self.stage_id + 1].tolist() + def get_group_id(self,group_name): if group_name == "pipe": return self.pipe_idx elif group_name == "zero": return self.zero_idx + def get_group_rank(self,group_name): 
if group_name == "pipe": return self.stage_id From bb7381e8c757e2404dae01dd3a4235a640fad4b4 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Thu, 1 Jun 2023 14:21:41 +0800 Subject: [PATCH 03/13] nccl header update && nccl cpp extension has no denpendcy of torch now --- csrc/include/nccl.h | 55 ++++++++-- csrc/include/nccl_net.h | 217 +++++++++++++++++++++++++++++++++++++--- csrc/nccl.cpp | 6 +- setup.py | 146 ++++++++++++++++++++++++++- 4 files changed, 397 insertions(+), 27 deletions(-) diff --git a/csrc/include/nccl.h b/csrc/include/nccl.h index 4bdda76a..90f721c7 100644 --- a/csrc/include/nccl.h +++ b/csrc/include/nccl.h @@ -14,11 +14,11 @@ #endif #define NCCL_MAJOR 2 -#define NCCL_MINOR 11 -#define NCCL_PATCH 4 +#define NCCL_MINOR 14 +#define NCCL_PATCH 3 #define NCCL_SUFFIX "" -#define NCCL_VERSION_CODE 21104 +#define NCCL_VERSION_CODE 21403 #define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) #ifdef __cplusplus @@ -38,7 +38,29 @@ typedef enum { ncclSuccess = 0, ncclInternalError = 3, ncclInvalidArgument = 4, ncclInvalidUsage = 5, - ncclNumResults = 6 } ncclResult_t; + ncclRemoteError = 6, + ncclInProgress = 7, + ncclNumResults = 8 } ncclResult_t; + +/* Communicator configuration. Users can assign value to attributes to specify the + * behavior of a communicator. */ +typedef struct ncclConfig_v21400 { + /* attributes that users should never touch. */ + size_t size; + unsigned int magic; + unsigned int version; + /* attributes that users are able to customize. */ + int blocking; +} ncclConfig_t; + +/* Config initializer must be assigned to initialize config structure when it is created. + * Not initialized config will result in NCCL error. */ +#define NCCL_CONFIG_INITIALIZER { \ + sizeof(ncclConfig_t), /* size */ \ + 0xcafebeef, /* magic */ \ + NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ + 1 /* blocking */ \ +} /* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. * This integer is coded with the MAJOR, MINOR and PATCH level of the @@ -53,6 +75,11 @@ ncclResult_t pncclGetVersion(int *version); ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); +/* Create a new communicator (multi thread/process version) with a configuration + * set by users. */ +ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); +ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); + /* Creates a new communicator (multi thread/process version). * rank must be between 0 and nranks-1 and unique within a communicator clique. * Each rank is associated to a CUDA device, which has to be set before calling @@ -71,8 +98,15 @@ ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -/* Frees resources associated with communicator object, but waits for any operations - * that might still be running on the device. */ +/* Finalize a communicator. ncclCommFinalize flushes all issued communications, + * and marks communicator state as ncclInProgress. 
The state will change to ncclSuccess + * when the communicator is globally quiescent and related resources are freed; then, + * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator + * itself) without blocking. */ +ncclResult_t ncclCommFinalize(ncclComm_t comm); +ncclResult_t pncclCommFinalize(ncclComm_t comm); + +/* Frees local resources associated with communicator object. */ ncclResult_t ncclCommDestroy(ncclComm_t comm); ncclResult_t pncclCommDestroy(ncclComm_t comm); @@ -81,10 +115,16 @@ ncclResult_t pncclCommDestroy(ncclComm_t comm); ncclResult_t ncclCommAbort(ncclComm_t comm); ncclResult_t pncclCommAbort(ncclComm_t comm); -/* Returns a human-readable error message. */ +/* Returns a string for each error code. */ const char* ncclGetErrorString(ncclResult_t result); const char* pncclGetErrorString(ncclResult_t result); +/* Returns a human-readable message of the last error that occurred. + * comm is currently unused and can be set to NULL + */ +const char* ncclGetLastError(ncclComm_t comm); +const char* pncclGetError(ncclComm_t comm); + /* Checks whether the comm has encountered any asynchronous errors */ ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); @@ -351,7 +391,6 @@ ncclResult_t pncclGroupStart(); ncclResult_t ncclGroupEnd(); ncclResult_t pncclGroupEnd(); - #ifdef __cplusplus } // end extern "C" #endif diff --git a/csrc/include/nccl_net.h b/csrc/include/nccl_net.h index 389c1eaa..255a44ee 100644 --- a/csrc/include/nccl_net.h +++ b/csrc/include/nccl_net.h @@ -1,5 +1,5 @@ /************************************************************************* - * Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. * * See LICENSE.txt for license information ************************************************************************/ @@ -10,16 +10,17 @@ #include "nccl.h" #include -#define NCCL_NET_HANDLE_MAXSIZE 64 +#define NCCL_NET_HANDLE_MAXSIZE 128 #define NCCL_PTR_HOST 0x1 #define NCCL_PTR_CUDA 0x2 +#define NCCL_PTR_DMABUF 0x4 // Maximum number of requests per comm object #define NCCL_NET_MAX_REQUESTS 8 typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_ALL=~0} ncclDebugLogSubSys; +typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); @@ -28,14 +29,209 @@ typedef struct { char* pciPath; // Path to the PCI device in /sys. uint64_t guid; // Unique identifier for the NIC chip. Important for // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] int speed; // Port speed in Mbps. int port; // Port number. + float latency; // Network latency int maxComms; // Maximum number of comms we can create -}ncclNetProperties_v4_t; + int maxRecvs; // Maximum number of grouped receives. 
+}ncclNetProperties_v6_t; + +typedef ncclNetProperties_v6_t ncclNetProperties_t; + +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v6_t; + +typedef ncclNet_v6_t ncclNet_t; + +#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 + +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. 
The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + /* DMA-BUF support */ + ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v6_t; -typedef ncclNetProperties_v4_t ncclNetProperties_t; +typedef ncclCollNet_v6_t ncclCollNet_t; +#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 + +// v5 struct for backwards compatibility +typedef struct { + // Name of the network (mainly for logs) + const char* name; + // Initialize the network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create a connection. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Connect to a handle and return a sending comm object for that peer. + // This call must not block for the connection to be established, and instead + // should return successfully with sendComm == NULL with the expectation that + // it will be called again until sendComm != NULL. + ncclResult_t (*connect)(int dev, void* handle, void** sendComm); + // Finalize connection establishment after remote peer has called connect. + // This call must not block for the connection to be established, and instead + // should return successfully with recvComm == NULL with the expectation that + // it will be called again until recvComm != NULL. + ncclResult_t (*accept)(void* listenComm, void** recvComm); + // Register/Deregister memory. Comm can be either a sendComm or a recvComm. + // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. 
+ ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* comm, void* mhandle); + // Asynchronous send to a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); + // Asynchronous recv from a peer. + // May return request == NULL if the call cannot be performed (or would block) + ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. + ncclResult_t (*test)(void* request, int* done, int* sizes); + // Close and free send/recv comm objects + ncclResult_t (*closeSend)(void* sendComm); + ncclResult_t (*closeRecv)(void* recvComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclNet_v5_t; + +// v5 struct for backwards compatibility +typedef struct { + // Name of the collective network (mainly for logs) + const char* name; + // Initialize the collective network. + ncclResult_t (*init)(ncclDebugLogger_t logFunction); + // Return the number of adapters capable of doing collective operations. + // If ndev returns 0, all other functions might be set to NULL. + ncclResult_t (*devices)(int* ndev); + // Get various device properties. + ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); + // Create a receiving object and provide a handle to connect to it. The + // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged + // between ranks to create connections. + ncclResult_t (*listen)(int dev, void* handle, void** listenComm); + // Create a group for collective operations. handles have been created + // using listen() above. rank indicates caller's rank in the collective network. + ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); + // Returns whether a reduction operation on a data type is supported. + // 1 for supported, 0 otherwise. + ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); + // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. + ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); + ncclResult_t (*deregMr)(void* collComm, void* mhandle); + // Performs an asynchronous allreduce operation on the collective group. + // May return request == NULL if the call cannot be performed (or would block). + ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, + ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); + // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is + // visible to the GPU + ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); + // Test whether a request is complete. If size is not NULL, it returns the + // number of bytes sent/received. 
+ ncclResult_t (*test)(void* request, int* done, int* size); + // Close and free collective comm objects + ncclResult_t (*closeColl)(void* collComm); + ncclResult_t (*closeListen)(void* listenComm); +} ncclCollNet_v5_t; + +// v4 struct for backwards compatibility +typedef struct { + char* name; // Used mostly for logging. + char* pciPath; // Path to the PCI device in /sys. + uint64_t guid; // Unique identifier for the NIC chip. Important for + // cards with multiple PCI functions (Physical or virtual). + int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA + int speed; // Port speed in Mbps. + int port; // Port number. + int maxComms; // Maximum number of comms we can create +} ncclNetProperties_v4_t; + +// v4 struct for backwards compatibility typedef struct { // Name of the network (mainly for logs) const char* name; @@ -75,10 +271,7 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclNet_v4_t; -typedef ncclNet_v4_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v4 - +// v4 struct for backwards compatibility typedef struct { // Name of the collective network (mainly for logs) const char* name; @@ -117,8 +310,4 @@ typedef struct { ncclResult_t (*closeListen)(void* listenComm); } ncclCollNet_v4_t; -typedef ncclCollNet_v4_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v4 - #endif // end include guard diff --git a/csrc/nccl.cpp b/csrc/nccl.cpp index 2d0ac8de..c1dbb321 100644 --- a/csrc/nccl.cpp +++ b/csrc/nccl.cpp @@ -1,7 +1,9 @@ +#include +#include +#include -#include +namespace py = pybind11; #include "include/nccl.h" -#include void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; diff --git a/setup.py b/setup.py index 7c29a10f..b7411f4c 100644 --- a/setup.py +++ b/setup.py @@ -1,7 +1,147 @@ from setuptools import setup, find_packages import torch -from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CppExtension +from setuptools import Extension +from setuptools.command.build_ext import build_ext import os +import os +import setuptools +import subprocess +from pkg_resources import packaging +from typing import Dict, List, Optional, Union, Tuple +def CppExtension(name, sources, *args, **kwargs): + include_dirs = kwargs.get('include_dirs', []) + include_dirs += include_paths() + kwargs['include_dirs'] = include_dirs + + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths() + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + kwargs['libraries'] = libraries + + kwargs['language'] = 'c++' + return setuptools.Extension(name, sources, *args, **kwargs) + + +def CUDAExtension(name, sources, *args, **kwargs): + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths(cuda=True) + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + kwargs['libraries'] = libraries + + include_dirs = kwargs.get('include_dirs', []) + + + include_dirs += include_paths(cuda=True) + kwargs['include_dirs'] = include_dirs + + kwargs['language'] = 'c++' + + dlink_libraries = kwargs.get('dlink_libraries', []) + dlink = kwargs.get('dlink', False) or dlink_libraries + if dlink: + extra_compile_args = kwargs.get('extra_compile_args', {}) + + extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) + extra_compile_args_dlink += ['-dlink'] + extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] + extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] + + + extra_compile_args['nvcc_dlink'] 
= extra_compile_args_dlink + + kwargs['extra_compile_args'] = extra_compile_args + + return setuptools.Extension(name, sources, *args, **kwargs) + +def _find_cuda_home() -> Optional[str]: + r'''Finds the CUDA install path.''' + # Guess #1 + cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') + if cuda_home is None: + # Guess #2 + try: + which = 'which' + with open(os.devnull, 'w') as devnull: + nvcc = subprocess.check_output([which, 'nvcc'], + stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n') + cuda_home = os.path.dirname(os.path.dirname(nvcc)) + except Exception: + #Guess #3 + cuda_home = '/usr/local/cuda' + if not os.path.exists(cuda_home): + cuda_home = None + return cuda_home + +CUDA_HOME = _find_cuda_home() +_BMT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) + +def _join_cuda_home(*paths) -> str: + if CUDA_HOME is None: + raise EnvironmentError('CUDA_HOME environment variable is not set. ' + 'Please set it to your CUDA install root.') + return os.path.join(CUDA_HOME, *paths) + +def CppExtension(name, sources, *args, **kwargs): + library_dirs = kwargs.get('library_dirs', []) + library_dirs += library_paths(cuda=True) + kwargs['library_dirs'] = library_dirs + + libraries = kwargs.get('libraries', []) + libraries.append('cudart') + kwargs['libraries'] = libraries + + include_dirs = kwargs.get('include_dirs', []) + + include_dirs += include_paths(cuda=True) + kwargs['include_dirs'] = include_dirs + + kwargs['language'] = 'c++' + + dlink_libraries = kwargs.get('dlink_libraries', []) + dlink = kwargs.get('dlink', False) or dlink_libraries + if dlink: + extra_compile_args = kwargs.get('extra_compile_args', {}) + + extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) + extra_compile_args_dlink += ['-dlink'] + extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] + extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] + + extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink + + kwargs['extra_compile_args'] = extra_compile_args + + return setuptools.Extension(name, sources, *args, **kwargs) + + +def include_paths(cuda: bool = False) -> List[str]: + paths = [ + os.path.join(_BMT_PATH, 'csrc', 'include'), + ] + if cuda: + cuda_home_include = _join_cuda_home('include') + # if we have the Debian/Ubuntu packages for cuda, we get /usr as cuda home. 
+ # but gcc doesn't like having /usr/include passed explicitly + if cuda_home_include != '/usr/include': + paths.append(cuda_home_include) + return paths + + +def library_paths(cuda: bool = False) -> List[str]: + paths = [] + + if cuda: + lib_dir = 'lib64' + if (not os.path.exists(_join_cuda_home(lib_dir)) and + os.path.exists(_join_cuda_home('lib'))): + lib_dir = 'lib' + + paths.append(_join_cuda_home(lib_dir)) + return paths def get_avx_flags(): if os.environ.get("BMT_AVX256", "").lower() in ["1", "true", "on"]: @@ -51,7 +191,7 @@ def get_device_cc(): ext_modules = [ CUDAExtension('bmtrain.nccl._C', [ 'csrc/nccl.cpp', - ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}), + ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}, extra_link_args=["../libnccl_static.a"]), CUDAExtension('bmtrain.optim._cuda', [ 'csrc/adam_cuda.cpp', 'csrc/cuda/adam.cu', @@ -83,6 +223,6 @@ def get_device_cc(): ], ext_modules=ext_modules, cmdclass={ - 'build_ext': BuildExtension + 'build_ext': build_ext }) From 0200974299df4f0df0428efc259ff29215567ad5 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 12 Jun 2023 14:42:02 +0800 Subject: [PATCH 04/13] use cmake to simpify the installition and better support for torch 2.0 --- CMakeLists.txt | 52 ++++ bmtrain/block_layer.py | 9 +- bmtrain/loss/_function.py | 86 ++++++ bmtrain/loss/cross_entropy.py | 11 +- bmtrain/nccl/__init__.py | 2 +- bmtrain/optim/_function.py | 80 ++++++ bmtrain/optim/adam.py | 11 +- bmtrain/optim/adam_offload.py | 10 +- bmtrain/optim/optim_manager.py | 4 +- cmake/FindNCCL.cmake | 75 +++++ csrc/adam_cuda.cpp | 58 ---- csrc/bind.cpp | 25 ++ csrc/cross_entropy_loss.cpp | 98 ------- csrc/cuda/{adam.cu => adam_cuda.cu} | 31 +- csrc/cuda/cross_entropy.cu | 94 +++--- csrc/cuda/has_inf_nan.cu | 23 +- csrc/cuda/reduce.cuh | 2 - csrc/{adam_cpu.cpp => include/adam_cpu.hpp} | 178 ++++++++---- csrc/include/bind.hpp | 55 ++++ csrc/{nccl.cpp => include/nccl.hpp} | 18 +- setup.py | 302 ++++++-------------- 21 files changed, 687 insertions(+), 537 deletions(-) create mode 100644 CMakeLists.txt create mode 100644 bmtrain/loss/_function.py create mode 100644 bmtrain/optim/_function.py create mode 100644 cmake/FindNCCL.cmake delete mode 100644 csrc/adam_cuda.cpp create mode 100644 csrc/bind.cpp delete mode 100644 csrc/cross_entropy_loss.cpp rename csrc/cuda/{adam.cu => adam_cuda.cu} (62%) rename csrc/{adam_cpu.cpp => include/adam_cpu.hpp} (57%) create mode 100644 csrc/include/bind.hpp rename csrc/{nccl.cpp => include/nccl.hpp} (83%) diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 00000000..a00f3f6d --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,52 @@ +cmake_minimum_required(VERSION 3.18) +project(bmtrain) +enable_language(C) +enable_language(CXX) +set(CMAKE_CUDA_ARCHITECTURES "61;62;70;72;75;80") +enable_language(CUDA) + +set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD_REQUIRED True) +set(CMAKE_CUDA_STANDARD 14) +set(CMAKE_CUDA_STANDARD_REQUIRED True) +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86") +set(CMAKE_BUILD_RPATH $ORIGIN) +set(CMAKE_INSTALL_RPATH $ORIGIN) +set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) + +find_package(NCCL REQUIRED) +find_package(Python ${PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module REQUIRED) 
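+# Note: PYTHON_VERSION is assumed to be supplied by the build driver (e.g.
+# `cmake -DPYTHON_VERSION=3.10 ...` invoked from setup.py), so the module is
+# built against the interpreter that will import it; pybind11 only needs to be
+# pip-installed into that same environment, since its CMake dir is queried below.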
+message (STATUS "Python_EXECUTABLE: ${Python_EXECUTABLE}") +execute_process(COMMAND ${Python_EXECUTABLE} "-c" + "import pybind11; print(pybind11.get_cmake_dir())" + OUTPUT_VARIABLE PYBIND11_CMAKE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE) +message (STATUS "PYBIND11_CMAKE_DIR: ${PYBIND11_CMAKE_DIR}") +list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR}) +find_package(pybind11 REQUIRED) + +message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") + +file(GLOB_RECURSE SOURCES "csrc/*.cpp") +file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") + +pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) +target_link_libraries(C PRIVATE + "-Wl,-Bsymbolic" + "-Wl,-Bsymbolic-functions" + ${NCCL_LIBRARIES} +) +target_include_directories(C PRIVATE ${NCCL_INCLUDE_DIRS}) +target_compile_definitions(C + PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO}) + +set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80;86") + +target_include_directories(C + PRIVATE "csrc/include" + PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} +) + +#STATIC ${bmengine_STATIC_LIBRARIES} + + diff --git a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 2210c297..652e085a 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -9,6 +9,7 @@ from .checkpointing import ScopedTensorInspectorContext from . import debug import copy +import inspect # the flag is used to control the zero level , 0 means normal zero3 , 1 means forward without release parameter ,2 means backward without gather parameter @@ -617,10 +618,14 @@ def init_parameters(self): torch.tensor([], dtype=d_dtype, device=d_device).set_(tmp_tensor.storage(), offset_st, (offset_end - offset_st,))[:] del tmp_tensor - def _named_members(self, get_members_fn, prefix='', recurse=True, remove_duplicate=True): + def _named_members(self, get_members_fn, prefix='', recurse=True, **kwargs): r"""Helper method for yielding various names + members of modules.""" - return self._module._named_members(get_members_fn, prefix, recurse) + #compitibity with torch 2.0 + if "remove_duplicate" in inspect.signature(torch.nn.Module._named_members).parameters and "remove_duplicate" not in kwargs: + kwargs['remove_duplicate'] = True + return self._module._named_members(get_members_fn, prefix, recurse, **kwargs) + def named_modules(self, memo = None, prefix: str = '', remove_duplicate: bool = True): r"""Returns an iterator over all modules in the network, yielding both the name of the module as well as the module itself. diff --git a/bmtrain/loss/_function.py b/bmtrain/loss/_function.py new file mode 100644 index 00000000..658ef242 --- /dev/null +++ b/bmtrain/loss/_function.py @@ -0,0 +1,86 @@ + +from .. 
import C +import torch +CHECK_INPUT = lambda x: x.is_contiguous() and x.is_cuda +def has_inf_nan(g_fp16: torch.Tensor, out: torch.Tensor) -> None: + assert g_fp16.dtype == torch.float16, "g_fp16 must be a half tensor" + assert out.dtype == torch.uint8, "out must be a uint8 tensor" + assert CHECK_INPUT(g_fp16), "g_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(out), "out must be contiguous and on cuda" + mid = torch.zeros(1024, device=out.device, dtype=out.dtype) + stream = torch.cuda.current_stream().cuda_stream + C.has_nan_inf_launcher(g_fp16.numel(), g_fp16.data_ptr(), mid.data_ptr(), out.data_ptr(), stream) + + + +def cross_entropy_forward(m: int, n: int, input: torch.Tensor, target: torch.Tensor, + softmax: torch.Tensor, output: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(input) + CHECK_INPUT(target) + CHECK_INPUT(softmax) + CHECK_INPUT(output) + assert input.dtype == torch.float16, "input must be a half tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert softmax.dtype == torch.float16, "softmax must be a half tensor" + assert output.dtype == torch.float32, "output must be a float tensor" + assert input.numel() == softmax.numel(), "input and softmax must have the same number of elements" + assert target.numel() == output.numel(), "target and output must have the same number of elements" + input_ptr = input.data_ptr() + target_ptr = target.data_ptr() + softmax_ptr = softmax.data_ptr() + output_ptr = output.data_ptr() + cuda_stream = torch.cuda.current_stream().cuda_stream + C.cross_entropy_forward_launcher(m, n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index, cuda_stream) + +def cross_entropy_backward(m: int, n: int, grad_output: torch.Tensor, target: torch.Tensor, + softmax: torch.Tensor, grad_input: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(grad_output) + CHECK_INPUT(target) + CHECK_INPUT(softmax) + CHECK_INPUT(grad_input) + assert grad_output.dtype == torch.float32, "grad_output must be a float tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert softmax.dtype == torch.float16, "softmax must be a half tensor" + assert grad_input.dtype == torch.float16, "grad_input must be a half tensor" + assert grad_input.numel() == softmax.numel(), "grad_input and softmax must have the same number of elements" + assert target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements" + grad_output_ptr = grad_output.data_ptr() + target_ptr = target.data_ptr() + softmax_ptr = softmax.data_ptr() + grad_input_ptr = grad_input.data_ptr() + cuda_stream = torch.cuda.current_stream().cuda_stream + C.cross_entropy_backward_launcher(m, n, grad_output_ptr, target_ptr, softmax_ptr, grad_input_ptr, ignore_index, cuda_stream) + +def cross_entropy_forward_inplace(m: int, n: int, x: torch.Tensor, target: torch.Tensor, + output: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(x) + CHECK_INPUT(target) + CHECK_INPUT(output) + assert x.dtype == torch.float16, "x must be a half tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert output.dtype == torch.float32, "output must be a float tensor" + assert target.numel() == output.numel(), "target and output must have the same number of elements" + cuda_stream = torch.cuda.current_stream().cuda_stream + x_ptr = x.data_ptr() + output_ptr = output.data_ptr() + target_ptr = target.data_ptr() + output_ptr = output.data_ptr() + + C.cross_entropy_forward_inplace_launcher(m, n, x_ptr, 
target_ptr, output_ptr, ignore_index, cuda_stream) + +def cross_entropy_backward_inplace(m: int, n: int, grad_output: torch.Tensor, target: torch.Tensor, + x: torch.Tensor, ignore_index: int) -> None: + CHECK_INPUT(grad_output) + CHECK_INPUT(target) + CHECK_INPUT(x) + assert grad_output.dtype == torch.float32, "grad_output must be a float tensor" + assert target.dtype == torch.int32, "target must be an int tensor" + assert x.dtype == torch.float16, "x must be a half tensor" + assert target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements" + cuda_stream = torch.cuda.current_stream().cuda_stream + grad_output_ptr = grad_output.data_ptr() + target_ptr = target.data_ptr() + x_ptr = x.data_ptr() + + C.cross_entropy_backward_inplace_launcher(m, n, grad_output_ptr, target_ptr, x_ptr, ignore_index, cuda_stream) + diff --git a/bmtrain/loss/cross_entropy.py b/bmtrain/loss/cross_entropy.py index 1fb37181..160ef421 100644 --- a/bmtrain/loss/cross_entropy.py +++ b/bmtrain/loss/cross_entropy.py @@ -1,7 +1,6 @@ from typing import Optional import torch -from . import _cuda as C - +from . import _function as F class OpFusedCrossEntropy(torch.autograd.Function): """ CrossEntropy dim = 1 @@ -11,7 +10,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): assert x.ndim == 2 softmax = torch.empty(x.size(), device=x.device, dtype=x.dtype) out = torch.empty(x.size(0), device=x.device, dtype=torch.float) - C.f_cross_entropy_forward( + F.cross_entropy_forward( x.size(0), x.size(1), x, target, softmax, out, @@ -25,7 +24,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): def backward(ctx, grad_output : torch.Tensor): grad_output = grad_output.contiguous() softmax, target = ctx.saved_tensors - C.f_cross_entropy_backward_inplace( + F.cross_entropy_backward_inplace( softmax.size(0), softmax.size(1), grad_output, target, softmax, @@ -41,7 +40,7 @@ class OpFusedCrossEntropyInplace(torch.autograd.Function): def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): assert x.ndim == 2 out = torch.empty(x.size(0), device=x.device, dtype=torch.float) - C.f_cross_entropy_forward_inplace( + F.cross_entropy_forward_inplace( x.size(0), x.size(1), x, target, out, @@ -55,7 +54,7 @@ def forward(ctx, x : torch.Tensor, target : torch.Tensor, ignore_index: int): def backward(ctx, grad_output : torch.Tensor): grad_output = grad_output.contiguous() softmax, target = ctx.saved_tensors - C.f_cross_entropy_backward_inplace( + F.cross_entropy_backward_inplace( softmax.size(0), softmax.size(1), grad_output, target, softmax, diff --git a/bmtrain/nccl/__init__.py b/bmtrain/nccl/__init__.py index b6b2de91..0f4129d5 100644 --- a/bmtrain/nccl/__init__.py +++ b/bmtrain/nccl/__init__.py @@ -1,7 +1,7 @@ from typing_extensions import Literal import torch -from . import _C as C +from .. import C from .enums import * class NCCLCommunicator: diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py new file mode 100644 index 00000000..342998fa --- /dev/null +++ b/bmtrain/optim/_function.py @@ -0,0 +1,80 @@ + +from .. 
import C +import torch +CHECK_INPUT = lambda x: x.is_contiguous() and x.is_cuda +def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tensor, m_fp32: torch.Tensor, + v_fp32: torch.Tensor, beta1: float, beta2: float, eps: float, lr: float, scale: float, + weight_decay: float, step: int) -> None: + assert param_fp32.is_contiguous(), "param_fp32 must be contiguous" + assert param_fp16.is_contiguous(), "param_fp16 must be contiguous" + assert g_fp16.is_contiguous(), "g_fp16 must be contiguous" + assert m_fp32.is_contiguous(), "m_fp32 must be contiguous" + assert v_fp32.is_contiguous(), "v_fp32 must be contiguous" + assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" + assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" + assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" + assert m_fp32.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" + assert param_fp32.device == torch.device("cpu"), "param_fp32 must be a cpu tensor" + assert param_fp16.device == torch.device("cpu"), "param_fp16 must be a cpu tensor" + assert g_fp16.device == torch.device("cpu"), "g_fp16 must be a cpu tensor" + assert m_fp32.device == torch.device("cpu"), "m_fp32 must be a cpu tensor" + assert v_fp32.device == torch.device("cpu"), "v_fp32 must be a cpu tensor" + assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" + assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" + assert param_fp32.numel() == m_fp32.numel(), "param_fp32 and m_fp32 must have the same number of elements" + assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + stream = torch.cuda.current_stream().cuda_stream + C.adam_cpu_launcher( + param_fp32.numel(), + param_fp32.data_ptr(), + param_fp16.data_ptr(), + g_fp16.data_ptr(), + m_fp32.data_ptr(), + v_fp32.data_ptr(), + beta1, beta2, + eps, lr, + scale, + weight_decay, + bias_correction1, + bias_correction2, + stream + ) + +def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tensor, m_fp16: torch.Tensor, + v_fp32: torch.Tensor, beta1: float, beta2: float, eps: float, lr: float, scale: float, + weight_decay: float, step: int) -> None: + assert CHECK_INPUT(param_fp32), "param_fp32 must be contiguous and on cuda" + assert CHECK_INPUT(param_fp16), "param_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(g_fp16), "g_fp16 must be contiguous and on cuda" + assert CHECK_INPUT(m_fp16), "m_fp32 must be contiguous and on cuda" + assert CHECK_INPUT(v_fp32), "v_fp32 must be contiguous and on cuda" + assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" + assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" + assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" + assert m_fp16.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" + assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" + assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" + assert param_fp32.numel() == m_fp16.numel(), "param_fp32 and m_fp32 must have the same 
number of elements" + assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" + bias_correction1 = 1 - beta1 ** step + bias_correction2 = 1 - beta2 ** step + stream = torch.cuda.current_stream().cuda_stream + C.adam_launcher( + param_fp32.numel(), + param_fp32.data_ptr(), + param_fp16.data_ptr(), + g_fp16.data_ptr(), + m_fp16.data_ptr(), + v_fp32.data_ptr(), + beta1, beta2, + eps, lr, + scale, + weight_decay, + bias_correction1, + bias_correction2, + stream + ) \ No newline at end of file diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index dda46f18..4cdb7914 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -1,7 +1,8 @@ import torch from ..global_var import config -import torch.optim._functional as F -from . import _cuda as C +from . import _function as F +import torch.optim._functional +from .. import C from .. import nccl import inspect from ..utils import check_torch_version @@ -87,7 +88,7 @@ def step(self, closure=None, scale=1): grad = p.grad if p.dtype == torch.half: - C.f_adam( + F.adam( state["_param_fp32"], # fp32 p, # fp16 grad, # fp16 @@ -102,9 +103,9 @@ def step(self, closure=None, scale=1): ) else: other_kwargs = {} - if 'maximize' in inspect.signature(F.adam).parameters: + if 'maximize' in inspect.signature(torch.optim._functional.adam).parameters: other_kwargs['maximize'] = False - F.adam( + torch.optim._functional.adam( [p], [grad / scale], [state['exp_avg']], diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index 7e80b866..ecf416f6 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -1,9 +1,7 @@ import torch from ..global_var import config -from . import _cpu as C -from . import _cuda as G +from . import _function as F from .. import nccl -import torch.optim._functional as F import inspect from ..utils import check_torch_version from copy import deepcopy @@ -107,7 +105,7 @@ def step(self, closure=None, scale=1): grad = -state["_grad_fp16"] else: grad = state["_grad_fp16"] - C.f_adam_cpu( + F.adam_cpu( state["_param_fp32"].view(-1), state["_param_fp16"].view(-1), grad.view(-1), @@ -128,9 +126,9 @@ def step(self, closure=None, scale=1): else: grad = state["_grad_fp32"] other_kwargs = {} - if 'maximize' in inspect.signature(F.adam).parameters: + if 'maximize' in inspect.signature(torch.optim._functional.adam).parameters: other_kwargs['maximize'] = False - F.adam( + torch.optim._functional.adam( [state["_param_fp32"]], [grad], [state["exp_avg"]], diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index bb145748..e0943ca0 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -1,7 +1,7 @@ from typing import Optional, Union, List, Dict, Tuple import torch -from . import _cuda as G +from .. import C from ..utils import print_rank from ..lr_scheduler.warmup import WarmupLRScheduler from .. 
import nccl @@ -13,7 +13,7 @@ def check_overflow(param_groups): for group in param_groups: for p in group['params']: if p.grad is not None and p.dtype == torch.half: # TODO support other types - G.f_has_inf_nan(p.grad, has_inf_or_nan) + C.f_has_inf_nan(p.grad, has_inf_or_nan) if "comm" in config: nccl.allReduce(has_inf_or_nan.storage(), has_inf_or_nan.storage(), "max", config["comm"]) diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake new file mode 100644 index 00000000..e6991733 --- /dev/null +++ b/cmake/FindNCCL.cmake @@ -0,0 +1,75 @@ +list(APPEND NCCL_ROOT $ENV{NCCL_ROOT_DIR} ${CUDA_TOOLKIT_ROOT_DIR}) +if(DEFINED ENV{NCCL_ROOT_DIR}) + set(NCCL_ROOT_DIR $ENV{NCCL_ROOT_DIR}) + set(NCCL_INCLUDE_DIR "${NCCL_ROOT_DIR}/include" CACHE PATH "Folder contains NVIDIA NCCL headers") + set(NCCL_LIB_DIR "${NCCL_ROOT_DIR}/lib" CACHE PATH "Folder contains NVIDIA NCCL libraries") +else() + set(NCCL_INCLUDE_DIR $ENV{NCCL_INCLUDE_DIR} CACHE PATH "Folder contains NVIDIA NCCL headers") + set(NCCL_LIB_DIR $ENV{NCCL_LIB_DIR} CACHE PATH "Folder contains NVIDIA NCCL libraries") +endif() + +# Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) + +find_path(NCCL_INCLUDE_DIRS + NAMES nccl.h + HINTS ${NCCL_INCLUDE_DIR}) + +if (USE_STATIC_NCCL) + MESSAGE(STATUS "USE_STATIC_NCCL is set. Linking with static NCCL library.") + SET(NCCL_LIBNAME "nccl_static") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".a.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +else() + SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + endif() +endif() + +find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) + +if(NCCL_FOUND) # obtaining NCCL version and some sanity checks + set (NCCL_HEADER_FILE "${NCCL_INCLUDE_DIRS}/nccl.h") + message (STATUS "Determining NCCL version from ${NCCL_HEADER_FILE}...") + set (OLD_CMAKE_REQUIRED_INCLUDES ${CMAKE_REQUIRED_INCLUDES}) + list (APPEND CMAKE_REQUIRED_INCLUDES ${NCCL_INCLUDE_DIRS}) + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(NCCL_VERSION_CODE nccl.h NCCL_VERSION_DEFINED) + + if (NCCL_VERSION_DEFINED) + set(file "${PROJECT_BINARY_DIR}/detect_nccl_version.cc") + file(WRITE ${file} " + #include + #include + int main() + { + std::cout << NCCL_MAJOR << '.' << NCCL_MINOR << '.' << NCCL_PATCH << std::endl; + int x; + ncclGetVersion(&x); + return x == NCCL_VERSION_CODE; + } +") + try_run(NCCL_VERSION_MATCHED compile_result ${PROJECT_BINARY_DIR} ${file} + RUN_OUTPUT_VARIABLE NCCL_VERSION_FROM_HEADER + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${NCCL_INCLUDE_DIRS}" + LINK_LIBRARIES ${NCCL_LIBRARIES}) + if (NOT NCCL_VERSION_MATCHED) + message(FATAL_ERROR "Found NCCL header version and library version do not match! 
\ +(include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES}) Please set NCCL_INCLUDE_DIR and NCCL_LIB_DIR manually.") + endif() + message(STATUS "NCCL version: ${NCCL_VERSION_FROM_HEADER}") + else() + # message(STATUS "NCCL version < 2.3.5-5") + endif () + set (CMAKE_REQUIRED_INCLUDES ${OLD_CMAKE_REQUIRED_INCLUDES}) + + message(STATUS "Found NCCL (include: ${NCCL_INCLUDE_DIRS}, library: ${NCCL_LIBRARIES})") + mark_as_advanced(NCCL_ROOT_DIR NCCL_INCLUDE_DIRS NCCL_LIBRARIES) +endif() diff --git a/csrc/adam_cuda.cpp b/csrc/adam_cuda.cpp deleted file mode 100644 index 2c102d94..00000000 --- a/csrc/adam_cuda.cpp +++ /dev/null @@ -1,58 +0,0 @@ -#include -#include - -void adam_launcher(const torch::Tensor ¶m_fp32, const torch::Tensor ¶m_fp16, const torch::Tensor &g_fp16, const torch::Tensor &m_fp16, const torch::Tensor &v_fp32, float beta1, float beta2, float eps, float lr, float scale, float weight_decay, float bias_correction1, float bias_correction2); -void has_nan_inf_launcher(const torch::Tensor &g_fp16, torch::Tensor mid, torch::Tensor out); - -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) - -void F_adam( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp16, - const torch::Tensor &v_fp32, - float beta1, float beta2, - float eps, float lr, - float scale, - float weight_decay, - int64_t step -) { - CHECK_INPUT(param_fp32); - CHECK_INPUT(param_fp16); - CHECK_INPUT(g_fp16); - CHECK_INPUT(m_fp16); - CHECK_INPUT(v_fp32); - AT_ASSERTM(param_fp32.dtype() == torch::kFloat, "param_fp32 must be a float tensor"); - AT_ASSERTM(param_fp16.dtype() == torch::kHalf, "param_fp16 must be a half tensor"); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(m_fp16.dtype() == torch::kHalf, "m_fp16 must be a half tensor"); - AT_ASSERTM(v_fp32.dtype() == torch::kFloat, "v_fp32 must be a float tensor"); - AT_ASSERTM(param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == m_fp16.numel(), "param_fp32 and m_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements"); - - float bias_correction1 = 1 - powf(beta1, step); - float bias_correction2 = 1 - powf(beta2, step); - - adam_launcher(param_fp32, param_fp16, g_fp16, m_fp16, v_fp32, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); -} - -void F_has_inf_nan(const torch::Tensor &g_fp16, torch::Tensor &out) { - CHECK_INPUT(g_fp16); - CHECK_INPUT(out); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(out.dtype() == torch::kUInt8, "out must be a uint8 tensor"); - - torch::Tensor mid = out.new_zeros({1024}); - - has_nan_inf_launcher(g_fp16, mid, out); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_adam", &F_adam, "adam function"); - m.def("f_has_inf_nan", &F_has_inf_nan, "has inf or nan"); -} diff --git a/csrc/bind.cpp b/csrc/bind.cpp new file mode 100644 index 00000000..8324ba52 --- /dev/null +++ b/csrc/bind.cpp @@ -0,0 +1,25 @@ +#include "include/bind.hpp" + +PYBIND11_MODULE(C, m) { + 
m.def("has_nan_inf_launcher",&has_nan_inf_launcher,"has nan inf"); + m.def("adam_launcher", &adam_launcher, "adam function cpu"); + m.def("adam_cpu_launcher", &adam_cpu_launcher, "adam function cpu"); + m.def("cross_entropy_forward_launcher", &cross_entropy_forward_launcher, "cross entropy forward"); + m.def("cross_entropy_backward_launcher", &cross_entropy_backward_launcher, "cross entropy backward"); + m.def("cross_entropy_forward_inplace_launcher", &cross_entropy_forward_inplace_launcher, "cross entropy forward inplace"); + m.def("cross_entropy_backward_inplace_launcher", &cross_entropy_backward_inplace_launcher, "cross entropy backward inplace"); + m.def("ncclGetUniqueId", &pyNCCLGetUniqueID, "nccl get unique ID"); + m.def("ncclCommInitRank", &pyNCCLCommInitRank, "nccl init rank"); + m.def("ncclCommDestroy", &pyNCCLCommDestroy, "nccl delete rank"); + m.def("ncclAllGather", &pyNCCLAllGather, "nccl all gather"); + m.def("ncclAllReduce", &pyNCCLAllReduce, "nccl all reduce"); + m.def("ncclBroadcast", &pyNCCLBroadcast, "nccl broadcast"); + m.def("ncclReduce", &pyNCCLReduce, "nccl reduce"); + m.def("ncclReduceScatter", &pyNCCLReduceScatter, "nccl reduce scatter"); + m.def("ncclGroupStart", &pyNCCLGroupStart, "nccl group start"); + m.def("ncclGroupEnd", &pyNCCLGroupEnd, "nccl group end"); + m.def("ncclSend",&pyNCCLSend,"nccl send"); + m.def("ncclRecv",&pyNCCLRecv,"nccl recv"); + m.def("ncclCommCount",&pyNCCLCommCount,"nccl comm count"); + m.def("ncclCommUserRank",&pyNCCLCommUserRank,"nccl comm user rank"); +} diff --git a/csrc/cross_entropy_loss.cpp b/csrc/cross_entropy_loss.cpp deleted file mode 100644 index 40adef36..00000000 --- a/csrc/cross_entropy_loss.cpp +++ /dev/null @@ -1,98 +0,0 @@ -#include -#include - -void cross_entropy_forward_launcher(int32_t m, int32_t n, const torch::Tensor &input, const torch::Tensor &target, torch::Tensor &softmax, torch::Tensor &output, int32_t ignore_index); -void cross_entropy_backward_launcher(int32_t m, int32_t n, const torch::Tensor &grad_output, const torch::Tensor &target, const torch::Tensor &softmax, torch::Tensor &grad_input, int32_t ignore_index); -void cross_entropy_forward_inplace_launcher(int32_t m, int32_t n, torch::Tensor &x, const torch::Tensor &target, torch::Tensor &output, int32_t ignore_index); -void cross_entropy_backward_inplace_launcher(int32_t m, int32_t n, const torch::Tensor &grad_output, const torch::Tensor &target, torch::Tensor &x, int32_t ignore_index); - -#define CHECK_CUDA(x) AT_ASSERTM(x.is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x); - -void F_cross_entropy_forward( - int32_t m, int32_t n, - const torch::Tensor &input, // (m, n) - const torch::Tensor &target, // (m) - torch::Tensor &softmax, // (m, n) - torch::Tensor &output, // (m) - int32_t ignore_index -) { - CHECK_INPUT(input); - CHECK_INPUT(target); - CHECK_INPUT(softmax); - CHECK_INPUT(output); - AT_ASSERTM(input.dtype() == torch::kHalf, "input must be a half tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(softmax.dtype() == torch::kHalf, "softmax must be a half tensor"); - AT_ASSERTM(output.dtype() == torch::kFloat, "output must be a float tensor"); - AT_ASSERTM(input.numel() == softmax.numel(), "input and softmax must have the same number of elements"); - AT_ASSERTM(target.numel() == output.numel(), "target and output must have the same number of elements"); - - 
cross_entropy_forward_launcher(m, n, input, target, softmax, output, ignore_index); -} - -void F_cross_entropy_backward( - int32_t m, int32_t n, - const torch::Tensor &grad_output, // (m) - const torch::Tensor &target, // (m) - const torch::Tensor &softmax, // (m, n) - torch::Tensor &grad_input, // (m, n) - int32_t ignore_index -) { - CHECK_INPUT(grad_output); - CHECK_INPUT(target); - CHECK_INPUT(softmax); - CHECK_INPUT(grad_input); - AT_ASSERTM(grad_output.dtype() == torch::kFloat, "grad_output must be a float tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(softmax.dtype() == torch::kHalf, "softmax must be a half tensor"); - AT_ASSERTM(grad_input.dtype() == torch::kHalf, "grad_input must be a half tensor"); - AT_ASSERTM(grad_input.numel() == softmax.numel(), "grad_input and softmax must have the same number of elements"); - AT_ASSERTM(target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements"); - - cross_entropy_backward_launcher(m, n, grad_output, target, softmax, grad_input, ignore_index); -} - -void F_cross_entropy_forward_inplace( - int32_t m, int32_t n, - torch::Tensor &x, // (m, n) - const torch::Tensor &target, // (m) - torch::Tensor &output, // (m) - int32_t ignore_index -) { - CHECK_INPUT(x); - CHECK_INPUT(target); - CHECK_INPUT(output); - AT_ASSERTM(x.dtype() == torch::kHalf, "x must be a half tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(output.dtype() == torch::kFloat, "output must be a float tensor"); - AT_ASSERTM(target.numel() == output.numel(), "target and output must have the same number of elements"); - - cross_entropy_forward_inplace_launcher(m, n, x, target, output, ignore_index); -} - -void F_cross_entropy_backward_inplace( - int32_t m, int32_t n, - const torch::Tensor &grad_output, // (m) - const torch::Tensor &target, // (m) - torch::Tensor &x, // (m, n) - int32_t ignore_index -) { - CHECK_INPUT(grad_output); - CHECK_INPUT(target); - CHECK_INPUT(x); - AT_ASSERTM(grad_output.dtype() == torch::kFloat, "grad_output must be a float tensor"); - AT_ASSERTM(target.dtype() == torch::kInt, "target must be a int tensor"); - AT_ASSERTM(x.dtype() == torch::kHalf, "x must be a half tensor"); - AT_ASSERTM(target.numel() == grad_output.numel(), "target and grad_output must have the same number of elements"); - - cross_entropy_backward_inplace_launcher(m, n, grad_output, target, x, ignore_index); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_cross_entropy_forward", &F_cross_entropy_forward, "cross entropy forward"); - m.def("f_cross_entropy_backward", &F_cross_entropy_backward, "cross entropy backward"); - m.def("f_cross_entropy_forward_inplace", &F_cross_entropy_forward_inplace, "cross entropy forward inplace"); - m.def("f_cross_entropy_backward_inplace", &F_cross_entropy_backward_inplace, "cross entropy backward inplace"); -} diff --git a/csrc/cuda/adam.cu b/csrc/cuda/adam_cuda.cu similarity index 62% rename from csrc/cuda/adam.cu rename to csrc/cuda/adam_cuda.cu index c7bff5ca..0ab55934 100644 --- a/csrc/cuda/adam.cu +++ b/csrc/cuda/adam_cuda.cu @@ -1,6 +1,5 @@ #include -#include -#include +#include namespace { // blocks , threads @@ -37,28 +36,28 @@ __global__ void adam_fp32_accum( } void adam_launcher( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp16, - const torch::Tensor &v_fp32, + int n, + std::uintptr_t param_fp32, + std::uintptr_t 
param_fp16,
+    std::uintptr_t g_fp16,
+    std::uintptr_t m_fp16,
+    std::uintptr_t v_fp32,
     float beta1, float beta2,
     float eps, float lr,
     float scale,
     float weight_decay,
     float bias_correction1,
-    float bias_correction2
+    float bias_correction2,
+    uintptr_t stream
 ) {
-    int32_t n = param_fp32.numel();
     if (n <= 0) return;
-    auto g_ptr = reinterpret_cast<half *>(g_fp16.data_ptr());
-    auto m_ptr = reinterpret_cast<half *>(m_fp16.data_ptr());
-    auto v_ptr = v_fp32.data_ptr();
-    auto param_ptr = param_fp32.data_ptr();
-    auto param_h_ptr = reinterpret_cast<half *>(param_fp16.data_ptr());
+    auto g_ptr = reinterpret_cast<half *>(g_fp16);
+    auto m_ptr = reinterpret_cast<half *>(m_fp16);
+    auto param_h_ptr = reinterpret_cast<half *>(param_fp16);
+    auto param_fp32_ptr = reinterpret_cast<float *>(param_fp32);
+    auto v_fp32_ptr = reinterpret_cast<float *>(v_fp32);
     int32_t threads = 1024;
     dim3 block_size = dim3(threads, 1, 1);
     dim3 grid_size = dim3((n + threads - 1) / threads, 1, 1);
-    auto stream = at::cuda::getCurrentCUDAStream();
-    adam_fp32_accum<<<grid_size, block_size, 0, stream.stream()>>>(n, g_ptr, m_ptr, v_ptr, param_ptr, param_h_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2);
+    adam_fp32_accum<<<grid_size, block_size, 0, reinterpret_cast<cudaStream_t>(stream)>>>(n, g_ptr, m_ptr, v_fp32_ptr, param_fp32_ptr, param_h_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2);
 }
\ No newline at end of file
diff --git a/csrc/cuda/cross_entropy.cu b/csrc/cuda/cross_entropy.cu
index 574ebde0..c0b742ac 100644
--- a/csrc/cuda/cross_entropy.cu
+++ b/csrc/cuda/cross_entropy.cu
@@ -1,7 +1,7 @@
 #include 
-#include 
-#include 
 #include "reduce.cuh"
+#include 
+#include 

 namespace {
 // blocks , threads<1024>
@@ -134,64 +134,74 @@ __global__ void cross_entropy_backward_inplace(
 void cross_entropy_forward_launcher(
     int32_t m, int32_t n,
-    const torch::Tensor &input,
-    const torch::Tensor &target,
-    torch::Tensor &softmax,
-    torch::Tensor &output,
-    int32_t ignore_index
+    std::uintptr_t input,
+    std::uintptr_t target,
+    std::uintptr_t softmax,
+    std::uintptr_t output,
+    int32_t ignore_index,
+    std::uintptr_t stream
 ) {
-    auto input_ptr = reinterpret_cast<half *>(input.data_ptr());
-    auto target_ptr = target.data_ptr();
-    auto softmax_ptr = reinterpret_cast<half *>(softmax.data_ptr());
-    auto output_ptr = output.data_ptr();
+    auto input_ptr = reinterpret_cast<half *>(input);
+    auto target_ptr = reinterpret_cast<int32_t *>(target);
+    auto softmax_ptr = reinterpret_cast<half *>(softmax);
+    auto output_ptr = reinterpret_cast<float *>(output);
     int32_t threads = 1024;
-    auto stream = at::cuda::getCurrentCUDAStream();
-    cross_entropy_forward<<<m, threads, 0, stream.stream()>>>(n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index);
+    cross_entropy_forward<<<m, threads, 0, reinterpret_cast<cudaStream_t>(stream)>>>(n, input_ptr, target_ptr, softmax_ptr, output_ptr, ignore_index);
 }

 void cross_entropy_backward_launcher(
     int32_t m, int32_t n,
-    const torch::Tensor &grad_output,
-    const torch::Tensor &target,
-    const torch::Tensor &softmax,
-    torch::Tensor &grad_input,
-    int32_t ignore_index
+    std::uintptr_t grad_output,
+    std::uintptr_t target,
+    std::uintptr_t softmax,
+    std::uintptr_t grad_input,
+    int32_t ignore_index,
+    std::uintptr_t stream
 ) {
-    auto output_ptr = grad_output.data_ptr();
-    auto target_ptr = target.data_ptr();
-    auto softmax_ptr = reinterpret_cast<half *>(softmax.data_ptr());
-    auto input_ptr = reinterpret_cast<half *>(grad_input.data_ptr());
+    // auto output_ptr = grad_output.data_ptr();
+    auto output_ptr = reinterpret_cast<float *>(grad_output);
+    // auto target_ptr = target.data_ptr();
+    auto target_ptr = reinterpret_cast<int32_t *>(target);
+    auto softmax_ptr = reinterpret_cast<half *>(softmax);
+    auto input_ptr = reinterpret_cast<half *>(grad_input);
     int32_t threads = 1024;
- auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_backward<<>>(n, output_ptr, target_ptr, softmax_ptr, input_ptr, ignore_index); + cross_entropy_backward<<(stream)>>>(n, output_ptr, target_ptr, softmax_ptr, input_ptr, ignore_index); } void cross_entropy_forward_inplace_launcher( int32_t m, int32_t n, - torch::Tensor &x, - const torch::Tensor &target, - torch::Tensor &output, - int32_t ignore_index + std::uintptr_t x, + std::uintptr_t target, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream ) { - auto x_ptr = reinterpret_cast(x.data_ptr()); - auto target_ptr = target.data_ptr(); - auto output_ptr = output.data_ptr(); + // auto x_ptr = reinterpret_cast(x.data_ptr()); + auto x_ptr = reinterpret_cast(x); + // auto target_ptr = target.data_ptr(); + auto target_ptr = reinterpret_cast(target); + // auto output_ptr = output.data_ptr(); + auto output_ptr = reinterpret_cast(output); int32_t threads = 1024; - auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_forward_inplace<<>>(n, x_ptr, target_ptr, output_ptr, ignore_index); + // auto stream = at::cuda::getCurrentCUDAStream(); + cross_entropy_forward_inplace<<(stream)>>>(n, x_ptr, target_ptr, output_ptr, ignore_index); } void cross_entropy_backward_inplace_launcher( int32_t m, int32_t n, - const torch::Tensor &grad_output, - const torch::Tensor &target, - torch::Tensor &x, - int32_t ignore_index + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t x, + int32_t ignore_index, + std::uintptr_t stream ) { - auto output_ptr = grad_output.data_ptr(); - auto target_ptr = target.data_ptr(); - auto x_ptr = reinterpret_cast(x.data_ptr()); + // auto output_ptr = grad_output.data_ptr(); + auto output_ptr = reinterpret_cast(grad_output); + // auto target_ptr = target.data_ptr(); + auto target_ptr = reinterpret_cast(target); + // auto x_ptr = reinterpret_cast(x.data_ptr()); + auto x_ptr = reinterpret_cast(x); int32_t threads = 1024; - auto stream = at::cuda::getCurrentCUDAStream(); - cross_entropy_backward_inplace<<>>(n, output_ptr, target_ptr, x_ptr, ignore_index); + // auto stream = at::cuda::getCurrentCUDAStream(); + cross_entropy_backward_inplace<<(stream)>>>(n, output_ptr, target_ptr, x_ptr, ignore_index); } \ No newline at end of file diff --git a/csrc/cuda/has_inf_nan.cu b/csrc/cuda/has_inf_nan.cu index c8bcca1a..b0e906ff 100644 --- a/csrc/cuda/has_inf_nan.cu +++ b/csrc/cuda/has_inf_nan.cu @@ -1,6 +1,5 @@ #include -#include -#include +#include namespace{ __inline__ __device__ bool isnan_(half v) { @@ -68,21 +67,21 @@ __global__ void bmt_has_nan_inf_2( } void has_nan_inf_launcher( - const torch::Tensor &g_fp16, - torch::Tensor mid, - torch::Tensor out + int32_t n, + std::uintptr_t g_fp16, + std::uintptr_t mid, + std::uintptr_t out, + std::uintptr_t stream ) { - int n = g_fp16.numel(); if (n <= 0) return; - auto g_ptr = reinterpret_cast(g_fp16.data_ptr()); - auto mid_ptr = mid.data_ptr(); - auto stream = at::cuda::getCurrentCUDAStream(); - + auto g_ptr = reinterpret_cast(g_fp16); + auto mid_ptr = reinterpret_cast(mid); + auto out_ptr = reinterpret_cast(out); int32_t threads = 1024; dim3 block_size = dim3(threads, 1, 1); dim3 grid_size = dim3((n + threads - 1) / threads, 1, 1); dim3 clamp_grid_size = dim3(min((n + threads - 1) / threads, 1024), 1, 1); - bmt_has_nan_inf_1<<>>(n, g_ptr, mid_ptr); - bmt_has_nan_inf_2<<<1, block_size, 0, stream.stream()>>>(mid_ptr, out.data_ptr()); + bmt_has_nan_inf_1<<(stream)>>>(n, g_ptr, mid_ptr); + bmt_has_nan_inf_2<<<1, block_size, 0, 
reinterpret_cast(stream)>>>(mid_ptr, out_ptr); } \ No newline at end of file diff --git a/csrc/cuda/reduce.cuh b/csrc/cuda/reduce.cuh index 8179443f..095e8593 100644 --- a/csrc/cuda/reduce.cuh +++ b/csrc/cuda/reduce.cuh @@ -1,6 +1,4 @@ #include -#include -#include namespace { const int WARP_SZ = 32; diff --git a/csrc/adam_cpu.cpp b/csrc/include/adam_cpu.hpp similarity index 57% rename from csrc/adam_cpu.cpp rename to csrc/include/adam_cpu.hpp index b8685b60..1c263b65 100644 --- a/csrc/adam_cpu.cpp +++ b/csrc/include/adam_cpu.hpp @@ -1,8 +1,11 @@ -#include -#include -#include #include +#include +#include +#include +#include + +#include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") #if defined(__AVX512F__) @@ -14,14 +17,118 @@ #pragma message "Using AVX256" #define __AVX256__ 1 +#endif +#include +#include +#include + +inline float fp32_from_bits(uint32_t w) { + union { + uint32_t as_bits; + float as_value; + } fp32 = {w}; + return fp32.as_value; +} + +inline uint32_t fp32_to_bits(float f) { + union { + float as_value; + uint32_t as_bits; + } fp32 = {f}; + return fp32.as_bits; +} + +template +inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F& f) { + // Number of iterations + int64_t numiter = end - begin; + + // Number of threads to use + int64_t num_threads = std::max(numiter / grain_size, static_cast(1)); + + // Check if parallel execution is feasible + if (num_threads > 1) { + std::vector threads(num_threads); + for (int64_t t = 0; t < num_threads; ++t) { + threads[t] = std::thread([&, t]() { + int64_t start = begin + t * grain_size; + int64_t end = std::min(begin + (t + 1) * grain_size, end); + f(start, end); + }); + } + // Join all threads + for (auto& thread : threads) { + thread.join(); + } + } else { + // If not feasible, perform the operation serially + f(begin, end); + } +} + + +inline uint16_t fp16_ieee_from_fp32_value(float f) { + // const float scale_to_inf = 0x1.0p+112f; + // const float scale_to_zero = 0x1.0p-110f; + uint32_t scale_to_inf_bits = (uint32_t) 239 << 23; + uint32_t scale_to_zero_bits = (uint32_t) 17 << 23; + float scale_to_inf_val, scale_to_zero_val; + std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); + std::memcpy(&scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); + const float scale_to_inf = scale_to_inf_val; + const float scale_to_zero = scale_to_zero_val; +#if defined(_MSC_VER) && _MSC_VER == 1916 + float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; +#else + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; #endif + const uint32_t w = (uint32_t)fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = (uint32_t)fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return static_cast( + (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? 
UINT16_C(0x7E00) : nonsign) + ); + } + +inline float fp16_ieee_to_fp32_value(uint16_t h) { + + const uint32_t w = (uint32_t)h << 16; + const uint32_t sign = w & UINT32_C(0x80000000); + const uint32_t two_w = w + w; + + const uint32_t exp_offset = UINT32_C(0xE0) << 23; + const float exp_scale = 0x1.0p-112f; + const float normalized_value = + fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + const uint32_t magic_mask = UINT32_C(126) << 23; + const float magic_bias = 0.5f; + const float denormalized_value = + fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + const uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = + sign | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) + : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +} void adam_cpu_launcher( int n, float* param_fp32, - at::Half* param_fp16, - at::Half* g_fp16, + u_int16_t* param_fp16, + u_int16_t* g_fp16, float* m_fp32, float* v_fp32, float beta1, float beta2, @@ -59,7 +166,7 @@ void adam_cpu_launcher( int64_t span = 1; #endif - at::parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { for (int64_t j = start; j < end; j += span) { #if defined(__AVX256__) or defined(__AVX512__) if (j + span > end) { @@ -68,7 +175,7 @@ void adam_cpu_launcher( #endif // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { - float g = c10::detail::fp16_ieee_to_fp32_value(g_fp16[i].x) / scale; + float g = fp16_ieee_to_fp32_value(g_fp16[i]) / scale; float m = m_fp32[i]; float v = v_fp32[i]; float p = param_fp32[i]; @@ -76,7 +183,7 @@ void adam_cpu_launcher( v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; param_fp32[i] = p; - param_fp16[i] = at::Half(p); + param_fp16[i] = fp16_ieee_from_fp32_value(p); m_fp32[i] = m; v_fp32[i] = v; } @@ -133,57 +240,4 @@ void adam_cpu_launcher( }); } -void F_adam_cpu( - const torch::Tensor ¶m_fp32, - const torch::Tensor ¶m_fp16, - const torch::Tensor &g_fp16, - const torch::Tensor &m_fp32, - const torch::Tensor &v_fp32, - float beta1, float beta2, - float eps, float lr, - float scale, - float weight_decay, - int64_t step -) { - CHECK_CONTIGUOUS(param_fp32); - CHECK_CONTIGUOUS(param_fp16); - CHECK_CONTIGUOUS(g_fp16); - CHECK_CONTIGUOUS(m_fp32); - CHECK_CONTIGUOUS(v_fp32); - AT_ASSERTM(param_fp32.dtype() == torch::kFloat, "param_fp32 must be a float tensor"); - AT_ASSERTM(param_fp16.dtype() == torch::kHalf, "param_fp16 must be a half tensor"); - AT_ASSERTM(g_fp16.dtype() == torch::kHalf, "g_fp16 must be a half tensor"); - AT_ASSERTM(m_fp32.dtype() == torch::kFloat, "m_fp32 must be a float tensor"); - AT_ASSERTM(v_fp32.dtype() == torch::kFloat, "v_fp32 must be a float tensor"); - AT_ASSERTM(param_fp32.is_cpu(), "param_fp32 must be a cpu tensor"); - AT_ASSERTM(param_fp16.is_cpu(), "param_fp16 must be a cpu tensor"); - AT_ASSERTM(g_fp16.is_cpu(), "g_fp16 must be a cpu tensor"); - AT_ASSERTM(m_fp32.is_cpu(), "m_fp32 must be a cpu tensor"); - AT_ASSERTM(v_fp32.is_cpu(), "v_fp32 must be a cpu tensor"); - AT_ASSERTM(param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == m_fp32.numel(), "param_fp32 and m_fp32 must have the same number of elements"); - AT_ASSERTM(param_fp32.numel() == v_fp32.numel(), 
"param_fp32 and v_fp32 must have the same number of elements"); - - float bias_correction1 = 1 - powf(beta1, step); - float bias_correction2 = 1 - powf(beta2, step); - - adam_cpu_launcher( - param_fp32.numel(), - param_fp32.data_ptr(), - param_fp16.data_ptr(), - g_fp16.data_ptr(), - m_fp32.data_ptr(), - v_fp32.data_ptr(), - beta1, beta2, - eps, lr, - scale, - weight_decay, - bias_correction1, - bias_correction2 - ); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("f_adam_cpu", &F_adam_cpu, "adam function cpu"); -} + diff --git a/csrc/include/bind.hpp b/csrc/include/bind.hpp new file mode 100644 index 00000000..0929de91 --- /dev/null +++ b/csrc/include/bind.hpp @@ -0,0 +1,55 @@ +#include +#include "nccl.hpp" +#include "adam_cpu.hpp" + +void has_nan_inf_launcher(int32_t n,std::uintptr_t g_fp16,std::uintptr_t mid,std::uintptr_t out,std::uintptr_t stream); + +void cross_entropy_backward_launcher( + int32_t m, int32_t n, + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t softmax, + std::uintptr_t grad_input, + int32_t ignore_index, + std::uintptr_t stream +); +void cross_entropy_backward_inplace_launcher( + int32_t m, int32_t n, + std::uintptr_t grad_output, + std::uintptr_t target, + std::uintptr_t x, + int32_t ignore_index, + std::uintptr_t stream +); + void cross_entropy_forward_inplace_launcher( + int32_t m, int32_t n, + std::uintptr_t x, + std::uintptr_t target, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream +); +void cross_entropy_forward_launcher( + int32_t m, int32_t n, + std::uintptr_t input, + std::uintptr_t target, + std::uintptr_t softmax, + std::uintptr_t output, + int32_t ignore_index, + std::uintptr_t stream +); +void adam_launcher( + int n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp16, + std::uintptr_t v_fp32, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2, + uintptr_t stream +); \ No newline at end of file diff --git a/csrc/nccl.cpp b/csrc/include/nccl.hpp similarity index 83% rename from csrc/nccl.cpp rename to csrc/include/nccl.hpp index c1dbb321..297fa5ab 100644 --- a/csrc/nccl.cpp +++ b/csrc/include/nccl.hpp @@ -3,7 +3,7 @@ #include namespace py = pybind11; -#include "include/nccl.h" +#include "nccl.h" void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; @@ -186,19 +186,3 @@ int pyNCCLCommUserRank( checkNCCLStatus(ncclCommUserRank(reinterpret_cast(comm),&rank)); return rank; } -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("ncclGetUniqueId", &pyNCCLGetUniqueID, "nccl get unique ID"); - m.def("ncclCommInitRank", &pyNCCLCommInitRank, "nccl init rank"); - m.def("ncclCommDestroy", &pyNCCLCommDestroy, "nccl delete rank"); - m.def("ncclAllGather", &pyNCCLAllGather, "nccl all gather"); - m.def("ncclAllReduce", &pyNCCLAllReduce, "nccl all reduce"); - m.def("ncclBroadcast", &pyNCCLBroadcast, "nccl broadcast"); - m.def("ncclReduce", &pyNCCLReduce, "nccl reduce"); - m.def("ncclReduceScatter", &pyNCCLReduceScatter, "nccl reduce scatter"); - m.def("ncclGroupStart", &pyNCCLGroupStart, "nccl group start"); - m.def("ncclGroupEnd", &pyNCCLGroupEnd, "nccl group end"); - m.def("ncclSend",&pyNCCLSend,"nccl send"); - m.def("ncclRecv",&pyNCCLRecv,"nccl recv"); - m.def("ncclCommCount",&pyNCCLCommCount,"nccl comm count"); - m.def("ncclCommUserRank",&pyNCCLCommUserRank,"nccl comm user rank"); -} diff --git a/setup.py b/setup.py index b7411f4c..26fcfed3 100644 
--- a/setup.py +++ b/setup.py @@ -1,216 +1,102 @@ -from setuptools import setup, find_packages import torch -from setuptools import Extension -from setuptools.command.build_ext import build_ext -import os import os +from setuptools.command.build_ext import build_ext +from setuptools import setup, find_packages, Extension +from torch.utils.cpp_extension import BuildExtension import setuptools +import warnings +import sys import subprocess -from pkg_resources import packaging -from typing import Dict, List, Optional, Union, Tuple -def CppExtension(name, sources, *args, **kwargs): - include_dirs = kwargs.get('include_dirs', []) - include_dirs += include_paths() - kwargs['include_dirs'] = include_dirs - - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths() - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - kwargs['libraries'] = libraries - - kwargs['language'] = 'c++' - return setuptools.Extension(name, sources, *args, **kwargs) - - -def CUDAExtension(name, sources, *args, **kwargs): - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths(cuda=True) - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - kwargs['libraries'] = libraries - - include_dirs = kwargs.get('include_dirs', []) - - - include_dirs += include_paths(cuda=True) - kwargs['include_dirs'] = include_dirs - - kwargs['language'] = 'c++' - - dlink_libraries = kwargs.get('dlink_libraries', []) - dlink = kwargs.get('dlink', False) or dlink_libraries - if dlink: - extra_compile_args = kwargs.get('extra_compile_args', {}) - - extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) - extra_compile_args_dlink += ['-dlink'] - extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] - extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] - - - extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink - - kwargs['extra_compile_args'] = extra_compile_args - - return setuptools.Extension(name, sources, *args, **kwargs) - -def _find_cuda_home() -> Optional[str]: - r'''Finds the CUDA install path.''' - # Guess #1 - cuda_home = os.environ.get('CUDA_HOME') or os.environ.get('CUDA_PATH') - if cuda_home is None: - # Guess #2 +COMMON_NVCC_FLAGS = [ + '-D__CUDA_NO_HALF_OPERATORS__', + '-D__CUDA_NO_HALF_CONVERSIONS__', + '-D__CUDA_NO_HALF2_OPERATORS__', + '--expt-relaxed-constexpr' +] +class CMakeExtension(Extension): + def __init__(self, name, sourcedir=""): + Extension.__init__(self, name, sources=[]) + self.sourcedir = os.path.abspath(sourcedir) + + +def is_ninja_available(): + r''' + Returns ``True`` if the `ninja `_ build system is + available on the system, ``False`` otherwise. + ''' + with open(os.devnull, 'wb') as devnull: try: - which = 'which' - with open(os.devnull, 'w') as devnull: - nvcc = subprocess.check_output([which, 'nvcc'], - stderr=devnull).decode(*SUBPROCESS_DECODE_ARGS).rstrip('\r\n') - cuda_home = os.path.dirname(os.path.dirname(nvcc)) - except Exception: - #Guess #3 - cuda_home = '/usr/local/cuda' - if not os.path.exists(cuda_home): - cuda_home = None - return cuda_home - -CUDA_HOME = _find_cuda_home() -_BMT_PATH = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) - -def _join_cuda_home(*paths) -> str: - if CUDA_HOME is None: - raise EnvironmentError('CUDA_HOME environment variable is not set. 
' - 'Please set it to your CUDA install root.') - return os.path.join(CUDA_HOME, *paths) - -def CppExtension(name, sources, *args, **kwargs): - library_dirs = kwargs.get('library_dirs', []) - library_dirs += library_paths(cuda=True) - kwargs['library_dirs'] = library_dirs - - libraries = kwargs.get('libraries', []) - libraries.append('cudart') - kwargs['libraries'] = libraries - - include_dirs = kwargs.get('include_dirs', []) - - include_dirs += include_paths(cuda=True) - kwargs['include_dirs'] = include_dirs - - kwargs['language'] = 'c++' - - dlink_libraries = kwargs.get('dlink_libraries', []) - dlink = kwargs.get('dlink', False) or dlink_libraries - if dlink: - extra_compile_args = kwargs.get('extra_compile_args', {}) - - extra_compile_args_dlink = extra_compile_args.get('nvcc_dlink', []) - extra_compile_args_dlink += ['-dlink'] - extra_compile_args_dlink += [f'-L{x}' for x in library_dirs] - extra_compile_args_dlink += [f'-l{x}' for x in dlink_libraries] - - extra_compile_args['nvcc_dlink'] = extra_compile_args_dlink - - kwargs['extra_compile_args'] = extra_compile_args - - return setuptools.Extension(name, sources, *args, **kwargs) - - -def include_paths(cuda: bool = False) -> List[str]: - paths = [ - os.path.join(_BMT_PATH, 'csrc', 'include'), - ] - if cuda: - cuda_home_include = _join_cuda_home('include') - # if we have the Debian/Ubuntu packages for cuda, we get /usr as cuda home. - # but gcc doesn't like having /usr/include passed explicitly - if cuda_home_include != '/usr/include': - paths.append(cuda_home_include) - return paths - - -def library_paths(cuda: bool = False) -> List[str]: - paths = [] - - if cuda: - lib_dir = 'lib64' - if (not os.path.exists(_join_cuda_home(lib_dir)) and - os.path.exists(_join_cuda_home('lib'))): - lib_dir = 'lib' - - paths.append(_join_cuda_home(lib_dir)) - return paths - -def get_avx_flags(): - if os.environ.get("BMT_AVX256", "").lower() in ["1", "true", "on"]: - return ["-mavx", "-mfma", "-mf16c"] - elif os.environ.get("BMT_AVX512", "").lower() in ["1", "true", "on"]: - return ["-mavx512f"] - else: - return ["-march=native"] - -def get_device_cc(): - try: - CC_SET = set() - for i in range(torch.cuda.device_count()): - CC_SET.add(torch.cuda.get_device_capability(i)) - - if len(CC_SET) == 0: - return None - - ret = "" - for it in CC_SET: - if len(ret) > 0: - ret = ret + " " - ret = ret + ("%d.%d" % it) - return ret - except RuntimeError: - return None - -avx_flag = get_avx_flags() -device_cc = get_device_cc() -if device_cc is None: - if not torch.cuda.is_available(): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX") - else: - if torch.version.cuda.startswith("10"): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5+PTX") + subprocess.check_call('ninja --version'.split(), stdout=devnull) + except OSError: + return False else: - if not torch.version.cuda.startswith("11.0"): - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0 8.6+PTX") - else: - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", "6.0 6.1 7.0 7.5 8.0+PTX") -else: - os.environ["TORCH_CUDA_ARCH_LIST"] = os.environ.get("TORCH_CUDA_ARCH_LIST", device_cc) - -ext_modules = [] - -if os.environ.get("GITHUB_ACTIONS", "false") == "false": - ext_modules = [ - CUDAExtension('bmtrain.nccl._C', [ - 'csrc/nccl.cpp', - ], include_dirs=["csrc/nccl/build/include"], extra_compile_args={}, extra_link_args=["../libnccl_static.a"]), - 
CUDAExtension('bmtrain.optim._cuda', [ - 'csrc/adam_cuda.cpp', - 'csrc/cuda/adam.cu', - 'csrc/cuda/has_inf_nan.cu' - ], extra_compile_args={}), - CppExtension("bmtrain.optim._cpu", [ - "csrc/adam_cpu.cpp", - ], extra_compile_args=[ - '-fopenmp', - *avx_flag - ], extra_link_args=['-lgomp']), - CUDAExtension('bmtrain.loss._cuda', [ - 'csrc/cross_entropy_loss.cpp', - 'csrc/cuda/cross_entropy.cu', - ], extra_compile_args={}), - ] -else: - ext_modules = [] - + return True + + +class CMakeBuild(build_ext): + + def build_extension(self, ext): + extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + + # required for auto-detection & inclusion of auxiliary "native" libs + if not extdir.endswith(os.path.sep): + extdir += os.path.sep + + debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug + cfg = "Debug" if debug else "Release" + + # CMake lets you override the generator - we need to check this. + # Can be set with Conda-Build, for example. + cmake_generator = os.environ.get("CMAKE_GENERATOR", "") + + # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON + # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code + # from Python. + cmake_args = [ + f"-DCMAKE_CXX_STANDARD=14", + f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}", + f"-DPYTHON_EXECUTABLE={sys.executable}", + f"-DPYTHON_VERSION={sys.version_info.major}.{sys.version_info.minor}", + f"-DCMAKE_BUILD_TYPE={cfg}", # not used on MSVC, but no harm + ] + + build_args = [] + if "CMAKE_ARGS" in os.environ: + cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item] + + cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"] + + + if not cmake_generator or cmake_generator == "Ninja": + try: + import ninja # noqa: F401 + + ninja_executable_path = os.path.join(ninja.BIN_DIR, "ninja") + cmake_args += [ + "-GNinja", + f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}", + ] + except ImportError: + pass + + if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ: + # self.parallel is a Python 3 only way to set parallel jobs by hand + # using -j in the build_ext call, not supported by pip or PyPA-build. + if hasattr(self, "parallel") and self.parallel: + # CMake 3.12+ only. 
+ build_args += [f"-j{self.parallel}"] + + build_temp = os.path.join(self.build_temp, ext.name) + if not os.path.exists(build_temp): + os.makedirs(build_temp) + + cmake_args += ["-DPython_ROOT_DIR=" + os.path.dirname(os.path.dirname(sys.executable))] + subprocess.check_call(["cmake", ext.sourcedir] + cmake_args, cwd=build_temp) + subprocess.check_call(["cmake", "--build", "."] + build_args, cwd=build_temp) + +ext_modules = [ + CMakeExtension("bmtrain.C"), +] setup( name='bmtrain', version='0.2.2', @@ -223,6 +109,6 @@ def get_device_cc(): ], ext_modules=ext_modules, cmdclass={ - 'build_ext': build_ext + 'build_ext': CMakeBuild }) From 6f07c3725d8f5237092674bbbfd02f4e67be5eec Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Mon, 12 Jun 2023 17:51:23 +0800 Subject: [PATCH 05/13] fix some legacy bug --- bmtrain/optim/_function.py | 2 +- bmtrain/optim/optim_manager.py | 7 ++++--- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py index 342998fa..5ed9079a 100644 --- a/bmtrain/optim/_function.py +++ b/bmtrain/optim/_function.py @@ -54,7 +54,7 @@ def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tenso assert param_fp32.dtype == torch.float32, "param_fp32 must be float32 tensor" assert param_fp16.dtype == torch.float16, "param_fp16 must be float16 tensor" assert g_fp16.dtype == torch.float16, "g_fp16 must be float16 tensor" - assert m_fp16.dtype == torch.float32, "m_fp32 must be float32 tensor" + assert m_fp16.dtype == torch.float16, "m_fp16 must be float16 tensor" assert v_fp32.dtype == torch.float32, "v_fp32 must be float32 tensor" assert param_fp32.numel() == param_fp16.numel(), "param_fp32 and param_fp16 must have the same number of elements" assert param_fp32.numel() == g_fp16.numel(), "param_fp32 and g_fp16 must have the same number of elements" diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index e0943ca0..d387faa9 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -1,7 +1,6 @@ from typing import Optional, Union, List, Dict, Tuple import torch - -from .. import C +from ..loss._function import has_inf_nan from ..utils import print_rank from ..lr_scheduler.warmup import WarmupLRScheduler from .. 
import nccl @@ -13,7 +12,7 @@ def check_overflow(param_groups): for group in param_groups: for p in group['params']: if p.grad is not None and p.dtype == torch.half: # TODO support other types - C.f_has_inf_nan(p.grad, has_inf_or_nan) + has_inf_nan(p.grad, has_inf_or_nan) if "comm" in config: nccl.allReduce(has_inf_or_nan.storage(), has_inf_or_nan.storage(), "max", config["comm"]) @@ -81,6 +80,7 @@ def add_optimizer( self.lr_schedulers.append(lr_scheduler) def scale_loss(self, loss : torch.Tensor) -> torch.Tensor: + return loss * (self.loss_scale / config['world_size']) # loss scale def backward(self, loss : torch.Tensor): @@ -128,6 +128,7 @@ def step(self): for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: + print("BMTrain optimizer step") optimizer.step(scale=self.loss_scale) else: if self.loss_scale_enabled: From 50e0f5b9dfc5d4ebcd467008b62b34ea7a793642 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Tue, 13 Jun 2023 10:53:03 +0800 Subject: [PATCH 06/13] fix the grad overflow caused by torch2.0.0 --- bmtrain/optim/adam.py | 6 +++++- bmtrain/optim/adam_offload.py | 7 ++++++- bmtrain/optim/optim_manager.py | 8 ++++---- 3 files changed, 15 insertions(+), 6 deletions(-) diff --git a/bmtrain/optim/adam.py b/bmtrain/optim/adam.py index 4cdb7914..b63a4f51 100644 --- a/bmtrain/optim/adam.py +++ b/bmtrain/optim/adam.py @@ -177,4 +177,8 @@ def update_group(group, new_group): return new_group param_groups = [ update_group(g, ng) for g, ng in zip(groups, saved_groups)] - self.__setstate__({'state': state, 'param_groups': param_groups}) \ No newline at end of file + self.__setstate__({'state': state, 'param_groups': param_groups}) + + #TODO zero_grad(set_to_none=True) makes optimizer crashed, maybe the reason of grad accu + def zero_grad(self, set_to_none: bool = False): + super().zero_grad(set_to_none=set_to_none) \ No newline at end of file diff --git a/bmtrain/optim/adam_offload.py b/bmtrain/optim/adam_offload.py index ecf416f6..e33219bf 100644 --- a/bmtrain/optim/adam_offload.py +++ b/bmtrain/optim/adam_offload.py @@ -250,4 +250,9 @@ def cut_states(state): return { 'state': packed_state, 'param_groups': param_groups, - } \ No newline at end of file + } + + #TODO zero_grad(set_to_none=True) makes optimizer crashed, maybe the reason of grad accu + def zero_grad(self, set_to_none: bool = False): + super().zero_grad(set_to_none=set_to_none) + diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index d387faa9..8d8161f8 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -101,7 +101,7 @@ def zero_grad(self): This is a helper function to call optimizer.zero_grad() """ for optimizer in self.optimizers: - optimizer.zero_grad() + optimizer.zero_grad(set_to_none=False) def step(self): """ @@ -122,10 +122,10 @@ def step(self): break if has_overflow: print_rank("Gradient overflow, change scale from %lf to %lf" % (self.loss_scale, self.loss_scale / self.loss_scale_factor)) - self._justify_scale(self.loss_scale / self.loss_scale_factor) - self.zero_grad() + with torch.no_grad(): + self._justify_scale(self.loss_scale / self.loss_scale_factor) + self.zero_grad() return - for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: print("BMTrain optimizer step") From 88a8f90a3d0f1713ef1643952601ec16fc579cee Mon Sep 17 00:00:00 2001 From: maydomine 
<1583143678@qq.com> Date: Tue, 13 Jun 2023 11:21:26 +0800 Subject: [PATCH 07/13] legacy debug statement --- bmtrain/optim/optim_manager.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bmtrain/optim/optim_manager.py b/bmtrain/optim/optim_manager.py index 8d8161f8..23eaa868 100644 --- a/bmtrain/optim/optim_manager.py +++ b/bmtrain/optim/optim_manager.py @@ -128,7 +128,6 @@ def step(self): return for optimizer, lr_scheduler in zip(self.optimizers, self.lr_schedulers): if hasattr(optimizer, "_bmtrain_optimizer") and optimizer._bmtrain_optimizer: - print("BMTrain optimizer step") optimizer.step(scale=self.loss_scale) else: if self.loss_scale_enabled: From 63831f33290737dc32b891a50a60c69ca400bec1 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Tue, 13 Jun 2023 16:25:10 +0800 Subject: [PATCH 08/13] fix the adam_offload bug --- CMakeLists.txt | 29 +++++++++++- bmtrain/optim/_function.py | 4 +- csrc/include/adam_cpu.hpp | 94 +++++++++++++++++++++----------------- 3 files changed, 80 insertions(+), 47 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a00f3f6d..ffecf462 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -4,7 +4,6 @@ enable_language(C) enable_language(CXX) set(CMAKE_CUDA_ARCHITECTURES "61;62;70;72;75;80") enable_language(CUDA) - set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) @@ -30,7 +29,34 @@ message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") file(GLOB_RECURSE SOURCES "csrc/*.cpp") file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") +set(BMT_AVX256 $ENV{BMT_AVX256}) +set(BMT_AVX512 $ENV{BMT_AVX512}) + +# Convert them to lowercase +string(TOLOWER "${BMT_AVX256}" BMT_AVX256_LOWER) +string(TOLOWER "${BMT_AVX512}" BMT_AVX512_LOWER) + +# Check the flags +if ("${BMT_AVX256_LOWER}" STREQUAL "1" OR "${BMT_AVX256_LOWER}" STREQUAL "true" OR "${BMT_AVX256_LOWER}" STREQUAL "on") + set(AVX_FLAGS "${AVX_FLAGS} -mavx -mfma -mf16c") +elseif ("${BMT_AVX512_LOWER}" STREQUAL "1" OR "${BMT_AVX512_LOWER}" STREQUAL "true" OR "${BMT_AVX512_LOWER}" STREQUAL "on") + set(AVX_FLAGS "${AVX_FLAGS} -mavx512f") +elseif ("${BMT_AVX256_LOWER}" STREQUAL "" OR "${BMT_AVX512_LOWER}" STREQUAL "") + message(WARNING "BMT_AVX256 or BMT_AVX512 is empty or not defined.") +else() + set(AVX_FLAGS "${AVX_FLAGS} -march=native") +endif() + + + +# Now you can use AVX_FLAGS variable in your target_compile_options +# For example, if your target is named 'my_target': + + pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) + +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}") + target_link_libraries(C PRIVATE "-Wl,-Bsymbolic" "-Wl,-Bsymbolic-functions" @@ -47,6 +73,5 @@ target_include_directories(C PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} ) -#STATIC ${bmengine_STATIC_LIBRARIES} diff --git a/bmtrain/optim/_function.py b/bmtrain/optim/_function.py index 5ed9079a..ee4b04a7 100644 --- a/bmtrain/optim/_function.py +++ b/bmtrain/optim/_function.py @@ -26,7 +26,6 @@ def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.T assert param_fp32.numel() == v_fp32.numel(), "param_fp32 and v_fp32 must have the same number of elements" bias_correction1 = 1 - beta1 ** step bias_correction2 = 1 - beta2 ** step - stream = torch.cuda.current_stream().cuda_stream C.adam_cpu_launcher( param_fp32.numel(), param_fp32.data_ptr(), @@ -40,7 +39,6 @@ def adam_cpu(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.T weight_decay, bias_correction1, bias_correction2, - stream ) def adam(param_fp32: torch.Tensor, param_fp16: 
torch.Tensor, g_fp16: torch.Tensor, m_fp16: torch.Tensor, @@ -77,4 +75,4 @@ def adam(param_fp32: torch.Tensor, param_fp16: torch.Tensor, g_fp16: torch.Tenso bias_correction1, bias_correction2, stream - ) \ No newline at end of file + ) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 1c263b65..dd283bb6 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -3,7 +3,6 @@ #include #include -#include #include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") @@ -42,18 +41,27 @@ template inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F& f) { // Number of iterations int64_t numiter = end - begin; - + // Number of threads to use - int64_t num_threads = std::max(numiter / grain_size, static_cast(1)); + int64_t num_threads = 1; // Default to serial execution + + if (grain_size > 0) { + num_threads = std::max(numiter / grain_size, static_cast(1)); + } + else{ + int64_t n = std::thread::hardware_concurrency(); + grain_size = std::max(numiter / n, static_cast(1)); + num_threads = std::max(numiter / grain_size, static_cast(1)); + } // Check if parallel execution is feasible if (num_threads > 1) { std::vector threads(num_threads); for (int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - int64_t start = begin + t * grain_size; - int64_t end = std::min(begin + (t + 1) * grain_size, end); - f(start, end); + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); }); } // Join all threads @@ -61,12 +69,13 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F thread.join(); } } else { - // If not feasible, perform the operation serially + // If not feasible or grain_size is 0, perform the operation serially f(begin, end); } } + inline uint16_t fp16_ieee_from_fp32_value(float f) { // const float scale_to_inf = 0x1.0p+112f; // const float scale_to_zero = 0x1.0p-110f; @@ -78,11 +87,7 @@ inline uint16_t fp16_ieee_from_fp32_value(float f) { const float scale_to_inf = scale_to_inf_val; const float scale_to_zero = scale_to_zero_val; -#if defined(_MSC_VER) && _MSC_VER == 1916 - float base = ((signbit(f) != 0 ? 
-f : f) * scale_to_inf) * scale_to_zero; -#else - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; -#endif + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; const uint32_t w = (uint32_t)fp32_to_bits(f); const uint32_t shl1_w = w + w; @@ -125,12 +130,12 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) { return fp32_from_bits(result); } void adam_cpu_launcher( - int n, - float* param_fp32, - u_int16_t* param_fp16, - u_int16_t* g_fp16, - float* m_fp32, - float* v_fp32, + int64_t n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp32, + std::uintptr_t v_fp32, float beta1, float beta2, float eps, float lr, float scale, @@ -138,6 +143,11 @@ void adam_cpu_launcher( float bias_correction1, float bias_correction2 ) { + auto param_fp32_ptr = reinterpret_cast(param_fp32); + auto m_fp32_ptr = reinterpret_cast(m_fp32); + auto v_fp32_ptr = reinterpret_cast(v_fp32); + auto p_fp16_ptr = reinterpret_cast(param_fp16); + auto g_fp16_ptr = reinterpret_cast(g_fp16); #if defined(__AVX512__) auto avx_beta1 = _mm512_set1_ps(beta1); auto avx_beta2 = _mm512_set1_ps(beta2); @@ -175,26 +185,26 @@ void adam_cpu_launcher( #endif // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { - float g = fp16_ieee_to_fp32_value(g_fp16[i]) / scale; - float m = m_fp32[i]; - float v = v_fp32[i]; - float p = param_fp32[i]; + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; m = beta1 * m + (1 - beta1) * g; v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; - param_fp32[i] = p; - param_fp16[i] = fp16_ieee_from_fp32_value(p); - m_fp32[i] = m; - v_fp32[i] = v; + param_fp32_ptr[i] = p; + p_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; } break; // must break here } else { // use AVX here #if defined(__AVX512__) - auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16[j])), avx_scale); - auto m = _mm512_loadu_ps(&m_fp32[j]); - auto v = _mm512_loadu_ps(&v_fp32[j]); - auto p = _mm512_loadu_ps(¶m_fp32[j]); + auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm512_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm512_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm512_loadu_ps(¶m_fp32_ptr[j]); m = _mm512_fmadd_ps(avx_beta1, m, _mm512_mul_ps(avx_beta1_1, g)); v = _mm512_fmadd_ps(avx_beta2, v, _mm512_mul_ps(avx_beta2_1, _mm512_mul_ps(g, g))); p = _mm512_fmadd_ps(avx_neg_lr, _mm512_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p @@ -209,15 +219,15 @@ void adam_cpu_launcher( ), p ); // p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - _mm512_storeu_ps(¶m_fp32[j], p); - _mm256_storeu_si256((__m256i*)¶m_fp16[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm512_storeu_ps(&m_fp32[j], m); - _mm512_storeu_ps(&v_fp32[j], v); + _mm512_storeu_ps(¶m_fp32_ptr[j], p); + _mm256_storeu_si256((__m256i*)&p_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm512_storeu_ps(&m_fp32_ptr[j], m); + _mm512_storeu_ps(&v_fp32_ptr[j], v); #elif defined(__AVX256__) - auto g = _mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16[j])), avx_scale); - auto m = _mm256_loadu_ps(&m_fp32[j]); - auto v = _mm256_loadu_ps(&v_fp32[j]); - auto p = _mm256_loadu_ps(¶m_fp32[j]); + auto g = 
_mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p @@ -229,10 +239,10 @@ void adam_cpu_launcher( ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) p ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - _mm256_storeu_ps(¶m_fp32[j], p); - _mm_storeu_si128((__m128i*)¶m_fp16[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm256_storeu_ps(&m_fp32[j], m); - _mm256_storeu_ps(&v_fp32[j], v); + _mm256_storeu_ps(¶m_fp32_ptr[j], p); + _mm_storeu_si128((__m128i*)&p_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(&m_fp32_ptr[j], m); + _mm256_storeu_ps(&v_fp32_ptr[j], v); #endif } } From bb5f0aecd85242f81d2c0970c7a3fe6ffb9a7936 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Wed, 14 Jun 2023 07:08:24 +0000 Subject: [PATCH 09/13] modified the setup.py and CMakeList.txt to simpify the compiling process and add github action config git action workflow --- .github/workflows/publish.yaml | 83 +++++-- .github/workflows/release.yml | 71 ++++++ CMakeLists.txt | 7 +- Release.txt | 3 + cmake/FindNCCL.cmake | 27 ++- csrc/include/nccl.h | 398 --------------------------------- csrc/include/nccl.hpp | 2 +- csrc/include/nccl_net.h | 313 -------------------------- setup.py | 8 +- 9 files changed, 171 insertions(+), 741 deletions(-) create mode 100644 .github/workflows/release.yml create mode 100644 Release.txt delete mode 100644 csrc/include/nccl.h delete mode 100644 csrc/include/nccl_net.h diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 43d8d52a..4f40bf66 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -1,26 +1,65 @@ -name: "Publish to PyPI" +name: Build and Publish to PyPI + on: - release: - types: - - published + push: + branches: + - main + jobs: - build-n-publish: - name: Build and publish + build: runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['36', '37', '38', '39'] + steps: - - uses: actions/checkout@master - - uses: actions/setup-python@v2 - with: - python-version: '3.7' - architecture: 'x64' - - name: Run build script - run: | - pip install twine --user - pip install wheel - pip install torch==1.11.0+cpu -f https://download.pytorch.org/whl/cpu/torch_stable.html - python setup.py sdist --format=gztar - - name: Publish distribution 📦 to PyPI - uses: pypa/gh-action-pypi-publish@master - with: - user: __token__ - password: ${{ secrets.pypi_password }} \ No newline at end of file + - name: Checkout code + uses: actions/checkout@v3 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Pull Docker image + run: docker pull maydomine/bmtrain-manylinux:cu110 + + - name: Run Docker image and execute script + run: | + version=${{ matrix.python-version }} + docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e 
LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done" + + - name: Archive distribution files + uses: actions/upload-artifact@v2 + with: + name: dist + path: | + sdist/*.tar.gz + wheel/*.whl + + publish: + needs: build + runs-on: ubuntu-latest + steps: + - name: Set Up the Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Install twine + run: python -m pip install twine + + - name: Download distribution files + uses: actions/download-artifact@v2 + with: + name: dist + path: dist + + - name: Publish to PyPI + env: + TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} + TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} + run: | + cd dist + python -m twine upload sdist/*.tar.gz wheel/*.whl diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 00000000..dc3c593e --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,71 @@ +name: Publish release in Github + +on: + push: + tags: + - "v*.*.*" + +jobs: + build: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: ['36', '37', '38', '39'] + + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + + - name: Pull Docker image + run: docker pull maydomine/bmtrain-manylinux:cu110 + + - name: Run Docker image and execute script + run: | + version=${{ matrix.python-version }} + docker run -e CUDACXX=/usr/local/cuda/bin/nvcc -e PATH="/workspace/cmake-3.26.4-linux-x86_64/bin:/opt/rh/devtoolset-7/root/usr/bin:$PATH" -e LD_LIBRARY_PATH="/opt/rh/devtoolset-7/root/usr/lib64:/opt/rh/devtoolset-7/root/usr/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$LD_LIBRARY_PATH" -v ${{ github.workspace }}:/workspace/BMTrain -i maydomine/bmtrain-manylinux:cu110 /bin/bash -c "cd /workspace/BMTrain;/opt/python/cp${version}*/bin/python setup.py bdist_wheel -d ./wheel/;/opt/python/cp${version}*/bin/python setup.py sdist -d ./sdist/;for file in wheel/*-linux_x86_64.whl; do mv \"\$file\" \"\${file//-linux_x86_64/-manylinux2014_x86_64}\"; done" + + - name: Archive distribution files + uses: actions/upload-artifact@v2 + with: + name: dist + path: | + sdist/*.tar.gz + wheel/*.whl + + publish: + needs: build + runs-on: ubuntu-latest + steps: + - name: Checkout code + uses: actions/checkout@v3 + + - name: Set Up the Python + uses: actions/setup-python@v2 + with: + python-version: 3.9 + + - name: Download distribution files + uses: actions/download-artifact@v2 + with: + name: dist + path: dist + + - name: Upload Distribution Files + uses: softprops/action-gh-release@v1 + with: + body_path: "Release.txt" + files: | + dist/sdist/*.tar.gz + dist/wheel/*.whl + prerelease: false + token: ${{ secrets.RELEASE_TOKEN }} + release_tag: ${{ steps.create_release.outputs.tag }} + github_token: ${{ secrets.GITHUB_TOKEN }} + env: + GITHUB_REPOSITORY: MayDomine/BMTrain diff --git a/CMakeLists.txt b/CMakeLists.txt index ffecf462..403a4df3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,7 
@@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED True) -set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80 -gencode=arch=compute_86,code=sm_86") +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80") set(CMAKE_BUILD_RPATH $ORIGIN) set(CMAKE_INSTALL_RPATH $ORIGIN) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) @@ -20,7 +20,8 @@ execute_process(COMMAND ${Python_EXECUTABLE} "-c" "import pybind11; print(pybind11.get_cmake_dir())" OUTPUT_VARIABLE PYBIND11_CMAKE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) -message (STATUS "PYBIND11_CMAKE_DIR: ${PYBIND11_CMAKE_DIR}") +message (STATUS "PYBIND11_CMAKE_DIR: ${PYB +IND11_CMAKE_DIR}") list(APPEND CMAKE_PREFIX_PATH ${PYBIND11_CMAKE_DIR}) find_package(pybind11 REQUIRED) @@ -66,7 +67,7 @@ target_include_directories(C PRIVATE ${NCCL_INCLUDE_DIRS}) target_compile_definitions(C PRIVATE VERSION_INFO=${EXAMPLE_VERSION_INFO}) -set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80;86") +set_target_properties(C PROPERTIES CUDA_ARCHITECTURES "61;62;70;72;75;80") target_include_directories(C PRIVATE "csrc/include" diff --git a/Release.txt b/Release.txt new file mode 100644 index 00000000..8597044f --- /dev/null +++ b/Release.txt @@ -0,0 +1,3 @@ +# BMTrain New Version Release v0.2.3 +- easier to install (without torch dependency while compiling) +- compatible with torch 2.0 \ No newline at end of file diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake index e6991733..2af8e3b9 100644 --- a/cmake/FindNCCL.cmake +++ b/cmake/FindNCCL.cmake @@ -9,12 +9,31 @@ else() endif() # Compatible layer for CMake <3.12. NCCL_ROOT will be accounted in for searching paths and libraries for CMake >=3.12. -list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) +if(NOT NCCL_INCLUDE_DIR OR NOT NCCL_LIB_DIR) + execute_process( + COMMAND python -c "import nvidia.nccl;import os; print(os.path.dirname(nvidia.nccl.__file__))" + OUTPUT_VARIABLE NCCL_PIP_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND NCCL_ROOT $ENV{NCCL_PIP_DIR}) + if(NOT NCCL_INCLUDE_DIR) + set(NCCL_INCLUDE_DIR "${NCCL_PIP_DIR}/include") + endif() + if(NOT NCCL_LIB_DIR) + set(NCCL_LIB_DIR "${NCCL_PIP_DIR}/lib") + endif() + find_library(NCCL_LIBRARIES + NAMES ${NCCL_LIBNAME} + HINTS ${NCCL_LIB_DIR}) +endif() +list(APPEND CMAKE_PREFIX_PATH ${NCCL_ROOT}) find_path(NCCL_INCLUDE_DIRS NAMES nccl.h HINTS ${NCCL_INCLUDE_DIR}) + + if (USE_STATIC_NCCL) MESSAGE(STATUS "USE_STATIC_NCCL is set. 
Linking with static NCCL library.") SET(NCCL_LIBNAME "nccl_static") @@ -23,15 +42,21 @@ if (USE_STATIC_NCCL) endif() else() SET(NCCL_LIBNAME "nccl") + if (NCCL_VERSION) # Prefer the versioned library if a specific NCCL version is specified + message(STATUS "NCCL version: ${NCCL_VERSION}") set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.${NCCL_VERSION}" ${CMAKE_FIND_LIBRARY_SUFFIXES}) + else() + set(CMAKE_FIND_LIBRARY_SUFFIXES ".so.2" ${CMAKE_FIND_LIBRARY_SUFFIXES}) endif() + endif() find_library(NCCL_LIBRARIES NAMES ${NCCL_LIBNAME} HINTS ${NCCL_LIB_DIR}) + include(FindPackageHandleStandardArgs) find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIRS NCCL_LIBRARIES) diff --git a/csrc/include/nccl.h b/csrc/include/nccl.h deleted file mode 100644 index 90f721c7..00000000 --- a/csrc/include/nccl.h +++ /dev/null @@ -1,398 +0,0 @@ -/************************************************************************* - * Copyright (c) 2015-2021, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_H_ -#define NCCL_H_ - -#include -#include -#if CUDART_VERSION >= 11000 -#include -#endif - -#define NCCL_MAJOR 2 -#define NCCL_MINOR 14 -#define NCCL_PATCH 3 -#define NCCL_SUFFIX "" - -#define NCCL_VERSION_CODE 21403 -#define NCCL_VERSION(X,Y,Z) (((X) <= 2 && (Y) <= 8) ? (X) * 1000 + (Y) * 100 + (Z) : (X) * 10000 + (Y) * 100 + (Z)) - -#ifdef __cplusplus -extern "C" { -#endif - -/* Opaque handle to communicator */ -typedef struct ncclComm* ncclComm_t; - -#define NCCL_UNIQUE_ID_BYTES 128 -typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId; - -/* Error type */ -typedef enum { ncclSuccess = 0, - ncclUnhandledCudaError = 1, - ncclSystemError = 2, - ncclInternalError = 3, - ncclInvalidArgument = 4, - ncclInvalidUsage = 5, - ncclRemoteError = 6, - ncclInProgress = 7, - ncclNumResults = 8 } ncclResult_t; - -/* Communicator configuration. Users can assign value to attributes to specify the - * behavior of a communicator. */ -typedef struct ncclConfig_v21400 { - /* attributes that users should never touch. */ - size_t size; - unsigned int magic; - unsigned int version; - /* attributes that users are able to customize. */ - int blocking; -} ncclConfig_t; - -/* Config initializer must be assigned to initialize config structure when it is created. - * Not initialized config will result in NCCL error. */ -#define NCCL_CONFIG_INITIALIZER { \ - sizeof(ncclConfig_t), /* size */ \ - 0xcafebeef, /* magic */ \ - NCCL_VERSION(NCCL_MAJOR, NCCL_MINOR, NCCL_PATCH), /* version */ \ - 1 /* blocking */ \ -} - -/* Return the NCCL_VERSION_CODE of the NCCL library in the supplied integer. - * This integer is coded with the MAJOR, MINOR and PATCH level of the - * NCCL library - */ -ncclResult_t ncclGetVersion(int *version); -ncclResult_t pncclGetVersion(int *version); - -/* Generates an Id to be used in ncclCommInitRank. ncclGetUniqueId should be - * called once and the Id should be distributed to all ranks in the - * communicator before calling ncclCommInitRank. */ -ncclResult_t ncclGetUniqueId(ncclUniqueId* uniqueId); -ncclResult_t pncclGetUniqueId(ncclUniqueId* uniqueId); - -/* Create a new communicator (multi thread/process version) with a configuration - * set by users. 
*/ -ncclResult_t ncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); -ncclResult_t pncclCommInitRankConfig(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank, ncclConfig_t* config); - -/* Creates a new communicator (multi thread/process version). - * rank must be between 0 and nranks-1 and unique within a communicator clique. - * Each rank is associated to a CUDA device, which has to be set before calling - * ncclCommInitRank. - * ncclCommInitRank implicitly syncronizes with other ranks, so it must be - * called by different threads/processes or use ncclGroupStart/ncclGroupEnd. */ -ncclResult_t ncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); -ncclResult_t pncclCommInitRank(ncclComm_t* comm, int nranks, ncclUniqueId commId, int rank); - -/* Creates a clique of communicators (single process version). - * This is a convenience function to create a single-process communicator clique. - * Returns an array of ndev newly initialized communicators in comm. - * comm should be pre-allocated with size at least ndev*sizeof(ncclComm_t). - * If devlist is NULL, the first ndev CUDA devices are used. - * Order of devlist defines user-order of processors within the communicator. */ -ncclResult_t ncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); -ncclResult_t pncclCommInitAll(ncclComm_t* comm, int ndev, const int* devlist); - -/* Finalize a communicator. ncclCommFinalize flushes all issued communications, - * and marks communicator state as ncclInProgress. The state will change to ncclSuccess - * when the communicator is globally quiescent and related resources are freed; then, - * calling ncclCommDestroy can locally free the rest of the resources (e.g. communicator - * itself) without blocking. */ -ncclResult_t ncclCommFinalize(ncclComm_t comm); -ncclResult_t pncclCommFinalize(ncclComm_t comm); - -/* Frees local resources associated with communicator object. */ -ncclResult_t ncclCommDestroy(ncclComm_t comm); -ncclResult_t pncclCommDestroy(ncclComm_t comm); - -/* Frees resources associated with communicator object and aborts any operations - * that might still be running on the device. */ -ncclResult_t ncclCommAbort(ncclComm_t comm); -ncclResult_t pncclCommAbort(ncclComm_t comm); - -/* Returns a string for each error code. */ -const char* ncclGetErrorString(ncclResult_t result); -const char* pncclGetErrorString(ncclResult_t result); - -/* Returns a human-readable message of the last error that occurred. - * comm is currently unused and can be set to NULL - */ -const char* ncclGetLastError(ncclComm_t comm); -const char* pncclGetError(ncclComm_t comm); - -/* Checks whether the comm has encountered any asynchronous errors */ -ncclResult_t ncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); -ncclResult_t pncclCommGetAsyncError(ncclComm_t comm, ncclResult_t *asyncError); - -/* Gets the number of ranks in the communicator clique. */ -ncclResult_t ncclCommCount(const ncclComm_t comm, int* count); -ncclResult_t pncclCommCount(const ncclComm_t comm, int* count); - -/* Returns the cuda device number associated with the communicator. */ -ncclResult_t ncclCommCuDevice(const ncclComm_t comm, int* device); -ncclResult_t pncclCommCuDevice(const ncclComm_t comm, int* device); - -/* Returns the user-ordered "rank" associated with the communicator. 
*/ -ncclResult_t ncclCommUserRank(const ncclComm_t comm, int* rank); -ncclResult_t pncclCommUserRank(const ncclComm_t comm, int* rank); - -/* Reduction operation selector */ -typedef enum { ncclNumOps_dummy = 5 } ncclRedOp_dummy_t; -typedef enum { ncclSum = 0, - ncclProd = 1, - ncclMax = 2, - ncclMin = 3, - ncclAvg = 4, - /* ncclNumOps: The number of built-in ncclRedOp_t values. Also - * serves as the least possible value for dynamic ncclRedOp_t's - * as constructed by ncclRedOpCreate*** functions. */ - ncclNumOps = 5, - /* ncclMaxRedOp: The largest valid value for ncclRedOp_t. - * It is defined to be the largest signed value (since compilers - * are permitted to use signed enums) that won't grow - * sizeof(ncclRedOp_t) when compared to previous NCCL versions to - * maintain ABI compatibility. */ - ncclMaxRedOp = 0x7fffffff>>(32-8*sizeof(ncclRedOp_dummy_t)) - } ncclRedOp_t; - -/* Data types */ -typedef enum { ncclInt8 = 0, ncclChar = 0, - ncclUint8 = 1, - ncclInt32 = 2, ncclInt = 2, - ncclUint32 = 3, - ncclInt64 = 4, - ncclUint64 = 5, - ncclFloat16 = 6, ncclHalf = 6, - ncclFloat32 = 7, ncclFloat = 7, - ncclFloat64 = 8, ncclDouble = 8, -#if defined(__CUDA_BF16_TYPES_EXIST__) - ncclBfloat16 = 9, - ncclNumTypes = 10 -#else - ncclNumTypes = 9 -#endif -} ncclDataType_t; - -/* ncclScalarResidence_t: Location and dereferencing logic for scalar arguments. */ -typedef enum { - /* ncclScalarDevice: The scalar is in device-visible memory and will be - * dereferenced while the collective is running. */ - ncclScalarDevice = 0, - - /* ncclScalarHostImmediate: The scalar is in host-visible memory and will be - * dereferenced before the ncclRedOpCreate***() function returns. */ - ncclScalarHostImmediate = 1 -} ncclScalarResidence_t; - -/* - * ncclRedOpCreatePreMulSum - * - * Creates a new reduction operator which pre-multiplies input values by a given - * scalar locally before reducing them with peer values via summation. For use - * only with collectives launched against *comm* and *datatype*. The - * *residence* argument indicates how/when the memory pointed to by *scalar* - * will be dereferenced. Upon return, the newly created operator's handle - * is stored in *op*. - */ -ncclResult_t ncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); -ncclResult_t pncclRedOpCreatePreMulSum(ncclRedOp_t *op, void *scalar, ncclDataType_t datatype, ncclScalarResidence_t residence, ncclComm_t comm); - -/* - * ncclRedOpDestroy - * - * Destroys the reduction operator *op*. The operator must have been created by - * ncclRedOpCreatePreMul with the matching communicator *comm*. An operator may be - * destroyed as soon as the last NCCL function which is given that operator returns. - */ -ncclResult_t ncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); -ncclResult_t pncclRedOpDestroy(ncclRedOp_t op, ncclComm_t comm); - -/* - * Collective communication operations - * - * Collective communication operations must be called separately for each - * communicator in a communicator clique. - * - * They return when operations have been enqueued on the CUDA stream. - * - * Since they may perform inter-CPU synchronization, each call has to be done - * from a different thread or process, or need to use Group Semantics (see - * below). - */ - -/* - * Reduce - * - * Reduces data arrays of length count in sendbuff into recvbuff using op - * operation. - * recvbuff may be NULL on all calls except for root device. 
- * root is the rank (not the CUDA device) where data will reside after the - * operation is complete. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclReduce(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, - ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream); - -/* - * (deprecated) Broadcast (in-place) - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * This operation is implicitely in place. - */ -ncclResult_t ncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBcast(void* buff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); - -/* - * Broadcast - * - * Copies count values from root to all other devices. - * root is the rank (not the CUDA device) where data resides before the - * operation is started. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclBroadcast(const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, int root, - ncclComm_t comm, cudaStream_t stream); - -/* - * All-Reduce - * - * Reduces data arrays of length count in sendbuff using op operation, and - * leaves identical copies of result on each recvbuff. - * - * In-place operation will happen if sendbuff == recvbuff. - */ -ncclResult_t ncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllReduce(const void* sendbuff, void* recvbuff, size_t count, - ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream); - -/* - * Reduce-Scatter - * - * Reduces data in sendbuff using op operation and leaves reduced result - * scattered over the devices so that recvbuff on rank i will contain the i-th - * block of the result. - * Assumes sendcount is equal to nranks*recvcount, which means that sendbuff - * should have a size of at least nranks*recvcount elements. - * - * In-place operations will happen if recvbuff == sendbuff + rank * recvcount. - */ -ncclResult_t ncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); -ncclResult_t pncclReduceScatter(const void* sendbuff, void* recvbuff, - size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, - cudaStream_t stream); - -/* - * All-Gather - * - * Each device gathers sendcount values from other GPUs into recvbuff, - * receiving data from rank i at offset i*sendcount. - * Assumes recvcount is equal to nranks*sendcount, which means that recvbuff - * should have a size of at least nranks*sendcount elements. - * - * In-place operations will happen if sendbuff == recvbuff + rank * sendcount. 
- */ -ncclResult_t ncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclAllGather(const void* sendbuff, void* recvbuff, size_t sendcount, - ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream); - -/* - * Send - * - * Send data from sendbuff to rank peer. - * - * Rank peer needs to call ncclRecv with the same datatype and the same count from this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ -ncclResult_t ncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t pncclSend(const void* sendbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); - -/* - * Receive - * - * Receive data from rank peer into recvbuff. - * - * Rank peer needs to call ncclSend with the same datatype and the same count to this - * rank. - * - * This operation is blocking for the GPU. If multiple ncclSend and ncclRecv operations - * need to progress concurrently to complete, they must be fused within a ncclGroupStart/ - * ncclGroupEnd section. - */ -ncclResult_t pncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); -ncclResult_t ncclRecv(void* recvbuff, size_t count, ncclDataType_t datatype, int peer, - ncclComm_t comm, cudaStream_t stream); - -/* - * Group semantics - * - * When managing multiple GPUs from a single thread, and since NCCL collective - * calls may perform inter-CPU synchronization, we need to "group" calls for - * different ranks/devices into a single call. - * - * Grouping NCCL calls as being part of the same collective operation is done - * using ncclGroupStart and ncclGroupEnd. ncclGroupStart will enqueue all - * collective calls until the ncclGroupEnd call, which will wait for all calls - * to be complete. Note that for collective communication, ncclGroupEnd only - * guarantees that the operations are enqueued on the streams, not that - * the operation is effectively done. - * - * Both collective communication and ncclCommInitRank can be used in conjunction - * of ncclGroupStart/ncclGroupEnd, but not together. - * - * Group semantics also allow to fuse multiple operations on the same device - * to improve performance (for aggregated collective calls), or to permit - * concurrent progress of multiple send/receive operations. - */ - -/* - * Group Start - * - * Start a group call. All calls to NCCL until ncclGroupEnd will be fused into - * a single NCCL operation. Nothing will be started on the CUDA stream until - * ncclGroupEnd. - */ -ncclResult_t ncclGroupStart(); -ncclResult_t pncclGroupStart(); - -/* - * Group End - * - * End a group call. Start a fused NCCL operation consisting of all calls since - * ncclGroupStart. Operations on the CUDA stream depending on the NCCL operations - * need to be called after ncclGroupEnd. 
- */ -ncclResult_t ncclGroupEnd(); -ncclResult_t pncclGroupEnd(); - -#ifdef __cplusplus -} // end extern "C" -#endif - -#endif // end include guard diff --git a/csrc/include/nccl.hpp b/csrc/include/nccl.hpp index 297fa5ab..bba0278b 100644 --- a/csrc/include/nccl.hpp +++ b/csrc/include/nccl.hpp @@ -3,7 +3,7 @@ #include namespace py = pybind11; -#include "nccl.h" +#include void checkNCCLStatus(ncclResult_t result) { if (result == ncclSuccess) return; diff --git a/csrc/include/nccl_net.h b/csrc/include/nccl_net.h deleted file mode 100644 index 255a44ee..00000000 --- a/csrc/include/nccl_net.h +++ /dev/null @@ -1,313 +0,0 @@ -/************************************************************************* - * Copyright (c) 2017-2022, NVIDIA CORPORATION. All rights reserved. - * - * See LICENSE.txt for license information - ************************************************************************/ - -#ifndef NCCL_NET_H_ -#define NCCL_NET_H_ - -#include "nccl.h" -#include - -#define NCCL_NET_HANDLE_MAXSIZE 128 - -#define NCCL_PTR_HOST 0x1 -#define NCCL_PTR_CUDA 0x2 -#define NCCL_PTR_DMABUF 0x4 - -// Maximum number of requests per comm object -#define NCCL_NET_MAX_REQUESTS 8 - -typedef enum {NCCL_LOG_NONE=0, NCCL_LOG_VERSION=1, NCCL_LOG_WARN=2, NCCL_LOG_INFO=3, NCCL_LOG_ABORT=4, NCCL_LOG_TRACE=5} ncclDebugLogLevel; -typedef enum {NCCL_INIT=1, NCCL_COLL=2, NCCL_P2P=4, NCCL_SHM=8, NCCL_NET=16, NCCL_GRAPH=32, NCCL_TUNING=64, NCCL_ENV=128, NCCL_ALLOC=256, NCCL_CALL=512, NCCL_ALL=~0} ncclDebugLogSubSys; - -typedef void (*ncclDebugLogger_t)(ncclDebugLogLevel level, unsigned long flags, const char *file, int line, const char *fmt, ...); - -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // [NCCL_PTR_HOST|NCCL_PTR_CUDA|NCCL_PTR_DMABUF] - int speed; // Port speed in Mbps. - int port; // Port number. - float latency; // Network latency - int maxComms; // Maximum number of comms we can create - int maxRecvs; // Maximum number of grouped receives. -}ncclNetProperties_v6_t; - -typedef ncclNetProperties_v6_t ncclNetProperties_t; - -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. 
- ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* comm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v6_t; - -typedef ncclNet_v6_t ncclNet_t; - -#define NCCL_PLUGIN_SYMBOL ncclNetPlugin_v6 - -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - /* DMA-BUF support */ - ncclResult_t (*regMrDmaBuf)(void* collComm, void* data, size_t size, int type, uint64_t offset, int fd, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v6_t; - -typedef ncclCollNet_v6_t ncclCollNet_t; - -#define NCCL_COLLNET_PLUGIN_SYMBOL ncclCollNetPlugin_v6 - -// v5 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - // This call must not block for the connection to be established, and instead - // should return successfully with sendComm == NULL with the expectation that - // it will be called again until sendComm != NULL. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connect. - // This call must not block for the connection to be established, and instead - // should return successfully with recvComm == NULL with the expectation that - // it will be called again until recvComm != NULL. - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, int tag, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, int n, void** data, int* sizes, int* tags, void** mhandles, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, int n, void** data, int* sizes, void** mhandles, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. 
- ncclResult_t (*test)(void* request, int* done, int* sizes); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v5_t; - -// v5 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v6_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). - ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v5_t; - -// v4 struct for backwards compatibility -typedef struct { - char* name; // Used mostly for logging. - char* pciPath; // Path to the PCI device in /sys. - uint64_t guid; // Unique identifier for the NIC chip. Important for - // cards with multiple PCI functions (Physical or virtual). - int ptrSupport; // NCCL_PTR_HOST or NCCL_PTR_HOST|NCCL_PTR_CUDA - int speed; // Port speed in Mbps. - int port; // Port number. - int maxComms; // Maximum number of comms we can create -} ncclNetProperties_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the network (mainly for logs) - const char* name; - // Initialize the network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. 
- ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create a connection. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Connect to a handle and return a sending comm object for that peer. - ncclResult_t (*connect)(int dev, void* handle, void** sendComm); - // Finalize connection establishment after remote peer has called connectHandle - ncclResult_t (*accept)(void* listenComm, void** recvComm); - // Register/Deregister memory. Comm can be either a sendComm or a recvComm. - // Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* comm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* comm, void* mhandle); - // Asynchronous send to a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*isend)(void* sendComm, void* data, int size, void* mhandle, void** request); - // Asynchronous recv from a peer. - // May return request == NULL if the call cannot be performed (or would block) - ncclResult_t (*irecv)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* recvComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free send/recv comm objects - ncclResult_t (*closeSend)(void* sendComm); - ncclResult_t (*closeRecv)(void* recvComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclNet_v4_t; - -// v4 struct for backwards compatibility -typedef struct { - // Name of the collective network (mainly for logs) - const char* name; - // Initialize the collective network. - ncclResult_t (*init)(ncclDebugLogger_t logFunction); - // Return the number of adapters capable of doing collective operations. - // If ndev returns 0, all other functions might be set to NULL. - ncclResult_t (*devices)(int* ndev); - // Get various device properties. - ncclResult_t (*getProperties)(int dev, ncclNetProperties_v4_t* props); - // Create a receiving object and provide a handle to connect to it. The - // handle can be up to NCCL_NET_HANDLE_MAXSIZE bytes and will be exchanged - // between ranks to create connections. - ncclResult_t (*listen)(int dev, void* handle, void** listenComm); - // Create a group for collective operations. handles have been created - // using listen() above. rank indicates caller's rank in the collective network. - ncclResult_t (*connect)(void* handles[], int nranks, int rank, void* listenComm, void** collComm); - // Returns whether a reduction operation on a data type is supported. - // 1 for supported, 0 otherwise. - ncclResult_t (*reduceSupport)(ncclDataType_t dataType, ncclRedOp_t redOp, int* supported); - // Register/Deregister memory. Type is either NCCL_PTR_HOST or NCCL_PTR_CUDA. - ncclResult_t (*regMr)(void* collComm, void* data, int size, int type, void** mhandle); - ncclResult_t (*deregMr)(void* collComm, void* mhandle); - // Performs an asynchronous allreduce operation on the collective group. - // May return request == NULL if the call cannot be performed (or would block). 
- ncclResult_t (*iallreduce)(void* collComm, void* sendData, void* recvData, int count, - ncclDataType_t dataType, ncclRedOp_t redOp, void* sendMhandle, void* recvMhandle, void** request); - // Perform a flush/fence to make sure all data received with NCCL_PTR_CUDA is - // visible to the GPU - ncclResult_t (*iflush)(void* collComm, void* data, int size, void* mhandle, void** request); - // Test whether a request is complete. If size is not NULL, it returns the - // number of bytes sent/received. - ncclResult_t (*test)(void* request, int* done, int* size); - // Close and free collective comm objects - ncclResult_t (*closeColl)(void* collComm); - ncclResult_t (*closeListen)(void* listenComm); -} ncclCollNet_v4_t; - -#endif // end include guard diff --git a/setup.py b/setup.py index 26fcfed3..16eb535d 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,6 @@ -import torch import os from setuptools.command.build_ext import build_ext from setuptools import setup, find_packages, Extension -from torch.utils.cpp_extension import BuildExtension import setuptools import warnings import sys @@ -99,13 +97,17 @@ def build_extension(self, ext): ] setup( name='bmtrain', - version='0.2.2', + version='0.2.3', author="Guoyang Zeng", author_email="qbjooo@qq.com", description="A toolkit for training big models", packages=find_packages(), install_requires=[ "numpy", + "nvidia-nccl-cu11>=2.14.3" + ], + setup_requires=[ + "nvidia-nccl-cu11>=2.14.3" ], ext_modules=ext_modules, cmdclass={ From 1d2a667ef24b3e709fa50adfaf873a7bd3afb746 Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 5 Jul 2023 10:53:07 +0800 Subject: [PATCH 10/13] fix the thread's bug in adam offload Update publish.yaml to undo pypi release at every push action --- .github/workflows/publish.yaml | 4 ++-- csrc/include/adam_cpu.hpp | 43 +++++++++++++++++++++++++++------- 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 4f40bf66..8dd8bec7 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -2,8 +2,8 @@ name: Build and Publish to PyPI on: push: - branches: - - main + tags: + - "v*.*.*" jobs: build: diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index dd283bb6..93df3207 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -2,8 +2,9 @@ #include #include #include - - +#include +#include +#include #include #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") @@ -20,6 +21,7 @@ #include #include #include +using namespace std; inline float fp32_from_bits(uint32_t w) { union { @@ -49,25 +51,48 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F num_threads = std::max(numiter / grain_size, static_cast(1)); } else{ - int64_t n = std::thread::hardware_concurrency(); - grain_size = std::max(numiter / n, static_cast(1)); - num_threads = std::max(numiter / grain_size, static_cast(1)); + cpu_set_t get; + // int64_t n = sched_getaffinity(0, sizeof(get), &get); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set), &cpu_set); + num_threads = CPU_COUNT(&cpu_set); + grain_size = std::max(numiter / num_threads, static_cast(1)); + // grain_size = std::max(numiter / n, static_cast(1)); + // num_threads = std::max(numiter / grain_size, static_cast(1)); } // Check if parallel execution is feasible if (num_threads > 1) { + py::gil_scoped_release release; // Release the GIL std::vector threads(num_threads); for 
(int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - int64_t left = begin + t * grain_size; - int64_t right = std::min(begin + (t + 1) * grain_size, end); - f(left, right); + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + sched_getaffinity(0, sizeof(cpu_set), &cpu_set); + if (CPU_ISSET(t, &cpu_set)) { + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); + } }); } - // Join all threads for (auto& thread : threads) { thread.join(); } + // std::vector threads(num_threads); + // for (int64_t t = 0; t < num_threads; ++t) { + // threads[t] = std::thread([&, t]() { + // int64_t left = begin + t * grain_size; + // int64_t right = std::min(begin + (t + 1) * grain_size, end); + // f(left, right); + // }); + // } + // // Join all threads + // for (auto& thread : threads) { + // thread.join(); + // } } else { // If not feasible or grain_size is 0, perform the operation serially f(begin, end); From 726d0cb244fe08fe7dc04e85a8badda0f646a49a Mon Sep 17 00:00:00 2001 From: maydomine <1583143678@qq.com> Date: Wed, 5 Jul 2023 11:28:52 +0800 Subject: [PATCH 11/13] inner thread loop bug --- csrc/include/adam_cpu.hpp | 28 ++++------------------------ 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 93df3207..6f70463e 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -51,15 +51,12 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F num_threads = std::max(numiter / grain_size, static_cast(1)); } else{ - cpu_set_t get; - // int64_t n = sched_getaffinity(0, sizeof(get), &get); cpu_set_t cpu_set; CPU_ZERO(&cpu_set); sched_getaffinity(0, sizeof(cpu_set), &cpu_set); num_threads = CPU_COUNT(&cpu_set); grain_size = std::max(numiter / num_threads, static_cast(1)); - // grain_size = std::max(numiter / n, static_cast(1)); - // num_threads = std::max(numiter / grain_size, static_cast(1)); + } // Check if parallel execution is feasible @@ -68,31 +65,14 @@ inline void parallel_for(int64_t begin, int64_t end, int64_t grain_size, const F std::vector threads(num_threads); for (int64_t t = 0; t < num_threads; ++t) { threads[t] = std::thread([&, t]() { - cpu_set_t cpu_set; - CPU_ZERO(&cpu_set); - sched_getaffinity(0, sizeof(cpu_set), &cpu_set); - if (CPU_ISSET(t, &cpu_set)) { - int64_t left = begin + t * grain_size; - int64_t right = std::min(begin + (t + 1) * grain_size, end); - f(left, right); - } + int64_t left = begin + t * grain_size; + int64_t right = std::min(begin + (t + 1) * grain_size, end); + f(left, right); }); } for (auto& thread : threads) { thread.join(); } - // std::vector threads(num_threads); - // for (int64_t t = 0; t < num_threads; ++t) { - // threads[t] = std::thread([&, t]() { - // int64_t left = begin + t * grain_size; - // int64_t right = std::min(begin + (t + 1) * grain_size, end); - // f(left, right); - // }); - // } - // // Join all threads - // for (auto& thread : threads) { - // thread.join(); - // } } else { // If not feasible or grain_size is 0, perform the operation serially f(begin, end); From 97bab329ef246aafdb0804a47a507c51098adbcd Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Thu, 20 Jul 2023 16:34:18 +0800 Subject: [PATCH 12/13] fix load model oom issue debug --- bmtrain/block_layer.py | 2 ++ bmtrain/layer.py | 2 ++ bmtrain/store.py | 70 +++++++++++++++++++++++++++++------------- 3 files changed, 52 insertions(+), 22 deletions(-) diff --git 
a/bmtrain/block_layer.py b/bmtrain/block_layer.py index 652e085a..8647b7ff 100644 --- a/bmtrain/block_layer.py +++ b/bmtrain/block_layer.py @@ -492,6 +492,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, if key in state_dict: # load here input_param = state_dict[key] + if input_param.__class__.__name__ == "DistributedTensorWrapper": + input_param = input_param.broadcast() if input_param.shape != it["shape"]: error_msgs.append('size mismatch for {}: copying a param with shape {} from checkpoint, ' 'the shape in current model is {}.' diff --git a/bmtrain/layer.py b/bmtrain/layer.py index c32018db..ebbef815 100644 --- a/bmtrain/layer.py +++ b/bmtrain/layer.py @@ -82,6 +82,8 @@ def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, key = prefix + name if key in state_dict: input_param = state_dict[key] + if input_param.__class__.__name__ == "DistributedTensorWrapper": + input_param = input_param.broadcast() # This is used to avoid copying uninitialized parameters into # non-lazy modules, since they dont have the hook to do the checks # in such case, it will error when accessing the .shape attribute. diff --git a/bmtrain/store.py b/bmtrain/store.py index c0914f9a..80007a93 100644 --- a/bmtrain/store.py +++ b/bmtrain/store.py @@ -136,6 +136,50 @@ def broadcast_object(obj, comm, src = 0): return obj # Must be a Mapping after pytorch 1.12.0 +class DistributedTensorWrapper: + def __init__(self, tensor, shape=None): + self._dtype = tensor.dtype + self._device = tensor.device + self.shape = shape + self.tensor = tensor + + def broadcast(self): + output_param = torch.empty(self.shape, dtype=self._dtype, device="cuda") + if config['rank'] == 0: + input_param = self.tensor + if input_param.is_cuda: + input_param = input_param.clone().contiguous() + else: + input_param = input_param.cuda().contiguous() + + nccl.broadcast( + input_param.storage(), + output_param.storage(), + 0, + config['comm'] + ) + else: + nccl.broadcast( + output_param.storage(), + output_param.storage(), + 0, + config['comm'] + ) + return output_param + def copy(self): + return self.tensor + + def __getattribute__(self, name): + if name == "tensor" or name == "shape": + return object.__getattribute__(self, name) + else: + try: + return object.__getattribute__(self, name) + except AttributeError: + pass + + return getattr(self.tensor, name) + class DistributedStateDictWrapper(Mapping): def __init__(self, state_dict : Dict) -> None: self._state_dict = state_dict @@ -165,30 +209,12 @@ def __getitem__(self, key : str): dtype_idx = tmp_shape[1].item() shape_list = torch.Size(tmp_shape[2: 2 + shape_list_size].tolist()) - output_param = torch.empty(shape_list, dtype=DTYPE_LIST[dtype_idx], device="cuda") - - if config['rank'] == 0: - input_param : torch.Tensor = self._state_dict[key] - if input_param.is_cuda: - input_param = input_param.clone().contiguous() - else: - input_param = input_param.cuda().contiguous() - - nccl.broadcast( - input_param.storage(), - output_param.storage(), - 0, - config['comm'] - ) + if config['rank'] != 0: + return DistributedTensorWrapper(torch.tensor([],dtype=DTYPE_LIST[dtype_idx],device="cuda"),shape=shape_list) else: - nccl.broadcast( - output_param.storage(), - output_param.storage(), - 0, - config['comm'] - ) + return DistributedTensorWrapper(self._state_dict[key],shape=shape_list) + - return output_param def copy(self): return self From 49f277d4a17b5cca36392d6518c860ecb4f36016 Mon Sep 17 00:00:00 2001 From: MayDomine <1583143678@qq.com> Date: Mon, 24 Jul 
2023 11:32:05 +0800 Subject: [PATCH 13/13] choose avx in runtime --- CMakeLists.txt | 30 ++---- bmtrain/store.py | 5 +- csrc/include/adam_cpu.hpp | 220 +++++++++++++++++++++++++------------- csrc/include/cpu_info.h | 38 +++++++ 4 files changed, 195 insertions(+), 98 deletions(-) create mode 100644 csrc/include/cpu_info.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 403a4df3..455f7dbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,7 +8,13 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED True) set(CMAKE_CUDA_STANDARD 14) set(CMAKE_CUDA_STANDARD_REQUIRED True) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_72,code=sm_72 -gencode=arch=compute_75,code=sm_75 -gencode=arch=compute_80,code=sm_80") + +if(NOT DEFINED ENV{BUILD_DOCKER_ENV} OR "$ENV{BUILD_DOCKER_ENV}" STREQUAL "0") + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -gencode=arch=compute_86,code=sm_86") +endif() + set(CMAKE_BUILD_RPATH $ORIGIN) set(CMAKE_INSTALL_RPATH $ORIGIN) set(CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/) @@ -30,29 +36,7 @@ message (STATUS "CMAKE_INSTALL_RPATH: ${CMAKE_INSTALL_RPATH}") file(GLOB_RECURSE SOURCES "csrc/*.cpp") file(GLOB_RECURSE CUDA_SOURCES "csrc/cuda/*.cu") -set(BMT_AVX256 $ENV{BMT_AVX256}) -set(BMT_AVX512 $ENV{BMT_AVX512}) - -# Convert them to lowercase -string(TOLOWER "${BMT_AVX256}" BMT_AVX256_LOWER) -string(TOLOWER "${BMT_AVX512}" BMT_AVX512_LOWER) - -# Check the flags -if ("${BMT_AVX256_LOWER}" STREQUAL "1" OR "${BMT_AVX256_LOWER}" STREQUAL "true" OR "${BMT_AVX256_LOWER}" STREQUAL "on") - set(AVX_FLAGS "${AVX_FLAGS} -mavx -mfma -mf16c") -elseif ("${BMT_AVX512_LOWER}" STREQUAL "1" OR "${BMT_AVX512_LOWER}" STREQUAL "true" OR "${BMT_AVX512_LOWER}" STREQUAL "on") - set(AVX_FLAGS "${AVX_FLAGS} -mavx512f") -elseif ("${BMT_AVX256_LOWER}" STREQUAL "" OR "${BMT_AVX512_LOWER}" STREQUAL "") - message(WARNING "BMT_AVX256 or BMT_AVX512 is empty or not defined.") -else() - set(AVX_FLAGS "${AVX_FLAGS} -march=native") -endif() - - - -# Now you can use AVX_FLAGS variable in your target_compile_options -# For example, if your target is named 'my_target': - +set(AVX_FLAGS "${AVX_FLAGS} -march=native") pybind11_add_module(C ${SOURCES} ${CUDA_SOURCES}) diff --git a/bmtrain/store.py b/bmtrain/store.py index 80007a93..0e0fd7ca 100644 --- a/bmtrain/store.py +++ b/bmtrain/store.py @@ -166,6 +166,7 @@ def broadcast(self): config['comm'] ) return output_param + def copy(self): return self.tensor @@ -210,9 +211,9 @@ def __getitem__(self, key : str): shape_list = torch.Size(tmp_shape[2: 2 + shape_list_size].tolist()) if config['rank'] != 0: - return DistributedTensorWrapper(torch.tensor([],dtype=DTYPE_LIST[dtype_idx],device="cuda"),shape=shape_list) + return DistributedTensorWrapper(torch.tensor([], dtype=DTYPE_LIST[dtype_idx], device="cuda"), shape=shape_list) else: - return DistributedTensorWrapper(self._state_dict[key],shape=shape_list) + return DistributedTensorWrapper(self._state_dict[key], shape=shape_list) diff --git a/csrc/include/adam_cpu.hpp b/csrc/include/adam_cpu.hpp index 6f70463e..b070c7a0 100644 --- a/csrc/include/adam_cpu.hpp +++ b/csrc/include/adam_cpu.hpp @@ -6,22 +6,12 @@ #include #include #include -#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") - -#if defined(__AVX512F__) - -#pragma message "Using AVX512" -#define __AVX512__ 1 - -#elif defined(__AVX__) and defined(__FMA__) and defined(__F16C__) - -#pragma message "Using 
AVX256" -#define __AVX256__ 1 -#endif #include #include #include -using namespace std; +#include "cpu_info.h" +#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") + inline float fp32_from_bits(uint32_t w) { union { @@ -134,38 +124,56 @@ inline float fp16_ieee_to_fp32_value(uint16_t h) { : fp32_to_bits(normalized_value)); return fp32_from_bits(result); } -void adam_cpu_launcher( + +void adam_cpu_0( int64_t n, - std::uintptr_t param_fp32, - std::uintptr_t param_fp16, - std::uintptr_t g_fp16, - std::uintptr_t m_fp32, - std::uintptr_t v_fp32, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, float beta1, float beta2, float eps, float lr, float scale, float weight_decay, float bias_correction1, float bias_correction2 -) { - auto param_fp32_ptr = reinterpret_cast(param_fp32); - auto m_fp32_ptr = reinterpret_cast(m_fp32); - auto v_fp32_ptr = reinterpret_cast(v_fp32); - auto p_fp16_ptr = reinterpret_cast(param_fp16); - auto g_fp16_ptr = reinterpret_cast(g_fp16); -#if defined(__AVX512__) - auto avx_beta1 = _mm512_set1_ps(beta1); - auto avx_beta2 = _mm512_set1_ps(beta2); - auto avx_beta1_1 = _mm512_set1_ps(1 - beta1); - auto avx_beta2_1 = _mm512_set1_ps(1 - beta2); - auto avx_eps = _mm512_set1_ps(eps); - auto avx_neg_lr = _mm512_set1_ps(-lr); - auto avx_scale = _mm512_set1_ps(scale); - auto avx_weight_decay = _mm512_set1_ps(weight_decay); - auto avx_bias_correction1 = _mm512_set1_ps(bias_correction1); - auto avx_bias_correction2 = _mm512_set1_ps(bias_correction2); - int64_t span = 16; -#elif defined(__AVX256__) +){ + int64_t span = 1; + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + for (int64_t j = start; j < end; j += span) { + for (int64_t i = j; i < end; i++) { + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; + m = beta1 * m + (1 - beta1) * g; + v = beta2 * v + (1 - beta2) * g * g; + p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; + param_fp32_ptr[i] = p; + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; + } + break; // must break here + } + }); +} + +static void __attribute__ ((__target__ ("avx,fma,f16c"))) adam_cpu_1( + int64_t n, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +){ auto avx_beta1 = _mm256_set1_ps(beta1); auto avx_beta2 = _mm256_set1_ps(beta2); auto avx_beta1_1 = _mm256_set1_ps(1 - beta1); @@ -177,18 +185,9 @@ void adam_cpu_launcher( auto avx_bias_correction1 = _mm256_set1_ps(bias_correction1); auto avx_bias_correction2 = _mm256_set1_ps(bias_correction2); int64_t span = 8; -#else - int64_t span = 1; -#endif - parallel_for(0, n, 0, [&](int64_t start, int64_t end) { for (int64_t j = start; j < end; j += span) { -#if defined(__AVX256__) or defined(__AVX512__) if (j + span > end) { -#else - if (true) { -#endif - // No AVX or n is not alinged for (int64_t i = j; i < end; i++) { float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; float m = m_fp32_ptr[i]; @@ -198,14 +197,78 @@ void adam_cpu_launcher( v = beta2 * v + (1 - beta2) * g * g; p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; param_fp32_ptr[i] = p; - p_fp16_ptr[i] = 
fp16_ieee_from_fp32_value(p); + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); m_fp32_ptr[i] = m; v_fp32_ptr[i] = v; } break; // must break here } else { - // use AVX here -#if defined(__AVX512__) + auto g = _mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); + auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); + auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); + auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); + m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); + v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); + p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p + p = _mm256_fmadd_ps( + avx_neg_lr, + _mm256_div_ps( + _mm256_div_ps(m, avx_bias_correction1), // m / bias_correction1 + _mm256_add_ps(_mm256_sqrt_ps(_mm256_div_ps(v, avx_bias_correction2)), avx_eps) // sqrt(v / bias_correction2) + eps + ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) + p + ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) + _mm256_storeu_ps(¶m_fp32_ptr[j], p); + _mm_storeu_si128((__m128i*)¶m_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_ps(&m_fp32_ptr[j], m); + _mm256_storeu_ps(&v_fp32_ptr[j], v); + } + }}); +} + +static void __attribute__ ((__target__ ("avx512f"))) adam_cpu_2( + int64_t n, + float* param_fp32_ptr, + uint16_t* param_fp16_ptr, + uint16_t* g_fp16_ptr, + float* m_fp32_ptr, + float* v_fp32_ptr, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +){ + auto avx_beta1 = _mm512_set1_ps(beta1); + auto avx_beta2 = _mm512_set1_ps(beta2); + auto avx_beta1_1 = _mm512_set1_ps(1 - beta1); + auto avx_beta2_1 = _mm512_set1_ps(1 - beta2); + auto avx_eps = _mm512_set1_ps(eps); + auto avx_neg_lr = _mm512_set1_ps(-lr); + auto avx_scale = _mm512_set1_ps(scale); + auto avx_weight_decay = _mm512_set1_ps(weight_decay); + auto avx_bias_correction1 = _mm512_set1_ps(bias_correction1); + auto avx_bias_correction2 = _mm512_set1_ps(bias_correction2); + int64_t span = 16; + parallel_for(0, n, 0, [&](int64_t start, int64_t end) { + for (int64_t j = start; j < end; j += span) { + if (j + span > end) { + for (int64_t i = j; i < end; i++) { + float g = fp16_ieee_to_fp32_value(g_fp16_ptr[i]) / scale; + float m = m_fp32_ptr[i]; + float v = v_fp32_ptr[i]; + float p = param_fp32_ptr[i]; + m = beta1 * m + (1 - beta1) * g; + v = beta2 * v + (1 - beta2) * g * g; + p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) - lr * weight_decay * p; + param_fp32_ptr[i] = p; + param_fp16_ptr[i] = fp16_ieee_from_fp32_value(p); + m_fp32_ptr[i] = m; + v_fp32_ptr[i] = v; + } + break; // must break here + }else{ auto g = _mm512_div_ps(_mm512_cvtph_ps(_mm256_loadu_si256((const __m256i*)&g_fp16_ptr[j])), avx_scale); auto m = _mm512_loadu_ps(&m_fp32_ptr[j]); auto v = _mm512_loadu_ps(&v_fp32_ptr[j]); @@ -225,34 +288,45 @@ void adam_cpu_launcher( p ); // p = p - lr * m / bias_correction1 / (sqrtf(v / bias_correction2) + eps) _mm512_storeu_ps(¶m_fp32_ptr[j], p); - _mm256_storeu_si256((__m256i*)&p_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); + _mm256_storeu_si256((__m256i*)¶m_fp16_ptr[j], _mm512_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); _mm512_storeu_ps(&m_fp32_ptr[j], m); _mm512_storeu_ps(&v_fp32_ptr[j], v); -#elif defined(__AVX256__) - auto g = 
_mm256_div_ps(_mm256_cvtph_ps(_mm_loadu_si128((const __m128i*)&g_fp16_ptr[j])), avx_scale); - auto m = _mm256_loadu_ps(&m_fp32_ptr[j]); - auto v = _mm256_loadu_ps(&v_fp32_ptr[j]); - auto p = _mm256_loadu_ps(¶m_fp32_ptr[j]); - m = _mm256_fmadd_ps(avx_beta1, m, _mm256_mul_ps(avx_beta1_1, g)); - v = _mm256_fmadd_ps(avx_beta2, v, _mm256_mul_ps(avx_beta2_1, _mm256_mul_ps(g, g))); - p = _mm256_fmadd_ps(avx_neg_lr, _mm256_mul_ps(avx_weight_decay, p), p); // p = p - lr * weight_decay * p - p = _mm256_fmadd_ps( - avx_neg_lr, - _mm256_div_ps( - _mm256_div_ps(m, avx_bias_correction1), // m / bias_correction1 - _mm256_add_ps(_mm256_sqrt_ps(_mm256_div_ps(v, avx_bias_correction2)), avx_eps) // sqrt(v / bias_correction2) + eps - ), // m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - p - ); // p = p - lr * m / bias_correction1 / (sqrt(v / bias_correction2) + eps) - _mm256_storeu_ps(¶m_fp32_ptr[j], p); - _mm_storeu_si128((__m128i*)&p_fp16_ptr[j], _mm256_cvtps_ph(p, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); - _mm256_storeu_ps(&m_fp32_ptr[j], m); - _mm256_storeu_ps(&v_fp32_ptr[j], v); -#endif } } + }); +} + - }); + + +void adam_cpu_launcher( + int64_t n, + std::uintptr_t param_fp32, + std::uintptr_t param_fp16, + std::uintptr_t g_fp16, + std::uintptr_t m_fp32, + std::uintptr_t v_fp32, + float beta1, float beta2, + float eps, float lr, + float scale, + float weight_decay, + float bias_correction1, + float bias_correction2 +) { + + auto param_fp32_ptr = reinterpret_cast(param_fp32); + auto m_fp32_ptr = reinterpret_cast(m_fp32); + auto v_fp32_ptr = reinterpret_cast(v_fp32); + auto param_fp16_ptr = reinterpret_cast(param_fp16); + auto g_fp16_ptr = reinterpret_cast(g_fp16); + int cpu_level = get_cpu_level(); + if (cpu_level == 0 ){ + adam_cpu_0(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + }else if(cpu_level == 1){ + adam_cpu_1(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + }else{ + adam_cpu_2(n, param_fp32_ptr, param_fp16_ptr, g_fp16_ptr, m_fp32_ptr, v_fp32_ptr, beta1, beta2, eps, lr, scale, weight_decay, bias_correction1, bias_correction2); + } } diff --git a/csrc/include/cpu_info.h b/csrc/include/cpu_info.h new file mode 100644 index 00000000..53ed48f8 --- /dev/null +++ b/csrc/include/cpu_info.h @@ -0,0 +1,38 @@ +#include + +static void cpuid(int info[4], int InfoType){ + __cpuid_count(InfoType, 0, info[0], info[1], info[2], info[3]); +} + +int get_cpu_level() { + // SIMD: 128-bit + bool HW_F16C; + + // SIMD: 256-bit + bool HW_AVX; + bool HW_FMA; + + // SIMD: 512-bit + bool HW_AVX512F; // AVX512 Foundation + + int info[4]; + cpuid(info, 0); + int nIds = info[0]; + + // Detect Features + if (nIds >= 0x00000001){ + cpuid(info,0x00000001); + HW_AVX = (info[2] & ((int)1 << 28)) != 0; + HW_FMA = (info[2] & ((int)1 << 12)) != 0; + HW_F16C = (info[2] & ((int)1 << 29)) != 0; + } + if (nIds >= 0x00000007){ + cpuid(info,0x00000007); + HW_AVX512F = (info[1] & ((int)1 << 16)) != 0; + } + + int ret = 0; + if (HW_AVX && HW_FMA && HW_F16C) ret = 1; + if (HW_AVX512F) ret = 2; + return ret; +}
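
Note on the runtime dispatch introduced in this last patch: get_cpu_level() probes CPUID once and adam_cpu_launcher then calls the scalar, AVX256, or AVX512 kernel accordingly. The selection can be sanity-checked outside of BMTrain with the standalone sketch below (a hypothetical check_cpu_level.cpp, not part of this patch series); it mirrors the probe in csrc/include/cpu_info.h and assumes a GCC/Clang toolchain that provides <cpuid.h> and __get_cpuid_count.

// check_cpu_level.cpp -- illustrative sketch only; mirrors csrc/include/cpu_info.h.
// Level 0 -> adam_cpu_0 (scalar), 1 -> adam_cpu_1 (AVX/FMA/F16C), 2 -> adam_cpu_2 (AVX-512F).
#include <cpuid.h>
#include <cstdio>

static int probe_cpu_level() {
    unsigned int eax = 0, ebx = 0, ecx = 0, edx = 0;
    bool hw_avx = false, hw_fma = false, hw_f16c = false, hw_avx512f = false;
    // Leaf 1, ECX: bit 28 = AVX, bit 12 = FMA, bit 29 = F16C (same bits cpu_info.h tests).
    if (__get_cpuid_count(1, 0, &eax, &ebx, &ecx, &edx)) {
        hw_avx  = (ecx >> 28) & 1u;
        hw_fma  = (ecx >> 12) & 1u;
        hw_f16c = (ecx >> 29) & 1u;
    }
    // Leaf 7, sub-leaf 0, EBX: bit 16 = AVX-512 Foundation.
    if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx)) {
        hw_avx512f = (ebx >> 16) & 1u;
    }
    int level = 0;
    if (hw_avx && hw_fma && hw_f16c) level = 1;
    if (hw_avx512f) level = 2;
    return level;
}

int main() {
    static const char* variants[] = {"adam_cpu_0 (scalar)", "adam_cpu_1 (AVX256)", "adam_cpu_2 (AVX512)"};
    int level = probe_cpu_level();
    std::printf("cpu level %d -> %s\n", level, variants[level]);
    return 0;
}

Compiling this with g++ -O2 check_cpu_level.cpp -o check_cpu_level and running it on the target machine shows which path the optimizer would take; a level of 1 or 2 means the __attribute__((target(...))) AVX kernels above are the ones exercised, while level 0 falls back to the plain scalar loop.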